beginning datagov scraper
[disclosr.git] / documents / datagov.py
blob:a/documents/datagov.py -> blob:b/documents/datagov.py
--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -1,1 +1,48 @@
+import sys, os
 
+import scrape
+from bs4 import BeautifulSoup
+
+
+listurl = "http://data.gov.au/data/"
+(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
+    listurl, "data", "AGIMO")
+soup = BeautifulSoup(datasetlisthtml)
+for atag in soup.find_all(class_='result-title'):
+    if atag.has_key('href'):
+        url = scrape.fullurl(listurl, atag['href'])
+        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+            url, "data", "AGIMO")
+        hash = scrape.mkhash(scrape.canonurl(url))
+        doc = scrape.docsdb.get(hash)
+        if "metadata" not in doc.keys():
+            doc['metadata'] = {}
+        soup = BeautifulSoup(html)
+        for metatag in soup.find_all('meta'):
+            if metatag.has_key('name'):
+                doc['metadata'][metatag['name']] = metatag['content']
+        for list in soup.find_all('dl'):
+            last_title = ""
+            for child in list.children:
+                if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+                    if child.name == 'dt' and child.string != None:
+                        last_title = child.string.strip()
+                    if child.name == 'dd':
+                        #print last_title
+                        if last_title == "Download":
+                            for item in child.find_all("li"):
+                                link = item.find("a")
+                                format = item.find(property="dc:format")
+                                linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
+                                    "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                doc['metadata'][last_title] = linkobj
+
+                        else:
+                            atags = child.find_all('a')
+                            if len(atags) < 2:
+                                [s.extract() for s in child(class_='viewAll')]
+                                doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+                            else:
+                                doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
+        print doc['metadata']
+        sys.exit("ggg")