--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -1,1 +1,55 @@
+import sys
+import scrape
+from bs4 import BeautifulSoup, Tag
+
+
+# fetch the data.gov.au dataset list and record each dataset's metadata
+listurl = "http://data.gov.au/data/"
+(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
+    listurl, "data", "AGIMO")
+soup = BeautifulSoup(datasetlisthtml)
+for atag in soup.find_all(class_='result-title'):
+    if atag.has_attr('href'):
+        url = scrape.fullurl(listurl, atag['href'])
+        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+            url, "data", "AGIMO")
+        hash = scrape.mkhash(scrape.canonurl(url))
+        doc = scrape.docsdb.get(hash)
+        if "metadata" not in doc.keys():
+            doc['metadata'] = {}
+        soup = BeautifulSoup(html)
+        # record <meta> tag name/content pairs
+        for metatag in soup.find_all('meta'):
+            if metatag.has_attr('name'):
+                doc['metadata'][metatag['name']] = metatag['content']
+        # dataset details are presented as <dt>/<dd> definition lists
+        for dl in soup.find_all('dl'):
+            last_title = ""
+            for child in dl.children:
+                if isinstance(child, Tag):  # element children only, skip NavigableStrings
+                    if child.name == 'dt' and child.string is not None:
+                        last_title = child.string.strip()
+                    if child.name == 'dd':
+                        #print last_title
+                        if last_title == "Download":
+                            # each download entry lists a link, its format and its size
+                            for item in child.find_all("li"):
+                                link = item.find("a")
+                                format = item.find(property="dc:format")
+                                linkobj = {"href": link['href'].replace("/bye?", "").strip(), "name": link.string.strip(),
+                                    "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                doc['metadata'][last_title] = linkobj
+
+                        else:
+                            atags = child.find_all('a')
+                            if len(atags) < 2:
+                                # drop the "view all" link so only the field text remains
+                                for s in child(class_='viewAll'):
+                                    s.extract()
+                                doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+                            else:
+                                doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
+        # debug: dump the first dataset's metadata and stop
+        print doc['metadata']
+        sys.exit("ggg")
 