--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -1,8 +1,9 @@
 import sys, os
-
+import time
 import scrape
 from bs4 import BeautifulSoup
+from unidecode import unidecode
 
 listurl = "http://data.gov.au/data/"
 (url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
@@ -15,34 +16,42 @@
                                                  url, "data", "AGIMO")
         hash = scrape.mkhash(scrape.canonurl(url))
         doc = scrape.docsdb.get(hash)
-        if "metadata" not in doc.keys():
+        if "metadata" not in doc.keys() or True:
+            doc['type'] = "dataset"
             doc['metadata'] = {}
-        soup = BeautifulSoup(html)
-        for metatag in soup.find_all('meta'):
-            if metatag.has_key('name'):
-                doc['metadata'][metatag['name']] = metatag['content']
-        for list in soup.find_all('dl'):
-            last_title = ""
-            for child in list.children:
-                if str(type(child)) != "<class 'bs4.element.NavigableString'>":
-                    if child.name == 'dt' and child.string != None:
-                        last_title = child.string.strip()
-                    if child.name == 'dd':
-                        #print last_title
-                        if last_title == "Download":
-                            for item in child.find_all("li"):
-                                link = item.find("a")
-                                format = item.find(property="dc:format")
-                                linkobj = {"href": link['href'].replace("/bye?", "").strip(), "name": link.string.strip(),
-                                           "format": format.string.strip(), "size": format.next_sibling.string.strip()}
-                                doc['metadata'][last_title] = linkobj
+            soup = BeautifulSoup(html)
+            for metatag in soup.find_all('meta'):
+                if metatag.has_key('name'):
+                    doc['metadata'][metatag['name']] = metatag['content']
+            for list in soup.find_all('dl'):
+                last_title = ""
+                for child in list.children:
+                    if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+                        if child.name == 'dt' and child.string != None:
+                            last_title = child.string.strip()
+                        if child.name == 'dd':
+                            #print last_title
+                            if last_title == "Description":
+                                doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
+                            elif last_title == "Download":
+                                doc['metadata'][last_title] = []
+                                for item in child.find_all("li"):
+                                    link = item.find("a")
+                                    format = item.find(property="dc:format")
+                                    linkobj = {"href": link['href'].replace("/bye?", "").strip(),
+                                               "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                    if link.string != None:
+                                        linkobj["name"] = link.string.strip()
+                                    doc['metadata'][last_title].append(linkobj)
-                        else:
-                            atags = child.find_all('a')
-                            if len(atags) < 2:
-                                [s.extract() for s in child(class_='viewAll')]
-                                doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
-                            else:
-                                doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
-        print doc['metadata']
-        sys.exit("ggg")
+                            else:
+                                atags = child.find_all('a')
+                                if len(atags) < 2:
+                                    [s.extract() for s in child(class_='viewAll')]
+                                    doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+                                else:
+                                    doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
+            print doc['metadata']
+            scrape.docsdb.save(doc)
+    #time.sleep(2)
+
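For illustration only (not part of the patch): a minimal standalone sketch of the dl/dt/dd walk as patched, run against an invented HTML fragment rather than a real data.gov.au page. It shows why "Download" is now initialised to a list and appended to, where the pre-patch code assigned each linkobj straight to doc['metadata']['Download'] and so kept only the last file of a multi-file dataset.

from bs4 import BeautifulSoup
from bs4.element import NavigableString

# Hypothetical fragment shaped like a dataset page's definition list.
html = """<dl>
<dt>Download</dt>
<dd><ul>
<li><a href="/bye?http://example.org/a.csv">a.csv</a>
<span property="dc:format">CSV</span>1.2 MB</li>
</ul></dd>
</dl>"""

metadata = {}
soup = BeautifulSoup(html, "html.parser")
last_title = ""
for child in soup.find("dl").children:
    if isinstance(child, NavigableString):
        continue  # skip whitespace nodes between the dt/dd tags
    if child.name == 'dt' and child.string is not None:
        last_title = child.string.strip()  # dt names the dd that follows
    if child.name == 'dd' and last_title == "Download":
        metadata[last_title] = []  # a list, so every file link survives
        for item in child.find_all("li"):
            link = item.find("a")
            fmt = item.find(property="dc:format")
            metadata[last_title].append({
                "href": link['href'].replace("/bye?", "").strip(),
                "name": link.string.strip(),
                "format": fmt.string.strip(),
                "size": fmt.next_sibling.string.strip(),
            })
print(metadata)

Running this prints one dict per file:
{'Download': [{'href': 'http://example.org/a.csv', 'name': 'a.csv', 'format': 'CSV', 'size': '1.2 MB'}]}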