add rtk ausbudget import
[disclosr.git] / documents / datagov.py
import sys, os
import time
import scrape
from bs4 import BeautifulSoup
from unidecode import unidecode
   
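# Scrape the data.gov.au catalogue: fetch the dataset list page, then each
# dataset page, and store the extracted metadata via the scrape module's docsdb.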
listurl = "http://data.gov.au/data/"
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)
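# every result title on the list page links to an individual dataset page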
for atag in soup.find_all(class_='result-title'):
    if atag.has_attr('href'):
        url = scrape.fullurl(listurl, atag['href'])
        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
            url, "data", "AGIMO")
        hash = scrape.mkhash(scrape.canonurl(url))
        doc = scrape.docsdb.get(hash)
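        # the "or True" short-circuits the cache check so metadata is
        # re-parsed on every run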
if "metadata" not in doc.keys(): if "metadata" not in doc.keys() or True:
  doc['type'] = "dataset"
doc['metadata'] = {} doc['metadata'] = {}
soup = BeautifulSoup(html) soup = BeautifulSoup(html)
for metatag in soup.find_all('meta'): for metatag in soup.find_all('meta'):
if metatag.has_key('name'): if metatag.has_key('name'):
doc['metadata'][metatag['name']] = metatag['content'] doc['metadata'][metatag['name']] = metatag['content']
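            # dataset fields are marked up as <dl> definition lists:
            # each <dt> names a field, the following <dd> holds its value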
            for dl in soup.find_all('dl'):
                last_title = ""
                for child in dl.children:
                    if str(type(child)) != "<class 'bs4.element.NavigableString'>":
                        if child.name == 'dt' and child.string != None:
                            last_title = child.string.strip()
                        if child.name == 'dd':
                            #print last_title
                            if last_title == "Description":
                                doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
                            elif last_title == "Download":
                                doc['metadata'][last_title] = []
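                                # each download <li> holds an <a> link plus
                                # dc:format and file-size text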
                                for item in child.find_all("li"):
                                    link = item.find("a")
                                    format = item.find(property="dc:format")
                                    linkobj = {"href": link['href'].replace("/bye?", "").strip(),
                                               "format": format.string.strip(),
                                               "size": format.next_sibling.string.strip()}
                                    if link.string != None:
                                        linkobj["name"] = link.string.strip()
                                    doc['metadata'][last_title].append(linkobj)
   
                            else:
                                atags = child.find_all('a')
                                if len(atags) < 2:
                                    for s in child(class_='viewAll'):
                                        s.extract()
                                    doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
                                else:
                                    doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
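            # persist the parsed dataset document back to the docs database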
            print doc['metadata']
            scrape.docsdb.save(doc)
        #time.sleep(2)