beginning datagov scraper
[disclosr.git] / documents / datagov.py
blob:a/documents/datagov.py -> blob:b/documents/datagov.py
  import sys, os
   
  import scrape
  from bs4 import BeautifulSoup
   
   
  listurl = "http://data.gov.au/data/"
  (url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
  listurl, "data", "AGIMO")
  soup = BeautifulSoup(datasetlisthtml)
  for atag in soup.find_all(class_='result-title'):
  if atag.has_key('href'):
  url = scrape.fullurl(listurl, atag['href'])
  (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
  url, "data", "AGIMO")
  hash = scrape.mkhash(scrape.canonurl(url))
  doc = scrape.docsdb.get(hash)
  if "metadata" not in doc.keys():
  doc['metadata'] = {}
  soup = BeautifulSoup(html)
  for metatag in soup.find_all('meta'):
  if metatag.has_key('name'):
  doc['metadata'][metatag['name']] = metatag['content']
  for list in soup.find_all('dl'):
  last_title = ""
  for child in list.children:
  if str(type(child)) != "<class 'bs4.element.NavigableString'>":
  if child.name == 'dt' and child.string != None:
  last_title = child.string.strip()
  if child.name == 'dd':
  #print last_title
  if last_title == "Download":
  for item in child.find_all("li"):
  link = item.find("a")
  format = item.find(property="dc:format")
  linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
  "format": format.string.strip(), "size": format.next_sibling.string.strip()}
  doc['metadata'][last_title] = linkobj
   
  else:
  atags = child.find_all('a')
  if len(atags) < 2:
  [s.extract() for s in child(class_='viewAll')]
  doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
  else:
  doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
  print doc['metadata']
  sys.exit("ggg")