export
[disclosr.git] / documents / gazette.py
blob:a/documents/gazette.py -> blob:b/documents/gazette.py
  import sys, os
  import time
  import scrape
  from bs4 import BeautifulSoup
   
  from unidecode import unidecode
   
  listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"
  (url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
  listurl, "gazette", "AGD")
  soup = BeautifulSoup(listhtml)
  for row in soup.find_all('tr'):
  if row.has_key('valign'):
  for col in tr.find_all('td'):
  print col.string
  #url = scrape.fullurl(listurl, atag['href'])
  #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
  # url, "data", "AGIMO")
  #hash = scrape.mkhash(scrape.canonurl(url))
  #doc = scrape.docsdb.get(hash)
  #print doc['metadata']
  #scrape.docsdb.save(doc)
  #time.sleep(2)