"""Print the table cells of the AGD gazette publications listing.

Contents of ``documents/gazette.py`` (path taken from the patch header this
text was recovered from).  The script fetches one page of the gazette
publications view through the project's ``scrape`` helper, parses the
returned HTML with Beautiful Soup, and prints the ``.string`` of every
``<td>`` found inside each ``<tr>`` that carries a ``valign`` attribute.

Everything runs at import time; there are no functions or classes.
"""
import os
import sys
import time

import scrape
from bs4 import BeautifulSoup
from unidecode import unidecode

listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"

# The result of scrape.fetchURL is unpacked as a 3-tuple; only the third
# element (the page body) is used below.
# NOTE(review): the meaning of the "gazette" / "AGD" arguments is defined by
# scrape.fetchURL, which is not visible here -- confirm against that helper.
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
                                             listurl, "gazette", "AGD")

# NOTE(review): no parser is named, so Beautiful Soup picks whichever one is
# installed; left unchanged to avoid altering how the page is parsed.
soup = BeautifulSoup(listhtml)

for row in soup.find_all('tr'):
    # Only rows with a ``valign`` attribute are printed.  ``has_attr`` is the
    # Beautiful Soup 4 replacement for the deprecated ``Tag.has_key``.
    if row.has_attr('valign'):
        # Iterate the cells of *this* row; the original referenced an
        # undefined name ``tr`` here, which raised NameError.
        for col in row.find_all('td'):
            # ``.string`` is None when a cell has more than one child, in
            # which case "None" is printed.  The parenthesised form prints
            # the same value on Python 2 and is valid on Python 3.
            print(col.string)
        # Follow-up fetch of each linked document; not enabled yet.
        # (``atag`` is not defined anywhere in this script.)
        #url = scrape.fullurl(listurl, atag['href'])
        #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
        #    url, "data", "AGIMO")
        #hash = scrape.mkhash(scrape.canonurl(url))
        #doc = scrape.docsdb.get(hash)
        #print doc['metadata']
        #scrape.docsdb.save(doc)
        #time.sleep(2)