--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -1,1 +1,44 @@
+# Scrape the AGD gazette publication listing and print one record per table row.
+import sys, os
+import time
+import scrape
+from bs4 import BeautifulSoup
+from unidecode import unidecode
+
+listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
+(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
+                                             listurl, "gazette", "AGD")
+
+# Parse the fetched page once; feeding BeautifulSoup one line at a time
+# silently drops any tag that spans a line break.
+soup = BeautifulSoup(listhtml)
+for row in soup.find_all('tr'):
+    # Only the data rows of the listing table carry a valign attribute
+    # -- presumably; verify against the live page markup.
+    if not row.has_attr('valign'):
+        continue
+    date = gazette_id = gazette_type = description = name = url = ""
+    # Columns: 0 = date, 1 = gazette id, 2 = type, 3 = description (+ link).
+    for i, col in enumerate(row.find_all('td')):
+        text = ''.join(col.stripped_strings)
+        if i == 0:
+            date = text
+        elif i == 1:
+            gazette_id = text
+        elif i == 2:
+            gazette_type = text
+        elif i == 3:
+            description = text
+            for link in col.find_all('a'):
+                if link.has_attr('href'):
+                    url = link['href']
+                    name = ''.join(link.stripped_strings)
+    print [date, gazette_id, gazette_type, description, name, url]
+    # TODO: fetch and persist each gazette document, e.g.:
+    #url = scrape.fullurl(listurl, url)
+    #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+    #    url, "data", "AGIMO")
+    #hash = scrape.mkhash(scrape.canonurl(url))
+    #doc = scrape.docsdb.get(hash)
+    #print doc['metadata']
+    #scrape.docsdb.save(doc)
+    #time.sleep(2)