--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -1,1 +1,54 @@
+import time
+import scrape
+from bs4 import BeautifulSoup
+
+# Walk the gazette list view backwards from the last item, 25 per screen.
+items = 3950
+while True:
+    print str(items) + " (" + str(items / 25) + " screens to go)"
+    listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
+    (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
+        listurl, "gazette", "AGD", False)
+    # Each table row of the list view sits on its own line of the source
+    # HTML, so parse line by line rather than the whole page at once.
+    for line in listhtml.split('\n'):
+        soup = BeautifulSoup(line)
+        for row in soup.find_all('tr'):
+            # Data rows carry a valign attribute; header rows do not.
+            if row.has_attr('valign'):
+                date = ""
+                id = ""
+                type = ""
+                description = ""
+                name = ""
+                url = ""
+                for i, col in enumerate(row.find_all('td')):
+                    if i == 0:
+                        date = ''.join(col.stripped_strings)
+                    if i == 1:
+                        id = ''.join(col.stripped_strings)
+                    if i == 2:
+                        type = ''.join(col.stripped_strings)
+                    if i == 3:
+                        description = ''.join(col.stripped_strings)
+                        for link in col.find_all('a'):
+                            if link.has_attr('href'):
+                                url = link['href']
+                                name = ''.join(link.stripped_strings)
+                                print [date, id, type, description, name, url]
+                                # Fetch the gazette item itself and attach
+                                # the list-view metadata to its document.
+                                itemurl = scrape.fullurl(listurl, url)
+                                (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+                                    itemurl, "gazette", "AGD", False)
+                                hash = scrape.mkhash(scrape.canonurl(itemurl))
+                                doc = scrape.docsdb.get(hash)
+                                doc['metadata'] = {"date": date, "id": id,
+                                    "type": type, "description": description,
+                                    "name": name, "url": url}
+                                scrape.docsdb.save(doc)
+                                #time.sleep(2)
+
+    items = items - 25
+    if items <= 0:
+        break