finished gazette parser
[disclosr.git] / documents / gazette.py
blob:a/documents/gazette.py -> blob:b/documents/gazette.py
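# Scrape the Commonwealth government gazette list at gazettes.ag.gov.au, screen by
# screen, and record each notice (date, id, type, description, name, url) in the
# scrape module's docsdb document store under the "gazette" / "AGD" labels.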
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode
   
listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950" items = 3950
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb, items = 1
listurl, "gazette", "AGD") while True:
for line in listhtml.split('\n'): print str(items) + " (" +str(items/25) +" screens to go)"
soup = BeautifulSoup(line) listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
#print line (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
for row in soup.find_all('tr'): listurl, "gazette", "AGD", False)
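    # scan the returned list HTML line by line; the parser assumes each table row sits on its own line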
    for line in listhtml.split('\n'):
        soup = BeautifulSoup(line)
        #print line
        for row in soup.find_all('tr'):
            print line
            if row.has_key('valign'):
                i = 0
                date = ""
                id = ""
                type = ""
                description = ""
                name = ""
                url = ""
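                # table cells, in order: 0 = date, 1 = gazette id, 2 = type, 3 = description (holds the link)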
                for col in soup.find_all('td'):
                    #print ''.join(col.stripped_strings)
                    if i == 0:
                        date = ''.join(col.stripped_strings)
                    if i == 1:
                        id = ''.join(col.stripped_strings)
                    if i == 2:
                        type = ''.join(col.stripped_strings)
                    if i == 3:
                        description = ''.join(col.stripped_strings)
                        for link in col.findAll('a'):
                            if link.has_key("href"):
                                url = link['href']
                                name = ''.join(link.stripped_strings)
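                                # fetch the linked notice itself and store its metadata keyed by a hash of its canonical URL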
# url, "data", "AGIMO") print str(items) + " (" +str(items/25) +" screens to go)"
#hash = scrape.mkhash(scrape.canonurl(url)) print [date, id, type, description, name, url]
#doc = scrape.docsdb.get(hash) itemurl = scrape.fullurl(listurl, url)
#print doc['metadata'] (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
#scrape.docsdb.save(doc) itemurl, "gazette", "AGD", False)
#time.sleep(2) hash = scrape.mkhash(scrape.canonurl(itemurl))
  doc = scrape.docsdb.get(hash)
  doc['metadata'] = {"date": date, "date": id, "type":type, "description":description,"name": name,"url": url}
  scrape.docsdb.save(doc)
  #time.sleep(2)
  i = i + 1;
   
    items = items - 25
    if items <= 0:
        break