[disclosr.git] / documents / gazette.py
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode
listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960" items = 3950
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb, items = 1
listurl, "gazette", "AGD") while True:
soup = BeautifulSoup(listhtml) print str(items) + " (" +str(items/25) +" screens to go)"
for row in soup.find_all('tr'): listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
if row.has_key('valign'): (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
for col in tr.find_all('td'): listurl, "gazette", "AGD", False)
print col.string for line in listhtml.split('\n'):
#url = scrape.fullurl(listurl, atag['href']) soup = BeautifulSoup(line)
#(url, mime_type, html) = scrape.fetchURL(scrape.docsdb, #print line
# url, "data", "AGIMO") for row in soup.find_all('tr'):
#hash = scrape.mkhash(scrape.canonurl(url)) print line
#doc = scrape.docsdb.get(hash) if row.has_key('valign'):
#print doc['metadata'] i = 0
#scrape.docsdb.save(doc) date = ""
#time.sleep(2) id = ""
                type = ""
                description = ""
                name = ""
                url = ""
                # Columns are positional: date, gazette id, type, then a
                # description cell whose link carries the item name and URL.
                for col in row.find_all('td'):
                    #print ''.join(col.stripped_strings)
                    if i == 0:
                        date = ''.join(col.stripped_strings)
                    if i == 1:
                        id = ''.join(col.stripped_strings)
                    if i == 2:
                        type = ''.join(col.stripped_strings)
                    if i == 3:
                        description = ''.join(col.stripped_strings)
                        for link in col.find_all('a'):
                            if link.has_attr('href'):
                                url = link['href']
                                name = ''.join(link.stripped_strings)
                                print [date, id, type, description, name, url]
                                itemurl = scrape.fullurl(listurl, url)
                                (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
                                    itemurl, "gazette", "AGD", False)
                                hash = scrape.mkhash(scrape.canonurl(itemurl))
                                doc = scrape.docsdb.get(hash)
                                doc['metadata'] = {"date": date, "id": id, "type": type,
                                    "description": description, "name": name, "url": url}
                                scrape.docsdb.save(doc)
                                #time.sleep(2)
                    i = i + 1
   
    items = items - 25
    if items <= 0:
        break
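
# A minimal read-back sketch (an assumption, not part of the original file):
# scrape.docsdb is taken to behave like a couchdb-python Database, where
# get(id) returns a dict-like document or None. It re-derives the document
# id the same way the loop above does:
#
#   itemurl = "http://gazettes.ag.gov.au/..."  # any saved gazette item URL
#   hash = scrape.mkhash(scrape.canonurl(itemurl))
#   doc = scrape.docsdb.get(hash)
#   if doc is not None:
#       print doc['metadata']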