# beginning gazette
# [disclosr.git] / documents / gazette.py
# blob:a/documents/gazette.py -> blob:b/documents/gazette.py
import sys
import os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode

# Gazette listing scraper.
#
# Fetches one page of the gazette publication index through the project's
# `scrape` helper and prints one record per link found in the description
# cell of each data row: [date, id, type, description, name, url].
#
# NOTE(review): the statement nesting below was reconstructed from a copy of
# this script that had lost its indentation -- confirm against the repository
# version before relying on it.

listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
                                             listurl, "gazette", "AGD")

# The listing is parsed one physical line at a time instead of as a single
# document; each line gets its own BeautifulSoup tree.
for line in listhtml.split('\n'):
    soup = BeautifulSoup(line)
    for row in soup.find_all('tr'):
        # Debug output: echo the raw line for every <tr> found on it.
        print(line)

        # Only rows that carry a "valign" attribute are treated as data rows.
        if not row.has_attr('valign'):
            continue

        date = ""
        gazette_id = ""
        gazette_type = ""
        description = ""
        name = ""
        link_url = ""

        # Cells are collected from the whole parsed line (`soup`), not only
        # from `row`; the column position decides which field a cell fills.
        for index, col in enumerate(soup.find_all('td')):
            if index == 0:
                date = ''.join(col.stripped_strings)
            elif index == 1:
                gazette_id = ''.join(col.stripped_strings)
            elif index == 2:
                gazette_type = ''.join(col.stripped_strings)
            elif index == 3:
                description = ''.join(col.stripped_strings)
                # One record is printed per hyperlink in the description cell.
                for link in col.find_all('a'):
                    if link.has_attr("href"):
                        link_url = link['href']
                        name = ''.join(link.stripped_strings)
                        # Single-argument print keeps the output identical
                        # under Python 2 and also parses under Python 3.
                        print([date, gazette_id, gazette_type,
                               description, name, link_url])

        # TODO: per-document fetch/store step, still disabled in the original
        # (note that `atag` is not defined anywhere in this script):
        #url = scrape.fullurl(listurl, atag['href'])
        #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
        #    url, "data", "AGIMO")
        #hash = scrape.mkhash(scrape.canonurl(url))
        #doc = scrape.docsdb.get(hash)
        #print doc['metadata']
        #scrape.docsdb.save(doc)
        #time.sleep(2)