beginning datagov scraper
[disclosr.git] / documents / gazette.py
  import sys, os
  import time
  import scrape
  from bs4 import BeautifulSoup
   
  from unidecode import unidecode
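# Scrape the government gazette listings published by the Attorney-General's
# Department at gazettes.ag.gov.au, walking the paginated view backwards
# 25 rows per screen and saving each item's metadata into scrape.docsdb.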
   
items = 3950
items = 1  # temporary override for testing; use 3950 to walk the whole view
while True:
    print str(items) + " (" + str(items / 25) + " screens to go)"
    listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
    (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
        listurl, "gazette", "AGD", False)
    for line in listhtml.split('\n'):
        soup = BeautifulSoup(line)
        #print line
        for row in soup.find_all('tr'):
            print line
            if row.has_attr('valign'):
                i = 0
                date = ""
                id = ""
                type = ""
                description = ""
                name = ""
                url = ""
                for col in row.find_all('td'):
                    #print ''.join(col.stripped_strings)
                    if i == 0:
                        date = ''.join(col.stripped_strings)
                    if i == 1:
                        id = ''.join(col.stripped_strings)
                    if i == 2:
                        type = ''.join(col.stripped_strings)
                    if i == 3:
                        description = ''.join(col.stripped_strings)
                        for link in col.find_all('a'):
                            if link.has_attr("href"):
                                url = link['href']
                                name = ''.join(link.stripped_strings)
                                print str(items) + " (" + str(items / 25) + " screens to go)"
                                print [date, id, type, description, name, url]
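                                # Fetch the gazette item itself, then file its
                                # metadata under the hash of its canonical URL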
                                itemurl = scrape.fullurl(listurl, url)
                                (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
                                    itemurl, "gazette", "AGD", False)
                                hash = scrape.mkhash(scrape.canonurl(itemurl))
                                doc = scrape.docsdb.get(hash)
                                doc['metadata'] = {"date": date, "id": id, "type": type,
                                    "description": description, "name": name, "url": url}
                                scrape.docsdb.save(doc)
                                #time.sleep(2)
                    i = i + 1
   
    items = items - 25
    if items <= 0:
        break