finished gazette parser
[disclosr.git] / documents / gazette.py
blob:a/documents/gazette.py -> blob:b/documents/gazette.py
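# Scrape the Commonwealth government gazette list at gazettes.ag.gov.au, screen by
# screen, and record each notice (date, id, type, description, name, url) in the
# scrape module's docsdb document store under the "gazette" / "AGD" labels.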
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode
   
listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950" items = 3950
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb, items = 1
listurl, "gazette", "AGD") while True:
for line in listhtml.split('\n'): print str(items) + " (" +str(items/25) +" screens to go)"
soup = BeautifulSoup(line) listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
#print line (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
for row in soup.find_all('tr'): listurl, "gazette", "AGD", False)
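    # scan the returned list HTML line by line; the parser assumes each table row sits on its own line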
    for line in listhtml.split('\n'):
        soup = BeautifulSoup(line)
        #print line
        for row in soup.find_all('tr'):
            print line
            if row.has_key('valign'):
                i = 0
                date = ""
                id = ""
                type = ""
                description = ""
                name = ""
                url = ""
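                # table cells, in order: 0 = date, 1 = gazette id, 2 = type, 3 = description (holds the link)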
                for col in soup.find_all('td'):
                    #print ''.join(col.stripped_strings)
                    if i == 0:
                        date = ''.join(col.stripped_strings)
                    if i == 1:
                        id = ''.join(col.stripped_strings)
                    if i == 2:
                        type = ''.join(col.stripped_strings)
                    if i == 3:
                        description = ''.join(col.stripped_strings)
                        for link in col.findAll('a'):
                            if link.has_key("href"):
                                url = link['href']
                                name = ''.join(link.stripped_strings)
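                                # fetch the linked notice itself and store its metadata keyed by a hash of its canonical URL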
# url, "data", "AGIMO") print str(items) + " (" +str(items/25) +" screens to go)"
#hash = scrape.mkhash(scrape.canonurl(url)) print [date, id, type, description, name, url]
#doc = scrape.docsdb.get(hash) itemurl = scrape.fullurl(listurl, url)
#print doc['metadata'] (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
#scrape.docsdb.save(doc) itemurl, "gazette", "AGD", False)
#time.sleep(2) hash = scrape.mkhash(scrape.canonurl(itemurl))
  doc = scrape.docsdb.get(hash)
  doc['metadata'] = {"date": date, "date": id, "type":type, "description":description,"name": name,"url": url}
  scrape.docsdb.save(doc)
  #time.sleep(2)
  i = i + 1;
   
    items = items - 25
    if items <= 0:
        break