import sys, os
import time
import scrape
from bs4 import BeautifulSoup
from unidecode import unidecode
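
# Scrape the Commonwealth Gazette notices listing from the AGD gazettes portal
# and print one record per table row: date, id, type, description, link text, link URL.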
listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "gazette", "AGD")

for line in listhtml.split('\n'):
    soup = BeautifulSoup(line)
    #print line
    for row in soup.find_all('tr'):
        print line
        # data rows in the listing table carry a valign attribute
        if row.has_attr('valign'):
            i = 0
            date = ""
            id = ""
            type = ""
            description = ""
            name = ""
            url = ""
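            # Column layout implied by the index checks below:
            # 0 = date, 1 = gazette id, 2 = type, 3 = description (holds the notice link)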
            for col in row.find_all('td'):
                #print ''.join(col.stripped_strings)
                if i == 0:
                    date = ''.join(col.stripped_strings)
                if i == 1:
                    id = ''.join(col.stripped_strings)
                if i == 2:
                    type = ''.join(col.stripped_strings)
                if i == 3:
                    description = ''.join(col.stripped_strings)
                    for link in col.find_all('a'):
                        if link.has_attr('href'):
                            url = link['href']
                            name = ''.join(link.stripped_strings)
                            print [date, id, type, description, name, url]
                i = i + 1
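            # Disabled follow-up: fetch each notice document and save it into scrape.docsdb.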
            #url = scrape.fullurl(listurl, atag['href'])
            #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
            #    url, "data", "AGIMO")
            #hash = scrape.mkhash(scrape.canonurl(url))
            #doc = scrape.docsdb.get(hash)
            #print doc['metadata']
            #scrape.docsdb.save(doc)
            #time.sleep(2)