|
import sys, os |
|
import time |
|
import scrape |
|
from bs4 import BeautifulSoup |
|
|
|
from unidecode import unidecode |
|
|
|
# Gazette publications listing to scrape. The "Start=3960" query argument is
# hard-coded here (presumably a paging offset into the view -- TODO confirm).
listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"

# Fetch the listing through the project's scrape helper, which is handed the
# shared scrape.docsdb handle and returns a (url, mime_type, content) triple.
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
                                             listurl, "gazette", "AGD")
soup = BeautifulSoup(listhtml)

for row in soup.find_all('tr'):
    # Only rows carrying a "valign" attribute are printed; other <tr>
    # elements are skipped. has_attr() replaces the deprecated has_key().
    if row.has_attr('valign'):
        # Iterate the cells of the current row. The original referenced an
        # undefined name `tr` here, which raised NameError.
        for col in row.find_all('td'):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            # col.string is None for cells without a single text child.
            print(col.string)
        # NOTE(review): disabled follow-up code kept from the original; its
        # intended nesting level could not be recovered from the source, and
        # it references `atag`, which is not defined anywhere in this script.
        #url = scrape.fullurl(listurl, atag['href'])
        #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
        #    url, "data", "AGIMO")
        #hash = scrape.mkhash(scrape.canonurl(url))
        #doc = scrape.docsdb.get(hash)
        #print doc['metadata']
        #scrape.docsdb.save(doc)
        #time.sleep(2)
|
|