"""Scrape one page of the AGD government gazette publication listing.

The listing page is fetched through ``scrape.fetchURL`` and parsed one text
line at a time.  For every table row carrying a ``valign`` attribute, the
first four cells are read as date, id, type and description, and one record
``[date, id, type, description, name, url]`` is printed for each link found
in the description cell.
"""
import os
import sys
import time

from bs4 import BeautifulSoup
from unidecode import unidecode

import scrape

listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
                                             listurl, "gazette", "AGD")

# Each source line is parsed as its own HTML fragment.
# NOTE(review): presumably the listing emits one <tr> per line -- confirm
# against the live page.
for line in listhtml.split('\n'):
    soup = BeautifulSoup(line)
    for row in soup.find_all('tr'):
        print(line)
        # Only rows with a "valign" attribute are treated as gazette entries.
        if not row.has_attr('valign'):
            continue

        date = ""
        gazette_id = ""
        gazette_type = ""
        description = ""
        name = ""
        link_url = ""

        # NOTE(review): cells are collected from the whole parsed line
        # (``soup``), not just from ``row``; this only matters if a single
        # line ever contains more than one row.
        for index, col in enumerate(soup.find_all('td')):
            if index == 0:
                date = ''.join(col.stripped_strings)
            elif index == 1:
                gazette_id = ''.join(col.stripped_strings)
            elif index == 2:
                gazette_type = ''.join(col.stripped_strings)
            elif index == 3:
                description = ''.join(col.stripped_strings)
                # The description cell holds the link(s) to the gazette
                # document; one record is printed per link with an href.
                # NOTE(review): nesting of this print was reconstructed from
                # statement order -- confirm it is meant to run per link.
                for link in col.find_all('a'):
                    if link.has_attr("href"):
                        link_url = link['href']
                        name = ''.join(link.stripped_strings)
                        print([date, gazette_id, gazette_type,
                               description, name, link_url])

            # TODO: fetch and store each linked document (previously sketched
            # as: scrape.fullurl(listurl, href) -> scrape.fetchURL(...) ->
            # scrape.mkhash(scrape.canonurl(url)) -> scrape.docsdb.get/save,
            # with time.sleep(2) between requests).
|
|