"""Scrape the Commonwealth Gazette list from the AGD Lotus Notes portal.

Walks the paginated publication list backwards (Start=3950, 3925, ... 0),
fetches each listing screen via the project's ``scrape`` helper, extracts the
per-gazette rows (date, id, type, description, link) and stores the metadata
into the CouchDB documents database (``scrape.docsdb``).

NOTE(review): this file was recovered from a corrupted interleave of two
script versions; the paginated version below is the one reconstructed.
"""
import sys
import os
import time

import scrape
from bs4 import BeautifulSoup
from unidecode import unidecode

# Base of the paginated listing; the Start= offset is appended per page.
LIST_URL_BASE = ("http://gazettes.ag.gov.au/portal/govgazonline.nsf/"
                 "publications?OpenView&Start=")
PAGE_SIZE = 25      # rows per listing screen on the portal
START_ITEM = 3950   # highest Start offset to crawl from
                    # (was temporarily overridden to 1 during debugging)


def _parse_gazette_row(row):
    """Extract one gazette entry from a listing <tr>.

    Columns are positional on the portal: 0=date, 1=gazette id, 2=type,
    3=description (whose <a> carries the item URL and display name).

    Returns a (date, gazette_id, gazette_type, description, name, url)
    tuple of strings; fields missing from the row come back as "".
    """
    date = gazette_id = gazette_type = description = name = url = ""
    for i, col in enumerate(row.find_all('td')):
        text = ''.join(col.stripped_strings)
        if i == 0:
            date = text
        elif i == 1:
            gazette_id = text
        elif i == 2:
            gazette_type = text
        elif i == 3:
            description = text
            # The description cell links to the gazette item itself.
            for link in col.find_all('a'):
                if link.has_attr('href'):
                    url = link['href']
                    name = ''.join(link.stripped_strings)
    return date, gazette_id, gazette_type, description, name, url


def _process_listing(listurl, listhtml):
    """Parse one listing screen and save metadata for every gazette row.

    The portal emits one <tr> per physical line, so the HTML is parsed
    line-by-line; data rows are recognised by their valign attribute.
    """
    for line in listhtml.split('\n'):
        soup = BeautifulSoup(line, "html.parser")
        for row in soup.find_all('tr'):
            if not row.has_attr('valign'):
                continue  # header/spacer row, not a gazette entry
            (date, gazette_id, gazette_type,
             description, name, url) = _parse_gazette_row(row)
            print([date, gazette_id, gazette_type, description, name, url])
            itemurl = scrape.fullurl(listurl, url)
            # Fetch the item page (cached in docsdb); False = no link follow.
            (itemurl, mime_type, html) = scrape.fetchURL(
                scrape.docsdb, itemurl, "gazette", "AGD", False)
            doc_hash = scrape.mkhash(scrape.canonurl(itemurl))
            doc = scrape.docsdb.get(doc_hash)
            # Bug fix: the second key was previously "date" again, which
            # silently overwrote the date with the id and dropped the id.
            doc['metadata'] = {"date": date, "id": gazette_id,
                               "type": gazette_type,
                               "description": description,
                               "name": name, "url": url}
            scrape.docsdb.save(doc)
            #time.sleep(2)


def main():
    """Page backwards through the gazette listing until Start reaches 0."""
    items = START_ITEM
    while True:
        print("%d (%d screens to go)" % (items, items // PAGE_SIZE))
        listurl = LIST_URL_BASE + str(items)
        (listurl, mime_type, listhtml) = scrape.fetchURL(
            scrape.docsdb, listurl, "gazette", "AGD", False)
        _process_listing(listurl, listhtml)
        items -= PAGE_SIZE
        if items <= 0:
            break


if __name__ == "__main__":
    main()