"""Scrape the Commonwealth (AGD) gazette publication index.

Walks the gazettes.ag.gov.au Notes/Domino "publications" view backwards,
25 entries ("one screen") at a time, parses each listing row, fetches the
linked gazette item and records its metadata in the scrape docs database.

NOTE(review): this file was recovered from a corrupted merge in which two
revisions were interleaved line-by-line; this is the newer revision (the
one that pages with an ``items`` counter and saves to ``scrape.docsdb``).
"""
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode

# Start offset of the last listing screen; the view shows 25 entries per
# screen and we walk back to Start=0.  (The other recovered revision set
# this to 1, i.e. a single-page test run — TODO confirm intended start.)
items = 3950

while True:
    print (str(items) + " (" + str(items // 25) + " screens to go)")
    # Each screen of the publications view is addressed by its Start offset.
    listurl = ("http://gazettes.ag.gov.au/portal/govgazonline.nsf/"
               "publications?OpenView&Start=" + str(items))
    (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
        listurl, "gazette", "AGD", False)
    for line in listhtml.split('\n'):
        # The listing HTML is processed one line at a time; each data row
        # arrives as a self-contained <tr> fragment.
        soup = BeautifulSoup(line)
        for row in soup.find_all('tr'):
            print (line)
            if row.has_key('valign'):  # only data rows carry valign
                i = 0
                date = ""
                id = ""
                type = ""
                description = ""
                name = ""
                url = ""
                # Columns in order: 0 date, 1 gazette id, 2 type,
                # 3 description (holds the <a> link to the item itself).
                # NOTE(review): iterates soup, not row — works because each
                # parsed line contains a single row; confirm before changing.
                for col in soup.find_all('td'):
                    #print ''.join(col.stripped_strings)
                    if i == 0:
                        date = ''.join(col.stripped_strings)
                    if i == 1:
                        id = ''.join(col.stripped_strings)
                    if i == 2:
                        type = ''.join(col.stripped_strings)
                    if i == 3:
                        description = ''.join(col.stripped_strings)
                        for link in col.findAll('a'):
                            if link.has_key("href"):
                                url = link['href']
                                name = ''.join(link.stripped_strings)
                    i = i + 1
                print ([date, id, type, description, name, url])
                # Fetch the gazette item page and record its metadata.
                itemurl = scrape.fullurl(listurl, url)
                (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
                    itemurl, "gazette", "AGD", False)
                hash = scrape.mkhash(scrape.canonurl(itemurl))
                doc = scrape.docsdb.get(hash)
                # Bug fix: the dict literal previously used the key "date"
                # twice ({"date": date, "date": id, ...}), so the id
                # silently overwrote the date and no "id" key was stored.
                doc['metadata'] = {"date": date, "id": id, "type": type,
                                   "description": description,
                                   "name": name, "url": url}
                scrape.docsdb.save(doc)
                #time.sleep(2)
    # Step back one screen (25 entries) until the first page is done.
    items = items - 25
    if items <= 0:
        break