From: Maxious
Date: Mon, 11 Feb 2013 05:29:13 +0000
Subject: beginning gazette
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=1908e3aa6265269f88d15344d90f756348953600
---
beginning gazette
Former-commit-id: f853fcb1ec8fa48819367abd692d0f576974b9cf
---
--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -5,20 +5,44 @@ from unidecode import unidecode
-listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"
+listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
 (url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb, listurl, "gazette", "AGD")
-soup = BeautifulSoup(listhtml)
-for row in soup.find_all('tr'):
-    if row.has_key('valign'):
-        for col in tr.find_all('td'):
-            print col.string
-            #url = scrape.fullurl(listurl, atag['href'])
-            #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
-            # url, "data", "AGIMO")
-            #hash = scrape.mkhash(scrape.canonurl(url))
-            #doc = scrape.docsdb.get(hash)
-            #print doc['metadata']
-            #scrape.docsdb.save(doc)
-            #time.sleep(2)
+for line in listhtml.split('\n'):
+    soup = BeautifulSoup(line)
+    #print line
+    for row in soup.find_all('tr'):
+        print line
+        if row.has_key('valign'):
+            i = 0
+            date = ""
+            id = ""
+            type = ""
+            description = ""
+            name = ""
+            url = ""
+            for col in soup.find_all('td'):
+                #print ''.join(col.stripped_strings)
+                if i == 0:
+                    date = ''.join(col.stripped_strings)
+                if i == 1:
+                    id = ''.join(col.stripped_strings)
+                if i == 2:
+                    type = ''.join(col.stripped_strings)
+                if i == 3:
+                    description = ''.join(col.stripped_strings)
+                    for link in col.findAll('a'):
+                        if link.has_key("href"):
+                            url = link['href']
+                            name = ''.join(link.stripped_strings)
+                            print [date,id,type,description,name, url]
+                i = i +1;
+                #url = scrape.fullurl(listurl, atag['href'])
+                #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+                # url, "data", "AGIMO")
+                #hash = scrape.mkhash(scrape.canonurl(url))
+                #doc = scrape.docsdb.get(hash)
+                #print doc['metadata']
+                #scrape.docsdb.save(doc)
+                #time.sleep(2)
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -13,8 +13,8 @@ import socket
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://192.168.1.113:5984/')
-#couch = couchdb.Server('http://127.0.0.1:5984/')
+#couch = couchdb.Server('http://192.168.1.113:5984/')
+couch = couchdb.Server('http://127.0.0.1:5984/')
 def mkhash(input):