import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
import parsedatetime as pdt | import parsedatetime as pdt |
from time import mktime | from time import mktime |
from datetime import datetime | from datetime import datetime |
import feedparser | import feedparser |
import abc | import abc |
class GenericRSSDisclogScraper(object): | class GenericDisclogScraper(object): |
__metaclass__ = abc.ABCMeta | __metaclass__ = abc.ABCMeta |
@abc.abstractmethod | agencyID = None |
disclogURL = None | |
def getAgencyID(self): | def getAgencyID(self): |
""" disclosr agency id """ | """ disclosr agency id """ |
return | if self.agencyID == None: |
self.agencyID = os.path.basename(sys.argv[0]).replace(".py","") | |
return self.agencyID | |
@abc.abstractmethod | |
def getURL(self): | def getURL(self): |
""" disclog URL""" | """ disclog URL""" |
return | if self.disclogURL == None: |
agency = scrape.agencydb.get(self.getAgencyID()) | |
self.disclogURL = agency['FOIDocumentsURL'] | |
return self.disclogURL | |
@abc.abstractmethod | |
def doScrape(self): | |
""" do the scraping """ | |
return | |
class GenericRSSDisclogScraper(GenericDisclogScraper): | |
def getDescription(self, entry, doc): | def getDescription(self, entry, doc): |
""" get description from rss entry""" | """ get description from rss entry""" |
doc['description'] = entry.summary | doc['description'] = entry.summary |
return | return |
def doScrape(self): | def doScrape(self): |
foidocsdb = scrape.couch['disclosr-foidocuments'] | foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) | (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) |
feed = feedparser.parse(content) | feed = feedparser.parse(content) |
for entry in feed.entries: | for entry in feed.entries: |
#print entry | #print entry |
print entry.id | print entry.id |
hash = scrape.mkhash(entry.id) | hash = scrape.mkhash(entry.id) |
#print hash | #print hash |
doc = foidocsdb.get(hash) | doc = foidocsdb.get(hash) |
#print doc | #print doc |
if doc == None: | if doc == None: |
print "saving" | print "saving" |
edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") | edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") |
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, | doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, |
"date": edate,"title": entry.title} | "date": edate,"title": entry.title} |
self.getDescription(entry, doc) | self.getDescription(entry, doc) |
foidocsdb.save(doc) | foidocsdb.save(doc) |
else: | else: |
print "already saved" | print "already saved" |
class GenericOAICDisclogScraper(object): | class GenericOAICDisclogScraper(GenericDisclogScraper): |
__metaclass__ = abc.ABCMeta | __metaclass__ = abc.ABCMeta |
@abc.abstractmethod | |
def getAgencyID(self): | |
""" disclosr agency id """ | |
return | |
@abc.abstractmethod | |
def getURL(self): | |
""" disclog URL""" | |
return | |
@abc.abstractmethod | @abc.abstractmethod |
def getColumns(self,columns): | def getColumns(self,columns): |
""" rearranges columns if required """ | """ rearranges columns if required """ |
return | return |
def doScrape(self): | def doScrape(self): |
cal = pdt.Calendar() | cal = pdt.Calendar() |
foidocsdb = scrape.couch['disclosr-foidocuments'] | foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) | (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) |
if content != None: | if content != None: |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": | if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | # http://www.crummy.com/software/BeautifulSoup/documentation.html |
soup = BeautifulSoup(content) | soup = BeautifulSoup(content) |
for row in soup.table.find_all('tr'): | for row in soup.table.find_all('tr'): |
columns = row.find_all('td') | columns = row.find_all('td') |
if len(columns) == 5: | if len(columns) == 5: |
(id, date, description, title, notes) = self.getColumns(columns) | (id, date, description, title, notes) = self.getColumns(columns) |
print id.string | print id.string |
hash = scrape.mkhash(url+id.string) | hash = scrape.mkhash(url+id.string) |
links = [] | links = [] |
for atag in row.find_all("a"): | for atag in row.find_all("a"): |
if atag.has_key('href'): | if atag.has_key('href'): |
links.append(scrape.fullurl(url,atag['href'])) | links.append(scrape.fullurl(url,atag['href'])) |
doc = foidocsdb.get(hash) | doc = foidocsdb.get(hash) |
descriptiontxt = "" | descriptiontxt = "" |
for string in description.stripped_strings: | for string in description.stripped_strings: |
descriptiontxt = descriptiontxt + " \n" + string | descriptiontxt = descriptiontxt + " \n" + string |
if doc == None: | if doc == None: |
print "saving" | print "saving" |
dtresult = cal.parseDateText(date.string) | dtresult = cal.parseDateText(date.string) |
if len(dtresult) == 2: | if len(dtresult) == 2: |
(dtdate,dtr) = dtresult | (dtdate,dtr) = dtresult |
print dtdate | print dtdate |
edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) | edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) |
else: | else: |
edate = "" | edate = "" |
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, | doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, |
"date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string} | "date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string} |
foidocsdb.save(doc) | foidocsdb.save(doc) |
else: | else: |
print "already saved" | print "already saved" |
elif len(row.find_all('th')) == 5: | elif len(row.find_all('th')) == 5: |
print "header row" | print "header row" |
else: | else: |
print "ERROR number of columns incorrect" | print "ERROR number of columns incorrect" |
print row | print row |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
#RSS feed not detailed | #RSS feed not detailed |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): | class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): |
def getAgencyID(self): | |
return "3cd40b1240e987cbcd3f0e67054ce259" | |
def getURL(self): | |
return "http://www.apvma.gov.au/about/foi/disclosure/index.php" | |
def getColumns(self,columns): | def getColumns(self,columns): |
(id, date, description, title, notes) = columns | (id, date, description, title, notes) = columns |
return (id, date, description, title, notes) | return (id, date, description, title, notes) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
#RSS feed not detailed | #RSS feed not detailed |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): | class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): |
def getAgencyID(self): | |
return "8c9421f852c441910bf1d93a57b31d64" | |
def getURL(self): | |
return "http://www.daff.gov.au/about/foi/ips/disclosure-log" | |
def getColumns(self,columns): | def getColumns(self,columns): |
(id, date, title, description, notes) = columns | (id, date, title, description, notes) = columns |
return (id, date, description, title, notes) | return (id, date, description, title, notes) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
#RSS feed not detailed | #RSS feed not detailed |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): | class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): |
def getAgencyID(self): | |
return "be9996f0ac58f71f23d074e82d44ead3" | |
def getURL(self): | |
return "http://foi.deewr.gov.au/disclosure-log/rss" | |
def getDescription(self,entry,doc): | def getDescription(self,entry,doc): |
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, entry.link, "foidocuments", self.getAgencyID(), False) | (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, entry.link, "foidocuments", self.getAgencyID(), False) |
if content != None: | if content != None: |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": | if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | # http://www.crummy.com/software/BeautifulSoup/documentation.html |
soup = BeautifulSoup(content) | soup = BeautifulSoup(content) |
links = [] | links = [] |
description = "" | description = "" |
dldivs = soup.find('div',class_="download") | dldivs = soup.find('div',class_="download") |
if dldivs != None: | if dldivs != None: |
for atag in dldivs.find_all("a"): | for atag in dldivs.find_all("a"): |
if atag.has_key('href'): | if atag.has_key('href'): |
links.append(scrape.fullurl(url,atag['href'])) | links.append(scrape.fullurl(url,atag['href'])) |
nodldivs = soup.find('div',class_="incompleteNotification") | nodldivs = soup.find('div',class_="incompleteNotification") |
if nodldivs != None and nodldivs.stripped_strings != None: | if nodldivs != None and nodldivs.stripped_strings != None: |
for text in nodldivs.stripped_strings: | for text in nodldivs.stripped_strings: |
description = description + text | description = description + text |
for row in soup.table.find_all('tr'): | for row in soup.table.find_all('tr'): |
if row != None: | if row != None: |
description = description + "\n" + row.find('th').string + ": " | description = description + "\n" + row.find('th').string + ": " |
for text in row.find('div').stripped_strings: | for text in row.find('div').stripped_strings: |
description = description + text | description = description + text |
if links != []: | if links != []: |
doc.update({'links': links}) | doc.update({'links': links}) |
if description != "": | if description != "": |
doc.update({ 'description': description}) | doc.update({ 'description': description}) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
#RSS feed not detailed | #RSS feed not detailed |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): | class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): |
def getAgencyID(self): | |
return "be9996f0ac58f71f23d074e82d44ead3" | |
def getURL(self): | |
return "http://foi.deewr.gov.au/disclosure-log/rss" | |
def getColumns(self,columns): | def getColumns(self,columns): |
(id, date, title, description, notes) = columns | (id, date, title, description, notes) = columns |
return (id, date, description, title, notes) | return (id, date, description, title, notes) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
www.finance.gov.au/foi/disclosure-log/foi-rss.xml | www.finance.gov.au/foi/disclosure-log/foi-rss.xml |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
#RSS feed not detailed | #RSS feed not detailed |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): | class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): |
def getAgencyID(self): | |
return "be9996f0ac58f71f23d074e82d44ead3" | |
def getURL(self): | |
return "http://foi.deewr.gov.au/disclosure-log/rss" | |
def getColumns(self,columns): | def getColumns(self,columns): |
(id, date, title, description, notes) = columns | (id, date, title, description, notes) = columns |
return (id, date, description, title, notes) | return (id, date, description, title, notes) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
http://www.righttoknow.org.au/feed/search/%20(latest_status:successful%20OR%20latest_status:partially_successful) | http://www.righttoknow.org.au/feed/search/%20(latest_status:successful%20OR%20latest_status:partially_successful) |
<?php | <?php |
function include_header_documents($title) { | function include_header_documents($title) { |
?> | ?> |
<!doctype html> | <!doctype html> |
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ --> | <!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ --> |
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--> | <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--> |
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--> | <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--> |
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]--> | <!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]--> |
<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline --> | <!-- Consider adding a manifest.appcache: h5bp.com/d/Offline --> |
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--> | <!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--> |
<head> | <head> |
<meta charset="utf-8"> | <meta charset="utf-8"> |
<!-- Use the .htaccess and remove these lines to avoid edge case issues. | <!-- Use the .htaccess and remove these lines to avoid edge case issues. |
More info: h5bp.com/i/378 --> | More info: h5bp.com/i/378 --> |
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> | <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
<title>Australian Disclosure Logs<?php if ($title != "") echo " - $title";?></title> | <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title";?></title> |
<meta name="description" content=""> | <meta name="description" content=""> |
<!-- Mobile viewport optimized: h5bp.com/viewport --> | <!-- Mobile viewport optimized: h5bp.com/viewport --> |
<meta name="viewport" content="width=device-width"> | <meta name="viewport" content="width=device-width"> |
<!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons --> | <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons --> |
<meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" /> | <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" /> |
&nbs |