import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
import parsedatetime as pdt
from time import mktime
from datetime import datetime
import feedparser
import abc

class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def getAgencyID(self):
        """ disclosr agency id, derived from the scraper's filename by default """
        if self.agencyID == None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL, looked up from the agency database by default """
        if self.disclogURL == None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return

class GenericRSSDisclogScraper(GenericDisclogScraper):
    def getDescription(self, entry, doc):
        """ get description from rss entry """
        doc['description'] = entry.summary
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            print entry.id
            hash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(hash)
            if doc == None:
                print "saving"
                edate = datetime.fromtimestamp(mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
                       "date": edate, "title": entry.title}
                self.getDescription(entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def doScrape(self):
        cal = pdt.Calendar()
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content != None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        hash = scrape.mkhash(url + id.string)
                        links = []
                        for atag in row.find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(url, atag['href']))
                        doc = foidocsdb.get(hash)
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + " \n" + string
                        if doc == None:
                            print "saving"
                            dtresult = cal.parseDateText(date.string)
                            if len(dtresult) == 2:
                                (dtdate, dtr) = dtresult
                                print dtdate
                                edate = str(dtdate[0]) + '-' + str(dtdate[1]) + '-' + str(dtdate[2])
                            else:
                                edate = ""
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links,
                                   'docID': id.string, "date": edate, "description": descriptiontxt,
                                   "title": title.string, "notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"
                    elif len(row.find_all('th')) == 5:
                        print "header row"
                    else:
                        print "ERROR number of columns incorrect"
                        print row
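The scrape module these classes depend on is not reproduced in this section. As a rough sketch only (the helper names mkhash, fullurl and fetchURL come from the calls above, but the bodies below are assumptions, not the project's real implementation; the name of fetchURL's fifth argument is a guess):

# Sketch of the scrape helpers assumed above -- NOT the project's actual
# code, just the minimal contract the scrapers appear to rely on.
import hashlib
import urllib2
import urlparse

def mkhash(input):
    # stable id string, usable as a CouchDB _id, derived from a URL or docID
    return hashlib.md5(input).hexdigest()

def fullurl(url, href):
    # resolve a possibly-relative link against the page it appeared on
    return urlparse.urljoin(url, href)

def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
    # plain GET returning (url, mime_type, content); the real helper
    # presumably also records the response in docsdb, which this omits
    response = urllib2.urlopen(url)
    return (url, response.info().gettype(), response.read())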
<?php
include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');
?>
<?php
$agenciesdb = $server->get_db('disclosr-agencies');

$idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
    $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00'), true)->rows;
    if ($rows) {
        foreach ($rows as $row) {
            //print_r($row);
            displayLogEntry($row, $idtoname);
            /* 1/1/11 title (Dept dfggdfgdf)
               description:
               source link:
               documents:
               #1 title link */
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
include_footer_documents();
?>
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
# RSS feed not detailed
# scrapes http://www.apvma.gov.au/about/foi/disclosure/index.php
# (agencyID 3cd40b1240e987cbcd3f0e67054ce259)
# http://www.doughellmann.com/PyMOTW/abc/

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getColumns(self, columns):
        (id, date, description, title, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
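Because getAgencyID() falls back to the scraper's own filename, each per-agency script is expected to be named <agencyID>.py. A hypothetical runner (not part of the source above; the scrapers/ directory name is an assumption) could then sweep them all:

# Hypothetical runner -- not in the original source. Assumes each scraper
# lives in a scrapers/ directory and is named <agencyID>.py, as the
# getAgencyID() fallback in GenericDisclogScraper expects.
import os
import subprocess

SCRAPER_DIR = os.path.join(os.path.dirname(__file__) or '.', 'scrapers')

for fname in sorted(os.listdir(SCRAPER_DIR)):
    if fname.endswith('.py'):
        print "running", fname
        subprocess.call(['python', fname], cwd=SCRAPER_DIR)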
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
# RSS feed not detailed
# scrapes http://www.daff.gov.au/about/foi/ips/disclosure-log
# (agencyID 8c9421f852c441910bf1d93a57b31d64)
# http://www.doughellmann.com/PyMOTW/abc/

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getColumns(self, columns):
        # source table lists title before description; swap into canonical order
        (id, date, title, description, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
# scrapes the RSS feed at http://foi.deewr.gov.au/disclosure-log/rss
# http://www.doughellmann.com/PyMOTW/abc/

class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
    def getDescription(self, entry, doc):
        """ fetch the linked page and extract download links and a fuller description """
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, entry.link, "foidocuments", self.getAgencyID(), False)
        if content != None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                links = []
                description = ""
                dldivs = soup.find('div', class_="download")
                if dldivs != None:
                    for atag in dldivs.find_all("a"):
                        if atag.has_key('href'):
                            links.append(scrape.fullurl(url, atag['href']))
                nodldivs = soup.find('div', class_="incompleteNotification")
                if nodldivs != None and nodldivs.stripped_strings != None:
                    for text in nodldivs.stripped_strings:
                        description = description + text
                for row in soup.table.find_all('tr'):
                    if row != None:
                        description = description + "\n" + row.find('th').string + ": "
                        for text in row.find('div').stripped_strings:
                            description = description + text
                if links != []:
                    doc.update({'links': links})
                if description != "":
                    doc.update({'description': description})

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper)
    ScraperImplementation().doScrape()
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
# RSS feed not detailed
# http://www.doughellmann.com/PyMOTW/abc/

class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
    # note: getColumns is never called by GenericRSSDisclogScraper.doScrape;
    # it looks like a leftover from the table-based (OAIC) scraper template
    def getColumns(self, columns):
        (id, date, title, description, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper)
    ScraperImplementation().doScrape()
www.finance.gov.au/foi/disclosure-log/foi-rss.xml
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
# RSS feed not detailed
# http://www.doughellmann.com/PyMOTW/abc/

class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
    def getColumns(self, columns):
        (id, date, title, description, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper)
    ScraperImplementation().doScrape()
http://www.righttoknow.org.au/feed/search/%20(latest_status:successful%20OR%20latest_status:partially_successful)
<?php
function include_header_documents($title) {
?>
<!doctype html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
    <meta charset="utf-8">
    <!-- Use the .htaccess and remove these lines to avoid edge case issues.
         More info: h5bp.com/i/378 -->
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
    <meta name="description" content="">
    <!-- Mobile viewport optimized: h5bp.com/viewport -->
    <meta name="viewport" content="width=device-width">
    <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
    <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />