DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |
cd $DIR | |
echo "" > /tmp/disclosr-error | echo "" > /tmp/disclosr-error |
for f in scrapers/*.py; do | for f in scrapers/*.py; do |
echo "Processing $f file.."; | echo "Processing $f file.."; |
md5=`md5sum /tmp/disclosr-error` | md5=`md5sum /tmp/disclosr-error` |
python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; | python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; |
md52=`md5sum /tmp/disclosr-error` | md52=`md5sum /tmp/disclosr-error` |
if [ "$md5" != "$md52" ]; then | if [ "$md5" != "$md52" ]; then |
echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; | echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; |
fi | fi |
if [ "$?" -ne "0" ]; then | if [ "$?" -ne "0" ]; then |
echo "error"; | echo "error"; |
sleep 1; | sleep 1; |
fi | fi |
done | done |
if [ -s /tmp/disclosr-error ] ; then | if [ -s /tmp/disclosr-error ] ; then |
echo "emailling logs.."; | echo "emailling logs.."; |
mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; | mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; |
fi | fi |
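
# --- CSIRO FOI disclosure log scraper (a standalone file under scrapers/,
# run by the loop above; filename not shown here) ---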
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
from dateutil.parser import parse

# http://www.doughellmann.com/PyMOTW/abc/
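# This subclass fills in the template methods the generic OAIC disclosure-log
# scraper is assumed to call: getTable/getRows locate the rows to scrape,
# getColumnCount/getColumns map each row's cells onto
# (id, date, title, description, notes), and getDate/getTitle override the
# default cell handling where CSIRO's markup needs special-casing.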
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getTable(self, soup):
        # The disclosure log is the table inside the page's inner column.
        return soup.find(class_="inner-column").table

    def getRows(self, table):
        return table.tbody.find_all('tr', recursive=False)

    def getColumnCount(self):
        return 3

    def getColumns(self, columns):
        (date, title, description) = columns
        # There is no separate id column, so the date doubles as the row id.
        return (date, date, title, description, None)
    def getDate(self, content, entry, doc):
        # The date is the second stripped string inside the cell.
        i = 0
        date = ""
        for string in content.stripped_strings:
            if i == 1:
                date = string
            i += 1
        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return
    def getTitle(self, content, entry, doc):
        # The title is the first two stripped strings of the cell, joined.
        i = 0
        title = ""
        for string in content.stripped_strings:
            if i < 2:
                title = title + string
            i += 1
        doc.update({'title': title})
        return
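
# Example with hypothetical cell contents: if a date cell yields the stripped
# strings ["Request 1", "3 August 2012"], getDate stores "2012-08-03"; if a
# title cell yields ["FOI", "2012/01", "more detail"], getTitle stores the
# first two strings joined as "FOI2012/01".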
if __name__ == '__main__':
    # http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx
    # http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
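
# --- ACMA disclosure log scraper (another standalone file under scrapers/);
# unlike the CSIRO scraper it overrides doScrape() wholesale because the
# ACMA log spreads each request across several table rows ---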
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from dateutil.parser import parse
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        d = pq(content)
        d.make_links_absolute(base_url=self.getURL())
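        # Each disclosure-log table is assumed to have exactly six rows in its
        # tbody (date, two description rows, link, deletion date, notes), with
        # the value of interest in the second cell of each row; the
        # destructuring below relies on that fixed shape.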
        for table in d('table').items():
            title = table('thead').text()
            print self.remove_control_chars(title)
            (idate, descA, descB, link, deldate, notes) = table('tbody tr').map(
                lambda i, e: pq(e).children().eq(1).text())
            links = table('a').map(lambda i, e: pq(e).attr('href'))
            description = descA + " " + descB
            try:
                edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
            except ValueError:
                # Fall back to today's date if the cell can't be parsed.
                edate = date.today().strftime("%Y-%m-%d")
            print edate
            # Hash the cleaned title to get a stable document id for dedup.
            dochash = scrape.mkhash(self.remove_control_chars(title))
            doc = foidocsdb.get(dochash)
            if doc is None:
                print "saving " + dochash
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                       'url': self.getURL(), 'docID': dochash,
                       'links': links, 'date': edate, 'notes': notes,
                       'title': title, 'description': description}
                foidocsdb.save(doc)
            else:
                print "already saved"
if __name__ == '__main__':
    print 'Subclass:', issubclass(ACMADisclogScraper,
        genericScrapers.GenericDisclogScraper)
    print 'Instance:', isinstance(ACMADisclogScraper(),
        genericScrapers.GenericDisclogScraper)
    ACMADisclogScraper().doScrape()
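
# A minimal sketch of how another agency scraper could plug into the same
# framework, assuming the generic base class supplies a default doScrape()
# driven by getURL()/getAgencyID(); the class name, URL and agency id below
# are placeholders, not real endpoints:
#
#   class ExampleDisclogScraper(genericScrapers.GenericDisclogScraper):
#       def getURL(self):
#           return "http://www.example.gov.au/foi/disclosure-log"
#
#       def getAgencyID(self):
#           return "www.example.gov.au"
#
#   if __name__ == '__main__':
#       ExampleDisclogScraper().doScrape()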