ignore CSIRO disclog formatting


Former-commit-id: 225487e3a636c4cf2758c6ad60437c748bc1df7c

  DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
  cd $DIR
echo "" > /tmp/disclosr-error echo "" > /tmp/disclosr-error
for f in scrapers/*.py; do for f in scrapers/*.py; do
echo "Processing $f file.."; echo "Processing $f file..";
md5=`md5sum /tmp/disclosr-error` md5=`md5sum /tmp/disclosr-error`
python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error;
md52=`md5sum /tmp/disclosr-error` md52=`md5sum /tmp/disclosr-error`
if [ "$md5" != "$md52" ]; then if [ "$md5" != "$md52" ]; then
echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error;
fi fi
if [ "$?" -ne "0" ]; then if [ "$?" -ne "0" ]; then
echo "error"; echo "error";
sleep 1; sleep 1;
fi fi
done done
if [ -s /tmp/disclosr-error ] ; then if [ -s /tmp/disclosr-error ] ; then
echo "emailling logs.."; echo "emailling logs..";
mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ;
fi fi
   
   
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
import dateutil
from dateutil.parser import *
from datetime import *

#http://www.doughellmann.com/PyMOTW/abc/
# The CSIRO disclosure log's formatting is ignored: the page is scraped as a
# whole via GenericHTMLDisclogScraper rather than parsed as an OAIC-style table
# (the previous GenericOAICDisclogScraper approach).
class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
    def getColumnCount(self):
        return 0

if __name__ == '__main__':
    #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx
    #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericHTMLDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericHTMLDisclogScraper)
    ScraperImplementation().doScrape()
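GenericHTMLDisclogScraper itself lives in genericScrapers.py and is not shown in this commit. As a rough sketch of the approach the CSIRO scraper now relies on, assuming the same scrape helpers used by the ACMA scraper below (fetchURL, mkhash and the CouchDB handles), a whole-page scraper can simply hash the fetched HTML and record a new document whenever the page changes; getURL() and getAgencyID() are assumed to come from the surrounding scraper framework, and everything apart from doScrape is illustrative only.

# Sketch only: not the real genericScrapers.GenericHTMLDisclogScraper.
import scrape
from datetime import date

class WholePageDisclogSketch(object):
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        # Hash the raw page instead of parsing rows, so any change to the
        # disclosure log is captured regardless of its formatting.
        dochash = scrape.mkhash(content)
        if foidocsdb.get(dochash) is None:
            print "saving " + dochash
            foidocsdb.save({'_id': dochash, 'agencyID': self.getAgencyID(),
                'url': self.getURL(), 'docID': dochash,
                'date': date.today().strftime("%Y-%m-%d"),
                'title': "Disclosure log updated"})
        else:
            print "already saved"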
   
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import dateutil
from dateutil.parser import *
   
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())

        d = pq(content)
        d.make_links_absolute(base_url=self.getURL())
        for table in d('table').items():
            title = table('thead').text()
            print self.remove_control_chars(title)
            # Each row's second cell holds the value of interest; the table has six rows.
            (idate, descA, descB, link, deldate, notes) = table('tbody tr').map(
                lambda i, e: pq(e).children().eq(1).text())
            links = table('a').map(lambda i, e: pq(e).attr('href'))
            description = descA + " " + descB
            try:
                edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
            except ValueError:
                edate = date.today().strftime("%Y-%m-%d")
            print edate
            dochash = scrape.mkhash(self.remove_control_chars(title))
            doc = foidocsdb.get(dochash)
            if doc is None:
                print "saving " + dochash
                edate = date.today().strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                    'url': self.getURL(), 'docID': dochash,
                    "links": links,
                    "date": edate, "notes": notes, "title": title, "description": description}
                #print doc
                foidocsdb.save(doc)
            else:
                print "already saved"
   
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ACMADisclogScraper,
        genericScrapers.GenericDisclogScraper)
    print 'Instance:', isinstance(ACMADisclogScraper(),
        genericScrapers.GenericDisclogScraper)
    ACMADisclogScraper().doScrape()
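The ACMA scraper above routes the table heading through self.remove_control_chars() before printing it and hashing it into the document ID. That helper is defined in genericScrapers.py and is not part of this commit; a hypothetical sketch of the kind of filter it performs:

# Hypothetical: the real remove_control_chars in genericScrapers.py may differ.
def remove_control_chars(text):
    # Drop ASCII control codes (below 0x20) apart from ordinary whitespace,
    # which otherwise end up inside CouchDB document IDs and printed titles.
    return "".join(c for c in text if ord(c) >= 32 or c in "\t\n\r")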