From: maxious Date: Sat, 10 Nov 2012 09:15:21 +0000 Subject: derive agencyID and disclog url from filename X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=8ac510d67c62c859f6f548b3224996516ef1708b --- derive agencyID and disclog url from filename Former-commit-id: eddf90809214ee502e593c769c4bd0b0b2fafc3a --- --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -8,18 +8,31 @@ import feedparser import abc -class GenericRSSDisclogScraper(object): +class GenericDisclogScraper(object): __metaclass__ = abc.ABCMeta - @abc.abstractmethod + agencyID = None + disclogURL = None def getAgencyID(self): """ disclosr agency id """ - return + if self.agencyID == None: + self.agencyID = os.path.basename(sys.argv[0]).replace(".py","") + return self.agencyID - @abc.abstractmethod def getURL(self): """ disclog URL""" - return + if self.disclogURL == None: + agency = scrape.agencydb.get(self.getAgencyID()) + self.disclogURL = agency['FOIDocumentsURL'] + return self.disclogURL + @abc.abstractmethod + def doScrape(self): + """ do the scraping """ + return + + + +class GenericRSSDisclogScraper(GenericDisclogScraper): def getDescription(self, entry, doc): """ get description from rss entry""" doc['description'] = entry.summary @@ -46,18 +59,8 @@ else: print "already saved" -class GenericOAICDisclogScraper(object): - __metaclass__ = abc.ABCMeta - @abc.abstractmethod - def getAgencyID(self): - """ disclosr agency id """ - return - - @abc.abstractmethod - def getURL(self): - """ disclog URL""" - return - +class GenericOAICDisclogScraper(GenericDisclogScraper): + __metaclass__ = abc.ABCMeta @abc.abstractmethod def getColumns(self,columns): """ rearranges columns if required """ --- a/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py +++ b/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py @@ -5,12 +5,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getAgencyID(self): - return "3cd40b1240e987cbcd3f0e67054ce259" - - def getURL(self): - return "http://www.apvma.gov.au/about/foi/disclosure/index.php" - def getColumns(self,columns): (id, date, description, title, notes) = columns return (id, date, description, title, notes) @@ -19,3 +13,4 @@ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) ScraperImplementation().doScrape() + --- a/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py +++ b/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py @@ -5,12 +5,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getAgencyID(self): - return "8c9421f852c441910bf1d93a57b31d64" - - def getURL(self): - return "http://www.daff.gov.au/about/foi/ips/disclosure-log" - def getColumns(self,columns): (id, date, title, description, notes) = columns return (id, date, description, title, notes) --- a/documents/scrapers/be9996f0ac58f71f23d074e82d44ead3.py +++ b/documents/scrapers/be9996f0ac58f71f23d074e82d44ead3.py @@ -6,12 +6,6 @@ from bs4 import BeautifulSoup #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): - def getAgencyID(self): - return "be9996f0ac58f71f23d074e82d44ead3" - - def getURL(self): - return "http://foi.deewr.gov.au/disclosure-log/rss" - def getDescription(self,entry,doc): (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, entry.link, "foidocuments", self.getAgencyID(), False) if content != None: --- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py +++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py @@ -5,12 +5,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): - def getAgencyID(self): - return "be9996f0ac58f71f23d074e82d44ead3" - - def getURL(self): - return "http://foi.deewr.gov.au/disclosure-log/rss" - def getColumns(self,columns): (id, date, title, description, notes) = columns return (id, date, description, title, notes) --- a/documents/scrapers/rtk.py +++ b/documents/scrapers/rtk.py @@ -5,12 +5,6 @@ #http://www.doughellmann.com/PyMOTW/abc/ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): - def getAgencyID(self): - return "be9996f0ac58f71f23d074e82d44ead3" - - def getURL(self): - return "http://foi.deewr.gov.au/disclosure-log/rss" - def getColumns(self,columns): (id, date, title, description, notes) = columns return (id, date, description, title, notes) --- a/documents/template.inc.php +++ b/documents/template.inc.php @@ -127,7 +127,7 @@ } function displayLogEntry($row, $idtoname) { - echo "

".$row->value->date.": ".$row->value->title." (".$idtoname[$row->value->agencyID].")

".$row->value->description; + echo "

".$row->value->date.": ".$row->value->title." (".$idtoname[$row->value->agencyID].")

".str_replace("\n","
",$row->value->description); if (isset($row->value->notes)) { echo "
Note: ".$row->value->notes; }