Derive agencyID from the scraper's filename and look up the disclog URL (FOIDocumentsURL) in the agency database
Former-commit-id: eddf90809214ee502e593c769c4bd0b0b2fafc3a
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -8,18 +8,31 @@
import feedparser
import abc
-class GenericRSSDisclogScraper(object):
+class GenericDisclogScraper(object):
__metaclass__ = abc.ABCMeta
- @abc.abstractmethod
+ agencyID = None
+ disclogURL = None
def getAgencyID(self):
""" disclosr agency id """
- return
+ if self.agencyID == None:
+ self.agencyID = os.path.basename(sys.argv[0]).replace(".py","")
+ return self.agencyID
- @abc.abstractmethod
def getURL(self):
""" disclog URL"""
- return
+ if self.disclogURL == None:
+ agency = scrape.agencydb.get(self.getAgencyID())
+ self.disclogURL = agency['FOIDocumentsURL']
+ return self.disclogURL
+ @abc.abstractmethod
+ def doScrape(self):
+ """ do the scraping """
+ return
+
+
+
+class GenericRSSDisclogScraper(GenericDisclogScraper):
def getDescription(self, entry, doc):
""" get description from rss entry"""
doc['description'] = entry.summary
@@ -46,18 +59,8 @@
else:
print "already saved"
-class GenericOAICDisclogScraper(object):
- __metaclass__ = abc.ABCMeta
- @abc.abstractmethod
- def getAgencyID(self):
- """ disclosr agency id """
- return
-
- @abc.abstractmethod
- def getURL(self):
- """ disclog URL"""
- return
-
+class GenericOAICDisclogScraper(GenericDisclogScraper):
+ __metaclass__ = abc.ABCMeta
@abc.abstractmethod
def getColumns(self,columns):
""" rearranges columns if required """
--- a/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py
+++ b/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py
@@ -5,12 +5,6 @@
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getAgencyID(self):
- return "3cd40b1240e987cbcd3f0e67054ce259"
-
- def getURL(self):
- return "http://www.apvma.gov.au/about/foi/disclosure/index.php"
-
def getColumns(self,columns):
(id, date, description, title, notes) = columns
return (id, date, description, title, notes)
@@ -19,3 +13,4 @@
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape()
+
--- a/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py
+++ b/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py
@@ -5,12 +5,6 @@
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getAgencyID(self):
- return "8c9421f852c441910bf1d93a57b31d64"
-
- def getURL(self):
- return "http://www.daff.gov.au/about/foi/ips/disclosure-log"
-
def getColumns(self,columns):
(id, date, title, description, notes) = columns
return (id, date, description, title, notes)
--- a/documents/scrapers/be9996f0ac58f71f23d074e82d44ead3.py
+++ b/documents/scrapers/be9996f0ac58f71f23d074e82d44ead3.py
@@ -6,12 +6,6 @@
from bs4 import BeautifulSoup
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
- def getAgencyID(self):
- return "be9996f0ac58f71f23d074e82d44ead3"
-
- def getURL(self):
- return "http://foi.deewr.gov.au/disclosure-log/rss"
-
def getDescription(self,entry,doc):
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, entry.link, "foidocuments", self.getAgencyID(), False)
if content != None:
--- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
+++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py
@@ -5,12 +5,6 @@
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
- def getAgencyID(self):
- return "be9996f0ac58f71f23d074e82d44ead3"
-
- def getURL(self):
- return "http://foi.deewr.gov.au/disclosure-log/rss"
-
def getColumns(self,columns):
(id, date, title, description, notes) = columns
return (id, date, description, title, notes)
--- a/documents/scrapers/rtk.py
+++ b/documents/scrapers/rtk.py
@@ -5,12 +5,6 @@
#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
- def getAgencyID(self):
- return "be9996f0ac58f71f23d074e82d44ead3"
-
- def getURL(self):
- return "http://foi.deewr.gov.au/disclosure-log/rss"
-
def getColumns(self,columns):
(id, date, title, description, notes) = columns
return (id, date, description, title, notes)
--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -127,7 +127,7 @@
}
function displayLogEntry($row, $idtoname) {
- echo "<div><h2>".$row->value->date.": ".$row->value->title." (".$idtoname[$row->value->agencyID].")</h2> <p>".$row->value->description;
+ echo "<div><h2>".$row->value->date.": ".$row->value->title." (".$idtoname[$row->value->agencyID].")</h2> <p>".str_replace("\n","<br>",$row->value->description);
if (isset($row->value->notes)) {
echo " <br>Note: ".$row->value->notes;
}