From: Maxious
Date: Sun, 17 Mar 2013 07:22:38 +0000
Subject: ignore CSIRO disclog formatting
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=ce8a1e63f91a6cc81d644a787b0cd5d115370965
---
ignore CSIRO disclog formatting

Former-commit-id: 225487e3a636c4cf2758c6ad60437c748bc1df7c
---

--- a/documents/runScrapers.sh
+++ b/documents/runScrapers.sh
@@ -1,3 +1,5 @@
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
 echo "" > /tmp/disclosr-error
 for f in scrapers/*.py; do
     echo "Processing $f file..";
--- a/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
+++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
@@ -8,42 +8,14 @@
 from datetime import *

 #http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getTable(self,soup):
-        return soup.find(class_ = "inner-column").table
-    def getRows(self,table):
-        return table.tbody.find_all('tr',recursive=False)
+class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
     def getColumnCount(self):
-        return 3
-    def getColumns(self,columns):
-        (date, title, description) = columns
-        return (date, date, title, description, None)
-    def getDate(self, content, entry, doc):
-        i = 0
-        date = ""
-        for string in content.stripped_strings:
-            if i ==1:
-                date = string
-            i = i+1
-        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
-        print edate
-        doc.update({'date': edate})
-        return
-    def getTitle(self, content, entry, doc):
-        i = 0
-        title = ""
-        for string in content.stripped_strings:
-            if i < 2:
-                title = title + string
-            i = i+1
-        doc.update({'title': title})
-        #print title
-        return
+        return 0

 if __name__ == '__main__':
 #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2012-13.aspx
 #http://www.csiro.au/Portals/About-CSIRO/How-we-work/Governance/FOI-Request-Disclosure-Log-2011-12.aspx
-    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
-    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericHTMLDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericHTMLDisclogScraper)
     ScraperImplementation().doScrape()
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
@@ -21,7 +21,7 @@
         d.make_links_absolute(base_url = self.getURL())
         for table in d('table').items():
             title= table('thead').text()
-            print title
+            print self.remove_control_chars(title)
             (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
             links = table('a').map(lambda i, e: pq(e).attr('href'))
             description = descA+" "+descB
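
Note (not part of the commit): the change above moves the CSIRO scraper from the column-oriented GenericOAICDisclogScraper to GenericHTMLDisclogScraper and reports a column count of 0, i.e. the disclosure-log page is captured as unstructured HTML rather than parsed row by row. Below is a minimal self-contained sketch of that fallback idea; the class and helper names are hypothetical stand-ins, not the real genericScrapers API.

    import re

    def remove_control_chars(text):
        # Drop non-printable characters, analogous to the cleanup the
        # 7c6adc1d... scraper now applies to table titles before printing.
        return "".join(c for c in text if ord(c) >= 32 or c in "\n\t")

    class HTMLDisclogSketch(object):
        """Hypothetical stand-in for genericScrapers.GenericHTMLDisclogScraper."""

        def getColumnCount(self):
            # 0 columns signals "no table structure to parse" in this sketch.
            return 0

        def doScrape(self, html):
            # Rather than extracting dates/titles from table rows, keep the
            # whole page as one cleaned text blob.
            text = re.sub(r"<[^>]+>", " ", html)  # crude tag stripping, sketch only
            return {"description": " ".join(remove_control_chars(text).split())}

    if __name__ == '__main__':
        page = "<div><h1>FOI \x0bdisclosure log</h1><p>2012-13 requests</p></div>"
        print(HTMLDisclogSketch().doScrape(page))

This trades structured date/title fields for robustness: a disclosure log whose HTML layout keeps changing (as the CSIRO one apparently did) can still be recorded as a single document without the scraper breaking on every redesign.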