From: Maxious
Date: Fri, 30 Nov 2012 12:50:36 +0000
Subject: more scrapers
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=bffa93e6d665a9a84f42927ea7a61776fccc2bf9

---
more scrapers

Former-commit-id: a1b071fe60c375afcb0527b3c5a86844b8f0035b
---
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -12,6 +12,7 @@
 $disclogs = 0;
 $red = 0;
 $green = 0;
+$yellow = 0;
 $orange = 0;
 try {
     $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
@@ -46,8 +47,14 @@
             echo "";
             $green++;
         } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+            if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
+                echo "";
+                $yellow++;
+            } else {
+                echo file_get_contents("./scrapers/" . $row->id . '.txt');
                 echo "";
                 $orange++;
+            }
         } else {
             echo "";
             $red++;
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -9,6 +9,7 @@
 import dateutil
 from dateutil.parser import *
 from datetime import *
+import codecs
 
 class GenericDisclogScraper(object):
     __metaclass__ = abc.ABCMeta
@@ -120,7 +121,7 @@
             columns = row.find_all('td')
             if len(columns) == self.getColumnCount():
                 (id, date, title, description, notes) = self.getColumns(columns)
-                print ''.join(id.stripped_strings)
+                print ''.join(id.stripped_strings).encode('ascii', 'ignore')
                 if id.string == None:
                     hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
                 else:
--- a/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
+++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
@@ -1,2 +1,1 @@
-see parent dhs
-
+no disclog
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
@@ -1,1 +1,1 @@
-
+acma style
--- /dev/null
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
@@ -1,1 +1,80 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import codecs
+#http://www.doughellmann.com/PyMOTW/abc/
+class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    def getDescription(self,content, entry,doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(),atag['href'])
+                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                        soup = BeautifulSoup(htcontent)
+                        for text in soup.find(id="divFullWidthColumn").stripped_strings:
+                            description = description + text.encode('ascii', 'ignore')
+
+                        for atag in soup.find(id="divFullWidthColumn").find_all("a"):
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(link,atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        return 2
+    def getTable(self,soup):
+        return soup.find(id = "TwoColumnSorting")
+    def getColumns(self,columns):
+        (title, date) = columns
+        return (title, date, title, title, None)
+class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    def getDescription(self,content, entry,doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(),atag['href'])
+                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                        soup = BeautifulSoup(htcontent)
+                        for text in soup.find(id="content-item").stripped_strings:
+                            description = description + text + " \n"
+                        for atag in soup.find(id="content-item").find_all("a"):
+                            if atag.has_key('href'):
+                                links.append(scrape.fullurl(link,atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        return 2
+    def getTable(self,soup):
+        return soup.find(class_ = "doc-list")
+    def getColumns(self,columns):
+        (date, title) = columns
+        return (title, date, title, title, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    #NewScraperImplementation().doScrape()
+    print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    osi = OldScraperImplementation()
+    osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
+    osi.doScrape()
+# old site too
+
--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
+++ /dev/null
@@ -1,49 +1,1 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
-    def getDescription(self,content, entry,doc):
-        link = None
-        links = []
-        description = ""
-        for atag in entry.find_all('a'):
-            if atag.has_key('href'):
-                link = scrape.fullurl(self.getURL(),atag['href'])
-                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
-                if htcontent != None:
-                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
-                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
-                        soup = BeautifulSoup(htcontent)
-                        for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'):
-                            if row != None:
-                                rowtitle = row.find('th').string
-                                description = description + "\n" + rowtitle + ": "
-                                for text in row.find('td').stripped_strings:
-                                    description = description + text
-                                for atag in row.find_all("a"):
-                                    if atag.has_key('href'):
-                                        links.append(scrape.fullurl(link,atag['href']))
-
-        if links != []:
-            doc.update({'links': links})
-        if description != "":
-            doc.update({'description': description})
-
-    def getColumnCount(self):
-        return 2
-    def getTable(self,soup):
-        return soup.find(class_ = "ms-rteTable-GreyAlternating")
-    def getColumns(self,columns):
-        (date, title) = columns
-        return (title, date, title, title, None)
-
-if __name__ == '__main__':
-    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
-    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
-    ScraperImplementation().doScrape()
-# old site too http://archive.treasury.gov.au/content/foi_publications.asp
-
--- a/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
+++ b/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
@@ -1,2 +1,1 @@
-# does not have any disclog entries or table
-
+no disclog
--- a/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
+++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
@@ -1,1 +1,1 @@
-no disclog yet
+no disclog
--- a/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
+++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
@@ -1,2 +1,1 @@
-no log
-
+no disclog
--- a/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
@@ -1,1 +1,1 @@
-
+acma style
--- a/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
+++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
@@ -1,2 +1,1 @@
-c'est ne pas une table
-
+no disclog
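
Note on the disclogsList.php change: the dashboard now distinguishes four states per agency. Yellow marks a .txt file whose content is exactly "no disclog" (the agency is confirmed to publish no disclosure log), orange marks any other placeholder note (such as "acma style"), and red marks agencies with no scraper file at all; the green branch sits above this hunk, and treating it as "a .py scraper exists" is an assumption here. A minimal sketch of that classification in Python rather than PHP, with the directory layout taken from the paths in this patch:

import os

SCRAPERS = "./documents/scrapers"  # assumed location, per the paths in this diff

def status(agency_id):
    # Mirrors the green/yellow/orange/red logic added to disclogsList.php.
    py = os.path.join(SCRAPERS, agency_id + ".py")
    txt = os.path.join(SCRAPERS, agency_id + ".txt")
    if os.path.exists(py):
        return "green"       # working scraper (assumed green condition)
    if os.path.exists(txt):
        if open(txt).read().strip() == "no disclog":
            return "yellow"  # confirmed: nothing to scrape
        return "orange"      # placeholder note, scraper still to be written
    return "red"             # agency not yet investigated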
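
Note on the genericScrapers.py change: under Python 2, print implicitly encodes a unicode string with stdout's encoding, and BeautifulSoup's stripped_strings yields unicode. When stdout has no UTF-8 encoding (typical when a scraper runs under cron or through a pipe), any non-ASCII character in a scraped title raises UnicodeEncodeError and kills the run; encoding explicitly with 'ignore' trades the odd dropped character for robustness. An illustrative snippet (Python 2, values invented):

# -*- coding: utf-8 -*-
title = u'Disclosure log \u2013 November'  # en dash, common in scraped HTML
# print title  # raises UnicodeEncodeError when sys.stdout.encoding is None or ascii
print title.encode('ascii', 'ignore')      # prints 'Disclosure log  November'; never raises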
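
Note on the new Treasury scraper: both implementations follow the hook contract of genericScrapers.GenericOAICDisclogScraper used throughout this repo. getTable locates the disclosure-log table in the parsed page, getColumnCount and getColumns map each row's cells onto the (id, date, title, description, notes) tuple the base class unpacks, and getDescription optionally fetches linked detail pages. A new scraper for a simple two-column log therefore reduces to the sketch below; the table id and column order are hypothetical:

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getTable(self, soup):
        return soup.find(id="disclog")  # hypothetical id; use the real page's table
    def getColumnCount(self):
        return 2
    def getColumns(self, columns):
        (date, title) = columns
        # (id, date, title, description, notes), as expected by the base class
        return (title, date, title, title, None)

if __name__ == '__main__':
    ScraperImplementation().doScrape()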