From: Maxious <maxious@lambdacomplex.org>
Date: Tue, 27 Nov 2012 05:21:34 +0000
Subject: more scrapers
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=c76480cdb04a2246aae5543485cf8896a2c87740
---
more scrapers


Former-commit-id: 624a5af265cf19be61f44ec7a025acad87daa5f8
---


--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -36,7 +36,7 @@
                 if (file_exists("./scrapers/" . $row->id . '.py')) {
                     echo "<font color='green'>✔</font>";
                 } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
-                    echo "<font color='blue'><b>▬</b></font>";
+                    echo "<font color='orange'><b>▬</b></font>";
                 } else {
                     echo "<font color='red'>✘</font>";
                 }
@@ -50,3 +50,4 @@
 echo "</table>";
 include_footer_documents();
 ?>
+

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -55,7 +55,7 @@
 		  	doc = foidocsdb.get(hash)
 			#print doc
 			if doc == None:
-                        	print "saving"
+                        	print "saving "+ hash
 				edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
                                 doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
                                 "date": edate,"title": entry.title}
@@ -89,9 +89,17 @@
 	def getTable(self, soup):
 		return soup.table
 	def getDate(self, content, entry, doc):
-		edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+		edate = parse(''.join(content.stripped_strings).strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
 		print edate
 		doc.update({'date': edate})
+		return
+	def getLinks(self, content, entry, doc):
+                links = []
+                for atag in entry.find_all("a"):
+                       	if atag.has_key('href'):
+                               	links.append(scrape.fullurl(content,atag['href']))
+                if links != []:
+	                doc.update({'links': links})
 		return
 
 	def doScrape(self):
@@ -106,22 +114,17 @@
 					columns = row.find_all('td')
 					if len(columns) == self.getColumnCount():
 						(id, date, description, title, notes) = self.getColumns(columns)
-						print id.string
+						print ''.join(id.stripped_strings)
 						if id.string == None:
-							hash = scrape.mkhash(self.remove_control_chars(url+date.string))
+							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
 						else:
-							hash = scrape.mkhash(self.remove_control_chars(url+id.string))
-						links = []
-						for atag in row.find_all("a"):
-							if atag.has_key('href'):
-								links.append(scrape.fullurl(url,atag['href']))
+							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
 						doc = foidocsdb.get(hash)
 							
 						if doc == None:
-							print "saving"
+							print "saving " +hash
 							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
-                                			if links != []:
-                                        			doc.update({'links': links})
+							self.getLinks(self.getURL(),row,doc)
                                 			self.getTitle(title,row, doc)
                                 			self.getDate(date,row, doc)
 							self.getDescription(description,row, doc)

--- /dev/null
+++ b/documents/scrapers/0e46f8bd1414b1fdd4f0543d54a97500.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getTable(self,soup):
+                return soup.find(id = "maincontentcontainer").table
+        def getColumnCount(self):
+                return 5
+        def getColumns(self,columns):
+                (date, disclogdate, title, description, notes) = columns
+                return (date, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/1097fa8afdcf5db89d212d0979226667.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumns(self,columns):
+                (id, date, title, description, notes) = columns
+                return (id, date, description, title, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/1ad74ca88932f90f0b92b69387171441.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumns(self,columns):
+                (id,  date, title, description, notes) = columns
+                return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/269680be088f3d8e663251655f3825b4.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumns(self,columns):
+                (date, id, title, description, notes) = columns
+                return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/31b7c75cf484747b6b120680bddd33b0.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumnCount(self):
+                return 2
+        def getColumns(self,columns):
+                (title, date) = columns
+                return (date, date, title, title, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/329fccdca068b78ab7edd550e2957398.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumnCount(self):
+                return 7
+        def getTable(self,soup):
+                return soup.find(class_ = "foi-disclosure")
+        def getColumns(self,columns):
+                (disclogid, id,  date, title, link, removedate, notes) = columns
+                return (id, date, title, title, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py
@@ -1,1 +1,39 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getTable(self,soup):
+                return soup.find(class_ = "ms-rtestate-field").table
+        def getColumns(self,columns):
+                (id, date, title, description, notes) = columns
+                return (id, date, description, title, notes)
+
+        def getLinks(self, content, entry, doc):
+		link = None
+                links = []
+		for atag in entry.find_all('a'):
+			if atag.has_key('href'):
+				link = scrape.fullurl(self.getURL(),atag['href'])			
+                                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                                if htcontent != None:
+                                        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+                                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                                                soup = BeautifulSoup(htcontent)
+                                                for atag in soup.find(class_ = "article-content").find_all('a'):
+	                                               	if atag.has_key('href'):
+        	                                              	links.append(scrape.fullurl(link,atag['href']))
+
+		if links != []:
+                 	doc.update({'links': links})
+                 	doc.update({'url': link})
+		return
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/6fe3c812a99d486963133459b2768cf6.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumns(self,columns):
+                (id, date, title, description, notes) = columns
+                return (id, date, description, title, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/734f12db07e844b30cd11dc98500f2ce.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumnCount(self):
+                return 4
+        def getColumns(self,columns):
+                (id,  date, title, description) = columns
+                return (id, date, title, description, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt

--- /dev/null
+++ b/documents/scrapers/8ef0e5802f99800f514b3a148e013b75.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+	def getColumnCount(self):
+		return 4;
+        def getTable(self,soup):
+                return soup.find(class_ = "content").table
+        def getColumns(self,columns):
+                (id, date, title, description) = columns
+                return (id, date, description, title, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/955dc4cb047b5439dfb65549ce2696a6.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumns(self,columns):
+                (id,  date, title, description, notes) = columns
+                return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/a2d871c5d28de1dde8a3b66c4957e1a5.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumnCount(self):
+                return 7
+        def getColumns(self,columns):
+                (id, date, title, description,link,deldate, notes) = columns
+                return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/c414e65ed728d05307d5b27f13e195e1.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumnCount(self):
+                return 3
+        def getColumns(self,columns):
+                (date, title, description) = columns
+                return (date, date, title, description, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- /dev/null
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt

--- /dev/null
+++ b/documents/scrapers/fa9b3badd6c686398c1c6982a4b02475.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getColumns(self,columns):
+                (id,  date, title, description, notes) = columns
+                return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+