more scrapers
[disclosr.git] / documents / scrapers / 53d2884f8afd026096a27bd5051ec50e.py
blob:a/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py -> blob:b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py
--- a/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py
+++ b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py
@@ -1,1 +1,39 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getTable(self,soup):
+                return soup.find(class_ = "ms-rtestate-field").table
+        def getColumns(self,columns):
+                (id, date, title, description, notes) = columns
+                return (id, date, description, title, notes)
+
+        def getLinks(self, content, entry, doc):
+		link = None
+                links = []
+		for atag in entry.find_all('a'):
+			if atag.has_key('href'):
+				link = scrape.fullurl(self.getURL(),atag['href'])			
+                                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                                if htcontent != None:
+                                        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+                                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                                                soup = BeautifulSoup(htcontent)
+                                                for atag in soup.find(class_ = "article-content").find_all('a'):
+	                                               	if atag.has_key('href'):
+        	                                              	links.append(scrape.fullurl(link,atag['href']))
+
+		if links != []:
+                 	doc.update({'links': links})
+                 	doc.update({'url': link})
+		return
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+