more scrapers
[disclosr.git] / documents / scrapers / 8e874a2fde8aa0ccdc6d14573d766540.txt
blob:a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt -> blob:b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
@@ -1,1 +1,49 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
 
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+        def getDescription(self,content, entry,doc):
+		link = None
+                links = []
+                description = ""
+		for atag in entry.find_all('a'):
+			if atag.has_key('href'):
+				link = scrape.fullurl(self.getURL(),atag['href'])			
+                                (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                                if htcontent != None:
+                                        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+                                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                                                soup = BeautifulSoup(htcontent)
+                                                for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'):
+                                                        if row != None:
+								rowtitle = row.find('th').string
+                                                                description = description + "\n" + rowtitle + ": "
+                                                                for text in row.find('td').stripped_strings:
+                                                                        description = description + text
+                                                     		for atag in row.find_all("a"):
+                                                                	if atag.has_key('href'):
+                                                                        	links.append(scrape.fullurl(link,atag['href']))
+
+		if links != []:
+                 	doc.update({'links': links})
+                if description != "":
+                        doc.update({ 'description': description})
+
+	def getColumnCount(self):
+		return 2
+	def getTable(self,soup):
+		return soup.find(class_ = "ms-rteTable-GreyAlternating")
+	def getColumns(self,columns):
+		(date, title) = columns
+		return (title, date, title, title, None)
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+# old site too http://archive.treasury.gov.au/content/foi_publications.asp
+