add generic disclosure log scraper base class, an RSS scraper, and a generic OAIC-format table scraper
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,28 +1,73 @@
 import sys,os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
-
 from bs4 import BeautifulSoup
+import parsedatetime as pdt
+from time import mktime
+from datetime import datetime
+import feedparser
 import abc
 
-class GenericOAICDisclogScraper(object):
-	__metaclass__ = abc.ABCMeta
+class GenericDisclogScraper(object):
+	__metaclass__ = abc.ABCMeta
+	agencyID = None
+	disclogURL = None
+	def getAgencyID(self):
+		""" disclosr agency id, derived from the scraper script's filename """
+		if self.agencyID is None:
+			self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
+		return self.agencyID
+
+	def getURL(self):
+		""" disclosure log URL, looked up from this agency's record """
+		if self.disclogURL is None:
+			agency = scrape.agencydb.get(self.getAgencyID())
+			self.disclogURL = agency['FOIDocumentsURL']
+		return self.disclogURL
+
 	@abc.abstractmethod
-	def getAgencyID(self):
-		""" disclosr agency id """
+	def doScrape(self):
+		""" do the scraping """
 		return
 
-	@abc.abstractmethod
-	def getURL(self):
-		""" disclog URL"""
+
+
+class GenericRSSDisclogScraper(GenericDisclogScraper):
+	def getDescription(self, entry, doc):
+		""" get description from rss entry """
+		doc['description'] = entry.summary
 		return
 
+	def doScrape(self):
+		foidocsdb = scrape.couch['disclosr-foidocuments']
+		(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
+		feed = feedparser.parse(content)
+		for entry in feed.entries:
+			# hash the entry's guid to get a stable couchdb document id
+			print entry.id
+			hash = scrape.mkhash(entry.id)
+			doc = foidocsdb.get(hash)
+			# only save entries we have not seen before
+			if doc is None:
+				print "saving"
+				edate = datetime.fromtimestamp(mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+				doc = {'_id': hash, 'agencyID': self.getAgencyID(),
+					'url': entry.link, 'docID': entry.id,
+					"date": edate, "title": entry.title}
+				self.getDescription(entry, doc)
+				foidocsdb.save(doc)
+			else:
+				print "already saved"
+
+class GenericOAICDisclogScraper(GenericDisclogScraper):
+	__metaclass__ = abc.ABCMeta
 	@abc.abstractmethod
 	def getColumns(self,columns):
 		""" rearranges columns if required """
 		return
 
 	def doScrape(self):
+		cal = pdt.Calendar()
 		foidocsdb = scrape.couch['disclosr-foidocuments']
 		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
 		if content != None:
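(Usage note: a concrete RSS scraper is now just a tiny subclass of the classes added above. The sketch below is hypothetical, and the script name and import are assumptions, but it relies only on behaviour shown in this hunk: the agency id is derived from the script's filename and the feed URL comes from the agency record.)

# exampleAgencyRSS.py (hypothetical): the filename doubles as the disclosr
# agencyID because GenericDisclogScraper derives it from sys.argv[0]
import genericScrapers

class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
	def getDescription(self, entry, doc):
		# override point for feeds whose summaries need cleaning up;
		# here we just keep the default behaviour
		doc['description'] = entry.summary

if __name__ == '__main__':
	ScraperImplementation().doScrape()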
@@ -42,12 +87,19 @@
 						doc = foidocsdb.get(hash)
 						descriptiontxt = ""
 						for string in description.stripped_strings:
-							descriptiontxt = descriptiontxt + string
+							descriptiontxt = descriptiontxt + " \n" + string
 							
 						if doc == None:
 							print "saving"
+							dtresult = cal.parse(date.string)
+							if len(dtresult) == 2:
+								(dtdate, dtr) = dtresult
+								print dtdate
+								edate = "%d-%d-%d" % (dtdate[0], dtdate[1], dtdate[2])
+							else:
+								edate = ""
 							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
-			 				 "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
+							 "date": edate, "description": descriptiontxt, "title": title.string, "notes": notes.string}
 							foidocsdb.save(doc)
 						else:
 							print "already saved"
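A note on the date handling above: parsedatetime's Calendar.parse() returns a (time_struct, parse_status) pair, which is what the len(dtresult) == 2 unpacking relies on. A minimal sketch of the normalisation, with a made-up input string:

import parsedatetime as pdt

cal = pdt.Calendar()
(dtdate, dtr) = cal.parse("3 December 2012")
# dtdate is a struct_time-like tuple whose first three fields are
# year, month and day; dtr is 0 when parsing failed
if dtr != 0:
	edate = "%d-%d-%d" % (dtdate[0], dtdate[1], dtdate[2])
else:
	edate = ""
print edate  # 2012-12-3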
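An OAIC-format table scraper then only has to implement getColumns() to map an agency's table cells onto the fields doScrape() expects. The sketch below is hypothetical and assumes the tuple is unpacked as (id, date, title, description, notes); check the base class for the exact order it uses.

# exampleAgencyOAIC.py (hypothetical); the column order is an assumption,
# not confirmed by the diff above
import genericScrapers

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
	def getColumns(self, columns):
		# this agency's table already matches the expected layout,
		# so the five cells pass straight through
		(id, date, title, description, notes) = columns
		return (id, date, title, description, notes)

if __name__ == '__main__':
	ScraperImplementation().doScrape()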