Made a generic scraper class for OAIC-format tables
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,10 +1,9 @@
 import sys,os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
-
 from bs4 import BeautifulSoup
+import parsedatetime as pdt
 import abc
-
 class GenericOAICDisclogScraper(object):
 	__metaclass__ = abc.ABCMeta
 	@abc.abstractmethod
@@ -23,6 +22,7 @@
 		return
 
 	def doScrape(self):
+		cal = pdt.Calendar()
 		foidocsdb = scrape.couch['disclosr-foidocuments']
 		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
 		if content != None:
@@ -46,8 +46,15 @@
 							
 						if doc == None:
 							print "saving"
+                                                        dtresult = cal.parseDateText(date.string)
+							if len(dtresult) == 2:
+								(dtdate,dtr) = dtresult
+								print dtdate
+                                                        	edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
+							else:
+								edate = ""
 							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string,
-			 				 "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
+			 				 "date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string}
 							foidocsdb.save(doc)
 						else:
 							print "already saved"