--- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -1,10 +1,9 @@ import sys,os sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape - from bs4 import BeautifulSoup +import parsedatetime as pdt import abc - class GenericOAICDisclogScraper(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod @@ -23,6 +22,7 @@ return def doScrape(self): + cal = pdt.Calendar() foidocsdb = scrape.couch['disclosr-foidocuments'] (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) if content != None: @@ -46,8 +46,15 @@ if doc == None: print "saving" + dtresult = cal.parseDateText(date.string) + if len(dtresult) == 2: + (dtdate,dtr) = dtresult + print dtdate + edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) + else: + edate = "" doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, - "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string} + "date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string} foidocsdb.save(doc) else: print "already saved"