--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,11 +1,9 @@
 import sys,os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
-
 from bs4 import BeautifulSoup
+import parsedatetime as pdt
 import abc
-import dateutil.parser
-
 class GenericOAICDisclogScraper(object):
 	__metaclass__ = abc.ABCMeta
 	@abc.abstractmethod
@@ -24,6 +22,7 @@
 		return
 
 	def doScrape(self):
+		cal = pdt.Calendar()
 		foidocsdb = scrape.couch['disclosr-foidocuments']
 		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
 		if content != None:
@@ -47,7 +46,13 @@
 			if doc == None:
 				print "saving"
-				edate = dateutil.parser.parse(date.string).date().strftime("%Y-%m-%d")
+				dtresult = cal.parseDateText(date.string)
+				if len(dtresult) == 2:
+					(dtdate,dtr) = dtresult
+					print dtdate
+					edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
+				else:
+					edate = ""
 				doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, "date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string}
 				foidocsdb.save(doc)