From: Maxious Date: Thu, 22 Nov 2012 05:56:01 +0000 Subject: better date parser X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=053611411d5736d01925713f07d6681cc7b5fdb8 --- better date parser Former-commit-id: 64dff9bcaeb72426a713440e995584a6ea0472b9 --- --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -2,12 +2,13 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape from bs4 import BeautifulSoup -import parsedatetime as pdt from time import mktime -from datetime import datetime import feedparser import abc import unicodedata, re +import dateutil +from dateutil.parser import * +from datetime import * class GenericDisclogScraper(object): __metaclass__ = abc.ABCMeta @@ -88,18 +89,12 @@ def getTable(self, soup): return soup.table def getDate(self, content, entry, doc): - dtresult = cal.parseDateText(content.string) - if len(dtresult) == 2: - (dtdate,dtr) = dtresult - edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) - else: - edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d") + edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate doc.update({'date': edate}) return def doScrape(self): - cal = pdt.Calendar() foidocsdb = scrape.couch['disclosr-foidocuments'] (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) if content != None: