Replace parsedatetime with dateutil for better date parsing
Former-commit-id: 64dff9bcaeb72426a713440e995584a6ea0472b9
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -2,12 +2,13 @@
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
-import parsedatetime as pdt
from time import mktime
-from datetime import datetime
import feedparser
import abc
import unicodedata, re
+import dateutil
+from dateutil.parser import *
+from datetime import *
class GenericDisclogScraper(object):
__metaclass__ = abc.ABCMeta
@@ -88,18 +89,12 @@
def getTable(self, soup):
return soup.table
def getDate(self, content, entry, doc):
- dtresult = cal.parseDateText(content.string)
- if len(dtresult) == 2:
- (dtdate,dtr) = dtresult
- edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
- else:
- edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
+ edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
doc.update({'date': edate})
return
def doScrape(self):
- cal = pdt.Calendar()
foidocsdb = scrape.couch['disclosr-foidocuments']
(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
if content != None: