--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -2,12 +2,13 @@
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
-import parsedatetime as pdt
 from time import mktime
-from datetime import datetime
 import feedparser
 import abc
 import unicodedata, re
+import dateutil
+from dateutil.parser import *
+from datetime import *
 
 class GenericDisclogScraper(object):
         __metaclass__ = abc.ABCMeta
@@ -82,11 +83,18 @@
                         descriptiontxt = descriptiontxt + " \n" + string
                 doc.update({'description': descriptiontxt})
                 return
+        def getTitle(self, content, entry, doc):
+                doc.update({'title': content.string})
+                return
         def getTable(self, soup):
                 return soup.table
+        def getDate(self, content, entry, doc):
+                edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+                print edate
+                doc.update({'date': edate})
+                return
 
         def doScrape(self):
-                cal = pdt.Calendar()
                 foidocsdb = scrape.couch['disclosr-foidocuments']
                 (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
                 if content != None:
@@ -111,19 +119,13 @@
                                                 if doc == None:
                                                         print "saving"
-                                                        dtresult = cal.parseDateText(date.string)
-                                                        if len(dtresult) == 2:
-                                                                (dtdate,dtr) = dtresult
-                                                                print dtdate
-                                                                edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
-                                                        else:
-                                                                edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
-                                                        doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string,
-                                                        "date": edate,"title": title.string}
+                                                        doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
                                                         if links != []:
                                                                 doc.update({'links': links})
+                                                        self.getTitle(title,row, doc)
+                                                        self.getDate(date,row, doc)
                                                         self.getDescription(description,row, doc)
-                                                        if notes != None:
+                                                        if notes != None:
                                                                 doc.update({ 'notes': notes.string})
                                                         foidocsdb.save(doc)
                                                 else:
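
Aside (not part of the patch above): a minimal, standalone sketch of how the dateutil-based parsing introduced in getDate() behaves, assuming python-dateutil is installed. The helper name to_iso_date and the sample strings are illustrative only, not taken from the scraper itself.

    from dateutil.parser import parse

    def to_iso_date(text):
        # dayfirst=True reads ambiguous numeric dates as day/month/year,
        # and fuzzy=True skips surrounding words such as "Released".
        return parse(text.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")

    print(to_iso_date("21 June 2012"))       # -> 2012-06-21
    print(to_iso_date("Released 3/4/2012"))  # -> 2012-04-03

Unlike the removed parsedatetime/strptime branch, a single parse() call covers both the "%d %B %Y" style and fuzzier free-text dates.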