--- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -1,9 +1,41 @@ import sys,os sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape +from bs4 import BeautifulSoup +import parsedatetime as pdt +from time import mktime +from datetime import datetime +import feedparser +import abc -from bs4 import BeautifulSoup -import abc +class GenericRSSDisclogScraper(object): + __metaclass__ = abc.ABCMeta + @abc.abstractmethod + def getAgencyID(self): + """ disclosr agency id """ + return + + @abc.abstractmethod + def getURL(self): + """ disclog URL""" + return + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) + feed = feedparser.parse(content) + for entry in feed.entries: + #print entry + print entry.id + hash = scrape.mkhash(entry.link) + doc = foidocsdb.get(hash) + if doc == None: + print "saving" + edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") + doc = {'id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, + "date": edate, "description": entry.summary,"title": entry.title} + foidocsdb.save(doc) + else: + print "already saved" class GenericOAICDisclogScraper(object): __metaclass__ = abc.ABCMeta @@ -23,6 +55,7 @@ return def doScrape(self): + cal = pdt.Calendar() foidocsdb = scrape.couch['disclosr-foidocuments'] (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) if content != None: @@ -42,12 +75,19 @@ doc = foidocsdb.get(hash) descriptiontxt = "" for string in description.stripped_strings: - descriptiontxt = descriptiontxt + string + descriptiontxt = descriptiontxt + " \n" + string if doc == None: print "saving" - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, - "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string} + dtresult = cal.parseDateText(date.string) + if len(dtresult) == 2: + (dtdate,dtr) = dtresult + print dtdate + edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) + else: + edate = "" + doc = {'id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), "links": links, 'docID': id.string, + "date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string} foidocsdb.save(doc) else: print "already saved"