--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -3,7 +3,46 @@
 import scrape
 from bs4 import BeautifulSoup
 import parsedatetime as pdt
+from time import mktime
+from datetime import datetime
+import feedparser
 import abc
+
+class GenericRSSDisclogScraper(object):
+	""" Scrape an agency FOI disclosure log published as an RSS feed. """
+	__metaclass__ = abc.ABCMeta
+
+	@abc.abstractmethod
+	def getAgencyID(self):
+		""" disclosr agency id """
+		return
+
+	@abc.abstractmethod
+	def getURL(self):
+		""" disclog URL"""
+		return
+
+	def doScrape(self):
+		""" Fetch the feed and save any entries not already recorded. """
+		foidocsdb = scrape.couch['disclosr-foidocuments']
+		(url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+			self.getURL(), "foidocuments", self.getAgencyID())
+		feed = feedparser.parse(content)
+		for entry in feed.entries:
+			print entry.id
+			# the entry link is the stable identifier used for deduplication
+			dochash = scrape.mkhash(entry.link)
+			doc = foidocsdb.get(dochash)
+			if doc is None:
+				print "saving"
+				edate = datetime.fromtimestamp(
+					mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+				# store under _id so the get() above finds it on later runs
+				doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+					'url': entry.link, 'docID': entry.id,
+					"date": edate, "description": entry.summary,
+					"title": entry.title}
+				foidocsdb.save(doc)
+			else:
+				print "already saved"
+
 class GenericOAICDisclogScraper(object):
 	__metaclass__ = abc.ABCMeta
 	@abc.abstractmethod
@@ -42,7 +81,7 @@
 			doc = foidocsdb.get(hash)
 			descriptiontxt = ""
 			for string in description.stripped_strings:
-				descriptiontxt = descriptiontxt + string
+				descriptiontxt = descriptiontxt + " \n" + string
 
 			if doc == None:
 				print "saving"