From: Maxious Date: Wed, 05 Dec 2012 10:19:52 +0000 Subject: pyquery scraper beginnings X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=21df7544b85976999a5b67fdfb6f4c75f1f671ab --- pyquery scraper beginnings Former-commit-id: 7f7aa89526b530d6444397489fd11eed7d17369f --- --- /dev/null +++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py @@ -1,1 +1,42 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from datetime import date +from pyquery import PyQuery as pq +from lxml import etree +import urllib +class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + + d = pq(content) + d.make_links_absolute() + d.table.filter('.ncTAF_DataTABLE') + print [i.text() for i in d.items('span')] + description = "" + dochash = scrape.mkhash(description) + doc = foidocsdb.get(dochash) + if doc is None: + print "saving " + dochash + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} + #foidocsdb.save(doc) + else: + print "already saved" + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ACMADisclogScraper, + genericScrapers.GenericDisclogScraper) + print 'Instance:', isinstance(ACMADisclogScraper(), + genericScrapers.GenericDisclogScraper) + ACMADisclogScraper().doScrape() +