"""ACMA disclosure log scraper (work in progress)."""
import sys
import os

# Put the parent directory on the import path so that the shared modules
# imported below can be found when this file is executed directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from lxml import etree
import urllib


class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
    """Disclosure log scraper built on ``GenericDisclogScraper``."""

    def doScrape(self):
        """Fetch the agency's disclosure log page and check for a change.

        The page at ``self.getURL()`` is fetched through ``scrape.fetchURL``
        and parsed with PyQuery.  The text of every ``<span>`` is printed for
        inspection, then a hash of ``description`` is looked up in the
        ``disclosr-foidocuments`` database and either "saving <hash>" or
        "already saved" is printed.

        NOTE: ``description`` is still the empty string, so the hash is the
        same on every run, and the actual ``foidocsdb.save`` call is
        commented out -- nothing is persisted yet.
        """
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(
            scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())

        d = pq(content)
        # The document is parsed from a string, so PyQuery has no base URL of
        # its own; it must be supplied explicitly or the call raises.
        d.make_links_absolute(base_url=self.getURL())
        # PyQuery selects elements by calling the object with a selector;
        # it has no BeautifulSoup-style ``d.table`` attribute access.
        # NOTE(review): the filtered selection is not used yet -- presumably
        # it is meant to feed ``description`` below; confirm before wiring up.
        d('table').filter('.ncTAF_DataTABLE')
        print([i.text() for i in d.items('span')])

        description = ""
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print("saving " + dochash)
            edate = date.today().strftime("%Y-%m-%d")
            doc = {
                '_id': dochash,
                'agencyID': self.getAgencyID(),
                'url': self.getURL(),
                'docID': dochash,
                "date": edate,
                "title": "Disclosure Log Updated",
                "description": description,
            }
            # TODO: saving is disabled while the scraper is unfinished.
            # foidocsdb.save(doc)
        else:
            print("already saved")


if __name__ == '__main__':
    # Single-argument print calls give identical output on Python 2 and 3.
    print('Subclass: %s' % issubclass(
        ACMADisclogScraper, genericScrapers.GenericDisclogScraper))
    print('Instance: %s' % isinstance(
        ACMADisclogScraper(), genericScrapers.GenericDisclogScraper))
    ACMADisclogScraper().doScrape()