|
import sys |
|
import os |
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
|
import genericScrapers |
|
import scrape |
|
from datetime import date |
|
from pyquery import PyQuery as pq |
|
from lxml import etree |
|
import urllib |
|
|
|
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
    """Scraper for the ACMA FOI disclosure log.

    NOTE(review): this scraper looks half-finished -- the entry description
    is never populated, the table-filter result is discarded, and the
    database save is commented out. Confirm intent before relying on it.
    """

    def doScrape(self):
        """Fetch the agency's disclosure-log page and record an update entry.

        Deduplicates via a hash of ``description``; since ``description`` is
        currently always empty the hash is constant, so only the very first
        run can ever take the "saving" branch.
        """
        # CouchDB database holding scraped FOI disclosure documents.
        foidocsdb = scrape.couch['disclosr-foidocuments']
        # Fetch the page (and cache the raw response in scrape.docsdb).
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())

        d = pq(content)
        d.make_links_absolute()
        # NOTE(review): the filtered selection is discarded -- as written this
        # line is a no-op. Presumably the data table was meant to be captured
        # and parsed into `description` below; TODO confirm intent.
        d.table.filter('.ncTAF_DataTABLE')
        # Debug output: text of every <span> element on the page.
        print [i.text() for i in d.items('span')]
        description = ""
        # Hash of the (always-empty) description is used as the document id,
        # so repeat runs see the same id and report "already saved".
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
            , 'url': self.getURL(), 'docID': dochash,
            "date": edate, "title": "Disclosure Log Updated", "description": description}
            # NOTE(review): persistence is disabled -- nothing is ever written
            # to the database. TODO confirm whether this is intentional.
            #foidocsdb.save(doc)
        else:
            print "already saved"
|
|
|
|
|
if __name__ == '__main__': |
|
print 'Subclass:', issubclass(ACMADisclogScraper, |
|
genericScrapers.GenericDisclogScraper) |
|
print 'Instance:', isinstance(ACMADisclogScraper(), |
|
genericScrapers.GenericDisclogScraper) |
|
ACMADisclogScraper().doScrape() |
|
|