pyquery scraper beginnings
[disclosr.git] / documents / scrapers / 7c6adc1d41cf029bf1a0959e5156477a.py
import sys
import os
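# Make the shared scraper helpers (genericScrapers, scrape) in the
# parent directory importable.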
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from lxml import etree
import urllib
 
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
 
    def doScrape(self):
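        # Open the FOI documents database and fetch the disclosure log
        # page through the shared scrape helper.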
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
 
        d = pq(content)
        # A base URL is needed to absolutise links parsed from a string.
        d.make_links_absolute(self.getURL())
        # PyQuery has no .table attribute; select with a CSS query instead.
        table = d('table').filter('.ncTAF_DataTABLE')
        print [i.text() for i in d.items('span')]  # debug: dump all <span> text
        # The description is still empty; its hash doubles as the document
        # ID, so repeated runs dedupe against what is already in CouchDB.
        description = ""
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   'date': edate, 'title': "Disclosure Log Updated",
                   'description': description}
            #foidocsdb.save(doc)  # disabled while the scraper is a stub
        else:
            print "already saved"
 
 
if __name__ == '__main__':
    print 'Subclass:', issubclass(ACMADisclogScraper,
         genericScrapers.GenericDisclogScraper)
    print 'Instance:', isinstance(ACMADisclogScraper(),
         genericScrapers.GenericDisclogScraper)
    ACMADisclogScraper().doScrape()
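
The filtered table above is never read; a possible next step is sketched below, under stated assumptions: the .ncTAF_DataTABLE class name is carried over from the code above, but parse_rows, the base_url parameter, and a date/title/description column order are guesses, not confirmed against the live page.

# Sketch only: walk the rows of the filtered disclosure-log table with
# pyquery. Helper name and column layout are assumptions.
from pyquery import PyQuery as pq

def parse_rows(content, base_url):
    d = pq(content)
    d.make_links_absolute(base_url)
    table = d('table').filter('.ncTAF_DataTABLE')  # class from the scraper above
    entries = []
    for row in table('tr').items():
        cells = [c.text() for c in row('td').items()]
        if len(cells) >= 3:  # header rows use <th>, so they yield no <td> cells
            entries.append({'date': cells[0], 'title': cells[1],
                            'description': cells[2]})
    return entries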