import sys
import os
# Make the repository root importable so the shared scraper helpers
# (genericScrapers, scrape) resolve when this file is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))

import genericScrapers
import scrape

from datetime import date
from pyquery import PyQuery as pq
from lxml import etree
import urllib

import dateutil
# Only parse() is used below; avoid the wildcard import.
from dateutil.parser import parse
|
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): |
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): |
|
|
def doScrape(self): |
def doScrape(self): |
foidocsdb = scrape.couch['disclosr-foidocuments'] |
foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, |
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, |
self.getURL(), "foidocuments", self.getAgencyID()) |
self.getURL(), "foidocuments", self.getAgencyID()) |
|
|
d = pq(content) |
d = pq(content.read()) |
d.make_links_absolute() |
d.make_links_absolute(base_url = self.getURL()) |
d.table.filter('.ncTAF_DataTABLE') |
for table in d('table').items(): |
print [i.text() for i in d.items('span')] |
title= table('thead').text() |
description = "" |
print title |
dochash = scrape.mkhash(description) |
(idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text()) |
doc = foidocsdb.get(dochash) |
links = table('a').map(lambda i, e: pq(e).attr('href')) |
if doc is None: |
description = descA+" "+descB |
print "saving " + dochash |
edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") |
edate = date.today().strftime("%Y-%m-%d") |
print edate |
doc = {'_id': dochash, 'agencyID': self.getAgencyID() |
dochash = scrape.mkhash(self.remove_control_chars(title)) |
, 'url': self.getURL(), 'docID': dochash, |
doc = foidocsdb.get(dochash) |
"date": edate, "title": "Disclosure Log Updated", "description": description} |
if doc is None: |
#foidocsdb.save(doc) |
print "saving " + dochash |
else: |
edate = date.today().strftime("%Y-%m-%d") |
print "already saved" |
doc = {'_id': dochash, 'agencyID': self.getAgencyID() |
|
, 'url': self.getURL(), 'docID': dochash, |
|
"links": links, |
|
"date": edate, "notes": notes, "title": "Disclosure Log Updated", "description": description} |
|
#print doc |
|
foidocsdb.save(doc) |
|
else: |
|
print "already saved" |
|
|
|
|
if __name__ == '__main__': |
if __name__ == '__main__': |
print 'Subclass:', issubclass(ACMADisclogScraper, |
print 'Subclass:', issubclass(ACMADisclogScraper, |
genericScrapers.GenericDisclogScraper) |
genericScrapers.GenericDisclogScraper) |
print 'Instance:', isinstance(ACMADisclogScraper(), |
print 'Instance:', isinstance(ACMADisclogScraper(), |
genericScrapers.GenericDisclogScraper) |
genericScrapers.GenericDisclogScraper) |
ACMADisclogScraper().doScrape() |
ACMADisclogScraper().doScrape() |
|
|