import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from dateutil.parser import parse
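

# Scrapes the ACMA disclosure log: each HTML table on the page is one
# log entry, and each entry is stored as a document in CouchDB.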
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())

        # fetchURL returns the page body itself, not a file-like object,
        # so it goes straight into PyQuery.
        d = pq(content)
        d.make_links_absolute(base_url=self.getURL())
        for table in d('table').items():
            title = table('thead').text()
            print title
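            # Each tbody row carries one field in its second cell: the six
            # rows unpack as date, description (two parts), link, deletion
            # date and notes.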
            (idate, descA, descB, link, deldate, notes) = table('tbody tr').map(
                lambda i, e: pq(e).children().eq(1).text())
            links = table('a').map(lambda i, e: pq(e).attr('href'))
            description = descA + " " + descB
            try:
                edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
            except ValueError:
                # Fall back to today's date when the scraped date won't parse.
                edate = date.today().strftime("%Y-%m-%d")
            print edate
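            # The hashed title is the CouchDB document ID, so entries already
            # saved on a previous run are detected and skipped below.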
            dochash = scrape.mkhash(self.remove_control_chars(title))
            doc = foidocsdb.get(dochash)
            if doc is None:
                print "saving " + dochash
                # Use today's date (when the entry was first seen) rather
                # than the date parsed from the table.
                edate = date.today().strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                       'url': self.getURL(), 'docID': dochash,
                       "links": links,
                       "date": edate, "notes": notes, "title": title,
                       "description": description}
                #print doc
                foidocsdb.save(doc)
            else:
                print "already saved"
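

# Basic sanity checks on the class hierarchy, then a manual scrape run
# when the script is invoked directly.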
if __name__ == '__main__':
    print 'Subclass:', issubclass(ACMADisclogScraper,
        genericScrapers.GenericDisclogScraper)
    print 'Instance:', isinstance(ACMADisclogScraper(),
        genericScrapers.GenericDisclogScraper)
    ACMADisclogScraper().doScrape()