import sys
import os

# Make the project root importable (genericScrapers / scrape live there).
# This must run before the project-local imports below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))

import urllib
from datetime import date

import dateutil
from dateutil.parser import *
from lxml import etree
from pyquery import PyQuery as pq

import genericScrapers
import scrape
|
|
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): |
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): |
|
|
def doScrape(self): |
def doScrape(self): |
foidocsdb = scrape.couch['disclosr-foidocuments'] |
foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, |
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, |
self.getURL(), "foidocuments", self.getAgencyID()) |
self.getURL(), "foidocuments", self.getAgencyID()) |
|
|
d = pq(content.read()) |
d = pq(content) |
d.make_links_absolute(base_url = self.getURL()) |
d.make_links_absolute(base_url = self.getURL()) |
for item in d('.item-list').items(): |
for item in d('.item-list').items(): |
title= item('h3').text() |
title= item('h3').text() |
print title |
print title |
links = item('a').map(lambda i, e: pq(e).attr('href')) |
links = item('a').map(lambda i, e: pq(e).attr('href')) |
description = title= item('ul').text() |
description = title= item('ul').text() |
edate = date.today().strftime("%Y-%m-%d") |
edate = date.today().strftime("%Y-%m-%d") |
print edate |
print edate |
dochash = scrape.mkhash(self.remove_control_chars(title)) |
dochash = scrape.mkhash(self.remove_control_chars(title)) |
doc = foidocsdb.get(dochash) |
doc = foidocsdb.get(dochash) |
if doc is None: |
if doc is None: |
print "saving " + dochash |
print "saving " + dochash |
doc = {'_id': dochash, 'agencyID': self.getAgencyID() |
doc = {'_id': dochash, 'agencyID': self.getAgencyID() |
, 'url': self.getURL(), 'docID': dochash, |
, 'url': self.getURL(), 'docID': dochash, |
"links": links, |
"links": links, |
"date": edate, "title": title, "description": description} |
"date": edate, "title": title, "description": description} |
#print doc |
#print doc |
foidocsdb.save(doc) |
foidocsdb.save(doc) |
else: |
else: |
print "already saved" |
print "already saved" |
|
|
|
|
if __name__ == '__main__': |
if __name__ == '__main__': |
print 'Subclass:', issubclass(ACMADisclogScraper, |
print 'Subclass:', issubclass(ACMADisclogScraper, |
genericScrapers.GenericDisclogScraper) |
genericScrapers.GenericDisclogScraper) |
print 'Instance:', isinstance(ACMADisclogScraper(), |
print 'Instance:', isinstance(ACMADisclogScraper(), |
genericScrapers.GenericDisclogScraper) |
genericScrapers.GenericDisclogScraper) |
ACMADisclogScraper().doScrape() |
ACMADisclogScraper().doScrape() |
|
|