pyquery scraper beginnings
[disclosr.git] / documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py
 import sys
 import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import genericScrapers
 import scrape
 from datetime import date
 from pyquery import PyQuery as pq
 from lxml import etree
 import urllib
+import dateutil
+from dateutil.parser import *
 
 class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
 
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
 
         d = pq(content)
-        d.make_links_absolute()
-        d.table.filter('.ncTAF_DataTABLE')
-        print [i.text() for i in d.items('span')]
-        description = ""
-        dochash = scrape.mkhash(description)
-        doc = foidocsdb.get(dochash)
-        if doc is None:
-            print "saving " + dochash
-            edate = date.today().strftime("%Y-%m-%d")
-            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-                , 'url': self.getURL(), 'docID': dochash,
-                "date": edate, "title": "Disclosure Log Updated", "description": description}
-            #foidocsdb.save(doc)
-        else:
-            print "already saved"
+        d.make_links_absolute(base_url=self.getURL())
+        for table in d('table').items():
+            title = table('thead').text()
+            print self.remove_control_chars(title)
+            (idate, descA, descB, link, deldate, notes) = table('tbody tr').map(
+                lambda i, e: pq(e).children().eq(1).text())
+            links = table('a').map(lambda i, e: pq(e).attr('href'))
+            description = descA + " " + descB
+            try:
+                edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+            except ValueError:
+                edate = date.today().strftime("%Y-%m-%d")
+            print edate
+            dochash = scrape.mkhash(self.remove_control_chars(title))
+            doc = foidocsdb.get(dochash)
+            if doc is None:
+                print "saving " + dochash
+                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                       'url': self.getURL(), 'docID': dochash,
+                       "links": links, "date": edate, "notes": notes,
+                       "title": title, "description": description}
+                #print doc
+                foidocsdb.save(doc)
+            else:
+                print "already saved"
 
 
 if __name__ == '__main__':
     print 'Subclass:', issubclass(ACMADisclogScraper,
         genericScrapers.GenericDisclogScraper)
     print 'Instance:', isinstance(ACMADisclogScraper(),
         genericScrapers.GenericDisclogScraper)
     ACMADisclogScraper().doScrape()
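
Notes on the pyquery pattern above: doScrape() now iterates every <table> with .items() and pulls the second cell of each row with .map(). A minimal standalone sketch of that pattern, against made-up HTML (ACMA's real disclosure-log markup will differ):

# Illustrative only -- the HTML is an invented stand-in for the real page.
from pyquery import PyQuery as pq

html = """
<table>
  <thead><tr><th>FOI disclosure log entry</th></tr></thead>
  <tbody>
    <tr><th>Date listed</th><td>05/12/2012</td></tr>
    <tr><th>Description</th><td>Request for spectrum licensing records</td></tr>
  </tbody>
</table>
"""

d = pq(html)
for table in d('table').items():      # .items() yields each match as a PyQuery object
    print(table('thead').text())      # "FOI disclosure log entry"
    # .map() calls the lambda with an (index, element) pair; eq(1) selects
    # the second child of each row, i.e. the value cell next to its label
    values = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
    print(list(values))               # ['05/12/2012', 'Request for spectrum licensing records']

Because .map() returns a list-like PyQuery of the callback results, the scraper above can tuple-unpack it straight into the six columns it expects per table.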
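
The date handling leans on dateutil's fuzzy parser, falling back to today's date when a cell cannot be parsed. A sketch of the same logic using a hypothetical to_iso() helper; the [:12] slice mirrors the scraper and assumes the date sits in the first twelve characters of the cell:

from datetime import date
from dateutil.parser import parse

def to_iso(raw):
    # dayfirst=True reads "05/12/2012" as 5 December; fuzzy=True skips
    # stray tokens around the date itself.
    try:
        return parse(raw[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
    except ValueError:
        return date.today().strftime("%Y-%m-%d")

print(to_iso("05/12/2012 - amended"))   # 2012-12-05
print(to_iso("not a date"))             # falls back to today's date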
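
Deduplication hashes the cleaned table title into the CouchDB document _id and saves only when no document exists under that id. A rough equivalent with couchdb-python; the md5-based mkhash here is an assumed stand-in for scrape.mkhash (the real helper lives in scrape.py), and the server URL is illustrative:

import hashlib
import couchdb

def mkhash(text):
    # Assumed stand-in for scrape.mkhash: a stable hash of the title
    return hashlib.md5(text.encode('utf-8')).hexdigest()

couch = couchdb.Server('http://127.0.0.1:5984/')
foidocsdb = couch['disclosr-foidocuments']

dochash = mkhash("FOI disclosure log entry")
if foidocsdb.get(dochash) is None:   # .get() returns None when the id is absent
    foidocsdb.save({'_id': dochash, 'title': "FOI disclosure log entry"})
else:
    print("already saved")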