add ACMA scraper


Former-commit-id: 79c714f2e5d9cc2ebd82d89d586fe3ed43ed2cd5

import sys
import os
# Make the repository root importable so the local helper modules resolve.
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import dateutil
from dateutil.parser import *
   
class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
   
def doScrape(self): def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments'] foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
self.getURL(), "foidocuments", self.getAgencyID()) self.getURL(), "foidocuments", self.getAgencyID())
   
d = pq(content) d = pq(content.read())
d.make_links_absolute() d.make_links_absolute(base_url = self.getURL())
d.table.filter('.ncTAF_DataTABLE') for table in d('table').items():
print [i.text() for i in d.items('span')] title= table('thead').text()
description = "" print title
dochash = scrape.mkhash(description) (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text())
doc = foidocsdb.get(dochash) links = table('a').map(lambda i, e: pq(e).attr('href'))
if doc is None: description = descA+" "+descB
print "saving " + dochash edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
edate = date.today().strftime("%Y-%m-%d") print edate
doc = {'_id': dochash, 'agencyID': self.getAgencyID() dochash = scrape.mkhash(self.remove_control_chars(title))
, 'url': self.getURL(), 'docID': dochash, doc = foidocsdb.get(dochash)
"date": edate, "title": "Disclosure Log Updated", "description": description} if doc is None:
#foidocsdb.save(doc) print "saving " + dochash
else: edate = date.today().strftime("%Y-%m-%d")
print "already saved" doc = {'_id': dochash, 'agencyID': self.getAgencyID()
  , 'url': self.getURL(), 'docID': dochash,
  "links": links,
  "date": edate, "notes": notes, "title": "Disclosure Log Updated", "description": description}
  #print doc
  foidocsdb.save(doc)
  else:
  print "already saved"
   
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ACMADisclogScraper, print 'Subclass:', issubclass(ACMADisclogScraper,
genericScrapers.GenericDisclogScraper) genericScrapers.GenericDisclogScraper)
print 'Instance:', isinstance(ACMADisclogScraper(), print 'Instance:', isinstance(ACMADisclogScraper(),
genericScrapers.GenericDisclogScraper) genericScrapers.GenericDisclogScraper)
ACMADisclogScraper().doScrape() ACMADisclogScraper().doScrape()