add innovation scraper


Former-commit-id: 80558a9217d1bcad0766200d0e1d42aa022ff501

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
from datetime import date


# http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def getTable(self, soup):
        # The disclosure log table sits inside the SharePoint content div.
        return soup.find(id="ctl00_MSO_ContentDiv").table

    def getColumns(self, columns):
        # Map the four source columns onto the five-field tuple the generic
        # scraper expects; the title column is reused in the date position,
        # and getDate() below overwrites the date with today's date anyway.
        (id, title, description, notes) = columns
        return (id, title, title, description, notes)

    def getDate(self, content, entry, doc):
        # No usable date column, so stamp each entry with today's date.
        edate = date.today().strftime("%Y-%m-%d")
        doc.update({'date': edate})
        return

    def getColumnCount(self):
        return 4


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
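For reference, a minimal sketch of the table lookup getTable() performs, run against a made-up HTML fragment; the ctl00_MSO_ContentDiv wrapper and the four-column row layout are assumptions about the agency page, not taken from it.

from bs4 import BeautifulSoup

# Hypothetical fragment mimicking the markup getTable()/getColumns() assume:
# a SharePoint-style content div wrapping a four-column disclosure log table.
sample = """
<div id="ctl00_MSO_ContentDiv">
  <table>
    <tr><td>FOI-1</td><td>Request title</td><td>Description</td><td>Notes</td></tr>
  </table>
</div>
"""

soup = BeautifulSoup(sample, "html.parser")
table = soup.find(id="ctl00_MSO_ContentDiv").table
for row in table.find_all("tr"):
    print [cell.text for cell in row.find_all("td")]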
 
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from datetime import date
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import dateutil
from dateutil.parser import *


class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())

        d = pq(content.read())
        d.make_links_absolute(base_url=self.getURL())
        for item in d('.item-list').items():
            title = item('h3').text()
            print title
            links = item('a').map(lambda i, e: pq(e).attr('href'))
            # Use the list body as the description; the h3 text above stays as the title.
            description = item('ul').text()
            edate = date.today().strftime("%Y-%m-%d")
            print edate
            dochash = scrape.mkhash(self.remove_control_chars(title))
            doc = foidocsdb.get(dochash)
            if doc is None:
                print "saving " + dochash
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                       'url': self.getURL(), 'docID': dochash,
                       "links": links,
                       "date": edate, "title": title, "description": description}
                #print doc
                foidocsdb.save(doc)
            else:
                print "already saved"


if __name__ == '__main__':
    print 'Subclass:', issubclass(ACMADisclogScraper,
        genericScrapers.GenericDisclogScraper)
    print 'Instance:', isinstance(ACMADisclogScraper(),
        genericScrapers.GenericDisclogScraper)
    ACMADisclogScraper().doScrape()
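And a minimal sketch of the PyQuery traversal doScrape() relies on, run against a made-up fragment; the .item-list markup and the base URL here are assumptions for illustration (the scraper itself uses self.getURL()).

from pyquery import PyQuery as pq

# Hypothetical fragment mirroring the ".item-list" structure doScrape() expects;
# the real ACMA page markup may differ.
sample = """
<div class="item-list">
  <h3>Disclosure log entry</h3>
  <ul>
    <li><a href="/foi/document-1.pdf">Document 1</a></li>
  </ul>
</div>
"""

d = pq(sample)
d.make_links_absolute(base_url="http://www.example.gov.au/")  # base URL assumed for illustration
for item in d('.item-list').items():
    print item('h3').text()                               # entry title
    print item('a').map(lambda i, e: pq(e).attr('href'))  # absolute links
    print item('ul').text()                               # entry body used as description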