made generic OAIC format table scraper class
[disclosr.git] / documents / genericScrapers.py
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import scrape
   
  from bs4 import BeautifulSoup
  import abc
   
class GenericOAICDisclogScraper(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getAgencyID(self):
        """ disclosr agency id """
        return

    @abc.abstractmethod
    def getURL(self):
        """ disclog URL """
        return

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        # fetch the disclosure log page (scrape.fetchURL also records it in couchdb)
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                for row in soup.table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 5:
                        # an OAIC-format data row has five cells; subclasses map them via getColumns
                        (id, date, description, title, notes) = self.getColumns(columns)
                        print id.string
                        # derive a stable document id from the page URL plus the FOI request id
                        hash = scrape.mkhash(url + id.string)
                        # collect any links in the row, resolved against the page URL
                        links = []
                        for atag in row.find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(url, atag['href']))
                        doc = foidocsdb.get(hash)
                        # flatten the description cell to plain text
                        descriptiontxt = ""
                        for string in description.stripped_strings:
                            descriptiontxt = descriptiontxt + string

                        if doc is None:
                            print "saving"
                            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(),
                                   "links": links, 'docID': id.string, "date": date.string,
                                   "description": descriptiontxt, "title": title.string, "notes": notes.string}
                            foidocsdb.save(doc)
                        else:
                            print "already saved"

                    elif len(row.find_all('th')) == 5:
                        print "header row"

                    else:
                        print "ERROR number of columns incorrect"
                        print row
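
For context, each agency scraper is expected to subclass this and supply the three abstract methods. The sketch below is illustrative only: the class name, agency id, and URL are placeholders rather than values from this repository, and it assumes the agency's table already lists its five columns in (id, date, description, title, notes) order.

class ExampleAgencyScraper(GenericOAICDisclogScraper):
    # hypothetical subclass for illustration; the id and URL are placeholders
    def getAgencyID(self):
        return "example-agency-uuid"

    def getURL(self):
        return "http://www.example.gov.au/about/foi/disclosure-log"

    def getColumns(self, columns):
        # columns are assumed to already be in the expected order, so pass
        # them through; a real scraper would reorder here if its table differs
        (id, date, description, title, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    ExampleAgencyScraper().doScrape()

Running the module then fetches the placeholder disclosure log, walks each five-cell table row, and saves any unseen entries to the disclosr-foidocuments couchdb database.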