Add bootstrap css
[disclosr.git] / documents / genericScrapers.py
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape

from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs

import difflib

from StringIO import StringIO

from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams

# Needed by GenericDOCXDisclogScraper below (assumption: lxml is the intended
# XML parser; the standard library's xml.etree.ElementTree would also work).
import zipfile
from lxml import etree

class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def remove_control_chars(self, input):
        return "".join([i for i in input if ord(i) in range(32, 127)])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL """
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return

"date": edate, "description": descriptiontxt,"title": title.string,"notes": notes.string} class GenericHTMLDisclogScraper(GenericDisclogScraper):
foidocsdb.save(doc)  
else: def doScrape(self):
print "already saved" foidocsdb = scrape.couch['disclosr-foidocuments']
  (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
elif len(row.find_all('th')) == 5: self.getURL(), "foidocuments", self.getAgencyID())
print "header row" content = rcontent.read()
  dochash = scrape.mkhash(content)
else: doc = foidocsdb.get(dochash)
print "ERROR number of columns incorrect" if doc is None:
print row print "saving " + dochash
  description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
  last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
  if last_attach != None:
  html_diff = difflib.HtmlDiff()
  description = description + "\nChanges: "
  description = description + html_diff.make_table(last_attach.read().split('\n'),
  content.split('\n'))
  edate = date.today().strftime("%Y-%m-%d")
  doc = {'_id': dochash, 'agencyID': self.getAgencyID()
  , 'url': self.getURL(), 'docID': dochash,
  "date": edate, "title": "Disclosure Log Updated", "description": description}
  foidocsdb.save(doc)
  else:
  print "already saved"
   
class GenericPDFDisclogScraper(GenericDisclogScraper):
    """ Disclosure log published as a PDF; extracts the text with pdfminer and
    records an entry whenever the extracted text changes. """

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        # Run the fetched PDF through pdfminer's text converter.
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=laparams)
        fp = StringIO()
        fp.write(content.read())

        process_pdf(rsrcmgr, device, fp, set(), caching=True,
                    check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        # The document id is a hash of the extracted text, so an unchanged PDF
        # is not saved twice.
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericDOCXDisclogScraper(GenericDisclogScraper):
    """ Disclosure log published as a .docx file; extracts the paragraph text
    from word/document.xml and records an entry whenever it changes. """

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        # A .docx file is a zip archive; read the main document part out of the
        # fetched content.
        mydoc = zipfile.ZipFile(StringIO(content.read()))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        # Fetch all the text out of the document we just parsed.
        # getdocumenttext() is assumed to be provided by the docx helper module
        # used elsewhere in this project; it returns a list of paragraph strings.
        paratextlist = getdocumenttext(document)
        # Make an explicit utf-8 encoded version of each paragraph.
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        # Join the document's text with two newlines under each paragraph.
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)

        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericRSSDisclogScraper(GenericDisclogScraper):
    """ Disclosure log published as an RSS/Atom feed; saves one document per
    feed entry, keyed on a hash of the entry id. """

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                       'url': entry.link, 'docID': entry.id,
                       "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get the description from the rss entry """
        doc.update({'description': content.summary})
        return
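
# Override sketch (illustrative, not part of the original file): agencies whose
# feed entries carry HTML markup in the summary can subclass the RSS scraper and
# strip the tags before the description is stored. The regular expression here
# is an assumption about how much cleaning is wanted.
#
#   import genericScrapers
#   import re
#
#   class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
#       def getDescription(self, content, entry, doc):
#           plaintext = re.sub('<[^<]+?>', '', content.summary)
#           doc.update({'description': plaintext})
#
#   if __name__ == '__main__':
#       ScraperImplementation().doScrape()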
   
   
class GenericOAICDisclogScraper(GenericDisclogScraper):
    """ Disclosure log published as an OAIC-style HTML table; subclasses supply
    getColumns() to map each row's cells to (id, date, title, description, notes). """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5

    def getDescription(self, content, entry, doc):
        """ get the description out of the table cell """
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def getDate(self, content, entry, doc):
        date = ''.join(content.stripped_strings).strip()
        # Drop anything in parentheses and fix a common "Octber" typo before parsing.
        (a, b, c) = date.partition("(")
        date = self.remove_control_chars(a.replace("Octber", "October"))
        print date
        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return

    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links != []:
            doc.update({'links': links})
        return
   
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = row.find_all('td')
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                         description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        # Key the document on the row id if there is one,
                        # otherwise fall back to the row date.
                        if id.string is None:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(date.stripped_strings))))
                        else:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(id.stripped_strings))))
                        doc = foidocsdb.get(dochash)

                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                   'agencyID': self.getAgencyID(),
                                   'url': self.getURL(),
                                   'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            # Skip rows whose "title" is really a column header
                            # or other boilerplate.
                            badtitles = ['-', 'Summary of FOI Request',
                                         'FOI request(in summary form)',
                                         'Summary of FOI request received by the ASC',
                                         'Summary of FOI request received by agency/minister',
                                         'Description of Documents Requested', 'FOI request',
                                         'Description of FOI Request', 'Summary of request',
                                         'Description', 'Summary',
                                         'Summary of FOIrequest received by agency/minister',
                                         'Summary of FOI request received',
                                         'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles \
                                    and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash

                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"

                    else:
                        print "ERROR number of columns incorrect"
                        print row
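
# Example (illustrative, not part of the original file): a concrete agency
# scraper built on GenericOAICDisclogScraper. The module name, class name and
# column order are assumptions; the only required override is getColumns(),
# which must return the row's cells as (id, date, title, description, notes).
#
#   import genericScrapers
#
#   class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
#       def getColumns(self, columns):
#           (id, date, title, description, notes) = columns
#           return (id, date, title, description, notes)
#
#   if __name__ == '__main__':
#       ScraperImplementation().doScrape()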