--- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -1,149 +1,299 @@ -import sys,os +import sys +import os + sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape from bs4 import BeautifulSoup from time import mktime import feedparser import abc -import unicodedata, re +import unicodedata +import re import dateutil from dateutil.parser import * from datetime import * +import codecs +import zipfile + +import difflib + +from StringIO import StringIO + +from pdfminer.pdfparser import PDFDocument, PDFParser +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf +from pdfminer.pdfdevice import PDFDevice, TagExtractor +from pdfminer.converter import TextConverter +from pdfminer.cmapdb import CMapDB +from pdfminer.layout import LAParams + class GenericDisclogScraper(object): - __metaclass__ = abc.ABCMeta - agencyID = None - disclogURL = None - def remove_control_chars(self, input): - return "".join([i for i in input if ord(i) in range(32, 127)]) - def getAgencyID(self): - """ disclosr agency id """ - if self.agencyID == None: - self.agencyID = os.path.basename(sys.argv[0]).replace(".py","") - return self.agencyID - - def getURL(self): - """ disclog URL""" - if self.disclogURL == None: - agency = scrape.agencydb.get(self.getAgencyID()) - self.disclogURL = agency['FOIDocumentsURL'] - return self.disclogURL - - @abc.abstractmethod - def doScrape(self): - """ do the scraping """ - return - - @abc.abstractmethod + __metaclass__ = abc.ABCMeta + agencyID = None + disclogURL = None + + def remove_control_chars(self, input): + return "".join([i for i in input if ord(i) in range(32, 127)]) + + def getAgencyID(self): + """ disclosr agency id """ + if self.agencyID is None: + self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "") + return self.agencyID + + def getURL(self): + """ disclog URL""" + if self.disclogURL is None: + agency = scrape.agencydb.get(self.getAgencyID()) + self.disclogURL = 
agency['FOIDocumentsURL'] + return self.disclogURL + + @abc.abstractmethod + def doScrape(self): + """ do the scraping """ + return + + +class GenericHTMLDisclogScraper(GenericDisclogScraper): + def doScrape(self): + """ save a generic "log updated" entry when the page hash is new; diff stays empty if there is no earlier attachment """ + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + content = rcontent + dochash = scrape.mkhash(content) + doc = foidocsdb.get(dochash) + if doc is None: + print "saving " + dochash + description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries" + last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL()) + diff = '' + if last_attach is not None: + html_diff = difflib.HtmlDiff() + diff = html_diff.make_table(last_attach.read().split('\n'), + content.split('\n')) + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", + "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)} + foidocsdb.save(doc) + else: + print "already saved" + + +class GenericPDFDisclogScraper(GenericDisclogScraper): + def doScrape(self): + """ extract the PDF text with pdfminer and save it as the description when its hash is new """ + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + laparams = LAParams() + rsrcmgr = PDFResourceManager(caching=True) + outfp = StringIO() + device = TextConverter(rsrcmgr, outfp, codec='utf-8', + laparams=laparams) + fp = StringIO() + fp.write(content) + + process_pdf(rsrcmgr, device, fp, set(), caching=True, + check_extractable=True) + description = outfp.getvalue() + fp.close() + device.close() + outfp.close() + dochash = scrape.mkhash(description) + doc = foidocsdb.get(dochash) + if doc is None: + 
print "saving " + dochash + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)} + foidocsdb.save(doc) + else: + print "already saved" + + +class GenericDOCXDisclogScraper(GenericDisclogScraper): + def doScrape(self): + """ read word/document.xml from the fetched DOCX and save its paragraph text when its hash is new """ + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb + , self.getURL(), "foidocuments", self.getAgencyID()) + mydoc = zipfile.ZipFile(StringIO(content)) + xmlcontent = mydoc.read('word/document.xml') + # NOTE(review): etree and getdocumenttext are not imported by this patch; presumably lxml.etree and docx.getdocumenttext -- confirm and import + document = etree.fromstring(xmlcontent) + ## Fetch all the text out of the document we just created + paratextlist = getdocumenttext(document) + # Make explicit unicode version + newparatextlist = [] + for paratext in paratextlist: + newparatextlist.append(paratext.encode("utf-8")) + ## Print our document's text with two newlines under each paragraph + description = '\n\n'.join(newparatextlist).strip(' \t\n\r') + dochash = scrape.mkhash(description) + doc = foidocsdb.get(dochash) + + if doc is None: + print "saving " + dochash + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} + foidocsdb.save(doc) + else: + print "already saved" + + +class GenericRSSDisclogScraper(GenericDisclogScraper): + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + feed = feedparser.parse(content) + for entry in feed.entries: + #print entry + print entry.id + dochash = scrape.mkhash(entry.id) + doc = foidocsdb.get(dochash) + #print doc + if doc is None: + print "saving " + dochash + edate = datetime.fromtimestamp( + 
mktime(entry.published_parsed)).strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID(), + 'url': entry.link, 'docID': entry.id, + "date": edate, "title": entry.title} + self.getDescription(entry, entry, doc) + foidocsdb.save(doc) + else: + print "already saved" + def getDescription(self, content, entry, doc): - """ get description""" - return - - - -class GenericRSSDisclogScraper(GenericDisclogScraper): - - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) - feed = feedparser.parse(content) - for entry in feed.entries: - #print entry - print entry.id - hash = scrape.mkhash(entry.id) - #print hash - doc = foidocsdb.get(hash) - #print doc - if doc == None: - print "saving "+ hash - edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, - "date": edate,"title": entry.title} - self.getDescription(entry,entry, doc) + """ get description from rss entry""" + doc.update({'description': content.summary}) + + return + + +class GenericOAICDisclogScraper(GenericDisclogScraper): + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def getColumns(self, columns): + """ rearranges columns if required """ + return + + def getColumnCount(self): + return 5 + + def getDescription(self, content, entry, doc): + """ get description from rss entry""" + descriptiontxt = "" + for string in content.stripped_strings: + descriptiontxt = descriptiontxt + " \n" + string + doc.update({'description': descriptiontxt}) + + def getTitle(self, content, entry, doc): + doc.update({'title': (''.join(content.stripped_strings))}) + + def getTable(self, soup): + return soup.table + + def getRows(self, table): + return table.find_all('tr') + def findColumns(self, row): + return row.find_all('td') + + def getDocHash(self, id,date, 
url): + if id.string is None: + print "no id, using date as hash" + return scrape.mkhash( + self.remove_control_chars( + url + (''.join(date.stripped_strings)))) + else: + return scrape.mkhash( + self.remove_control_chars( + url + (''.join(id.stripped_strings)))) + + def getDate(self, content, entry, doc): + strdate = ''.join(content.stripped_strings).strip() + (a, b, c) = strdate.partition("(") + strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January")) + print strdate + try: + edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + except ValueError: + print >> sys.stderr, "ERROR date invalid %s " % strdate + print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip() + edate = date.today().strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return + + def getLinks(self, content, entry, doc): + links = [] + for atag in entry.find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(content, atag['href'])) + if links != []: + doc.update({'links': links}) + return + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + if content is not None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + print "parsing" + soup = BeautifulSoup(content) + table = self.getTable(soup) + for row in self.getRows(table): + columns = self.findColumns(row) + if len(columns) == self.getColumnCount(): + (id, date, title, + description, notes) = self.getColumns(columns) + print self.remove_control_chars( + ''.join(id.stripped_strings)) + dochash = self.getDocHash(id,date,url) + doc = foidocsdb.get(dochash) + + if doc is None: + print "saving " + dochash + doc = {'_id': 
dochash, + 'agencyID': self.getAgencyID(), + 'url': self.getURL(), + 'docID': (''.join(id.stripped_strings))} + self.getLinks(self.getURL(), row, doc) + self.getTitle(title, row, doc) + self.getDate(date, row, doc) + self.getDescription(description, row, doc) + if notes is not None: + doc.update({'notes': ( + ''.join(notes.stripped_strings))}) + badtitles = ['-', 'Summary of FOI Request' + , 'FOI request(in summary form)' + , 'Summary of FOI request received by the ASC', + 'Summary of FOI request received by agency/minister', + 'Description of Documents Requested', 'FOI request', + 'Description of FOI Request', 'Summary of request', 'Description', 'Summary', + 'Summary of FOIrequest received by agency/minister', + 'Summary of FOI request received', 'Description of FOI Request', + "FOI request", 'Results 1 to 67 of 67'] + if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '': + print "saving" foidocsdb.save(doc) else: - print "already saved" - def getDescription(self, content, entry, doc): - """ get description from rss entry""" - doc.update({'description': content.summary}) - return - -class GenericOAICDisclogScraper(GenericDisclogScraper): - __metaclass__ = abc.ABCMeta - @abc.abstractmethod - def getColumns(self,columns): - """ rearranges columns if required """ - return - def getColumnCount(self): - return 5 - def getDescription(self, content, entry, doc): - """ get description from rss entry""" - descriptiontxt = "" - for string in content.stripped_strings: - descriptiontxt = descriptiontxt + " \n" + string - doc.update({'description': descriptiontxt}) - return - def getTitle(self, content, entry, doc): - doc.update({'title': (''.join(content.stripped_strings))}) - return - def getTable(self, soup): - return soup.table - def getRows(self, table): - return table.find_all('tr') - def getDate(self, content, entry, doc): - date = ''.join(content.stripped_strings).strip() - (a,b,c) = date.partition("(") - date = 
a.replace("Octber","October") - print date - edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") - print edate - doc.update({'date': edate}) - return - def getLinks(self, content, entry, doc): - links = [] - for atag in entry.find_all("a"): - if atag.has_key('href'): - links.append(scrape.fullurl(content,atag['href'])) - if links != []: - doc.update({'links': links}) - return - - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) - if content != None: - if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": - # http://www.crummy.com/software/BeautifulSoup/documentation.html - soup = BeautifulSoup(content) - table = self.getTable(soup) - for row in self.getRows(table): - columns = row.find_all('td') - if len(columns) == self.getColumnCount(): - (id, date, title, description, notes) = self.getColumns(columns) - print ''.join(id.stripped_strings) - if id.string == None: - hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings)))) - else: - hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings)))) - doc = foidocsdb.get(hash) - - if doc == None: - print "saving " +hash - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))} - self.getLinks(self.getURL(),row,doc) - self.getTitle(title,row, doc) - self.getDate(date,row, doc) - self.getDescription(description,row, doc) - if notes != None: - doc.update({ 'notes': (''.join(notes.stripped_strings))}) - foidocsdb.save(doc) - else: - print "already saved "+hash - - elif len(row.find_all('th')) == self.getColumnCount(): - print "header row" - - else: - print "ERROR number of columns incorrect" - print row - + print "already saved " + dochash + + elif len(row.find_all('th')) == self.getColumnCount(): + print "header row" + + 
else: + print >> sys.stderr, "ERROR number of columns incorrect" + print row +