more scrapers
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,148 +1,281 @@
-import sys,os
+import sys
+import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
 from time import mktime
 import feedparser
 import abc
-import unicodedata, re
+import unicodedata
+import re
 import dateutil
 from dateutil.parser import *
 from datetime import *
+import codecs
+
+import difflib
+
+from StringIO import StringIO
+
+from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfdevice import PDFDevice, TagExtractor
+from pdfminer.converter import TextConverter
+from pdfminer.cmapdb import CMapDB
+from pdfminer.layout import LAParams
+
 
 class GenericDisclogScraper(object):
-        __metaclass__ = abc.ABCMeta
-	agencyID = None
-	disclogURL = None
-	def remove_control_chars(self, input):
-		return "".join([i for i in input if ord(i) in range(32, 127)])
-        def getAgencyID(self):
-                """ disclosr agency id """
-		if self.agencyID == None:
-			self.agencyID = os.path.basename(sys.argv[0]).replace(".py","")
-                return self.agencyID
-
-        def getURL(self):
-                """ disclog URL"""
-		if self.disclogURL == None:
-			agency = scrape.agencydb.get(self.getAgencyID())
-			self.disclogURL = agency['FOIDocumentsURL']
-                return self.disclogURL
-
-	@abc.abstractmethod
-	def doScrape(self):
-		""" do the scraping """
-		return
-
-	@abc.abstractmethod
-        def getDescription(self, content, entry, doc):
-                """ get description"""
-		return
-
+    __metaclass__ = abc.ABCMeta
+    agencyID = None
+    disclogURL = None
+
+    def remove_control_chars(self, input):
+        """Return ``input`` with everything but printable ASCII removed.
+
+        Only characters whose code point is in 32..126 inclusive are kept,
+        so control characters, DEL (127) and all non-ASCII characters are
+        dropped.
+        """
+        # A chained comparison avoids rebuilding range(32, 127) (a real list
+        # under Python 2) and scanning it once per character.
+        return "".join([i for i in input if 32 <= ord(i) < 127])
+
+    def getAgencyID(self):
+        """Return the disclosr agency id, deriving it on first use.
+
+        When ``agencyID`` is unset it is taken from the base name of the
+        running script (``sys.argv[0]``) with every ".py" removed, and the
+        result is cached on the instance.
+        """
+        if self.agencyID is not None:
+            return self.agencyID
+        script_name = os.path.basename(sys.argv[0])
+        self.agencyID = script_name.replace(".py", "")
+        return self.agencyID
+
+    def getURL(self):
+        """Return the disclosure log URL, looking it up on first use.
+
+        When ``disclogURL`` is unset, the agency record is fetched from
+        ``scrape.agencydb`` by agency id and its 'FOIDocumentsURL' value is
+        cached on the instance.
+        """
+        if self.disclogURL is not None:
+            return self.disclogURL
+        agency_record = scrape.agencydb.get(self.getAgencyID())
+        self.disclogURL = agency_record['FOIDocumentsURL']
+        return self.disclogURL
+
+    @abc.abstractmethod
+    def doScrape(self):
+        """Run the scrape; declared abstract, so subclasses override it."""
+
+class GenericHTMLDisclogScraper(GenericDisclogScraper):
+    """Scraper for disclosure logs published as free-form HTML.
+
+    The page is not parsed into entries; a single "Disclosure Log Updated"
+    record is stored whenever the hash of the fetched content is not
+    already in the database.
+    """
+
+    def doScrape(self):
+        """Fetch the log page and save a record if its content is new.
+
+        The record's description is a fixed notice. When
+        scrape.getLastAttachment() returns an earlier copy of the page, an
+        HTML diff table of old versus new lines is appended to it.
+        """
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
+             self.getURL(), "foidocuments", self.getAgencyID())
+        content = rcontent.read()
+        dochash = scrape.mkhash(content)
+        if foidocsdb.get(dochash) is not None:
+            print "already saved"
+            return
+
+        print "saving " + dochash
+        description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
+        last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
+        # Identity test: "!= None" would dispatch to the object's __ne__.
+        if last_attach is not None:
+            html_diff = difflib.HtmlDiff()
+            description = description + "\nChanges: "
+            description = description + html_diff.make_table(
+                last_attach.read().split('\n'), content.split('\n'))
+        edate = date.today().strftime("%Y-%m-%d")
+        doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+            'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated",
+            "description": description}
+        foidocsdb.save(doc)
+
+class GenericPDFDisclogScraper(GenericDisclogScraper):
+    """Scraper for disclosure logs published as a single PDF.
+
+    The PDF's extracted text becomes the description of one
+    "Disclosure Log Updated" record, keyed by the hash of that text.
+    """
+
+    def doScrape(self):
+        """Fetch the PDF, extract its text and save a record if it is new."""
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+             self.getURL(), "foidocuments", self.getAgencyID())
+        laparams = LAParams()
+        rsrcmgr = PDFResourceManager(caching=True)
+        outfp = StringIO()
+        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
+             laparams=laparams)
+        # Build the buffer from the data so it starts at offset 0 instead of
+        # being left at EOF after a write().
+        fp = StringIO(content.read())
+        try:
+            process_pdf(rsrcmgr, device, fp, set(), caching=True,
+                 check_extractable=True)
+            description = outfp.getvalue()
+        finally:
+            # Release the buffers even when PDF processing raises.
+            fp.close()
+            device.close()
+            outfp.close()
+        dochash = scrape.mkhash(description)
+        if foidocsdb.get(dochash) is not None:
+            print "already saved"
+            return
+
+        print "saving " + dochash
+        edate = date.today().strftime("%Y-%m-%d")
+        doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+            'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated",
+            "description": description}
+        foidocsdb.save(doc)
+
+
+
+
+class GenericDOCXDisclogScraper(GenericDisclogScraper):
+    """Scraper for disclosure logs published as a single .docx file.
+
+    The document's paragraph text becomes the description of one
+    "Disclosure Log Updated" record, keyed by the hash of that text.
+    """
+
+    # Namespace of the elements in word/document.xml, in ElementTree's
+    # "{uri}tag" notation.
+    WORD_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
+
+    def _paragraph_texts(self, document):
+        """Return the text of every non-empty w:p element under ``document``.
+
+        Within a paragraph, w:t text is concatenated in document order and
+        each w:tab contributes a tab character.
+        """
+        text_tag = self.WORD_NS + 't'
+        tab_tag = self.WORD_NS + 'tab'
+        paragraphs = []
+        for para in document.iter(self.WORD_NS + 'p'):
+            pieces = []
+            for node in para.iter():
+                if node.tag == text_tag:
+                    if node.text:
+                        pieces.append(node.text)
+                elif node.tag == tab_tag:
+                    pieces.append('\t')
+            paratext = ''.join(pieces)
+            if paratext:
+                paragraphs.append(paratext)
+        return paragraphs
+
+    def doScrape(self):
+        """Fetch the .docx, extract its text and save a record if it is new."""
+        # Imported here because the module does not import them at the top.
+        import zipfile
+        import xml.etree.ElementTree as etree
+
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
+        , self.getURL(), "foidocuments", self.getAgencyID())
+        # ZipFile needs a seekable file object holding the fetched bytes;
+        # the previous code passed the builtin ``file`` type by mistake.
+        mydoc = zipfile.ZipFile(StringIO(content.read()))
+        try:
+            xmlcontent = mydoc.read('word/document.xml')
+        finally:
+            mydoc.close()
+        document = etree.fromstring(xmlcontent)
+        # UTF-8 encode each paragraph and separate paragraphs by a blank line.
+        newparatextlist = [paratext.encode("utf-8")
+            for paratext in self._paragraph_texts(document)]
+        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
+        dochash = scrape.mkhash(description)
+        if foidocsdb.get(dochash) is not None:
+            print "already saved"
+            return
+
+        print "saving " + dochash
+        # date.today(), as in the sibling scrapers; time() here resolved to
+        # datetime.time, which carries no calendar date.
+        edate = date.today().strftime("%Y-%m-%d")
+        doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+            'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated",
+            "description": description}
+        foidocsdb.save(doc)
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
 
-       	def doScrape(self):
-               	foidocsdb = scrape.couch['disclosr-foidocuments']
-                (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-		feed = feedparser.parse(content)		
-		for entry in feed.entries:
-			#print entry
-			print entry.id
-			hash = scrape.mkhash(entry.id)
-			#print hash
-		  	doc = foidocsdb.get(hash)
-			#print doc
-			if doc == None:
-                        	print "saving "+ hash
-				edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
-                                doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
-                                "date": edate,"title": entry.title}
-				self.getDescription(entry,entry, doc)
-                                foidocsdb.save(doc)
+        def doScrape(self):
+            """Save one record per feed entry that is not yet in the database.
+
+            Entries are keyed by the hash of ``entry.id``; the record date is
+            the entry's published time formatted as YYYY-MM-DD, and
+            getDescription() fills in the description before saving.
+            """
+            foidocsdb = scrape.couch['disclosr-foidocuments']
+            (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+                 self.getURL(), "foidocuments", self.getAgencyID())
+            feed = feedparser.parse(content)
+            for entry in feed.entries:
+                print entry.id
+                dochash = scrape.mkhash(entry.id)
+                doc = foidocsdb.get(dochash)
+                if doc is None:
+                    print "saving " + dochash
+                    edate = datetime.fromtimestamp(
+                        mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+                    doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                        'url': entry.link, 'docID': entry.id,
+                        "date": edate, "title": entry.title}
+                    self.getDescription(entry, entry, doc)
+                    foidocsdb.save(doc)
+                else:
+                    print "already saved"
+
+        # Defined at method level: nested inside doScrape it was never
+        # reachable as self.getDescription.
+        def getDescription(self, content, entry, doc):
+            """Copy the entry's ``summary`` into ``doc['description']``."""
+            doc.update({'description': content.summary})
+
+
+
+
+class GenericOAICDisclogScraper(GenericDisclogScraper):
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractmethod
+    def getColumns(self, columns):
+        """Return the row's cells, rearranged if required.
+
+        doScrape unpacks the result as (id, date, title, description, notes).
+        """
+
+    def getColumnCount(self):
+        """Return how many cells a table row must have to be recognised.
+
+        doScrape compares this with a row's <td> count (data row) and its
+        <th> count (header row). The value returned here is 5.
+        """
+        expected_cells = 5
+        return expected_cells
+
+    def getDescription(self, content, entry, doc):
+        """Store the cell's text in ``doc['description']``.
+
+        Every string from ``content.stripped_strings`` is prefixed with
+        " \\n" and the results are concatenated; a cell with no strings
+        yields an empty description.
+        """
+        # join() builds the text in one pass instead of repeated "+".
+        descriptiontxt = "".join(
+            [" \n" + string for string in content.stripped_strings])
+        doc.update({'description': descriptiontxt})
+
+    def getTitle(self, content, entry, doc):
+        """Store the cell's stripped strings, run together, as the title."""
+        title_text = ''.join(content.stripped_strings)
+        doc.update({'title': title_text})
+
+    def getTable(self, soup):
+        """Return the table to scrape; this implementation uses soup.table."""
+        disclosure_table = soup.table
+        return disclosure_table
+
+    def getRows(self, table):
+        """Return every 'tr' element found under ``table``."""
+        rows = table.find_all('tr')
+        return rows
+
+    def getDate(self, content, entry, doc):
+        """Parse the date cell and store it in ``doc['date']`` as YYYY-MM-DD.
+
+        Only the text before the first "(" is used; the misspelling
+        "Octber" is corrected and non-printable characters are removed
+        before the text is parsed day-first in fuzzy mode. Both the cleaned
+        text and the resulting date are printed.
+        """
+        raw_text = ''.join(content.stripped_strings).strip()
+        before_paren = raw_text.partition("(")[0]
+        cleaned = self.remove_control_chars(
+            before_paren.replace("Octber", "October"))
+        print cleaned
+        edate = parse(cleaned, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
+
+    def getLinks(self, content, entry, doc):
+        """Collect the row's link targets into ``doc['links']``.
+
+        Each <a> under ``entry`` that has an href attribute is passed, with
+        ``content``, to scrape.fullurl(). ``doc`` is left untouched when no
+        link is found.
+        """
+        # Tag.get() replaces has_key(), which bs4 has deprecated.
+        links = [scrape.fullurl(content, atag['href'])
+            for atag in entry.find_all("a")
+            if atag.get('href') is not None]
+        if links:
+            doc.update({'links': links})
+
+    def doScrape(self):
+        """Scrape the rows of the agency's disclosure-log table into CouchDB.
+
+        Fetches the log page and, for HTML/XML mime types, walks the rows of
+        the table returned by getTable(). A row with getColumnCount() <td>
+        cells is keyed by a hash of the page URL plus its id text (its date
+        text when ``id.string`` is None) and processed only if that key is
+        not in the database yet. Header rows and rows with an unexpected
+        cell count are reported on stdout.
+        """
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        if content is not None:
+            # http://www.crummy.com/software/BeautifulSoup/documentation.html
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+                print "parsing"
+                soup = BeautifulSoup(content)
+                table = self.getTable(soup)
+                for row in self.getRows(table):
+                    columns = row.find_all('td')
+                    # Compare by value; "is" tests object identity and only
+                    # happens to work for ints the interpreter caches.
+                    if len(columns) == self.getColumnCount():
+                        (id, date, title,
+                        description, notes) = self.getColumns(columns)
+                        print self.remove_control_chars(
+                            ''.join(id.stripped_strings))
+                        if id.string is None:
+                            dochash = scrape.mkhash(
+                                self.remove_control_chars(
+                                    url + (''.join(date.stripped_strings))))
                         else:
-                        	print "already saved"			
-        def getDescription(self, content, entry, doc):
-                """ get description from rss entry"""
-                doc.update({'description': content.summary})
-		return
-
-class GenericOAICDisclogScraper(GenericDisclogScraper):
-        __metaclass__ = abc.ABCMeta
-	@abc.abstractmethod
-	def getColumns(self,columns):
-		""" rearranges columns if required """
-		return
-        def getColumnCount(self):
-                return 5
-        def getDescription(self, content, entry, doc):
-                """ get description from rss entry"""
-		descriptiontxt = ""
-		for string in content.stripped_strings:
-                	descriptiontxt = descriptiontxt + " \n" + string
-                doc.update({'description': descriptiontxt})
-		return
-        def getTitle(self, content, entry, doc):
-                doc.update({'title': (''.join(content.stripped_strings))})
-		return
-	def getTable(self, soup):
-		return soup.table
-	def getRows(self, table):
-		return table.find_all('tr')
-	def getDate(self, content, entry, doc):
-		date = ''.join(content.stripped_strings).strip()
-		date = date.replace("Octber","October")
-		print date
-		edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
-		print edate
-		doc.update({'date': edate})
-		return
-	def getLinks(self, content, entry, doc):
-                links = []
-                for atag in entry.find_all("a"):
-                       	if atag.has_key('href'):
-                               	links.append(scrape.fullurl(content,atag['href']))
-                if links != []:
-	                doc.update({'links': links})
-		return
-
-	def doScrape(self):
-		foidocsdb = scrape.couch['disclosr-foidocuments']
-		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-		if content != None:
-			if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
-			# http://www.crummy.com/software/BeautifulSoup/documentation.html
-				soup = BeautifulSoup(content)
-				table = self.getTable(soup)
-				for row in self.getRows(table):
-					columns = row.find_all('td')
-					if len(columns) == self.getColumnCount():
-						(id, date, title, description, notes) = self.getColumns(columns)
-						print ''.join(id.stripped_strings)
-						if id.string == None:
-							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
-						else:
-							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
-						doc = foidocsdb.get(hash)
-							
-						if doc == None:
-							print "saving " +hash
-							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))}
-							self.getLinks(self.getURL(),row,doc)
-                                			self.getTitle(title,row, doc)
-                                			self.getDate(date,row, doc)
-							self.getDescription(description,row, doc)
-							if notes != None:
-                                        			doc.update({ 'notes': (''.join(notes.stripped_strings))})
-							foidocsdb.save(doc)
-						else:
-							print "already saved "+hash
-					
-					elif len(row.find_all('th')) == self.getColumnCount():
-						print "header row"
-					
-					else:
-						print "ERROR number of columns incorrect"
-						print row
-
+                            dochash = scrape.mkhash(
+                                self.remove_control_chars(
+                                    url + (''.join(id.stripped_strings))))
+                        doc = foidocsdb.get(dochash)
+
+                        if doc is None:
+                            print "saving " + dochash
+                            doc = {'_id': dochash,
+                            'agencyID': self.getAgencyID(),
+                            'url': self.getURL(),
+                            'docID': (''.join(id.stripped_strings))}
+                            self.getLinks(self.getURL(), row, doc)
+                            self.getTitle(title, row, doc)
+                            self.getDate(date, row, doc)
+                            self.getDescription(description, row, doc)
+                            if notes is not None:
+                                doc.update({ 'notes': (
+                                    ''.join(notes.stripped_strings))})
+                            # Rows whose title is one of these are not saved
+                            # (they look like column headings or pager text
+                            # rather than real entries).
+                            badtitles = [
+                                '-',
+                                'Summary of FOI Request',
+                                'FOI request(in summary form)',
+                                'Summary of FOI request received by the ASC',
+                                'Summary of FOI request received by agency/minister',
+                                'Description of Documents Requested',
+                                'FOI request',
+                                'Description of FOI Request',
+                                'Summary of request',
+                                'Description',
+                                'Summary',
+                                'Summary of FOIrequest received by agency/minister',
+                                'Summary of FOI request received',
+                                'Description of    FOI Request',
+                                'Results 1 to 67 of 67',
+                            ]
+                            if (doc['title'] not in badtitles
+                                    and doc['description'] != ''):
+                                print "saving"
+                                foidocsdb.save(doc)
+                        else:
+                            print "already saved " + dochash
+
+                    # Compare by value; "is" tests object identity.
+                    elif len(row.find_all('th')) == self.getColumnCount():
+                        print "header row"
+
+                    else:
+                        print "ERROR number of columns incorrect"
+                        print row
+