codestyle
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -13,11 +13,9 @@
 from datetime import *
 import codecs
 
+import difflib
+
 from StringIO import StringIO
-
-from docx import *
-from lxml import etree
-import zipfile
 
 from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
@@ -39,20 +37,45 @@
         """ disclosr agency id """
         if self.agencyID is None:
             self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
-            return self.agencyID
+        return self.agencyID
 
     def getURL(self):
         """ disclog URL"""
         if self.disclogURL is None:
             agency = scrape.agencydb.get(self.getAgencyID())
             self.disclogURL = agency['FOIDocumentsURL']
-            return self.disclogURL
+        return self.disclogURL
 
     @abc.abstractmethod
     def doScrape(self):
         """ do the scraping """
         return
 
+class GenericHTMLDisclogScraper(GenericDisclogScraper):
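+    """ Scraper for disclosure logs that have no parseable table: the raw
+        HTML is hashed, and any change to the page is recorded as a new
+        entry with an HTML diff against the last copy we fetched. """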
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
+             self.getURL(), "foidocuments", self.getAgencyID())
+        content = rcontent
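+        # hash the raw page so an unchanged log is recognised and skipped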
+        dochash = scrape.mkhash(content)
+        doc = foidocsdb.get(dochash)
+        if doc is None:
+            print "saving " + dochash
+            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
+            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
+            if last_attach is not None:
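+                # difflib.HtmlDiff renders a side-by-side HTML table of the changed lines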
+                html_diff = difflib.HtmlDiff()
+                description = description + "\nChanges: "
+                description = description + html_diff.make_table(last_attach.read().split('\n'),
+                           content.split('\n'))
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description)}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
 
 class GenericPDFDisclogScraper(GenericDisclogScraper):
 
@@ -62,14 +85,15 @@
              self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
-        outfp = StringIO.StringIO()
+        outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
              laparams=laparams)
-        fp = StringIO.StringIO()
+        fp = StringIO()
         fp.write(content)
-        description = output.getvalue()
+
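+        # pdfminer walks the PDF in fp and writes the extracted text into outfp via the TextConverter device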
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
              check_extractable=True)
+        description = outfp.getvalue()
         fp.close()
         device.close()
         outfp.close()
@@ -77,11 +101,10 @@
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -103,17 +126,16 @@
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
         ## Join our document's text with two newlines under each paragraph
-        description = '\n\n'.join(newparatextlist)
+        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
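+        # hash the extracted text so an unchanged document is not saved again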
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
 
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -180,7 +202,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
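+        # correct known typos in agency-entered dates before fuzzy parsing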
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate
@@ -201,10 +223,9 @@
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
         if content is not None:
-            if mime_type is "text/html"\
-            or mime_type is "application/xhtml+xml"\
-            or mime_type is"application/xml":
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
@@ -222,11 +243,11 @@
                             dochash = scrape.mkhash(
                                 self.remove_control_chars(
                                     url + (''.join(id.stripped_strings))))
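+                        # the row's hash doubles as the document id, so re-scrapes do not duplicate entries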
-                        doc = foidocsdb.get(hash)
+                        doc = foidocsdb.get(dochash)
 
                         if doc is None:
-                            print "saving " + hash
-                            doc = {'_id': hash,
+                            print "saving " + dochash
+                            doc = {'_id': dochash,
                             'agencyID': self.getAgencyID(),
                             'url': self.getURL(),
                             'docID': (''.join(id.stripped_strings))}