pdf scrapers
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -13,6 +13,8 @@
 from datetime import *
 import codecs
 
+import difflib
+
 from StringIO import StringIO
 
 from pdfminer.pdfparser import PDFDocument, PDFParser
@@ -49,6 +51,31 @@
         """ do the scraping """
         return
 
+class GenericHTMLDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
+             self.getURL(), "foidocuments", self.getAgencyID())
+        content = rcontent
+        dochash = scrape.mkhash(content)
+        doc = foidocsdb.get(dochash)
+        if doc is None:
+            print "saving " + dochash
+            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
+            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
+            if last_attach is not None:
+                html_diff = difflib.HtmlDiff()
+                description = description + "\nChanges: "
+                description = description + html_diff.make_table(last_attach.read().split('\n'),
+                           content.split('\n'))
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description)}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
 
 class GenericPDFDisclogScraper(GenericDisclogScraper):
 
@@ -62,7 +89,7 @@
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
              laparams=laparams)
         fp = StringIO()
-        fp.write(content.read())
+        fp.write(content)
 
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
              check_extractable=True)
@@ -77,7 +104,7 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -175,7 +202,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate
@@ -196,10 +223,9 @@
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
         if content is not None:
-            if mime_type is "text/html"\
-            or mime_type is "application/xhtml+xml"\
-            or mime_type is"application/xml":
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
@@ -217,11 +243,11 @@
                             dochash = scrape.mkhash(
                                 self.remove_control_chars(
                                     url + (''.join(id.stripped_strings))))
-                        doc = foidocsdb.get(hash)
+                        doc = foidocsdb.get(dochash)
 
                         if doc is None:
-                            print "saving " + hash
-                            doc = {'_id': hash,
+                            print "saving " + dochash
+                            doc = {'_id': dochash,
                             'agencyID': self.getAgencyID(),
                             'url': self.getURL(),
                             'docID': (''.join(id.stripped_strings))}