Merge branch 'master' of ssh://maxious.lambdacomplex.org/git/disclosr
diff --git a/documents/genericScrapers.py b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,5 +1,6 @@
 import sys
 import os
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
@@ -51,12 +52,12 @@
         """ do the scraping """
         return
 
+
 class GenericHTMLDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
-             self.getURL(), "foidocuments", self.getAgencyID())
+            self.getURL(), "foidocuments", self.getAgencyID())
         content = rcontent
         dochash = scrape.mkhash(content)
         doc = foidocsdb.get(dochash)
@@ -66,33 +67,33 @@
             last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
+            diff = ''
             if last_attach != None:
                 html_diff = difflib.HtmlDiff()
-                #description = description + "\nChanges: "
-                #description = description + html_diff.make_table(last_attach.read().split('\n'),
-                #           content.split('\n'))
+                diff = html_diff.make_table(last_attach.read().split('\n'),
+                    content.split('\n'))
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description), "diff": diff}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
+
 class GenericPDFDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-             self.getURL(), "foidocuments", self.getAgencyID())
+            self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
         outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
-             laparams=laparams)
+            laparams=laparams)
         fp = StringIO()
         fp.write(content)
 
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
-             check_extractable=True)
+            check_extractable=True)
         description = outfp.getvalue()
         fp.close()
         device.close()
@@ -104,18 +104,17 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 
 class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
-        , self.getURL(), "foidocuments", self.getAgencyID())
+            , self.getURL(), "foidocuments", self.getAgencyID())
         mydoc = zipfile.ZipFile(StringIO(content))
         xmlcontent = mydoc.read('word/document.xml')
         document = etree.fromstring(xmlcontent)
@@ -125,7 +124,7 @@
         newparatextlist = []
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
-        ## Print our documnts test with two newlines under each paragraph
+        ## Print our documents text with two newlines under each paragraph
         description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
@@ -134,42 +133,40 @@
             print "saving " + dochash
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-            , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+                   , 'url': self.getURL(), 'docID': dochash,
+                   "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
-
-        def doScrape(self):
-            foidocsdb = scrape.couch['disclosr-foidocuments']
-            (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-                 self.getURL(), "foidocuments", self.getAgencyID())
-            feed = feedparser.parse(content)
-            for entry in feed.entries:
-                #print entry
-                print entry.id
-                dochash = scrape.mkhash(entry.id)
-                doc = foidocsdb.get(dochash)
-                #print doc
-                if doc is None:
-                    print "saving " + dochash
-                    edate = datetime.fromtimestamp(
-                        mktime(entry.published_parsed)).strftime("%Y-%m-%d")
-                    doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
-                        'url': entry.link, 'docID': entry.id,
-                        "date": edate, "title": entry.title}
-                    self.getDescription(entry, entry, doc)
-                    foidocsdb.save(doc)
-                else:
-                    print "already saved"
-
-            def getDescription(self, content, entry, doc):
-                    """ get description from rss entry"""
-                    doc.update({'description': content.summary})
-            return
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        feed = feedparser.parse(content)
+        for entry in feed.entries:
+            #print entry
+            print entry.id
+            dochash = scrape.mkhash(entry.id)
+            doc = foidocsdb.get(dochash)
+            #print doc
+            if doc is None:
+                print "saving " + dochash
+                edate = datetime.fromtimestamp(
+                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                       'url': entry.link, 'docID': entry.id,
+                       "date": edate, "title": entry.title}
+                self.getDescription(entry, entry, doc)
+                foidocsdb.save(doc)
+            else:
+                print "already saved"
+
+    def getDescription(self, content, entry, doc):
+        """ get description from rss entry"""
+        doc.update({'description': content.summary})
 
 
 class GenericOAICDisclogScraper(GenericDisclogScraper):
@@ -187,7 +186,7 @@
         """ get description from rss entry"""
         descriptiontxt = ""
         for string in content.stripped_strings:
-                    descriptiontxt = descriptiontxt + " \n" + string
+            descriptiontxt = descriptiontxt + " \n" + string
         doc.update({'description': descriptiontxt})
 
     def getTitle(self, content, entry, doc):
@@ -202,7 +201,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012", "2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate
@@ -215,7 +214,7 @@
             if atag.has_key('href'):
                 links.append(scrape.fullurl(content, atag['href']))
         if links != []:
-                    doc.update({'links': links})
+            doc.update({'links': links})
         return
 
     def doScrape(self):
@@ -232,7 +231,7 @@
                     columns = row.find_all('td')
                     if len(columns) is self.getColumnCount():
                         (id, date, title,
-                        description, notes) = self.getColumns(columns)
+                         description, notes) = self.getColumns(columns)
                         print self.remove_control_chars(
                             ''.join(id.stripped_strings))
                         if id.string is None:
@@ -248,27 +247,29 @@
                         if doc is None:
                             print "saving " + dochash
                             doc = {'_id': dochash,
-                            'agencyID': self.getAgencyID(),
-                            'url': self.getURL(),
-                            'docID': (''.join(id.stripped_strings))}
+                                   'agencyID': self.getAgencyID(),
+                                   'url': self.getURL(),
+                                   'docID': (''.join(id.stripped_strings))}
                             self.getLinks(self.getURL(), row, doc)
                             self.getTitle(title, row, doc)
                             self.getDate(date, row, doc)
                             self.getDescription(description, row, doc)
                             if notes is not None:
-                                doc.update({ 'notes': (
+                                doc.update({'notes': (
                                     ''.join(notes.stripped_strings))})
-                            badtitles = ['-','Summary of FOI Request'
-                            , 'FOI request(in summary form)'
-                            , 'Summary of FOI request received by the ASC',
-'Summary of FOI request received by agency/minister',
-'Description of Documents Requested','FOI request',
-'Description of FOI Request','Summary of request','Description','Summary',
-'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of    FOI Request',"FOI request",'Results 1 to 67 of 67']
+                            badtitles = ['-', 'Summary of FOI Request',
+                                         'FOI request(in summary form)',
+                                         'Summary of FOI request received by the ASC',
+                                         'Summary of FOI request received by agency/minister',
+                                         'Description of Documents Requested', 'FOI request',
+                                         'Description of FOI Request', 'Summary of request', 'Description', 'Summary',
+                                         'Summary of FOIrequest received by agency/minister',
+                                         'Summary of FOI request received', 'Description of    FOI Request',
+                                         "FOI request", 'Results 1 to 67 of 67']
                             if doc['title'] not in badtitles\
                             and doc['description'] != '':
-                                                            print "saving"
-                                                            foidocsdb.save(doc)
+                                print "saving"
+                                foidocsdb.save(doc)
                         else:
                             print "already saved " + dochash