more scrapers
[disclosr.git] / documents / genericScrapers.py
blob:a/documents/genericScrapers.py -> blob:b/documents/genericScrapers.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -57,7 +57,7 @@
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
              self.getURL(), "foidocuments", self.getAgencyID())
-        content = rcontent.read()
+        content = rcontent
         dochash = scrape.mkhash(content)
         doc = foidocsdb.get(dochash)
         if doc is None:
@@ -72,7 +72,7 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description":  self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -89,7 +89,7 @@
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
              laparams=laparams)
         fp = StringIO()
-        fp.write(content.read())
+        fp.write(content)
 
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
              check_extractable=True)
@@ -104,7 +104,7 @@
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -202,7 +202,7 @@
     def getDate(self, content, entry, doc):
         date = ''.join(content.stripped_strings).strip()
         (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October"))
+        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
         print date
         edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
         print edate