From: Maxious <maxious@lambdacomplex.org>
Date: Mon, 03 Dec 2012 12:05:38 +0000
Subject: fix scraping
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=6b5dfd7e2271af6e25ef0285597ae9749a9c6392
---
fix scraping


Former-commit-id: c96cc5c23e3497cb03991f1ee4e2990548817cf3
---


--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -196,10 +196,9 @@
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
             self.getURL(), "foidocuments", self.getAgencyID())
         if content is not None:
-            if mime_type is "text/html"\
-            or mime_type is "application/xhtml+xml"\
-            or mime_type is"application/xml":
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
                 soup = BeautifulSoup(content)
                 table = self.getTable(soup)
                 for row in self.getRows(table):
@@ -217,11 +216,11 @@
                             dochash = scrape.mkhash(
                                 self.remove_control_chars(
                                     url + (''.join(id.stripped_strings))))
-                        doc = foidocsdb.get(hash)
+                        doc = foidocsdb.get(dochash)
 
                         if doc is None:
-                            print "saving " + hash
-                            doc = {'_id': hash,
+                            print "saving " + dochash
+                            doc = {'_id': dochash,
                             'agencyID': self.getAgencyID(),
                             'url': self.getURL(),
                             'docID': (''.join(id.stripped_strings))}