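Fix scraping: compare MIME types with == instead of `is`, and use the computed dochash (not `hash`) as the document ID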
Former-commit-id: c96cc5c23e3497cb03991f1ee4e2990548817cf3
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -196,10 +196,9 @@
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
self.getURL(), "foidocuments", self.getAgencyID())
if content is not None:
- if mime_type is "text/html"\
- or mime_type is "application/xhtml+xml"\
- or mime_type is"application/xml":
+ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
# http://www.crummy.com/software/BeautifulSoup/documentation.html
+ print "parsing"
soup = BeautifulSoup(content)
table = self.getTable(soup)
for row in self.getRows(table):
@@ -217,11 +216,11 @@
dochash = scrape.mkhash(
self.remove_control_chars(
url + (''.join(id.stripped_strings))))
- doc = foidocsdb.get(hash)
+ doc = foidocsdb.get(dochash)
if doc is None:
- print "saving " + hash
- doc = {'_id': hash,
+ print "saving " + dochash
+ doc = {'_id': dochash,
'agencyID': self.getAgencyID(),
'url': self.getURL(),
'docID': (''.join(id.stripped_strings))}