From: Maxious
Date: Mon, 03 Dec 2012 12:05:38 +0000
Subject: fix scraping
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=6b5dfd7e2271af6e25ef0285597ae9749a9c6392
---
fix scraping

Former-commit-id: c96cc5c23e3497cb03991f1ee4e2990548817cf3
---

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -196,10 +196,9 @@
  (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
  self.getURL(), "foidocuments", self.getAgencyID())
  if content is not None:
- if mime_type is "text/html"\
- or mime_type is "application/xhtml+xml"\
- or mime_type is"application/xml":
+ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
  # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ print "parsing"
  soup = BeautifulSoup(content)
  table = self.getTable(soup)
  for row in self.getRows(table):
@@ -217,11 +216,11 @@
  dochash = scrape.mkhash(
  self.remove_control_chars(
  url + (''.join(id.stripped_strings))))
- doc = foidocsdb.get(hash)
+ doc = foidocsdb.get(dochash)
  if doc is None:
- print "saving " + hash
- doc = {'_id': hash,
+ print "saving " + dochash
+ doc = {'_id': dochash,
  'agencyID': self.getAgencyID(),
  'url': self.getURL(),
  'docID': (''.join(id.stripped_strings))}