--- a/documents/scrape.py +++ b/documents/scrape.py @@ -90,7 +90,7 @@ def getLastAttachment(docsdb, url): hash = mkhash(url) doc = docsdb.get(hash) - if doc != None: + if doc != None and "_attachments" in doc.keys(): last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) return last_attachment @@ -112,10 +112,15 @@ else: if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): print "Uh oh, trying to scrape URL again too soon!" + hash - last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc, last_attachment_fname) - content = last_attachment - return (doc['url'], doc['mime_type'], content.read()) + if "_attachments" in doc.keys(): + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) + content = last_attachment.read() + mime_type = doc['mime_type'] + else: + content = None + mime_type = None + return (doc['url'], mime_type, content) req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags