--- a/documents/scrape.py +++ b/documents/scrape.py @@ -7,6 +7,7 @@ from urlparse import urljoin import time import os +import sys import mimetypes import urllib import urlparse @@ -89,7 +90,7 @@ def getLastAttachment(docsdb, url): hash = mkhash(url) doc = docsdb.get(hash) - if doc != None: + if doc != None and "_attachments" in doc.keys(): last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc, last_attachment_fname) return last_attachment @@ -103,7 +104,7 @@ req = urllib2.Request(url) print "Fetching %s (%s)" % (url, hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": - print "Not a valid HTTP url" + print >> sys.stderr, "Not a valid HTTP url" return (None, None, None) doc = docsdb.get(hash) if doc == None: @@ -111,10 +112,15 @@ else: if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): print "Uh oh, trying to scrape URL again too soon!" + hash - last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc, last_attachment_fname) - content = last_attachment - return (doc['url'], doc['mime_type'], content.read()) + if "_attachments" in doc.keys(): + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) + content = last_attachment.read() + mime_type = doc['mime_type'] + else: + content = None + mime_type = None + return (doc['url'], mime_type, content) req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags @@ -159,13 +165,13 @@ #store as attachment epoch-filename except (urllib2.URLError, socket.timeout) as e: - print "error!" + print >> sys.stderr,"error!" error = "" if hasattr(e, 'reason'): error = "error %s in downloading %s" % (str(e.reason), url) elif hasattr(e, 'code'): error = "error %s in downloading %s" % (e.code, url) - print error + print >> sys.stderr, error doc['error'] = error docsdb.save(doc) return (None, None, None)