--- a/documents/scrape.py +++ b/documents/scrape.py @@ -82,7 +82,7 @@ url = canonurl(url) hash = mkhash(url) req = urllib2.Request(url) - print "Fetching %s" % url + print "Fetching %s (%s)" % (url,hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": print "Not a valid HTTP url" return (None,None,None) @@ -94,7 +94,8 @@ print "Uh oh, trying to scrape URL again too soon!" last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc,last_attachment_fname) - return (doc['url'],doc['mime_type'],last_attachment.read()) + content = last_attachment + return (doc['url'],doc['mime_type'],content) if scrape_again == False: print "Not scraping this URL again as requested" return (None,None,None)