[disclosr.git] / documents / scrape.py
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -82,7 +82,7 @@
 	url = canonurl(url)
 	hash = mkhash(url)
 	req = urllib2.Request(url)
-	print "Fetching %s" % url
+	print "Fetching %s (%s)" % (url,hash)
 	if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
 		print "Not a valid HTTP url"
 		return (None,None,None)
@@ -94,7 +94,8 @@
 			print "Uh oh, trying to scrape URL again too soon!"
 			last_attachment_fname = doc["_attachments"].keys()[-1]
 			last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-			return (doc['url'],doc['mime_type'],last_attachment.read())
+			content = last_attachment
+			return (doc['url'],doc['mime_type'],content)
 		if scrape_again == False:
 			print "Not scraping this URL again as requested"
 			return (None,None,None)