better date parser
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -82,7 +82,7 @@
 	url = canonurl(url)
 	hash = mkhash(url)
 	req = urllib2.Request(url)
-	print "Fetching %s" % url
+	print "Fetching %s (%s)" % (url,hash)
 	if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
 		print "Not a valid HTTP url"
 		return (None,None,None)
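The hunk above now logs the document hash alongside the URL being fetched. mkhash() and canonurl() are defined elsewhere in scrape.py and are not shown here; a minimal sketch of what the hashing step is assumed to do, plus a guard helper that tests for a None/empty URL before any string methods run (note that the validity check above only fires after urllib2.Request(url) has already been built), might look like the following. Both helper bodies are assumptions, not the file's actual implementation:

import hashlib

def mkhash(url):
	# Hypothetical sketch: assume mkhash derives a stable CouchDB
	# document id from the canonical URL; the real scrape.py may differ.
	return hashlib.sha1(url).hexdigest()

def is_fetchable(url):
	# Hypothetical guard: test for None/empty first so that
	# url.startswith() can never raise AttributeError on None.
	if url is None or url == "":
		return False
	return not (url.startswith("mailto") or
			url.startswith("javascript") or
			url.startswith("#"))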
@@ -94,7 +94,8 @@
 			print "Uh oh, trying to scrape URL again too soon!"
 			last_attachment_fname = doc["_attachments"].keys()[-1]
 			last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-			return (doc['url'],doc['mime_type'],last_attachment.read())
+			content = last_attachment
+			return (doc['url'],doc['mime_type'],content)
 		if scrape_again == False:
 			print "Not scraping this URL again as requested"
 			return (None,None,None)
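This hunk stops calling .read() on the cached attachment: couchdb-python's Database.get_attachment() returns a file-like object (or None when the attachment is missing), so the third element of the returned tuple is now a stream rather than a byte string. A hedged caller-side sketch of how the two shapes could be normalised; the helper name is hypothetical:

def read_cached_content(content):
	# Hypothetical helper: the third element of the returned
	# (url, mime_type, content) tuple may now be a file-like object
	# from docsdb.get_attachment() instead of bytes; read it if so.
	if content is not None and hasattr(content, 'read'):
		return content.read()
	return content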
@@ -203,12 +204,12 @@
 				scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
 			if key == 'website' and False:
 				scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+				agency['metadata']['lastScraped'] = time.time()
 			if key.endswith('URL') and False:
 				print key 
 				depth = 1
 				if 'scrapeDepth' in agency.keys():
 					depth = agency['scrapeDepth']
 				scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
-		agency['metadata']['lastScraped'] = time.time()
 		agencydb.save(agency)
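The final hunk moves the lastScraped bookkeeping out of the unconditional end of the per-agency loop and into the 'website' branch, which is currently disabled by its "and False" guard, so after this commit the timestamp is only refreshed once that branch is re-enabled. A sketch of how the stored epoch value could gate re-scraping; the one-day interval and the helper name are assumptions:

import time

ONE_DAY = 24 * 60 * 60  # hypothetical re-scrape interval

def due_for_scrape(agency):
	# Hypothetical helper: treat missing metadata as never scraped;
	# lastScraped holds the epoch float stored by time.time() above.
	last = agency.get('metadata', {}).get('lastScraped', 0)
	return time.time() - last > ONE_DAY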