--- a/documents/scrape.py +++ b/documents/scrape.py @@ -82,7 +82,7 @@ url = canonurl(url) hash = mkhash(url) req = urllib2.Request(url) - print "Fetching %s" % url + print "Fetching %s (%s)" % (url,hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": print "Not a valid HTTP url" return (None,None,None) @@ -94,7 +94,8 @@ print "Uh oh, trying to scrape URL again too soon!" last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc,last_attachment_fname) - return (doc['url'],doc['mime_type'],last_attachment.read()) + content = last_attachment + return (doc['url'],doc['mime_type'],content) if scrape_again == False: print "Not scraping this URL again as requested" return (None,None,None) @@ -189,7 +190,7 @@ scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) #couch = couchdb.Server('http://192.168.1.148:5984/') -couch = couchdb.Server('http://192.168.1.148:5984/') +couch = couchdb.Server('http://127.0.0.1:5984/') # select database agencydb = couch['disclosr-agencies'] docsdb = couch['disclosr-documents']