--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -82,7 +82,7 @@
     url = canonurl(url)
     hash = mkhash(url)
     req = urllib2.Request(url)
-    print "Fetching %s" % url
+    print "Fetching %s (%s)" % (url,hash)
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
         print "Not a valid HTTP url"
         return (None,None,None)
@@ -94,7 +94,8 @@
             print "Uh oh, trying to scrape URL again too soon!"
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            return (doc['url'],doc['mime_type'],last_attachment.read())
+            content = last_attachment
+            return (doc['url'],doc['mime_type'],content)
         if scrape_again == False:
             print "Not scraping this URL again as requested"
             return (None,None,None)
@@ -189,7 +190,7 @@
                 scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://192.168.1.148:5984/')
+couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']
@@ -203,12 +204,12 @@
             scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
         if key == 'website' and False:
             scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+            agency['metadata']['lastScraped'] = time.time()
         if key.endswith('URL') and False:
             print key
             depth = 1
             if 'scrapeDepth' in agency.keys():
                 depth = agency['scrapeDepth']
             scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
-    agency['metadata']['lastScraped'] = time.time()
     agencydb.save(agency)
 