--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -86,7 +86,7 @@
             print "Uh oh, trying to scrape URL again too soon!"
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            return (doc['url'],doc['mime_type'],last_attachment)
+            return (doc['url'],doc['mime_type'],last_attachment.read())
         if scrape_again == False:
             print "Not scraping this URL again as requested"
             return (None,None,None)
@@ -182,24 +182,23 @@
                 scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
 
 couch = couchdb.Server('http://127.0.0.1:5984/')
-# select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']
 
-for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-    agency = agencydb.get(row.id)
-    print agency['name']
-    for key in agency.keys():
-        if key == 'website':
-            scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-        if key.endswith('URL'):
-            print key
-            depth = 1
-            if 'scrapeDepth' in agency.keys():
-                depth = agency['scrapeDepth']
-            scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
-
-    agency['metadata']['lastScraped'] = time.time()
-    agencydb.save(agency)
-
+if __name__ == "__main__":
+    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+        agency = agencydb.get(row.id)
+        print agency['name']
+        for key in agency.keys():
+            if key == 'website':
+                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+            if key.endswith('URL'):
+                print key
+                depth = 1
+                if 'scrapeDepth' in agency.keys():
+                    depth = agency['scrapeDepth']
+                scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
+        agency['metadata']['lastScraped'] = time.time()
+        agencydb.save(agency)
+
 
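
The .read() change in the first hunk matches how couchdb-python's Database.get_attachment behaves: it returns a file-like object (or None when the attachment is missing), not the attachment bytes themselves, so a caller that wants raw content has to read it. A minimal sketch of that behaviour, assuming the same couchdb-python client scrape.py already uses; the document id below is a placeholder, not something taken from the patch:

    import couchdb

    couch = couchdb.Server('http://127.0.0.1:5984/')
    docsdb = couch['disclosr-documents']

    doc = docsdb.get('example-document-id')             # placeholder id for illustration
    if doc is not None and '_attachments' in doc:
        fname = doc['_attachments'].keys()[-1]          # same lookup scrape.py uses (Python 2 dict API)
        attachment = docsdb.get_attachment(doc, fname)  # file-like object, or None if missing
        if attachment is not None:
            content = attachment.read()                 # the raw bytes the returned tuple should carry

The second hunk wraps the scrape loop in an if __name__ == "__main__": guard, so the module can be imported for its scrapeAndStore helper without kicking off a crawl as a side effect; the couch, agencydb and docsdb handles stay at module level, as the context lines show.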