scrapers work
[disclosr.git] / documents / scrape.py
blob:a/documents/scrape.py -> blob:b/documents/scrape.py
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -86,7 +86,7 @@
 		print "Uh oh, trying to scrape URL again too soon!"
 		last_attachment_fname = doc["_attachments"].keys()[-1]
 		last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-		return (doc['url'],doc['mime_type'],last_attachment)
+		return (doc['url'],doc['mime_type'],last_attachment.read())
 	if scrape_again == False:
 		print "Not scraping this URL again as requested"
 		return (None,None,None)
@@ -182,24 +182,23 @@
 		scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)    
 
 couch = couchdb.Server('http://127.0.0.1:5984/')
-
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']
 
-for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-    agency = agencydb.get(row.id)
-    print agency['name']
-    for key in agency.keys():
-	if key == 'website':
-    		scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-	if key.endswith('URL'):
-		print key 
-		depth = 1
-		if 'scrapeDepth' in agency.keys():
-			depth = agency['scrapeDepth']
-   		scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
-
-    agency['metadata']['lastScraped'] = time.time()
-    agencydb.save(agency)
-
+if __name__ == "__main__":
+	for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+		agency = agencydb.get(row.id)
+		print agency['name']
+		for key in agency.keys():
+			if key == 'website':
+				scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+			if key.endswith('URL'):
+				print key 
+				depth = 1
+				if 'scrapeDepth' in agency.keys():
+					depth = agency['scrapeDepth']
+				scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
+		agency['metadata']['lastScraped'] = time.time()
+		agencydb.save(agency)
+