Scrape required and chart of complied features views
[disclosr.git] / scrape.py
blob:a/scrape.py -> blob:b/scrape.py
--- a/scrape.py
+++ b/scrape.py
@@ -3,15 +3,6 @@
 import urllib2
 from BeautifulSoup import BeautifulSoup
 import re
-
-couch = couchdb.Server('http://192.168.1.148:5984/')
-
-# select database
-agencydb = couch['disclosr-agencies']
-
-for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-    agency = agencydb.get(row.id)
-    print agency['agencyName']
 
 #http://diveintopython.org/http_web_services/etags.html
 class NotModifiedHandler(urllib2.BaseHandler):  
@@ -21,9 +12,10 @@
         return addinfourl
 
 def scrapeAndStore(URL, depth, agency):
-    URL = "http://www.hole.fi/jajvirta/weblog/"
+    URL = "http://www.google.com"
     req = urllib2.Request(URL)
-    
+    etag = 'y'
+    last_modified = 'y'
     #if there is a previous version sotred in couchdb, load caching helper tags
     if etag:
         req.add_header("If-None-Match", etag)
@@ -39,7 +31,7 @@
     file_size = headers.getheader("Content-Length") 
     mime_type = headers.getheader("Content-Type") 
      
-    if hasattr(url_handle, 'code') 
+    if hasattr(url_handle, 'code'): 
         if url_handle.code == 304:
             print "the web page has not been modified"
         else: 
@@ -63,3 +55,22 @@
         #record/alert error to error database
     
     
+
+
+
+
+
+
+
+
+
+couch = couchdb.Server('http://192.168.1.148:5984/')
+
+# select database
+agencydb = couch['disclosr-agencies']
+
+for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+    agency = agencydb.get(row.id)
+    print agency['name']
+scrapeAndStore("A",1,1)
+