pdf scrapers
diff --git a/documents/scrape.py b/documents/scrape.py
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -10,6 +10,8 @@
 import mimetypes
 import urllib
 import urlparse
+import socket
+
 
 def mkhash(input):
     return hashlib.md5(input).hexdigest().encode("utf-8")
@@ -77,6 +79,16 @@
         addinfourl.code = code
         return addinfourl
 
+def getLastAttachment(docsdb,url):
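+    # return the cached copy of a previously scraped URL as a file-like attachment object, or None if nothing is stored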
+    hash = mkhash(url)
+    doc = docsdb.get(hash)
+    if doc != None and "_attachments" in doc:
+        last_attachment_fname = doc["_attachments"].keys()[-1]
+        last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+        return last_attachment
+    else:
+        return None
+
 def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
     url = canonurl(url)
     hash = mkhash(url)
@@ -87,17 +99,17 @@
         return (None,None,None)
     doc = docsdb.get(hash)
     if doc == None:
-        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName, 'type': 'website'}
     else:
         if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
             print "Uh oh, trying to scrape URL again too soon!"+hash
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
             content = last_attachment
-            return (doc['url'],doc['mime_type'],content)
+            return (doc['url'],doc['mime_type'],content.read())
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (None,None,None)
+            content = getLastAttachment(docsdb, url)
+            return (doc['url'],doc['mime_type'],content.read())
 
     req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
@@ -108,7 +120,9 @@
 
     opener = urllib2.build_opener(NotModifiedHandler())
     try:
-        url_handle = opener.open(req)
+        #default_timeout = 12
+        #socket.setdefaulttimeout(default_timeout)
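+        # open with an explicit 3 second timeout rather than setting a process-wide socket default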
+        url_handle = opener.open(req,None,3)
         doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
         headers = url_handle.info() # the addinfourls have the .info() too
         doc['etag'] = headers.getheader("ETag")
@@ -131,7 +145,7 @@
                 last_attachment_fname = doc["_attachments"].keys()[-1]
                 last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
                 content = last_attachment
-                return (doc['url'],doc['mime_type'],content)
+                return (doc['url'],doc['mime_type'],content.read())
             else:
                 print "new webpage loaded"
                 content = url_handle.read()
@@ -141,7 +155,7 @@
                 return (doc['url'], doc['mime_type'], content)
                 #store as attachment epoch-filename
 
-    except urllib2.URLError as e:
+    except (urllib2.URLError, socket.timeout) as e:
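+            # socket.timeout is not a URLError subclass, so it has to be caught explicitly alongside URLError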
             print "error!"
             error = ""
             if hasattr(e, 'reason'):
@@ -192,20 +206,23 @@
                                scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
+#couch = couchdb.Server('http://127.0.0.1:5984/')
 # select database
 agencydb = couch['disclosr-agencies']
 docsdb = couch['disclosr-documents']
 
 if __name__ == "__main__":
-    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+    for row in agencydb.view('app/all'): #not recently scraped agencies view?
         agency = agencydb.get(row.id)
         print agency['name']
         for key in agency.keys():
-            if key == "FOIDocumentsURL" and "status" not in agency.keys:
+            if key == "FOIDocumentsURL" and "status" not in agency.keys() and False:
                 scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-            if key == 'website' and False:
+            if key == 'website' and True:
                 scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+                if "metadata" not in agency.keys():
+                    agency['metadata'] = {}
                 agency['metadata']['lastScraped'] = time.time()
             if key.endswith('URL') and False:
                 print key