finished gazette parser
diff --git a/documents/scrape.py b/documents/scrape.py
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -7,14 +7,16 @@
 from urlparse import urljoin
 import time
 import os
+import sys
 import mimetypes
 import urllib
 import urlparse
 import socket
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-couch = couchdb.Server('http://192.168.1.113:5984/')
-#couch = couchdb.Server('http://127.0.0.1:5984/')
+#couch = couchdb.Server('http://192.168.1.113:5984/')
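+# default to the local CouchDB instance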
+couch = couchdb.Server('http://127.0.0.1:5984/')
 
 
 def mkhash(input):
@@ -89,7 +90,8 @@
 def getLastAttachment(docsdb, url):
     hash = mkhash(url)
     doc = docsdb.get(hash)
-    if doc != None:
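+    # a doc may exist without any attachments yet, so check before reading one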
+    if doc is not None and "_attachments" in doc:
         last_attachment_fname = doc["_attachments"].keys()[-1]
         last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
         return last_attachment
@@ -103,7 +104,7 @@
     req = urllib2.Request(url)
     print "Fetching %s (%s)" % (url, hash)
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
-        print "Not a valid HTTP url"
+        print >> sys.stderr, "Not a valid HTTP url"
         return (None, None, None)
     doc = docsdb.get(hash)
     if doc == None:
@@ -111,10 +112,16 @@
     else:
         if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)):
             print "Uh oh, trying to scrape URL again too soon!" + hash
-            last_attachment_fname = doc["_attachments"].keys()[-1]
-            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
-            content = last_attachment
-            return (doc['url'], doc['mime_type'], content.read())
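+            # reuse the cached attachment only if one exists and the last fetch was not empty (file_size == "0")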
+            if ('file_size' not in doc or doc["file_size"] != "0") and "_attachments" in doc:
+                last_attachment_fname = doc["_attachments"].keys()[-1]
+                last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
+                content = last_attachment.read()
+                mime_type = doc['mime_type']
+            else:
+                content = None
+                mime_type = None
+            return (doc['url'], mime_type, content)
 
     req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
@@ -159,13 +165,13 @@
                 #store as attachment epoch-filename
 
     except (urllib2.URLError, socket.timeout) as e:
-        print "error!"
+        print >> sys.stderr, "error!"
         error = ""
         if hasattr(e, 'reason'):
             error = "error %s in downloading %s" % (str(e.reason), url)
         elif hasattr(e, 'code'):
             error = "error %s in downloading %s" % (e.code, url)
-        print error
+        print >> sys.stderr, error
         doc['error'] = error
         docsdb.save(doc)
         return (None, None, None)
@@ -191,7 +197,8 @@
                 links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
                 linkurls = set([])
                 for link in links:
-                    if link.has_key("href"):
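+                    # BeautifulSoup 4 renamed Tag.has_key() to has_attr()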
+                    if link.has_attr("href"):
                         if link['href'].startswith("http"):
                             # lets not do external links for now
                             # linkurls.add(link['href'])