pdf scrapers
[disclosr.git] / documents / scrape.py
blob:a/documents/scrape.py -> blob:b/documents/scrape.py
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -76,6 +76,16 @@
         addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
         addinfourl.code = code
         return addinfourl
+
+def getLastAttachment(docsdb,url):
+    hash = mkhash(url)
+    doc = docsdb.get(hash)
+    if doc != None:
+        last_attachment_fname = doc["_attachments"].keys()[-1]
+        last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+        return last_attachment
+    else:
+        return None
 
 def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
     url = canonurl(url)
@@ -94,10 +104,10 @@
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
             content = last_attachment
-            return (doc['url'],doc['mime_type'],content)
+            return (doc['url'],doc['mime_type'],content.read())
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (None,None,None)
+            return (doc['url'],doc['mime_type'],getLastAttachment(docsdb,url).read())
 
     req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
@@ -131,7 +141,7 @@
                 last_attachment_fname = doc["_attachments"].keys()[-1]
                 last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
                 content = last_attachment
-                return (doc['url'],doc['mime_type'],content)
+                return (doc['url'],doc['mime_type'],content.read())
             else:
                 print "new webpage loaded"
                 content = url_handle.read()