From: maxious
Date: Sun, 11 Mar 2012 10:38:25 +0000
Subject: Scraper updates
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=4c1fd2637ff0b9a5342c9e1986ac5f706de4a6de
---
Scraper updates

Former-commit-id: ffe2d04341e6cde9180b829dd7eb40c585c67494
---

--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
     print "Fetching %s" % url
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
         print "Not a valid HTTP url"
-        return (None,None)
+        return (None,None,None)
     doc = docsdb.get(hash)
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
@@ -86,13 +86,14 @@
             print "Uh oh, trying to scrape URL again too soon!"
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            return (doc['mime_type'],last_attachment)
+            return (doc['url'],doc['mime_type'],last_attachment)
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (None,None)
+            return (None,None,None)

     time.sleep(3) # wait 3 seconds to give webserver time to recover

+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
     if doc.has_key('etag'):
         req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
     opener = urllib2.build_opener(NotModifiedHandler())
     try:
         url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
         headers = url_handle.info() # the addinfourls have the .info() too
         doc['etag'] = headers.getheader("ETag")
         doc['last_modified'] = headers.getheader("Last-Modified")
         doc['date'] = headers.getheader("Date")
         doc['page_scraped'] = time.time()
         doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
         doc['powered_by'] = headers.getheader("X-Powered-By")
         doc['file_size'] = headers.getheader("Content-Length")
         content_type = headers.getheader("Content-Type")
@@ -119,13 +122,13 @@
         if hasattr(url_handle, 'code'):
             if url_handle.code == 304:
                 print "the web page has not been modified"
-                return (None,None)
+                return (None,None,None)
             else:
                 content = url_handle.read()
                 docsdb.save(doc)
                 doc = docsdb.get(hash) # need to get a _rev
                 docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
-                return (doc['mime_type'], content)
+                return (doc['url'], doc['mime_type'], content)
             #store as attachment epoch-filename
     except urllib2.URLError as e:
         error = ""
@@ -136,21 +139,22 @@
         print error
         doc['error'] = error
         docsdb.save(doc)
-        return (None,None)
+        return (None,None,None)


 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-    (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
-    if content != None and depth > 0:
+    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
         if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
             soup = BeautifulSoup(content)
-            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
             for nav in navIDs:
                 print "Removing element", nav['id']
                 nav.extract()
-            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
             for nav in navClasses:
                 print "Removing element", nav['class']
                 nav.extract()
@@ -169,7 +173,10 @@
                     # not http
                     None
                 else:
-                    linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+                    # remove anchors and spaces in urls
+                    link['href'] = link['href'].replace(" ","%20")
+                    link['href'] = re.sub('#.*$','',link['href'])
+                    linkurls.add(urljoin(url,link['href']))
             for linkurl in linkurls:
                 #print linkurl
                 scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
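
The new (url, mime_type, content) return shape and the extra request/response headers all hang off urllib2's conditional-GET flow. Below is a minimal sketch of that flow, not the committed fetchURL: fetch() and its doc argument are illustrative names, and NotModifiedHandler is assumed to follow the standard urllib2 recipe (scrape.py's own handler may differ).

import urllib2

# Standard recipe: hand a 304 back as a normal response object so callers
# can test url_handle.code instead of catching an exception.
class NotModifiedHandler(urllib2.BaseHandler):
    def http_error_304(self, req, fp, code, msg, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl

def fetch(url, doc):
    req = urllib2.Request(url)
    # send the spider's User-Agent string, as the commit now does
    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
    # replay caching hints from the previous scrape so the server can answer 304
    if doc.get('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.get('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])
    url_handle = urllib2.build_opener(NotModifiedHandler()).open(req)
    doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
    headers = url_handle.info()
    doc['etag'] = headers.getheader("ETag")
    doc['last_modified'] = headers.getheader("Last-Modified")
    doc['via'] = headers.getheader("Via")
    if getattr(url_handle, 'code', None) == 304:
        return (None, None, None) # unchanged since the last scrape
    return (doc['url'], headers.getheader("Content-Type"), url_handle.read())

Returning geturl() rather than the requested URL is what lets scrapeAndStore compare the final, post-redirect location against its skip-list.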
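The scrapeAndStore changes amount to stripping more navigation chrome before harvesting links and normalising each href before it is queued. A compact sketch of that link-extraction step, again not the committed code: extract_link_urls() and NAV_PATTERN are illustrative names, and consulting badURLs with "not in" generalises the hardcoded comparison in the commit.

import re
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup

NAV_PATTERN = re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')
badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]

def extract_link_urls(url, content):
    soup = BeautifulSoup(content)
    # drop navigation/boilerplate elements so their links are never followed
    for nav in soup.findAll(id=NAV_PATTERN) + soup.findAll(attrs={'class': NAV_PATTERN}):
        nav.extract()
    linkurls = set()
    for link in soup.findAll('a', href=True):
        href = link['href']
        if href.startswith("mailto") or href.startswith("javascript"):
            continue # not http
        href = href.replace(" ", "%20") # encode spaces
        href = re.sub('#.*$', '', href) # remove anchors
        absolute = urljoin(url, href)
        if absolute not in badURLs:
            linkurls.add(absolute)
    return linkurls

Resolving with urljoin and collecting into a set keeps the recursion in scrapeAndStore from revisiting the same page under relative or fragment-only variants of its URL.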