Former-commit-id: ffe2d04341e6cde9180b829dd7eb40c585c67494
--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
print "Fetching %s" % url
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
print "Not a valid HTTP url"
- return (None,None)
+ return (None,None,None)
doc = docsdb.get(hash)
if doc == None:
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
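
A standalone sketch of the scheme/empty-URL guard in this hunk. The None/empty test is placed first so url.startswith() is never reached with a None url; is_fetchable is a hypothetical helper name, not something defined in scrape.py.

# Standalone sketch of the guard above; the None/empty test comes first so
# url.startswith() is never called on None. is_fetchable is a hypothetical name.
def is_fetchable(url):
    if url == None or url == "":
        return False
    return not (url.startswith("mailto") or url.startswith("javascript") or url.startswith("#"))

print is_fetchable(None)                            # False
print is_fetchable("mailto:clerk@example.gov.au")   # False
print is_fetchable("http://example.gov.au/grants")  # True
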
@@ -86,13 +86,14 @@
print "Uh oh, trying to scrape URL again too soon!"
last_attachment_fname = doc["_attachments"].keys()[-1]
last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
- return (doc['mime_type'],last_attachment)
+ return (doc['url'],doc['mime_type'],last_attachment)
if scrape_again == False:
print "Not scraping this URL again as requested"
- return (None,None)
+ return (None,None,None)
time.sleep(3) # wait 3 seconds to give webserver time to recover
+ req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
#if there is a previous version stored in couchdb, load caching helper tags
if doc.has_key('etag'):
req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
opener = urllib2.build_opener(NotModifiedHandler())
try:
url_handle = opener.open(req)
+ doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
headers = url_handle.info() # the addinfourls have the .info() too
doc['etag'] = headers.getheader("ETag")
doc['last_modified'] = headers.getheader("Last-Modified")
doc['date'] = headers.getheader("Date")
doc['page_scraped'] = time.time()
doc['web_server'] = headers.getheader("Server")
+ doc['via'] = headers.getheader("Via")
doc['powered_by'] = headers.getheader("X-Powered-By")
doc['file_size'] = headers.getheader("Content-Length")
content_type = headers.getheader("Content-Type")
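
A minimal sketch of the bookkeeping added in this hunk: capture the post-redirect URL with geturl() and copy the same response headers into the doc dict. It uses plain urllib2.urlopen and an example.com stand-in URL, and omits the NotModifiedHandler opener used by scrape.py.

import urllib2, time

req = urllib2.Request("http://example.com/")        # stand-in URL
url_handle = urllib2.urlopen(req)
headers = url_handle.info()
doc = {}
doc['url'] = url_handle.geturl()                    # final URL if a redirect was followed
doc['etag'] = headers.getheader("ETag")
doc['last_modified'] = headers.getheader("Last-Modified")
doc['web_server'] = headers.getheader("Server")
doc['via'] = headers.getheader("Via")               # any intermediate proxy or cache
doc['file_size'] = headers.getheader("Content-Length")
doc['page_scraped'] = time.time()
print doc
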
@@ -119,13 +122,13 @@
if hasattr(url_handle, 'code'):
if url_handle.code == 304:
print "the web page has not been modified"
- return (None,None)
+ return (None,None,None)
else:
content = url_handle.read()
docsdb.save(doc)
doc = docsdb.get(hash) # need to get a _rev
docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
- return (doc['mime_type'], content)
+ return (doc['url'], doc['mime_type'], content)
#store as attachment epoch-filename
except urllib2.URLError as e:
error = ""
@@ -136,21 +139,22 @@
print error
doc['error'] = error
docsdb.save(doc)
- return (None,None)
+ return (None,None,None)
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
- (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
- if content != None and depth > 0:
+ (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+ badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+ if content != None and depth > 0 and url not in badURLs:
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
# http://www.crummy.com/software/BeautifulSoup/documentation.html
soup = BeautifulSoup(content)
- navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+ navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
for nav in navIDs:
print "Removing element", nav['id']
nav.extract()
- navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+ navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
for nav in navClasses:
print "Removing element", nav['class']
nav.extract()
@@ -169,7 +173,10 @@
# not http
None
else:
- linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+ # remove anchors and spaces in urls
+ link['href'] = link['href'].replace(" ","%20")
+ link['href'] = re.sub('#.*$','',link['href'])
+ linkurls.add(urljoin(url,link['href']))
for linkurl in linkurls:
#print linkurl
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
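
A self-contained sketch of the widened navigation-stripping regexes from this hunk, run against a small inline page with the BeautifulSoup 3 API used above.

import re
from BeautifulSoup import BeautifulSoup              # BeautifulSoup 3, as used by scrape.py

html = ('<div id="breadcrumb">Home &gt; Grants</div>'
        '<div class="sidebar">site links</div>'
        '<p>Grant of $1,000 to Example Org</p>')
soup = BeautifulSoup(html)
for nav in soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')):
    nav.extract()                                     # drop boilerplate blocks matched by id
for nav in soup.findAll(attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}):
    nav.extract()                                     # and those matched by class
print soup                                            # only the grant paragraph remains
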
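And a sketch of the href clean-up performed before recursing: escape spaces, strip fragments, then resolve against the page URL with urljoin; the base URL and hrefs are invented examples.

import re
from urlparse import urljoin

base = "http://example.gov.au/grants/index.html"      # invented page URL
hrefs = ["annual report 2011.pdf#page=3", "/funding/list.html#top", "detail.html"]
linkurls = set([])
for href in hrefs:
    href = href.replace(" ", "%20")                   # escape spaces
    href = re.sub('#.*$', '', href)                   # drop fragments so one page is not queued twice
    linkurls.add(urljoin(base, href))
print linkurls
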