#http://packages.python.org/CouchDB/client.html
import hashlib
import os
import re
import time
import urllib2
from urlparse import urljoin

import couchdb
from BeautifulSoup import BeautifulSoup
|
|
#http://diveintopython.org/http_web_services/etags.html
|
class NotModifiedHandler(urllib2.BaseHandler):
    """urllib2 handler that turns an HTTP 304 Not Modified error into a
    normal response object, so callers can inspect ``response.code``
    instead of catching an exception."""

    def http_error_304(self, req, fp, code, message, headers):
        # Re-wrap the raw file object as a response and tag it with the
        # 304 status before handing it back to the caller.
        response = urllib2.addinfourl(fp, headers, req.get_full_url())
        response.code = code
        return response
|
|
|
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True): |
|
hash = hashlib.md5(url).hexdigest() |
|
req = urllib2.Request(url) |
|
print "Fetching %s" % url |
|
doc = docsdb.get(hash) |
|
if doc == None: |
|
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} |
|
else: |
|
if (time.time() - doc['page_scraped']) < 3600: |
|
print "Uh oh, trying to scrape URL again too soon!" |
|
last_attachment_fname = doc["_attachments"].keys()[-1] |
|
last_attachment = docsdb.get_attachment(doc,last_attachment_fname) |
|
return (doc['mime_type'],last_attachment) |
|
if scrape_again == False: |
|
print "Not scraping this URL again as requested" |
|
return (None,None) |
|
|
|
time.sleep(3) # wait 3 seconds to give webserver time to recover |
|
|
|
#if there is a previous version stored in couchdb, load caching helper tags |
|
if doc.has_key('etag'): |
|
req.add_header("If-None-Match", doc['etag']) |
|
if doc.has_key('last_modified'): |
|
req.add_header("If-Modified-Since", doc['last_modified']) |
|
|
|
opener = urllib2.build_opener(NotModifiedHandler()) |
|
url_handle = opener.open(req) |
|
headers = url_handle.info() # the addinfourls have the .info() too |
|
doc['etag'] = headers.getheader("ETag") |
|
doc['last_modified'] = headers.getheader("Last-Modified") |
|
doc['date'] = headers.getheader("Date") |
|
doc['page_scraped'] = time.time() |
|
doc['web_server'] = headers.getheader("Server") |
|
doc['powered_by'] = headers.getheader("X-Powered-By") |
|
doc['file_size'] = headers.getheader("Content-Length") |
|
doc['mime_type'] = headers.getheader("Content-Type").split(";")[0] |
|
if hasattr(url_handle, 'code'): |
|
if url_handle.code == 304: |
|
print "the web page has not been modified" |
|
return (None,None) |
|
else: |
|
content = url_handle.read() |
|
docsdb.save(doc) |
|
doc = docsdb.get(hash) # need to get a _rev |
|
docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) |
|
return (doc['mime_type'], content) |
|
#store as attachment epoch-filename |
|
else: |
|
print "error %s in downloading %s" % url_handle.code, URL |
|
doc['error'] = "error %s in downloading %s" % url_handle.code, URL |
|
docsdb.save(doc) |
|
return (None,None) |
|
|
|
|
|
|
|
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): |
|
(mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) |
|
if content != None and depth > 0: |
|
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": |
|
# http://www.crummy.com/software/BeautifulSoup/documentation.html |
|
soup = BeautifulSoup(content) |
|
navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar')) |
|
for nav in navIDs: |
|
print "Removing element", nav['id'] |
|
nav.extract() |
|
navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')}) |
|
for nav in navClasses: |
|
print "Removing element", nav['class'] |
|
nav.extract() |
|
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) |
|
linkurls = set([]) |
|
for link in links: |
|
if link.has_key("href"): |
|
if link['href'].startswith("http"): |
|
# lets not do external links for now |
|
# linkurls.add(link['href']) |
|
None |
|
else: |
|
linkurls.add(urljoin(url,link['href'].replace(" ","%20"))) |
|
for linkurl in linkurls: |
|
#print linkurl |
|
scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) |
|
|
|
couch = couchdb.Server('http://127.0.0.1:5984/') |
|
|
|
# select database |
|
agencydb = couch['disclosr-agencies'] |
|
docsdb = couch['disclosr-documents'] |
|
|
|
for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? |
|
agency = agencydb.get(row.id) |
|
print agency['name'] |
|
for key in agency.keys(): |
|
if key == 'website' or key.endswith('URL'): |
|
print key |
|
scrapeAndStore(docsdb, agency[key],agency['scrapeDepth'],key,agency['_id']) |
|
agency['metadata']['lastscraped'] = time.time() |
|
agencydb.save(agency) |
|
|