Store scraper results to couchdb


Former-commit-id: 234bb19e5682c98cb4cbd9c6d6b1bf542ff16d50

file:a/scrape.py -> file:b/scrape.py
#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import hashlib
from urlparse import urljoin
import time
import os
   
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl
   
def fetchURL(docsdb, url, agencyID):
    hash = hashlib.md5(url).hexdigest()
    req = urllib2.Request(url)
    print "Fetching %s" % url
    doc = docsdb.get(hash)
    if doc == None:
        doc = {'_id': hash, 'agencyID': agencyID}
    # if there is a previous version stored in couchdb, load caching helper tags
    if doc.has_key('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.has_key('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])
    opener = urllib2.build_opener(NotModifiedHandler())
    url_handle = opener.open(req)
    headers = url_handle.info()  # the addinfourls have the .info() too
    doc['etag'] = headers.getheader("ETag")
    doc['last_modified'] = headers.getheader("Last-Modified")
    doc['web_server'] = headers.getheader("Server")
    doc['powered_by'] = headers.getheader("X-Powered-By")
    doc['file_size'] = headers.getheader("Content-Length")
    doc['mime_type'] = headers.getheader("Content-Type").split(";")[0]
   
    if hasattr(url_handle, 'code'):
        if url_handle.code == 304:
            print "the web page has not been modified"
            return (None, None)
        else:
            content = url_handle.read()
            docsdb.save(doc)
            doc = docsdb.get(hash)  # need to get a _rev
            # store the page body as an attachment named epoch-filename
            docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
            return (doc['mime_type'], content)
    else:
        print "error in downloading %s" % url
        # record/alert error to error database
        return (None, None)
   
   
   
   
def scrapeAndStore(docsdb, url, depth, agencyID):
    (mime_type, content) = fetchURL(docsdb, url, agencyID)
    if content != None:
        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(content)
            # strip navigation elements so only document links are followed
            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
            for nav in navIDs:
                print "Removing element", nav['id']
                nav.extract()
            navClasses = soup.findAll(attrs={'class': re.compile('nav|menu|bar')})
            for nav in navClasses:
                print "Removing element", nav['class']
                nav.extract()
            links = soup.findAll('a')  # soup.findAll('a', id=re.compile("^p-"))
            for link in links:
                if link.has_key("href"):
                    if link['href'].startswith("http"):
                        linkurl = link['href']
                    else:
                        linkurl = urljoin(url, link['href'])
                    print linkurl
                    # for each unique link:
                    #   if html mimetype
                    #     go down X levels,
                    #     diff with last stored attachment, store in document
                    #   if not
                    #     remember to save parentURL and title (link text that led to document)
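                    # Illustrative only, not part of this commit: with the existing
                    # depth parameter, the planned recursion could look roughly like
                    #   if depth > 0:
                    #       scrapeAndStore(docsdb, linkurl, depth - 1, agencyID)
                    # (a visited-URL set would also be needed to keep links unique).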
   
   
   
couch = couchdb.Server('http://127.0.0.1:5984/')
   
# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']

for row in agencydb.view('app/getScrapeRequired'):  # not recently scraped agencies view?
    agency = agencydb.get(row.id)
    print agency['name']
    scrapeAndStore(docsdb, agency['website'], 1, agency['_id'])
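
For reference, a minimal sketch (not part of this commit) of how the stored results could be read back with couchdb-python; the URL below is a hypothetical example:

import couchdb
import hashlib

couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']

url = 'http://www.example.gov.au/'  # hypothetical page fetched earlier by fetchURL
doc = docsdb.get(hashlib.md5(url).hexdigest())
if doc != None:
    print doc.get('mime_type'), doc.get('last_modified')
    # attachments are named epoch-filename, so sorting gives the newest capture last
    names = sorted(doc.get('_attachments', {}).keys())
    if names:
        content = docsdb.get_attachment(doc, names[-1]).read()
        print "%d bytes stored" % len(content)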