#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import hashlib
|
from urlparse import urljoin
import time
import os
|
|
#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    # urllib2 treats a 304 Not Modified response as an error by default;
    # this handler turns it back into a normal response object so the
    # caller can detect the 304 and skip re-downloading.
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl
|
|
def fetchURL(docsdb, url, agencyID):
    hash = hashlib.md5(url).hexdigest()
    req = urllib2.Request(url)
    print "Fetching %s" % url
    doc = docsdb.get(hash)
    if doc is None:
        doc = {'_id': hash, 'agencyID': agencyID}
    #if there is a previous version stored in couchdb, load caching helper tags
    if doc.has_key('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.has_key('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])
|
|
    opener = urllib2.build_opener(NotModifiedHandler())
    url_handle = opener.open(req)
    headers = url_handle.info()  # the addinfourls have the .info() too
    doc['etag'] = headers.getheader("ETag")
    doc['last_modified'] = headers.getheader("Last-Modified")
    doc['web_server'] = headers.getheader("Server")
    doc['powered_by'] = headers.getheader("X-Powered-By")
    doc['file_size'] = headers.getheader("Content-Length")
    #strip any charset suffix: "text/html; charset=utf-8" -> "text/html"
    content_type = headers.getheader("Content-Type")
    doc['mime_type'] = content_type.split(";")[0] if content_type else None
|
|
    if hasattr(url_handle, 'code') and url_handle.code == 304:
        print "the web page has not been modified"
        return (None, None)
    #TODO: record/alert download errors to an error database
    # (urllib2 raises HTTPError for 4xx/5xx responses, so they never reach here)
    content = url_handle.read()
    docsdb.save(doc)
    doc = docsdb.get(hash)  # refetch to pick up the _rev needed for put_attachment
    #store as attachment epoch-filename
    docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
    return (doc['mime_type'], content)
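
# A minimal sketch, assuming attachments keep the epoch-filename naming used in
# fetchURL above, of the "diff with last stored attachment" step mentioned in
# the scrapeAndStore comments below; the helper name diffWithLastAttachment is
# new here, not part of the original script.
import difflib

def diffWithLastAttachment(docsdb, doc, content):
    attachments = doc.get('_attachments', {})
    if not attachments:
        return None
    # pick the attachment with the largest epoch prefix, i.e. the newest snapshot
    latest = max(attachments.keys(), key=lambda name: float(name.split("-")[0]))
    old = docsdb.get_attachment(doc, latest).read()
    return "\n".join(difflib.unified_diff(old.splitlines(), content.splitlines(), lineterm=""))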
|
def scrapeAndStore(docsdb, url, depth, agencyID):
    (mime_type, content) = fetchURL(docsdb, url, agencyID)
    if content != None:
        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(content)
            #strip navigation elements so only document links remain
            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
            for nav in navIDs:
                print "Removing element", nav['id']
                nav.extract()
            navClasses = soup.findAll(attrs={'class': re.compile('nav|menu|bar')})
            for nav in navClasses:
                print "Removing element", nav['class']
                nav.extract()
            links = soup.findAll('a')  # soup.findAll('a', id=re.compile("^p-"))
            for link in links:
                if link.has_key("href"):
                    if link['href'].startswith("http"):
                        linkurl = link['href']
                    else:
                        linkurl = urljoin(url, link['href'])
                    print linkurl
                    #for each unique link
                    #if html mimetype
                    # go down X levels (see the recursion sketch after this function),
                    # diff with last stored attachment, store in document
                    #if not
                    # remember to save parentURL and title (link text that lead to document)
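
# A minimal sketch of the "go down X levels" recursion the comments above
# describe; the scrapeRecursively name and the module-level 'visited' set are
# assumptions added here, not part of the original script.
visited = set()

def scrapeRecursively(docsdb, url, depth, agencyID):
    if depth <= 0 or url in visited:
        return
    visited.add(url)
    (mime_type, content) = fetchURL(docsdb, url, agencyID)
    if content != None and mime_type in ("text/html", "application/xhtml+xml", "application/xml"):
        for link in BeautifulSoup(content).findAll('a'):
            if link.has_key("href"):
                # urljoin leaves absolute URLs untouched and resolves relative ones
                scrapeRecursively(docsdb, urljoin(url, link['href']), depth - 1, agencyID)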
|
couch = couchdb.Server('http://127.0.0.1:5984/')

# select databases
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']
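
# A minimal sketch, assuming the 'app/getScrapeRequired' view does not exist
# yet, of the design document the loop below queries; the map function body is
# illustrative (emit every agency with a recorded website), not the project's
# actual view.
if '_design/app' not in agencydb:
    agencydb['_design/app'] = {
        'language': 'javascript',
        'views': {
            'getScrapeRequired': {
                'map': "function(doc) { if (doc.website) emit(doc._id, null); }"
            }
        }
    }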
|
|
for row in agencydb.view('app/getScrapeRequired'):  #not recently scraped agencies view?
    agency = agencydb.get(row.id)
    print agency['name']
    scrapeAndStore(docsdb, agency['website'], 1, agency['_id'])
|
|