# gitphp 0.2.9.1 :: disclosr.git/blobdiff

#http://packages.python.org/CouchDB/client.html

import couchdb

import urllib2

from BeautifulSoup import BeautifulSoup

import re

import hashlib

from urlparse import urljoin

import time

import os

import mimetypes

import re

import urllib

import urlparse

def canonurl(url):

r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''

if the URL looks invalid.

>>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws

'http://xn--hgi.ws/'

"""

# strip spaces at the ends and ensure it's prefixed with 'scheme://'

url = url.strip()

if not url:

return ''

if not urlparse.urlsplit(url).scheme:

url = 'http://' + url

# turn it into Unicode

#try:

# url = unicode(url, 'utf-8')

#except UnicodeDecodeError:

# return '' # bad UTF-8 chars in URL

# parse the URL into its components

parsed = urlparse.urlsplit(url)

scheme, netloc, path, query, fragment = parsed

# ensure scheme is a letter followed by letters, digits, and '+-.' chars

if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):

return ''

scheme = str(scheme)

# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]

match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)

if not match:

return ''

domain, port = match.groups()

netloc = domain + (port if port else '')

netloc = netloc.encode('idna')

# ensure path is valid and convert Unicode chars to %-encoded

if not path:

path = '/' # eg: 'http://google.com' -> 'http://google.com/'

path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

# ensure query is valid

query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

# ensure fragment is valid

fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

# piece it all back together, truncating it to a maximum of 4KB

url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

return url[:4096]

#http://diveintopython.org/http_web_services/etags.html

class NotModifiedHandler(urllib2.BaseHandler):
    """urllib2 handler that turns an HTTP 304 reply into a response object.

    The returned object carries the status code in ``.code`` so that the
    caller can check for 304 after ``opener.open()``.
    """

    def http_error_304(self, req, fp, code, message, headers):
        """Wrap the 304 reply for ``req`` and tag it with its status code."""
        response = urllib2.addinfourl(fp, headers, req.get_full_url())
        response.code = code
        return response

def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):

url = canonurl(url)

hash = hashlib.md5(url).hexdigest().encode("utf-8")

req = urllib2.Request(url)

print "Fetching %s" % url

if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":

print "Not a valid HTTP url"

return (None,None,None)

doc = docsdb.get(hash)

if doc == None:

doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}

else:

if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 999999):

print "Uh oh, trying to scrape URL again too soon!"

last_attachment_fname = doc["_attachments"].keys()[-1]

last_attachment = docsdb.get_attachment(doc,last_attachment_fname)

return (doc['url'],doc['mime_type'],last_attachment.read())

if scrape_again == False:

print "Not scraping this URL again as requested"

return (None,None,None)

time.sleep(3) # wait 3 seconds to give webserver time to recover

req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")

#if there is a previous version stored in couchdb, load caching helper tags

if doc.has_key('etag'):

req.add_header("If-None-Match", doc['etag'])

if doc.has_key('last_modified'):

req.add_header("If-Modified-Since", doc['last_modified'])

opener = urllib2.build_opener(NotModifiedHandler())

try:

url_handle = opener.open(req)

doc['url'] = url_handle.geturl() # may have followed a redirect to a new url

headers = url_handle.info() # the addinfourls have the .info() too

doc['etag'] = headers.getheader("ETag")

doc['last_modified'] = headers.getheader("Last-Modified")

doc['date'] = headers.getheader("Date")

doc['page_scraped'] = time.time()

doc['web_server'] = headers.getheader("Server")

doc['via'] = headers.getheader("Via")

doc['powered_by'] = headers.getheader("X-Powered-By")

doc['file_size'] = headers.getheader("Content-Length")

content_type = headers.getheader("Content-Type")

if content_type != None:

doc['mime_type'] = content_type.split(";")[0]

else:

(type,encoding) = mimetypes.guess_type(url)

doc['mime_type'] = type

if hasattr(url_handle, 'code'):

if url_handle.code == 304:

print "the web page has not been modified"

return (None,None,None)

else:

content = url_handle.read()

docsdb.save(doc)

doc = docsdb.get(hash) # need to get a _rev

docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])

return (doc['url'], doc['mime_type'], content)

#store as attachment epoch-filename

except urllib2.URLError as e:

error = ""

if hasattr(e, 'reason'):

error = "error %s in downloading %s" % (str(e.reason), url)

elif hasattr(e, 'code'):

error = "error %s in downloading %s" % (e.code, url)

print error

doc['error'] = error

docsdb.save(doc)

return (None,None,None)

def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):

(url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)

badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]

if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":

if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":

# http://www.crummy.com/software/BeautifulSoup/documentation.html

soup = BeautifulSoup(content)

for nav in navIDs:

print "Removing element", nav['id']

nav.extract()

navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})

for nav in navClasses:

print "Removing element", nav['class']

nav.extract()

links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))

linkurls = set([])

for link in links:

if link.has_key("href"):

if link['href'].startswith("http"):

# lets not do external links for now

# linkurls.add(link['href'])

None

if link['href'].startswith("mailto"):

# not http

None

if link['href'].startswith("javascript"):

# not http

None

else:

# remove anchors and spaces in urls

link['href'] = link['href'].replace(" ","%20")

link['href'] = re.sub('#.*$','',link['href'])

linkurls.add(urljoin(url,link['href']))

for linkurl in linkurls:

#print linkurl

scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)

couch = couchdb.Server('http://127.0.0.1:5984/')

# select database

agencydb = couch['disclosr-agencies']

docsdb = couch['disclosr-documents']

if __name__ == "__main__":

for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?

agency = agencydb.get(row.id)

print agency['name']

for key in agency.keys():

if key == 'website':

scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])

if key.endswith('URL'):

print key

depth = 1

if 'scrapeDepth' in agency.keys():

depth = agency['scrapeDepth']

scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])

agency['metadata']['lastScraped'] = time.time()

agencydb.save(agency)