#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import hashlib
from urlparse import urljoin
import time
import os
import sys
import mimetypes
import urllib
import urlparse
import socket

#couch = couchdb.Server('http://192.168.1.148:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/')
couch = couchdb.Server('http://127.0.0.1:5984/')

def mkhash(input):
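    """Return the hex MD5 digest of the input string; used as the CouchDB document _id."""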
    return hashlib.md5(input).hexdigest().encode("utf-8")

def canonurl(url):
r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' |
r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' |
if the URL looks invalid. |
if the URL looks invalid. |
>>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws |
>>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws |
'http://xn--hgi.ws/' |
'http://xn--hgi.ws/' |
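    An additional (hypothetical URL) ASCII example; spaces are percent-encoded:
    >>> canonurl('http://example.com/a b')
    'http://example.com/a%20b'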
""" |
""" |
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url

    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return '' # bad UTF-8 chars in URL

    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed

    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)

    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    netloc = netloc.encode('idna')

    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]

def fullurl(url, href):
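    """Resolve href relative to the page url, dropping any #fragment and percent-encoding spaces."""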
    href = href.replace(" ", "%20")
    href = re.sub('#.*$', '', href)
    return urljoin(url, href)

#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
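    """urllib2 handler that returns an HTTP 304 response as a normal response object instead of raising an error."""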
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl

def getLastAttachment(docsdb, url):
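    """Return the last attachment stored for the document of this url (intended to be the
    most recent cached copy of the page), or None if the document has no attachments.
    """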
    hash = mkhash(url)
    doc = docsdb.get(hash)
    if doc != None and "_attachments" in doc.keys():
        last_attachment_fname = doc["_attachments"].keys()[-1]
        last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
        return last_attachment
    else:
        return None
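
# A minimal usage sketch (hypothetical URL): read back the cached body for a page
#   cached = getLastAttachment(docsdb, "http://www.example.gov.au/foi")
#   if cached is not None:
#       html = cached.read()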


def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
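    """Fetch url, honouring ETag/Last-Modified caching against the copy already stored in CouchDB.
    Records response metadata on the document in docsdb and returns (url, mime_type, content),
    or (None, None, None) if the url is invalid or the download fails.
    """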
    url = canonurl(url)
    hash = mkhash(url)
    print "Fetching %s (%s)" % (url, hash)
    if url == None or url == "" or url.startswith("mailto") or url.startswith("javascript") or url.startswith("#"):
        print >> sys.stderr, "Not a valid HTTP url"
        return (None, None, None)
    req = urllib2.Request(url)
    doc = docsdb.get(hash)
    if doc == None:
        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'}
    else:
        if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 60 * 24 * 14) or (scrape_again == False)):
            # scraped less than 14 days ago, or re-scraping disabled: serve the stored copy
            print "Uh oh, trying to scrape URL again too soon!" + hash
            if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys():
                last_attachment_fname = doc["_attachments"].keys()[-1]
                last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
                content = last_attachment.read()
                mime_type = doc['mime_type']
            else:
                content = None
                mime_type = None
            return (doc['url'], mime_type, content)

req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") |
req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") |
#if there is a previous version stored in couchdb, load caching helper tags |
#if there is a previous version stored in couchdb, load caching helper tags |
if doc.has_key('etag'): |
if doc.has_key('etag'): |
req.add_header("If-None-Match", doc['etag']) |
req.add_header("If-None-Match", doc['etag']) |
if doc.has_key('last_modified'): |
if doc.has_key('last_modified'): |
req.add_header("If-Modified-Since", doc['last_modified']) |
req.add_header("If-Modified-Since", doc['last_modified']) |
|
|
    opener = urllib2.build_opener(NotModifiedHandler())
    try:
        url_handle = opener.open(req, None, 20)
        doc['url'] = url_handle.geturl()  # may have followed a redirect to a new url
        headers = url_handle.info()  # the addinfourls have the .info() too
        doc['etag'] = headers.getheader("ETag")
        doc['last_modified'] = headers.getheader("Last-Modified")
        doc['date'] = headers.getheader("Date")
        doc['page_scraped'] = time.time()
        doc['web_server'] = headers.getheader("Server")
        doc['via'] = headers.getheader("Via")
        doc['powered_by'] = headers.getheader("X-Powered-By")
        doc['file_size'] = headers.getheader("Content-Length")
        content_type = headers.getheader("Content-Type")
        if content_type != None:
            doc['mime_type'] = content_type.split(";")[0]
        else:
            (type, encoding) = mimetypes.guess_type(url)
            doc['mime_type'] = type
        if hasattr(url_handle, 'code'):
            if url_handle.code == 304:
                print "the web page has not been modified" + hash
                last_attachment_fname = doc["_attachments"].keys()[-1]
                last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
                content = last_attachment
                return (doc['url'], doc['mime_type'], content.read())
            else:
                print "new webpage loaded"
                content = url_handle.read()
                docsdb.save(doc)
                doc = docsdb.get(hash)  # need to get a _rev
                # store as attachment epoch-filename
                docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
                return (doc['url'], doc['mime_type'], content)

    except (urllib2.URLError, socket.timeout) as e:
        print >> sys.stderr, "error!"
        error = ""
        if hasattr(e, 'reason'):
            error = "error %s in downloading %s" % (str(e.reason), url)
        elif hasattr(e, 'code'):
            error = "error %s in downloading %s" % (e.code, url)
        print >> sys.stderr, error
        doc['error'] = error
        docsdb.save(doc)
        return (None, None, None)
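
# A minimal usage sketch (hypothetical agency id): fetch one page, preferring the cached copy
#   (page_url, mime, body) = fetchURL(docsdb, "http://www.example.gov.au/", "website", "example-agency-id")
#   if body is not None:
#       print "got %s bytes of %s" % (len(body), mime)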


def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
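    """Fetch url, store it in docsdb and, for HTML/XML pages, recurse into its internal links
    until depth runs out. Navigation, header and footer elements are stripped before links
    are collected.
    """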
    (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID)
    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
    if content != None and depth > 0 and url not in badURLs:
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": |
# http://www.crummy.com/software/BeautifulSoup/documentation.html |
# http://www.crummy.com/software/BeautifulSoup/documentation.html |
soup = BeautifulSoup(content) |
soup = BeautifulSoup(content) |
navIDs = soup.findAll( |
navIDs = soup.findAll( |
id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) |
id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) |
for nav in navIDs: |
for nav in navIDs: |
print "Removing element", nav['id'] |
print "Removing element", nav['id'] |
nav.extract() |
nav.extract() |
navClasses = soup.findAll( |
navClasses = soup.findAll( |
attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) |
attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) |
for nav in navClasses: |
for nav in navClasses: |
print "Removing element", nav['class'] |
print "Removing element", nav['class'] |
nav.extract() |
nav.extract() |
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) |
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) |
linkurls = set([]) |
linkurls = set([]) |
for link in links: |
for link in links: |
if link.has_key("href"): |
if link.has_attr("href"): |
if link['href'].startswith("http"): |
if link['href'].startswith("http"): |
# lets not do external links for now |
# lets not do external links for now |
# linkurls.add(link['href']) |
# linkurls.add(link['href']) |
None |
None |
if link['href'].startswith("mailto"): |
if link['href'].startswith("mailto"): |
# not http |
# not http |
None |
None |
if link['href'].startswith("javascript"): |
if link['href'].startswith("javascript"): |
# not http |
# not http |
None |
None |
else: |
else: |
# remove anchors and spaces in urls |
# remove anchors and spaces in urls |
linkurls.add(fullurl(url, link['href'])) |
linkurls.add(fullurl(url, link['href'])) |
            for linkurl in linkurls:
                #print linkurl
                scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID)

# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']
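# The two databases above are assumed to already exist; a rough (untested) bootstrap
# sketch for a fresh CouchDB instance would be:
#   for name in ('disclosr-agencies', 'disclosr-documents'):
#       if name not in couch:
#           couch.create(name)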

if __name__ == "__main__":
    for row in agencydb.view('app/all'):  # not recently scraped agencies view?
        agency = agencydb.get(row.id)
        print agency['name']
        for key in agency.keys():
            if key == "FOIDocumentsURL" and "status" not in agency.keys() and False:
                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
            if key == 'website' and True:
                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
                if "metadata" not in agency.keys():
                    agency['metadata'] = {}
                agency['metadata']['lastScraped'] = time.time()
            if key.endswith('URL') and False:
                print key
                depth = 1
                if 'scrapeDepth' in agency.keys():
                    depth = agency['scrapeDepth']
                scrapeAndStore(docsdb, agency[key], depth, key, agency['_id'])
        agencydb.save(agency)