load datasets into scrapr then into ckan filestore
[disclosr.git] / documents / datagov-resourcereport.py
import couchdb
import urllib
import urlparse
import httplib2
import httplib
import csv

couch = couchdb.Server('http://127.0.0.1:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/')
   
def url_fix(s, charset='utf-8'):
    """Sometimes you get a URL from a user that just isn't a real
    URL because it contains unsafe characters like ' ' and so on. This
    function fixes some of those problems in a similar way to how browsers
    handle data entered by the user:

    :param charset: The target charset for the URL if the url was
                    given as a unicode string.
    """
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    # Default to http:// when no scheme was supplied at all.
    if not urlparse.urlparse(s).scheme:
        s = "http://" + s
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = urllib.quote(path, '/%')
    qs = urllib.quote_plus(qs, ':&=')
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
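
# Illustrative example (not part of the original script): url_fix() supplies
# a missing scheme and percent-encodes unsafe characters, e.g.
#   url_fix('data.gov.au/dataset/some file.csv')
#   => 'http://data.gov.au/dataset/some%20file.csv'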
   
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
SYMBOLS = {
    'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                      'zetta', 'iotta'),
    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                'zebi', 'yobi'),
}
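
# Only the SYMBOLS table was copied from the recipe above; a minimal sketch
# of its companion bytes2human() helper (assumed here, not in the original
# file) shows how the table is consumed:
def bytes2human(n, symbols='customary'):
    """Convert a byte count to a human-readable string, e.g. 10240 -> '10.0 KB'."""
    prefix = {}
    # Each symbol past the base unit stands for another factor of 1024.
    for i, s in enumerate(SYMBOLS[symbols][1:]):
        prefix[s] = 1 << (i + 1) * 10
    for symbol in reversed(SYMBOLS[symbols][1:]):
        if n >= prefix[symbol]:
            return '%.1f %s' % (float(n) / prefix[symbol], symbol)
    return '%s %s' % (n, SYMBOLS[symbols][0])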
   
   
docsdb = couch['disclosr-documents']
out = csv.writer(open("output.csv", "w"), delimiter=',', quoting=csv.QUOTE_ALL)
if __name__ == "__main__":
    for doc in docsdb.view('app/datasets'):
        # Skip the catalogue root and Queensland datasets.
        if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld":
            # Collect the package metadata; derive a slug-safe package name
            # from the dataset URL.
            pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_',
                              doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100])
            if 'Download' in doc.value['metadata'].keys() and len(doc.value['metadata']['Download']) > 0:
                for resource in doc.value['metadata']['Download']:
                    # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
                    # (KML/KMZ) / (Shapefile) / (Other)
                    format = "plain"
                    if resource['format'] == '(XML)':
                        format = 'xml'
                    elif resource['format'] == '(CSV/XLS)':
                        format = 'csv'
                    elif resource['format'] == '(Shapefile)':
                        format = 'shp'
                    elif resource['format'] == '(KML/KMZ)':
                        format = 'kml'
                    name = resource['href']
                    if 'name' in resource.keys():
                        name = resource['name']
                    if resource['href'].startswith("ftp"):
                        # FTP resources can't be HEAD-checked with httplib2.
                        out.writerow([pkg_name, url_fix(resource['href']), name, format, "ftp", ""])
                    else:
                        try:
                            # HEAD the resource to record its status and content
                            # type; h.request() returns a (response, content) tuple.
                            h = httplib2.Http(disable_ssl_certificate_validation=True)
                            resp = h.request(url_fix(resource['href']), 'HEAD')
                            content_type = resp[0]['content-type'] if 'content-type' in resp[0].keys() else ""
                            out.writerow([pkg_name.encode('ascii', 'ignore'),
                                          url_fix(resource['href']).encode('ascii', 'ignore'),
                                          name.encode('ascii', 'ignore'),
                                          format, resp[0]['status'], content_type])
                        except (httplib2.ServerNotFoundError, httplib.InvalidURL,
                                httplib2.RelativeURIError):
                            # Unresolvable, malformed and relative URLs are all
                            # flagged the same way in the report.
                            out.writerow([pkg_name.encode('ascii', 'ignore'),
                                          url_fix(resource['href']).encode('ascii', 'ignore'),
                                          name.encode('ascii', 'ignore'),
                                          format, "500", "badurl"])
            else:
                # Dataset has no downloadable resources; record the name alone.
                out.writerow([pkg_name.encode('ascii', 'ignore')])
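
# Illustrative only: the report can be read back with the same csv module,
# assuming the six-column layout written above
# (pkg_name, url, name, format, status, content_type):
#
#   import csv
#   with open("output.csv") as f:
#       for row in csv.reader(f):
#           if len(row) == 6 and row[4] not in ("200", "ftp"):
#               print "broken resource:", row[1]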