import couchdb

import urllib
import urlparse
import httplib2

import httplib
import csv
import ssl

# CouchDB server holding the scraped dataset documents.
couch = couchdb.Server('http://127.0.0.1:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/')

# Probed resource URLs frequently have broken/self-signed certificates;
# accept any certificate rather than failing the link check.
context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
context.verify_mode = ssl.CERT_NONE
|
|
def url_fix(s, charset='utf-8'):
    """Sanitise a user-supplied URL so it can be fetched.

    Sometimes you get an URL from a user that just isn't a real URL
    because it contains unsafe characters like ' ' and so on.  This
    function fixes some of those problems in a similar way browsers
    handle data entered by the user: it prepends ``http://`` when no
    scheme is present and percent-encodes unsafe characters in the
    path and query string.

    :param s: the URL to fix (byte string or unicode/text string).
    :param charset: the target charset for the URL if the url was
                    given as a unicode string (Python 2 only).
    :return: the cleaned-up URL string.
    """
    # Guarded local imports keep this helper working under both the
    # module's Python 2 top-level imports and Python 3.
    try:
        from urllib.parse import urlparse, urlsplit, urlunsplit, quote, quote_plus
    except ImportError:  # Python 2
        from urlparse import urlparse, urlsplit, urlunsplit
        from urllib import quote, quote_plus
    try:
        if isinstance(s, unicode):
            # Python 2: quote()/urlsplit() want a byte string.
            s = s.encode(charset, 'ignore')
    except NameError:
        # Python 3: str is already text; nothing to encode.
        pass
    if not urlparse(s).scheme:
        s = "http://" + s
    scheme, netloc, path, qs, anchor = urlsplit(s)
    path = quote(path, '/%')
    qs = quote_plus(qs, ':&=')
    return urlunsplit((scheme, netloc, path, qs, anchor))
|
|
# Unit-suffix tables for bytes-to-human conversion, adapted from:
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
SYMBOLS = {
    'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                      'zetta', 'iotta'),
    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                'zebi', 'yobi'),
}

# Source database of scraped dataset documents, and the CSV report
# that the link-check results are written to.
docsdb = couch['disclosr-documents']
out = csv.writer(open("output.csv", "w"), delimiter=',', quoting=csv.QUOTE_ALL)
if __name__ == "__main__":
    # Walk every dataset document and link-check each downloadable
    # resource, writing one CSV row per resource:
    #   [package, url, name, format, status, content-type]
    for doc in docsdb.view('app/datasets'):
        if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld":
            # Collect the package metadata.  Derive a slug-style package
            # name from the dataset URL (Python 2: filter() on a str
            # returns a str).
            pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_',
                              doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100])
            if 'Download' in doc.value['metadata'].keys() and len(doc.value['metadata']['Download']) > 0:
                for resource in doc.value['metadata']['Download']:
                    # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
                    # Known format labels: (KML/KMZ) / (Shapefile) / (Other)
                    format = {'(XML)': 'xml',
                              '(CSV/XLS)': 'csv',
                              '(Shapefile)': 'shp',
                              '(KML/KMZ)': 'kml'}.get(resource['format'], "plain")
                    name = resource['href']
                    if 'name' in resource.keys():
                        name = resource['name']
                    if resource['href'].startswith("ftp"):
                        # FTP links are recorded but not probed.
                        # NOTE(review): this row is not ascii-encoded like the
                        # HTTP rows below — confirm whether that is intended.
                        out.writerow([pkg_name, url_fix(resource['href']), name, format, "ftp", ""])
                    else:
                        try:
                            h = httplib2.Http(disable_ssl_certificate_validation=True)
                            # HEAD request: we only want status + content-type.
                            resp = h.request(url_fix(resource['href']), 'HEAD')
                            content_type = resp[0]['content-type'] if 'content-type' in resp[0].keys() else ""
                            out.writerow([pkg_name.encode('ascii', 'ignore'),
                                          url_fix(resource['href']).encode('ascii', 'ignore'),
                                          name.encode('ascii', 'ignore'),
                                          format, resp[0]['status'], content_type])
                        except (httplib2.ServerNotFoundError,
                                httplib.InvalidURL,
                                httplib2.RelativeURIError):
                            # Unresolvable or malformed URL: record it as a
                            # bad link rather than aborting the whole run.
                            out.writerow([pkg_name.encode('ascii', 'ignore'),
                                          url_fix(resource['href']).encode('ascii', 'ignore'),
                                          name.encode('ascii', 'ignore'),
                                          format, "500", "badurl"])
            else:
                # Dataset with no downloadable resources: record name only.
                out.writerow([pkg_name.encode('ascii', 'ignore')])
|
|