# Provenance (commit that introduced this script):
#   From: Alex Sadleir
#   Date: Fri, 26 Apr 2013 12:41:45 +0000
#   Subject: tool to check datagov resources
#   X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=f8e2c3836140f6e9b4f226b93c8b7a5a8cda035f
#   Former-commit-id: f406384c3ba09ba04f639abb5731511ddf02b88b
#   File: documents/datagov-resourcereport.py
"""Report on the download resources of data.gov.au datasets held in CouchDB.

Every row of the ``app/datasets`` view in the ``disclosr-documents`` database
is inspected.  For each resource listed under the dataset's ``Download``
metadata a row is written to ``output.csv``:

    package name, fixed URL, resource name, format code, status, content type

HTTP(S) resources are probed with a HEAD request; FTP resources are recorded
with the status ``ftp`` without being contacted.  Datasets without any
``Download`` entries produce a row holding only the package name.
"""
import csv
import ssl  # noqa: F401 -- kept from the original; TLS checks are disabled via httplib2
import sys

try:  # Python 2 module layout
    import urlparse
    from urllib import quote, quote_plus
except ImportError:  # Python 3 module layout
    import urllib.parse as urlparse
    from urllib.parse import quote, quote_plus

PY2 = sys.version_info[0] == 2

# CouchDB server holding the scraped documents.
# (A development server at http://192.168.1.113:5984/ was used as an alternative.)
COUCHDB_URL = 'http://127.0.0.1:5984/'
OUTPUT_PATH = "output.csv"

# Characters allowed in a package name; everything else is dropped.
_PKG_NAME_CHARS = frozenset('0123456789abcdefghijklmnopqrstuvwxyz-_')

# Format label found in the scraped metadata -> short format code.
# http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
# Labels not listed here (for example "(Other)") are reported as "plain".
_FORMAT_CODES = {
    '(XML)': 'xml',
    '(CSV/XLS)': 'csv',
    '(Shapefile)': 'shp',
    '(KML/KMZ)': 'kml',
}


def url_fix(s, charset='utf-8'):
    """Return *s* as a usable URL, escaping unsafe characters.

    Sometimes a URL supplied by a user is not a real URL because it contains
    unsafe characters such as ' '.  This function repairs some of those
    problems in a similar way to how browsers handle user-entered addresses.

    :param s: The URL to fix.  ``http://`` is prepended when it has no scheme.
    :param charset: The target charset for the URL if the URL was given as
                    a unicode string.  Characters that cannot be encoded are
                    dropped.
    :return: The URL with its path and query string percent-encoded.  Existing
             ``%`` escapes are left untouched.
    """
    if PY2:
        if isinstance(s, unicode):  # noqa: F821 -- Python 2 only
            s = s.encode(charset, 'ignore')
        quote_options = {}
    else:
        if isinstance(s, bytes):
            s = s.decode(charset, 'ignore')
        # Python 3 quotes text directly; make it honour *charset* as well.
        quote_options = {'encoding': charset, 'errors': 'ignore'}

    if not urlparse.urlsplit(s).scheme:
        s = "http://" + s
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = quote(path, '/%', **quote_options)
    qs = quote_plus(qs, ':&=', **quote_options)
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))


# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
# Unit tables copied from the recipe above; nothing in this script reads them.
SYMBOLS = {
    'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                      'zetta', 'iotta'),
    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                'zebi', 'yobi'),
}


def _package_name(dataset_url):
    """Derive the package name from a data.gov.au dataset URL."""
    slug = dataset_url.replace("http://data.gov.au/dataset/", '').replace('/', '')
    # Truncate first, then filter, exactly as the original expression did.
    return ''.join(ch for ch in slug[:100] if ch in _PKG_NAME_CHARS)


def _probe(url):
    """Send a HEAD request to *url* and return ``(status, content_type)``."""
    # Imported here so the URL helpers stay importable without httplib2.
    import httplib2

    try:
        # Certificate validation is deliberately off: the report only records
        # whether the resource answers, not whether its certificate is valid.
        client = httplib2.Http(disable_ssl_certificate_validation=True)
        headers = client.request(url, 'HEAD')[0]
    except httplib2.ServerNotFoundError:
        return "500", "badurl"
    return headers['status'], headers.get('content-type', "")


def _resource_row(pkg_name, resource):
    """Build the CSV row describing one download resource of a package."""
    href = resource['href']
    url = url_fix(href)
    name = resource.get('name', href)
    fmt = _FORMAT_CODES.get(resource['format'], "plain")
    if href.startswith("ftp"):
        # FTP links are listed but not contacted.
        status, content_type = "ftp", ""
    else:
        status, content_type = _probe(url)
    return [pkg_name, url, name, fmt, status, content_type]


def _csv_cell(value):
    """Encode unicode text as UTF-8 on Python 2, where csv only takes bytes."""
    if PY2 and isinstance(value, unicode):  # noqa: F821 -- Python 2 only
        return value.encode('utf-8')
    return value


def _open_report(path):
    """Open *path* for csv writing in the mode the csv module expects."""
    if PY2:
        return open(path, "wb")
    return open(path, "w", newline="", encoding="utf-8")


def main():
    """Write the resource report for every dataset in the CouchDB view."""
    # Imported here so the URL helpers stay importable without couchdb.
    import couchdb

    docsdb = couchdb.Server(COUCHDB_URL)['disclosr-documents']
    with _open_report(OUTPUT_PATH) as report:
        out = csv.writer(report, delimiter=',', quoting=csv.QUOTE_ALL)
        for doc in docsdb.view('app/datasets'):
            dataset = doc.value
            if dataset['url'] == "http://data.gov.au/data/" or dataset['agencyID'] == "qld":
                continue
            # Collect the package metadata.
            pkg_name = _package_name(dataset['url'])
            resources = dataset['metadata'].get('Download')
            if not resources:
                # NOTE(review): the original indentation was lost; this branch
                # is assumed to pair with the 'Download' check -- confirm.
                out.writerow([pkg_name])
                continue
            for resource in resources:
                row = _resource_row(pkg_name, resource)
                out.writerow([_csv_cell(cell) for cell in row])


if __name__ == "__main__":
    main()