gitphp 0.2.9.1 :: disclosr.git/commitdiff

import couchdb

# Connection to the CouchDB server on the loopback address, port 5984.
couch = couchdb.Server('http://127.0.0.1:5984/')

# Alternate server address, currently disabled:
#couch = couchdb.Server('http://192.168.1.113:5984/')

import urllib

import urlparse

import httplib2

import csv

import ssl

# SSL context with certificate verification switched off.
# NOTE(review): `context` is not referenced anywhere in the visible part of
# this file -- confirm whether anything still uses it.
context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)

context.verify_mode = ssl.CERT_NONE

def url_fix(s, charset='utf-8'):
    """Return *s* as a percent-encoded URL, defaulting the scheme to http.

    Sometimes a URL supplied by a user is not a real URL because it
    contains unsafe characters such as ' '.  This function fixes some of
    those problems in a similar way to how browsers handle data entered
    by the user.

    Works on both Python 2 and Python 3; the version-specific URL helpers
    are imported locally so the function does not depend on module-level
    imports.

    :param s: the URL to clean up, as a text or byte string.
    :param charset: The target charset for the URL if the url was
                    given as unicode string.
    :return: the URL as a native ``str`` with surrounding whitespace
             removed, ``http://`` prepended when no scheme was present,
             the path quoted (``/`` and ``%`` left intact) and the query
             string plus-quoted (``:``, ``&`` and ``=`` left intact).
    """
    if str is bytes:
        # Python 2: the native str is a byte string, so text is encoded
        # up front and the quoting helpers take no encoding arguments.
        from urllib import quote, quote_plus
        from urlparse import urlsplit, urlunsplit
        if isinstance(s, unicode):  # noqa: F821 -- Python 2 builtin
            s = s.encode(charset, 'ignore')
        quote_options = {}
    else:
        # Python 3: work on text and let the quoting helpers apply the
        # charset, dropping characters it cannot represent.
        from urllib.parse import quote, quote_plus, urlsplit, urlunsplit
        if isinstance(s, bytes):
            s = s.decode(charset, 'ignore')
        quote_options = {'encoding': charset, 'errors': 'ignore'}

    # Surrounding whitespace would otherwise hide an existing scheme and
    # produce results such as "http:// http://host/".
    s = s.strip()
    if not urlsplit(s).scheme:
        s = "http://" + s

    scheme, netloc, path, qs, anchor = urlsplit(s)
    # '%' stays safe so already-encoded sequences are not encoded twice.
    path = quote(path, '/%', **quote_options)
    qs = quote_plus(qs, ':&=', **quote_options)
    return urlunsplit((scheme, netloc, path, qs, anchor))

# Unit-name tables from the bytes-to-human / human-to-bytes recipe:
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
# Each table holds nine entries, listed from the byte entry upwards.
SYMBOLS = {
    # Short customary symbols.
    'customary': (
        'B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y',
    ),
    # Spelled-out customary prefixes.
    'customary_ext': (
        'byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta',
        'iotta',
    ),
    # Short IEC (binary) symbols.
    'iec': (
        'Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi',
    ),
    # Spelled-out IEC (binary) prefixes.
    'iec_ext': (
        'byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi',
        'yobi',
    ),
}

# Handle to the 'disclosr-documents' database on the CouchDB server.
docsdb = couch['disclosr-documents']

# CSV writer for the report: comma-delimited, every field quoted.
# Opening with "w" truncates output.csv as soon as this module is loaded.
# NOTE(review): the underlying file object is never closed explicitly in
# the visible code -- confirm that relying on interpreter exit to flush it
# is intended.
out = csv.writer(open("output.csv","w"), delimiter=',',quoting=csv.QUOTE_ALL)

# Characters kept when deriving a package name from a dataset URL.
_PKG_NAME_CHARS = '0123456789abcdefghijklmnopqrstuvwxyz-_'

# http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
# Format labels seen on resources include (KML/KMZ) / (Shapefile) / (Other);
# labels without an entry here are reported as "plain".
_FORMAT_CODES = {
    '(XML)': 'xml',
    '(CSV/XLS)': 'csv',
    '(Shapefile)': 'shp',
    '(KML/KMZ)': 'kml',
}


def _package_name(dataset_url):
    """Derive a package name from a dataset URL.

    Strips the ``http://data.gov.au/dataset/`` prefix and every ``/``,
    truncates to 100 characters, then keeps only the characters listed in
    ``_PKG_NAME_CHARS``.  Always returns a string (a bare ``filter()``
    would return an iterator on Python 3).
    """
    slug = dataset_url.replace("http://data.gov.au/dataset/", '')
    slug = slug.replace('/', '')[:100]
    return ''.join(ch for ch in slug if ch in _PKG_NAME_CHARS)


def _resource_format(label):
    """Map a resource's format label to the code written to the report."""
    return _FORMAT_CODES.get(label, "plain")


if __name__ == "__main__":
    # Report layout, one CSV row per resource:
    #   package name, fixed URL, resource name, format code,
    #   status ("ftp", the HTTP status, or "500"), content type.
    # A package without any downloads gets a row holding only its name.

    # One HTTP client for the whole run instead of one per resource;
    # certificate validation is disabled, as before.
    http = httplib2.Http(disable_ssl_certificate_validation=True)

    for doc in docsdb.view('app/datasets'):
        dataset = doc.value
        # Skip the catalogue root entry and everything from agency "qld".
        if dataset['url'] == "http://data.gov.au/data/" or dataset['agencyID'] == "qld":
            continue

        # Collect the package metadata.
        pkg_name = _package_name(dataset['url'])
        downloads = dataset['metadata'].get('Download')
        if not downloads:
            out.writerow([pkg_name])
            continue

        for resource in downloads:
            href = resource['href']
            fixed_url = url_fix(href)
            # Fall back to the raw link when the resource has no name.
            name = resource.get('name', href)
            fmt = _resource_format(resource['format'])

            if href.startswith("ftp"):
                # FTP links are recorded without being probed.
                out.writerow([pkg_name, fixed_url, name, fmt, "ftp", ""])
                continue

            try:
                response, _body = http.request(fixed_url, 'HEAD')
            except httplib2.ServerNotFoundError:
                # Only this error is handled; any other failure propagates.
                out.writerow([pkg_name, fixed_url, name, fmt, "500", "badurl"])
                continue

            out.writerow([pkg_name, fixed_url, name, fmt, response['status'],
                          response.get('content-type', "")])