# Provenance: commit 42e670b6749747aac765792212dc32cf63441525 by Maxious
# (2013-04-18), "fix broken urls on datagov export", touching
# documents/datagov-export.py.  That commit introduced url_fix() and changed
# the export call site to
#     ckan.add_package_resource(pkg_name, url_fix(resource['href']), ...)
# so every resource href is sanitised before it is sent to CKAN.

def url_fix(s, charset='utf-8'):
    r"""Return *s* as a usable, percent-encoded URL.

    User-supplied URLs often contain unsafe characters such as ' ' or
    non-ASCII letters, or lack a scheme altogether.  This repairs them in
    roughly the way a browser treats text typed into the address bar:

    >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
    >>> url_fix('example.com:8080/data set.csv?q=a b&x=%20y')
    'http://example.com:8080/data%20set.csv?q=a+b&x=%20y'

    Behaviour:

    * surrounding whitespace is removed; an empty result is returned as-is
      instead of being turned into the meaningless ``http://``;
    * ``//host/path`` (protocol-relative) and ``host:port/path`` inputs get
      an ``http`` scheme, as does anything without a scheme;
    * existing ``%XX`` escapes are left alone in both the path and the query
      string, and ``+`` is left alone in the query string, so fixing an
      already fixed URL is a no-op.

    :param s: the URL, as a text or byte string.
    :param charset: the charset used to convert between text and bytes
                    before percent-encoding; characters that cannot be
                    converted are dropped.
    :return: the repaired URL as a native ``str``.
    """
    import re
    # Function-local imports keep this block self-contained and let it run
    # under both the Python 2 and the Python 3 standard-library layouts.
    try:
        from urllib import quote, quote_plus
        from urlparse import urlsplit, urlunsplit
    except ImportError:
        from urllib.parse import quote, quote_plus, urlsplit, urlunsplit

    text_type = type(u'')
    quote_options = {}
    if str is text_type:
        # Python 3: work on text and let quote() do the charset encoding.
        if isinstance(s, bytes):
            s = s.decode(charset, 'ignore')
        quote_options = {'encoding': charset, 'errors': 'ignore'}
    elif isinstance(s, text_type):
        # Python 2: quote() expects an encoded byte string.
        s = s.encode(charset, 'ignore')

    s = s.strip()
    if not s:
        return s

    if s.startswith('//'):
        # Protocol-relative URL: only the scheme name itself is missing.
        s = 'http:' + s
    elif (re.match(r'[^/:?#]+:\d+(?:[/?#]|$)', s)
          or not urlsplit(s).scheme):
        # "host:port/..." would otherwise be mis-read as scheme "host".
        s = 'http://' + s

    scheme, netloc, path, qs, anchor = urlsplit(s)
    # '%' stays safe so existing escapes are not double-encoded; '+' stays
    # safe in the query because there it already stands for a space.
    path = quote(path, '/%', **quote_options)
    qs = quote_plus(qs, ':&=%+', **quote_options)
    return urlunsplit((scheme, netloc, path, qs, anchor))