Fix broken URLs in datagov export: percent-encode unsafe characters in resource hrefs and default to http:// when no scheme is given
Former-commit-id: 3c41d2694ab3235f5b6e99681b28dc21dda4e617
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -18,6 +18,30 @@
ckandirect = ckanapi.RemoteCKAN('http://'+server, api_key=api_key)
couch = couchdb.Server('http://127.0.0.1:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/')
+
+import urllib
+import urlparse
+
+def url_fix(s, charset='utf-8'):
+ """Sometimes you get an URL by a user that just isn't a real
+ URL because it contains unsafe characters like ' ' and so on. This
+ function can fix some of the problems in a similar way browsers
+ handle data entered by the user:
+
+ >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
+ 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
+
+ :param charset: The target charset for the URL if the url was
+ given as unicode string.
+ """
+ if isinstance(s, unicode):
+ s = s.encode(charset, 'ignore')  # 'ignore' silently drops characters that charset cannot represent
+ if not urlparse.urlparse(s).scheme:  # no scheme was parsed from the input
+ s = "http://"+s  # assume plain http so the result is an absolute URL
+ scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)  # netloc and anchor are passed through unquoted
+ path = urllib.quote(path, '/%')  # '/' and '%' are kept as-is, so existing %-escapes in the path are not re-encoded
+ qs = urllib.quote_plus(qs, ':&=')  # NOTE(review): '%' is not in the safe set here, so an already-escaped query looks like it gets escaped again - confirm
+ return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))  # reassemble the URL from the quoted components
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
SYMBOLS = {
@@ -238,7 +262,7 @@
if 'name' in resource.keys():
name = resource['name']
print resource
- ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
+ ckan.add_package_resource(pkg_name, url_fix(resource['href']), name=name, resource_type='data',
format=format,
size=human2bytes(resource.get('size','0B')))
else: