Fix broken URLs on datagov export
Former-commit-id: 3c41d2694ab3235f5b6e99681b28dc21dda4e617
--- a/documents/
+++ b/documents/
@@ -18,6 +18,30 @@
ckandirect = ckanapi.RemoteCKAN('http://'+server, api_key=api_key)
couch = couchdb.Server('')
#couch = couchdb.Server('')
+import urllib
+import urlparse
+def url_fix(s, charset='utf-8'):
+ """Return a cleaned-up, percent-encoded version of a user-supplied URL.
+
+ Sometimes you get a URL from a user that just isn't a real
+ URL because it contains unsafe characters like ' ' and so on. This
+ function can fix some of the problems in a similar way browsers
+ handle data entered by the user:
+
+ >>> url_fix(u'http://example.com/wiki/Elf (Begriffsklärung)')
+ 'http://example.com/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
+
+ If the input has no scheme, ``http://`` is prepended before the URL
+ is split and quoted:
+
+ >>> url_fix('example.com/a b')
+ 'http://example.com/a%20b'
+
+ Only the path and the query string are quoted; the scheme, network
+ location and fragment are passed through unchanged.
+
+ :param s: The URL to fix, as a byte string or a unicode string.
+ :param charset: The target charset for the URL if the url was
+ given as unicode string.
+ :return: The fixed URL as a byte string.
+ """
+ # Work on bytes: unicode input is encoded to `charset`, and characters
+ # that cannot be encoded are silently dropped ('ignore').
+ if isinstance(s, unicode):
+ s = s.encode(charset, 'ignore')
+ # Scheme-less input is treated as an http URL so that urlsplit() below
+ # sees a network location rather than just a path.
+ if not urlparse.urlparse(s).scheme:
+ s = "http://"+s
+ scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
+ # '%' is kept safe so existing percent-escapes are not encoded a second time.
+ path = urllib.quote(path, '/%')
+ # quote_plus() turns spaces into '+'; ':', '&' and '=' are left untouched
+ # so the key=value&key=value structure of the query survives.
+ qs = urllib.quote_plus(qs, ':&=')
+ return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
@@ -238,7 +262,7 @@
if 'name' in resource.keys():
name = resource['name']
print resource
- ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
+ ckan.add_package_resource(pkg_name, url_fix(resource['href']), name=name, resource_type='data',