fix broken urls on datagov export
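Add url_fix() to percent-encode resource hrefs (prepending http:// when the scheme is missing) before they are added as CKAN package resources.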


Former-commit-id: 3c41d2694ab3235f5b6e99681b28dc21dda4e617

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -18,6 +18,30 @@
 ckandirect = ckanapi.RemoteCKAN('http://'+server, api_key=api_key)
 couch = couchdb.Server('http://127.0.0.1:5984/')
 #couch = couchdb.Server('http://192.168.1.113:5984/')
+
+import urllib
+import urlparse
+
+def url_fix(s, charset='utf-8'):
+    """Percent-encode unsafe characters (such as ' ') in a user-supplied
+    URL, much as browsers tidy typed addresses.  A URL without a scheme
+    gets 'http://' prepended before it is split and quoted.
+
+    >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
+    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
+
+    :param s: The URL to fix, as a byte string or a unicode string.
+    :param charset: Charset used to encode the URL if given as unicode.
+    :return: The fixed URL as a byte string.
+    """
+    if isinstance(s, unicode):
+        s = s.encode(charset, 'ignore')
+    if not urlparse.urlparse(s).scheme:
+        s = "http://"+s
+    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
+    path = urllib.quote(path, '/%')
+    qs = urllib.quote_plus(qs, ':&=')
+    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
 
 # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
 SYMBOLS = {
@@ -238,7 +262,7 @@
                             if 'name' in resource.keys():
                                 name = resource['name']
                             print resource
-                            ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
+                            ckan.add_package_resource(pkg_name, url_fix(resource['href']), name=name, resource_type='data',
                                                       format=format,
                                                       size=human2bytes(resource.get('size','0B')))
                     else: