Fix broken URLs on datagov export
Former-commit-id: 3c41d2694ab3235f5b6e99681b28dc21dda4e617
--- a/documents/datagov-export-groups.py
+++ b/documents/datagov-export-groups.py
@@ -9,7 +9,7 @@
# Instantiate the CKAN client.
#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
-api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'
+api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
api_key=api_key)
couch = couchdb.Server('http://127.0.0.1:5984/')
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -11,11 +11,37 @@
# Instantiate the CKAN client.
api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
-ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
+server = 'data.disclosurelo.gs'
+
+ckan = ckanclient.CkanClient(base_location='http://'+server+'/api',
api_key=api_key)
-ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key)
+ckandirect = ckanapi.RemoteCKAN('http://'+server, api_key=api_key)
couch = couchdb.Server('http://127.0.0.1:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/')
+
+import urllib
+import urlparse
+
+def url_fix(s, charset='utf-8'):
+ """Percent-encode unsafe characters (such as ' ') in a user-supplied
+ URL, in a similar way to how browsers handle data entered by the
+ user. "http://" is prepended when the URL has no scheme; the path and
+ query string are then quoted and the URL is reassembled:
+
+ >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
+ 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
+
+ :param s: The URL to fix, as a byte string or a unicode string.
+ :param charset: The charset used to encode s if it is unicode.
+ """
+ if isinstance(s, unicode):
+ s = s.encode(charset, 'ignore')
+ if not urlparse.urlparse(s).scheme:
+ s = "http://"+s
+ scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
+ path = urllib.quote(path, '/%')
+ qs = urllib.quote_plus(qs, ':&=')
+ return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
SYMBOLS = {
@@ -182,8 +208,9 @@
extras = []
for extra_key in doc.value['metadata'].keys():
- if extra_key != "Keywords / Tags" and extra_key != "data.gov.au Category" and extra_key != "Download" :
- extras.append({'key':extra_key, 'value':doc.value['metadata'][extra_key]})
+ if extra_key not in ["Description","Content-Language","DCTERMS.Description", "Keywords / Tags" ,"data.gov.au Category", "Download", "Permalink","DCTERMS.Identifier"]:
+ if doc.value['metadata'][extra_key] != None and doc.value['metadata'][extra_key] != "":
+ extras.append([extra_key, doc.value['metadata'][extra_key]])
package_entity = {
'name': pkg_name,
@@ -235,7 +262,7 @@
if 'name' in resource.keys():
name = resource['name']
print resource
- ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
+ ckan.add_package_resource(pkg_name, url_fix(resource['href']), name=name, resource_type='data',
format=format,
size=human2bytes(resource.get('size','0B')))
else: