Former-commit-id: 997dc9ece51a37dc25779ca4125d0960cdc195c9
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -2,14 +2,75 @@
import couchdb
from ckanclient import CkanApiError
import re
+
class LoaderError(Exception):
pass
-# https://github.com/okfn/ckanext-importlib
+
# Instantiate the CKAN client.
ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
- api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
+ api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
# (use your own api_key from http://thedatahub.org/user/me )
+# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
+SYMBOLS = {  # suffix tables read by human2bytes(); position i in a tuple stands for 2 ** (10 * i) bytes
+ 'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),  # only K/M/G carry a trailing 'B' here
+ 'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
+ 'zetta', 'iotta'),  # 'iotta' is the spelling this table accepts
+ 'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
+ 'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
+ 'zebi', 'yobi'),  # 'byte' appears in both *_ext tables; it is index 0 (factor 1) in each
+}
+def human2bytes(s):
+    """
+    Parse a human-readable size such as '1.5 MB' using the suffix tables
+    in SYMBOLS and return the corresponding number of bytes as an integer.
+    ValueError is raised when the number or the suffix is not recognised.
+
+    >>> human2bytes('0 B')
+    0
+    >>> human2bytes('1 KB')
+    1024
+    >>> human2bytes('1 MB')
+    1048576
+    >>> human2bytes('1 Gi')
+    1073741824
+    >>> human2bytes('1 tera')
+    1099511627776
+
+    >>> human2bytes('0.5kilo')
+    512
+    >>> human2bytes('0.1 byte')
+    0
+    >>> human2bytes('1 k')  # k is an alias for KB
+    1024
+    >>> human2bytes('12 foo')
+    Traceback (most recent call last):
+        ...
+    ValueError: can't interpret '12 foo'
+    """
+    init = s
+    num = ""
+    while s[0:1].isdigit() or s[0:1] == '.':
+        num += s[0]
+        s = s[1:]
+    try:
+        num = float(num)
+    except ValueError:  # no leading number, or a malformed one such as '1.2.3'
+        raise ValueError("can't interpret %r" % init)
+    letter = s.strip()
+    if letter == 'k':
+        # treat 'k' as an alias for the customary kilo symbol, as per: http://goo.gl/kTQMs
+        letter = SYMBOLS['customary'][1]
+    for sset in SYMBOLS.values():
+        if letter in sset:
+            break
+    else:
+        raise ValueError("can't interpret %r" % init)
+    # position i in the matched table stands for 2 ** (10 * i) bytes
+    prefix = dict((sym, 1 << i * 10) for i, sym in enumerate(sset))
+    return int(num * prefix[letter])
+
+# https://github.com/okfn/ckanext-importlib
def munge(name):
# convert spaces to underscores
name = re.sub(' ', '_', name).lower()
@@ -21,6 +82,8 @@
# remove double underscores
name = re.sub('__', '_', name).lower()
return name
+
+
def name_munge(input_name):
return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
#return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
@@ -34,25 +97,58 @@
if doc.value['url'] != "http://data.gov.au/data/":
# Collect the package metadata.
pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100])
+ tags = doc.value['metadata']["Keywords / Tags"]
+ if not hasattr(tags, '__iter__'):
+ tags = [tags]
+ tags = [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]  # keep the sanitised list; it was previously built and discarded
package_entity = {
'name': pkg_name,
'title': doc.value['metadata']['DCTERMS.Title'],
'url': doc.value['metadata']['DCTERMS.Source.URI'],
- 'tags': doc.value['metadata']["Keywords / Tags"], #todo must be alphanumeric characters or symbols
-
+ 'tags': tags,
'author': doc.value['metadata']["DCTERMS.Creator"],
'maintainer': doc.value['metadata']["DCTERMS.Creator"],
- 'licence_id': doc.value['metadata']['DCTERMS.License'],
+ 'licence_id': doc.value['metadata']['DCTERMS.License'], #todo licence id mapping
'notes': doc.value['metadata']['Description'],
- }
+ }
try:
+ #print doc.id
ckan.package_register_post(package_entity)
except CkanApiError, e:
if ckan.last_status == 409:
print "already exists"
else:
- raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (ckan.last_status, doc.id, e.args))
+ raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
+ ckan.last_status, pkg_name, e.args))
print package_entity
- ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo', resource_type='data', format='csv')
+ #todo add to organisation (author/creator/maintainer)
+ #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group
+ if 'Download' in doc.value['metadata'].keys():
+ try:
+ pkg = ckan.package_entity_get(pkg_name)
+ resources = pkg.get('resources', [])
+ if len(resources) < len(doc.value['metadata']['Download']):
+ for resource in doc.value['metadata']['Download']:
+ #print resource
+ # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
+ # (KML/KMZ) / (Shapefile) /(Other)
+ format = "plain"
+ if resource['format'] == '(XML)':
+ format = 'xml'
+ if resource['format'] == '(CSV/XLS)':
+ format = 'csv'
+ name = resource['href']
+ if 'name' in resource.keys():
+ name = resource['name']
+ ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
+ format=format, size=human2bytes(resource['size'].replace(',', '')))
+ else:
+ print "resources already exist"
+ except CkanApiError, e:
+ if ckan.last_status == 404:
+ print "parent dataset does not exist"
+ else:
+ raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
+ ckan.last_status, pkg_name, e.args))