"""Export dataset documents from CouchDB into a CKAN instance.

Reconstructed from the patch that adds ``documents/datagov-export.py``.

Every row of the ``app/datasets`` view in the ``disclosr-documents`` database
(except the data.gov.au index page itself) is registered as a CKAN package
and then given a placeholder resource.
"""
import os
import re

import ckanclient
import couchdb
from ckanclient import CkanApiError

# NOTE(review): 100 mirrors the original ``[:100]`` slice on the title;
# presumably it is CKAN's package-name length limit -- confirm.
PACKAGE_NAME_MAX_LENGTH = 100


class LoaderError(Exception):
    """Raised when CKAN answers a package registration with an unexpected status."""


# https://github.com/okfn/ckanext-importlib
# Instantiate the CKAN client.
# (use your own api_key from http://thedatahub.org/user/me )
# CKAN_API_URL / CKAN_API_KEY override the defaults so that credentials do
# not have to be edited into this file.
ckan = ckanclient.CkanClient(
    base_location=os.environ.get('CKAN_API_URL', 'http://localhost:5000/api'),
    api_key=os.environ.get('CKAN_API_KEY',
                           'b47b24cd-591d-40c1-8677-d73101d56d1b'),
)


def munge(name):
    """Return *name* lowercased and reduced to the characters ``[a-z0-9_-]``.

    Spaces become underscores, ``:`` becomes ``_-``, ``/`` becomes ``-``,
    every other disallowed character is dropped, and runs of underscores are
    collapsed to a single underscore.
    """
    # convert spaces to underscores (lowercasing once is enough: none of the
    # later substitutions can introduce an uppercase character)
    name = name.replace(' ', '_').lower()
    # convert symbols to dashes
    name = name.replace(':', '_-').replace('/', '-')
    # take out not-allowed characters
    name = re.sub(r'[^a-z0-9_-]', '', name)
    # collapse runs of underscores; replacing only '__' would leave '___' as '__'
    name = re.sub(r'_+', '_', name)
    return name


def name_munge(input_name):
    """Turn a dataset title into a package name via :func:`munge`.

    Spaces are removed outright, ``.`` becomes ``_`` and ``&`` becomes
    ``and`` before the generic munging is applied.
    """
    return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))


couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']


def _package_entity(metadata, pkg_name):
    """Build the CKAN package dict for one document's ``metadata`` mapping."""
    creator = metadata['DCTERMS.Creator']
    return {
        'name': pkg_name,
        'title': metadata['DCTERMS.Title'],
        'url': metadata['DCTERMS.Source.URI'],
        # TODO: tags must be alphanumeric characters or symbols
        'tags': metadata['Keywords / Tags'],
        'author': creator,
        'maintainer': creator,
        # CKAN's package field is spelled 'license_id'; the previous
        # 'licence_id' key is not a CKAN package field.
        'license_id': metadata['DCTERMS.License'],
        'notes': metadata['Description'],
    }


def _export_dataset(doc):
    """Register *doc* as a CKAN package and attach a placeholder resource.

    Raises:
        LoaderError: if CKAN rejects the registration with any status other
            than 409.
    """
    # Collect the package metadata.
    metadata = doc.value['metadata']
    # Munging can lengthen the text ('&' -> 'and', ':' -> '_-'), so the
    # result is capped again after the title has been cut.
    pkg_name = name_munge(
        metadata['DCTERMS.Title'][:100])[:PACKAGE_NAME_MAX_LENGTH]
    package_entity = _package_entity(metadata, pkg_name)

    try:
        ckan.package_register_post(package_entity)
    except CkanApiError as e:
        if ckan.last_status != 409:
            raise LoaderError(
                'Unexpected status %s checking for package under \'%s\': %r'
                % (ckan.last_status, doc.id, e.args))
        # NOTE(review): CKAN may also answer 409 for validation failures --
        # confirm. The error payload is printed so the two cases can be told
        # apart.
        print("already exists (%r)" % (e.args,))

    print(package_entity)
    # TODO: placeholder resource; replace with the dataset's real resources.
    ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo',
                              resource_type='data', format='csv')


if __name__ == "__main__":
    for doc in docsdb.view('app/datasets'):
        print(doc.id)
        # Skip the data.gov.au index page itself.
        if doc.value['url'] != "http://data.gov.au/data/":
            _export_dataset(doc)