"""Register dataset documents from CouchDB as packages on a CKAN instance.

Reads every row of the ``app/datasets`` view in the ``disclosr-documents``
CouchDB database and, for each row whose URL is not the data.gov.au root,
posts a package built from the row's metadata to CKAN and attaches a
placeholder resource to it.
"""
import ckanclient
import couchdb
from ckanclient import CkanApiError
import re


class LoaderError(Exception):
    """Raised when CKAN answers a package registration with an unexpected status."""
    pass


# https://github.com/okfn/ckanext-importlib
# Instantiate the CKAN client.
# NOTE(review): the API key is hard-coded; use your own api_key from
# http://thedatahub.org/user/me and avoid committing real credentials.
ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
                             api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')


def munge(name):
    """Return a lower-cased, CKAN-name-safe version of ``name``.

    Spaces become underscores, ':' becomes '_-', '/' becomes '-', every
    character outside ``[a-zA-Z0-9_-]`` is dropped, and finally each
    non-overlapping '__' pair is collapsed to a single '_' in one pass.
    """
    # Lower-case once up front; none of the later steps depends on case and
    # none of them introduces upper-case characters.
    name = name.lower()
    # convert spaces to underscores
    name = name.replace(' ', '_')
    # convert symbols to dashes
    name = name.replace(':', '_-')
    name = name.replace('/', '-')
    # take out not-allowed characters
    name = re.sub(r'[^a-zA-Z0-9_-]', '', name)
    # remove double underscores (single pass, so '___' becomes '__')
    name = name.replace('__', '_')
    return name


def name_munge(input_name):
    """Return a package name derived from ``input_name``.

    Spaces are removed, '.' becomes '_' and '&' becomes 'and' before the
    result is passed through ``munge``. Because spaces are stripped here,
    ``munge`` never sees a space for names coming through this function.
    """
    return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))


couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']


def main():
    """Post every eligible dataset document to CKAN.

    Raises:
        LoaderError: if registering a package fails with any status other
            than 409 (which is reported as "already exists" and skipped over).
    """
    for doc in docsdb.view('app/datasets'):
        print(doc.id)
        # Skip the row that represents the data.gov.au portal root itself.
        if doc.value['url'] == "http://data.gov.au/data/":
            continue

        # Collect the package metadata.
        metadata = doc.value['metadata']
        # CKAN names are length-limited, so only the first 100 characters of
        # the title are used to build the name.
        pkg_name = name_munge(metadata['DCTERMS.Title'][:100])
        package_entity = {
            'name': pkg_name,
            'title': metadata['DCTERMS.Title'],
            'url': metadata['DCTERMS.Source.URI'],
            'tags': metadata["Keywords / Tags"],  # todo must be alphanumeric characters or symbols
            'author': metadata["DCTERMS.Creator"],
            'maintainer': metadata["DCTERMS.Creator"],
            # CKAN's package field is spelled 'license_id'; the previous
            # 'licence_id' key was not a recognised package attribute.
            'license_id': metadata['DCTERMS.License'],
            'notes': metadata['Description'],
        }

        try:
            ckan.package_register_post(package_entity)
        except CkanApiError as e:
            if ckan.last_status == 409:
                # The package was registered on an earlier run; carry on.
                print("already exists")
            else:
                raise LoaderError(
                    "Unexpected status %s checking for package under '%s': %r"
                    % (ckan.last_status, doc.id, e.args))

        print(package_entity)
        # NOTE(review): the resource URL/name/format below are placeholders
        # and are identical for every package.
        ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo',
                                  resource_type='data', format='csv')


if __name__ == "__main__":
    main()