export
[disclosr.git] / documents / datagov-export.py
blob:a/documents/datagov-export.py -> blob:b/documents/datagov-export.py
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -1,1 +1,58 @@
+import ckanclient
+import couchdb
+from ckanclient import CkanApiError
+import re
+class LoaderError(Exception):
+    """Raised when CKAN answers a package registration with an unexpected status."""
+# See https://github.com/okfn/ckanext-importlib
+# CKAN API client for the local instance; the export loop registers packages through it.
+# NOTE(review): the api_key is committed here -- use your own from http://thedatahub.org/user/me
+ckan = ckanclient.CkanClient(api_key='b47b24cd-591d-40c1-8677-d73101d56d1b',
+                             base_location='http://localhost:5000/api')
 
+def munge(name):
+    """Return `name` lowercased and reduced to the characters a-z, 0-9, '_' and '-'."""
+    # Lowercase once up front; spaces become underscores.
+    name = name.lower().replace(' ', '_')
+    # ':' becomes '_-' and '/' becomes '-' instead of being dropped below.
+    name = name.replace(':', '_-').replace('/', '-')
+    # Take out every character that is still not allowed.
+    name = re.sub(r'[^a-z0-9_-]', '', name)
+    # Collapse any run of underscores; one '__' -> '_' pass used to leave '___' as '__'.
+    name = re.sub(r'_{2,}', '_', name)
+    return name
+def name_munge(input_name):
+    """Return munge() of input_name with spaces removed, '.' -> '_' and '&' -> 'and'."""
+    return munge(input_name.replace('&', 'and').replace('.', '_').replace(' ', ''))
+
+couch = couchdb.Server('http://127.0.0.1:5984/')
+docsdb = couch['disclosr-documents']
+
+if __name__ == "__main__":
+    # Register every row of the CouchDB 'app/datasets' view as a CKAN package.
+    for doc in docsdb.view('app/datasets'):
+        print(doc.id)
+        if doc.value['url'] == "http://data.gov.au/data/":
+            continue  # rows whose url is exactly the data.gov.au data root are not exported
+        metadata = doc.value['metadata']
+        # Munging can lengthen a title ('&' -> 'and', ':' -> '_-'); cap the result at 100 chars too.
+        pkg_name = name_munge(metadata['DCTERMS.Title'][:100])[:100]
+        package_entity = {
+            'name': pkg_name,
+            'title': metadata['DCTERMS.Title'],
+            'url': metadata['DCTERMS.Source.URI'],
+            'tags': metadata["Keywords / Tags"],  # todo: must be alphanumeric characters or symbols
+            'author': metadata["DCTERMS.Creator"],
+            'maintainer': metadata["DCTERMS.Creator"],
+            'license_id': metadata['DCTERMS.License'],  # CKAN spells this field 'license_id'
+            'notes': metadata['Description'],
+        }
+        try:
+            ckan.package_register_post(package_entity)
+        except CkanApiError as e:
+            if ckan.last_status != 409:
+                raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (ckan.last_status, doc.id, e.args))
+            print("already exists")  # a 409 from the register call is reported and tolerated
+        print(package_entity)
+        ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo', resource_type='data', format='csv')
+