# Source: [disclosr.git] / documents / datagov-export.py
import os
import re

import ckanclient
import couchdb
from ckanclient import CkanApiError
class LoaderError(Exception):
    """Raised when the CKAN API answers with a status the loader cannot handle."""
# See https://github.com/okfn/ckanext-importlib
# Instantiate the CKAN client.
# The endpoint and API key may be overridden with the CKAN_API_URL and
# CKAN_API_KEY environment variables, so credentials do not have to be edited
# into this file; when the variables are unset the original values are used.
# NOTE(review): the default api_key below is a credential committed to source
# control -- consider revoking it and supplying CKAN_API_KEY instead.
ckan = ckanclient.CkanClient(
    base_location=os.environ.get('CKAN_API_URL', 'http://localhost:5000/api'),
    api_key=os.environ.get('CKAN_API_KEY',
                           'b47b24cd-591d-40c1-8677-d73101d56d1b'))
# (use your own api_key from http://thedatahub.org/user/me )
 
def munge(name):
    """Turn arbitrary text into a lower-case slug.

    Steps, in order:

    1. lower-case the input;
    2. replace each space with ``_``, each ``:`` with ``_-`` and each ``/``
       with ``-``;
    3. drop every character that is not an ASCII letter, digit, ``-`` or
       ``_``;
    4. collapse every run of underscores into a single ``_``.

    :param name: the text to convert.
    :returns: the munged string (empty if no character survives step 3).
    """
    # Lower-casing once up front is sufficient: the later steps only insert
    # '-' / '_' or delete characters.
    name = name.lower()
    # convert spaces to underscores
    name = name.replace(' ', '_')
    # convert symbols to dashes
    name = name.replace(':', '_-')
    name = name.replace('/', '-')
    # take out not-allowed characters
    name = re.sub(r'[^a-zA-Z0-9_-]', '', name)
    # remove repeated underscores; matching the whole run (rather than just
    # pairs) guarantees no '__' is left behind for runs of three or more
    name = re.sub(r'_{2,}', '_', name)
    return name
def name_munge(input_name):
    """Build a package name from *input_name*.

    Spaces are removed, ``.`` becomes ``_`` and ``&`` becomes ``and``; the
    result is then passed through ``munge``.
    """
    cleaned = input_name
    # Apply the substitutions one after another, in this exact order.
    for old, new in ((' ', ''), ('.', '_'), ('&', 'and')):
        cleaned = cleaned.replace(old, new)
    return munge(cleaned)
 
# Connect to the local CouchDB server and open the documents database.
couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']

if __name__ == "__main__":
    # Walk the 'app/datasets' view and register each row as a CKAN package.
    for doc in docsdb.view('app/datasets'):
        print(doc.id)
        # Rows whose url is the data.gov.au data root are not exported.
        if doc.value['url'] == "http://data.gov.au/data/":
            continue

        # Collect the package metadata.
        metadata = doc.value['metadata']
        # Only the first 100 characters of the title feed the package name.
        pkg_name = name_munge(metadata['DCTERMS.Title'][:100])
        package_entity = {
            'name': pkg_name,
            'title': metadata['DCTERMS.Title'],
            'url': metadata['DCTERMS.Source.URI'],
            # TODO: tags must be alphanumeric characters or symbols
            'tags': metadata["Keywords / Tags"],
            'author': metadata["DCTERMS.Creator"],
            'maintainer': metadata["DCTERMS.Creator"],
            # CKAN names this dataset field 'license_id' (American
            # spelling); the former 'licence_id' key is not a CKAN field.
            # NOTE(review): presumably DCTERMS.License holds a value CKAN
            # accepts as a license id -- confirm against the scraped data.
            'license_id': metadata['DCTERMS.License'],
            'notes': metadata['Description'],
        }
        try:
            ckan.package_register_post(package_entity)
        except CkanApiError as e:
            # A 409 status is reported as "already exists" and is not fatal;
            # any other failure aborts the run.
            if ckan.last_status != 409:
                raise LoaderError(
                    "Unexpected status %s checking for package under '%s': %r"
                    % (ckan.last_status, doc.id, e.args))
            print("already exists")

        print(package_entity)
        # NOTE(review): the URL/name below look like placeholder values, and
        # this runs even when the package already existed -- confirm intent.
        ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo',
                                  resource_type='data', format='csv')