cleanse data group/org name for datagov
[disclosr.git] / documents / datagov-export-groups.py
blob:a/documents/datagov-export-groups.py -> blob:b/documents/datagov-export-groups.py
import os
import re

import ckanclient
import couchdb
from ckanclient import CkanApiError
   
   
class LoaderError(Exception):
    """Raised when a CKAN API call fails with a status this script does not handle."""
   
# Instantiate the CKAN client.
#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
# NOTE(review): the API key is committed to source control. The CKAN_API_KEY
# environment variable, when set, takes precedence; the literal is kept only
# as the backward-compatible default and should be rotated and removed.
api_key = os.environ.get('CKAN_API_KEY', 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a')
ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
                             api_key=api_key)
# CouchDB server that holds the scraped documents.
couch = couchdb.Server('http://127.0.0.1:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/')
   
# https://github.com/okfn/ckanext-importlib
def munge(name):
    """Convert free text into a lowercase name made of ``a-z0-9-_`` only.

    Steps, in order: lowercase; spaces become ``_``; ``:`` becomes ``_-``;
    ``/`` becomes ``-``; every remaining character outside
    ``[a-zA-Z0-9-_]`` is dropped; each ``__`` pair is replaced by ``_``.

    The final step is a single left-to-right pass, so a run of three or
    more underscores is shortened but not fully collapsed (``___`` becomes
    ``__``). This is kept as-is so names generated earlier stay stable.

    :param name: text to convert.
    :returns: the munged name.
    """
    # Lowercasing once up front is enough: later steps only insert '_' / '-'.
    name = name.lower()
    # convert spaces to underscores
    name = name.replace(' ', '_')
    # convert symbols to dashes
    name = name.replace(':', '_-').replace('/', '-')
    # take out not-allowed characters
    name = re.sub(r'[^a-zA-Z0-9_-]', '', name)
    # remove double underscores
    name = name.replace('__', '_')
    return name
   
   
def name_munge(input_name):
    """Build a compact munged name from a display name.

    Before delegating to ``munge`` the input has its spaces removed,
    each ``.`` replaced by ``_`` and each ``&`` replaced by ``and``.

    :param input_name: display name to convert.
    :returns: the result of ``munge`` on the pre-processed name.
    """
    compact = input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
    return munge(compact)
   
   
docsdb = couch['disclosr-documents']

# Characters kept when deriving a package name from a dataset URL.
_PKG_NAME_CHARS = frozenset('0123456789abcdefghijklmnopqrstuvwxyz-_')
# Rows whose group key equals this value are skipped, not exported.
_UNSPECIFIED_GROUP = "Not specified"


def _package_name(dataset_url):
    """Derive a package name from a data.gov.au dataset URL."""
    slug = dataset_url.replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]
    # ''.join(...) yields a string on both Python 2 and 3; filter() on a
    # string only returns a string on Python 2.
    return ''.join(ch for ch in slug if ch in _PKG_NAME_CHARS)


def _collect_groups(rows):
    """Map each group name (row.key) to the set of package names from row.value."""
    groups = {}
    for row in rows:
        if row.key == _UNSPECIFIED_GROUP:
            continue
        groups.setdefault(row.key, set()).add(_package_name(row.value))
    return groups


def _sync_group(group_name, packages):
    """Add ``packages`` to the CKAN group ``group_name``, creating it on a 404.

    :raises LoaderError: when the CKAN API fails with a status other than
        404 or 409.
    """
    group_url = name_munge(group_name[:100])
    print(group_name)
    print(packages)
    try:
        # Update the group details
        group_entity = ckan.group_entity_get(group_url)
        print("group " + group_name + " exists")
        if 'packages' in group_entity:
            # Merge with what the group already has, dropping duplicates.
            group_entity['packages'] = list(set(group_entity['packages'] + packages))
        else:
            group_entity['packages'] = packages
        ckan.group_entity_put(group_entity)
    except CkanApiError as e:
        if ckan.last_status == 404:
            print("group " + group_name + " does not exist, creating")
            group_entity = {
                'name': group_url,
                'title': group_name,
                'description': group_name,
                'packages': packages,
            }
            ckan.group_register_post(group_entity)
        elif ckan.last_status == 409:
            print("group already exists")
        else:
            # Report the group being processed; the original interpolated a
            # package name left over from the collection loop.
            raise LoaderError('Unexpected status %s adding to group under \'%s\': %r' % (
                ckan.last_status, group_name, e.args))


if __name__ == "__main__":
    groups = _collect_groups(docsdb.view('app/datasetGroups'))

    # add dataset to group(s)
    for group_name in groups:
        _sync_group(group_name, sorted(groups[group_name]))