import os
import re

import ckanclient
import couchdb
from ckanclient import CkanApiError


class LoaderError(Exception):
    """Raised when the CKAN API returns a status this loader does not handle."""


# Instantiate the CKAN client.
# CKAN_API_URL and CKAN_API_KEY may be set in the environment to override the
# development defaults below
# (use your own api_key from http://thedatahub.org/user/me ).
# NOTE(review): the fallback key is committed to source control; prefer
# supplying CKAN_API_KEY from the environment outside local testing.
ckan = ckanclient.CkanClient(
    base_location=os.environ.get('CKAN_API_URL', 'http://localhost:5000/api'),
    api_key=os.environ.get('CKAN_API_KEY',
                           'b47b24cd-591d-40c1-8677-d73101d56d1b'))

# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
SYMBOLS = {
    'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
    'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                      'zetta', 'iotta'),
    'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
    'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                'zebi', 'yobi'),
}

# Extra spellings accepted on top of SYMBOLS; the tuple index is the power of
# 1024.  SYMBOLS['customary'] mixes two-letter ('KB') and one-letter ('T')
# forms, so both forms are accepted for every unit, 'k' stays an alias for
# 'K' (http://goo.gl/kTQMs), and 'yotta' complements the recipe's 'iotta'.
_SYMBOL_ALIASES = (
    ('B',),
    ('K', 'k', 'KB'),
    ('M', 'MB'),
    ('G', 'GB'),
    ('T', 'TB'),
    ('P', 'PB'),
    ('E', 'EB'),
    ('Z', 'ZB'),
    ('Y', 'YB', 'yotta'),
)


def _build_multipliers():
    """Return a dict mapping every recognised unit symbol to its byte factor."""
    multipliers = {}
    for symbols in SYMBOLS.values():
        for power, symbol in enumerate(symbols):
            multipliers[symbol] = 1 << (power * 10)
    for power, aliases in enumerate(_SYMBOL_ALIASES):
        for alias in aliases:
            # SYMBOLS stays authoritative; aliases only fill the gaps.
            multipliers.setdefault(alias, 1 << (power * 10))
    return multipliers


_MULTIPLIERS = _build_multipliers()


def human2bytes(s):
    """
    Attempts to guess the string format based on default symbols
    set and return the corresponding bytes as an integer.
    When unable to recognize the format ValueError is raised.

    The input is a decimal number (digits and '.') followed by an optional
    run of whitespace and a unit symbol; surrounding whitespace is ignored.
    Every unit is a power of 1024 and the result is truncated to an int.

    >>> human2bytes('0 B')
    0
    >>> human2bytes('1 K')
    1024
    >>> human2bytes('1 M')
    1048576
    >>> human2bytes('1.5 MB')
    1572864
    >>> human2bytes('1 Gi')
    1073741824
    >>> human2bytes('1 tera')
    1099511627776

    >>> human2bytes('0.5kilo')
    512
    >>> human2bytes('0.1 byte')
    0
    >>> human2bytes('1 k')  # k is an alias for K
    1024
    >>> human2bytes('12 foo')
    Traceback (most recent call last):
        ...
    ValueError: can't interpret '12 foo'
    """
    text = s.strip()

    # Split the leading numeric part from the unit symbol.
    pos = 0
    while pos < len(text) and (text[pos].isdigit() or text[pos] == '.'):
        pos += 1
    letter = text[pos:].strip()

    try:
        num = float(text[:pos])
    except ValueError:
        # Missing or malformed number ('MB', '1.2.3 MB'): report it the same
        # way as an unknown unit instead of leaking float()'s own message.
        raise ValueError("can't interpret %r" % s)

    if letter not in _MULTIPLIERS:
        raise ValueError("can't interpret %r" % s)
    return int(num * _MULTIPLIERS[letter])



# https://github.com/okfn/ckanext-importlib
def munge(name):
    """Reduce free text to a lower-case name of letters, digits, '-' and '_'.

    Spaces become '_', ':' becomes '_-', '/' becomes '-', every other
    character outside [a-z0-9_-] is dropped, and any run of underscores is
    collapsed to a single one.

    :param name: text to convert
    :returns: the munged, lower-cased name
    """
    name = name.lower()
    # convert spaces to underscores
    name = name.replace(' ', '_')
    # convert symbols to dashes
    name = name.replace(':', '_-').replace('/', '-')
    # take out not-allowed characters
    name = re.sub(r'[^a-z0-9_-]', '', name)
    # remove repeated underscores; a single pass of '__' -> '_' would still
    # leave '__' behind for runs of three or more
    name = re.sub(r'_+', '_', name)
    return name


def name_munge(input_name):
    """Build a package name from a title.

    Spaces are removed, '.' becomes '_' and '&' becomes 'and'; the result is
    then passed through munge().

    :param input_name: title text to convert
    :returns: the value returned by munge() for the pre-processed text
    """
    cleaned = input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
    return munge(cleaned)


# Module-level CouchDB handles: the local server and the database whose
# 'app/datasets' view is read by the script section of this file.
couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']


if __name__ == "__main__":
    # For every row of the 'app/datasets' view: register a CKAN package built
    # from the row's metadata, then attach the row's downloads as resources.
    for doc in docsdb.view('app/datasets'):
        print(doc.id)
        # NOTE(review): this URL is presumably the catalogue index page rather
        # than a dataset -- confirm against the view's contents.
        if doc.value['url'] == "http://data.gov.au/data/":
            continue

        metadata = doc.value['metadata']

        # Collect the package metadata.
        pkg_name = name_munge(metadata['DCTERMS.Title'][:100])
        tags = metadata["Keywords / Tags"]
        if not hasattr(tags, '__iter__'):
            # A lone tag is a bare string (Python 2 strings have no __iter__).
            tags = [tags]
        # Keep only letters, digits, '-' and '_' in each tag.  The cleaned
        # list has to be assigned back: previously it was built and thrown
        # away, so the raw tags were what got sent to CKAN.
        tags = [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]
        # Drop tags that had no permitted characters at all.
        tags = [tag for tag in tags if tag]

        package_entity = {
            'name': pkg_name,
            'title': metadata['DCTERMS.Title'],
            'url': metadata['DCTERMS.Source.URI'],
            'tags': tags,
            'author': metadata["DCTERMS.Creator"],
            'maintainer': metadata["DCTERMS.Creator"],
            'licence_id': metadata['DCTERMS.License'],  # todo licence id mapping
            'notes': metadata['Description'],
        }

        try:
            ckan.package_register_post(package_entity)
        except CkanApiError as e:
            # 409 is reported as "already exists" and the load carries on;
            # any other failure aborts it.
            if ckan.last_status == 409:
                print("already exists")
            else:
                raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
                    ckan.last_status, pkg_name, e.args))
        print(package_entity)

        # todo add to organisation (author/creator/maintainer)
        # todo add to group when 'data.gov.au Category' is in the metadata

        if 'Download' in metadata:
            downloads = metadata['Download']
            try:
                pkg = ckan.package_entity_get(pkg_name)
                resources = pkg.get('resources', [])
                # Upload only while the package holds fewer resources than the
                # document lists, so a completed load is not repeated.
                if len(resources) < len(downloads):
                    # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
                    # Other source formats, e.g. (KML/KMZ) / (Shapefile) /
                    # (Other), fall back to "plain".
                    known_formats = {'(XML)': 'xml', '(CSV/XLS)': 'csv'}
                    for resource in downloads:
                        resource_format = known_formats.get(resource['format'], "plain")
                        # The link itself serves as the name when none is given.
                        name = resource.get('name', resource['href'])
                        size = human2bytes(resource['size'].replace(',', ''))
                        ckan.add_package_resource(pkg_name, resource['href'], name=name,
                                                  resource_type='data',
                                                  format=resource_format, size=size)
                else:
                    print("resources already exist")
            except CkanApiError as e:
                if ckan.last_status == 404:
                    print("parent dataset does not exist")
                else:
                    raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
                        ckan.last_status, pkg_name, e.args))