import
import


Former-commit-id: 997dc9ece51a37dc25779ca4125d0960cdc195c9

import ckanclient import ckanclient
import couchdb import couchdb
from ckanclient import CkanApiError from ckanclient import CkanApiError
import re import re
   
class LoaderError(Exception):
    """Raised when the CKAN API answers with a status this loader does not handle."""
# https://github.com/okfn/ckanext-importlib  
# Instantiate the CKAN client.
# (use your own api_key from http://thedatahub.org/user/me )
# NOTE(review): the API key is committed in source; consider loading it from
# configuration or the environment before sharing this script.
ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
                             api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
  # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
  SYMBOLS = {
  'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
  'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
  'zetta', 'iotta'),
  'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
  'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
  'zebi', 'yobi'),
  }
   
  def human2bytes(s):
  """
  Attempts to guess the string format based on default symbols
  set and return the corresponding bytes as an integer.
  When unable to recognize the format ValueError is raised.
   
  >>> human2bytes('0 B')
  0
  >>> human2bytes('1 K')
  1024
  >>> human2bytes('1 M')
  1048576
  >>> human2bytes('1 Gi')
  1073741824
  >>> human2bytes('1 tera')
  1099511627776
   
  >>> human2bytes('0.5kilo')
  512
  >>> human2bytes('0.1 byte')
  0
  >>> human2bytes('1 k') # k is an alias for K
  1024
  >>> human2bytes('12 foo')
  Traceback (most recent call last):
  ...
  ValueError: can't interpret '12 foo'
  """
  init = s
  num = ""
  while s and s[0:1].isdigit() or s[0:1] == '.':
  num += s[0]
  s = s[1:]
  num = float(num)
  letter = s.strip()
  for name, sset in SYMBOLS.items():
  if letter in sset:
  break
  else:
  if letter == 'k':
  # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs
  sset = SYMBOLS['customary']
  letter = letter.upper()
  else:
  raise ValueError("can't interpret %r" % init)
  prefix = {sset[0]: 1}
  for i, s in enumerate(sset[1:]):
  prefix[s] = 1 << (i + 1) * 10
  return int(num * prefix[letter])
   
  # https://github.com/okfn/ckanext-importlib
def munge(name):
    """Turn an arbitrary title into a lower-case, CKAN-style name slug.

    Spaces become underscores, ':' becomes '_-', '/' becomes '-', every
    character outside [a-zA-Z0-9-_] is dropped, and '__' is replaced by '_'.
    The last step is a single pass, so a run of three or more underscores is
    shortened but not necessarily reduced to one.
    """
    # Lower-casing once up front is enough: none of the substitutions below
    # can introduce an upper-case character.
    name = name.lower()
    # convert spaces to underscores
    name = re.sub(' ', '_', name)
    # convert symbols to dashes
    name = re.sub('[:]', '_-', name)
    name = re.sub('[/]', '-', name)
    # take out not-allowed characters
    name = re.sub('[^a-zA-Z0-9-_]', '', name)
    # remove double underscores
    name = re.sub('__', '_', name)
    return name
   
   
def name_munge(input_name):
    """Build a package name from a title.

    Spaces are removed, '.' becomes '_' and '&' becomes 'and' before the
    result is passed through munge().
    """
    return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
   
# Connect to the local CouchDB server and open the database holding the
# scraped documents.
couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']
   
if __name__ == "__main__":
    # Walk every row of the 'app/datasets' view, register it as a CKAN package
    # and then attach its download links as package resources.
    for doc in docsdb.view('app/datasets'):
        print(doc.id)
        # The data.gov.au landing page itself is not a dataset.
        if doc.value['url'] == "http://data.gov.au/data/":
            continue

        metadata = doc.value['metadata']

        # Collect the package metadata.
        pkg_name = name_munge(metadata['DCTERMS.Title'][:100])
        tags = metadata["Keywords / Tags"]
        # A single tag arrives as a bare string; wrap it so it is not
        # iterated character by character.
        if not isinstance(tags, (list, tuple)):
            tags = [tags]
        # Keep the sanitised tags: the original built this list and threw it
        # away, so the raw tags were what got posted.
        tags = [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]
        package_entity = {
            'name': pkg_name,
            'title': metadata['DCTERMS.Title'],
            'url': metadata['DCTERMS.Source.URI'],
            'tags': tags,
            'author': metadata["DCTERMS.Creator"],
            'maintainer': metadata["DCTERMS.Creator"],
            # todo licence id mapping
            # NOTE(review): CKAN's package field is spelled 'license_id';
            # confirm whether this key is actually picked up by the API.
            'licence_id': metadata['DCTERMS.License'],
            'notes': metadata['Description'],
        }
        try:
            ckan.package_register_post(package_entity)
        except CkanApiError as e:
            # 409 means the package was registered by an earlier run.
            if ckan.last_status == 409:
                print("already exists")
            else:
                raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
                    ckan.last_status, pkg_name, e.args))

        print(package_entity)
        # todo add to organisation (author/creator/maintainer)
        # todo add to group when 'data.gov.au Category' is present in metadata
        if 'Download' in metadata:
            try:
                pkg = ckan.package_entity_get(pkg_name)
                resources = pkg.get('resources', [])
                # Only upload when CKAN holds fewer resources than the source
                # lists, so re-running the loader does not duplicate them.
                if len(resources) < len(metadata['Download']):
                    for resource in metadata['Download']:
                        # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
                        # Other source formats seen: (KML/KMZ) / (Shapefile) / (Other)
                        resource_format = "plain"
                        if resource['format'] == '(XML)':
                            resource_format = 'xml'
                        if resource['format'] == '(CSV/XLS)':
                            resource_format = 'csv'
                        # Fall back to the link itself when no name is given.
                        name = resource['href']
                        if 'name' in resource:
                            name = resource['name']
                        ckan.add_package_resource(
                            pkg_name, resource['href'], name=name,
                            resource_type='data', format=resource_format,
                            size=human2bytes(resource['size'].replace(',', '')))
                else:
                    print("resources already exist")
            except CkanApiError as e:
                if ckan.last_status == 404:
                    print("parent dataset does not exist")
                else:
                    raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
                        ckan.last_status, pkg_name, e.args))