--- a/documents/datagov-export.py +++ b/documents/datagov-export.py @@ -2,43 +2,22 @@ import couchdb from ckanclient import CkanApiError import re +import html2text # aaronsw :( +import ckanapi # https://github.com/open-data/ckanapi + class LoaderError(Exception): pass # Instantiate the CKAN client. -#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') -ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', api_key='72f90359-0396-438c-804f-a26a24336747') -#couch = couchdb.Server('http://127.0.0.1:5984/') -couch = couchdb.Server('http://192.168.1.113:5984/') - -# http://stackoverflow.com/a/7778368/684978 -from HTMLParser import HTMLParser -import htmlentitydefs - -class HTMLTextExtractor(HTMLParser): - def __init__(self): - HTMLParser.__init__(self) - self.result = [ ] - - def handle_data(self, d): - self.result.append(d) - - def handle_charref(self, number): - codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number) - self.result.append(unichr(codepoint)) - - def handle_entityref(self, name): - codepoint = htmlentitydefs.name2codepoint[name] - self.result.append(unichr(codepoint)) - - def get_text(self): - return u''.join(self.result) - -def html_to_text(html): - s = HTMLTextExtractor() - s.feed(html) - return s.get_text() +api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc' +server = 'data.disclosurelo.gs' + +ckan = ckanclient.CkanClient(base_location='http://'+server+'api', + api_key=api_key) +ckandirect = ckanapi.RemoteCKAN('http://'+server, api_key=api_key) +couch = couchdb.Server('http://127.0.0.1:5984/') +#couch = couchdb.Server('http://192.168.1.113:5984/') # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/ SYMBOLS = { @@ -49,6 +28,7 @@ 'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'), } + def human2bytes(s): """ @@ -78,6 +58,9 @@ ... ValueError: can't interpret '12 foo' """ + if s == None: + return 0 + s = s.replace(',', '') init = s num = "" while s and s[0:1].isdigit() or s[0:1] == '.': @@ -116,7 +99,7 @@ def name_munge(input_name): return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and')) - #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and') + def get_licence_id(licencename): map = { @@ -131,56 +114,115 @@ "Creative Commons - Attribution 2.5 Australia (CC-BY)": 'cc-by', 'CreativeCommonsAttributionCCBY25': 'cc-by', "PublicDomain": 'other-pd', - } + } if licencename not in map.keys(): - raise Exception(licencename + " not found"); + raise Exception(licencename + " not found"); return map[licencename]; + docsdb = couch['disclosr-documents'] if __name__ == "__main__": + orgs_list = [] + orgs_ids = {} for doc in docsdb.view('app/datasets'): + print " --- " print doc.id - if doc.value['url'] != "http://data.gov.au/data/": + + if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld": + + # Collect the package metadata. - pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100]) - tags = doc.value['metadata']["Keywords / Tags"] - if not hasattr(tags, '__iter__'): - tags = [tags] - [re.sub('[^a-zA-Z0-9-_()]', '', tag).replace('&', 'and').lower() for tag in tags] - package_entity = { - 'name': pkg_name, - 'title': doc.value['metadata']['DCTERMS.Title'], - 'url': doc.value['metadata']['DCTERMS.Source.URI'], - - 'author': doc.value['metadata']["DCTERMS.Creator"], - 'maintainer': doc.value['metadata']["DCTERMS.Creator"], - 'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']), - 'notes': html_to_text(doc.value['metadata']['Description']), - } - if len(tags) > 0: - package_entity['tags'] = tags - print tags + pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_', + doc.value['url'].replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]); + print pkg_name + #add to or create organization using direct API + agency = doc.value['metadata']["Agency"] + if agency == "APS": + agency = "Australian Public Service Commission" + if agency == "Shared Services, Treasury Directorate": + agency = "Shared Services Procurement, Treasury Directorate" + if agency == "Treasury - Shared Services": + agency = "Shared Services Procurement, Treasury Directorate" + if agency == "Territory and Municipal Services (TAMS)": + agency = "Territory and Municipal Services Directorate" + if agency == "State Library of NSW": + agency = "State Library of New South Wales" + org_name = name_munge(agency[:100]) + if org_name not in orgs_list: + orgs_list = ckandirect.action.organization_list()['result'] + #print orgs_list + if org_name not in orgs_list: + try: + print "org not found, creating " + org_name + ckandirect.action.organization_create(name=org_name, title=agency, + description=agency) + orgs_list.append(org_name) + except ckanapi.ValidationError, e: + print e + raise LoaderError('Unexpected status') + else: + print "org found, adding dataset to " + org_name + + # cache org names -> id mapping + if org_name not in orgs_ids: + org = ckandirect.action.organization_show(id=org_name) + orgs_ids[org_name] = org["result"]["id"] + org_id = orgs_ids[org_name] + print "org id is "+org_id + tags = [] + creator = doc.value['metadata']["DCTERMS.Creator"] + if doc.value['agencyID'] == "AGIMO": + if len(doc.value['metadata']["Keywords / Tags"]) > 0: + if hasattr(doc.value['metadata']["Keywords / Tags"], '__iter__'): + tags = tags + doc.value['metadata']["Keywords / Tags"] + else: + tags = tags + [doc.value['metadata']["Keywords / Tags"]] + + tags = [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower() for tag in tags if tag] + #print tags + extras = [] + + for extra_key in doc.value['metadata'].keys(): + if extra_key not in ["Description","Content-Language","DCTERMS.Description", "Keywords / Tags" ,"data.gov.au Category", "Download", "Permalink","DCTERMS.Identifier"]: + if doc.value['metadata'][extra_key] != None and doc.value['metadata'][extra_key] != "": + extras.append([extra_key, doc.value['metadata'][extra_key]]) + + package_entity = { + 'name': pkg_name, + 'title': doc.value['metadata']['DCTERMS.Title'], + 'url': doc.value['metadata']['DCTERMS.Source.URI'], + 'tags': tags, #tags are mandatory? + 'author': creator, + 'maintainer': creator, + 'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']), + 'notes': html2text.html2text(doc.value['metadata']['Description']), + 'owner_org': org_id, + 'extras': extras + } + + try: - #print doc.id + #print package_entity ckan.package_register_post(package_entity) except CkanApiError, e: - if ckan.last_status == 409: - print "already exists" + if ckan.last_message == "{\"name\": [\"That URL is already in use.\"]}": + print "package already exists" else: + print ckan.last_message raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % ( ckan.last_status, pkg_name, e.args)) - - print package_entity - #todo add to organisation (author/creator/maintainer) http://docs.ckan.org/en/latest/apiv3.html#examples ckan.logic.action.update.package_owner_org_update - #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group + pkg = ckan.package_entity_get(pkg_name) + + + # add resources (downloadable data files) if 'Download' in doc.value['metadata'].keys(): try: - pkg = ckan.package_entity_get(pkg_name) + resources = pkg.get('resources', []) if len(resources) < len(doc.value['metadata']['Download']): for resource in doc.value['metadata']['Download']: - print resource + # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html # (KML/KMZ) / (Shapefile) /(Other) format = "plain" @@ -188,11 +230,17 @@ format = 'xml' if resource['format'] == '(CSV/XLS)': format = 'csv' + if resource['format'] == '(Shapefile)': + format = 'shp' + if resource['format'] == '(KML/KMZ)': + format = 'kml' name = resource['href'] if 'name' in resource.keys(): name = resource['name'] + print resource ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data', - format=format, size=human2bytes(resource['size'].replace(',', ''))) + format=format, + size=human2bytes(resource.get('size','0B'))) else: print "resources already exist" except CkanApiError, e: