Be permissive with datagov resource file size: tolerate resources that have no size listed
Former-commit-id: c0a71c3c09369965e8748c2a8f7062eb2abbe01a
--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -10,8 +10,7 @@
pass
# Instantiate the CKAN client.
-#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
-api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'
+api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
api_key=api_key)
ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key)
@@ -57,6 +56,9 @@
...
ValueError: can't interpret '12 foo'
"""
+ if s == None:
+ return 0
+ s = s.replace(',', '')
init = s
num = ""
while s and s[0:1].isdigit() or s[0:1] == '.':
@@ -92,12 +94,9 @@
name = re.sub('__', '_', name).lower()
return name
-#todo "{'name': [u'Url must be purely lowercase alphanumeric (ascii) characters and these symbols: -_']}"
-# http://data.gov.au/dataset/australian-domestic-regional-and-international-airline-activity-%E2%80%93-time-series/
+
def name_munge(input_name):
return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
- #[:100]
- #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
def get_licence_id(licencename):
@@ -123,7 +122,9 @@
if __name__ == "__main__":
orgs_list = []
+ orgs_ids = {}
for doc in docsdb.view('app/datasets'):
+ print " --- "
print doc.id
if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld":
@@ -150,8 +151,12 @@
else:
print "org found, adding dataset to " + org_name
- org = ckandirect.action.organization_show(id=org_name)
- # todo cache org names -> id mapping
+ # cache org names -> id mapping
+ if org_name not in orgs_ids:
+ org = ckandirect.action.organization_show(id=org_name)
+ orgs_ids[org_name] = org["result"]["id"]
+ org_id = orgs_ids[org_name]
+ print "org id is "+org_id
tags = []
if doc.value['agencyID'] == "AGIMO":
if len(doc.value['metadata']["Keywords / Tags"]) > 0:
@@ -171,7 +176,7 @@
'maintainer': doc.value['metadata']["DCTERMS.Creator"],
'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
'notes': html2text.html2text(doc.value['metadata']['Description']),
- 'owner_org': org["result"]["id"]
+ 'owner_org': org_id
#todo add missing key values like jurasdiction
}
if doc.value['agencyID'] == "qld":
@@ -201,10 +206,8 @@
for group_name in groups:
group_url = name_munge(group_name[:100])
try:
- print ckan.group_entity_get(group_url)
-
# Update the group details
- group_entity = ckan.last_message
+ group_entity = ckan.group_entity_get(group_url)
print "group "+group_name+" exists"
if 'packages' in group_entity.keys():
group_entity['packages'] = list(set(group_entity['packages'] + [pkg_name]))
@@ -243,13 +246,17 @@
format = 'xml'
if resource['format'] == '(CSV/XLS)':
format = 'csv'
+ if resource['format'] == '(Shapefile)':
+ format = 'shp'
+ if resource['format'] == '(KML/KMZ)':
+ format = 'kml'
name = resource['href']
if 'name' in resource.keys():
name = resource['name']
print resource
ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
format=format,
- size=human2bytes(resource['size'].replace(',', '')))
+ size=human2bytes(resource.get('size','0B')))
else:
print "resources already exist"
except CkanApiError, e:
--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -39,7 +39,9 @@
link = item.find("a")
format = item.find(property="dc:format")
linkobj = {"href":link['href'].replace("/bye?","").strip(),
- "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+ "format": format.string.strip()}
+ if format.next_sibling.string != None:
+ linkobj["size"] = format.next_sibling.string.strip()
if link.string != None:
linkobj["name"] = link.string.strip()
doc['metadata'][last_title].append(linkobj)