be permissive with datagov resource file size
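Tolerate a missing, None, or comma-formatted resource size when exporting to CKAN. Also cache organisation name -> id lookups, map the Shapefile and KML/KMZ resource formats, and only record a scraped size when one is present.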


Former-commit-id: c0a71c3c09369965e8748c2a8f7062eb2abbe01a

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -10,8 +10,7 @@
     pass
 
 # Instantiate the CKAN client.
-#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',    api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
-api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'
+api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
 ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
                              api_key=api_key)
 ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key)
@@ -57,6 +56,9 @@
           ...
       ValueError: can't interpret '12 foo'
     """
+    if s == None:
+	return 0
+    s = s.replace(',', '')
     init = s
     num = ""
     while s and s[0:1].isdigit() or s[0:1] == '.':
@@ -92,12 +94,9 @@
     name = re.sub('__', '_', name).lower()
     return name
 
-#todo "{'name': [u'Url must be purely lowercase alphanumeric (ascii) characters and these symbols: -_']}"
-# http://data.gov.au/dataset/australian-domestic-regional-and-international-airline-activity-%E2%80%93-time-series/
+
 def name_munge(input_name):
     return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
-    #[:100]
-    #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
 
 
 def get_licence_id(licencename):
@@ -123,7 +122,9 @@
 
 if __name__ == "__main__":
     orgs_list = []
+    orgs_ids = {}
     for doc in docsdb.view('app/datasets'):
+        print "   ---   "
         print doc.id
 
         if doc.value['url'] != "http://data.gov.au/data/" and doc.value['agencyID'] != "qld":
@@ -150,8 +151,12 @@
                 else:
                     print "org found, adding dataset to " + org_name
 
-            org = ckandirect.action.organization_show(id=org_name)
-            # todo cache org names -> id mapping
+            # cache org names -> id mapping
+            if org_name not in orgs_ids:
+                org = ckandirect.action.organization_show(id=org_name)
+                orgs_ids[org_name] = org["result"]["id"]
+            org_id = orgs_ids[org_name]
+            print "org id is "+org_id
             tags = []
             if doc.value['agencyID'] == "AGIMO":
                 if len(doc.value['metadata']["Keywords / Tags"]) > 0:
@@ -171,7 +176,7 @@
                     'maintainer': doc.value['metadata']["DCTERMS.Creator"],
                     'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
                     'notes': html2text.html2text(doc.value['metadata']['Description']),
-                    'owner_org': org["result"]["id"]
+                    'owner_org': org_id
                     #todo add missing key values like jurasdiction
                 }
             if doc.value['agencyID'] == "qld":
@@ -201,10 +206,8 @@
             for group_name in groups:
                 group_url = name_munge(group_name[:100])
                 try:
-                    print ckan.group_entity_get(group_url)
-
                     # Update the group details
-                    group_entity = ckan.last_message
+                    group_entity = ckan.group_entity_get(group_url)
                     print "group "+group_name+" exists"
                     if 'packages' in group_entity.keys():
                         group_entity['packages'] = list(set(group_entity['packages'] + [pkg_name]))
@@ -243,13 +246,17 @@
                                 format = 'xml'
                             if resource['format'] == '(CSV/XLS)':
                                 format = 'csv'
+                            if resource['format'] == '(Shapefile)':
+                                format = 'shp'
+                            if resource['format'] == '(KML/KMZ)':
+                                format = 'kml'
                             name = resource['href']
                             if 'name' in resource.keys():
                                 name = resource['name']
                             print resource
                             ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
                                                       format=format,
-                                                      size=human2bytes(resource['size'].replace(',', '')))
+                                                      size=human2bytes(resource.get('size','0B')))
                     else:
                         print "resources already exist"
                 except CkanApiError, e:

--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -39,7 +39,9 @@
                                     link = item.find("a")
                                     format = item.find(property="dc:format")
                                     linkobj = {"href":link['href'].replace("/bye?","").strip(),
-                                            "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                            "format": format.string.strip()}
+				    if format.next_sibling.string != None:
+					linkobj["size"] = format.next_sibling.string.strip()
                                     if link.string != None:
                                         linkobj["name"] = link.string.strip()
                                     doc['metadata'][last_title].append(linkobj)