preprod loading datagov
preprod loading datagov


Former-commit-id: 8ef7c117df568d7a28eeb28264803656527501d9

--- a/documents/datagov-export-groups.py
+++ b/documents/datagov-export-groups.py
@@ -9,7 +9,7 @@
 
 # Instantiate the CKAN client.
 #ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',    api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
-api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'
+api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
 ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
                              api_key=api_key)
 couch = couchdb.Server('http://127.0.0.1:5984/')

--- a/documents/datagov-export.py
+++ b/documents/datagov-export.py
@@ -10,11 +10,12 @@
     pass
 
 # Instantiate the CKAN client.
-#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',    api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
-api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'
-ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
+api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
+server = 'data.disclosurelo.gs'
+
+ckan = ckanclient.CkanClient(base_location='http://'+server+'/api',
                              api_key=api_key)
-ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key)
+ckandirect = ckanapi.RemoteCKAN('http://'+server, api_key=api_key)
 couch = couchdb.Server('http://127.0.0.1:5984/')
 #couch = couchdb.Server('http://192.168.1.113:5984/')
 
@@ -57,6 +58,9 @@
           ...
       ValueError: can't interpret '12 foo'
     """
+    if s == None:
+	return 0
+    s = s.replace(',', '')
     init = s
     num = ""
     while s and s[0:1].isdigit() or s[0:1] == '.':
@@ -177,6 +181,13 @@
 
                 tags = [re.sub('[^a-zA-Z0-9-_.]', '', tag.replace('&', 'and')).lower() for tag in tags if tag]
                 #print tags
+                extras = []
+
+                for extra_key in doc.value['metadata'].keys():
+                    if extra_key not in ["Description","Content-Language","DCTERMS.Description", "Keywords / Tags" ,"data.gov.au Category", "Download", "Permalink","DCTERMS.Identifier"]:
+			if doc.value['metadata'][extra_key] != None and doc.value['metadata'][extra_key] != "":
+	                        extras.append([extra_key, doc.value['metadata'][extra_key]])
+
                 package_entity = {
                     'name': pkg_name,
                     'title': doc.value['metadata']['DCTERMS.Title'],
@@ -186,8 +197,8 @@
                     'maintainer': creator,
                     'licence_id': get_licence_id(doc.value['metadata']['DCTERMS.License']),
                     'notes': html2text.html2text(doc.value['metadata']['Description']),
-                    'owner_org': org_id
-                    #todo add missing key values like jurasdiction
+                    'owner_org': org_id,
+                    'extras': extras
                 }
 
 
@@ -219,13 +230,17 @@
                                 format = 'xml'
                             if resource['format'] == '(CSV/XLS)':
                                 format = 'csv'
+                            if resource['format'] == '(Shapefile)':
+                                format = 'shp'
+                            if resource['format'] == '(KML/KMZ)':
+                                format = 'kml'
                             name = resource['href']
                             if 'name' in resource.keys():
                                 name = resource['name']
                             print resource
                             ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
                                                       format=format,
-                                                      size=human2bytes(resource['size'].replace(',', '')))
+                                                      size=human2bytes(resource.get('size','0B')))
                     else:
                         print "resources already exist"
                 except CkanApiError, e:

--- a/documents/datagov.py
+++ b/documents/datagov.py
@@ -39,7 +39,9 @@
                                     link = item.find("a")
                                     format = item.find(property="dc:format")
                                     linkobj = {"href":link['href'].replace("/bye?","").strip(),
-                                            "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                            "format": format.string.strip()}
+				    if format.next_sibling.string != None:
+					linkobj["size"] = format.next_sibling.string.strip()
                                     if link.string != None:
                                         linkobj["name"] = link.string.strip()
                                     doc['metadata'][last_title].append(linkobj)