my datasets deduplication
my datasets deduplication

file:a/README.rst -> file:b/README.rst
This CKAN Extension customises a CKAN instance for the hosting of data.gov.au. This CKAN Extension customises a CKAN instance for the hosting of data.gov.au.
   
It comprises: It comprises:
   
* A CKAN Extension "plugin" at ``ckanext/datagovau/plugin.py`` which, when * A CKAN Extension "plugin" at ``ckanext/datagovau/plugin.py`` which, when
loaded, overrides various settings in the core ``ini``-file to provide: loaded, overrides various settings in the core ``ini``-file to provide:
* A path to local customisations of the core templates * A path to local customisations of the core templates to include AGLS/Dublin Core minimum metadata
* A custom Package edit form that defaults to cc-by licence * A custom Package edit form that defaults to cc-by licence
* A custom n3/rdf output format * A custom n3/rdf output format
  * Replaces links with http/https protocol independent versions
  * Provides HTML to users to embed data previews on their own website
   
* A cut down licenses.json file * A cut down licenses.json file
   
Installation Installation
============ ============
   
To install this package, from your CKAN virtualenv, run the following from your CKAN base folder (e.g. ``pyenv/``):: To install this package, from your CKAN virtualenv, run the following from your CKAN base folder (e.g. ``pyenv/``)::
   
pip install -e git+https://github.com/okfn/ckanext-datagovau#egg=ckanext-datagovau pip install -e git+https://github.com/okfn/ckanext-datagovau#egg=ckanext-datagovau
   
Then activate it by setting ``ckan.plugins = datagovau`` in your main ``ini``-file. Then activate it by setting ``ckan.plugins = datagovau`` in your main ``ini``-file.
   
To add the cut down licenses.json set ``licenses_group_url = http://%(ckan.site_url)/licenses.json`` To add the cut down licenses.json set ``licenses_group_url = http://%(ckan.site_url)/licenses.json``
or copy ``ckanext/datagovau/theme/public/licenses.json`` to the same folder as your CKAN config ini file or copy ``ckanext/datagovau/theme/public/licenses.json`` to the same folder as your CKAN config ini file
and set ``licenses_group_url = file://%(here)s/licenses.json`` and set ``licenses_group_url = file://%(here)s/licenses.json``
   
   
   
import urllib import urllib
import json import json
from pprint import pprint from pprint import pprint
import logging import logging
import ckan.logic as logic import ckan.logic as logic
import hashlib import hashlib
import threading import threading
from ckan.common import _, c, request, response from ckan.common import _, c, request, response
from pylons import config from pylons import config
  from webob.multidict import UnicodeMultiDict
  from paste.util.multidict import MultiDict
   
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
   
from ckan.controllers.api import ApiController from ckan.controllers.api import ApiController
   
class DGAApiController(ApiController):
    """API controller that records each API request in Google Analytics.

    Every overridden endpoint posts one analytics "event" hit describing the
    call and then delegates to the stock ``ApiController`` implementation, so
    the API response itself is whatever the parent controller produces.
    """

    def _post_analytics(self, user, request_obj_type, request_function, request_id):
        """Send one analytics event describing an API call (best effort).

        Nothing is sent unless ``googleanalytics.id`` is configured.  The hit
        is posted from a background thread so the API response is not held up
        by the analytics request, and any failure while building or sending
        the hit is logged instead of breaking the API call.

        :param user: name of the calling user; its MD5 digest is used as the
            client id.  May be empty or ``None``.
        :param request_obj_type: register or action name; first half of the
            event action.
        :param request_function: operation name (``list``, ``show``, ...);
            second half of the event action.
        :param request_id: object id or search query, sent as the event label.
        """
        tracking_id = config.get('googleanalytics.id')
        if tracking_id is None:
            return
        try:
            # md5() needs a byte string: cope with a missing user and with
            # unicode user names that are not pure ASCII.
            user = user or ''
            if not isinstance(user, bytes):
                user = user.encode('utf-8')
            data = urllib.urlencode({
                "v": 1,
                "tid": tracking_id,
                "cid": hashlib.md5(user).hexdigest(),
                "t": "event",
                "dh": c.environ['HTTP_HOST'],
                "dp": c.environ['PATH_INFO'],
                "dr": c.environ.get('HTTP_REFERER', ''),
                "ec": "CKAN API Request",
                "ea": request_obj_type + request_function,
                "el": request_id,
            })
            log.debug("Sending API Analytics Data: %s", data)
            # send analytics asynchronously
            threading.Thread(
                target=urllib.urlopen,
                args=("http://www.google-analytics.com/collect", data),
            ).start()
        except Exception:
            # Analytics must never take the API down with it.
            log.exception("Could not send API analytics data")

    @staticmethod
    def _query_label(params, default=None):
        """Pick the event label from request parameters.

        Returns ``params['query']`` if present, else ``params['q']`` if
        present, else ``default``.
        """
        label = default
        if 'q' in params:
            label = params['q']
        if 'query' in params:
            label = params['query']
        return label

    @staticmethod
    def _register_label(register, subregister):
        """Build the ``register[_subregister]`` name used in event actions."""
        return register + ("_" + str(subregister) if subregister else "")

    def action(self, logic_function, ver=None):
        """Track a call to ``/api/action/<logic_function>``, then run it.

        The event label is the request's ``id``, overridden by ``q`` and then
        by ``query`` when those parameters are present.  Tracking problems are
        only logged; the request is always handed to ``ApiController.action``.
        """
        try:
            function = logic.get_action(logic_function)
        except Exception as e:
            # Leave the error reporting for an unknown action to the parent
            # controller; for tracking purposes just carry on without it.
            log.debug(e)
            function = None
        try:
            side_effect_free = getattr(function, 'side_effect_free', False)
            request_data = self._get_request_data(try_url_params=side_effect_free)
            if isinstance(request_data, dict):
                label = self._query_label(request_data, request_data.get('id', ''))
                self._post_analytics(c.user, logic_function, '', label)
        except Exception as e:
            log.debug(e)
        return ApiController.action(self, logic_function, ver)

    def list(self, ver=None, register=None, subregister=None, id=None):
        """Track a REST ``list`` request, then delegate to the parent."""
        self._post_analytics(c.user, self._register_label(register, subregister), "list", id)
        return ApiController.list(self, ver, register, subregister, id)

    def show(self, ver=None, register=None, subregister=None, id=None, id2=None):
        """Track a REST ``show`` request, then delegate to the parent."""
        self._post_analytics(c.user, self._register_label(register, subregister), "show", id)
        return ApiController.show(self, ver, register, subregister, id, id2)

    def update(self, ver=None, register=None, subregister=None, id=None, id2=None):
        """Track a REST ``update`` request, then delegate to the parent."""
        self._post_analytics(c.user, self._register_label(register, subregister), "update", id)
        return ApiController.update(self, ver, register, subregister, id, id2)

    def delete(self, ver=None, register=None, subregister=None, id=None, id2=None):
        """Track a REST ``delete`` request, then delegate to the parent."""
        self._post_analytics(c.user, self._register_label(register, subregister), "delete", id)
        return ApiController.delete(self, ver, register, subregister, id, id2)

    def search(self, ver=None, register=None):
        """Track a ``/api/search/<register>`` request, then delegate.

        The event label is the ``query`` parameter, falling back to ``q``;
        it stays ``None`` when neither is given or the search parameters
        cannot be parsed.
        """
        label = None
        try:
            params = MultiDict(self._get_search_params(request.params))
            label = self._query_label(params)
        except ValueError as e:
            log.debug(e)
        self._post_analytics(c.user, register, "search", label)
        # Hand the request on like every other tracked endpoint does, so the
        # search actually produces a response.
        return ApiController.search(self, ver, register)
   
import logging import logging
   
import ckan.plugins as plugins import ckan.plugins as plugins
import ckan.lib as lib import ckan.lib as lib
import ckan.lib.dictization.model_dictize as model_dictize import ckan.lib.dictization.model_dictize as model_dictize
import ckan.plugins.toolkit as tk import ckan.plugins.toolkit as tk
import ckan.model as model import ckan.model as model
from pylons import config from pylons import config
from routes.mapper import SubMapper, Mapper as _Mapper from routes.mapper import SubMapper, Mapper as _Mapper
   
from sqlalchemy import orm from sqlalchemy import orm
import ckan.model import ckan.model
   
#parse the activity feed for last active non-system user #parse the activity feed for last active non-system user
def get_last_active_user(id):
    """Return the user dict of the last non-system user active on a dataset.

    Looks at the ``package_activity_list`` of dataset ``id``, drops entries
    made by the site user (``ckan.site_id``, default ``ckan_site_user``) and
    takes the user of the first remaining entry (presumably the newest one;
    the ordering comes from the activity action).  Falls back to the site
    user's dict when no such entry exists.
    """
    fetch = lib.helpers.get_action
    site_user = fetch('user_show', {'id': config.get('ckan.site_id', 'ckan_site_user')})
    human_activity = [
        entry for entry in fetch('package_activity_list', {'id': id})
        if entry['user_id'] != site_user['id']
    ]
    last_user_id = human_activity[0].get('user_id', None) if human_activity else None
    if last_user_id is not None:
        return fetch('user_show', {'id': last_user_id})
    return site_user
   
# get user created datasets and those they have edited # get user created datasets and those they have edited
def get_user_datasets(user_dict):
    """Return the datasets a user created or was active on, without repeats.

    :param user_dict: user dict providing ``id`` and the ``datasets`` list of
        datasets the user created.
    :returns: list of dataset dicts, each dataset ``id`` appearing once.  The
        first occurrence wins and the original order is kept: created
        datasets first, then datasets taken from the user's activity list.
    """
    created = user_dict['datasets']
    touched = [
        activity['data']['package']
        for activity in lib.helpers.get_action('user_activity_list', {'id': user_dict['id']})
        if activity['data'].get('package')
    ]

    # A set gives constant-time "seen before?" checks, and appending to a
    # list keeps the result in a stable, predictable order.
    seen_ids = set()
    unique = []
    for dataset in created + touched:
        dataset_id = dataset['id']
        if dataset_id not in seen_ids:
            seen_ids.add(dataset_id)
            unique.append(dataset)
    return unique
   
class DataGovAuPlugin(plugins.SingletonPlugin, class DataGovAuPlugin(plugins.SingletonPlugin,
tk.DefaultDatasetForm): tk.DefaultDatasetForm):
'''An example IDatasetForm CKAN plugin. '''An example IDatasetForm CKAN plugin.
   
Uses a tag vocabulary to add a custom metadata field to datasets. Uses a tag vocabulary to add a custom metadata field to datasets.
   
''' '''
plugins.implements(plugins.IConfigurer, inherit=False) plugins.implements(plugins.IConfigurer, inherit=False)
plugins.implements(plugins.IDatasetForm, inherit=False) plugins.implements(plugins.IDatasetForm, inherit=False)
plugins.implements(plugins.ITemplateHelpers, inherit=False) plugins.implements(plugins.ITemplateHelpers, inherit=False)
plugins.implements(plugins.IRoutes, inherit=True) plugins.implements(plugins.IRoutes, inherit=True)
   
def before_map(self, map):
    """Route the API URLs we want analytics on to ``DGAApiController``.

    :param map: the routes mapper being configured.
    :returns: the same mapper, with the intercepting routes connected.
    """
    controller = 'ckanext.datagovau.controller:DGAApiController'

    def only(*methods):
        # Helper to reduce clutter: a routes "conditions" dict limited to
        # the given HTTP methods.
        return dict(method=list(methods))

    # REST registers whose calls are intercepted.
    registers = '|'.join((
        'package',
        'dataset',
        'resource',
        'tag',
        'group',
        'related',
        'revision',
        'licenses',
        'rating',
        'user',
        'activity',
    ))

    # /api ver 3 or none
    with SubMapper(map, controller=controller, path_prefix='/api{ver:/3|}',
                   ver='/3') as m:
        m.connect('/action/{logic_function}', action='action',
                  conditions=only('GET', 'POST'))

    # /api ver 1, 2, 3 or none
    with SubMapper(map, controller=controller, path_prefix='/api{ver:/1|/2|/3|}',
                   ver='/1') as m:
        m.connect('/search/{register}', action='search')

    # /api/rest ver 1, 2 or none -- connected in this exact order
    rest_routes = (
        ('/rest/{register}', 'list', 'GET'),
        ('/rest/{register}', 'create', 'POST'),
        ('/rest/{register}/{id}', 'show', 'GET'),
        ('/rest/{register}/{id}', 'update', 'PUT'),
        ('/rest/{register}/{id}', 'update', 'POST'),
        ('/rest/{register}/{id}', 'delete', 'DELETE'),
    )
    with SubMapper(map, controller=controller, path_prefix='/api{ver:/1|/2|}',
                   ver='/1', requirements=dict(register=registers)) as m:
        for path, action_name, verb in rest_routes:
            m.connect(path, action=action_name, conditions=only(verb))

    return map
   
def update_config(self, config):
    """Register this extension's templates, public files and resources.

    :param config: the CKAN configuration object being updated.
    """
    public_dir = 'theme/public'

    # Make CKAN pick up this plugin's custom templates.
    tk.add_template_directory(config, 'templates')
    # Serve the theme's static files and register them as a resource
    # library named 'ckanext-datagovau'.
    tk.add_public_directory(config, public_dir)
    tk.add_resource(public_dir, 'ckanext-datagovau')
    # Not enabled here; set in the ini file instead if wanted:
    # config['licenses_group_url'] = 'http://%(ckan.site_url)/licenses.json'
   
def get_helpers(self):
    """Return the template helpers this plugin provides, keyed by name."""
    helpers = {}
    helpers['get_last_active_user'] = get_last_active_user
    helpers['get_user_datasets'] = get_user_datasets
    return helpers
   
def is_fallback(self):
    """Return True: this plugin is the default dataset form handler.

    It is used for package types that no other IDatasetForm plugin handles.
    """
    fallback = True
    return fallback
   
def package_types(self):
    """Return the package types handled specifically by this plugin: none.

    The plugin only registers itself as the default handler (see
    ``is_fallback``), so the list is always empty.
    """
    return list()
   
   
def create_package_schema(self):
    """Return the default create schema extended with the custom fields."""
    base_schema = super(DataGovAuPlugin, self).create_package_schema()
    return self._modify_package_schema(base_schema)
   
def update_package_schema(self):
    """Return the default update schema extended with the custom fields."""
    base_schema = super(DataGovAuPlugin, self).update_package_schema()
    return self._modify_package_schema(base_schema)
   
def show_package_schema(self):
    """Return the default show schema plus the custom metadata fields.

    Each custom field is read back with ``convert_from_extras`` (always
    first) followed by its validator.
    """
    schema = super(DataGovAuPlugin, self).show_package_schema()

    # Don't show vocab tags mixed in with normal 'free' tags
    # (e.g. on dataset pages, or on the search page)
    schema['tags']['__extras'].append(tk.get_converter('free_tags_only'))

    # ignore_missing == optional
    # ignore_empty == mandatory but not for viewing
    field_validators = (
        ('agency_program', 'ignore_missing'),
        ('contact_point', 'ignore_empty'),
        ('spatial_coverage', 'ignore_empty'),
        ('granularity', 'ignore_empty'),
        ('jurisdiction', 'ignore_empty'),
        ('temporal_coverage', 'ignore_empty'),
        ('data_state', 'ignore_empty'),
        ('update_freq', 'ignore_empty'),
    )
    custom_fields = {}
    for field_name, validator_name in field_validators:
        custom_fields[field_name] = [
            tk.get_converter('convert_from_extras'),
            tk.get_validator(validator_name),
        ]
    schema.update(custom_fields)
    return schema
   
def _modify_package_schema(self, schema): def _modify_package_schema(self, schema):
# Add our custom_test metadata field to the schema, this one will use # Add our custom_test metadata field to the schema, this one will use
# convert_to_extras instead of convert_to_tags. # convert_to_extras instead of convert_to_tags.
# ignore_missing == optional # ignore_missing == optional
# not_empty == mandatory, enforced here while modifying # not_empty == mandatory, enforced here while modifying
   
schema.update({ schema.update({
'agency_program': [tk.get_validator('ignore_missing'), 'agency_program': [tk.get_validator('ignore_missing'),
tk.get_converter('convert_to_extras')], tk.get_converter('convert_to_extras')],
'contact_point': [tk.get_converter('convert_to_extras'), 'contact_point': [tk.get_converter('convert_to_extras'),
tk.get_validator('not_empty')], tk.get_validator('not_empty')],
'spatial_coverage': [tk.get_converter('convert_to_extras'), 'spatial_coverage': [tk.get_converter('convert_to_extras'),
tk.get_validator('not_empty')], tk.get_validator('not_empty')],
'granularity': [tk.get_converter('convert_to_extras'), 'granularity': [tk.get_converter('convert_to_extras'),
tk.get_validator('not_empty')], tk.get_validator('not_empty')],
'jurisdiction': [tk.get_converter('convert_to_extras'), 'jurisdiction': [tk.get_converter('convert_to_extras'),
tk.get_validator('not_empty')], tk.get_validator('not_empty')],
'temporal_coverage': [tk.get_converter('convert_to_extras'), 'temporal_coverage': [tk.get_converter('convert_to_extras'),
tk.get_validator('not_empty')], tk.get_validator('not_empty')],
'data_state': [tk.get_converter('convert_to_extras'), 'data_state': [tk.get_converter('convert_to_extras'),
tk.get_validator('not_empty')], tk.get_validator('not_empty')],
'update_freq': [tk.get_converter('convert_to_extras'),