add siteanalytics, exclude empty datasets from some stats
add siteanalytics, exclude empty datasets from some stats

import datetime import datetime
   
from pylons import config from pylons import config
from sqlalchemy import Table, select, func, and_ from sqlalchemy import Table, select, func, and_
from sqlalchemy.sql.expression import text from sqlalchemy.sql.expression import text
   
import ckan.plugins as p import ckan.plugins as p
import ckan.model as model import ckan.model as model
   
import re import re
   
cache_enabled = p.toolkit.asbool(config.get('ckanext.stats.cache_enabled', 'True')) cache_enabled = p.toolkit.asbool(config.get('ckanext.stats.cache_enabled', 'True'))
   
if cache_enabled: if cache_enabled:
from pylons import cache from pylons import cache
our_cache = cache.get_cache('stats', type='dbm') our_cache = cache.get_cache('stats', type='dbm')
   
DATE_FORMAT = '%Y-%m-%d' DATE_FORMAT = '%Y-%m-%d'
   
def table(name): def table(name):
return Table(name, model.meta.metadata, autoload=True) return Table(name, model.meta.metadata, autoload=True)
   
def datetime2date(datetime_): def datetime2date(datetime_):
return datetime.date(datetime_.year, datetime_.month, datetime_.day) return datetime.date(datetime_.year, datetime_.month, datetime_.day)
   
   
class Stats(object): class Stats(object):
@classmethod @classmethod
def top_rated_packages(cls, limit=10): def top_rated_packages(cls, limit=10):
# NB Not using sqlalchemy as sqla 0.4 doesn't work using both group_by # NB Not using sqlalchemy as sqla 0.4 doesn't work using both group_by
# and apply_avg # and apply_avg
package = table('package') package = table('package')
rating = table('rating') rating = table('rating')
sql = select([package.c.id, func.avg(rating.c.rating), func.count(rating.c.rating)], from_obj=[package.join(rating)]).\ sql = select([package.c.id, func.avg(rating.c.rating), func.count(rating.c.rating)], from_obj=[package.join(rating)]).\
where(package.c.private == 'f').\ where(package.c.private == 'f').\
group_by(package.c.id).\ group_by(package.c.id).\
order_by(func.avg(rating.c.rating).desc(), func.count(rating.c.rating).desc()).\ order_by(func.avg(rating.c.rating).desc(), func.count(rating.c.rating).desc()).\
limit(limit) limit(limit)
res_ids = model.Session.execute(sql).fetchall() res_ids = model.Session.execute(sql).fetchall()
res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), avg, num) for pkg_id, avg, num in res_ids] res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), avg, num) for pkg_id, avg, num in res_ids]
return res_pkgs return res_pkgs
   
@classmethod @classmethod
def most_edited_packages(cls, limit=10): def most_edited_packages(cls, limit=10):
package_revision = table('package_revision') package_revision = table('package_revision')
package = table('package') package = table('package')
s = select([package_revision.c.id, func.count(package_revision.c.revision_id)], from_obj=[package_revision.join(package)]).\ s = select([package_revision.c.id, func.count(package_revision.c.revision_id)], from_obj=[package_revision.join(package)]).\
where(package.c.private == 'f').\ where(package.c.private == 'f').\
group_by(package_revision.c.id).\ group_by(package_revision.c.id).\
order_by(func.count(package_revision.c.revision_id).desc()).\ order_by(func.count(package_revision.c.revision_id).desc()).\
limit(limit) limit(limit)
res_ids = model.Session.execute(s).fetchall() res_ids = model.Session.execute(s).fetchall()
res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), val) for pkg_id, val in res_ids] res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), val) for pkg_id, val in res_ids]
return res_pkgs return res_pkgs
   
@classmethod @classmethod
def largest_groups(cls, limit=10): def largest_groups(cls, limit=10):
member = table('member') member = table('member')
s = select([member.c.group_id, func.count(member.c.table_id)]).\ s = select([member.c.group_id, func.count(member.c.table_id)]).\
group_by(member.c.group_id).\ group_by(member.c.group_id).\
where(member.c.group_id!=None).\ where(member.c.group_id!=None).\
where(member.c.table_name=='package').\ where(member.c.table_name=='package').\
where(member.c.capacity=='public').\ where(member.c.capacity=='public').\
order_by(func.count(member.c.table_id).desc()) order_by(func.count(member.c.table_id).desc())
#limit(limit) #limit(limit)
   
res_ids = model.Session.execute(s).fetchall() res_ids = model.Session.execute(s).fetchall()
res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res_ids] res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res_ids]
return res_groups return res_groups
   
@classmethod @classmethod
def by_org(cls, limit=10): def by_org(cls, limit=10):
connection = model.Session.connection() connection = model.Session.connection()
res = connection.execute("select package.owner_org, package.private, count(*) from package \ res = connection.execute("select package.owner_org, package.private, count(*) from package \
  inner join (select distinct package_id from resource_group inner join resource on resource.resource_group_id = resource_group.id) as r on package.id = r.package_id \
inner join \"group\" on package.owner_org = \"group\".id \ inner join \"group\" on package.owner_org = \"group\".id \
where package.state='active'\ where package.state='active'\
group by package.owner_org,\"group\".name, package.private \ group by package.owner_org,\"group\".name, package.private \
order by \"group\".name, package.private;").fetchall(); order by \"group\".name, package.private;").fetchall();
res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), private, val) for group_id, private, val in res] res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), private, val) for group_id, private, val in res]
return res_groups return res_groups
   
@classmethod @classmethod
def res_by_org(cls, limit=10): def res_by_org(cls, limit=10):
connection = model.Session.connection() connection = model.Session.connection()
reses = connection.execute("select owner_org,format,count(*) from \ reses = connection.execute("select owner_org,format,count(*) from \
resource inner join resource_group on resource.resource_group_id = resource_group.id \ resource inner join resource_group on resource.resource_group_id = resource_group.id \
inner join package on resource_group.package_id = package.id group by owner_org,format order by count desc;").fetchall(); inner join package on resource_group.package_id = package.id group by owner_org,format order by count desc;").fetchall();
group_ids = [] group_ids = []
group_tab = {} group_tab = {}
group_spatial = {} group_spatial = {}
group_other = {} group_other = {}
for group_id,format,count in reses: for group_id,format,count in reses:
if group_id not in group_ids: if group_id not in group_ids:
group_ids.append(group_id) group_ids.append(group_id)
group_tab[group_id] = 0 group_tab[group_id] = 0
group_spatial[group_id] = 0 group_spatial[group_id] = 0
group_other[group_id] = 0 group_other[group_id] = 0
if re.search('xls|csv|ms-excel|spreadsheetml.sheet|zip|netcdf',format, re.IGNORECASE): if re.search('xls|csv|ms-excel|spreadsheetml.sheet|zip|netcdf',format, re.IGNORECASE):
group_tab[group_id] = group_tab[group_id] + count group_tab[group_id] = group_tab[group_id] + count
elif re.search('wms|wfs|wcs|shp|kml|kmz',format, re.IGNORECASE): elif re.search('wms|wfs|wcs|shp|kml|kmz',format, re.IGNORECASE):
group_spatial[group_id] = group_spatial[group_id] + count group_spatial[group_id] = group_spatial[group_id] + count
else: else:
group_other[group_id] = group_other[group_id] + count group_other[group_id] = group_other[group_id] + count
return [(model.Session.query(model.Group).get(unicode(group_id)), group_tab[group_id],group_spatial[group_id],group_other[group_id], group_tab[group_id]+group_spatial[group_id]+group_other[group_id]) for group_id in group_ids] return [(model.Session.query(model.Group).get(unicode(group_id)), group_tab[group_id],group_spatial[group_id],group_other[group_id], group_tab[group_id]+group_spatial[group_id]+group_other[group_id]) for group_id in group_ids]
   
@classmethod @classmethod
def top_active_orgs(cls, limit=10): def top_active_orgs(cls, limit=10):
connection = model.Session.connection() connection = model.Session.connection()
res = connection.execute("select package.owner_org, count(*) from package \ res = connection.execute("select package.owner_org, count(*) from package \
  inner join (select distinct package_id from resource_group inner join resource on resource.resource_group_id = resource_group.id) as r on package.id = r.package_id \
inner join \"group\" on package.owner_org = \"group\".id \ inner join \"group\" on package.owner_org = \"group\".id \
inner join (select distinct object_id from activity where activity.timestamp > (now() - interval '60 day')) \ inner join (select distinct object_id from activity where activity.timestamp > (now() - interval '60 day')) \
latestactivities on latestactivities.object_id = package.id \ latestactivities on latestactivities.object_id = package.id \
where package.state='active' \ where package.state='active' \
and package.private = 'f' \ and package.private = 'f' \
group by package.owner_org \ group by package.owner_org \
order by count(*) desc;").fetchall(); order by count(*) desc;").fetchall();
res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res] res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res]
return res_groups return res_groups
   
@classmethod @classmethod
def top_package_owners(cls, limit=10): def top_package_owners(cls, limit=10):
package_role = table('package_role') package_role = table('package_role')
user_object_role = table('user_object_role') user_object_role = table('user_object_role')
package = table('package') package = table('package')
s = select([user_object_role.c.user_id, func.count(user_object_role.c.role)], from_obj=[user_object_role.join(package_role).join(package, package_role.c.package_id == package.c.id)]).\ s = select([user_object_role.c.user_id, func.count(user_object_role.c.role)], from_obj=[user_object_role.join(package_role).join(package, package_role.c.package_id == package.c.id)]).\
where(user_object_role.c.role==model.authz.Role.ADMIN).\ where(user_object_role.c.role==model.authz.Role.ADMIN).\
where(package.c.private == 'f').\ where(package.c.private == 'f').\
where(user_object_role.c.user_id!=None).\ where(user_object_role.c.user_id!=None).\
group_by(user_object_role.c.user_id).\ group_by(user_object_role.c.user_id).\
order_by(func.count(user_object_role.c.role).desc()).\ order_by(func.count(user_object_role.c.role).desc()).\
limit(limit) limit(limit)
res_ids = model.Session.execute(s).fetchall() res_ids = model.Session.execute(s).fetchall()
res_users = [(model.Session.query(model.User).get(unicode(user_id)), val) for user_id, val in res_ids] res_users = [(model.Session.query(model.User).get(unicode(user_id)), val) for user_id, val in res_ids]
return res_users return res_users
   
@classmethod @classmethod
def summary_stats(cls): def summary_stats(cls):
connection = model.Session.connection() connection = model.Session.connection()
   
res = connection.execute("SELECT 'Total Organisations', count(*) from \"group\" where type = 'organization' and state = 'active' union \ res = connection.execute("SELECT 'Total Organisations', count(*) from \"group\" where type = 'organization' and state = 'active' union \
select 'Total Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 'f' union \ select 'Total Datasets', count(*) from package inner join (select distinct package_id from resource_group inner join resource on resource.resource_group_id = resource_group.id) as r on package.id = r.package_id where (package.state='active' or package.state='draft' or package.state='draft-complete') and private = 'f' union \
select 'Total Archived Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 't' union \ select 'Total Archived Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 't' union \
select 'Total Data Files/Resources', count(*) from resource where state='active' union \ select 'Total Data Files/Resources', count(*) from resource where state='active' union \
select 'Total Machine Readable/Data API Resources', count(*) from resource where state='active' and webstore_url = 'active'\ select 'Total Machine Readable/Data API Resources', count(*) from resource where state='active' and (webstore_url = 'active' or format="wms")\
").fetchall(); ").fetchall();
return res return res
   
   
@classmethod @classmethod
def activity_counts(cls): def activity_counts(cls):
connection = model.Session.connection() connection = model.Session.connection()
res = connection.execute("select to_char(timestamp, 'YYYY-MM') as month,activity_type, count(*) from activity group by month, activity_type order by month;").fetchall(); res = connection.execute("select to_char(timestamp, 'YYYY-MM') as month,activity_type, count(*) from activity group by month, activity_type order by month;").fetchall();
return res return res
   
@classmethod @classmethod
def user_access_list(cls): def user_access_list(cls):
connection = model.Session.connection() connection = model.Session.connection()
res = connection.execute("select name,sysadmin,role from user_object_role right outer join \"user\" on user_object_role.user_id = \"user\".id where name not in ('logged_in','visitor') group by name,sysadmin,role order by sysadmin desc, role asc;").fetchall(); res = connection.execute("select name,sysadmin,role from user_object_role right outer join \"user\" on user_object_role.user_id = \"user\".id where name not in ('logged_in','visitor') group by name,sysadmin,role order by sysadmin desc, role asc;").fetchall();
return res return res
   
@classmethod @classmethod
def recent_datasets(cls): def recent_datasets(cls):
activity = table('activity') activity = table('activity')
package = table('package') package = table('package')
s = select([func.max(activity.c.timestamp),package.c.id, activity.c.activity_type], from_obj=[activity.join(package,activity.c.object_id == package.c.id)]).where(package.c.private == 'f').\ s = select([func.max(activity.c.timestamp),package.c.id, activity.c.activity_type], from_obj=[activity.join(package,activity.c.object_id == package.c.id)]).where(package.c.private == 'f').\
where(activity.c.timestamp > func.now() - text("interval '60 day'")).group_by(package.c.id,activity.c.activity_type).order_by(func.max(activity.c.timestamp)) where(activity.c.timestamp > func.now() - text("interval '60 day'")).group_by(package.c.id,activity.c.activity_type).order_by(func.max(activity.c.timestamp))
result = model.Session.execute(s).fetchall() result = model.Session.execute(s).fetchall()
return [(datetime2date(timestamp), model.Session.query(model.Package).get(unicode(package_id)), activity_type) for timestamp,package_id,activity_type in result] return [(datetime2date(timestamp), model.Session.query(model.Package).get(unicode(package_id)), activity_type) for timestamp,package_id,activity_type in result]
   
   
   
class RevisionStats(object): class RevisionStats(object):
@classmethod @classmethod
def package_addition_rate(cls, weeks_ago=0): def package_addition_rate(cls, weeks_ago=0):
week_commenced = cls.get_date_weeks_ago(weeks_ago) week_commenced = cls.get_date_weeks_ago(weeks_ago)
return cls.get_objects_in_a_week(week_commenced, return cls.get_objects_in_a_week(week_commenced,
type_='package_addition_rate') type_='package_addition_rate')
   
@classmethod @classmethod
def package_revision_rate(cls, weeks_ago=0): def package_revision_rate(cls, weeks_ago=0):
week_commenced = cls.get_date_weeks_ago(weeks_ago) week_commenced = cls.get_date_weeks_ago(weeks_ago)
return cls.get_objects_in_a_week(week_commenced, return cls.get_objects_in_a_week(week_commenced,
type_='package_revision_rate') type_='package_revision_rate')
   
@classmethod @classmethod
def get_date_weeks_ago(cls, weeks_ago): def get_date_weeks_ago(cls, weeks_ago):
''' '''
@param weeks_ago: specify how many weeks ago to give count for @param weeks_ago: specify how many weeks ago to give count for
(0 = this week so far) (0 = this week so far)
''' '''
date_ = datetime.date.today() date_ = datetime.date.today()
return date_ - datetime.timedelta(days= return date_ - datetime.timedelta(days=
datetime.date.weekday(date_) + 7 * weeks_ago) datetime.date.weekday(date_) + 7 * weeks_ago)
   
@classmethod @classmethod
def get_week_dates(cls, weeks_ago): def get_week_dates(cls, weeks_ago):
''' '''
@param weeks_ago: specify how many weeks ago to give count for @param weeks_ago: specify how many weeks ago to give count for
(0 = this week so far) (0 = this week so far)
''' '''
package_revision = table('package_revision') package_revision = table('package_revision')
revision = table('revision') revision = table('revision')
today = datetime.date.today() today = datetime.date.today()
date_from = datetime.datetime(today.year, today.month, today.day) -\ date_from = datetime.datetime(today.year, today.month, today.day) -\
datetime.timedelta(days=datetime.date.weekday(today) + \ datetime.timedelta(days=datetime.date.weekday(today) + \
7 * weeks_ago) 7 * weeks_ago)
date_to = date_from + datetime.timedelta(days=7) date_to = date_from + datetime.timedelta(days=7)
return (date_from, date_to) return (date_from, date_to)
   
@classmethod @classmethod
def get_date_week_started(cls, date_): def get_date_week_started(cls, date_):
assert isinstance(date_, datetime.date) assert isinstance(date_, datetime.date)
if isinstance(date_, datetime.datetime): if isinstance(date_, datetime.datetime):
date_ = datetime2date(date_) date_ = datetime2date(date_)
return date_ - datetime.timedelta(days=datetime.date.weekday(date_)) return date_ - datetime.timedelta(days=datetime.date.weekday(date_))
   
@classmethod @classmethod
def get_package_revisions(cls): def get_package_revisions(cls):
''' '''
@return: Returns list of revisions and date of them, in @return: Returns list of revisions and date of them, in
format: [(id, date), ...] format: [(id, date), ...]
''' '''
package_revision = table('package_revision') package_revision = table('package_revision')
revision = table('revision') revision = table('revision')
s = select([package_revision.c.id, revision.c.timestamp], from_obj=[package_revision.join(revision)]).order_by(revision.c.timestamp) s = select([package_revision.c.id, revision.c.timestamp], from_obj=[package_revision.join(revision)]).order_by(revision.c.timestamp)
res = model.Session.execute(s).fetchall() # [(id, datetime), ...] res = model.Session.execute(s).fetchall() # [(id, datetime), ...]
return res return res
   
@classmethod @classmethod
def get_new_packages(cls): def get_new_packages(cls):
''' '''
@return: Returns list of new pkgs and date when they were created, in @return: Returns list of new pkgs and date when they were created, in
format: [(id, date_ordinal), ...] format: [(id, date_ordinal), ...]
''' '''
def new_packages(): def new_packages():
# Can't filter by time in select because 'min' function has to # Can't filter by time in select because 'min' function has to
# be 'for all time' else you get first revision in the time period. # be 'for all time' else you get first revision in the time period.
package_revision = table('package_revision') package_revision = table('package_revision')
revision = table('revision') revision = table('revision')
package = table('package') package = table('package')
s = select([package_revision.c.id, func.min(revision.c.timestamp)], from_obj=[package_revision.join(revision).join(package)]).\ s = select([package_revision.c.id, func.min(revision.c.timestamp)], from_obj=[package_revision.join(revision).join(package)]).\
where(package.c.private == 'f').\ where(package.c.private == 'f').\
group_by(package_revision.c.id).order_by(func.min(revision.c.timestamp)) group_by(package_revision.c.id).order_by(func.min(revision.c.timestamp))
res = model.Session.execute(s).fetchall() # [(id, datetime), ...] res = model.Session.execute(s).fetchall() # [(id, datetime), ...]
res_pickleable = [] res_pickleable = []
for pkg_id, created_datetime in res: for pkg_id, created_datetime in res:
res_pickleable.append((pkg_id, created_datetime.toordinal())) res_pickleable.append((pkg_id, created_datetime.toordinal()))
return res_pickleable return res_pickleable
if cache_enabled: if cache_enabled:
week_commences = cls.get_date_week_started(datetime.date.today()) week_commences = cls.get_date_week_started(datetime.date.today())
key = 'all_new_packages_%s' + week_commences.strftime(DATE_FORMAT) key = 'all_new_packages_%s' + week_commences.strftime(DATE_FORMAT)
new_packages = our_cache.get_value(key=key, new_packages = our_cache.get_value(key=key,
createfunc=new_packages) createfunc=new_packages)
else: else:
new_packages = new_packages() new_packages = new_packages()
return new_packages return new_packages
   
@classmethod @classmethod
def get_deleted_packages(cls): def get_deleted_packages(cls):
''' '''
@return: Returns list of deleted pkgs and date when they were deleted, in @return: Returns list of deleted pkgs and date when they were deleted, in
format: [(id, date_ordinal), ...] format: [(id, date_ordinal), ...]
''' '''
def deleted_packages(): def deleted_packages():
# Can't filter by time in select because 'min' function has to # Can't filter by time in select because 'min' function has to
# be 'for all time' else you get first revision in the time period. # be 'for all time' else you get first revision in the time period.
package_revision = table('package_revision') package_revision = table('package_revision')
revision = table('revision') revision = table('revision')
package = table('package') package = table('package')
s = select([package_revision.c.id, func.min(revision.c.timestamp)], from_obj=[package_revision.join(revision).join(package)]).\ s = select([package_revision.c.id, func.min(revision.c.timestamp)], from_obj=[package_revision.join(revision).join(package)]).\
where(package_revision.c.state==model.State.DELETED).\ where(package_revision.c.state==model.State.DELETED).\
where(package.c.private == 'f').\ where(package.c.private == 'f').\
group_by(package_revision.c.id).\ group_by(package_revision.c.id).\
order_by(func.min(revision.c.timestamp)) order_by(func.min(revision.c.timestamp))
res = model.Session.execute(s).fetchall() # [(id, datetime), ...] res = model.Session.execute(s).fetchall() # [(id, datetime), ...]
res_pickleable = [] res_pickleable = []
for pkg_id, deleted_datetime in res: for pkg_id, deleted_datetime in res:
res_pickleable.append((pkg_id, deleted_datetime.toordinal())) res_pickleable.append((pkg_id, deleted_datetime.toordinal()))
return res_pickleable return res_pickleable
if cache_enabled: if cache_enabled:
week_commences = cls.get_date_week_started(datetime.date.today()) week_commences = cls.get_date_week_started(datetime.date.today())
key = 'all_deleted_packages_%s' + week_commences.strftime(DATE_FORMAT) key = 'all_deleted_packages_%s' + week_commences.strftime(DATE_FORMAT)
deleted_packages = our_cache.get_value(key=key, deleted_packages = our_cache.get_value(key=key,
createfunc=deleted_packages) createfunc=deleted_packages)
else: else:
deleted_packages = deleted_packages() deleted_packages = deleted_packages()
return deleted_packages return deleted_packages
   
@classmethod @classmethod
def get_num_packages_by_week(cls): def get_num_packages_by_week(cls):
def num_packages(): def num_packages():
new_packages_by_week = cls.get_by_week('new_packages') new_packages_by_week = cls.get_by_week('new_packages')
deleted_packages_by_week = cls.get_by_week('deleted_packages') deleted_packages_by_week = cls.get_by_week('deleted_packages')
first_date = (min(datetime.datetime.strptime(new_packages_by_week[0][0], DATE_FORMAT), first_date = (min(datetime.datetime.strptime(new_packages_by_week[0][0], DATE_FORMAT),
datetime.datetime.strptime(deleted_packages_by_week[0][0], DATE_FORMAT))).date() datetime.datetime.strptime(deleted_packages_by_week[0][0], DATE_FORMAT))).date()
cls._cumulative_num_pkgs = 0 cls._cumulative_num_pkgs = 0
new_pkgs = [] new_pkgs = []