Put overview/summary at the bottom of the stats tabs list so it is shown by default
Put overview/summary at the bottom of the stats tabs list so it is shown by default

import datetime import datetime
   
from pylons import config from pylons import config
from sqlalchemy import Table, select, func, and_ from sqlalchemy import Table, select, func, and_
from sqlalchemy.sql.expression import text from sqlalchemy.sql.expression import text
   
import ckan.plugins as p import ckan.plugins as p
import ckan.model as model import ckan.model as model
   
# Result caching is on unless 'ckanext.stats.cache_enabled' is set to a
# false value in the CKAN config.
cache_enabled = p.toolkit.asbool(config.get('ckanext.stats.cache_enabled', 'True'))

if cache_enabled:
    # Only import and open the cache when caching is actually enabled.
    from pylons import cache
    our_cache = cache.get_cache('stats', type='dbm')

# strftime/strptime format used for week-commencing dates.
DATE_FORMAT = '%Y-%m-%d'
   
def table(name):
    '''
    @param name: name of a database table
    @return: sqlalchemy Table for that name, with its columns autoloaded
             into the model's metadata
    '''
    return Table(name, model.meta.metadata, autoload=True)
   
def datetime2date(datetime_):
    '''
    @param datetime_: object with year, month and day attributes
                      (e.g. a datetime.datetime)
    @return: datetime.date for the same calendar day, dropping any time part
    '''
    return datetime.date(datetime_.year, datetime_.month, datetime_.day)
   
   
class Stats(object):
    '''
    Site-wide statistics. Each classmethod runs one query through
    model.Session and returns the rows as a list of tuples.
    '''

    @classmethod
    def top_rated_packages(cls, limit=10):
        '''
        @param limit: maximum number of packages to return
        @return: list of (Package, average rating, number of ratings) for
                 packages with private == 'f', ordered by average rating and
                 then number of ratings, both descending
        '''
        package = table('package')
        rating = table('rating')
        sql = select([package.c.id, func.avg(rating.c.rating), func.count(rating.c.rating)],
                     from_obj=[package.join(rating)]).\
            where(package.c.private == 'f').\
            group_by(package.c.id).\
            order_by(func.avg(rating.c.rating).desc(), func.count(rating.c.rating).desc()).\
            limit(limit)
        res_ids = model.Session.execute(sql).fetchall()
        res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), avg, num)
                    for pkg_id, avg, num in res_ids]
        return res_pkgs

    @classmethod
    def most_edited_packages(cls, limit=10):
        '''
        @param limit: maximum number of packages to return
        @return: list of (Package, number of revisions) for packages with
                 private == 'f', most revised first
        '''
        package_revision = table('package_revision')
        package = table('package')
        s = select([package_revision.c.id, func.count(package_revision.c.revision_id)],
                   from_obj=[package_revision.join(package)]).\
            where(package.c.private == 'f').\
            group_by(package_revision.c.id).\
            order_by(func.count(package_revision.c.revision_id).desc()).\
            limit(limit)
        res_ids = model.Session.execute(s).fetchall()
        res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), val)
                    for pkg_id, val in res_ids]
        return res_pkgs

    @classmethod
    def largest_groups(cls, limit=10):
        '''
        @param limit: accepted for interface compatibility; NOT applied,
                      every matching group is returned
        @return: list of (Group, number of member rows) counting 'package'
                 members with capacity 'public', largest first
        '''
        member = table('member')
        # "!= None" is intentional: sqlalchemy renders it as IS NOT NULL.
        s = select([member.c.group_id, func.count(member.c.table_id)]).\
            group_by(member.c.group_id).\
            where(member.c.group_id != None).\
            where(member.c.table_name == 'package').\
            where(member.c.capacity == 'public').\
            order_by(func.count(member.c.table_id).desc())

        res_ids = model.Session.execute(s).fetchall()
        res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val)
                      for group_id, val in res_ids]
        return res_groups

    @classmethod
    def by_org(cls, limit=10):
        '''
        @param limit: accepted for interface compatibility; NOT applied
        @return: list of (Group, private flag, number of packages) counting
                 packages with state 'active', grouped by owning organisation
                 and private flag, ordered by group name then private flag
        '''
        connection = model.Session.connection()
        res = connection.execute("select package.owner_org, package.private, count(*) from package \
            inner join \"group\" on package.owner_org = \"group\".id \
            where package.state='active' \
            group by package.owner_org,\"group\".name, package.private \
            order by \"group\".name, package.private;").fetchall()
        res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), private, val)
                      for group_id, private, val in res]
        return res_groups

    @classmethod
    def top_tags(cls, limit=10, returned_tag_info='object'):  # by package
        '''
        @param limit: maximum number of tags to return
        @param returned_tag_info: 'name', 'id' or 'object'; selects what the
                                  first element of each result tuple is
        @return: list of (tag name | tag id | Tag, number of packages with
                 private == 'f' carrying the tag), most used first
        '''
        assert returned_tag_info in ('name', 'id', 'object')
        tag = table('tag')
        package_tag = table('package_tag')
        package = table('package')
        # TODO filter out tags with state=deleted
        # package must be explicitly joined to package_tag. Filtering on
        # package.private without a join condition produces a cross join:
        # every count is multiplied by the number of public packages and tags
        # of private packages are not excluded at all.
        package_join = package_tag.c.package_id == package.c.id
        if returned_tag_info == 'name':
            from_obj = [package_tag.join(tag).join(package, package_join)]
            tag_column = tag.c.name
        else:
            from_obj = [package_tag.join(package, package_join)]
            tag_column = package_tag.c.tag_id
        s = select([tag_column, func.count(package_tag.c.package_id)],
                   from_obj=from_obj)
        s = s.group_by(tag_column).\
            where(package.c.private == 'f').\
            order_by(func.count(package_tag.c.package_id).desc()).\
            limit(limit)
        res_col = model.Session.execute(s).fetchall()
        if returned_tag_info in ('id', 'name'):
            return res_col
        elif returned_tag_info == 'object':
            res_tags = [(model.Session.query(model.Tag).get(unicode(tag_id)), val)
                        for tag_id, val in res_col]
            return res_tags

    @classmethod
    def top_package_owners(cls, limit=10):
        '''
        @param limit: maximum number of users to return
        @return: list of (User, number of ADMIN roles held on packages with
                 private == 'f'), highest count first
        '''
        package_role = table('package_role')
        user_object_role = table('user_object_role')
        package = table('package')
        s = select([user_object_role.c.user_id, func.count(user_object_role.c.role)],
                   from_obj=[user_object_role.join(package_role).join(
                       package, package_role.c.package_id == package.c.id)]).\
            where(user_object_role.c.role == model.authz.Role.ADMIN).\
            where(package.c.private == 'f').\
            where(user_object_role.c.user_id != None).\
            group_by(user_object_role.c.user_id).\
            order_by(func.count(user_object_role.c.role).desc()).\
            limit(limit)
        res_ids = model.Session.execute(s).fetchall()
        res_users = [(model.Session.query(model.User).get(unicode(user_id)), val)
                     for user_id, val in res_ids]
        return res_users

    @classmethod
    def summary_stats(cls):
        '''
        @return: list of (label, count) rows: active organisations,
                 non-private datasets in state active/draft/draft-complete,
                 and active resources. Row order is whatever the UNION
                 yields.
        '''
        connection = model.Session.connection()

        # Disabled row, kept for reference:
        # select 'Total Archived Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 't' union
        res = connection.execute("SELECT 'Total Organisations', count(*) from \"group\" where type = 'organization' and state = 'active' union \
            select 'Total Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 'f' union \
            select 'Total Data Files/Resources', count(*) from resource where state='active'").fetchall()
        return res

    @classmethod
    def activity_counts(cls):
        '''
        @return: list of (month as 'YYYY-MM', activity_type, count) rows from
                 the activity table, ordered by month
        '''
        connection = model.Session.connection()
        res = connection.execute("select to_char(timestamp, 'YYYY-MM') as month,activity_type, count(*) from activity group by month, activity_type order by month;").fetchall()
        return res

    @classmethod
    def user_access_list(cls):
        '''
        @return: list of (name, sysadmin, role) rows, one per distinct
                 user/role combination, sysadmins first. Users named
                 'logged_in' and 'visitor' are excluded; users without any
                 role still appear (right outer join).
        '''
        connection = model.Session.connection()
        res = connection.execute("select name,sysadmin,role from user_object_role right outer join \"user\" on user_object_role.user_id = \"user\".id where name not in ('logged_in','visitor') group by name,sysadmin,role order by sysadmin desc, role asc;").fetchall()
        return res

    @classmethod
    def recent_datasets(cls):
        '''
        @return: list of (date of latest activity, Package, activity_type)
                 for packages with private == 'f' that have activity in the
                 last 60 days, ordered by latest activity timestamp ascending
        '''
        activity = table('activity')
        package = table('package')
        s = select([func.max(activity.c.timestamp), package.c.id, activity.c.activity_type],
                   from_obj=[activity.join(package, activity.c.object_id == package.c.id)]).\
            where(package.c.private == 'f').\
            where(activity.c.timestamp > func.now() - text("interval '60 day'")).\
            group_by(package.c.id, activity.c.activity_type).\
            order_by(func.max(activity.c.timestamp))
        result = model.Session.execute(s).fetchall()
        return [(datetime2date(timestamp),
                 model.Session.query(model.Package).get(unicode(package_id)),
                 activity_type)
                for timestamp, package_id, activity_type in result]
   
   
   
class RevisionStats(object): class RevisionStats(object):
@classmethod
def package_addition_rate(cls, weeks_ago=0):
    '''
    @param weeks_ago: specify how many weeks ago to give count for
                      (0 = this week so far)
    @return: whatever get_objects_in_a_week returns for type
             'package_addition_rate' and the week that commenced then
    '''
    week_commenced = cls.get_date_weeks_ago(weeks_ago)
    return cls.get_objects_in_a_week(week_commenced,
                                     type_='package_addition_rate')
   
@classmethod
def package_revision_rate(cls, weeks_ago=0):
    '''
    @param weeks_ago: specify how many weeks ago to give count for
                      (0 = this week so far)
    @return: whatever get_objects_in_a_week returns for type
             'package_revision_rate' and the week that commenced then
    '''
    week_commenced = cls.get_date_weeks_ago(weeks_ago)
    return cls.get_objects_in_a_week(week_commenced,
                                     type_='package_revision_rate')
   
@classmethod
def get_date_weeks_ago(cls, weeks_ago):
    '''
    @param weeks_ago: specify how many weeks ago to give count for
                      (0 = this week so far)
    @return: datetime.date of the Monday that started that week
    '''
    date_ = datetime.date.today()
    # weekday() is 0 for Monday, so subtracting it lands on this week's
    # Monday; each further week back is another 7 days.
    return date_ - datetime.timedelta(days=date_.weekday() + 7 * weeks_ago)
   
@classmethod
def get_week_dates(cls, weeks_ago):
    '''
    @param weeks_ago: specify how many weeks ago to give count for
                      (0 = this week so far)
    @return: (date_from, date_to) as datetime.datetime values at midnight:
             the Monday starting that week and the Monday 7 days later
    '''
    # No database tables are needed here; the bounds are pure date maths.
    today = datetime.date.today()
    days_back = today.weekday() + 7 * weeks_ago
    date_from = datetime.datetime(today.year, today.month, today.day) - \
        datetime.timedelta(days=days_back)
    date_to = date_from + datetime.timedelta(days=7)
    return (date_from, date_to)
   
@classmethod
def get_date_week_started(cls, date_):
    '''
    @param date_: a datetime.date or datetime.datetime
    @return: datetime.date of the Monday of the week containing date_
    '''
    assert isinstance(date_, datetime.date)
    if isinstance(date_, datetime.datetime):
        # Drop the time part so the result is always a plain date.
        date_ = date_.date()
    return date_ - datetime.timedelta(days=date_.weekday())
   
@classmethod
def get_package_revisions(cls):
    '''
    @return: Returns list of revisions and date of them, in
             format: [(id, date), ...], oldest first
    '''
    package_revision = table('package_revision')
    revision = table('revision')
    s = select([package_revision.c.id, revision.c.timestamp],
               from_obj=[package_revision.join(revision)]).\
        order_by(revision.c.timestamp)
    res = model.Session.execute(s).fetchall()  # [(id, datetime), ...]
    return res
   
@classmethod
def get_new_packages(cls):
    '''
    @return: Returns list of new pkgs and date when they were created, in
             format: [(id, date_ordinal), ...]
    '''
    def new_packages():
        # Can't filter by time in select because 'min' function has to
        # be 'for all time' else you get first revision in the time period.
        package_revision = table('package_revision')
        revision = table('revision')
        package = table('package')
        s = select([package_revision.c.id, func.min(revision.c.timestamp)],
                   from_obj=[package_revision.join(revision).join(package)]).\
            where(package.c.private == 'f').\
            group_by(package_revision.c.id).\
            order_by(func.min(revision.c.timestamp))
        res = model.Session.execute(s).fetchall()  # [(id, datetime), ...]
        # Ordinals instead of datetimes so the result can be pickled
        # into the cache.
        return [(pkg_id, created_datetime.toordinal())
                for pkg_id, created_datetime in res]

    if not cache_enabled:
        return new_packages()
    week_commences = cls.get_date_week_started(datetime.date.today())
    # Interpolate the week into the key; concatenating with '+' left a
    # literal '%s' inside the key.
    key = 'all_new_packages_%s' % week_commences.strftime(DATE_FORMAT)
    return our_cache.get_value(key=key, createfunc=new_packages)
   
@classmethod
def get_deleted_packages(cls):
    '''
    @return: Returns list of deleted pkgs and date when they were deleted, in
             format: [(id, date_ordinal), ...]
    '''
    def deleted_packages():
        # Can't filter by time in select because 'min' function has to
        # be 'for all time' else you get first revision in the time period.
        package_revision = table('package_revision')
        revision = table('revision')
        package = table('package')
        s = select([package_revision.c.id, func.min(revision.c.timestamp)],
                   from_obj=[package_revision.join(revision).join(package)]).\
            where(package_revision.c.state == model.State.DELETED).\
            where(package.c.private == 'f').\
            group_by(package_revision.c.id).\
            order_by(func.min(revision.c.timestamp))
        res = model.Session.execute(s).fetchall()  # [(id, datetime), ...]
        # Ordinals instead of datetimes so the result can be pickled
        # into the cache.
        return [(pkg_id, deleted_datetime.toordinal())
                for pkg_id, deleted_datetime in res]

    if not cache_enabled:
        return deleted_packages()
    week_commences = cls.get_date_week_started(datetime.date.today())
    # Interpolate the week into the key; concatenating with '+' left a
    # literal '%s' inside the key.
    key = 'all_deleted_packages_%s' % week_commences.strftime(DATE_FORMAT)
    return our_cache.get_value(key=key, createfunc=deleted_packages)
   
@classmethod @classmethod
def get_num_packages_by_week(cls): def get_num_packages_by_week(cls):
def num_packages(): def num_packages():
new_packages_by_week = cls.get_by_week('new_packages') new_packages_by_week = cls.get_by_week('new_packages')
deleted_packages_by_week = cls.get_by_week('deleted_packages') deleted_packages_by_week = cls.get_by_week('deleted_packages')
first_date = (min(datetime.datetime.strptime(new_packages_by_week[0][0], DATE_FORMAT), first_date = (min(datetime.datetime.strptime(new_packages_by_week[0][0], DATE_FORMAT),
datetime.datetime.strptime(deleted_packages_by_week[0][0], DATE_FORMAT))).date() datetime.datetime.strptime(deleted_packages_by_week[0][0], DATE_FORMAT))).date()
cls._cumulative_num_pkgs = 0 cls._cumulative_num_pkgs = 0
new_pkgs = [] new_pkgs = []
deleted_pkgs = [] deleted_pkgs = []
def build_weekly_stats(week_commences, new_pkg_ids, deleted_pkg_ids): def build_weekly_stats(week_commences, new_pkg_ids, deleted_pkg_ids):
num_pkgs = len(new_pkg_ids) - len(deleted_pkg_ids) num_pkgs = len(new_pkg_ids) - len(deleted_pkg_ids)
new_pkgs.extend([model.Session.query(model.Package).get(id).name for id in new_pkg_ids]) new_pkgs.extend([model.Session.query(model.Package).get(id).name for id in new_pkg_ids])
deleted_pkgs.extend([model.Session.query(model.Package).get(id).name for id in deleted_pkg_ids]) deleted_pkgs.extend([model.Session.query(model.Package).get(id).name for id in deleted_pkg_ids])
cls._cumulative_num_pkgs += num_pkgs cls._cumulative_num_pkgs += num_pkgs
return (week_commences.strftime(DATE_FORMAT), return (week_commences.strftime(DATE_FORMAT),
num_pkgs, cls._cumulative_num_pkgs) num_pkgs, cls._cumulative_num_pkgs)
week_ends = first_date week_ends = first_date
today = datetime.date.today() today = datetime.date.today()
new_package_week_index = 0 new_package_week_index = 0
deleted_package_week_index = 0 deleted_package_week_index = 0
weekly_numbers = [] # [(week_commences, num_packages, cumulative_num_pkgs])] weekly_numbers = [] # [(week_commences, num_packages, cumulative_num_pkgs])]
while week_ends <= today: while week_ends <= today:
week_commences = week_ends week_commences = week_ends
week_ends = week_commences + datetime.timedelta(days=7) week_ends = week_commences + datetime.timedelta(days=7)
if datetime.datetime.strptime(new_packages_by_week[new_package_week_index][0], DATE_FORMAT).date() == week_commences: if datetime.datetime.strptime(new_packages_by_week[new_package_week_index][0], DATE_FORMAT).date() == week_commences:
new_pkg_ids = n