|
import datetime |
|
|
|
from pylons import config |
|
from sqlalchemy import Table, select, func, and_ |
|
|
|
import ckan.plugins as p |
|
import ckan.model as model |
|
|
|
cache_enabled = p.toolkit.asbool(config.get('ckanext.stats.cache_enabled', 'True')) |
|
|
|
if cache_enabled: |
|
from pylons import cache |
|
our_cache = cache.get_cache('stats', type='dbm') |
|
|
|
DATE_FORMAT = '%Y-%m-%d' |
|
|
|
def table(name): |
|
return Table(name, model.meta.metadata, autoload=True) |
|
|
|
def datetime2date(datetime_): |
|
return datetime.date(datetime_.year, datetime_.month, datetime_.day) |
|
|
|
|
|
class Stats(object): |
|
@classmethod |
|
def top_rated_packages(cls, limit=10): |
|
# NB Not using sqlalchemy as sqla 0.4 doesn't work using both group_by |
|
# and apply_avg |
|
package = table('package') |
|
rating = table('rating') |
|
sql = select([package.c.id, func.avg(rating.c.rating), func.count(rating.c.rating)], from_obj=[package.join(rating)]).\ |
|
where(package.c.private == 'f').\ |
|
group_by(package.c.id).\ |
|
order_by(func.avg(rating.c.rating).desc(), func.count(rating.c.rating).desc()).\ |
|
limit(limit) |
|
res_ids = model.Session.execute(sql).fetchall() |
|
res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), avg, num) for pkg_id, avg, num in res_ids] |
|
return res_pkgs |
|
|
|
@classmethod |
|
def most_edited_packages(cls, limit=10): |
|
package_revision = table('package_revision') |
|
package = table('package') |
|
s = select([package_revision.c.id, func.count(package_revision.c.revision_id)], from_obj=[package_revision.join(package)]).\ |
|
where(package.c.private == 'f').\ |
|
group_by(package_revision.c.id).\ |
|
order_by(func.count(package_revision.c.revision_id).desc()).\ |
|
limit(limit) |
|
res_ids = model.Session.execute(s).fetchall() |
|
res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), val) for pkg_id, val in res_ids] |
|
return res_pkgs |
|
|
|
@classmethod |
|
def largest_groups(cls, limit=10): |
|
member = table('member') |
|
package = table('package') |
|
s = select([member.c.group_id, func.count(member.c.table_id)]).\ |
|
group_by(member.c.group_id).\ |
|
where(and_(member.c.group_id!=None, member.c.table_name=='package', package.c.private == 'f')).\ |
|
order_by(func.count(member.c.table_id).desc()).\ |
|
limit(limit) |
|
|
|
res_ids = model.Session.execute(s).fetchall() |
|
res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res_ids] |
|
return res_groups |
|
|
|
@classmethod |
|
def top_tags(cls, limit=10, returned_tag_info='object'): # by package |
|
assert returned_tag_info in ('name', 'id', 'object') |
|
tag = table('tag') |
|
package_tag = table('package_tag') |
|
package = table('package') |
|
#TODO filter out tags with state=deleted |
|
if returned_tag_info == 'name': |
|
from_obj = [package_tag.join(tag)] |
|
tag_column = tag.c.name |
|
else: |
|
from_obj = None |
|
tag_column = package_tag.c.tag_id |
|
s = select([tag_column, func.count(package_tag.c.package_id)], |
|
from_obj=from_obj) |
|
s = s.group_by(tag_column).\ |
|
where(package.c.private == 'f').\ |
|
order_by(func.count(package_tag.c.package_id).desc()).\ |
|
limit(limit) |
|
res_col = model.Session.execute(s).fetchall() |
|
if returned_tag_info in ('id', 'name'): |
|
return res_col |
|
elif returned_tag_info == 'object': |
|
res_tags = [(model.Session.query(model.Tag).get(unicode(tag_id)), val) for tag_id, val in res_col] |
|
return res_tags |
|
|
|
@classmethod |
|
def top_package_owners(cls, limit=10): |
|
package_role = table('package_role') |
|
user_object_role = table('user_object_role') |
|
package = table('package') |
|
# join(package, package_role.c.package_id == package.c.id).\ |
|
# where(package.c.private == 'f').\ |
|
s = select([user_object_role.c.user_id, func.count(user_object_role.c.role)], from_obj=[user_object_role.join(package_role)]).\ |
|
where(user_object_role.c.role==model.authz.Role.ADMIN).\ |
|
where(user_object_role.c.user_id!=None).\ |
|
group_by(user_object_role.c.user_id).\ |
|
order_by(func.count(user_object_role.c.role).desc()).\ |
|
limit(limit) |
|
res_ids = model.Session.execute(s).fetchall() |
|
res_users = [(model.Session.query(model.User).get(unicode(user_id)), val) for user_id, val in res_ids] |
|
return res_users |
|
|
|
@classmethod |
|
def summary_stats(cls): |
|
connection = model.Session.connection() |
|
# select name,role from user_object_role inner join \"user\" on user_object_role.user_id = \"user\".id where name not in ('logged_in','visitor') group by name,role" |
|
|
|
res = connection.execute("SELECT 'Total Organisations', count(*) from \"group\" where type = 'organization' and state = 'active' union \ |
|
select 'Total Datasets', count(*) from package where state='active' or state='draft' or state='draft-complete' union \ |
|
select 'Total Data Files/Resources', count(*) from resource where state='active'").fetchall(); |
|
return res |
|
|
|
|
|
@classmethod |
|
def activity_counts(cls): |
|
connection = model.Session.connection() |
|
res = connection.execute("select to_char(timestamp, 'YYYY-MM') as month,activity_type, count(*) from activity group by month, activity_type order by month;").fetchall(); |
|
return res |
|
|
|
|
|
class RevisionStats(object): |
|
@classmethod |
|
def package_addition_rate(cls, weeks_ago=0): |
|
week_commenced = cls.get_date_weeks_ago(weeks_ago) |
|
return cls.get_objects_in_a_week(week_commenced, |
|
type_='package_addition_rate') |
|
|
|
@classmethod |
|
def package_revision_rate(cls, weeks_ago=0): |
|
week_commenced = cls.get_date_weeks_ago(weeks_ago) |
|
return cls.get_objects_in_a_week(week_commenced, |
|
type_='package_revision_rate') |
|
|
|
@classmethod |
|
def get_date_weeks_ago(cls, weeks_ago): |
|
''' |
|
@param weeks_ago: specify how many weeks ago to give count for |
|
(0 = this week so far) |
|
''' |
|
date_ = datetime.date.today() |
|
return date_ - datetime.timedelta(days= |
|
datetime.date.weekday(date_) + 7 * weeks_ago) |
|
|
|
@classmethod |
|
def get_week_dates(cls, weeks_ago): |
|
''' |
|
@param weeks_ago: specify how many weeks ago to give count for |
|
(0 = this week so far) |
|
''' |
|
package_revision = table('package_revision') |
|
revision = table('revision') |
|
today = datetime.date.today() |
|
date_from = datetime.datetime(today.year, today.month, today.day) -\ |
|
datetime.timedelta(days=datetime.date.weekday(today) + \ |
|
7 * weeks_ago) |
|
date_to = date_from + datetime.timedelta(days=7) |
|
return (date_from, date_to) |
|
|
|
@classmethod |
|
def get_date_week_started(cls, date_): |
|
assert isinstance(date_, datetime.date) |
|
if isinstance(date_, datetime.datetime): |
|
date_ = datetime2date(date_) |
|
return date_ - datetime.timedelta(days=datetime.date.weekday(date_)) |
|
|
|
@classmethod |
|
def get_package_revisions(cls): |
|
''' |
|
@return: Returns list of revisions and date of them, in |
|
format: [(id, date), ...] |
|
''' |
|
package_revision = table('package_revision') |
|
revision = table('revision') |
|
s = select([package_revision.c.id, revision.c.timestamp], from_obj=[package_revision.join(revision)]).order_by(revision.c.timestamp) |
|
res = model.Session.execute(s).fetchall() # [(id, datetime), ...] |
|
return res |
|
|
|
@classmethod |
|
def get_new_packages(cls): |
|
''' |
|
@return: Returns list of new pkgs and date when they were created, in |
|
format: [(id, date_ordinal), ...] |
|
''' |
|
def new_packages(): |
|
# Can't filter by time in select because 'min' function has to |
|
# be 'for all time' else you get first revision in the time period. |
|
package_revision = table('package_revision') |
|
revision = table('revision') |
|
s = select([package_revision.c.id, func.min(revision.c.timestamp)], from_obj=[package_revision.join(revision)]).\ |
|
where(package.c.private == 'f').\ |
|
group_by(package_revision.c.id).order_by(func.min(revision.c.timestamp)) |
|
res = model.Session.execute(s).fetchall() # [(id, datetime), ...] |
|
res_pickleable = [] |
|
for pkg_id, created_datetime in res: |
|
res_pickleable.append((pkg_id, created_datetime.toordinal())) |
|
return res_pickleable |
|
if cache_enabled: |
|
week_commences = cls.get_date_week_started(datetime.date.today()) |
|
key = 'all_new_packages_%s' + week_commences.strftime(DATE_FORMAT) |
|
new_packages = our_cache.get_value(key=key, |
|
createfunc=new_packages) |
|
else: |
|
new_packages = new_packages() |
|
return new_packages |
|
|
|
@classmethod |
|
def get_deleted_packages(cls): |
|
''' |
|
@return: Returns list of deleted pkgs and date when they were deleted, in |
|
format: [(id, date_ordinal), ...] |
|
''' |
|
def deleted_packages(): |
|
# Can't filter by time in select because 'min' function has to |
|
# be 'for all time' else you get first revision in the time period. |
|
package_revision = table('package_revision') |
|
revision = table('revision') |
|
s = select([package_revision.c.id, func.min(revision.c.timestamp)], from_obj=[package_revision.join(revision)]).\ |
|
where(package_revision.c.state==model.State.DELETED).\ |
|
group_by(package_revision.c.id).\ |
|
order_by(func.min(revision.c.timestamp)) |
|
res = model.Session.execute(s).fetchall() # [(id, datetime), ...] |
|
res_pickleable = [] |
|
for pkg_id, deleted_datetime in res: |
|
res_pickleable.append((pkg_id, deleted_datetime.toordinal())) |
|
return res_pickleable |
|
if cache_enabled: |
|
week_commences = cls.get_date_week_started(datetime.date.today()) |
|
key = 'all_deleted_packages_%s' + week_commences.strftime(DATE_FORMAT) |
|
deleted_packages = our_cache.get_value(key=key, |
|
createfunc=deleted_packages) |
|
else: |
|
deleted_packages = deleted_packages() |
|
return deleted_packages |
|
|
|
@classmethod |
|
def get_num_packages_by_week(cls): |
|
def num_packages(): |
|
new_packages_by_week = cls.get_by_week('new_packages') |
|
deleted_packages_by_week = cls.get_by_week('deleted_packages') |
|
first_date = (min(datetime.datetime.strptime(new_packages_by_week[0][0], DATE_FORMAT), |
|
datetime.datetime.strptime(deleted_packages_by_week[0][0], DATE_FORMAT))).date() |
|
cls._cumulative_num_pkgs = 0 |
|
new_pkgs = [] |
|
deleted_pkgs = [] |
|
def build_weekly_stats(week_commences, new_pkg_ids, deleted_pkg_ids): |
|
num_pkgs = len(new_pkg_ids) - len(deleted_pkg_ids) |
|
new_pkgs.extend([model.Session.query(model.Package).get(id).name for id in new_pkg_ids]) |
|
deleted_pkgs.extend([model.Session.query(model.Package).get(id).name for id in deleted_pkg_ids]) |
|
cls._cumulative_num_pkgs += num_pkgs |
|
return (week_commences.strftime(DATE_FORMAT), |
|
num_pkgs, cls._cumulative_num_pkgs) |
|
week_ends = first_date |
|
today = datetime.date.today() |
|
new_package_week_index = 0 |
|
deleted_package_week_index = 0 |
|
weekly_numbers = [] # [(week_commences, num_packages, cumulative_num_pkgs])] |
|
while week_ends <= today: |
|
week_commences = week_ends |
|
week_ends = week_commences + datetime.timedelta(days=7) |
|
if datetime.datetime.strptime(new_packages_by_week[new_package_week_index][0], DATE_FORMAT).date() == week_commences: |
|
new_pkg_ids = new_packages_by_week[new_package_week_index][1] |
|
new_package_week_index += 1 |
|
else: |
|
new_pkg_ids = [] |
|
if datetime.datetime.strptime(deleted_packages_by_week[deleted_package_week_index][0], DATE_FORMAT).date() == week_commences: |
|
deleted_pkg_ids = deleted_packages_by_week[deleted_package_week_index][1] |
|
deleted_package_week_index += 1 |
|
else: |
|
deleted_pkg_ids = [] |
|
weekly_numbers.append(build_weekly_stats(week_commences, new_pkg_ids, deleted_pkg_ids)) |
|
# just check we got to the end of each count |
|
assert new_package_week_index == len(new_packages_by_week) |
|
assert deleted_package_week_index == len(deleted_packages_by_week) |
|
return weekly_numbers |
|
if cache_enabled: |
|
week_commences = cls.get_date_week_started(datetime.date.today()) |
|
key = 'number_packages_%s' + week_commences.strftime(DATE_FORMAT) |
|
num_packages = our_cache.get_value(key=key, |
|
createfunc=num_packages) |
|
else: |
|
num_packages = num_packages() |
|
return num_packages |
|
|
|
@classmethod |
|
def get_by_week(cls, object_type): |
|
cls._object_type = object_type |
|
def objects_by_week(): |
|
if cls._object_type == 'new_packages': |
|
objects = cls.get_new_packages() |
|
def get_date(object_date): |
|
return datetime.date.fromordinal(object_date) |
|
elif cls._object_type == 'deleted_packages': |
|
objects = cls.get_deleted_packages() |
|
def get_date(object_date): |
|
return datetime.date.fromordinal(object_date) |
|
elif cls._object_type == 'package_revisions': |
|
objects = cls.get_package_revisions() |
|
def get_date(object_date): |
|
return datetime2date(object_date) |
|
else: |
|
raise NotImplementedError() |
|
first_date = get_date(objects[0][1]) if objects else datetime.date.today() |
|
week_commences = cls.get_date_week_started(first_date) |
|
week_ends = week_commences + datetime.timedelta(days=7) |
|
week_index = 0 |
|
weekly_pkg_ids = [] # [(week_commences, [pkg_id1, pkg_id2, ...])] |
|
pkg_id_stack = [] |
|
cls._cumulative_num_pkgs = 0 |
|
def build_weekly_stats(week_commences, pkg_ids): |
|
num_pkgs = len(pkg_ids) |
|
cls._cumulative_num_pkgs += num_pkgs |
|
return (week_commences.strftime(DATE_FORMAT), |
|
pkg_ids, num_pkgs, cls._cumulative_num_pkgs) |
|
for pkg_id, date_field in objects: |
|
date_ = get_date(date_field) |
|
if date_ >= week_ends: |
|
weekly_pkg_ids.append(build_weekly_stats(week_commences, pkg_id_stack)) |
|
pkg_id_stack = [] |
|
week_commences = week_ends |
|
week_ends = week_commences + datetime.timedelta(days=7) |
|
pkg_id_stack.append(pkg_id) |
|
weekly_pkg_ids.append(build_weekly_stats(week_commences, pkg_id_stack)) |
|
today = datetime.date.today() |
|
while week_ends <= today: |
|
week_commences = week_ends |
|
week_ends = week_commences + datetime.timedelta(days=7) |
|
weekly_pkg_ids.append(build_weekly_stats(week_commences, [])) |
|
return weekly_pkg_ids |
|
if cache_enabled: |
|
week_commences = cls.get_date_week_started(datetime.date.today()) |
|
key = '%s_by_week_%s' % (cls._object_type, week_commences.strftime(DATE_FORMAT)) |
|
objects_by_week_ = our_cache.get_value(key=key, |
|
createfunc=objects_by_week) |
|
else: |
|
objects_by_week_ = objects_by_week() |
|
return objects_by_week_ |
|
|
|
@classmethod |
|
def get_objects_in_a_week(cls, date_week_commences, |
|
type_='new-package-rate'): |
|
''' |
|
@param type: Specifies what to return about the specified week: |
|
"package_addition_rate" number of new packages |
|
"package_revision_rate" number of package revisions |
|
"new_packages" a list of the packages created |
|
in a tuple with the date. |
|
"deleted_packages" a list of the packages deleted |
|
in a tuple with the date. |
|
@param dates: date range of interest - a tuple: |
|
(start_date, end_date) |
|
''' |
|
assert isinstance(date_week_commences, datetime.date) |
|
if type_ in ('package_addition_rate', 'new_packages'): |
|
object_type = 'new_packages' |
|
elif type_ == 'deleted_packages': |
|
object_type = 'deleted_packages' |
|
elif type_ == 'package_revision_rate': |
|
object_type = 'package_revisions' |
|
else: |
|
raise NotImplementedError() |
|
objects_by_week = cls.get_by_week(object_type) |
|
date_wc_str = date_week_commences.strftime(DATE_FORMAT) |
|
object_ids = None |
|
for objects_in_a_week in objects_by_week: |
|
if objects_in_a_week[0] == date_wc_str: |
|
object_ids = objects_in_a_week[1] |
|
break |
|
if object_ids is None: |
|
raise TypeError('Week specified is outside range') |
|
assert isinstance(object_ids, list) |
|
if type_ in ('package_revision_rate', 'package_addition_rate'): |
|
return len(object_ids) |
|
elif type_ in ('new_packages', 'deleted_packages'): |
|
return [ model.Session.query(model.Package).get(pkg_id) \ |
|
for pkg_id in object_ids ] |
|
|
|
|