--- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -1,6 +1,7 @@ import re import csv import sys +import json import logging import operator import collections @@ -13,6 +14,7 @@ log = logging.getLogger('ckanext.ga-report') +DOWNLOADS_AVAILABLE_FROM = '2012-12' def _get_month_name(strdate): import calendar @@ -20,8 +22,12 @@ d = strptime(strdate, '%Y-%m') return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year) - -def _month_details(cls): +def _get_unix_epoch(strdate): + from time import strptime,mktime + d = strptime(strdate, '%Y-%m') + return int(mktime(d)) + +def _month_details(cls, stat_key=None): ''' Returns a list of all the periods for which we have data, unfortunately knows too much about the type of the cls being passed as GA_Url has a @@ -32,9 +38,13 @@ months = [] day = None - vals = model.Session.query(cls.period_name,cls.period_complete_day)\ - .filter(cls.period_name!='All').distinct(cls.period_name)\ - .order_by("period_name desc").all() + q = model.Session.query(cls.period_name,cls.period_complete_day)\ + .filter(cls.period_name!='All').distinct(cls.period_name) + if stat_key: + q= q.filter(cls.stat_name==stat_key) + + vals = q.order_by("period_name desc").all() + if vals and vals[0][1]: day = int(vals[0][1]) ordinal = 'th' if 11 <= day <= 13 \ @@ -66,25 +76,6 @@ for entry in entries: writer.writerow([entry.period_name.encode('utf-8'), entry.stat_name.encode('utf-8'), - entry.key.encode('utf-8'), - entry.value.encode('utf-8')]) - - def csv_downloads(self, month): - import csv - - q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name=='Downloads') - if month != 'all': - q = q.filter(GA_Stat.period_name==month) - entries = q.order_by('GA_Stat.period_name, GA_Stat.key').all() - - response.headers['Content-Type'] = "text/csv; charset=utf-8" - response.headers['Content-Disposition'] = str('attachment; filename=downloads_%s.csv' % (month,)) - - writer = csv.writer(response) - writer.writerow(["Period", "Resource URL", "Count"]) - - for entry in entries: - writer.writerow([entry.period_name.encode('utf-8'), entry.key.encode('utf-8'), entry.value.encode('utf-8')]) @@ -121,11 +112,26 @@ return key, val + # Query historic values for sparkline rendering + sparkline_query = model.Session.query(GA_Stat)\ + .filter(GA_Stat.stat_name=='Totals')\ + .order_by(GA_Stat.period_name) + sparkline_data = {} + for x in sparkline_query: + sparkline_data[x.key] = sparkline_data.get(x.key,[]) + key, val = clean_key(x.key,float(x.value)) + tooltip = '%s: %s' % (_get_month_name(x.period_name), val) + sparkline_data[x.key].append( (tooltip,x.value) ) + # Trim the latest month, as it looks like a huge dropoff + for key in sparkline_data: + sparkline_data[key] = sparkline_data[key][:-1] + c.global_totals = [] if c.month: for e in entries: key, val = clean_key(e.key, e.value) - c.global_totals.append((key, val)) + sparkline = sparkline_data[e.key] + c.global_totals.append((key, val, sparkline)) else: d = collections.defaultdict(list) for e in entries: @@ -135,10 +141,18 @@ v = sum(v) else: v = float(sum(v))/float(len(v)) + sparkline = sparkline_data[k] key, val = clean_key(k,v) - c.global_totals.append((key, val)) - c.global_totals = sorted(c.global_totals, key=operator.itemgetter(0)) + c.global_totals.append((key, val, sparkline)) + # Sort the global totals into a more pleasant order + def sort_func(x): + key = x[0] + total_order = ['Total page views','Total visits','Pages per visit'] + if key in total_order: + return total_order.index(key) + return 999 + c.global_totals = sorted(c.global_totals, key=sort_func) keys = { 'Browser versions': 'browser_versions', @@ -175,7 +189,29 @@ for k, v in keys.iteritems(): q = model.Session.query(GA_Stat).\ - filter(GA_Stat.stat_name==k) + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + # Run the query on all months to gather graph data + series = {} + x_axis = set() + for stat in q: + x_val = _get_unix_epoch(stat.period_name) + series[ stat.key ] = series.get(stat.key,{}) + series[ stat.key ][x_val] = float(stat.value) + x_axis.add(x_val) + # Common x-axis for all series. Exclude this month (incomplete data) + x_axis = sorted(list(x_axis))[:-1] + # Buffer a rickshaw dataset from the series + def create_graph(series_name, series_data): + return { + 'name':series_name, + 'data':[ {'x':x,'y':series_data.get(x,0)} for x in x_axis ] + } + rickshaw = [ create_graph(name,data) for name, data in series.items() ] + rickshaw = sorted(rickshaw,key=lambda x:x['data'][-1]['y']) + setattr(c, v+'_graph', json.dumps(rickshaw)) + + # Buffer the tabular data if c.month: entries = [] q = q.filter(GA_Stat.period_name==c.month).\ @@ -192,41 +228,12 @@ # Get the total for each set of values and then set the value as # a percentage of the total if k == 'Social sources': - total = sum([x for n,x in c.global_totals if n == 'Total visits']) + total = sum([x for n,x,graph in c.global_totals if n == 'Total visits']) else: total = sum([num for _,num in entries]) setattr(c, v, [(k,_percent(v,total)) for k,v in entries ]) return render('ga_report/site/index.html') - - def downloads(self): - - # Get the month details by fetching distinct values and determining the - # month names from the values. - c.months, c.day = _month_details(GA_Stat) - - # Work out which month to show, based on query params of the first item - c.month_desc = 'all months' - c.month = request.params.get('month', '') - if c.month: - c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) - - c.downloads = [] - q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name=='Downloads') - q = q.filter(GA_Stat.period_name==c.month) if c.month else q - q = q.order_by("ga_stat.value::int desc") - - data = collections.defaultdict(int) - for entry in q.all(): - r = model.Session.query(model.Resource).filter(model.Resource.url==entry.key).first() - if not r: - continue - data[r] += int(entry.value) - - c.downloads = [(k,v,) for k,v in data.iteritems()] - c.downloads = sorted(c.downloads, key=operator.itemgetter(1), reverse=True) - - return render('ga_report/site/downloads.html') class GaDatasetReport(BaseController): @@ -272,13 +279,14 @@ str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,)) writer = csv.writer(response) - writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"]) - - for package,view,visit in packages: + writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"]) + + for package,view,visit,downloads in packages: writer.writerow([package.title.encode('utf-8'), package.name.encode('utf-8'), view, visit, + downloads, month]) def publishers(self): @@ -299,10 +307,10 @@ def _get_packages(self, publisher=None, count=-1): '''Returns the datasets in order of views''' - if count == -1: - count = sys.maxint - + have_download_data = True month = c.month or 'All' + if month != 'All': + have_download_data = month >= DOWNLOADS_AVAILABLE_FROM q = model.Session.query(GA_Url,model.Package)\ .filter(model.Package.name==GA_Url.package_id)\ @@ -312,9 +320,26 @@ q = q.filter(GA_Url.period_name==month) q = q.order_by('ga_url.pageviews::int desc') top_packages = [] - for entry,package in q.limit(count): + if count == -1: + entries = q.all() + else: + entries = q.limit(count) + + for entry,package in entries: if package: - top_packages.append((package, entry.pageviews, entry.visits)) + # Downloads .... + if have_download_data: + dls = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Downloads').\ + filter(GA_Stat.key==package.name) + if month != 'All': # Fetch everything unless the month is specific + dls = dls.filter(GA_Stat.period_name==month) + downloads = 0 + for x in dls: + downloads += int(x.value) + else: + downloads = 'No data' + top_packages.append((package, entry.pageviews, entry.visits, downloads)) else: log.warning('Could not find package associated package') @@ -361,6 +386,26 @@ c.publisher_page_views = entry.pageviews if entry else 0 c.top_packages = self._get_packages(c.publisher, 20) + + # Graph query + top_package_names = [ x[0].name for x in c.top_packages ] + graph_query = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ + .filter(GA_Url.url.like('/dataset/%'))\ + .filter(GA_Url.package_id.in_(top_package_names)) + graph_data = {} + for entry,package in graph_query: + if not package: continue + if entry.period_name=='All': continue + graph_data[package.id] = graph_data.get(package.id,{ + 'name':package.title, + 'data':[] + }) + graph_data[package.id]['data'].append({ + 'x':_get_unix_epoch(entry.period_name), + 'y':int(entry.pageviews), + }) + c.graph_data = json.dumps(graph_data.values()) return render('ga_report/publisher/read.html')