From: Tom Rees Date: Sun, 13 Jan 2013 16:37:38 +0000 Subject: Feature #162: Added Rickshaw graphs to most panes of statistics. Needs a little tidying for clarity. X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=5f5b462f359e7c74ddbcc4ba01b53f9bdde0738e --- Feature #162: Added Rickshaw graphs to most panes of statistics. Needs a little tidying for clarity. --- --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.py[co] *.py~ .gitignore +ckan.log # Packages *.egg --- a/README.rst +++ b/README.rst @@ -32,11 +32,11 @@ googleanalytics.id = UA-1010101-1 googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics) + googleanalytics.token.filepath = ~/pyenv/token.dat ga-report.period = monthly - ga-report.bounce_url = /data + ga-report.bounce_url = / - The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data - but you may want to set this to /. + The ga-report.bounce_url specifies a particular path to record the bounce rate for. Typically it is / (the home page). 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file):: @@ -83,13 +83,17 @@ $ paster getauthtoken --config=../ckan/development.ini +Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini):: + + googleanalytics.token.filepath = ~/pyenv/token.dat + Tutorial -------- Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step:: - $ paster loadanalytics token.dat latest --config=../ckan/development.ini + $ paster loadanalytics latest --config=../ckan/development.ini The value after the token file is how much data you want to retrieve, this can be --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -1,5 +1,8 @@ import logging import datetime +import os + +from pylons import config from ckan.lib.cli import CkanCommand # No other CKAN imports allowed until _load_config is run, @@ -53,25 +56,52 @@ self.args[0] if self.args else 'credentials.json') +class FixTimePeriods(CkanCommand): + """ + Fixes the 'All' records for GA_Urls + + It is possible that older urls that haven't recently been visited + do not have All records. This command will traverse through those + records and generate valid All records for them. + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def __init__(self, name): + super(FixTimePeriods, self).__init__(name) + + def command(self): + import ckan.model as model + from ga_model import post_update_url_stats + self._load_config() + model.Session.remove() + model.Session.configure(bind=model.meta.engine) + + log = logging.getLogger('ckanext.ga_report') + + log.info("Updating 'All' records for old URLs") + post_update_url_stats() + log.info("Processing complete") + + class LoadAnalytics(CkanCommand): """Get data from Google Analytics API and save it in the ga_model - Usage: paster loadanalytics + Usage: paster loadanalytics - Where is the name of the auth token file from - the getauthtoken step. - - And where is: + Where is: all - data for all time latest - (default) just the 'latest' data YYYY-MM - just data for the specific month """ summary = __doc__.split('\n')[0] usage = __doc__ - max_args = 2 - min_args = 1 + max_args = 1 + min_args = 0 def __init__(self, name): super(LoadAnalytics, self).__init__(name) @@ -80,6 +110,11 @@ default=False, dest='delete_first', help='Delete data for the period first') + self.parser.add_option('-s', '--skip_url_stats', + action='store_true', + default=False, + dest='skip_url_stats', + help='Skip the download of URL data - just do site-wide stats') def command(self): self._load_config() @@ -87,18 +122,25 @@ from download_analytics import DownloadAnalytics from ga_auth import (init_service, get_profile_id) + ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) + if not ga_token_filepath: + print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ + 'Google Analytics token file under key: googleanalytics.token.filepath' + return + try: - svc = init_service(self.args[0], None) + svc = init_service(ga_token_filepath, None) except TypeError: print ('Have you correctly run the getauthtoken task and ' - 'specified the correct token file?') + 'specified the correct token file in the CKAN config under ' + '"googleanalytics.token.filepath"?') return downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc), - delete_first=self.options.delete_first) + delete_first=self.options.delete_first, + skip_url_stats=self.options.skip_url_stats) - time_period = self.args[1] if self.args and len(self.args) > 1 \ - else 'latest' + time_period = self.args[0] if self.args else 'latest' if time_period == 'all': downloader.all_() elif time_period == 'latest': --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -1,6 +1,7 @@ import re import csv import sys +import json import logging import operator import collections @@ -9,10 +10,11 @@ import sqlalchemy from sqlalchemy import func, cast, Integer import ckan.model as model -from ga_model import GA_Url, GA_Stat, GA_ReferralStat +from ga_model import GA_Url, GA_Stat, GA_ReferralStat, GA_Publisher log = logging.getLogger('ckanext.ga-report') +DOWNLOADS_AVAILABLE_FROM = '2012-12' def _get_month_name(strdate): import calendar @@ -20,13 +22,39 @@ d = strptime(strdate, '%Y-%m') return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year) - -def _month_details(cls): +def _get_unix_epoch(strdate): + from time import strptime,mktime + d = strptime(strdate, '%Y-%m') + return int(mktime(d)) + +def _month_details(cls, stat_key=None): + ''' + Returns a list of all the periods for which we have data, unfortunately + knows too much about the type of the cls being passed as GA_Url has a + more complex query + + This may need extending if we add a period_name to the stats + ''' months = [] - vals = model.Session.query(cls.period_name).filter(cls.period_name!='All').distinct().all() + day = None + + q = model.Session.query(cls.period_name,cls.period_complete_day)\ + .filter(cls.period_name!='All').distinct(cls.period_name) + if stat_key: + q= q.filter(cls.stat_name==stat_key) + + vals = q.order_by("period_name desc").all() + + if vals and vals[0][1]: + day = int(vals[0][1]) + ordinal = 'th' if 11 <= day <= 13 \ + else {1:'st',2:'nd',3:'rd'}.get(day % 10, 'th') + day = "{day}{ordinal}".format(day=day, ordinal=ordinal) + for m in vals: months.append( (m[0], _get_month_name(m[0]))) - return sorted(months, key=operator.itemgetter(0), reverse=True) + + return months, day class GaReport(BaseController): @@ -34,7 +62,7 @@ def csv(self, month): import csv - q = model.Session.query(GA_Stat) + q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name!='Downloads') if month != 'all': q = q.filter(GA_Stat.period_name==month) entries = q.order_by('GA_Stat.period_name, GA_Stat.stat_name, GA_Stat.key').all() @@ -51,11 +79,12 @@ entry.key.encode('utf-8'), entry.value.encode('utf-8')]) + def index(self): # Get the month details by fetching distinct values and determining the # month names from the values. - c.months = _month_details(GA_Stat) + c.months, c.day = _month_details(GA_Stat) # Work out which month to show, based on query params of the first item c.month_desc = 'all months' @@ -70,24 +99,39 @@ entries = q.order_by('ga_stat.key').all() def clean_key(key, val): - if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']: + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']: val = "%.2f" % round(float(val), 2) if key == 'Average time on site': mins, secs = divmod(float(val), 60) hours, mins = divmod(mins, 60) val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) - if key in ['New visits','Bounces']: + if key in ['New visits','Bounce rate (home page)']: val = "%s%%" % val if key in ['Total page views', 'Total visits']: val = int(val) return key, val + + # Query historic values for sparkline rendering + graph_query = model.Session.query(GA_Stat)\ + .filter(GA_Stat.stat_name=='Totals')\ + .order_by(GA_Stat.period_name) + graph_data = {} + for x in graph_query: + graph_data[x.key] = graph_data.get(x.key,[]) + key, val = clean_key(x.key,float(x.value)) + tooltip = '%s: %s' % (_get_month_name(x.period_name), val) + graph_data[x.key].append( (tooltip,x.value) ) + # Trim the latest month, as it looks like a huge dropoff + for key in graph_data: + graph_data[key] = graph_data[key][:-1] c.global_totals = [] if c.month: for e in entries: key, val = clean_key(e.key, e.value) - c.global_totals.append((key, val)) + sparkline = graph_data[e.key] + c.global_totals.append((key, val, sparkline)) else: d = collections.defaultdict(list) for e in entries: @@ -96,10 +140,11 @@ if k in ['Total page views', 'Total visits']: v = sum(v) else: - v = float(sum(v))/len(v) + v = float(sum(v))/float(len(v)) + sparkline = graph_data[k] key, val = clean_key(k,v) - c.global_totals.append((key, val)) + c.global_totals.append((key, val, sparkline)) c.global_totals = sorted(c.global_totals, key=operator.itemgetter(0)) keys = { @@ -137,7 +182,29 @@ for k, v in keys.iteritems(): q = model.Session.query(GA_Stat).\ - filter(GA_Stat.stat_name==k) + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + # Run the query on all months to gather graph data + series = {} + x_axis = set() + for stat in q: + x_val = _get_unix_epoch(stat.period_name) + series[ stat.key ] = series.get(stat.key,{}) + series[ stat.key ][x_val] = float(stat.value) + x_axis.add(x_val) + # Common x-axis for all series. Exclude this month (incomplete data) + x_axis = sorted(list(x_axis))[:-1] + # Buffer a rickshaw dataset from the series + def create_graph(series_name, series_data): + return { + 'name':series_name, + 'data':[ {'x':x,'y':series_data.get(x,0)} for x in x_axis ] + } + rickshaw = [ create_graph(name,data) for name, data in series.items() ] + rickshaw = sorted(rickshaw,key=lambda x:x['data'][-1]['y']) + setattr(c, v+'_graph', json.dumps(rickshaw)) + + # Buffer the tabular data if c.month: entries = [] q = q.filter(GA_Stat.period_name==c.month).\ @@ -154,7 +221,7 @@ # Get the total for each set of values and then set the value as # a percentage of the total if k == 'Social sources': - total = sum([x for n,x in c.global_totals if n == 'Total visits']) + total = sum([x for n,x,graph in c.global_totals if n == 'Total visits']) else: total = sum([num for _,num in entries]) setattr(c, v, [(k,_percent(v,total)) for k,v in entries ]) @@ -205,13 +272,14 @@ str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,)) writer = csv.writer(response) - writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"]) - - for package,view,visit in packages: + writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"]) + + for package,view,visit,downloads in packages: writer.writerow([package.title.encode('utf-8'), package.name.encode('utf-8'), view, visit, + downloads, month]) def publishers(self): @@ -219,7 +287,7 @@ # Get the month details by fetching distinct values and determining the # month names from the values. - c.months = _month_details(GA_Url) + c.months, c.day = _month_details(GA_Url) # Work out which month to show, based on query params of the first item c.month = request.params.get('month', '') @@ -228,15 +296,14 @@ c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) c.top_publishers = _get_top_publishers() - return render('ga_report/publisher/index.html') def _get_packages(self, publisher=None, count=-1): - '''Returns the datasets in order of visits''' - if count == -1: - count = sys.maxint - + '''Returns the datasets in order of views''' + have_download_data = True month = c.month or 'All' + if month != 'All': + have_download_data = month >= DOWNLOADS_AVAILABLE_FROM q = model.Session.query(GA_Url,model.Package)\ .filter(model.Package.name==GA_Url.package_id)\ @@ -244,12 +311,27 @@ if publisher: q = q.filter(GA_Url.department_id==publisher.name) q = q.filter(GA_Url.period_name==month) - q = q.order_by('ga_url.visitors::int desc') + q = q.order_by('ga_url.pageviews::int desc') top_packages = [] - - for entry,package in q.limit(count): + if count == -1: + entries = q.all() + else: + entries = q.limit(count) + + for entry,package in entries: if package: - top_packages.append((package, entry.pageviews, entry.visitors)) + # Downloads .... + if have_download_data: + dls = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Downloads').\ + filter(GA_Stat.key==package.name) + if month != 'All': # Fetch everything unless the month is specific + dls = dls.filter(GA_Stat.period_name==month) + + downloads = sum(int(d.value) for d in dls.all()) + else: + downloads = 'No data' + top_packages.append((package, entry.pageviews, entry.visits, downloads)) else: log.warning('Could not find package associated package') @@ -279,7 +361,7 @@ # Get the month details by fetching distinct values and determining the # month names from the values. - c.months = _month_details(GA_Url) + c.months, c.day = _month_details(GA_Url) # Work out which month to show, based on query params of the first item c.month = request.params.get('month', '') @@ -288,7 +370,7 @@ else: c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) - month = c.mnth or 'All' + month = c.month or 'All' c.publisher_page_views = 0 q = model.Session.query(GA_Url).\ filter(GA_Url.url=='/publisher/%s' % c.publisher_name) @@ -304,38 +386,33 @@ Returns a list of the top 20 publishers by dataset visits. (The number to show can be varied with 'limit') ''' + month = c.month or 'All' connection = model.Session.connection() q = """ - select department_id, sum(pageviews::int) views, sum(visitors::int) visits + select department_id, sum(pageviews::int) views, sum(visits::int) visits from ga_url - where department_id <> ''""" - if c.month: - q = q + """ - and period_name=%s - """ - q = q + """ - group by department_id order by visits desc + where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' + and period_name=%s + group by department_id order by views desc """ if limit: q = q + " limit %s;" % (limit) - # Add this back (before and period_name =%s) if you want to ignore publisher - # homepage views - # and not url like '/publisher/%%' - top_publishers = [] - res = connection.execute(q, c.month) - + res = connection.execute(q, month) for row in res: g = model.Group.get(row[0]) if g: top_publishers.append((g, row[1], row[2])) return top_publishers + def _get_publishers(): ''' Returns a list of all publishers. Each item is a tuple: - (names, title) + (name, title) ''' publishers = [] for pub in model.Session.query(model.Group).\ --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -3,7 +3,7 @@ import datetime import collections from pylons import config - +from ga_model import _normalize_url import ga_model #from ga_client import GA @@ -13,15 +13,18 @@ FORMAT_MONTH = '%Y-%m' MIN_VIEWS = 50 MIN_VISITS = 20 +MIN_DOWNLOADS = 10 class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None, delete_first=False): + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id self.delete_first = delete_first + self.skip_url_stats = skip_url_stats def specific_month(self, date): import calendar @@ -92,29 +95,47 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: + log.info('Period "%s" (%s - %s)', + self.get_full_period_name(period_name, period_complete_day), + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + if self.delete_first: - log.info('Deleting existing Analytics for period "%s"', + log.info('Deleting existing Analytics for this period "%s"', period_name) ga_model.delete(period_name) - log.info('Downloading Analytics for period "%s" (%s - %s)', - self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) - - data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+') - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data, ) - - data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+') - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data,) - - ga_model.update_publisher_stats(period_name) # about 30 seconds. - self.sitewide_stats( period_name ) - + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + # Make sure the All records are correct. + ga_model.post_update_url_stats() + + log.info('Aggregating datasets by publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + + log.info('Downloading and storing analytics for site-wide stats') + self.sitewide_stats( period_name, period_complete_day ) + + log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) + def update_social_info(self, period_name, start_date, end_date): start_date = start_date.strftime('%Y-%m-%d') @@ -137,18 +158,18 @@ data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - from ga_model import _normalize_url - data[_normalize_url(row[0])].append( (row[1], int(row[2]),) ) + url = _normalize_url('http:/' + row[0]) + data[url].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) - def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'): + def download(self, start_date, end_date, path=None): '''Get data from GA for a given time period''' start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visits' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -163,29 +184,35 @@ end_date=end_date).execute() packages = [] + log.info("There are %d results" % results['totalResults']) for entry in results.get('rows'): (loc,pageviews,visits) = entry - packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate + continue + packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) def store(self, period_name, period_complete_day, data): if 'url' in data: ga_model.update_url_stats(period_name, period_complete_day, data['url']) - def sitewide_stats(self, period_name): + def sitewide_stats(self, period_name, period_complete_day): import calendar year, month = period_name.split('-') _, last_day_of_month = calendar.monthrange(int(year), int(month)) start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', - '_locale_stats', '_browser_stats', '_mobile_stats'] + '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] - getattr(self, f)(start_date, end_date, period_name) + log.info('Downloading analytics for %s' % f.split('_')[1]) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) def _get_results(result_data, f): data = {} @@ -194,17 +221,18 @@ data[key] = data.get(key,0) + result[1] return data - def _totals_stats(self, start_date, end_date, period_name): + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches distinct totals, total pageviews etc """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}) + metrics='ga:pageviews', + sort='-ga:pageviews', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) results = self.service.data().ga().get( ids='ga:' + self.profile_id, @@ -219,36 +247,39 @@ 'New visits': result_data[0][2], 'Total visits': result_data[0][3], } - ga_model.update_sitewide_stats(period_name, "Totals", data) - - # Bounces from /data. This url is specified in configuration because - # for DGU we don't want /. - path = config.get('ga-report.bounce_url','/') - print path - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters='ga:pagePath=~%s$' % (path,), - start_date=start_date, - metrics='ga:bounces,ga:uniquePageviews', + ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) + + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), + start_date=start_date, + metrics='ga:visitBounceRate', dimensions='ga:pagePath', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - for results in result_data: - if results[0] == path: - bounce, total = [float(x) for x in results[1:]] - pct = 100 * bounce/total - print "%d bounces from %d total == %s" % (bounce, total, pct) - ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct}) - - - def _locale_stats(self, start_date, end_date, period_name): + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces = float(results[1]) + # visitBounceRate is already a % + log.info('Google reports visitBounceRate as %s', bounces) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': float(bounces)}, + period_complete_day) + + + def _locale_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -257,22 +288,78 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Languages", data) + ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Country", data) - - - def _social_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day) + + + def _download_stats(self, start_date, end_date, period_name, period_complete_day): + """ Fetches stats about language and country """ + import ckan.model as model + + data = {} + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data: + # We may not have data for this time period, so we need to bail + # early. + log.info("There is no download data for this time period") + return + + def process_result_data(result_data, cached=False): + for result in result_data: + url = result[0].strip() + + # Get package id associated with the resource that has this URL. + q = model.Session.query(model.Resource) + if cached: + r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() + else: + r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + package_name = r.resource_group.package.name if r else "" + if package_name: + data[package_name] = data.get(package_name, 0) + int(result[1]) + else: + log.warning(u"Could not find resource for URL: {url}".format(url=url)) + continue + + process_result_data(results.get('rows')) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download-cache', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + process_result_data(results.get('rows'), cached=False) + + self._filter_out_long_tail(data, MIN_DOWNLOADS) + ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day) + + def _social_stats(self, start_date, end_date, period_name, period_complete_day): """ Finds out which social sites people are referred from """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:socialNetwork,ga:referralPath", max_results=10000, end_date=end_date).execute() @@ -282,16 +369,16 @@ if not result[0] == '(not set)': data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, 3) - ga_model.update_sitewide_stats(period_name, "Social sources", data) - - - def _os_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day) + + + def _os_stats(self, start_date, end_date, period_name, period_complete_day): """ Operating system stats """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -300,23 +387,23 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Operating Systems", data) + ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day) data = {} for result in result_data: if int(result[2]) >= MIN_VIEWS: key = "%s %s" % (result[0],result[1]) data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) - - - def _browser_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day) + + + def _browser_stats(self, start_date, end_date, period_name, period_complete_day): """ Information about browsers and browser versions """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() @@ -327,14 +414,14 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Browsers", data) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) data = {} for result in result_data: key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) data[key] = data.get(key, 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Browser versions", data) + ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day) @classmethod def _filter_browser_version(cls, browser, version_str): @@ -358,14 +445,14 @@ ver = ver[0] + ver[1] + 'X' * num_hidden_digits return ver - def _mobile_stats(self, start_date, end_date, period_name): + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): """ Info about mobile devices """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() @@ -375,13 +462,13 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Mobile brands", data) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Mobile devices", data) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) @classmethod def _filter_out_long_tail(cls, data, threshold=10): --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -9,6 +9,8 @@ import ckan.model as model from ckan.lib.base import * + +log = __import__('logging').getLogger(__name__) def make_uuid(): return unicode(uuid.uuid4()) @@ -27,7 +29,7 @@ Column('period_name', types.UnicodeText), Column('period_complete_day', types.Integer), Column('pageviews', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('url', types.UnicodeText), Column('department_id', types.UnicodeText), Column('package_id', types.UnicodeText), @@ -45,6 +47,7 @@ Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('period_name', types.UnicodeText), + Column('period_complete_day', types.UnicodeText), Column('stat_name', types.UnicodeText), Column('key', types.UnicodeText), Column('value', types.UnicodeText), ) @@ -63,7 +66,7 @@ Column('period_name', types.UnicodeText), Column('publisher_name', types.UnicodeText), Column('views', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('toplevel', types.Boolean, default=False), Column('subpublishercount', types.Integer, default=0), Column('parent', types.UnicodeText), @@ -111,12 +114,10 @@ >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') '/dataset/weekly_fuel_prices' ''' - # Deliberately leaving a / - url = url.replace('http:/','') - return '/' + '/'.join(url.split('/')[2:]) - - -def _get_department_id_of_url(url): + return '/' + '/'.join(url.split('/')[3:]) + + +def _get_package_and_publisher(url): # e.g. /dataset/fuel_prices # e.g. /dataset/fuel_prices/resource/e63380d4 dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) @@ -126,14 +127,15 @@ if dataset: publisher_groups = dataset.get_groups('publisher') if publisher_groups: - return publisher_groups[0].name + return dataset_ref,publisher_groups[0].name + return dataset_ref, None else: publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) if publisher_match: - return publisher_match.groups()[0] - - -def update_sitewide_stats(period_name, stat_name, data): + return None, publisher_match.groups()[0] + return None, None + +def update_sitewide_stats(period_name, stat_name, data, period_complete_day): for k,v in data.iteritems(): item = model.Session.query(GA_Stat).\ filter(GA_Stat.period_name==period_name).\ @@ -143,11 +145,13 @@ item.period_name = period_name item.key = k item.value = v + item.period_complete_day = period_complete_day model.Session.add(item) else: # create the row values = {'id': make_uuid(), 'period_name': period_name, + 'period_complete_day': period_complete_day, 'key': k, 'value': v, 'stat_name': stat_name @@ -156,61 +160,118 @@ model.Session.commit() +def pre_update_url_stats(period_name): + log.debug("Deleting '%s' records" % period_name) + model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name).delete() + + count = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All').count() + log.debug("Deleting %d 'All' records" % count) + count = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All').delete() + log.debug("Deleted %d 'All' records" % count) + + model.Session.flush() + model.Session.commit() + model.repo.commit_and_remove() + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. If not then create one. + + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. + """ + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + for key in views.keys(): + package, publisher = _get_package_and_publisher(key) + + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': 0, + 'url': key, + 'pageviews': views[key], + 'visits': visits[key], + 'department_id': publisher, + 'package_id': publisher + } + model.Session.add(GA_Url(**values)) + model.Session.commit() + def update_url_stats(period_name, period_complete_day, url_data): - for url, views, visitors in url_data: - url = _normalize_url(url) - department_id = _get_department_id_of_url(url) - - package = None - if url.startswith('/dataset/'): - package = url[len('/dataset/'):] - - # see if the row for this url & month is in the table already + ''' + Given a list of urls and number of hits for each during a given period, + stores them in GA_Url under the period and recalculates the totals for + the 'All' period. + ''' + for url, views, visits in url_data: + package, publisher = _get_package_and_publisher(url) + + item = model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).\ filter(GA_Url.url==url).first() if item: - item.period_name = period_name - item.pageviews = views - item.visitors = visitors - item.department_id = department_id - item.package_id = package + item.pageviews = item.pageviews + views + item.visits = item.visits + visits + if not item.package_id: + item.package_id = package + if not item.department_id: + item.department_id = publisher model.Session.add(item) else: - # create the row values = {'id': make_uuid(), 'period_name': period_name, 'period_complete_day': period_complete_day, 'url': url, 'pageviews': views, - 'visitors': visitors, - 'department_id': department_id, + 'visits': visits, + 'department_id': publisher, 'package_id': package } model.Session.add(GA_Url(**values)) - - # We now need to recaculate the ALL time_period from the data we have - # Delete the old 'All' - old = model.Session.query(GA_Url).\ - filter(GA_Url.period_name == "All").\ - filter(GA_Url.url==url).delete() - - items = model.Session.query(GA_Url).\ - filter(GA_Url.period_name != "All").\ - filter(GA_Url.url==url).all() - values = {'id': make_uuid(), - 'period_name': "All", - 'period_complete_day': "0", - 'url': url, - 'pageviews': sum([int(x.pageviews) for x in items]), - 'visitors': sum([int(x.visitors) for x in items]), - 'department_id': department_id, - 'package_id': package - } - model.Session.add(GA_Url(**values)) - model.Session.commit() + + if package: + old_pageviews, old_visits = 0, 0 + old = model.Session.query(GA_Url).\ + filter(GA_Url.period_name=='All').\ + filter(GA_Url.url==url).all() + old_pageviews = sum([int(o.pageviews) for o in old]) + old_visits = sum([int(o.visits) for o in old]) + + entries = model.Session.query(GA_Url).\ + filter(GA_Url.period_name!='All').\ + filter(GA_Url.url==url).all() + values = {'id': make_uuid(), + 'period_name': 'All', + 'period_complete_day': 0, + 'url': url, + 'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews), + 'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits), + 'department_id': publisher, + 'package_id': package + } + + model.Session.add(GA_Url(**values)) + model.Session.commit() + + def update_social(period_name, data): @@ -252,7 +313,7 @@ filter(model.Group.type=='publisher').\ filter(model.Group.state=='active').all() for publisher in publishers: - views, visitors, subpub = update_publisher(period_name, publisher, publisher.name) + views, visits, subpub = update_publisher(period_name, publisher, publisher.name) parent, parents = '', publisher.get_groups('publisher') if parents: parent = parents[0].name @@ -261,7 +322,7 @@ filter(GA_Publisher.publisher_name==publisher.name).first() if item: item.views = views - item.visitors = visitors + item.visits = visits item.publisher_name = publisher.name item.toplevel = publisher in toplevel item.subpublishercount = subpub @@ -273,7 +334,7 @@ 'period_name': period_name, 'publisher_name': publisher.name, 'views': views, - 'visitors': visitors, + 'visits': visits, 'toplevel': publisher in toplevel, 'subpublishercount': subpub, 'parent': parent @@ -283,7 +344,7 @@ def update_publisher(period_name, pub, part=''): - views,visitors,subpub = 0, 0, 0 + views,visits,subpub = 0, 0, 0 for publisher in go_down_tree(pub): subpub = subpub + 1 items = model.Session.query(GA_Url).\ @@ -291,9 +352,9 @@ filter(GA_Url.department_id==publisher.name).all() for item in items: views = views + int(item.pageviews) - visitors = visitors + int(item.visitors) - - return views, visitors, (subpub-1) + visits = visits + int(item.visits) + + return views, visits, (subpub-1) def get_top_level(): @@ -328,8 +389,39 @@ ''' for object_type in (GA_Url, GA_Stat, GA_Publisher, GA_ReferralStat): q = model.Session.query(object_type) - if period_name != 'all': + if period_name != 'All': q = q.filter_by(period_name=period_name) q.delete() - model.Session.commit() - + model.repo.commit_and_remove() + +def get_score_for_dataset(dataset_name): + ''' + Returns a "current popularity" score for a dataset, + based on how many views it has had recently. + ''' + import datetime + now = datetime.datetime.now() + last_month = now - datetime.timedelta(days=30) + period_names = ['%s-%02d' % (last_month.year, last_month.month), + '%s-%02d' % (now.year, now.month), + ] + + score = 0 + for period_name in period_names: + score /= 2 # previous periods are discounted by 50% + entry = model.Session.query(GA_Url)\ + .filter(GA_Url.period_name==period_name)\ + .filter(GA_Url.package_id==dataset_name).first() + # score + if entry: + views = float(entry.pageviews) + if entry.period_complete_day: + views_per_day = views / entry.period_complete_day + else: + views_per_day = views / 15 # guess + score += views_per_day + + score = int(score * 100) + log.debug('Popularity %s: %s', score, dataset_name) + return score + --- a/ckanext/ga_report/helpers.py +++ b/ckanext/ga_report/helpers.py @@ -1,7 +1,9 @@ import logging import operator + import ckan.lib.base as base import ckan.model as model +from ckan.logic import get_action from ckanext.ga_report.ga_model import GA_Url, GA_Publisher from ckanext.ga_report.controller import _get_publishers @@ -39,25 +41,42 @@ order_by('ga_url.pageviews::int desc') num_top_datasets = top_datasets.count() + dataset = None if num_top_datasets: - dataset = None + count = 0 while not dataset: rand = random.randrange(0, min(top, num_top_datasets)) ga_url = top_datasets[rand] dataset = model.Package.get(ga_url.url[len('/dataset/'):]) if dataset and not dataset.state == 'active': dataset = None - else: + # When testing, it is possible that top datasets are not available + # so only go round this loop a few times before falling back on + # a random dataset. + count += 1 + if count > 10: + break + if not dataset: + # fallback dataset = model.Session.query(model.Package)\ .filter_by(state='active').first() - publisher = dataset.get_groups('publisher')[0] - return { - 'dataset': dataset, - 'publisher': publisher - } + if not dataset: + return None + dataset_dict = get_action('package_show')({'model': model, + 'session': model.Session, + 'validate': False}, + {'id':dataset.id}) + return dataset_dict def single_popular_dataset_html(top=20): - context = single_popular_dataset(top) + dataset_dict = single_popular_dataset(top) + groups = package.get('groups', []) + publishers = [ g for g in groups if g.get('type') == 'publisher' ] + publisher = publishers[0] if publishers else {'name':'', 'title': ''} + context = { + 'dataset': dataset_dict, + 'publisher': publisher_dict + } return base.render_snippet('ga_report/ga_popular_single.html', **context) @@ -90,7 +109,7 @@ if not p in datasets: datasets[p] = {'views':0, 'visits': 0} datasets[p]['views'] = datasets[p]['views'] + int(entry.pageviews) - datasets[p]['visits'] = datasets[p]['visits'] + int(entry.visitors) + datasets[p]['visits'] = datasets[p]['visits'] + int(entry.visits) results = [] for k, v in datasets.iteritems(): --- a/ckanext/ga_report/plugin.py +++ b/ckanext/ga_report/plugin.py @@ -42,6 +42,16 @@ controller='ckanext.ga_report.controller:GaReport', action='csv' ) + map.connect( + '/data/site-usage/downloads', + controller='ckanext.ga_report.controller:GaReport', + action='downloads' + ) + map.connect( + '/data/site-usage/downloads_{month}.csv', + controller='ckanext.ga_report.controller:GaReport', + action='csv_downloads' + ) # GaDatasetReport map.connect( --- /dev/null +++ b/ckanext/ga_report/public/scripts/vendor/d3.layout.min.js @@ -1,1 +1,1 @@ - +(function(){function a(a){var b=a.source,d=a.target,e=c(b,d),f=[b];while(b!==e)b=b.parent,f.push(b);var g=f.length;while(d!==e)f.splice(g,0,d),d=d.parent;return f}function b(a){var b=[],c=a.parent;while(c!=null)b.push(a),a=c,c=c.parent;return b.push(a),b}function c(a,c){if(a===c)return a;var d=b(a),e=b(c),f=d.pop(),g=e.pop(),h=null;while(f===g)h=f,f=d.pop(),g=e.pop();return h}function g(a){a.fixed|=2}function h(a){a!==f&&(a.fixed&=1)}function i(){j(),f.fixed&=1,e=f=null}function j(){f.px+=d3.event.dx,f.py+=d3.event.dy,e.resume()}function k(a,b,c){var d=0,e=0;a.charge=0;if(!a.leaf){var f=a.nodes,g=f.length,h=-1,i;while(++hd&&(c=b,d=e);return c}function u(a){return a.reduce(v,0)}function v(a,b){return a+b[1]}function w(a,b){return x(a,Math.ceil(Math.log(b.length)/Math.LN2+1))}function x(a,b){var c=-1,d=+a[0],e=(a[1]-d)/b,f=[];while(++c<=b)f[c]=e*c+d;return f}function y(a){return[d3.min(a),d3.max(a)]}function z(a,b){return a.sort=d3.rebind(a,b.sort),a.children=d3.rebind(a,b.children),a.links=D,a.value=d3.rebind(a,b.value),a.nodes=function(b){return E=!0,(a.nodes=a)(b)},a}function A(a){return a.children}function B(a){return a.value}function C(a,b){return b.value-a.value}function D(a){return d3.merge(a.map(function(a){return(a.children||[]).map(function(b){return{source:a,target:b}})}))}function F(a,b){return a.value-b.value}function G(a,b){var c=a._pack_next;a._pack_next=b,b._pack_prev=a,b._pack_next=c,c._pack_prev=b}function H(a,b){a._pack_next=b,b._pack_prev=a}function I(a,b){var c=b.x-a.x,d=b.y-a.y,e=a.r+b.r;return e*e-c*c-d*d>.001}function J(a){function l(a){b=Math.min(a.x-a.r,b),c=Math.max(a.x+a.r,c),d=Math.min(a.y-a.r,d),e=Math.max(a.y+a.r,e)}var b=Infinity,c=-Infinity,d=Infinity,e=-Infinity,f=a.length,g,h,i,j,k;a.forEach(K),g=a[0],g.x=-g.r,g.y=0,l(g);if(f>1){h=a[1],h.x=h.r,h.y=0,l(h);if(f>2){i=a[2],O(g,h,i),l(i),G(g,i),g._pack_prev=i,G(i,h),h=g._pack_next;for(var m=3;m0?(H(g,j),h=j,m--):(H(j,h),g=j,m--)}}}var q=(b+c)/2,r=(d+e)/2,s=0;for(var m=0;m0&&(a=d)}return a}function X(a,b){return a.x-b.x}function Y(a,b){return b.x-a.x}function Z(a,b){return a.depth-b.depth}function $(a,b){function c(a,d){var e=a.children;if(e&&(i=e.length)){var f,g=null,h=-1,i;while(++h=0)f=d[e]._tree,f.prelim+=b,f.mod+=b,b+=f.shift+(c+=f.change)}function ba(a,b,c){a=a._tree,b=b._tree;var d=c/(b.number-a.number);a.change+=d,b.change-=d,b.shift+=c,b.prelim+=c,b.mod+=c}function bb(a,b,c){return a._tree.ancestor.parent==b.parent?a._tree.ancestor:c}function bc(a){return{x:a.x,y:a.y,dx:a.dx,dy:a.dy}}function bd(a,b){var c=a.x+b[3],d=a.y+b[0],e=a.dx-b[1]-b[3],f=a.dy-b[0]-b[2];return e<0&&(c+=e/2,e=0),f<0&&(d+=f/2,f=0),{x:c,y:d,dx:e,dy:f}}d3.layout={},d3.layout.bundle=function(){return function(b){var c=[],d=-1,e=b.length;while(++d