From: David Read Date: Thu, 14 Feb 2013 15:19:55 +0000 Subject: Merge branch 'master' of github.com:datagovuk/ckanext-ga-report X-Git-Url: https://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=61b148e0a9cda85b8034740a4cdea93d1f9e880f --- Merge branch 'master' of github.com:datagovuk/ckanext-ga-report --- --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.py[co] *.py~ .gitignore +ckan.log # Packages *.egg --- a/README.rst +++ b/README.rst @@ -26,15 +26,17 @@ 1. Activate you CKAN python environment and install this extension's software:: $ pyenv/bin/activate - $ pip install -e git+https://github.com/okfn/ckanext-ga-report.git#egg=ckanext-ga-report + $ pip install -e git+https://github.com/datagovuk/ckanext-ga-report.git#egg=ckanext-ga-report 2. Ensure you development.ini (or similar) contains the info about your Google Analytics account and configuration:: googleanalytics.id = UA-1010101-1 - googleanalytics.account = Account name (i.e. data.gov.uk, see top level item at https://www.google.com/analytics) + googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics) + googleanalytics.token.filepath = ~/pyenv/token.dat ga-report.period = monthly + ga-report.bounce_url = / - Note that your credentials will be readable by system administrators on your server. Rather than use sensitive account details, it is suggested you give access to the GA account to a new Google account that you create just for this purpose. + The ga-report.bounce_url specifies a particular path to record the bounce rate for. Typically it is / (the home page). 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file):: @@ -43,6 +45,12 @@ 4. Enable the extension in your CKAN config file by adding it to ``ckan.plugins``:: ckan.plugins = ga-report + +Problem shooting +---------------- + +* ``(ProgrammingError) relation "ga_url" does not exist`` + This means that the ``paster initdb`` step has not been run successfully. Refer to the installation instructions for this extension. Authorization @@ -75,13 +83,17 @@ $ paster getauthtoken --config=../ckan/development.ini +Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini):: + + googleanalytics.token.filepath = ~/pyenv/token.dat + Tutorial -------- -Download some GA data and store it in CKAN's db. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step:: +Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step:: - $ paster loadanalytics token.dat latest --config=../ckan/development.ini + $ paster loadanalytics latest --config=../ckan/development.ini The value after the token file is how much data you want to retrieve, this can be --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -1,5 +1,8 @@ import logging import datetime +import os + +from pylons import config from ckan.lib.cli import CkanCommand # No other CKAN imports allowed until _load_config is run, @@ -20,7 +23,7 @@ import ckan.model as model model.Session.remove() model.Session.configure(bind=model.meta.engine) - log = logging.getLogger('ckanext.ga-report') + log = logging.getLogger('ckanext.ga_report') import ga_model ga_model.init_tables() @@ -53,26 +56,65 @@ self.args[0] if self.args else 'credentials.json') +class FixTimePeriods(CkanCommand): + """ + Fixes the 'All' records for GA_Urls + + It is possible that older urls that haven't recently been visited + do not have All records. This command will traverse through those + records and generate valid All records for them. + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def __init__(self, name): + super(FixTimePeriods, self).__init__(name) + + def command(self): + import ckan.model as model + from ga_model import post_update_url_stats + self._load_config() + model.Session.remove() + model.Session.configure(bind=model.meta.engine) + + log = logging.getLogger('ckanext.ga_report') + + log.info("Updating 'All' records for old URLs") + post_update_url_stats() + log.info("Processing complete") + + class LoadAnalytics(CkanCommand): """Get data from Google Analytics API and save it in the ga_model - Usage: paster loadanalytics + Usage: paster loadanalytics - Where is the name of the auth token file from - the getauthtoken step. - - And where is: + Where is: all - data for all time latest - (default) just the 'latest' data - YYYY-MM-DD - just data for all time periods going - back to (and including) this date + YYYY-MM - just data for the specific month """ summary = __doc__.split('\n')[0] usage = __doc__ - max_args = 2 - min_args = 1 + max_args = 1 + min_args = 0 + + def __init__(self, name): + super(LoadAnalytics, self).__init__(name) + self.parser.add_option('-d', '--delete-first', + action='store_true', + default=False, + dest='delete_first', + help='Delete data for the period first') + self.parser.add_option('-s', '--skip_url_stats', + action='store_true', + default=False, + dest='skip_url_stats', + help='Skip the download of URL data - just do site-wide stats') def command(self): self._load_config() @@ -80,22 +122,31 @@ from download_analytics import DownloadAnalytics from ga_auth import (init_service, get_profile_id) + ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) + if not ga_token_filepath: + print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ + 'Google Analytics token file under key: googleanalytics.token.filepath' + return + try: - svc = init_service(self.args[0], None) + svc = init_service(ga_token_filepath, None) except TypeError: print ('Have you correctly run the getauthtoken task and ' - 'specified the correct file here') + 'specified the correct token file in the CKAN config under ' + '"googleanalytics.token.filepath"?') return - downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc)) + downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc), + delete_first=self.options.delete_first, + skip_url_stats=self.options.skip_url_stats) - time_period = self.args[1] if self.args and len(self.args) > 1 \ - else 'latest' + time_period = self.args[0] if self.args else 'latest' if time_period == 'all': downloader.all_() elif time_period == 'latest': downloader.latest() else: - since_date = datetime.datetime.strptime(time_period, '%Y-%m-%d') - downloader.since_date(since_date) + # The month to use + for_date = datetime.datetime.strptime(time_period, '%Y-%m') + downloader.specific_month(for_date) --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -1,10 +1,538 @@ +import re +import csv +import sys +import json import logging -from ckan.lib.base import BaseController, c, render -import report_model +import operator +import collections +from ckan.lib.base import (BaseController, c, g, render, request, response, abort) + +import sqlalchemy +from sqlalchemy import func, cast, Integer +import ckan.model as model +from ga_model import GA_Url, GA_Stat, GA_ReferralStat, GA_Publisher log = logging.getLogger('ckanext.ga-report') +DOWNLOADS_AVAILABLE_FROM = '2012-12' + +def _get_month_name(strdate): + import calendar + from time import strptime + d = strptime(strdate, '%Y-%m') + return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year) + +def _get_unix_epoch(strdate): + from time import strptime,mktime + d = strptime(strdate, '%Y-%m') + return int(mktime(d)) + +def _month_details(cls, stat_key=None): + ''' + Returns a list of all the periods for which we have data, unfortunately + knows too much about the type of the cls being passed as GA_Url has a + more complex query + + This may need extending if we add a period_name to the stats + ''' + months = [] + day = None + + q = model.Session.query(cls.period_name,cls.period_complete_day)\ + .filter(cls.period_name!='All').distinct(cls.period_name) + if stat_key: + q= q.filter(cls.stat_name==stat_key) + + vals = q.order_by("period_name desc").all() + + if vals and vals[0][1]: + day = int(vals[0][1]) + ordinal = 'th' if 11 <= day <= 13 \ + else {1:'st',2:'nd',3:'rd'}.get(day % 10, 'th') + day = "{day}{ordinal}".format(day=day, ordinal=ordinal) + + for m in vals: + months.append( (m[0], _get_month_name(m[0]))) + + return months, day + + class GaReport(BaseController): + + def csv(self, month): + import csv + + q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name!='Downloads') + if month != 'all': + q = q.filter(GA_Stat.period_name==month) + entries = q.order_by('GA_Stat.period_name, GA_Stat.stat_name, GA_Stat.key').all() + + response.headers['Content-Type'] = "text/csv; charset=utf-8" + response.headers['Content-Disposition'] = str('attachment; filename=stats_%s.csv' % (month,)) + + writer = csv.writer(response) + writer.writerow(["Period", "Statistic", "Key", "Value"]) + + for entry in entries: + writer.writerow([entry.period_name.encode('utf-8'), + entry.stat_name.encode('utf-8'), + entry.key.encode('utf-8'), + entry.value.encode('utf-8')]) + + def index(self): - return render('index.html') - + + # Get the month details by fetching distinct values and determining the + # month names from the values. + c.months, c.day = _month_details(GA_Stat) + + # Work out which month to show, based on query params of the first item + c.month_desc = 'all months' + c.month = request.params.get('month', '') + if c.month: + c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + + q = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Totals') + if c.month: + q = q.filter(GA_Stat.period_name==c.month) + entries = q.order_by('ga_stat.key').all() + + def clean_key(key, val): + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']: + val = "%.2f" % round(float(val), 2) + if key == 'Average time on site': + mins, secs = divmod(float(val), 60) + hours, mins = divmod(mins, 60) + val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) + if key in ['New visits','Bounce rate (home page)']: + val = "%s%%" % val + if key in ['Total page views', 'Total visits']: + val = int(val) + + return key, val + + # Query historic values for sparkline rendering + sparkline_query = model.Session.query(GA_Stat)\ + .filter(GA_Stat.stat_name=='Totals')\ + .order_by(GA_Stat.period_name) + sparkline_data = {} + for x in sparkline_query: + sparkline_data[x.key] = sparkline_data.get(x.key,[]) + key, val = clean_key(x.key,float(x.value)) + tooltip = '%s: %s' % (_get_month_name(x.period_name), val) + sparkline_data[x.key].append( (tooltip,x.value) ) + # Trim the latest month, as it looks like a huge dropoff + for key in sparkline_data: + sparkline_data[key] = sparkline_data[key][:-1] + + c.global_totals = [] + if c.month: + for e in entries: + key, val = clean_key(e.key, e.value) + sparkline = sparkline_data[e.key] + c.global_totals.append((key, val, sparkline)) + else: + d = collections.defaultdict(list) + for e in entries: + d[e.key].append(float(e.value)) + for k, v in d.iteritems(): + if k in ['Total page views', 'Total visits']: + v = sum(v) + else: + v = float(sum(v))/float(len(v)) + sparkline = sparkline_data[k] + key, val = clean_key(k,v) + + c.global_totals.append((key, val, sparkline)) + # Sort the global totals into a more pleasant order + def sort_func(x): + key = x[0] + total_order = ['Total page views','Total visits','Pages per visit'] + if key in total_order: + return total_order.index(key) + return 999 + c.global_totals = sorted(c.global_totals, key=sort_func) + + keys = { + 'Browser versions': 'browser_versions', + 'Browsers': 'browsers', + 'Operating Systems versions': 'os_versions', + 'Operating Systems': 'os', + 'Social sources': 'social_networks', + 'Languages': 'languages', + 'Country': 'country' + } + + def shorten_name(name, length=60): + return (name[:length] + '..') if len(name) > 60 else name + + def fill_out_url(url): + import urlparse + return urlparse.urljoin(g.site_url, url) + + c.social_referrer_totals, c.social_referrers = [], [] + q = model.Session.query(GA_ReferralStat) + q = q.filter(GA_ReferralStat.period_name==c.month) if c.month else q + q = q.order_by('ga_referrer.count::int desc') + for entry in q.all(): + c.social_referrers.append((shorten_name(entry.url), fill_out_url(entry.url), + entry.source,entry.count)) + + q = model.Session.query(GA_ReferralStat.url, + func.sum(GA_ReferralStat.count).label('count')) + q = q.filter(GA_ReferralStat.period_name==c.month) if c.month else q + q = q.order_by('count desc').group_by(GA_ReferralStat.url) + for entry in q.all(): + c.social_referrer_totals.append((shorten_name(entry[0]), fill_out_url(entry[0]),'', + entry[1])) + + for k, v in keys.iteritems(): + q = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + # Buffer the tabular data + if c.month: + entries = [] + q = q.filter(GA_Stat.period_name==c.month).\ + order_by('ga_stat.value::int desc') + d = collections.defaultdict(int) + for e in q.all(): + d[e.key] += int(e.value) + entries = [] + for key, val in d.iteritems(): + entries.append((key,val,)) + entries = sorted(entries, key=operator.itemgetter(1), reverse=True) + + # Run a query on all months to gather graph data + graph_query = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + graph_dict = {} + for stat in graph_query: + graph_dict[ stat.key ] = graph_dict.get(stat.key,{ + 'name':stat.key, + 'data': [] + }) + graph_dict[ stat.key ]['data'].append({ + 'x':_get_unix_epoch(stat.period_name), + 'y':float(stat.value) + }) + graph = [ graph_dict[x[0]] for x in entries ] + setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph,percentageMode=True) )) + + # Get the total for each set of values and then set the value as + # a percentage of the total + if k == 'Social sources': + total = sum([x for n,x,graph in c.global_totals if n == 'Total visits']) + else: + total = sum([num for _,num in entries]) + setattr(c, v, [(k,_percent(v,total)) for k,v in entries ]) + + return render('ga_report/site/index.html') + + +class GaDatasetReport(BaseController): + """ + Displays the pageview and visit count for datasets + with options to filter by publisher and time period. + """ + def publisher_csv(self, month): + ''' + Returns a CSV of each publisher with the total number of dataset + views & visits. + ''' + c.month = month if not month == 'all' else '' + response.headers['Content-Type'] = "text/csv; charset=utf-8" + response.headers['Content-Disposition'] = str('attachment; filename=publishers_%s.csv' % (month,)) + + writer = csv.writer(response) + writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"]) + + top_publishers, top_publishers_graph = _get_top_publishers(None) + + for publisher,view,visit in top_publishers: + writer.writerow([publisher.title.encode('utf-8'), + publisher.name.encode('utf-8'), + view, + visit, + month]) + + def dataset_csv(self, id='all', month='all'): + ''' + Returns a CSV with the number of views & visits for each dataset. + + :param id: A Publisher ID or None if you want for all + :param month: The time period, or 'all' + ''' + c.month = month if not month == 'all' else '' + if id != 'all': + c.publisher = model.Group.get(id) + if not c.publisher: + abort(404, 'A publisher with that name could not be found') + + packages = self._get_packages(c.publisher) + response.headers['Content-Type'] = "text/csv; charset=utf-8" + response.headers['Content-Disposition'] = \ + str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,)) + + writer = csv.writer(response) + writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"]) + + for package,view,visit,downloads in packages: + writer.writerow([package.title.encode('utf-8'), + package.name.encode('utf-8'), + view, + visit, + downloads, + month]) + + def publishers(self): + '''A list of publishers and the number of views/visits for each''' + + # Get the month details by fetching distinct values and determining the + # month names from the values. + c.months, c.day = _month_details(GA_Url) + + # Work out which month to show, based on query params of the first item + c.month = request.params.get('month', '') + c.month_desc = 'all months' + if c.month: + c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + + c.top_publishers, graph_data = _get_top_publishers() + c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data) ) + + return render('ga_report/publisher/index.html') + + def _get_packages(self, publisher=None, count=-1): + '''Returns the datasets in order of views''' + have_download_data = True + month = c.month or 'All' + if month != 'All': + have_download_data = month >= DOWNLOADS_AVAILABLE_FROM + + q = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ + .filter(GA_Url.url.like('/dataset/%')) + if publisher: + q = q.filter(GA_Url.department_id==publisher.name) + q = q.filter(GA_Url.period_name==month) + q = q.order_by('ga_url.pageviews::int desc') + top_packages = [] + if count == -1: + entries = q.all() + else: + entries = q.limit(count) + + for entry,package in entries: + if package: + # Downloads .... + if have_download_data: + dls = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Downloads').\ + filter(GA_Stat.key==package.name) + if month != 'All': # Fetch everything unless the month is specific + dls = dls.filter(GA_Stat.period_name==month) + downloads = 0 + for x in dls: + downloads += int(x.value) + else: + downloads = 'No data' + top_packages.append((package, entry.pageviews, entry.visits, downloads)) + else: + log.warning('Could not find package associated package') + + return top_packages + + def read(self): + ''' + Lists the most popular datasets across all publishers + ''' + return self.read_publisher(None) + + def read_publisher(self, id): + ''' + Lists the most popular datasets for a publisher (or across all publishers) + ''' + count = 20 + + c.publishers = _get_publishers() + + id = request.params.get('publisher', id) + if id and id != 'all': + c.publisher = model.Group.get(id) + if not c.publisher: + abort(404, 'A publisher with that name could not be found') + c.publisher_name = c.publisher.name + c.top_packages = [] # package, dataset_views in c.top_packages + + # Get the month details by fetching distinct values and determining the + # month names from the values. + c.months, c.day = _month_details(GA_Url) + + # Work out which month to show, based on query params of the first item + c.month = request.params.get('month', '') + if not c.month: + c.month_desc = 'all months' + else: + c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + + month = c.month or 'All' + c.publisher_page_views = 0 + q = model.Session.query(GA_Url).\ + filter(GA_Url.url=='/publisher/%s' % c.publisher_name) + entry = q.filter(GA_Url.period_name==c.month).first() + c.publisher_page_views = entry.pageviews if entry else 0 + + c.top_packages = self._get_packages(c.publisher, 20) + + # Graph query + top_package_names = [ x[0].name for x in c.top_packages ] + graph_query = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ + .filter(GA_Url.url.like('/dataset/%'))\ + .filter(GA_Url.package_id.in_(top_package_names)) + graph_dict = {} + for entry,package in graph_query: + if not package: continue + if entry.period_name=='All': continue + graph_dict[package.name] = graph_dict.get(package.name,{ + 'name':package.title, + 'data':[] + }) + graph_dict[package.name]['data'].append({ + 'x':_get_unix_epoch(entry.period_name), + 'y':int(entry.pageviews), + }) + graph = [ graph_dict[x] for x in top_package_names ] + + c.graph_data = json.dumps( _to_rickshaw(graph) ) + + return render('ga_report/publisher/read.html') + +def _to_rickshaw(data, percentageMode=False): + if data==[]: + return data + # Create a consistent x-axis between all series + num_points = [ len(series['data']) for series in data ] + ideal_index = num_points.index( max(num_points) ) + x_axis = [] + for series in data: + for point in series['data']: + x_axis.append(point['x']) + x_axis = sorted( list( set(x_axis) ) ) + # Zero pad any missing values + for series in data: + xs = [ point['x'] for point in series['data'] ] + for x in set(x_axis).difference(set(xs)): + series['data'].append( {'x':x, 'y':0} ) + if percentageMode: + def get_totals(series_list): + totals = {} + for series in series_list: + for point in series['data']: + totals[point['x']] = totals.get(point['x'],0) + point['y'] + return totals + # Transform data into percentage stacks + totals = get_totals(data) + # Roll insignificant series into a catch-all + THRESHOLD = 0.01 + raw_data = data + data = [] + for series in raw_data: + for point in series['data']: + fraction = float(point['y']) / totals[point['x']] + if not (series in data) and fraction>THRESHOLD: + data.append(series) + # Overwrite data with a set of intereting series + others = [ x for x in raw_data if not (x in data) ] + data.append({ + 'name':'Other', + 'data': [ {'x':x,'y':y} for x,y in get_totals(others).items() ] + }) + # Turn each point into a percentage + for series in data: + for point in series['data']: + point['y'] = (point['y']*100) / totals[point['x']] + # Sort the points + for series in data: + series['data'] = sorted( series['data'], key=lambda x:x['x'] ) + # Strip the latest month's incomplete analytics + series['data'] = series['data'][:-1] + return data + + +def _get_top_publishers(limit=20): + ''' + Returns a list of the top 20 publishers by dataset visits. + (The number to show can be varied with 'limit') + ''' + month = c.month or 'All' + connection = model.Session.connection() + q = """ + select department_id, sum(pageviews::int) views, sum(visits::int) visits + from ga_url + where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' + and period_name=%s + group by department_id order by views desc + """ + if limit: + q = q + " limit %s;" % (limit) + + top_publishers = [] + res = connection.execute(q, month) + department_ids = [] + for row in res: + g = model.Group.get(row[0]) + if g: + department_ids.append(row[0]) + top_publishers.append((g, row[1], row[2])) + + graph = [] + if limit is not None: + # Query for a history graph of these publishers + q = model.Session.query( + GA_Url.department_id, + GA_Url.period_name, + func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\ + .filter( GA_Url.department_id.in_(department_ids) )\ + .filter( GA_Url.period_name!='All' )\ + .filter( GA_Url.url.like('/dataset/%') )\ + .filter( GA_Url.package_id!='' )\ + .group_by( GA_Url.department_id, GA_Url.period_name ) + graph_dict = {} + for dept_id,period_name,views in q: + graph_dict[dept_id] = graph_dict.get( dept_id, { + 'name' : model.Group.get(dept_id).title, + 'data' : [] + }) + graph_dict[dept_id]['data'].append({ + 'x': _get_unix_epoch(period_name), + 'y': views + }) + # Sort dict into ordered list + for id in department_ids: + graph.append( graph_dict[id] ) + return top_publishers, graph + + +def _get_publishers(): + ''' + Returns a list of all publishers. Each item is a tuple: + (name, title) + ''' + publishers = [] + for pub in model.Session.query(model.Group).\ + filter(model.Group.type=='publisher').\ + filter(model.Group.state=='active').\ + order_by(model.Group.name): + publishers.append((pub.name, pub.title)) + return publishers + +def _percent(num, total): + p = 100 * float(num)/float(total) + return "%.2f%%" % round(p, 2) + --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -1,9 +1,9 @@ import os import logging import datetime - +import collections from pylons import config - +from ga_model import _normalize_url import ga_model #from ga_client import GA @@ -11,18 +11,37 @@ log = logging.getLogger('ckanext.ga-report') FORMAT_MONTH = '%Y-%m' +MIN_VIEWS = 50 +MIN_VISITS = 20 +MIN_DOWNLOADS = 10 class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None): + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id - - - def all_(self): - self.since_date(datetime.datetime(2010, 1, 1)) + self.delete_first = delete_first + self.skip_url_stats = skip_url_stats + + def specific_month(self, date): + import calendar + + first_of_this_month = datetime.datetime(date.year, date.month, 1) + _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) + last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) + # if this is the latest month, note that it is only up until today + now = datetime.datetime.now() + if now.year == date.year and now.month == date.month: + last_day_of_month = now.day + last_of_this_month = now + periods = ((date.strftime(FORMAT_MONTH), + last_day_of_month, + first_of_this_month, last_of_this_month),) + self.download_and_store(periods) + def latest(self): if self.period == 'monthly': @@ -37,13 +56,13 @@ self.download_and_store(periods) - def since_date(self, since_date): + def for_date(self, for_date): assert isinstance(since_date, datetime.datetime) periods = [] # (period_name, period_complete_day, start_date, end_date) if self.period == 'monthly': first_of_the_months_until_now = [] - year = since_date.year - month = since_date.month + year = for_date.year + month = for_date.month now = datetime.datetime.now() first_of_this_month = datetime.datetime(now.year, now.month, 1) while True: @@ -81,32 +100,81 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: - log.info('Downloading Analytics for period "%s" (%s - %s)', + log.info('Period "%s" (%s - %s)', self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) - """ - data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+') - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data, ) - - data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+') - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data,) - """ - ga_model.update_publisher_stats(period_name) # about 30 seconds. - self.sitewide_stats( period_name ) - - - def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'): + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + + if self.delete_first: + log.info('Deleting existing Analytics for this period "%s"', + period_name) + ga_model.delete(period_name) + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + # Make sure the All records are correct. + ga_model.post_update_url_stats() + + log.info('Associating datasets with their publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + + log.info('Downloading and storing analytics for site-wide stats') + self.sitewide_stats( period_name, period_complete_day ) + + log.info('Downloading and storing analytics for social networks') + self.update_social_info(period_name, start_date, end_date) + + + def update_social_info(self, period_name, start_date, end_date): + start_date = start_date.strftime('%Y-%m-%d') + end_date = end_date.strftime('%Y-%m-%d') + query = 'ga:hasSocialSourceReferral=~Yes$' + metrics = 'ga:entrances' + sort = '-ga:entrances' + + # Supported query params at + # https://developers.google.com/analytics/devguides/reporting/core/v3/reference + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters=query, + start_date=start_date, + metrics=metrics, + sort=sort, + dimensions="ga:landingPagePath,ga:socialNetwork", + max_results=10000, + end_date=end_date).execute() + data = collections.defaultdict(list) + rows = results.get('rows',[]) + for row in rows: + url = _normalize_url('http:/' + row[0]) + data[url].append( (row[1], int(row[2]),) ) + ga_model.update_social(period_name, data) + + + def download(self, start_date, end_date, path=None): '''Get data from GA for a given time period''' start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visitors' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -120,35 +188,36 @@ max_results=10000, end_date=end_date).execute() - if os.getenv('DEBUG'): - import pprint - pprint.pprint(results) - print 'Total results: %s' % results.get('totalResults') - packages = [] + log.info("There are %d results" % results['totalResults']) for entry in results.get('rows'): (loc,pageviews,visits) = entry - packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate + continue + packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) def store(self, period_name, period_complete_day, data): if 'url' in data: ga_model.update_url_stats(period_name, period_complete_day, data['url']) - def sitewide_stats(self, period_name): + def sitewide_stats(self, period_name, period_complete_day): import calendar year, month = period_name.split('-') _, last_day_of_month = calendar.monthrange(int(year), int(month)) start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', - '_locale_stats', '_browser_stats', '_mobile_stats'] + '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] - getattr(self, f)(start_date, end_date, period_name) + log.info('Downloading analytics for %s' % f.split('_')[1]) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) def _get_results(result_data, f): data = {} @@ -157,41 +226,65 @@ data[key] = data.get(key,0) + result[1] return data - def _totals_stats(self, start_date, end_date, period_name): + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches distinct totals, total pageviews etc """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - ga_model.update_sitewide_stats(period_name, "Totals", {'Total pageviews': result_data[0][0]}) - - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits', + metrics='ga:pageviews', + sort='-ga:pageviews', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') data = { 'Pages per visit': result_data[0][0], - 'Bounces': result_data[0][1], - 'Average time on site': result_data[0][2], - 'Percent new visits': result_data[0][3], + 'Average time on site': result_data[0][1], + 'New visits': result_data[0][2], + 'Total visits': result_data[0][3], } - ga_model.update_sitewide_stats(period_name, "Totals", data) - - - def _locale_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) + + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), + start_date=start_date, + metrics='ga:visitBounceRate', + dimensions='ga:pagePath', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces = float(results[1]) + # visitBounceRate is already a % + log.info('Google reports visitBounceRate as %s', bounces) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': float(bounces)}, + period_complete_day) + + + def _locale_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -199,42 +292,110 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Languages", data) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Country", data) - - - def _social_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day) + + + def _download_stats(self, start_date, end_date, period_name, period_complete_day): + """ Fetches stats about data downloads """ + import ckan.model as model + + data = {} + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data: + # We may not have data for this time period, so we need to bail + # early. + log.info("There is no download data for this time period") + return + + def process_result_data(result_data, cached=False): + progress_total = len(result_data) + progress_count = 0 + resources_not_matched = [] + for result in result_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + url = result[0].strip() + + # Get package id associated with the resource that has this URL. + q = model.Session.query(model.Resource) + if cached: + r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() + else: + r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + package_name = r.resource_group.package.name if r else "" + if package_name: + data[package_name] = data.get(package_name, 0) + int(result[1]) + else: + resources_not_matched.append(url) + continue + if resources_not_matched: + log.debug('Could not match %i or %i resource URLs to datasets. e.g. %r', + len(resources_not_matched), progress_total, resources_not_matched[:3]) + + log.info('Associating downloads of resource URLs with their respective datasets') + process_result_data(results.get('rows')) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download-cache', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + log.info('Associating downloads of cache resource URLs with their respective datasets') + process_result_data(results.get('rows'), cached=False) + + self._filter_out_long_tail(data, MIN_DOWNLOADS) + ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day) + + def _social_stats(self, start_date, end_date, period_name, period_complete_day): """ Finds out which social sites people are referred from """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:socialNetwork,ga:referralPath", max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - twitter_links = [] data = {} for result in result_data: if not result[0] == '(not set)': data[result[0]] = data.get(result[0], 0) + int(result[2]) - if result[0] == 'Twitter': - twitter_links.append(result[1]) - ga_model.update_sitewide_stats(period_name, "Social sources", data) - - - def _os_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, 3) + ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day) + + + def _os_stats(self, start_date, end_date, period_name, period_complete_day): """ Operating system stats """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -242,46 +403,73 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Operating Systems", data) - - data = {} - for result in result_data: - key = "%s (%s)" % (result[0],result[1]) - data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) - - - def _browser_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day) + + data = {} + for result in result_data: + if int(result[2]) >= MIN_VIEWS: + key = "%s %s" % (result[0],result[1]) + data[key] = result[2] + ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day) + + + def _browser_stats(self, start_date, end_date, period_name, period_complete_day): """ Information about browsers and browser versions """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() result_data = results.get('rows') + # e.g. [u'Firefox', u'19.0', u'20'] + data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Browsers", data) - - data = {} - for result in result_data: - key = "%s (%s)" % (result[0], result[1]) - data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Browser versions", data) - - - def _mobile_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) + + data = {} + for result in result_data: + key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) + data[key] = data.get(key, 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day) + + @classmethod + def _filter_browser_version(cls, browser, version_str): + ''' + Simplifies a browser version string if it is detailed. + i.e. groups together Firefox 3.5.1 and 3.5.2 to be just 3. + This is helpful when viewing stats and good to protect privacy. + ''' + ver = version_str + parts = ver.split('.') + if len(parts) > 1: + if parts[1][0] == '0': + ver = parts[0] + else: + ver = "%s" % (parts[0]) + # Special case complex version nums + if browser in ['Safari', 'Android Browser']: + ver = parts[0] + if len(ver) > 2: + num_hidden_digits = len(ver) - 2 + ver = ver[0] + ver[1] + 'X' * num_hidden_digits + return ver + + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): """ Info about mobile devices """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() @@ -290,10 +478,23 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Mobile brands", data) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Mobile devices", data) - + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) + + @classmethod + def _filter_out_long_tail(cls, data, threshold=10): + ''' + Given data which is a frequency distribution, filter out + results which are below a threshold count. This is good to protect + privacy. + ''' + for key, value in data.items(): + if value < threshold: + del data[key] + --- a/ckanext/ga_report/ga_auth.py +++ b/ckanext/ga_report/ga_auth.py @@ -53,7 +53,11 @@ return None accountName = config.get('googleanalytics.account') + if not accountName: + raise Exception('googleanalytics.account needs to be configured') webPropertyId = config.get('googleanalytics.id') + if not webPropertyId: + raise Exception('googleanalytics.id needs to be configured') for acc in accounts.get('items'): if acc.get('name') == accountName: accountId = acc.get('id') --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -1,19 +1,21 @@ import re import uuid -from sqlalchemy import Table, Column, MetaData +from sqlalchemy import Table, Column, MetaData, ForeignKey from sqlalchemy import types from sqlalchemy.sql import select -from sqlalchemy.orm import mapper +from sqlalchemy.orm import mapper, relation from sqlalchemy import func import ckan.model as model from ckan.lib.base import * +log = __import__('logging').getLogger(__name__) + def make_uuid(): return unicode(uuid.uuid4()) - +metadata = MetaData() class GA_Url(object): @@ -21,41 +23,42 @@ for k,v in kwargs.items(): setattr(self, k, v) -class GA_Stat(object): - - def __init__(self, **kwargs): - for k,v in kwargs.items(): - setattr(self, k, v) - -class GA_Publisher(object): - - def __init__(self, **kwargs): - for k,v in kwargs.items(): - setattr(self, k, v) - - -metadata = MetaData() url_table = Table('ga_url', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('period_name', types.UnicodeText), Column('period_complete_day', types.Integer), Column('pageviews', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('url', types.UnicodeText), Column('department_id', types.UnicodeText), + Column('package_id', types.UnicodeText), ) mapper(GA_Url, url_table) + + +class GA_Stat(object): + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) stat_table = Table('ga_stat', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('period_name', types.UnicodeText), + Column('period_complete_day', types.UnicodeText), Column('stat_name', types.UnicodeText), Column('key', types.UnicodeText), Column('value', types.UnicodeText), ) mapper(GA_Stat, stat_table) + +class GA_Publisher(object): + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) pub_table = Table('ga_publisher', metadata, Column('id', types.UnicodeText, primary_key=True, @@ -63,12 +66,30 @@ Column('period_name', types.UnicodeText), Column('publisher_name', types.UnicodeText), Column('views', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('toplevel', types.Boolean, default=False), Column('subpublishercount', types.Integer, default=0), Column('parent', types.UnicodeText), ) mapper(GA_Publisher, pub_table) + + +class GA_ReferralStat(object): + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) + +referrer_table = Table('ga_referrer', metadata, + Column('id', types.UnicodeText, primary_key=True, + default=make_uuid), + Column('period_name', types.UnicodeText), + Column('source', types.UnicodeText), + Column('url', types.UnicodeText), + Column('count', types.Integer), + ) +mapper(GA_ReferralStat, referrer_table) + def init_tables(): @@ -93,11 +114,10 @@ >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') '/dataset/weekly_fuel_prices' ''' - url = re.sub('https?://(www\.)?data.gov.uk', '', url) - return url - - -def _get_department_id_of_url(url): + return '/' + '/'.join(url.split('/')[3:]) + + +def _get_package_and_publisher(url): # e.g. /dataset/fuel_prices # e.g. /dataset/fuel_prices/resource/e63380d4 dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) @@ -107,14 +127,15 @@ if dataset: publisher_groups = dataset.get_groups('publisher') if publisher_groups: - return publisher_groups[0].name + return dataset_ref,publisher_groups[0].name + return dataset_ref, None else: publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) if publisher_match: - return publisher_match.groups()[0] - - -def update_sitewide_stats(period_name, stat_name, data): + return None, publisher_match.groups()[0] + return None, None + +def update_sitewide_stats(period_name, stat_name, data, period_complete_day): for k,v in data.iteritems(): item = model.Session.query(GA_Stat).\ filter(GA_Stat.period_name==period_name).\ @@ -124,11 +145,13 @@ item.period_name = period_name item.key = k item.value = v + item.period_complete_day = period_complete_day model.Session.add(item) else: # create the row values = {'id': make_uuid(), 'period_name': period_name, + 'period_complete_day': period_complete_day, 'key': k, 'value': v, 'stat_name': stat_name @@ -137,36 +160,160 @@ model.Session.commit() +def pre_update_url_stats(period_name): + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name) + log.debug("Deleting %d '%s' records" % (q.count(), period_name)) + q.delete() + + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All') + log.debug("Deleting %d 'All' records..." % q.count()) + q.delete() + + model.Session.flush() + model.Session.commit() + model.repo.commit_and_remove() + log.debug('...done') + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. If not then create one. + + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. + """ + log.debug('Post-processing "All" records...') + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + progress_total = len(views.keys()) + progress_count = 0 + for key in views.keys(): + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(key) + + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': 0, + 'url': key, + 'pageviews': views[key], + 'visits': visits[key], + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) + model.Session.commit() + log.debug('..done') + def update_url_stats(period_name, period_complete_day, url_data): - for url, views, visitors in url_data: - url = _normalize_url(url) - department_id = _get_department_id_of_url(url) - - # see if the row for this url & month is in the table already + ''' + Given a list of urls and number of hits for each during a given period, + stores them in GA_Url under the period and recalculates the totals for + the 'All' period. + ''' + progress_total = len(url_data) + progress_count = 0 + for url, views, visits in url_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(url) + item = model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).\ filter(GA_Url.url==url).first() if item: - item.period_name = period_name - item.pageviews = views - item.visitors = visitors - item.department_id = department_id + item.pageviews = item.pageviews + views + item.visits = item.visits + visits + if not item.package_id: + item.package_id = package + if not item.department_id: + item.department_id = publisher model.Session.add(item) else: - # create the row values = {'id': make_uuid(), 'period_name': period_name, 'period_complete_day': period_complete_day, 'url': url, 'pageviews': views, - 'visitors': visitors, - 'department_id': department_id + 'visits': visits, + 'department_id': publisher, + 'package_id': package } model.Session.add(GA_Url(**values)) model.Session.commit() - + if package: + old_pageviews, old_visits = 0, 0 + old = model.Session.query(GA_Url).\ + filter(GA_Url.period_name=='All').\ + filter(GA_Url.url==url).all() + old_pageviews = sum([int(o.pageviews) for o in old]) + old_visits = sum([int(o.visits) for o in old]) + + entries = model.Session.query(GA_Url).\ + filter(GA_Url.period_name!='All').\ + filter(GA_Url.url==url).all() + values = {'id': make_uuid(), + 'period_name': 'All', + 'period_complete_day': 0, + 'url': url, + 'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews), + 'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits), + 'department_id': publisher, + 'package_id': package + } + + model.Session.add(GA_Url(**values)) + model.Session.commit() + + + + +def update_social(period_name, data): + # Clean up first. + model.Session.query(GA_ReferralStat).\ + filter(GA_ReferralStat.period_name==period_name).delete() + + for url,data in data.iteritems(): + for entry in data: + source = entry[0] + count = entry[1] + + item = model.Session.query(GA_ReferralStat).\ + filter(GA_ReferralStat.period_name==period_name).\ + filter(GA_ReferralStat.source==source).\ + filter(GA_ReferralStat.url==url).first() + if item: + item.count = item.count + count + model.Session.add(item) + else: + # create the row + values = {'id': make_uuid(), + 'period_name': period_name, + 'source': source, + 'url': url, + 'count': count, + } + model.Session.add(GA_ReferralStat(**values)) + model.Session.commit() def update_publisher_stats(period_name): """ @@ -179,7 +326,7 @@ filter(model.Group.type=='publisher').\ filter(model.Group.state=='active').all() for publisher in publishers: - views, visitors, subpub = update_publisher(period_name, publisher, publisher.name) + views, visits, subpub = update_publisher(period_name, publisher, publisher.name) parent, parents = '', publisher.get_groups('publisher') if parents: parent = parents[0].name @@ -188,7 +335,7 @@ filter(GA_Publisher.publisher_name==publisher.name).first() if item: item.views = views - item.visitors = visitors + item.visits = visits item.publisher_name = publisher.name item.toplevel = publisher in toplevel item.subpublishercount = subpub @@ -200,7 +347,7 @@ 'period_name': period_name, 'publisher_name': publisher.name, 'views': views, - 'visitors': visitors, + 'visits': visits, 'toplevel': publisher in toplevel, 'subpublishercount': subpub, 'parent': parent @@ -210,7 +357,7 @@ def update_publisher(period_name, pub, part=''): - views,visitors,subpub = 0, 0, 0 + views,visits,subpub = 0, 0, 0 for publisher in go_down_tree(pub): subpub = subpub + 1 items = model.Session.query(GA_Url).\ @@ -218,9 +365,9 @@ filter(GA_Url.department_id==publisher.name).all() for item in items: views = views + int(item.pageviews) - visitors = visitors + int(item.visitors) - - return views, visitors, (subpub-1) + visits = visits + int(item.visits) + + return views, visits, (subpub-1) def get_top_level(): @@ -248,3 +395,46 @@ for grandchild in go_down_tree(child): yield grandchild +def delete(period_name): + ''' + Deletes table data for the specified period, or specify 'all' + for all periods. + ''' + for object_type in (GA_Url, GA_Stat, GA_Publisher, GA_ReferralStat): + q = model.Session.query(object_type) + if period_name != 'All': + q = q.filter_by(period_name=period_name) + q.delete() + model.repo.commit_and_remove() + +def get_score_for_dataset(dataset_name): + ''' + Returns a "current popularity" score for a dataset, + based on how many views it has had recently. + ''' + import datetime + now = datetime.datetime.now() + last_month = now - datetime.timedelta(days=30) + period_names = ['%s-%02d' % (last_month.year, last_month.month), + '%s-%02d' % (now.year, now.month), + ] + + score = 0 + for period_name in period_names: + score /= 2 # previous periods are discounted by 50% + entry = model.Session.query(GA_Url)\ + .filter(GA_Url.period_name==period_name)\ + .filter(GA_Url.package_id==dataset_name).first() + # score + if entry: + views = float(entry.pageviews) + if entry.period_complete_day: + views_per_day = views / entry.period_complete_day + else: + views_per_day = views / 15 # guess + score += views_per_day + + score = int(score * 100) + log.debug('Popularity %s: %s', score, dataset_name) + return score + --- /dev/null +++ b/ckanext/ga_report/helpers.py @@ -1,1 +1,123 @@ +import logging +import operator +import ckan.lib.base as base +import ckan.model as model +from ckan.logic import get_action + +from ckanext.ga_report.ga_model import GA_Url, GA_Publisher +from ckanext.ga_report.controller import _get_publishers +_log = logging.getLogger(__name__) + +def popular_datasets(count=10): + import random + + publisher = None + publishers = _get_publishers(30) + total = len(publishers) + while not publisher or not datasets: + rand = random.randrange(0, total) + publisher = publishers[rand][0] + if not publisher.state == 'active': + publisher = None + continue + datasets = _datasets_for_publisher(publisher, 10)[:count] + + ctx = { + 'datasets': datasets, + 'publisher': publisher + } + return base.render_snippet('ga_report/ga_popular_datasets.html', **ctx) + +def single_popular_dataset(top=20): + '''Returns a random dataset from the most popular ones. + + :param top: the number of top datasets to select from + ''' + import random + + top_datasets = model.Session.query(GA_Url).\ + filter(GA_Url.url.like('/dataset/%')).\ + order_by('ga_url.pageviews::int desc') + num_top_datasets = top_datasets.count() + + dataset = None + if num_top_datasets: + count = 0 + while not dataset: + rand = random.randrange(0, min(top, num_top_datasets)) + ga_url = top_datasets[rand] + dataset = model.Package.get(ga_url.url[len('/dataset/'):]) + if dataset and not dataset.state == 'active': + dataset = None + # When testing, it is possible that top datasets are not available + # so only go round this loop a few times before falling back on + # a random dataset. + count += 1 + if count > 10: + break + if not dataset: + # fallback + dataset = model.Session.query(model.Package)\ + .filter_by(state='active').first() + if not dataset: + return None + dataset_dict = get_action('package_show')({'model': model, + 'session': model.Session, + 'validate': False}, + {'id':dataset.id}) + return dataset_dict + +def single_popular_dataset_html(top=20): + dataset_dict = single_popular_dataset(top) + groups = package.get('groups', []) + publishers = [ g for g in groups if g.get('type') == 'publisher' ] + publisher = publishers[0] if publishers else {'name':'', 'title': ''} + context = { + 'dataset': dataset_dict, + 'publisher': publisher_dict + } + return base.render_snippet('ga_report/ga_popular_single.html', **context) + + +def most_popular_datasets(publisher, count=20): + + if not publisher: + _log.error("No valid publisher passed to 'most_popular_datasets'") + return "" + + results = _datasets_for_publisher(publisher, count) + + ctx = { + 'dataset_count': len(results), + 'datasets': results, + + 'publisher': publisher + } + + return base.render_snippet('ga_report/publisher/popular.html', **ctx) + +def _datasets_for_publisher(publisher, count): + datasets = {} + entries = model.Session.query(GA_Url).\ + filter(GA_Url.department_id==publisher.name).\ + filter(GA_Url.url.like('/dataset/%')).\ + order_by('ga_url.pageviews::int desc').all() + for entry in entries: + if len(datasets) < count: + p = model.Package.get(entry.url[len('/dataset/'):]) + if not p: + _log.warning("Could not find Package for {url}".format(url=entry.url)) + continue + + if not p in datasets: + datasets[p] = {'views':0, 'visits': 0} + datasets[p]['views'] = datasets[p]['views'] + int(entry.pageviews) + datasets[p]['visits'] = datasets[p]['visits'] + int(entry.visits) + + results = [] + for k, v in datasets.iteritems(): + results.append((k,v['views'],v['visits'])) + + return sorted(results, key=operator.itemgetter(1), reverse=True) + --- a/ckanext/ga_report/plugin.py +++ b/ckanext/ga_report/plugin.py @@ -1,25 +1,83 @@ import logging import ckan.lib.helpers as h +import ckan.plugins as p from ckan.plugins import implements, toolkit -import gasnippet -import commands -import dbutil + +from ckanext.ga_report.helpers import (most_popular_datasets, + popular_datasets, + single_popular_dataset) log = logging.getLogger('ckanext.ga-report') -class GoogleAnalyticsPlugin(p.SingletonPlugin): +class GAReportPlugin(p.SingletonPlugin): implements(p.IConfigurer, inherit=True) implements(p.IRoutes, inherit=True) + implements(p.ITemplateHelpers, inherit=True) def update_config(self, config): toolkit.add_template_directory(config, 'templates') toolkit.add_public_directory(config, 'public') + def get_helpers(self): + """ + A dictionary of extra helpers that will be available to provide + ga report info to templates. + """ + return { + 'ga_report_installed': lambda: True, + 'popular_datasets': popular_datasets, + 'most_popular_datasets': most_popular_datasets, + 'single_popular_dataset': single_popular_dataset + } + def after_map(self, map): + # GaReport map.connect( - '/data/analytics/index', - controller='ckanext.ga-report.controller:GaReport', + '/data/site-usage', + controller='ckanext.ga_report.controller:GaReport', action='index' + ) + map.connect( + '/data/site-usage/data_{month}.csv', + controller='ckanext.ga_report.controller:GaReport', + action='csv' + ) + map.connect( + '/data/site-usage/downloads', + controller='ckanext.ga_report.controller:GaReport', + action='downloads' + ) + map.connect( + '/data/site-usage/downloads_{month}.csv', + controller='ckanext.ga_report.controller:GaReport', + action='csv_downloads' + ) + + # GaDatasetReport + map.connect( + '/data/site-usage/publisher', + controller='ckanext.ga_report.controller:GaDatasetReport', + action='publishers' + ) + map.connect( + '/data/site-usage/publishers_{month}.csv', + controller='ckanext.ga_report.controller:GaDatasetReport', + action='publisher_csv' + ) + map.connect( + '/data/site-usage/dataset/datasets_{id}_{month}.csv', + controller='ckanext.ga_report.controller:GaDatasetReport', + action='dataset_csv' + ) + map.connect( + '/data/site-usage/dataset', + controller='ckanext.ga_report.controller:GaDatasetReport', + action='read' + ) + map.connect( + '/data/site-usage/dataset/{id}', + controller='ckanext.ga_report.controller:GaDatasetReport', + action='read_publisher' ) return map --- /dev/null +++ b/ckanext/ga_report/public/css/ga_report.css @@ -1,1 +1,59 @@ +.table-condensed td.sparkline-cell { + padding: 1px 0 0 0; + width: 108px; + text-align: center; +} +.rickshaw_chart_container { + position: relative; + height: 350px; + margin: 0 auto 20px auto; +} +.rickshaw_chart { + position: absolute; + left: 40px; + width: 500px; + top: 0; + bottom: 0; +} +.rickshaw_legend { + background: transparent; + width: 100%; + padding-top: 4px; +} +.rickshaw_y_axis { + position: absolute; + top: 0; + bottom: 0; + width: 40px; +} +.rickshaw_legend .label { + background: transparent !important; + color: #000000 !important; + font-weight: normal !important; +} +.rickshaw_legend .instructions { + color: #000; + margin-bottom: 6px; +} +.rickshaw_legend .line .action { + display: none; +} +.rickshaw_legend .line .swatch { + display: block; + float: left; +} +.rickshaw_legend .line .label { + display: block; + white-space: normal; + float: left; + width: 200px; +} +.rickshaw_legend .line .label:hover { + text-decoration: underline; +} + +.ga-reports-table .td-numeric { + text-align: center; +} + --- /dev/null +++ b/ckanext/ga_report/public/scripts/ckanext_ga_reports.js @@ -1,1 +1,134 @@ +var CKAN = CKAN || {}; +CKAN.GA_Reports = {}; +CKAN.GA_Reports.render_rickshaw = function( css_name, data, mode, colorscheme ) { + var graphLegends = $('#graph-legend-container'); + + if (!Modernizr.svg) { + $("#chart_"+css_name) + .html( '
Your browser does not support vector graphics. No graphs can be rendered.
') + .closest('.rickshaw_chart_container').css('height',50); + var myLegend = $('
') + .html('(Graph cannot be rendered)') + .appendTo(graphLegends); + return; + } + var myLegend = $('
').appendTo(graphLegends); + + var palette = new Rickshaw.Color.Palette( { scheme: colorscheme } ); + $.each(data, function(i, object) { + object['color'] = palette.color(); + }); + // Rickshaw renders the legend in reverse order... + data.reverse(); + + var graphElement = document.querySelector("#chart_"+css_name); + + var graph = new Rickshaw.Graph( { + element: document.querySelector("#chart_"+css_name), + renderer: mode, + series: data , + height: 328 + }); + var x_axis = new Rickshaw.Graph.Axis.Time( { graph: graph } ); + var y_axis = new Rickshaw.Graph.Axis.Y( { + graph: graph, + orientation: 'left', + tickFormat: Rickshaw.Fixtures.Number.formatKMBT, + element: document.getElementById('y_axis_'+css_name) + } ); + var legend = new Rickshaw.Graph.Legend( { + element: document.querySelector('#legend_'+css_name), + graph: graph + } ); + var shelving = new Rickshaw.Graph.Behavior.Series.Toggle( { + graph: graph, + legend: legend + } ); + myLegend.prepend('
Click on a series below to isolate its graph:
'); + graph.render(); +}; + +CKAN.GA_Reports.bind_sparklines = function() { + /* + * Bind to the 'totals' tab being on screen, when the + * Sparkline graphs should be drawn. + * Note that they cannot be drawn sooner. + */ + $('a[href="#totals"]').on( + 'shown', + function() { + var sparkOptions = { + enableTagOptions: true, + type: 'line', + width: 100, + height: 26, + chartRangeMin: 0, + spotColor: '', + maxSpotColor: '', + minSpotColor: '', + highlightSpotColor: '000000', + lineColor: '3F8E6D', + fillColor: 'B7E66B' + }; + $('.sparkline').sparkline('html',sparkOptions); + } + ); +}; + +CKAN.GA_Reports.bind_sidebar = function() { + /* + * Bind to changes in the tab behaviour: + * Show the correct rickshaw graph in the sidebar. + * Not to be called before all graphs load. + */ + $('a[data-toggle="hashchange"]').on( + 'shown', + function(e) { + var href = $(e.target).attr('href'); + var pane = $(href); + if (!pane.length) { console.err('bad href',href); return; } + var legend_name = "none"; + var graph = pane.find('.rickshaw_chart'); + if (graph.length) { + legend_name = graph.attr('id').replace('chart_',''); + } + legend_name = '#legend_'+legend_name; + $('#graph-legend-container > *').hide(); + $('#graph-legend-container .instructions').show(); + $(legend_name).show(); + } + ); +}; + +CKAN.GA_Reports.bind_month_selector = function() { + var handler = function(e) { + var target = $(e.delegateTarget); + var form = target.closest('form'); + var url = form.attr('action')+'?month='+target.val()+window.location.hash; + window.location = url; + }; + var selectors = $('select[name="month"]'); + assert(selectors.length>0); + selectors.bind('change', handler); +}; + +/* + * Custom bootstrap plugin for handling data-toggle="hashchange". + * Behaves like data-toggle="tab" but I respond to the hashchange. + * Page state is memo-ized in the URL this way. Why doesn't Bootstrap do this? + */ +$(function() { + var mapping = {}; + $('a[data-toggle="hashchange"]').each( + function(i,link) { + link = $(link); + mapping[link.attr('href')] = link; + } + ); + $(window).hashchange(function() { + var link = mapping[window.location.hash]; + if (link) { link.tab('show'); } + }); +}); + --- /dev/null +++ b/ckanext/ga_report/public/scripts/rickshaw_ie7_shim.js @@ -1,1 +1,109 @@ +/* + * Collection of shims to allow d3 and Rickshaw to load, error-free + * (but ultimately unusable) on Internet Explorer 7. The browser's + * API lacks several crucial functions which these libraries depend + * upon to load; we try to hide these errors from the user. + * + * With thanks to Array functions from: + * http://stackoverflow.com/questions/2790001/fixing-javascript-array-functions-in-internet-explorer-indexof-foreach-etc + * + * Use (Modernizr.svg==true) to detect whether it's okay to draw a graph. + */ +'use strict'; +window.Element = window.Element || {'prototype': {}}; +window.CSSStyleDeclaration = window.CSSStyleDeclaration || {'prototype':{}}; + +// Add ECMA262-5 method binding if not supported natively +// +if (!('bind' in Function.prototype)) { + Function.prototype.bind= function(owner) { + var that= this; + if (arguments.length<=1) { + return function() { + return that.apply(owner, arguments); + }; + } else { + var args= Array.prototype.slice.call(arguments, 1); + return function() { + return that.apply(owner, arguments.length===0? args : args.concat(Array.prototype.slice.call(arguments))); + }; + } + }; +} + +// Add ECMA262-5 string trim if not supported natively +// +if (!('trim' in String.prototype)) { + String.prototype.trim= function() { + return this.replace(/^\s+/, '').replace(/\s+$/, ''); + }; +} + +// Add ECMA262-5 Array methods if not supported natively +// +if (!('indexOf' in Array.prototype)) { + Array.prototype.indexOf= function(find, i /*opt*/) { + if (i===undefined) i= 0; + if (i<0) i+= this.length; + if (i<0) i= 0; + for (var n= this.length; ithis.length-1) i= this.length-1; + for (i++; i-->0;) /* i++ because from-argument is sadly inclusive */ + if (i in this && this[i]===find) + return i; + return -1; + }; +} +if (!('forEach' in Array.prototype)) { + Array.prototype.forEach= function(action, that /*opt*/) { + for (var i= 0, n= this.length; id&&(c=b,d=e);return c}function u(a){return a.reduce(v,0)}function v(a,b){return a+b[1]}function w(a,b){return x(a,Math.ceil(Math.log(b.length)/Math.LN2+1))}function x(a,b){var c=-1,d=+a[0],e=(a[1]-d)/b,f=[];while(++c<=b)f[c]=e*c+d;return f}function y(a){return[d3.min(a),d3.max(a)]}function z(a,b){return a.sort=d3.rebind(a,b.sort),a.children=d3.rebind(a,b.children),a.links=D,a.value=d3.rebind(a,b.value),a.nodes=function(b){return E=!0,(a.nodes=a)(b)},a}function A(a){return a.children}function B(a){return a.value}function C(a,b){return b.value-a.value}function D(a){return d3.merge(a.map(function(a){return(a.children||[]).map(function(b){return{source:a,target:b}})}))}function F(a,b){return a.value-b.value}function G(a,b){var c=a._pack_next;a._pack_next=b,b._pack_prev=a,b._pack_next=c,c._pack_prev=b}function H(a,b){a._pack_next=b,b._pack_prev=a}function I(a,b){var c=b.x-a.x,d=b.y-a.y,e=a.r+b.r;return e*e-c*c-d*d>.001}function J(a){function l(a){b=Math.min(a.x-a.r,b),c=Math.max(a.x+a.r,c),d=Math.min(a.y-a.r,d),e=Math.max(a.y+a.r,e)}var b=Infinity,c=-Infinity,d=Infinity,e=-Infinity,f=a.length,g,h,i,j,k;a.forEach(K),g=a[0],g.x=-g.r,g.y=0,l(g);if(f>1){h=a[1],h.x=h.r,h.y=0,l(h);if(f>2){i=a[2],O(g,h,i),l(i),G(g,i),g._pack_prev=i,G(i,h),h=g._pack_next;for(var m=3;m0?(H(g,j),h=j,m--):(H(j,h),g=j,m--)}}}var q=(b+c)/2,r=(d+e)/2,s=0;for(var m=0;m0&&(a=d)}return a}function X(a,b){return a.x-b.x}function Y(a,b){return b.x-a.x}function Z(a,b){return a.depth-b.depth}function $(a,b){function c(a,d){var e=a.children;if(e&&(i=e.length)){var f,g=null,h=-1,i;while(++h=0)f=d[e]._tree,f.prelim+=b,f.mod+=b,b+=f.shift+(c+=f.change)}function ba(a,b,c){a=a._tree,b=b._tree;var d=c/(b.number-a.number);a.change+=d,b.change-=d,b.shift+=c,b.prelim+=c,b.mod+=c}function bb(a,b,c){return a._tree.ancestor.parent==b.parent?a._tree.ancestor:c}function bc(a){return{x:a.x,y:a.y,dx:a.dx,dy:a.dy}}function bd(a,b){var c=a.x+b[3],d=a.y+b[0],e=a.dx-b[1]-b[3],f=a.dy-b[0]-b[2];return e<0&&(c+=e/2,e=0),f<0&&(d+=f/2,f=0),{x:c,y:d,dx:e,dy:f}}d3.layout={},d3.layout.bundle=function(){return function(b){var c=[],d=-1,e=b.length;while(++de&&(e=h),d.push(h)}for(g=0;g=i[0]&&o<=i[1]&&(k=g[d3.bisect(j,o,1,m)-1],k.y+=n,k.push(e[f]));return g}var a=!0,b=Number,c=y,d=w;return e.value=function(a){return arguments.length?(b=a,e):b},e.range=function(a){return arguments.length?(c=d3.functor(a),e):c},e.bins=function(a){return arguments.length?(d=typeof a=="number"?function(b){return x(b,a)}:d3.functor(a),e):d},e.frequency=function(b){return arguments.length?(a=!!b,e):a},e},d3.layout.hierarchy=function(){function e(f,h,i){var j=b.call(g,f,h),k=E?f:{data:f};k.depth=h,i.push(k);if(j&&(m=j.length)){var l=-1,m,n=k.children=[],o=0,p=h+1;while(++l0&&(ba(bb(g,a,d),a,m),i+=m,j+=m),k+=g._tree.mod,i+=e._tree.mod,l+=h._tree.mod,j+=f._tree.mod;g&&!V(f)&&(f._tree.thread=g,f._tree.mod+=k-j),e&&!U(h)&&(h._tree.thread=e,h._tree.mod+=i-l,d=a)}return d}var f=a.call(this,d,e),g=f[0];$(g,function(a,b){a._tree={ancestor:a,prelim:0,mod:0,change:0,shift:0,number:b?b._tree.number+1:0}}),h(g),i(g,-g._tree.prelim);var k=W(g,Y),l=W(g,X),m=W(g,Z),n=k.x-b(k,l)/2,o=l.x+b(l,k)/2,p=m.depth||1;return $(g,function(a){a.x=(a.x-n)/(o-n)*c[0],a.y=a.depth/p*c[1],delete a._tree}),f}var a=d3.layout.hierarchy().sort(null).value(null),b=T,c=[1,1];return d.separation=function(a){return arguments.length?(b=a,d):b},d.size=function(a){return arguments.length?(c=a,d):c},z(d,a)},d3.layout.treemap=function(){function i(a,b){var c=-1,d=a.length,e,f;while(++c0)d.push(g=f[o-1]),d.area+=g.area,(k=l(d,n))<=h?(f.pop(),h=k):(d.area-=d.pop().area,m(d,n,c,!1),n=Math.min(c.dx,c.dy),d.length=d.area=0,h=Infinity);d.length&&(m(d,n,c,!0),d.length=d.area=0),b.forEach(j)}}function k(a){var b=a.children;if(b&&b.length){var c=e(a),d=b.slice(),f,g=[];i(d,c.dx*c.dy/a.value),g.area=0;while(f=d.pop())g.push(f),g.area+=f.area,f.z!=null&&(m(g,f.z?c.dx:c.dy,c,!d.length),g.length=g.area=0);b.forEach(k)}}function l(a,b){var c=a.area,d,e=0,f=Infinity,g=-1,i=a.length;while(++ge&&(e=d)}return c*=c,b*=b,c?Math.max(b*e*h/c,c/(b*f*h)):Infinity}function m(a,c,d,e){var f=-1,g=a.length,h=d.x,i=d.y,j=c?b(a.area/c):0,k;if(c==d.dx){if(e||j>d.dy)j=j?d.dy:0;while(++fd.dx)j=j?d.dx:0;while(++f= 0 ? value.substring(i) : (i = value.length, ""), t = []; + while (i > 0) t.push(value.substring(i -= 3, i + 3)); + return t.reverse().join(",") + f; + } + function d3_formatPrefix(d, i) { + var k = Math.pow(10, Math.abs(8 - i) * 3); + return { + scale: i > 8 ? function(d) { + return d / k; + } : function(d) { + return d * k; + }, + symbol: d + }; + } + function d3_ease_clamp(f) { + return function(t) { + return t <= 0 ? 0 : t >= 1 ? 1 : f(t); + }; + } + function d3_ease_reverse(f) { + return function(t) { + return 1 - f(1 - t); + }; + } + function d3_ease_reflect(f) { + return function(t) { + return .5 * (t < .5 ? f(2 * t) : 2 - f(2 - 2 * t)); + }; + } + function d3_ease_identity(t) { + return t; + } + function d3_ease_poly(e) { + return function(t) { + return Math.pow(t, e); + }; + } + function d3_ease_sin(t) { + return 1 - Math.cos(t * Math.PI / 2); + } + function d3_ease_exp(t) { + return Math.pow(2, 10 * (t - 1)); + } + function d3_ease_circle(t) { + return 1 - Math.sqrt(1 - t * t); + } + function d3_ease_elastic(a, p) { + var s; + if (arguments.length < 2) p = .45; + if (arguments.length < 1) { + a = 1; + s = p / 4; + } else s = p / (2 * Math.PI) * Math.asin(1 / a); + return function(t) { + return 1 + a * Math.pow(2, 10 * -t) * Math.sin((t - s) * 2 * Math.PI / p); + }; + } + function d3_ease_back(s) { + if (!s) s = 1.70158; + return function(t) { + return t * t * ((s + 1) * t - s); + }; + } + function d3_ease_bounce(t) { + return t < 1 / 2.75 ? 7.5625 * t * t : t < 2 / 2.75 ? 7.5625 * (t -= 1.5 / 2.75) * t + .75 : t < 2.5 / 2.75 ? 7.5625 * (t -= 2.25 / 2.75) * t + .9375 : 7.5625 * (t -= 2.625 / 2.75) * t + .984375; + } + function d3_eventCancel() { + d3.event.stopPropagation(); + d3.event.preventDefault(); + } + function d3_eventSource() { + var e = d3.event, s; + while (s = e.sourceEvent) e = s; + return e; + } + function d3_eventDispatch(target) { + var dispatch = new d3_dispatch, i = 0, n = arguments.length; + while (++i < n) dispatch[arguments[i]] = d3_dispatch_event(dispatch); + dispatch.of = function(thiz, argumentz) { + return function(e1) { + try { + var e0 = e1.sourceEvent = d3.event; + e1.target = target; + d3.event = e1; + dispatch[e1.type].apply(thiz, argumentz); + } finally { + d3.event = e0; + } + }; + }; + return dispatch; + } + function d3_transform(m) { + var r0 = [ m.a, m.b ], r1 = [ m.c, m.d ], kx = d3_transformNormalize(r0), kz = d3_transformDot(r0, r1), ky = d3_transformNormalize(d3_transformCombine(r1, r0, -kz)) || 0; + if (r0[0] * r1[1] < r1[0] * r0[1]) { + r0[0] *= -1; + r0[1] *= -1; + kx *= -1; + kz *= -1; + } + this.rotate = (kx ? Math.atan2(r0[1], r0[0]) : Math.atan2(-r1[0], r1[1])) * d3_transformDegrees; + this.translate = [ m.e, m.f ]; + this.scale = [ kx, ky ]; + this.skew = ky ? Math.atan2(kz, ky) * d3_transformDegrees : 0; + } + function d3_transformDot(a, b) { + return a[0] * b[0] + a[1] * b[1]; + } + function d3_transformNormalize(a) { + var k = Math.sqrt(d3_transformDot(a, a)); + if (k) { + a[0] /= k; + a[1] /= k; + } + return k; + } + function d3_transformCombine(a, b, k) { + a[0] += k * b[0]; + a[1] += k * b[1]; + return a; + } + function d3_interpolateByName(name) { + return name == "transform" ? d3.interpolateTransform : d3.interpolate; + } + function d3_uninterpolateNumber(a, b) { + b = b - (a = +a) ? 1 / (b - a) : 0; + return function(x) { + return (x - a) * b; + }; + } + function d3_uninterpolateClamp(a, b) { + b = b - (a = +a) ? 1 / (b - a) : 0; + return function(x) { + return Math.max(0, Math.min(1, (x - a) * b)); + }; + } + function d3_Color() {} + function d3_rgb(r, g, b) { + return new d3_Rgb(r, g, b); + } + function d3_Rgb(r, g, b) { + this.r = r; + this.g = g; + this.b = b; + } + function d3_rgb_hex(v) { + return v < 16 ? "0" + Math.max(0, v).toString(16) : Math.min(255, v).toString(16); + } + function d3_rgb_parse(format, rgb, hsl) { + var r = 0, g = 0, b = 0, m1, m2, name; + m1 = /([a-z]+)\((.*)\)/i.exec(format); + if (m1) { + m2 = m1[2].split(","); + switch (m1[1]) { + case "hsl": + { + return hsl(parseFloat(m2[0]), parseFloat(m2[1]) / 100, parseFloat(m2[2]) / 100); + } + case "rgb": + { + return rgb(d3_rgb_parseNumber(m2[0]), d3_rgb_parseNumber(m2[1]), d3_rgb_parseNumber(m2[2])); + } + } + } + if (name = d3_rgb_names.get(format)) return rgb(name.r, name.g, name.b); + if (format != null && format.charAt(0) === "#") { + if (format.length === 4) { + r = format.charAt(1); + r += r; + g = format.charAt(2); + g += g; + b = format.charAt(3); + b += b; + } else if (format.length === 7) { + r = format.substring(1, 3); + g = format.substring(3, 5); + b = format.substring(5, 7); + } + r = parseInt(r, 16); + g = parseInt(g, 16); + b = parseInt(b, 16); + } + return rgb(r, g, b); + } + function d3_rgb_hsl(r, g, b) { + var min = Math.min(r /= 255, g /= 255, b /= 255), max = Math.max(r, g, b), d = max - min, h, s, l = (max + min) / 2; + if (d) { + s = l < .5 ? d / (max + min) : d / (2 - max - min); + if (r == max) h = (g - b) / d + (g < b ? 6 : 0); else if (g == max) h = (b - r) / d + 2; else h = (r - g) / d + 4; + h *= 60; + } else { + s = h = 0; + } + return d3_hsl(h, s, l); + } + function d3_rgb_lab(r, g, b) { + r = d3_rgb_xyz(r); + g = d3_rgb_xyz(g); + b = d3_rgb_xyz(b); + var x = d3_xyz_lab((.4124564 * r + .3575761 * g + .1804375 * b) / d3_lab_X), y = d3_xyz_lab((.2126729 * r + .7151522 * g + .072175 * b) / d3_lab_Y), z = d3_xyz_lab((.0193339 * r + .119192 * g + .9503041 * b) / d3_lab_Z); + return d3_lab(116 * y - 16, 500 * (x - y), 200 * (y - z)); + } + function d3_rgb_xyz(r) { + return (r /= 255) <= .04045 ? r / 12.92 : Math.pow((r + .055) / 1.055, 2.4); + } + function d3_rgb_parseNumber(c) { + var f = parseFloat(c); + return c.charAt(c.length - 1) === "%" ? Math.round(f * 2.55) : f; + } + function d3_hsl(h, s, l) { + return new d3_Hsl(h, s, l); + } + function d3_Hsl(h, s, l) { + this.h = h; + this.s = s; + this.l = l; + } + function d3_hsl_rgb(h, s, l) { + function v(h) { + if (h > 360) h -= 360; else if (h < 0) h += 360; + if (h < 60) return m1 + (m2 - m1) * h / 60; + if (h < 180) return m2; + if (h < 240) return m1 + (m2 - m1) * (240 - h) / 60; + return m1; + } + function vv(h) { + return Math.round(v(h) * 255); + } + var m1, m2; + h = h % 360; + if (h < 0) h += 360; + s = s < 0 ? 0 : s > 1 ? 1 : s; + l = l < 0 ? 0 : l > 1 ? 1 : l; + m2 = l <= .5 ? l * (1 + s) : l + s - l * s; + m1 = 2 * l - m2; + return d3_rgb(vv(h + 120), vv(h), vv(h - 120)); + } + function d3_hcl(h, c, l) { + return new d3_Hcl(h, c, l); + } + function d3_Hcl(h, c, l) { + this.h = h; + this.c = c; + this.l = l; + } + function d3_hcl_lab(h, c, l) { + return d3_lab(l, Math.cos(h *= Math.PI / 180) * c, Math.sin(h) * c); + } + function d3_lab(l, a, b) { + return new d3_Lab(l, a, b); + } + function d3_Lab(l, a, b) { + this.l = l; + this.a = a; + this.b = b; + } + function d3_lab_rgb(l, a, b) { + var y = (l + 16) / 116, x = y + a / 500, z = y - b / 200; + x = d3_lab_xyz(x) * d3_lab_X; + y = d3_lab_xyz(y) * d3_lab_Y; + z = d3_lab_xyz(z) * d3_lab_Z; + return d3_rgb(d3_xyz_rgb(3.2404542 * x - 1.5371385 * y - .4985314 * z), d3_xyz_rgb(-.969266 * x + 1.8760108 * y + .041556 * z), d3_xyz_rgb(.0556434 * x - .2040259 * y + 1.0572252 * z)); + } + function d3_lab_hcl(l, a, b) { + return d3_hcl(Math.atan2(b, a) / Math.PI * 180, Math.sqrt(a * a + b * b), l); + } + function d3_lab_xyz(x) { + return x > .206893034 ? x * x * x : (x - 4 / 29) / 7.787037; + } + function d3_xyz_lab(x) { + return x > .008856 ? Math.pow(x, 1 / 3) : 7.787037 * x + 4 / 29; + } + function d3_xyz_rgb(r) { + return Math.round(255 * (r <= .00304 ? 12.92 * r : 1.055 * Math.pow(r, 1 / 2.4) - .055)); + } + function d3_selection(groups) { + d3_arraySubclass(groups, d3_selectionPrototype); + return groups; + } + function d3_selection_selector(selector) { + return function() { + return d3_select(selector, this); + }; + } + function d3_selection_selectorAll(selector) { + return function() { + return d3_selectAll(selector, this); + }; + } + function d3_selection_attr(name, value) { + function attrNull() { + this.removeAttribute(name); + } + function attrNullNS() { + this.removeAttributeNS(name.space, name.local); + } + function attrConstant() { + this.setAttribute(name, value); + } + function attrConstantNS() { + this.setAttributeNS(name.space, name.local, value); + } + function attrFunction() { + var x = value.apply(this, arguments); + if (x == null) this.removeAttribute(name); else this.setAttribute(name, x); + } + function attrFunctionNS() { + var x = value.apply(this, arguments); + if (x == null) this.removeAttributeNS(name.space, name.local); else this.setAttributeNS(name.space, name.local, x); + } + name = d3.ns.qualify(name); + return value == null ? name.local ? attrNullNS : attrNull : typeof value === "function" ? name.local ? attrFunctionNS : attrFunction : name.local ? attrConstantNS : attrConstant; + } + function d3_selection_classedRe(name) { + return new RegExp("(?:^|\\s+)" + d3.requote(name) + "(?:\\s+|$)", "g"); + } + function d3_selection_classed(name, value) { + function classedConstant() { + var i = -1; + while (++i < n) name[i](this, value); + } + function classedFunction() { + var i = -1, x = value.apply(this, arguments); + while (++i < n) name[i](this, x); + } + name = name.trim().split(/\s+/).map(d3_selection_classedName); + var n = name.length; + return typeof value === "function" ? classedFunction : classedConstant; + } + function d3_selection_classedName(name) { + var re = d3_selection_classedRe(name); + return function(node, value) { + if (c = node.classList) return value ? c.add(name) : c.remove(name); + var c = node.className, cb = c.baseVal != null, cv = cb ? c.baseVal : c; + if (value) { + re.lastIndex = 0; + if (!re.test(cv)) { + cv = d3_collapse(cv + " " + name); + if (cb) c.baseVal = cv; else node.className = cv; + } + } else if (cv) { + cv = d3_collapse(cv.replace(re, " ")); + if (cb) c.baseVal = cv; else node.className = cv; + } + }; + } + function d3_selection_style(name, value, priority) { + function styleNull() { + this.style.removeProperty(name); + } + function styleConstant() { + this.style.setProperty(name, value, priority); + } + function styleFunction() { + var x = value.apply(this, arguments); + if (x == null) this.style.removeProperty(name); else this.style.setProperty(name, x, priority); + } + return value == null ? styleNull : typeof value === "function" ? styleFunction : styleConstant; + } + function d3_selection_property(name, value) { + function propertyNull() { + delete this[name]; + } + function propertyConstant() { + this[name] = value; + } + function propertyFunction() { + var x = value.apply(this, arguments); + if (x == null) delete this[name]; else this[name] = x; + } + return value == null ? propertyNull : typeof value === "function" ? propertyFunction : propertyConstant; + } + function d3_selection_dataNode(data) { + return { + __data__: data + }; + } + function d3_selection_filter(selector) { + return function() { + return d3_selectMatches(this, selector); + }; + } + function d3_selection_sortComparator(comparator) { + if (!arguments.length) comparator = d3.ascending; + return function(a, b) { + return comparator(a && a.__data__, b && b.__data__); + }; + } + function d3_selection_on(type, listener, capture) { + function onRemove() { + var wrapper = this[name]; + if (wrapper) { + this.removeEventListener(type, wrapper, wrapper.$); + delete this[name]; + } + } + function onAdd() { + function wrapper(e) { + var o = d3.event; + d3.event = e; + args[0] = node.__data__; + try { + listener.apply(node, args); + } finally { + d3.event = o; + } + } + var node = this, args = arguments; + onRemove.call(this); + this.addEventListener(type, this[name] = wrapper, wrapper.$ = capture); + wrapper._ = listener; + } + var name = "__on" + type, i = type.indexOf("."); + if (i > 0) type = type.substring(0, i); + return listener ? onAdd : onRemove; + } + function d3_selection_each(groups, callback) { + for (var j = 0, m = groups.length; j < m; j++) { + for (var group = groups[j], i = 0, n = group.length, node; i < n; i++) { + if (node = group[i]) callback(node, i, j); + } + } + return groups; + } + function d3_selection_enter(selection) { + d3_arraySubclass(selection, d3_selection_enterPrototype); + return selection; + } + function d3_transition(groups, id, time) { + d3_arraySubclass(groups, d3_transitionPrototype); + var tweens = new d3_Map, event = d3.dispatch("start", "end"), ease = d3_transitionEase; + groups.id = id; + groups.time = time; + groups.tween = function(name, tween) { + if (arguments.length < 2) return tweens.get(name); + if (tween == null) tween