From: Tom Rees Date: Thu, 14 Mar 2013 14:38:24 +0000 Subject: [noticket] Hide momentary flash of text on sparkline cells. X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=6fbba7da539dda71174745a285b52967353d7f00 --- [noticket] Hide momentary flash of text on sparkline cells. --- --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ *.py[co] +*.py~ +.gitignore +ckan.log # Packages *.egg @@ -13,6 +16,10 @@ develop-eggs .installed.cfg +# Private info +credentials.json +token.dat + # Installer logs pip-log.txt --- a/README.md +++ /dev/null @@ -1,4 +1,1 @@ -ckanext-ga-report -================= -For creating detailed reports of CKAN analytics, sliced by group --- /dev/null +++ b/README.rst @@ -1,1 +1,114 @@ +ckanext-ga-report +================= +**Status:** Development + +**CKAN Version:** 1.7.1+ + + +Overview +-------- + +For creating detailed reports of CKAN analytics, including totals per group. + +Whereas ckanext-googleanalytics focusses on providing page view stats a recent period and for all time (aimed at end users), ckanext-ga-report is more interested in building regular periodic reports (more for site managers to monitor). + +Contents of this extension: + + * Use the CLI tool to download Google Analytics data for each time period into this extension's database tables + + * Users can view the data as web page reports + + +Installation +------------ + +1. Activate you CKAN python environment and install this extension's software:: + + $ pyenv/bin/activate + $ pip install -e git+https://github.com/datagovuk/ckanext-ga-report.git#egg=ckanext-ga-report + +2. Ensure you development.ini (or similar) contains the info about your Google Analytics account and configuration:: + + googleanalytics.id = UA-1010101-1 + googleanalytics.account = Account name (e.g. 
data.gov.uk, see top level item at https://www.google.com/analytics) + googleanalytics.token.filepath = ~/pyenv/token.dat + ga-report.period = monthly + ga-report.bounce_url = / + + The ga-report.bounce_url specifies a particular path to record the bounce rate for. Typically it is / (the home page). + +3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, and alter the ``--config`` option to point to your site config file):: + + $ paster initdb --config=../ckan/development.ini + +4. Enable the extension in your CKAN config file by adding it to ``ckan.plugins``:: + + ckan.plugins = ga-report + +Troubleshooting +---------------- + +* ``(ProgrammingError) relation "ga_url" does not exist`` + This means that the ``paster initdb`` step has not been run successfully. Refer to the installation instructions for this extension. + + +Authorization +-------------- + +Before you can access the data, you need to set up the OAuth details by following the `instructions `_. The outcome will be a file called credentials.json, which should look like credentials.json.template with the relevant fields completed. The steps are listed below for convenience: + +1. Visit the `Google APIs Console `_ + +2. Sign in and create a project or use an existing project. + +3. In the `Services pane `_ , activate Analytics API for your project. If prompted, read and accept the terms of service. + +4. Go to the `API Access pane `_ + +5. Click Create an OAuth 2.0 client ID.... + +6. Fill out the Branding Information fields and click Next. + +7. In Client ID Settings, set Application type to Installed application. + +8. Click Create client ID + +9. 
The details you need below are Client ID, Client secret, and Redirect URIs + + +Once you have set up your credentials.json file you can generate an oauth token file by using the +following command, which will store your oauth token in a file called token.dat once you have finished +giving permission in the browser:: + + $ paster getauthtoken --config=../ckan/development.ini + +Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini):: + + googleanalytics.token.filepath = ~/pyenv/token.dat + + +Tutorial +-------- + +Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, and alter the ``--config`` option to point to your site config file.) The auth token file (token.dat by default) from the previous step is located via ``googleanalytics.token.filepath`` in that config file:: + + $ paster loadanalytics latest --config=../ckan/development.ini + +The value after ``loadanalytics`` is how much data you want to retrieve. This can be: + +* **all** - data for all time (since 2010) + +* **latest** - (default) just the 'latest' data + +* **YYYY-MM** - just data for the specific month + + + +Software Licence +================ + +This software is developed by Cabinet Office. It is Crown Copyright and opened up under the Open Government Licence (OGL) (which is compatible with Creative Commons Attribution License). 
+ +OGL terms: http://www.nationalarchives.gov.uk/doc/open-government-licence/ + --- /dev/null +++ b/ckanext/__init__.py @@ -1,1 +1,8 @@ +# this is a namespace package +try: + import pkg_resources + pkg_resources.declare_namespace(__name__) +except ImportError: + import pkgutil + __path__ = pkgutil.extend_path(__path__, __name__) --- /dev/null +++ b/ckanext/ga_report/__init__.py @@ -1,1 +1,8 @@ +# this is a namespace package +try: + import pkg_resources + pkg_resources.declare_namespace(__name__) +except ImportError: + import pkgutil + __path__ = pkgutil.extend_path(__path__, __name__) --- /dev/null +++ b/ckanext/ga_report/command.py @@ -1,1 +1,152 @@ +import logging +import datetime +import os +from pylons import config + +from ckan.lib.cli import CkanCommand +# No other CKAN imports allowed until _load_config is run, +# or logging is disabled + + +class InitDB(CkanCommand): + """Initialise the extension's database tables + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def command(self): + self._load_config() + + import ckan.model as model + model.Session.remove() + model.Session.configure(bind=model.meta.engine) + log = logging.getLogger('ckanext.ga_report') + + import ga_model + ga_model.init_tables() + log.info("DB tables are setup") + + +class GetAuthToken(CkanCommand): + """ Get's the Google auth token + + Usage: paster getauthtoken + + Where is the file name containing the details + for the service (obtained from https://code.google.com/apis/console). + By default this is set to credentials.json + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def command(self): + """ + In this case we don't want a valid service, but rather just to + force the user through the auth flow. We allow this to complete to + act as a form of verification instead of just getting the token and + assuming it is correct. 
+ """ + from ga_auth import init_service + init_service('token.dat', + self.args[0] if self.args + else 'credentials.json') + +class FixTimePeriods(CkanCommand): + """ + Fixes the 'All' records for GA_Urls + + It is possible that older urls that haven't recently been visited + do not have All records. This command will traverse through those + records and generate valid All records for them. + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def __init__(self, name): + super(FixTimePeriods, self).__init__(name) + + def command(self): + import ckan.model as model + from ga_model import post_update_url_stats + self._load_config() + model.Session.remove() + model.Session.configure(bind=model.meta.engine) + + log = logging.getLogger('ckanext.ga_report') + + log.info("Updating 'All' records for old URLs") + post_update_url_stats() + log.info("Processing complete") + + + +class LoadAnalytics(CkanCommand): + """Get data from Google Analytics API and save it + in the ga_model + + Usage: paster loadanalytics + + Where is: + all - data for all time + latest - (default) just the 'latest' data + YYYY-MM - just data for the specific month + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 1 + min_args = 0 + + def __init__(self, name): + super(LoadAnalytics, self).__init__(name) + self.parser.add_option('-d', '--delete-first', + action='store_true', + default=False, + dest='delete_first', + help='Delete data for the period first') + self.parser.add_option('-s', '--skip_url_stats', + action='store_true', + default=False, + dest='skip_url_stats', + help='Skip the download of URL data - just do site-wide stats') + + def command(self): + self._load_config() + + from download_analytics import DownloadAnalytics + from ga_auth import (init_service, get_profile_id) + + ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) + if not ga_token_filepath: + print 'ERROR: In the CKAN config you 
need to specify the filepath of the ' \ + 'Google Analytics token file under key: googleanalytics.token.filepath' + return + + try: + svc = init_service(ga_token_filepath, None) + except TypeError: + print ('Have you correctly run the getauthtoken task and ' + 'specified the correct token file in the CKAN config under ' + '"googleanalytics.token.filepath"?') + return + + downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc), + delete_first=self.options.delete_first, + skip_url_stats=self.options.skip_url_stats) + + time_period = self.args[0] if self.args else 'latest' + if time_period == 'all': + downloader.all_() + elif time_period == 'latest': + downloader.latest() + else: + # The month to use + for_date = datetime.datetime.strptime(time_period, '%Y-%m') + downloader.specific_month(for_date) + --- /dev/null +++ b/ckanext/ga_report/controller.py @@ -1,1 +1,538 @@ - +import re +import csv +import sys +import json +import logging +import operator +import collections +from ckan.lib.base import (BaseController, c, g, render, request, response, abort) + +import sqlalchemy +from sqlalchemy import func, cast, Integer +import ckan.model as model +from ga_model import GA_Url, GA_Stat, GA_ReferralStat, GA_Publisher + +log = logging.getLogger('ckanext.ga-report') + +DOWNLOADS_AVAILABLE_FROM = '2012-12' + +def _get_month_name(strdate): + import calendar + from time import strptime + d = strptime(strdate, '%Y-%m') + return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year) + +def _get_unix_epoch(strdate): + from time import strptime,mktime + d = strptime(strdate, '%Y-%m') + return int(mktime(d)) + +def _month_details(cls, stat_key=None): + ''' + Returns a list of all the periods for which we have data, unfortunately + knows too much about the type of the cls being passed as GA_Url has a + more complex query + + This may need extending if we add a period_name to the stats + ''' + months = [] + day = None + + q = 
model.Session.query(cls.period_name,cls.period_complete_day)\ + .filter(cls.period_name!='All').distinct(cls.period_name) + if stat_key: + q= q.filter(cls.stat_name==stat_key) + + vals = q.order_by("period_name desc").all() + + if vals and vals[0][1]: + day = int(vals[0][1]) + ordinal = 'th' if 11 <= day <= 13 \ + else {1:'st',2:'nd',3:'rd'}.get(day % 10, 'th') + day = "{day}{ordinal}".format(day=day, ordinal=ordinal) + + for m in vals: + months.append( (m[0], _get_month_name(m[0]))) + + return months, day + + +class GaReport(BaseController): + + def csv(self, month): + import csv + + q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name!='Downloads') + if month != 'all': + q = q.filter(GA_Stat.period_name==month) + entries = q.order_by('GA_Stat.period_name, GA_Stat.stat_name, GA_Stat.key').all() + + response.headers['Content-Type'] = "text/csv; charset=utf-8" + response.headers['Content-Disposition'] = str('attachment; filename=stats_%s.csv' % (month,)) + + writer = csv.writer(response) + writer.writerow(["Period", "Statistic", "Key", "Value"]) + + for entry in entries: + writer.writerow([entry.period_name.encode('utf-8'), + entry.stat_name.encode('utf-8'), + entry.key.encode('utf-8'), + entry.value.encode('utf-8')]) + + + def index(self): + + # Get the month details by fetching distinct values and determining the + # month names from the values. 
+ c.months, c.day = _month_details(GA_Stat) + + # Work out which month to show, based on query params of the first item + c.month_desc = 'all months' + c.month = request.params.get('month', '') + if c.month: + c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + + q = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Totals') + if c.month: + q = q.filter(GA_Stat.period_name==c.month) + entries = q.order_by('ga_stat.key').all() + + def clean_key(key, val): + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']: + val = "%.2f" % round(float(val), 2) + if key == 'Average time on site': + mins, secs = divmod(float(val), 60) + hours, mins = divmod(mins, 60) + val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) + if key in ['New visits','Bounce rate (home page)']: + val = "%s%%" % val + if key in ['Total page views', 'Total visits']: + val = int(val) + + return key, val + + # Query historic values for sparkline rendering + sparkline_query = model.Session.query(GA_Stat)\ + .filter(GA_Stat.stat_name=='Totals')\ + .order_by(GA_Stat.period_name) + sparkline_data = {} + for x in sparkline_query: + sparkline_data[x.key] = sparkline_data.get(x.key,[]) + key, val = clean_key(x.key,float(x.value)) + tooltip = '%s: %s' % (_get_month_name(x.period_name), val) + sparkline_data[x.key].append( (tooltip,x.value) ) + # Trim the latest month, as it looks like a huge dropoff + for key in sparkline_data: + sparkline_data[key] = sparkline_data[key][:-1] + + c.global_totals = [] + if c.month: + for e in entries: + key, val = clean_key(e.key, e.value) + sparkline = sparkline_data[e.key] + c.global_totals.append((key, val, sparkline)) + else: + d = collections.defaultdict(list) + for e in entries: + d[e.key].append(float(e.value)) + for k, v in d.iteritems(): + if k in ['Total page views', 'Total visits']: + v = sum(v) + else: + v = float(sum(v))/float(len(v)) + sparkline = sparkline_data[k] + key, val = 
clean_key(k,v) + + c.global_totals.append((key, val, sparkline)) + # Sort the global totals into a more pleasant order + def sort_func(x): + key = x[0] + total_order = ['Total page views','Total visits','Pages per visit'] + if key in total_order: + return total_order.index(key) + return 999 + c.global_totals = sorted(c.global_totals, key=sort_func) + + keys = { + 'Browser versions': 'browser_versions', + 'Browsers': 'browsers', + 'Operating Systems versions': 'os_versions', + 'Operating Systems': 'os', + 'Social sources': 'social_networks', + 'Languages': 'languages', + 'Country': 'country' + } + + def shorten_name(name, length=60): + return (name[:length] + '..') if len(name) > 60 else name + + def fill_out_url(url): + import urlparse + return urlparse.urljoin(g.site_url, url) + + c.social_referrer_totals, c.social_referrers = [], [] + q = model.Session.query(GA_ReferralStat) + q = q.filter(GA_ReferralStat.period_name==c.month) if c.month else q + q = q.order_by('ga_referrer.count::int desc') + for entry in q.all(): + c.social_referrers.append((shorten_name(entry.url), fill_out_url(entry.url), + entry.source,entry.count)) + + q = model.Session.query(GA_ReferralStat.url, + func.sum(GA_ReferralStat.count).label('count')) + q = q.filter(GA_ReferralStat.period_name==c.month) if c.month else q + q = q.order_by('count desc').group_by(GA_ReferralStat.url) + for entry in q.all(): + c.social_referrer_totals.append((shorten_name(entry[0]), fill_out_url(entry[0]),'', + entry[1])) + + for k, v in keys.iteritems(): + q = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + # Buffer the tabular data + if c.month: + entries = [] + q = q.filter(GA_Stat.period_name==c.month).\ + order_by('ga_stat.value::int desc') + d = collections.defaultdict(int) + for e in q.all(): + d[e.key] += int(e.value) + entries = [] + for key, val in d.iteritems(): + entries.append((key,val,)) + entries = sorted(entries, key=operator.itemgetter(1), 
reverse=True) + + # Run a query on all months to gather graph data + graph_query = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + graph_dict = {} + for stat in graph_query: + graph_dict[ stat.key ] = graph_dict.get(stat.key,{ + 'name':stat.key, + 'data': [] + }) + graph_dict[ stat.key ]['data'].append({ + 'x':_get_unix_epoch(stat.period_name), + 'y':float(stat.value) + }) + graph = [ graph_dict[x[0]] for x in entries ] + setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph,percentageMode=True) )) + + # Get the total for each set of values and then set the value as + # a percentage of the total + if k == 'Social sources': + total = sum([x for n,x,graph in c.global_totals if n == 'Total visits']) + else: + total = sum([num for _,num in entries]) + setattr(c, v, [(k,_percent(v,total)) for k,v in entries ]) + + return render('ga_report/site/index.html') + + +class GaDatasetReport(BaseController): + """ + Displays the pageview and visit count for datasets + with options to filter by publisher and time period. + """ + def publisher_csv(self, month): + ''' + Returns a CSV of each publisher with the total number of dataset + views & visits. + ''' + c.month = month if not month == 'all' else '' + response.headers['Content-Type'] = "text/csv; charset=utf-8" + response.headers['Content-Disposition'] = str('attachment; filename=publishers_%s.csv' % (month,)) + + writer = csv.writer(response) + writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"]) + + top_publishers, top_publishers_graph = _get_top_publishers(None) + + for publisher,view,visit in top_publishers: + writer.writerow([publisher.title.encode('utf-8'), + publisher.name.encode('utf-8'), + view, + visit, + month]) + + def dataset_csv(self, id='all', month='all'): + ''' + Returns a CSV with the number of views & visits for each dataset. 
+ + :param id: A Publisher ID or None if you want for all + :param month: The time period, or 'all' + ''' + c.month = month if not month == 'all' else '' + if id != 'all': + c.publisher = model.Group.get(id) + if not c.publisher: + abort(404, 'A publisher with that name could not be found') + + packages = self._get_packages(c.publisher) + response.headers['Content-Type'] = "text/csv; charset=utf-8" + response.headers['Content-Disposition'] = \ + str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,)) + + writer = csv.writer(response) + writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"]) + + for package,view,visit,downloads in packages: + writer.writerow([package.title.encode('utf-8'), + package.name.encode('utf-8'), + view, + visit, + downloads, + month]) + + def publishers(self): + '''A list of publishers and the number of views/visits for each''' + + # Get the month details by fetching distinct values and determining the + # month names from the values. 
+ c.months, c.day = _month_details(GA_Url) + + # Work out which month to show, based on query params of the first item + c.month = request.params.get('month', '') + c.month_desc = 'all months' + if c.month: + c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + + c.top_publishers, graph_data = _get_top_publishers() + c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data) ) + + return render('ga_report/publisher/index.html') + + def _get_packages(self, publisher=None, count=-1): + '''Returns the datasets in order of views''' + have_download_data = True + month = c.month or 'All' + if month != 'All': + have_download_data = month >= DOWNLOADS_AVAILABLE_FROM + + q = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ + .filter(GA_Url.url.like('/dataset/%')) + if publisher: + q = q.filter(GA_Url.department_id==publisher.name) + q = q.filter(GA_Url.period_name==month) + q = q.order_by('ga_url.pageviews::int desc') + top_packages = [] + if count == -1: + entries = q.all() + else: + entries = q.limit(count) + + for entry,package in entries: + if package: + # Downloads .... 
+ if have_download_data: + dls = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Downloads').\ + filter(GA_Stat.key==package.name) + if month != 'All': # Fetch everything unless the month is specific + dls = dls.filter(GA_Stat.period_name==month) + downloads = 0 + for x in dls: + downloads += int(x.value) + else: + downloads = 'No data' + top_packages.append((package, entry.pageviews, entry.visits, downloads)) + else: + log.warning('Could not find package associated package') + + return top_packages + + def read(self): + ''' + Lists the most popular datasets across all publishers + ''' + return self.read_publisher(None) + + def read_publisher(self, id): + ''' + Lists the most popular datasets for a publisher (or across all publishers) + ''' + count = 20 + + c.publishers = _get_publishers() + + id = request.params.get('publisher', id) + if id and id != 'all': + c.publisher = model.Group.get(id) + if not c.publisher: + abort(404, 'A publisher with that name could not be found') + c.publisher_name = c.publisher.name + c.top_packages = [] # package, dataset_views in c.top_packages + + # Get the month details by fetching distinct values and determining the + # month names from the values. 
+ c.months, c.day = _month_details(GA_Url) + + # Work out which month to show, based on query params of the first item + c.month = request.params.get('month', '') + if not c.month: + c.month_desc = 'all months' + else: + c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + + month = c.month or 'All' + c.publisher_page_views = 0 + q = model.Session.query(GA_Url).\ + filter(GA_Url.url=='/publisher/%s' % c.publisher_name) + entry = q.filter(GA_Url.period_name==c.month).first() + c.publisher_page_views = entry.pageviews if entry else 0 + + c.top_packages = self._get_packages(c.publisher, 20) + + # Graph query + top_package_names = [ x[0].name for x in c.top_packages ] + graph_query = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ + .filter(GA_Url.url.like('/dataset/%'))\ + .filter(GA_Url.package_id.in_(top_package_names)) + graph_dict = {} + for entry,package in graph_query: + if not package: continue + if entry.period_name=='All': continue + graph_dict[package.name] = graph_dict.get(package.name,{ + 'name':package.title, + 'data':[] + }) + graph_dict[package.name]['data'].append({ + 'x':_get_unix_epoch(entry.period_name), + 'y':int(entry.pageviews), + }) + graph = [ graph_dict[x] for x in top_package_names ] + + c.graph_data = json.dumps( _to_rickshaw(graph) ) + + return render('ga_report/publisher/read.html') + +def _to_rickshaw(data, percentageMode=False): + if data==[]: + return data + # Create a consistent x-axis between all series + num_points = [ len(series['data']) for series in data ] + ideal_index = num_points.index( max(num_points) ) + x_axis = [] + for series in data: + for point in series['data']: + x_axis.append(point['x']) + x_axis = sorted( list( set(x_axis) ) ) + # Zero pad any missing values + for series in data: + xs = [ point['x'] for point in series['data'] ] + for x in set(x_axis).difference(set(xs)): + series['data'].append( {'x':x, 'y':0} ) + if percentageMode: + def 
get_totals(series_list): + totals = {} + for series in series_list: + for point in series['data']: + totals[point['x']] = totals.get(point['x'],0) + point['y'] + return totals + # Transform data into percentage stacks + totals = get_totals(data) + # Roll insignificant series into a catch-all + THRESHOLD = 0.01 + raw_data = data + data = [] + for series in raw_data: + for point in series['data']: + fraction = float(point['y']) / totals[point['x']] + if not (series in data) and fraction>THRESHOLD: + data.append(series) + # Overwrite data with a set of intereting series + others = [ x for x in raw_data if not (x in data) ] + data.append({ + 'name':'Other', + 'data': [ {'x':x,'y':y} for x,y in get_totals(others).items() ] + }) + # Turn each point into a percentage + for series in data: + for point in series['data']: + point['y'] = (point['y']*100) / totals[point['x']] + # Sort the points + for series in data: + series['data'] = sorted( series['data'], key=lambda x:x['x'] ) + # Strip the latest month's incomplete analytics + series['data'] = series['data'][:-1] + return data + + +def _get_top_publishers(limit=20): + ''' + Returns a list of the top 20 publishers by dataset visits. 
+ (The number to show can be varied with 'limit') + ''' + month = c.month or 'All' + connection = model.Session.connection() + q = """ + select department_id, sum(pageviews::int) views, sum(visits::int) visits + from ga_url + where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' + and period_name=%s + group by department_id order by views desc + """ + if limit: + q = q + " limit %s;" % (limit) + + top_publishers = [] + res = connection.execute(q, month) + department_ids = [] + for row in res: + g = model.Group.get(row[0]) + if g: + department_ids.append(row[0]) + top_publishers.append((g, row[1], row[2])) + + graph = [] + if limit is not None: + # Query for a history graph of these publishers + q = model.Session.query( + GA_Url.department_id, + GA_Url.period_name, + func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\ + .filter( GA_Url.department_id.in_(department_ids) )\ + .filter( GA_Url.period_name!='All' )\ + .filter( GA_Url.url.like('/dataset/%') )\ + .filter( GA_Url.package_id!='' )\ + .group_by( GA_Url.department_id, GA_Url.period_name ) + graph_dict = {} + for dept_id,period_name,views in q: + graph_dict[dept_id] = graph_dict.get( dept_id, { + 'name' : model.Group.get(dept_id).title, + 'data' : [] + }) + graph_dict[dept_id]['data'].append({ + 'x': _get_unix_epoch(period_name), + 'y': views + }) + # Sort dict into ordered list + for id in department_ids: + graph.append( graph_dict[id] ) + return top_publishers, graph + + +def _get_publishers(): + ''' + Returns a list of all publishers. 
Each item is a tuple: + (name, title) + ''' + publishers = [] + for pub in model.Session.query(model.Group).\ + filter(model.Group.type=='publisher').\ + filter(model.Group.state=='active').\ + order_by(model.Group.name): + publishers.append((pub.name, pub.title)) + return publishers + +def _percent(num, total): + p = 100 * float(num)/float(total) + return "%.2f%%" % round(p, 2) + --- /dev/null +++ b/ckanext/ga_report/download_analytics.py @@ -1,1 +1,500 @@ - +import os +import logging +import datetime +import collections +from pylons import config +from ga_model import _normalize_url +import ga_model + +#from ga_client import GA + +log = logging.getLogger('ckanext.ga-report') + +FORMAT_MONTH = '%Y-%m' +MIN_VIEWS = 50 +MIN_VISITS = 20 +MIN_DOWNLOADS = 10 + +class DownloadAnalytics(object): + '''Downloads and stores analytics info''' + + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): + self.period = config['ga-report.period'] + self.service = service + self.profile_id = profile_id + self.delete_first = delete_first + self.skip_url_stats = skip_url_stats + + def specific_month(self, date): + import calendar + + first_of_this_month = datetime.datetime(date.year, date.month, 1) + _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) + last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) + # if this is the latest month, note that it is only up until today + now = datetime.datetime.now() + if now.year == date.year and now.month == date.month: + last_day_of_month = now.day + last_of_this_month = now + periods = ((date.strftime(FORMAT_MONTH), + last_day_of_month, + first_of_this_month, last_of_this_month),) + self.download_and_store(periods) + + + def latest(self): + if self.period == 'monthly': + # from first of this month to today + now = datetime.datetime.now() + first_of_this_month = datetime.datetime(now.year, now.month, 1) + periods = ((now.strftime(FORMAT_MONTH), + 
now.day, + first_of_this_month, now),) + else: + raise NotImplementedError + self.download_and_store(periods) + + + def for_date(self, for_date): + assert isinstance(since_date, datetime.datetime) + periods = [] # (period_name, period_complete_day, start_date, end_date) + if self.period == 'monthly': + first_of_the_months_until_now = [] + year = for_date.year + month = for_date.month + now = datetime.datetime.now() + first_of_this_month = datetime.datetime(now.year, now.month, 1) + while True: + first_of_the_month = datetime.datetime(year, month, 1) + if first_of_the_month == first_of_this_month: + periods.append((now.strftime(FORMAT_MONTH), + now.day, + first_of_this_month, now)) + break + elif first_of_the_month < first_of_this_month: + in_the_next_month = first_of_the_month + datetime.timedelta(40) + last_of_the_month = datetime.datetime(in_the_next_month.year, + in_the_next_month.month, 1)\ + - datetime.timedelta(1) + periods.append((now.strftime(FORMAT_MONTH), 0, + first_of_the_month, last_of_the_month)) + else: + # first_of_the_month has got to the future somehow + break + month += 1 + if month > 12: + year += 1 + month = 1 + else: + raise NotImplementedError + self.download_and_store(periods) + + @staticmethod + def get_full_period_name(period_name, period_complete_day): + if period_complete_day: + return period_name + ' (up to %ith)' % period_complete_day + else: + return period_name + + + def download_and_store(self, periods): + for period_name, period_complete_day, start_date, end_date in periods: + log.info('Period "%s" (%s - %s)', + self.get_full_period_name(period_name, period_complete_day), + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + + if self.delete_first: + log.info('Deleting existing Analytics for this period "%s"', + period_name) + ga_model.delete(period_name) + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = 
config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + # Make sure the All records are correct. + ga_model.post_update_url_stats() + + log.info('Associating datasets with their publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + + log.info('Downloading and storing analytics for site-wide stats') + self.sitewide_stats( period_name, period_complete_day ) + + log.info('Downloading and storing analytics for social networks') + self.update_social_info(period_name, start_date, end_date) + + + def update_social_info(self, period_name, start_date, end_date): + start_date = start_date.strftime('%Y-%m-%d') + end_date = end_date.strftime('%Y-%m-%d') + query = 'ga:hasSocialSourceReferral=~Yes$' + metrics = 'ga:entrances' + sort = '-ga:entrances' + + # Supported query params at + # https://developers.google.com/analytics/devguides/reporting/core/v3/reference + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters=query, + start_date=start_date, + metrics=metrics, + sort=sort, + dimensions="ga:landingPagePath,ga:socialNetwork", + max_results=10000, + end_date=end_date).execute() + data = collections.defaultdict(list) + rows = results.get('rows',[]) + for row in rows: + url = _normalize_url('http:/' + row[0]) + data[url].append( (row[1], int(row[2]),) ) + ga_model.update_social(period_name, data) + + + def download(self, start_date, end_date, path=None): + '''Get data from GA for a given time period''' + 
start_date = start_date.strftime('%Y-%m-%d') + end_date = end_date.strftime('%Y-%m-%d') + query = 'ga:pagePath=%s$' % path + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' + + # Supported query params at + # https://developers.google.com/analytics/devguides/reporting/core/v3/reference + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters=query, + start_date=start_date, + metrics=metrics, + sort=sort, + dimensions="ga:pagePath", + max_results=10000, + end_date=end_date).execute() + + packages = [] + log.info("There are %d results" % results['totalResults']) + for entry in results.get('rows'): + (loc,pageviews,visits) = entry + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate + continue + packages.append( (url, pageviews, visits,) ) # Temporary hack + return dict(url=packages) + + def store(self, period_name, period_complete_day, data): + if 'url' in data: + ga_model.update_url_stats(period_name, period_complete_day, data['url']) + + def sitewide_stats(self, period_name, period_complete_day): + import calendar + year, month = period_name.split('-') + _, last_day_of_month = calendar.monthrange(int(year), int(month)) + + start_date = '%s-01' % period_name + end_date = '%s-%s' % (period_name, last_day_of_month) + funcs = ['_totals_stats', '_social_stats', '_os_stats', + '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] + for f in funcs: + log.info('Downloading analytics for %s' % f.split('_')[1]) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) + + def _get_results(result_data, f): + data = {} + for result in result_data: + key = f(result) + data[key] = data.get(key,0) + 
result[1] + return data + + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): + """ Fetches distinct totals, total pageviews etc """ + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + metrics='ga:pageviews', + sort='-ga:pageviews', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + data = { + 'Pages per visit': result_data[0][0], + 'Average time on site': result_data[0][1], + 'New visits': result_data[0][2], + 'Total visits': result_data[0][3], + } + ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) + + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), + start_date=start_date, + metrics='ga:visitBounceRate', + dimensions='ga:pagePath', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. 
def _download_stats(self, start_date, end_date, period_name, period_complete_day):
    """Fetch download event counts and store them per dataset.

    Two Google Analytics queries are made for the period: events with
    action ``download`` (matched against ``Resource.url``) and events
    with action ``download-cache`` (matched against
    ``Resource.cache_url``). The event label is treated as a URL prefix
    of a resource; counts are summed per owning package name, entries
    below ``MIN_DOWNLOADS`` are dropped and the rest is written with
    ``ga_model.update_sitewide_stats`` under the "Downloads" stat.

    If the first query returns no rows nothing is stored at all.
    """
    import ckan.model as model

    data = {}

    def fetch_rows(event_action):
        # Returns None when the response carries no 'rows' key.
        results = self.service.data().ga().get(
            ids='ga:' + self.profile_id,
            start_date=start_date,
            filters='ga:eventAction==%s' % event_action,
            metrics='ga:totalEvents',
            sort='-ga:totalEvents',
            dimensions="ga:eventLabel",
            max_results=10000,
            end_date=end_date).execute()
        return results.get('rows')

    def process_result_data(result_data, cached=False):
        progress_total = len(result_data)
        resources_not_matched = []
        # Cached downloads are recorded against the resource's cache URL.
        url_column = model.Resource.cache_url if cached else model.Resource.url
        for progress_count, result in enumerate(result_data, 1):
            if progress_count % 100 == 0:
                log.debug('.. %d/%d done so far', progress_count, progress_total)

            url = result[0].strip()

            # Get package id associated with the resource that has this URL.
            r = model.Session.query(model.Resource)\
                .filter(url_column.like("%s%%" % url)).first()

            package_name = r.resource_group.package.name if r else ""
            if package_name:
                data[package_name] = data.get(package_name, 0) + int(result[1])
            else:
                resources_not_matched.append(url)
        if resources_not_matched:
            log.debug('Could not match %i of %i resource URLs to datasets. e.g. %r',
                      len(resources_not_matched), progress_total, resources_not_matched[:3])

    download_rows = fetch_rows('download')
    if not download_rows:
        # We may not have data for this time period, so we need to bail
        # early.
        log.info("There is no download data for this time period")
        return

    log.info('Associating downloads of resource URLs with their respective datasets')
    process_result_data(download_rows)

    cache_rows = fetch_rows('download-cache')
    log.info('Associating downloads of cache resource URLs with their respective datasets')
    # cached=True so these labels are looked up via cache_url; an absent
    # 'rows' key just means there were no cached downloads.
    process_result_data(cache_rows or [], cached=True)

    self._filter_out_long_tail(data, MIN_DOWNLOADS)
    ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)
def _os_stats(self, start_date, end_date, period_name, period_complete_day):
    """Store page views per operating system and per OS version.

    Queries ``ga:pageviews`` by ``ga:operatingSystem`` and
    ``ga:operatingSystemVersion`` for the given date range and writes two
    site-wide stats via ``ga_model.update_sitewide_stats``:

    * "Operating Systems" - views summed per OS, with entries under
      ``MIN_VIEWS`` removed.
    * "Operating Systems versions" - one entry per "<os> <version>" row
      whose view count is at least ``MIN_VIEWS``.
    """
    results = self.service.data().ga().get(
        ids='ga:' + self.profile_id,
        start_date=start_date,
        metrics='ga:pageviews',
        sort='-ga:pageviews',
        dimensions="ga:operatingSystem,ga:operatingSystemVersion",
        max_results=10000,
        end_date=end_date).execute()
    # Each row is [operating system, version, pageviews]. A response
    # without 'rows' is treated as "no data" instead of failing on None.
    result_data = results.get('rows') or []

    data = {}
    for result in result_data:
        data[result[0]] = data.get(result[0], 0) + int(result[2])
    self._filter_out_long_tail(data, MIN_VIEWS)
    ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day)

    data = {}
    for result in result_data:
        if int(result[2]) >= MIN_VIEWS:
            key = "%s %s" % (result[0], result[1])
            # The count is stored exactly as it came back in the row.
            data[key] = result[2]
    ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day)
[u'Firefox', u'19.0', u'20'] + + data = {} + for result in result_data: + data[result[0]] = data.get(result[0], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) + + data = {} + for result in result_data: + key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) + data[key] = data.get(key, 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day) + + @classmethod + def _filter_browser_version(cls, browser, version_str): + ''' + Simplifies a browser version string if it is detailed. + i.e. groups together Firefox 3.5.1 and 3.5.2 to be just 3. + This is helpful when viewing stats and good to protect privacy. + ''' + ver = version_str + parts = ver.split('.') + if len(parts) > 1: + if parts[1][0] == '0': + ver = parts[0] + else: + ver = "%s" % (parts[0]) + # Special case complex version nums + if browser in ['Safari', 'Android Browser']: + ver = parts[0] + if len(ver) > 2: + num_hidden_digits = len(ver) - 2 + ver = ver[0] + ver[1] + 'X' * num_hidden_digits + return ver + + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): + """ Info about mobile devices """ + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + metrics='ga:pageviews', + sort='-ga:pageviews', + dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", + max_results=10000, + end_date=end_date).execute() + + result_data = results.get('rows') + data = {} + for result in result_data: + data[result[0]] = data.get(result[0], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) + + data = {} + for result in result_data: + data[result[1]] = data.get(result[1], 0) + int(result[2]) + 
self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) + + @classmethod + def _filter_out_long_tail(cls, data, threshold=10): + ''' + Given data which is a frequency distribution, filter out + results which are below a threshold count. This is good to protect + privacy. + ''' + for key, value in data.items(): + if value < threshold: + del data[key] + --- /dev/null +++ b/ckanext/ga_report/ga_auth.py @@ -1,1 +1,74 @@ +import os +import httplib2 +from apiclient.discovery import build +from oauth2client.client import flow_from_clientsecrets +from oauth2client.file import Storage +from oauth2client.tools import run +from pylons import config + + +def _prepare_credentials(token_filename, credentials_filename): + """ + Either returns the user's oauth credentials or uses the credentials + file to generate a token (by forcing the user to login in the browser) + """ + storage = Storage(token_filename) + credentials = storage.get() + + if credentials is None or credentials.invalid: + flow = flow_from_clientsecrets(credentials_filename, + scope='https://www.googleapis.com/auth/analytics.readonly', + message="Can't find the credentials file") + credentials = run(flow, storage) + + return credentials + + +def init_service(token_file, credentials_file): + """ + Given a file containing the user's oauth token (and another with + credentials in case we need to generate the token) will return a + service object representing the analytics API. + """ + http = httplib2.Http() + + credentials = _prepare_credentials(token_file, credentials_file) + http = credentials.authorize(http) # authorize the http object + + return build('analytics', 'v3', http=http) + + +def get_profile_id(service): + """ + Get the profile ID for this user and the service specified by the + 'googleanalytics.id' configuration option. 
class GA_Url(object):
    """Record object mapped onto the ``ga_url`` table.

    Every keyword argument given to the constructor is set as an
    attribute of the same name on the new instance.
    """

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])
url_table) + + +class GA_Stat(object): + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) + +stat_table = Table('ga_stat', metadata, + Column('id', types.UnicodeText, primary_key=True, + default=make_uuid), + Column('period_name', types.UnicodeText), + Column('period_complete_day', types.UnicodeText), + Column('stat_name', types.UnicodeText), + Column('key', types.UnicodeText), + Column('value', types.UnicodeText), ) +mapper(GA_Stat, stat_table) + + +class GA_Publisher(object): + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) + +pub_table = Table('ga_publisher', metadata, + Column('id', types.UnicodeText, primary_key=True, + default=make_uuid), + Column('period_name', types.UnicodeText), + Column('publisher_name', types.UnicodeText), + Column('views', types.UnicodeText), + Column('visits', types.UnicodeText), + Column('toplevel', types.Boolean, default=False), + Column('subpublishercount', types.Integer, default=0), + Column('parent', types.UnicodeText), +) +mapper(GA_Publisher, pub_table) + + +class GA_ReferralStat(object): + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) + +referrer_table = Table('ga_referrer', metadata, + Column('id', types.UnicodeText, primary_key=True, + default=make_uuid), + Column('period_name', types.UnicodeText), + Column('source', types.UnicodeText), + Column('url', types.UnicodeText), + Column('count', types.Integer), + ) +mapper(GA_ReferralStat, referrer_table) + + + +def init_tables(): + metadata.create_all(model.meta.engine) + + +cached_tables = {} + + +def get_table(name): + if name not in cached_tables: + meta = MetaData() + meta.reflect(bind=model.meta.engine) + table = meta.tables[name] + cached_tables[name] = table + return cached_tables[name] + + +def _normalize_url(url): + '''Strip off the hostname etc. Do this before storing it. 
+ + >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') + '/dataset/weekly_fuel_prices' + ''' + return '/' + '/'.join(url.split('/')[3:]) + + +def _get_package_and_publisher(url): + # e.g. /dataset/fuel_prices + # e.g. /dataset/fuel_prices/resource/e63380d4 + dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) + if dataset_match: + dataset_ref = dataset_match.groups()[0] + dataset = model.Package.get(dataset_ref) + if dataset: + publisher_groups = dataset.get_groups('publisher') + if publisher_groups: + return dataset_ref,publisher_groups[0].name + return dataset_ref, None + else: + publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) + if publisher_match: + return None, publisher_match.groups()[0] + return None, None + +def update_sitewide_stats(period_name, stat_name, data, period_complete_day): + for k,v in data.iteritems(): + item = model.Session.query(GA_Stat).\ + filter(GA_Stat.period_name==period_name).\ + filter(GA_Stat.key==k).\ + filter(GA_Stat.stat_name==stat_name).first() + if item: + item.period_name = period_name + item.key = k + item.value = v + item.period_complete_day = period_complete_day + model.Session.add(item) + else: + # create the row + values = {'id': make_uuid(), + 'period_name': period_name, + 'period_complete_day': period_complete_day, + 'key': k, + 'value': v, + 'stat_name': stat_name + } + model.Session.add(GA_Stat(**values)) + model.Session.commit() + + +def pre_update_url_stats(period_name): + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name) + log.debug("Deleting %d '%s' records" % (q.count(), period_name)) + q.delete() + + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All') + log.debug("Deleting %d 'All' records..." % q.count()) + q.delete() + + model.Session.flush() + model.Session.commit() + model.repo.commit_and_remove() + log.debug('...done') + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. 
If not then create one. + + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. + """ + log.debug('Post-processing "All" records...') + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + progress_total = len(views.keys()) + progress_count = 0 + for key in views.keys(): + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(key) + + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': 0, +