From: root Date: Fri, 20 Jun 2014 10:44:49 +0000 Subject: Merge branch 'master' of git+ssh://maxious.lambdacomplex.org/git/ckanext-ga-report X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=987914a6710fc950b838977695240f27fe40f988 --- Merge branch 'master' of git+ssh://maxious.lambdacomplex.org/git/ckanext-ga-report --- --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.py[co] *.py~ .gitignore +ckan.log # Packages *.egg --- a/README.rst +++ b/README.rst @@ -32,11 +32,11 @@ googleanalytics.id = UA-1010101-1 googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics) + googleanalytics.token.filepath = ~/pyenv/token.dat ga-report.period = monthly - ga-report.bounce_url = /data + ga-report.bounce_url = / - The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data - but you may want to set this to /. + The ga-report.bounce_url specifies a particular path to record the bounce rate for. Typically it is / (the home page). 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file):: @@ -83,13 +83,17 @@ $ paster getauthtoken --config=../ckan/development.ini +Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini):: + + googleanalytics.token.filepath = ~/pyenv/token.dat + Tutorial -------- Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step:: - $ paster loadanalytics token.dat latest --config=../ckan/development.ini + $ paster loadanalytics latest --config=../ckan/development.ini The value after the token file is how much data you want to retrieve, this can be --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -1,5 +1,8 @@ import logging import datetime +import os + +from pylons import config from ckan.lib.cli import CkanCommand # No other CKAN imports allowed until _load_config is run, @@ -20,7 +23,7 @@ import ckan.model as model model.Session.remove() model.Session.configure(bind=model.meta.engine) - log = logging.getLogger('ckanext.ga-report') + log = logging.getLogger('ckanext.ga_report') import ga_model ga_model.init_tables() @@ -49,29 +52,54 @@ assuming it is correct. """ from ga_auth import init_service - init_service('token.dat', - self.args[0] if self.args - else 'credentials.json') + init_service('token.dat', 'credentials.json') + +class FixTimePeriods(CkanCommand): + """ + Fixes the 'All' records for GA_Urls + + It is possible that older urls that haven't recently been visited + do not have All records. This command will traverse through those + records and generate valid All records for them. + """ + summary = __doc__.split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def __init__(self, name): + super(FixTimePeriods, self).__init__(name) + + def command(self): + import ckan.model as model + from ga_model import post_update_url_stats + self._load_config() + model.Session.remove() + model.Session.configure(bind=model.meta.engine) + + log = logging.getLogger('ckanext.ga_report') + + log.info("Updating 'All' records for old URLs") + post_update_url_stats() + log.info("Processing complete") + class LoadAnalytics(CkanCommand): """Get data from Google Analytics API and save it in the ga_model - Usage: paster loadanalytics + Usage: paster loadanalytics - Where is the name of the auth token file from - the getauthtoken step. - - And where is: + Where is: all - data for all time latest - (default) just the 'latest' data YYYY-MM - just data for the specific month """ summary = __doc__.split('\n')[0] usage = __doc__ - max_args = 2 - min_args = 1 + max_args = 1 + min_args = 0 def __init__(self, name): super(LoadAnalytics, self).__init__(name) @@ -80,6 +108,12 @@ default=False, dest='delete_first', help='Delete data for the period first') + self.parser.add_option('-s', '--skip_url_stats', + action='store_true', + default=False, + dest='skip_url_stats', + help='Skip the download of URL data - just do site-wide stats') + self.token = "" def command(self): self._load_config() @@ -87,18 +121,25 @@ from download_analytics import DownloadAnalytics from ga_auth import (init_service, get_profile_id) + ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) + if not ga_token_filepath: + print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ + 'Google Analytics token file under key: googleanalytics.token.filepath' + return + try: - svc = init_service(self.args[0], None) + self.token, svc = init_service(ga_token_filepath, None) except TypeError: print ('Have you correctly run the getauthtoken task and ' - 'specified the correct token file?') + 'specified the correct token file in the CKAN config under ' + '"googleanalytics.token.filepath"?') return - downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc), - delete_first=self.options.delete_first) + downloader = DownloadAnalytics(svc, self.token, profile_id=get_profile_id(svc), + delete_first=self.options.delete_first, + skip_url_stats=self.options.skip_url_stats) - time_period = self.args[1] if self.args and len(self.args) > 1 \ - else 'latest' + time_period = self.args[0] if self.args else 'latest' if time_period == 'all': downloader.all_() elif time_period == 'latest': --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -1,6 +1,7 @@ import re import csv import sys +import json import logging import operator import collections @@ -13,6 +14,7 @@ log = logging.getLogger('ckanext.ga-report') +DOWNLOADS_AVAILABLE_FROM = '2012-12' def _get_month_name(strdate): import calendar @@ -20,14 +22,39 @@ d = strptime(strdate, '%Y-%m') return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year) - -def _month_details(cls): - '''Returns a list of all the month names''' +def _get_unix_epoch(strdate): + from time import strptime,mktime + d = strptime(strdate, '%Y-%m') + return int(mktime(d)) + +def _month_details(cls, stat_key=None): + ''' + Returns a list of all the periods for which we have data, unfortunately + knows too much about the type of the cls being passed as GA_Url has a + more complex query + + This may need extending if we add a period_name to the stats + ''' months = [] - vals = model.Session.query(cls.period_name).filter(cls.period_name!='All').distinct().all() + day = None + + q = model.Session.query(cls.period_name,cls.period_complete_day)\ + .filter(cls.period_name!='All').distinct(cls.period_name) + if stat_key: + q= q.filter(cls.stat_name==stat_key) + + vals = q.order_by("period_name desc").all() + + if vals and vals[0][1]: + day = int(vals[0][1]) + ordinal = 'th' if 11 <= day <= 13 \ + else {1:'st',2:'nd',3:'rd'}.get(day % 10, 'th') + day = "{day}{ordinal}".format(day=day, ordinal=ordinal) + for m in vals: months.append( (m[0], _get_month_name(m[0]))) - return sorted(months, key=operator.itemgetter(0), reverse=True) + + return months, day class GaReport(BaseController): @@ -35,7 +62,7 @@ def csv(self, month): import csv - q = model.Session.query(GA_Stat) + q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name!='Downloads') if month != 'all': q = q.filter(GA_Stat.period_name==month) entries = q.order_by('GA_Stat.period_name, GA_Stat.stat_name, GA_Stat.key').all() @@ -52,11 +79,12 @@ entry.key.encode('utf-8'), entry.value.encode('utf-8')]) + def index(self): # Get the month details by fetching distinct values and determining the # month names from the values. - c.months = _month_details(GA_Stat) + c.months, c.day = _month_details(GA_Stat) # Work out which month to show, based on query params of the first item c.month_desc = 'all months' @@ -71,24 +99,39 @@ entries = q.order_by('ga_stat.key').all() def clean_key(key, val): - if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']: + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']: val = "%.2f" % round(float(val), 2) if key == 'Average time on site': mins, secs = divmod(float(val), 60) hours, mins = divmod(mins, 60) val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) - if key in ['New visits','Bounces']: + if key in ['New visits','Bounce rate (home page)']: val = "%s%%" % val if key in ['Total page views', 'Total visits']: val = int(val) return key, val + + # Query historic values for sparkline rendering + sparkline_query = model.Session.query(GA_Stat)\ + .filter(GA_Stat.stat_name=='Totals')\ + .order_by(GA_Stat.period_name) + sparkline_data = {} + for x in sparkline_query: + sparkline_data[x.key] = sparkline_data.get(x.key,[]) + key, val = clean_key(x.key,float(x.value)) + tooltip = '%s: %s' % (_get_month_name(x.period_name), val) + sparkline_data[x.key].append( (tooltip,x.value) ) + # Trim the latest month, as it looks like a huge dropoff + for key in sparkline_data: + sparkline_data[key] = sparkline_data[key][:-1] c.global_totals = [] if c.month: for e in entries: key, val = clean_key(e.key, e.value) - c.global_totals.append((key, val)) + sparkline = sparkline_data[e.key] + c.global_totals.append((key, val, sparkline)) else: d = collections.defaultdict(list) for e in entries: @@ -97,11 +140,19 @@ if k in ['Total page views', 'Total visits']: v = sum(v) else: - v = float(sum(v))/len(v) + v = float(sum(v))/float(len(v)) + sparkline = sparkline_data[k] key, val = clean_key(k,v) - c.global_totals.append((key, val)) - c.global_totals = sorted(c.global_totals, key=operator.itemgetter(0)) + c.global_totals.append((key, val, sparkline)) + # Sort the global totals into a more pleasant order + def sort_func(x): + key = x[0] + total_order = ['Total page views','Total visits','Pages per visit'] + if key in total_order: + return total_order.index(key) + return 999 + c.global_totals = sorted(c.global_totals, key=sort_func) keys = { 'Browser versions': 'browser_versions', @@ -138,12 +189,13 @@ for k, v in keys.iteritems(): q = model.Session.query(GA_Stat).\ - filter(GA_Stat.stat_name==k) + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + # Buffer the tabular data if c.month: entries = [] q = q.filter(GA_Stat.period_name==c.month).\ order_by('ga_stat.value::int desc') - d = collections.defaultdict(int) for e in q.all(): d[e.key] += int(e.value) @@ -152,10 +204,27 @@ entries.append((key,val,)) entries = sorted(entries, key=operator.itemgetter(1), reverse=True) + # Run a query on all months to gather graph data + graph_query = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name==k).\ + order_by(GA_Stat.period_name) + graph_dict = {} + for stat in graph_query: + graph_dict[ stat.key ] = graph_dict.get(stat.key,{ + 'name':stat.key, + 'raw': {} + }) + graph_dict[ stat.key ]['raw'][stat.period_name] = float(stat.value) + stats_in_table = [x[0] for x in entries] + stats_not_in_table = set(graph_dict.keys()) - set(stats_in_table) + stats = stats_in_table + sorted(list(stats_not_in_table)) + graph = [graph_dict[x] for x in stats] + setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph,percentageMode=True) )) + # Get the total for each set of values and then set the value as # a percentage of the total if k == 'Social sources': - total = sum([x for n,x in c.global_totals if n == 'Total visits']) + total = sum([x for n,x,graph in c.global_totals if n == 'Total visits']) else: total = sum([num for _,num in entries]) setattr(c, v, [(k,_percent(v,total)) for k,v in entries ]) @@ -180,7 +249,9 @@ writer = csv.writer(response) writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"]) - for publisher,view,visit in _get_top_publishers(None): + top_publishers = _get_top_publishers(limit=None) + + for publisher,view,visit in top_publishers: writer.writerow([publisher.title.encode('utf-8'), publisher.name.encode('utf-8'), view, @@ -200,19 +271,20 @@ if not c.publisher: abort(404, 'A publisher with that name could not be found') - packages = self._get_packages(c.publisher) + packages = self._get_packages(publisher=c.publisher, month=c.month) response.headers['Content-Type'] = "text/csv; charset=utf-8" response.headers['Content-Disposition'] = \ str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,)) writer = csv.writer(response) - writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"]) - - for package,view,visit in packages: + writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"]) + + for package,view,visit,downloads in packages: writer.writerow([package.title.encode('utf-8'), package.name.encode('utf-8'), view, visit, + downloads, month]) def publishers(self): @@ -220,7 +292,7 @@ # Get the month details by fetching distinct values and determining the # month names from the values. - c.months = _month_details(GA_Url) + c.months, c.day = _month_details(GA_Url) # Work out which month to show, based on query params of the first item c.month = request.params.get('month', '') @@ -229,14 +301,19 @@ c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) c.top_publishers = _get_top_publishers() - return render('ga_report/publisher/index.html') - - def _get_packages(self, publisher=None, count=-1): - '''Returns the datasets in order of visits''' - if count == -1: - count = sys.maxint - - month = c.month or 'All' + graph_data = _get_top_publishers_graph() + c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data) ) + + x = render('ga_report/publisher/index.html') + + return x + + def _get_packages(self, publisher=None, month='', count=-1): + '''Returns the datasets in order of views''' + have_download_data = True + month = month or 'All' + if month != 'All': + have_download_data = month >= DOWNLOADS_AVAILABLE_FROM q = model.Session.query(GA_Url,model.Package)\ .filter(model.Package.name==GA_Url.package_id)\ @@ -244,11 +321,29 @@ if publisher: q = q.filter(GA_Url.department_id==publisher.name) q = q.filter(GA_Url.period_name==month) - q = q.order_by('ga_url.visitors::int desc') + q = q.order_by('ga_url.pageviews::int desc') top_packages = [] - for entry,package in q.limit(count): + if count == -1: + entries = q.all() + else: + entries = q.limit(count) + + for entry,package in entries: if package: - top_packages.append((package, entry.pageviews, entry.visitors)) + # Downloads .... + if have_download_data: + dls = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Downloads').\ + filter(GA_Stat.key==package.name) + if month != 'All': # Fetch everything unless the month is specific + dls = dls.filter(GA_Stat.period_name==month) + downloads = 0 + for x in dls: + downloads += int(x.value) + else: + downloads = 'No data' + if package.private == False: + top_packages.append((package, entry.pageviews, entry.visits, downloads)) else: log.warning('Could not find package associated package') @@ -278,7 +373,7 @@ # Get the month details by fetching distinct values and determining the # month names from the values. - c.months = _month_details(GA_Url) + c.months, c.day = _month_details(GA_Url) # Work out which month to show, based on query params of the first item c.month = request.params.get('month', '') @@ -294,9 +389,73 @@ entry = q.filter(GA_Url.period_name==c.month).first() c.publisher_page_views = entry.pageviews if entry else 0 - c.top_packages = self._get_packages(c.publisher, 20) + c.top_packages = self._get_packages(publisher=c.publisher, count=20, month=c.month) + + # Graph query + top_packages_all_time = self._get_packages(publisher=c.publisher, count=20, month='All') + top_package_names = [ x[0].name for x in top_packages_all_time ] + graph_query = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ + .filter(GA_Url.url.like('/dataset/%'))\ + .filter(GA_Url.package_id.in_(top_package_names)) + all_series = {} + for entry,package in graph_query: + if not package: continue + if entry.period_name=='All': continue + all_series[package.name] = all_series.get(package.name,{ + 'name':package.title, + 'raw': {} + }) + all_series[package.name]['raw'][entry.period_name] = int(entry.pageviews) + graph = [ all_series[series_name] for series_name in top_package_names ] + c.graph_data = json.dumps( _to_rickshaw(graph) ) return render('ga_report/publisher/read.html') + +def _to_rickshaw(data, percentageMode=False): + if data==[]: + return data + # x-axis is every month in c.months. Note that data might not exist + # for entire history, eg. for recently-added datasets + x_axis = [x[0] for x in c.months] + x_axis.reverse() # Ascending order + x_axis = x_axis[:-1] # Remove latest month + totals = {} + for series in data: + series['data'] = [] + for x_string in x_axis: + x = _get_unix_epoch( x_string ) + y = series['raw'].get(x_string,0) + series['data'].append({'x':x,'y':y}) + totals[x] = totals.get(x,0)+y + if not percentageMode: + return data + # Turn all data into percentages + # Roll insignificant series into a catch-all + THRESHOLD = 1 + raw_data = data + data = [] + for series in raw_data: + for point in series['data']: + percentage = (100*float(point['y'])) / totals[point['x']] + if not (series in data) and percentage>THRESHOLD: + data.append(series) + point['y'] = percentage + others = [ x for x in raw_data if not (x in data) ] + if len(others): + data_other = [] + for i in range(len(x_axis)): + x = _get_unix_epoch(x_axis[i]) + y = 0 + for series in others: + y += series['data'][i]['y'] + data_other.append({'x':x,'y':y}) + data.append({ + 'name':'Other', + 'data': data_other + }) + return data + def _get_top_publishers(limit=20): ''' @@ -306,11 +465,13 @@ month = c.month or 'All' connection = model.Session.connection() q = """ - select department_id, sum(pageviews::int) views, sum(visitors::int) visits + select department_id, sum(pageviews::int) views, sum(visits::int) visits from ga_url where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' and period_name=%s - group by department_id order by visits desc + group by department_id order by views desc """ if limit: q = q + " limit %s;" % (limit) @@ -324,14 +485,54 @@ return top_publishers +def _get_top_publishers_graph(limit=20): + ''' + Returns a list of the top 20 publishers by dataset visits. + (The number to show can be varied with 'limit') + ''' + connection = model.Session.connection() + q = """ + select department_id, sum(pageviews::int) views + from ga_url + where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' + and period_name='All' + group by department_id order by views desc + """ + if limit: + q = q + " limit %s;" % (limit) + + res = connection.execute(q) + department_ids = [ row[0] for row in res ] + + # Query for a history graph of these department ids + q = model.Session.query( + GA_Url.department_id, + GA_Url.period_name, + func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\ + .filter( GA_Url.department_id.in_(department_ids) )\ + .filter( GA_Url.url.like('/dataset/%') )\ + .filter( GA_Url.package_id!='' )\ + .group_by( GA_Url.department_id, GA_Url.period_name ) + graph_dict = {} + for dept_id,period_name,views in q: + graph_dict[dept_id] = graph_dict.get( dept_id, { + 'name' : model.Group.get(dept_id).title, + 'raw' : {} + }) + graph_dict[dept_id]['raw'][period_name] = views + return [ graph_dict[id] for id in department_ids ] + + def _get_publishers(): ''' Returns a list of all publishers. Each item is a tuple: - (names, title) + (name, title) ''' publishers = [] for pub in model.Session.query(model.Group).\ - filter(model.Group.type=='publisher').\ + filter(model.Group.type=='organization').\ filter(model.Group.state=='active').\ order_by(model.Group.name): publishers.append((pub.name, pub.title)) --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -1,7 +1,12 @@ import os import logging import datetime +import httplib +import urllib import collections +import requests +import json +import re from pylons import config from ga_model import _normalize_url import ga_model @@ -13,15 +18,19 @@ FORMAT_MONTH = '%Y-%m' MIN_VIEWS = 50 MIN_VISITS = 20 +MIN_DOWNLOADS = 10 class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None, delete_first=False): + def __init__(self, service=None, token=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id self.delete_first = delete_first + self.skip_url_stats = skip_url_stats + self.token = token def specific_month(self, date): import calendar @@ -29,6 +38,11 @@ first_of_this_month = datetime.datetime(date.year, date.month, 1) _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) + # if this is the latest month, note that it is only up until today + now = datetime.datetime.now() + if now.year == date.year and now.month == date.month: + last_day_of_month = now.day + last_of_this_month = now periods = ((date.strftime(FORMAT_MONTH), last_day_of_month, first_of_this_month, last_of_this_month),) @@ -96,34 +110,39 @@ self.get_full_period_name(period_name, period_complete_day), start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')) - + if self.delete_first: log.info('Deleting existing Analytics for this period "%s"', period_name) ga_model.delete(period_name) - # Clean up the entries before we run this - ga_model.pre_update_url_stats(period_name) - - accountName = config.get('googleanalytics.account') - - log.info('Downloading analytics for dataset views') - data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) - - log.info('Storing dataset views (%i rows)', len(data.get('url'))) - self.store(period_name, period_complete_day, data, ) - - log.info('Downloading analytics for publisher views') - data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) - - log.info('Storing publisher views (%i rows)', len(data.get('url'))) - self.store(period_name, period_complete_day, data,) - - log.info('Aggregating datasets by publisher') - ga_model.update_publisher_stats(period_name) # about 30 seconds. + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+') + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+') + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + # Make sure the All records are correct. + ga_model.post_update_url_stats() + + log.info('Associating datasets with their publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + log.info('Downloading and storing analytics for site-wide stats') - self.sitewide_stats( period_name ) + self.sitewide_stats( period_name, period_complete_day ) log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) @@ -136,21 +155,32 @@ metrics = 'ga:entrances' sort = '-ga:entrances' - # Supported query params at - # https://developers.google.com/analytics/devguides/reporting/core/v3/reference - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters=query, - start_date=start_date, - metrics=metrics, - sort=sort, - dimensions="ga:landingPagePath,ga:socialNetwork", - max_results=10000, - end_date=end_date).execute() + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = dict(ids='ga:' + self.profile_id, + filters=query, + metrics=metrics, + sort=sort, + dimensions="ga:landingPagePath,ga:socialNetwork", + max_results=10000) + + args['start-date'] = start_date + args['end-date'] = end_date + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + + data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - data[_normalize_url(row[0])].append( (row[1], int(row[2]),) ) + url = row[0] + data[url].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) @@ -159,26 +189,42 @@ start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visits' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters=query, - start_date=start_date, - metrics=metrics, - sort=sort, - dimensions="ga:pagePath", - max_results=10000, - end_date=end_date).execute() + # https://ga-dev-tools.appspot.com/explorer/ + try: + args = {} + args["sort"] = "-ga:pageviews" + args["max-results"] = 100000 + args["dimensions"] = "ga:pagePath" + args["start-date"] = start_date + args["end-date"] = end_date + args["metrics"] = metrics + args["ids"] = "ga:" + self.profile_id + args["filters"] = query + args["alt"] = "json" + print args + results = self._get_json(args) + + except Exception, e: + log.exception(e) + return dict(url=[]) packages = [] - for entry in results.get('rows'): + log.info("There are %d results" % results['totalResults']) + if results['totalResults'] > 0: + for entry in results.get('rows'): (loc,pageviews,visits) = entry - url = _normalize_url('http:/' + loc) - if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + url = loc + #print url + if not url.startswith('/dataset/') and not url.startswith('/organization/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate continue packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) @@ -187,7 +233,7 @@ if 'url' in data: ga_model.update_url_stats(period_name, period_complete_day, data['url']) - def sitewide_stats(self, period_name): + def sitewide_stats(self, period_name, period_complete_day): import calendar year, month = period_name.split('-') _, last_day_of_month = calendar.monthrange(int(year), int(month)) @@ -195,10 +241,10 @@ start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) funcs = ['_totals_stats', '_social_stats', '_os_stats', - '_locale_stats', '_browser_stats', '_mobile_stats'] + '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] for f in funcs: log.info('Downloading analytics for %s' % f.split('_')[1]) - getattr(self, f)(start_date, end_date, period_name) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) def _get_results(result_data, f): data = {} @@ -207,24 +253,78 @@ data[key] = data.get(key,0) + result[1] return data - def _totals_stats(self, start_date, end_date, period_name): + def _get_json(self, params, prev_fail=False): + ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) + if not ga_token_filepath: + print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ + 'Google Analytics token file under key: googleanalytics.token.filepath' + return + + log.info("Trying to refresh our OAuth token") + try: + from ga_auth import init_service + self.token, svc = init_service(ga_token_filepath, None) + log.info("OAuth token refreshed") + except Exception, auth_exception: + log.error("Oauth refresh failed") + log.exception(auth_exception) + return + + try: + headers = {'authorization': 'Bearer ' + self.token} + r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers) + if r.status_code != 200: + log.info("STATUS: %s" % (r.status_code,)) + log.info("CONTENT: %s" % (r.content,)) + raise Exception("Request with params: %s failed" % params) + + return json.loads(r.content) + except Exception, e: + log.exception(e) + + return dict(url=[]) + + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches distinct totals, total pageviews etc """ - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}) - - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', - max_results=10000, - end_date=end_date).execute() + try: + args = {} + args["max-results"] = 100000 + args["start-date"] = start_date + args["end-date"] = end_date + args["ids"] = "ga:" + self.profile_id + + args["metrics"] = "ga:pageviews" + args["sort"] = "-ga:pageviews" + args["alt"] = "json" + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) + + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = {} + args["max-results"] = 100000 + args["start-date"] = start_date + args["end-date"] = end_date + args["ids"] = "ga:" + self.profile_id + + args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits" + args["alt"] = "json" + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + result_data = results.get('rows') data = { 'Pages per visit': result_data[0][0], @@ -232,107 +332,268 @@ 'New visits': result_data[0][2], 'Total visits': result_data[0][3], } - ga_model.update_sitewide_stats(period_name, "Totals", data) - - # Bounces from /data. This url is specified in configuration because - # for DGU we don't want /. - path = config.get('ga-report.bounce_url','/') - print path - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters='ga:pagePath=~%s$' % (path,), - start_date=start_date, - metrics='ga:bounces,ga:uniquePageviews', - dimensions='ga:pagePath', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - for results in result_data: - if results[0] == path: - bounce, total = [float(x) for x in results[1:]] - pct = 100 * bounce/total - print "%d bounces from %d total == %s" % (bounce, total, pct) - ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct}) - - - def _locale_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) + + # Bounces from / or another configurable page. + path = '/' #% (config.get('googleanalytics.account'), config.get('ga-report.bounce_url', '/')) + + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = {} + args["max-results"] = 100000 + args["start-date"] = start_date + args["end-date"] = end_date + args["ids"] = "ga:" + self.profile_id + + args["filters"] = 'ga:pagePath==%s' % (path,) + args["dimensions"] = 'ga:pagePath' + args["metrics"] = "ga:visitBounceRate" + args["alt"] = "json" + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + + result_data = results.get('rows') + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces = float(results[1]) + # visitBounceRate is already a % + log.info('Google reports visitBounceRate as %s', bounces) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': float(bounces)}, + period_complete_day) + + + def _locale_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - dimensions="ga:language,ga:country", - max_results=10000, - end_date=end_date).execute() + + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = {} + args["max-results"] = 100000 + args["start-date"] = start_date + args["end-date"] = end_date + args["ids"] = "ga:" + self.profile_id + + args["dimensions"] = "ga:language,ga:country" + args["metrics"] = "ga:pageviews" + args["sort"] = "-ga:pageviews" + args["alt"] = "json" + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + result_data = results.get('rows') data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Languages", data) + ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Country", data) - - - def _social_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day) + + + def _download_stats(self, start_date, end_date, period_name, period_complete_day): + """ Fetches stats about data downloads """ + import ckan.model as model + + data = {} + + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = {} + args["max-results"] = 100000 + args["start-date"] = start_date + args["end-date"] = end_date + args["ids"] = "ga:" + self.profile_id + + args["filters"] = 'ga:eventAction==Download' + args["dimensions"] = "ga:eventLabel" + args["metrics"] = "ga:totalEvents" + args["alt"] = "json" + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + + result_data = results.get('rows') + if not result_data: + # We may not have data for this time period, so we need to bail + # early. + log.info("There is no download data for this time period") + return + + def process_result_data(result_data, cached=False): + progress_total = len(result_data) + progress_count = 0 + resources_not_matched = [] + for result in result_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + url = urllib.unquote(result[0].strip()) + + # Get package id associated with the resource that has this URL. + q = model.Session.query(model.Resource) + if cached: + r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() + else: + r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + # new style internal download links + if re.search('(?:/resource/)(.*)(?:/download/)',url): + resource_id = re.search('(?:/resource/)(.*)(?:/download/)',url) + r = q.filter(model.Resource.id.like("%s%%" % resource_id.group(1))).first() + + package_name = r.resource_group.package.name if r else "" + + + if package_name: + data[package_name] = data.get(package_name, 0) + int(result[1]) + else: + resources_not_matched.append(url) + continue + if resources_not_matched: + log.debug('Could not match %i or %i resource URLs to datasets. e.g. %r', + len(resources_not_matched), progress_total, resources_not_matched[:3]) + + log.info('Associating downloads of resource URLs with their respective datasets') + process_result_data(results.get('rows')) + + '''try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = dict( ids='ga:' + self.profile_id, + filters='ga:eventAction==download-cache', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000) + args['start-date'] = start_date + args['end-date'] = end_date + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + + log.info('Associating downloads of cache resource URLs with their respective datasets') + process_result_data(results.get('rows'), cached=False)''' + + self._filter_out_long_tail(data, MIN_DOWNLOADS) + ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day) + + def _social_stats(self, start_date, end_date, period_name, period_complete_day): """ Finds out which social sites people are referred from """ - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - dimensions="ga:socialNetwork,ga:referralPath", - max_results=10000, - end_date=end_date).execute() + + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = dict( ids='ga:' + self.profile_id, + metrics='ga:pageviews', + sort='-ga:pageviews', + dimensions="ga:socialNetwork,ga:referralPath", + max_results=10000) + args['start-date'] = start_date + args['end-date'] = end_date + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + result_data = results.get('rows') data = {} for result in result_data: if not result[0] == '(not set)': data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, 3) - ga_model.update_sitewide_stats(period_name, "Social sources", data) - - - def _os_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day) + + + def _os_stats(self, start_date, end_date, period_name, period_complete_day): """ Operating system stats """ - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - dimensions="ga:operatingSystem,ga:operatingSystemVersion", - max_results=10000, - end_date=end_date).execute() + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = dict( ids='ga:' + self.profile_id, + metrics='ga:pageviews', + sort='-ga:pageviews', + dimensions="ga:operatingSystem,ga:operatingSystemVersion", + max_results=10000) + args['start-date'] = start_date + args['end-date'] = end_date + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + result_data = results.get('rows') data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Operating Systems", data) + ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day) data = {} for result in result_data: if int(result[2]) >= MIN_VIEWS: key = "%s %s" % (result[0],result[1]) data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) - - - def _browser_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day) + + + def _browser_stats(self, start_date, end_date, period_name, period_complete_day): """ Information about browsers and browser versions """ - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - dimensions="ga:browser,ga:browserVersion", - max_results=10000, - end_date=end_date).execute() + + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = dict( ids='ga:' + self.profile_id, + metrics='ga:pageviews', + sort='-ga:pageviews', + dimensions="ga:browser,ga:browserVersion", + max_results=10000) + + args['start-date'] = start_date + args['end-date'] = end_date + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + + result_data = results.get('rows') # e.g. [u'Firefox', u'19.0', u'20'] @@ -340,14 +601,14 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Browsers", data) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) data = {} for result in result_data: key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) data[key] = data.get(key, 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Browser versions", data) + ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day) @classmethod def _filter_browser_version(cls, browser, version_str): @@ -371,30 +632,40 @@ ver = ver[0] + ver[1] + 'X' * num_hidden_digits return ver - def _mobile_stats(self, start_date, end_date, period_name): + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): """ Info about mobile devices """ - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", - max_results=10000, - end_date=end_date).execute() + try: + # Because of issues of invalid responses, we are going to make these requests + # ourselves. + headers = {'authorization': 'Bearer ' + self.token} + + args = dict( ids='ga:' + self.profile_id, + metrics='ga:pageviews', + sort='-ga:pageviews', + dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", + max_results=10000) + args['start-date'] = start_date + args['end-date'] = end_date + + results = self._get_json(args) + except Exception, e: + log.exception(e) + results = dict(url=[]) + result_data = results.get('rows') data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Mobile brands", data) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Mobile devices", data) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) @classmethod def _filter_out_long_tail(cls, data, threshold=10): --- a/ckanext/ga_report/ga_auth.py +++ b/ckanext/ga_report/ga_auth.py @@ -36,7 +36,7 @@ credentials = _prepare_credentials(token_file, credentials_file) http = credentials.authorize(http) # authorize the http object - return build('analytics', 'v3', http=http) + return credentials.access_token, build('analytics', 'v3', http=http) def get_profile_id(service): --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -9,6 +9,8 @@ import ckan.model as model from ckan.lib.base import * + +log = __import__('logging').getLogger(__name__) def make_uuid(): return unicode(uuid.uuid4()) @@ -27,7 +29,7 @@ Column('period_name', types.UnicodeText), Column('period_complete_day', types.Integer), Column('pageviews', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('url', types.UnicodeText), Column('department_id', types.UnicodeText), Column('package_id', types.UnicodeText), @@ -45,6 +47,7 @@ Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('period_name', types.UnicodeText), + Column('period_complete_day', types.UnicodeText), Column('stat_name', types.UnicodeText), Column('key', types.UnicodeText), Column('value', types.UnicodeText), ) @@ -63,7 +66,7 @@ Column('period_name', types.UnicodeText), Column('publisher_name', types.UnicodeText), Column('views', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('toplevel', types.Boolean, default=False), Column('subpublishercount', types.Integer, default=0), Column('parent', types.UnicodeText), @@ -111,12 +114,10 @@ >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') '/dataset/weekly_fuel_prices' ''' - # Deliberately leaving a / - url = url.replace('http:/','') - return '/' + '/'.join(url.split('/')[2:]) - - -def _get_department_id_of_url(url): + return url #'/' + '/'.join(url.split('/')[3:]) + + +def _get_package_and_publisher(url): # e.g. /dataset/fuel_prices # e.g. /dataset/fuel_prices/resource/e63380d4 dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) @@ -124,16 +125,17 @@ dataset_ref = dataset_match.groups()[0] dataset = model.Package.get(dataset_ref) if dataset: - publisher_groups = dataset.get_groups('publisher') + publisher_groups = dataset.get_groups('organization') if publisher_groups: - return publisher_groups[0].name + return dataset_ref,publisher_groups[0].name + return dataset_ref, None else: - publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) + publisher_match = re.match('/organization/([^/]+)(/.*)?', url) if publisher_match: - return publisher_match.groups()[0] - - -def update_sitewide_stats(period_name, stat_name, data): + return None, publisher_match.groups()[0] + return None, None + +def update_sitewide_stats(period_name, stat_name, data, period_complete_day): for k,v in data.iteritems(): item = model.Session.query(GA_Stat).\ filter(GA_Stat.period_name==period_name).\ @@ -143,11 +145,13 @@ item.period_name = period_name item.key = k item.value = v + item.period_complete_day = period_complete_day model.Session.add(item) else: # create the row values = {'id': make_uuid(), 'period_name': period_name, + 'period_complete_day': period_complete_day, 'key': k, 'value': v, 'stat_name': stat_name @@ -156,54 +160,114 @@ model.Session.commit() -def update_url_stat_totals(period_name): - +def pre_update_url_stats(period_name): + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name) + log.debug("Deleting %d '%s' records" % (q.count(), period_name)) + q.delete() + + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All') + log.debug("Deleting %d 'All' records..." % q.count()) + q.delete() + + model.Session.flush() + model.Session.commit() + model.repo.commit_and_remove() + log.debug('...done') + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. If not then create one. + + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. """ - items = model.Session.query(GA_Url).\ - filter(GA_Url.period_name != "All").\ - filter(GA_Url.url==url).all() + log.debug('Post-processing "All" records...') + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + progress_total = len(views.keys()) + progress_count = 0 + for key in views.keys(): + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(key) + values = {'id': make_uuid(), 'period_name': "All", - 'period_complete_day': "0", - 'url': url, - 'pageviews': sum([int(x.pageviews) for x in items]), - 'visitors': sum([int(x.visitors) for x in items]), - 'department_id': department_id, + 'period_complete_day': 0, + 'url': key, + 'pageviews': views[key], + 'visits': visits[key], + 'department_id': publisher, 'package_id': package - } + } model.Session.add(GA_Url(**values)) + model.Session.commit() + log.debug('..done') + + +def update_url_stats(period_name, period_complete_day, url_data): + ''' + Given a list of urls and number of hits for each during a given period, + stores them in GA_Url under the period and recalculates the totals for + the 'All' period. + ''' + progress_total = len(url_data) + progress_count = 0 + for url, views, visits in url_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(url) + + item = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name).\ + filter(GA_Url.url==url).first() + if item: + item.pageviews = item.pageviews + views + item.visits = item.visits + visits + if not item.package_id: + item.package_id = package + if not item.department_id: + item.department_id = publisher + model.Session.add(item) + else: + values = {'id': make_uuid(), + 'period_name': period_name, + 'period_complete_day': period_complete_day, + 'url': url, + 'pageviews': views, + 'visits': visits, + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) model.Session.commit() - """ - -def pre_update_url_stats(period_name): - model.Session.query(GA_Url).\ - filter(GA_Url.period_name==period_name).delete() - model.Session.query(GA_Url).\ - filter(GA_Url.period_name=='All').delete() - - -def update_url_stats(period_name, period_complete