From: David Read Date: Wed, 07 Nov 2012 13:38:29 +0000 Subject: Tidy logging. X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=753e746cceb78d2cb57df91505b36e76fa4ad38e --- Tidy logging. --- --- a/README.rst +++ b/README.rst @@ -33,6 +33,10 @@ googleanalytics.id = UA-1010101-1 googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics) ga-report.period = monthly + ga-report.bounce_url = /data + + The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data + but you may want to set this to /. 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file):: --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -22,8 +22,9 @@ def _month_details(cls): + '''Returns a list of all the month names''' months = [] - vals = model.Session.query(cls.period_name).distinct().all() + vals = model.Session.query(cls.period_name).filter(cls.period_name!='All').distinct().all() for m in vals: months.append( (m[0], _get_month_name(m[0]))) return sorted(months, key=operator.itemgetter(0), reverse=True) @@ -70,15 +71,15 @@ entries = q.order_by('ga_stat.key').all() def clean_key(key, val): - if key in ['Average time on site', 'Pages per visit', 'New visits']: + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']: val = "%.2f" % round(float(val), 2) if key == 'Average time on site': mins, secs = divmod(float(val), 60) hours, mins = divmod(mins, 60) val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) - if key == 'New visits': + if key in ['New visits','Bounces']: val = "%s%%" % val - if key in ['Bounces', 'Total page views', 'Total visits']: + if key in ['Total page views', 'Total visits']: val = int(val) return key, val @@ -93,11 +94,12 @@ 
for e in entries: d[e.key].append(float(e.value)) for k, v in d.iteritems(): - if k in ['Bounces', 'Total page views', 'Total visits']: + if k in ['Total page views', 'Total visits']: v = sum(v) else: v = float(sum(v))/len(v) key, val = clean_key(k,v) + c.global_totals.append((key, val)) c.global_totals = sorted(c.global_totals, key=operator.itemgetter(0)) @@ -134,29 +136,7 @@ c.social_referrer_totals.append((shorten_name(entry[0]), fill_out_url(entry[0]),'', entry[1])) - - browser_version_re = re.compile("(.*)\((.*)\)") for k, v in keys.iteritems(): - - def clean_field(key): - if k != 'Browser versions': - return key - m = browser_version_re.match(key) - browser = m.groups()[0].strip() - ver = m.groups()[1] - parts = ver.split('.') - if len(parts) > 1: - if parts[1][0] == '0': - ver = parts[0] - else: - ver = "%s.%s" % (parts[0],parts[1]) - if browser in ['Safari','Android Browser']: # Special case complex version nums - ver = parts[0] - if len(ver) > 2: - ver = "%s%sX" % (ver[0], ver[1]) - - return "%s (%s)" % (browser, ver,) - q = model.Session.query(GA_Stat).\ filter(GA_Stat.stat_name==k) if c.month: @@ -172,17 +152,13 @@ entries.append((key,val,)) entries = sorted(entries, key=operator.itemgetter(1), reverse=True) - def percent(num, total): - p = 100 * float(num)/float(total) - return "%.2f%%" % round(p, 2) - # Get the total for each set of values and then set the value as # a percentage of the total if k == 'Social sources': total = sum([x for n,x in c.global_totals if n == 'Total visits']) else: total = sum([num for _,num in entries]) - setattr(c, v, [(k,percent(v,total)) for k,v in entries ]) + setattr(c, v, [(k,_percent(v,total)) for k,v in entries ]) return render('ga_report/site/index.html') @@ -261,43 +237,23 @@ if count == -1: count = sys.maxint - q = model.Session.query(GA_Url)\ + month = c.month or 'All' + + q = model.Session.query(GA_Url,model.Package)\ + .filter(model.Package.name==GA_Url.package_id)\ .filter(GA_Url.url.like('/dataset/%')) if 
publisher: q = q.filter(GA_Url.department_id==publisher.name) - if c.month: - q = q.filter(GA_Url.period_name==c.month) + q = q.filter(GA_Url.period_name==month) q = q.order_by('ga_url.visitors::int desc') - - if c.month: - top_packages = [] - for entry in q.limit(count): - package_name = entry.url[len('/dataset/'):] - p = model.Package.get(package_name) - if p: - top_packages.append((p, entry.pageviews, entry.visitors)) - else: - log.warning('Could not find package "%s"', package_name) - else: - ds = {} - for entry in q: - if len(ds) >= count: - break - package_name = entry.url[len('/dataset/'):] - p = model.Package.get(package_name) - if p: - if not p in ds: - ds[p] = {'views': 0, 'visits': 0} - ds[p]['views'] = ds[p]['views'] + int(entry.pageviews) - ds[p]['visits'] = ds[p]['visits'] + int(entry.visitors) - else: - log.warning('Could not find package "%s"', package_name) - - results = [] - for k, v in ds.iteritems(): - results.append((k,v['views'],v['visits'])) - - top_packages = sorted(results, key=operator.itemgetter(1), reverse=True) + top_packages = [] + + for entry,package in q.limit(count): + if package: + top_packages.append((package, entry.pageviews, entry.visitors)) + else: + log.warning('Could not find package associated package') + return top_packages def read(self): @@ -333,15 +289,12 @@ else: c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) + month = c.mnth or 'All' c.publisher_page_views = 0 q = model.Session.query(GA_Url).\ filter(GA_Url.url=='/publisher/%s' % c.publisher_name) - if c.month: - entry = q.filter(GA_Url.period_name==c.month).first() - c.publisher_page_views = entry.pageviews if entry else 0 - else: - for e in q.all(): - c.publisher_page_views = c.publisher_page_views + int(e.pageviews) + entry = q.filter(GA_Url.period_name==c.month).first() + c.publisher_page_views = entry.pageviews if entry else 0 c.top_packages = self._get_packages(c.publisher, 20) @@ -393,3 +346,7 @@ publishers.append((pub.name, pub.title)) return 
publishers +def _percent(num, total): + p = 100 * float(num)/float(total) + return "%.2f%%" % round(p, 2) + --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -92,27 +92,34 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: + log.info('Period "%s" (%s - %s)', + self.get_full_period_name(period_name, period_complete_day), + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + if self.delete_first: - log.info('Deleting existing Analytics for period "%s"', + log.info('Deleting existing Analytics for this period "%s"', period_name) ga_model.delete(period_name) - log.info('Downloading Analytics for period "%s" (%s - %s)', - self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) + + log.info('Downloading analytics for dataset views') data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+') - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data, ) + log.info('Downloading analytics for publisher views') data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+') - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) + log.info('Storing publisher views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data,) + log.info('Aggregating datasets by publisher') ga_model.update_publisher_stats(period_name) # about 30 seconds. 
+ + log.info('Downloading and storing analytics for site-wide stats') self.sitewide_stats( period_name ) + log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) def update_social_info(self, period_name, start_date, end_date): @@ -146,7 +153,7 @@ start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visitors' + metrics = 'ga:uniquePageviews, ga:visits' sort = '-ga:uniquePageviews' # Supported query params at @@ -160,11 +167,6 @@ dimensions="ga:pagePath", max_results=10000, end_date=end_date).execute() - - if os.getenv('DEBUG'): - import pprint - pprint.pprint(results) - print 'Total results: %s' % results.get('totalResults') packages = [] for entry in results.get('rows'): @@ -183,12 +185,10 @@ start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', '_locale_stats', '_browser_stats', '_mobile_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] + log.info('Downloading analytics for %s' % f.split('_')[1]) getattr(self, f)(start_date, end_date, period_name) def _get_results(result_data, f): @@ -213,18 +213,37 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits,ga:visitors', + metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') data = { 'Pages per visit': result_data[0][0], - 'Bounces': result_data[0][1], - 'Average time on site': result_data[0][2], - 'New visits': result_data[0][3], - 'Total visits': result_data[0][4], + 'Average time on site': result_data[0][1], + 'New visits': 
result_data[0][2], + 'Total visits': result_data[0][3], } ga_model.update_sitewide_stats(period_name, "Totals", data) + + # Bounces from /data. This url is specified in configuration because + # for DGU we don't want /. + path = config.get('ga-report.bounce_url','/') + print path + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath=~%s$' % (path,), + start_date=start_date, + metrics='ga:bounces,ga:uniquePageviews', + dimensions='ga:pagePath', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + for results in result_data: + if results[0] == path: + bounce, total = [float(x) for x in results[1:]] + pct = 100 * bounce/total + print "%d bounces from %d total == %s" % (bounce, total, pct) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct}) def _locale_stats(self, start_date, end_date, period_name): --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -1,10 +1,10 @@ import re import uuid -from sqlalchemy import Table, Column, MetaData +from sqlalchemy import Table, Column, MetaData, ForeignKey from sqlalchemy import types from sqlalchemy.sql import select -from sqlalchemy.orm import mapper +from sqlalchemy.orm import mapper, relation from sqlalchemy import func import ckan.model as model @@ -14,8 +14,6 @@ return unicode(uuid.uuid4()) metadata = MetaData() - - class GA_Url(object): @@ -32,6 +30,7 @@ Column('visitors', types.UnicodeText), Column('url', types.UnicodeText), Column('department_id', types.UnicodeText), + Column('package_id', types.UnicodeText), ) mapper(GA_Url, url_table) @@ -163,6 +162,10 @@ url = _normalize_url(url) department_id = _get_department_id_of_url(url) + package = None + if url.startswith('/dataset/'): + package = url[len('/dataset/'):] + # see if the row for this url & month is in the table already item = model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).\ @@ -172,6 +175,7 @@ item.pageviews = views 
item.visitors = visitors item.department_id = department_id + item.package_id = package model.Session.add(item) else: # create the row @@ -181,9 +185,31 @@ 'url': url, 'pageviews': views, 'visitors': visitors, - 'department_id': department_id + 'department_id': department_id, + 'package_id': package } model.Session.add(GA_Url(**values)) + + # We now need to recaculate the ALL time_period from the data we have + # Delete the old 'All' + old = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == "All").\ + filter(GA_Url.url==url).delete() + + items = model.Session.query(GA_Url).\ + filter(GA_Url.period_name != "All").\ + filter(GA_Url.url==url).all() + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': "0", + 'url': url, + 'pageviews': sum([int(x.pageviews) for x in items]), + 'visitors': sum([int(x.visitors) for x in items]), + 'department_id': department_id, + 'package_id': package + } + model.Session.add(GA_Url(**values)) + model.Session.commit() --- a/ckanext/ga_report/helpers.py +++ b/ckanext/ga_report/helpers.py @@ -1,7 +1,9 @@ import logging import operator + import ckan.lib.base as base import ckan.model as model +from ckan.logic import get_action from ckanext.ga_report.ga_model import GA_Url, GA_Publisher from ckanext.ga_report.controller import _get_publishers @@ -39,25 +41,38 @@ order_by('ga_url.pageviews::int desc') num_top_datasets = top_datasets.count() + dataset = None if num_top_datasets: - dataset = None + count = 0 while not dataset: rand = random.randrange(0, min(top, num_top_datasets)) ga_url = top_datasets[rand] dataset = model.Package.get(ga_url.url[len('/dataset/'):]) if dataset and not dataset.state == 'active': dataset = None - else: + count += 1 + if count > 10: + break + if not dataset: + # fallback dataset = model.Session.query(model.Package)\ .filter_by(state='active').first() - publisher = dataset.get_groups('publisher')[0] - return { - 'dataset': dataset, - 'publisher': publisher - } + if not dataset: 
+ return None + dataset_dict = get_action('package_show')({'model': model, + 'session': model.Session}, + {'id':dataset.id}) + return dataset_dict def single_popular_dataset_html(top=20): - context = single_popular_dataset(top) + dataset_dict = single_popular_dataset(top) + groups = package.get('groups', []) + publishers = [ g for g in groups if g.get('type') == 'publisher' ] + publisher = publishers[0] if publishers else {'name':'', 'title': ''} + context = { + 'dataset': dataset_dict, + 'publisher': publisher_dict + } return base.render_snippet('ga_report/ga_popular_single.html', **context) --- a/ckanext/ga_report/templates/ga_report/ga_popular_single.html +++ b/ckanext/ga_report/templates/ga_report/ga_popular_single.html @@ -8,14 +8,14 @@

Featured dataset

- -

${dataset.title}

+
+

${dataset['title']}

Publisher : - ${publisher.title} + ${publisher['title']}

-
${h.truncate(dataset.notes, length=200, whole_word=True)}
+
${h.truncate(dataset['notes_rendered'], length=200, whole_word=True)}
Other popular datasets --- a/ckanext/ga_report/templates/ga_report/notes.html +++ b/ckanext/ga_report/templates/ga_report/notes.html @@ -6,11 +6,11 @@
  • Notes

      -
    • 'Views' is the number of sessions during which that page was viewed one or more times ('Unique Pageviews').
    • - -
    • 'Visitors' is the number of unique users visiting the site (whether once or more times).
    • +
    • "Views" is the number of sessions during which the page was viewed one or more times (technically known as "unique pageviews").
    • +
    • "Visits" is the number of unique user visits to a page, counted once for each visitor for each session.
    • +
    • These usage statistics are confined to users with JavaScript enabled, which excludes web crawlers and API calls.
    • -
    • The results for only small numbers of views/visits are not shown. Where these relate to site pages, then they are available in full in the CSV download. Where these relate to users' web browser information, they are not disclosed, for privacy reasons.
    • +
    • The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.