From: David Read Date: Wed, 21 Nov 2012 14:29:53 +0000 Subject: Avoid problem when testing. X-Git-Url: https://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=4dc59ec64eb44f79fb66fe8347e2456cb1b5cae0 --- Avoid problem when testing. --- --- a/README.rst +++ b/README.rst @@ -32,11 +32,11 @@ googleanalytics.id = UA-1010101-1 googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics) + googleanalytics.token.filepath = ~/pyenv/token.dat ga-report.period = monthly - ga-report.bounce_url = /data + ga-report.bounce_url = / - The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data - but you may want to set this to /. + The ga-report.bounce_url specifies a particular path to record the bounce rate for. Typically it is / (the home page). 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file):: @@ -83,13 +83,17 @@ $ paster getauthtoken --config=../ckan/development.ini +Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini):: + + googleanalytics.token.filepath = ~/pyenv/token.dat + Tutorial -------- Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step:: - $ paster loadanalytics token.dat latest --config=../ckan/development.ini + $ paster loadanalytics latest --config=../ckan/development.ini The value after the token file is how much data you want to retrieve, this can be --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -1,5 +1,8 @@ import logging import datetime +import os + +from pylons import config from ckan.lib.cli import CkanCommand # No other CKAN imports allowed until _load_config is run, @@ -58,20 +61,17 @@ """Get data from Google Analytics API and save it in the ga_model - Usage: paster loadanalytics + Usage: paster loadanalytics - Where is the name of the auth token file from - the getauthtoken step. - - And where is: + Where is: all - data for all time latest - (default) just the 'latest' data YYYY-MM - just data for the specific month """ summary = __doc__.split('\n')[0] usage = __doc__ - max_args = 2 - min_args = 1 + max_args = 1 + min_args = 0 def __init__(self, name): super(LoadAnalytics, self).__init__(name) @@ -80,6 +80,11 @@ default=False, dest='delete_first', help='Delete data for the period first') + self.parser.add_option('-s', '--skip_url_stats', + action='store_true', + default=False, + dest='skip_url_stats', + help='Skip the download of URL data - just do site-wide stats') def command(self): self._load_config() @@ -87,18 +92,25 @@ from download_analytics import DownloadAnalytics from ga_auth import (init_service, get_profile_id) + ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) + if not ga_token_filepath: + print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ + 'Google Analytics token file under key: googleanalytics.token.filepath' + return + try: - svc = init_service(self.args[0], None) + svc = init_service(ga_token_filepath, None) except TypeError: print ('Have you correctly run the getauthtoken task and ' - 'specified the correct token file?') + 'specified the correct token file in the CKAN config under ' + '"googleanalytics.token.filepath"?') return downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc), - delete_first=self.options.delete_first) + delete_first=self.options.delete_first, + skip_url_stats=self.options.skip_url_stats) - time_period = self.args[1] if self.args and len(self.args) > 1 \ - else 'latest' + time_period = self.args[0] if self.args else 'latest' if time_period == 'all': downloader.all_() elif time_period == 'latest': --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -71,13 +71,13 @@ entries = q.order_by('ga_stat.key').all() def clean_key(key, val): - if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']: + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']: val = "%.2f" % round(float(val), 2) if key == 'Average time on site': mins, secs = divmod(float(val), 60) hours, mins = divmod(mins, 60) val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) - if key in ['New visits','Bounces']: + if key in ['New visits','Bounce rate (home page)']: val = "%s%%" % val if key in ['Total page views', 'Total visits']: val = int(val) @@ -232,7 +232,7 @@ return render('ga_report/publisher/index.html') def _get_packages(self, publisher=None, count=-1): - '''Returns the datasets in order of visits''' + '''Returns the datasets in order of views''' if count == -1: count = sys.maxint @@ -244,11 +244,11 @@ if publisher: q = q.filter(GA_Url.department_id==publisher.name) q = q.filter(GA_Url.period_name==month) - q = q.order_by('ga_url.visitors::int desc') + q = q.order_by('ga_url.pageviews::int desc') top_packages = [] for entry,package in q.limit(count): if package: - top_packages.append((package, entry.pageviews, entry.visitors)) + top_packages.append((package, entry.pageviews, entry.visits)) else: log.warning('Could not find package associated package') @@ -306,13 +306,13 @@ month = c.month or 'All' connection = model.Session.connection() q = """ - select department_id, sum(pageviews::int) views, sum(visitors::int) visits + select department_id, sum(pageviews::int) views, sum(visits::int) visits from ga_url where department_id <> '' and package_id <> '' and url like '/dataset/%%' and period_name=%s - group by department_id order by visits desc + group by department_id order by views desc """ if limit: q = q + " limit %s;" % (limit) @@ -329,7 +329,7 @@ def _get_publishers(): ''' Returns a list of all publishers. Each item is a tuple: - (names, title) + (name, title) ''' publishers = [] for pub in model.Session.query(model.Group).\ --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -17,11 +17,13 @@ class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None, delete_first=False): + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id self.delete_first = delete_first + self.skip_url_stats = skip_url_stats def specific_month(self, date): import calendar @@ -92,33 +94,41 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: + log.info('Period "%s" (%s - %s)', + self.get_full_period_name(period_name, period_complete_day), + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + if self.delete_first: - log.info('Deleting existing Analytics for period "%s"', + log.info('Deleting existing Analytics for this period "%s"', period_name) ga_model.delete(period_name) - log.info('Downloading Analytics for period "%s" (%s - %s)', - self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) - - # Clean up the entries before we run this - ga_model.pre_update_url_stats(period_name) - - accountName = config.get('googleanalytics.account') - - data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data, ) - - data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data,) - - ga_model.update_publisher_stats(period_name) # about 30 seconds. + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + log.info('Aggregating datasets by publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + log.info('Downloading and storing analytics for site-wide stats') self.sitewide_stats( period_name ) + log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) @@ -152,8 +162,8 @@ start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visits' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -170,8 +180,12 @@ packages = [] for entry in results.get('rows'): (loc,pageviews,visits) = entry - url = _normalize_url('http:/' + loc) + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate continue packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) @@ -187,12 +201,10 @@ start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', '_locale_stats', '_browser_stats', '_mobile_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] + log.info('Downloading analytics for %s' % f.split('_')[1]) getattr(self, f)(start_date, end_date, period_name) def _get_results(result_data, f): @@ -207,8 +219,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') @@ -229,25 +241,27 @@ } ga_model.update_sitewide_stats(period_name, "Totals", data) - # Bounces from /data. This url is specified in configuration because - # for DGU we don't want /. - path = config.get('ga-report.bounce_url','/') - print path - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters='ga:pagePath=~%s$' % (path,), - start_date=start_date, - metrics='ga:bounces,ga:uniquePageviews', + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), + start_date=start_date, + metrics='ga:bounces,ga:pageviews', dimensions='ga:pagePath', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - for results in result_data: - if results[0] == path: - bounce, total = [float(x) for x in results[1:]] - pct = 100 * bounce/total - print "%d bounces from %d total == %s" % (bounce, total, pct) - ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct}) + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces, total = [float(x) for x in result_data[0][1:]] + pct = 100 * bounces/total + log.info('%d bounces from %d total == %s', bounces, total, pct) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': pct}) def _locale_stats(self, start_date, end_date, period_name): @@ -255,8 +269,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -279,8 +293,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:socialNetwork,ga:referralPath", max_results=10000, end_date=end_date).execute() @@ -298,8 +312,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -323,8 +337,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() @@ -372,8 +386,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -9,6 +9,8 @@ import ckan.model as model from ckan.lib.base import * + +log = __import__('logging').getLogger(__name__) def make_uuid(): return unicode(uuid.uuid4()) @@ -27,7 +29,7 @@ Column('period_name', types.UnicodeText), Column('period_complete_day', types.Integer), Column('pageviews', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('url', types.UnicodeText), Column('department_id', types.UnicodeText), Column('package_id', types.UnicodeText), @@ -63,7 +65,7 @@ Column('period_name', types.UnicodeText), Column('publisher_name', types.UnicodeText), Column('views', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('toplevel', types.Boolean, default=False), Column('subpublishercount', types.Integer, default=0), Column('parent', types.UnicodeText), @@ -111,9 +113,7 @@ >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') '/dataset/weekly_fuel_prices' ''' - # Deliberately leaving a / - url = url.replace('http:/','') - return '/' + '/'.join(url.split('/')[2:]) + return '/' + '/'.join(url.split('/')[3:]) def _get_package_and_publisher(url): @@ -157,25 +157,6 @@ model.Session.commit() -def update_url_stat_totals(period_name): - - """ - items = model.Session.query(GA_Url).\ - filter(GA_Url.period_name != "All").\ - filter(GA_Url.url==url).all() - values = {'id': make_uuid(), - 'period_name': "All", - 'period_complete_day': "0", - 'url': url, - 'pageviews': sum([int(x.pageviews) for x in items]), - 'visitors': sum([int(x.visitors) for x in items]), - 'department_id': department_id, - 'package_id': package - } - model.Session.add(GA_Url(**values)) - model.Session.commit() - """ - def pre_update_url_stats(period_name): model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).delete() @@ -184,16 +165,25 @@ def update_url_stats(period_name, period_complete_day, url_data): - - for url, views, visitors in url_data: + ''' + Given a list of urls and number of hits for each during a given period, + stores them in GA_Url under the period and recalculates the totals for + the 'All' period. + ''' + for url, views, visits in url_data: package, publisher = _get_package_and_publisher(url) + item = model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).\ filter(GA_Url.url==url).first() if item: item.pageviews = item.pageviews + views - item.visitors = item.visitors + visitors + item.visits = item.visits + visits + if not item.package_id: + item.package_id = package + if not item.department_id: + item.department_id = publisher model.Session.add(item) else: values = {'id': make_uuid(), @@ -201,7 +191,7 @@ 'period_complete_day': period_complete_day, 'url': url, 'pageviews': views, - 'visitors': visitors, + 'visits': visits, 'department_id': publisher, 'package_id': package } @@ -209,6 +199,13 @@ model.Session.commit() if package: + old_pageviews, old_visits = 0, 0 + old = model.Session.query(GA_Url).\ + filter(GA_Url.period_name=='All').\ + filter(GA_Url.url==url).all() + old_pageviews = sum([int(o.pageviews) for o in old]) + old_visits = sum([int(o.visits) for o in old]) + entries = model.Session.query(GA_Url).\ filter(GA_Url.period_name!='All').\ filter(GA_Url.url==url).all() @@ -216,15 +213,14 @@ 'period_name': 'All', 'period_complete_day': 0, 'url': url, - 'pageviews': sum([int(e.pageviews) for e in entries]), - 'visitors': sum([int(e.visitors) for e in entries]), + 'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews, + 'visits': sum([int(e.visits or 0) for e in entries]) + old_visits, 'department_id': publisher, 'package_id': package } model.Session.add(GA_Url(**values)) model.Session.commit() - @@ -268,7 +264,7 @@ filter(model.Group.type=='publisher').\ filter(model.Group.state=='active').all() for publisher in publishers: - views, visitors, subpub = update_publisher(period_name, publisher, publisher.name) + views, visits, subpub = update_publisher(period_name, publisher, publisher.name) parent, parents = '', publisher.get_groups('publisher') if parents: parent = parents[0].name @@ -277,7 +273,7 @@ filter(GA_Publisher.publisher_name==publisher.name).first() if item: item.views = views - item.visitors = visitors + item.visits = visits item.publisher_name = publisher.name item.toplevel = publisher in toplevel item.subpublishercount = subpub @@ -289,7 +285,7 @@ 'period_name': period_name, 'publisher_name': publisher.name, 'views': views, - 'visitors': visitors, + 'visits': visits, 'toplevel': publisher in toplevel, 'subpublishercount': subpub, 'parent': parent @@ -299,7 +295,7 @@ def update_publisher(period_name, pub, part=''): - views,visitors,subpub = 0, 0, 0 + views,visits,subpub = 0, 0, 0 for publisher in go_down_tree(pub): subpub = subpub + 1 items = model.Session.query(GA_Url).\ @@ -307,9 +303,9 @@ filter(GA_Url.department_id==publisher.name).all() for item in items: views = views + int(item.pageviews) - visitors = visitors + int(item.visitors) - - return views, visitors, (subpub-1) + visits = visits + int(item.visits) + + return views, visits, (subpub-1) def get_top_level(): @@ -349,3 +345,34 @@ q.delete() model.Session.commit() +def get_score_for_dataset(dataset_name): + ''' + Returns a "current popularity" score for a dataset, + based on how many views it has had recently. + ''' + import datetime + now = datetime.datetime.now() + last_month = now - datetime.timedelta(days=30) + period_names = ['%s-%02d' % (last_month.year, last_month.month), + '%s-%02d' % (now.year, now.month), + ] + + score = 0 + for period_name in period_names: + score /= 2 # previous periods are discounted by 50% + entry = model.Session.query(GA_Url)\ + .filter(GA_Url.period_name==period_name)\ + .filter(GA_Url.package_id==dataset_name).first() + # score + if entry: + views = float(entry.pageviews) + if entry.period_complete_day: + views_per_day = views / entry.period_complete_day + else: + views_per_day = views / 15 # guess + score += views_per_day + + score = int(score * 100) + log.debug('Popularity %s: %s', score, dataset_name) + return score + --- a/ckanext/ga_report/helpers.py +++ b/ckanext/ga_report/helpers.py @@ -50,9 +50,12 @@ dataset = model.Package.get(ga_url.url[len('/dataset/'):]) if dataset and not dataset.state == 'active': dataset = None - count += 1 - if count > 10: - break + # When testing, it is possible that top datasets are not available + # so only go round this loop a few times before falling back on + # a random dataset. + count += 1 + if count > 10: + break if not dataset: # fallback dataset = model.Session.query(model.Package)\ @@ -60,7 +63,8 @@ if not dataset: return None dataset_dict = get_action('package_show')({'model': model, - 'session': model.Session}, + 'session': model.Session, + 'validate': False}, {'id':dataset.id}) return dataset_dict @@ -105,7 +109,7 @@ if not p in datasets: datasets[p] = {'views':0, 'visits': 0} datasets[p]['views'] = datasets[p]['views'] + int(entry.pageviews) - datasets[p]['visits'] = datasets[p]['visits'] + int(entry.visitors) + datasets[p]['visits'] = datasets[p]['visits'] + int(entry.visits) results = [] for k, v in datasets.iteritems(): --- a/ckanext/ga_report/templates/ga_report/notes.html +++ b/ckanext/ga_report/templates/ga_report/notes.html @@ -6,11 +6,10 @@
  • Notes

      -
    • 'Views' is the number of sessions during which that page was viewed one or more times ('Unique Pageviews').
    • - -
    • 'Visitors' is the number of unique users visiting the site (whether once or more times).
    • +
    • "Views" is the number of times a page was loaded in users' browsers.
    • +
    • "Visits" is the number of unique user visits to a page, counted once for each visitor for each of their browsing sessions.
    • These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.
    • -
    • The results for only small numbers of views/visits are not shown. Where these relate to site pages, then they are available in full in the CSV download. Where these relate to users' web browser information, they are not disclosed, for privacy reasons.
    • +
    • The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.
  • --- a/ckanext/ga_report/templates/ga_report/publisher/index.html +++ b/ckanext/ga_report/templates/ga_report/publisher/index.html @@ -41,14 +41,14 @@ - + - + --- a/ckanext/ga_report/templates/ga_report/publisher/read.html +++ b/ckanext/ga_report/templates/ga_report/publisher/read.html @@ -47,14 +47,14 @@
    PublisherDataset Visits Dataset Views
    ${h.link_to(publisher.title, h.url_for(controller='ckanext.ga_report.controller:GaDatasetReport', action='read_publisher', id=publisher.name))} ${visits} ${views}
    - + - + --- /dev/null +++ b/ckanext/ga_report/tests/test_model.py @@ -1,1 +1,18 @@ +from nose.tools import assert_equal +from ckanext.ga_report.ga_model import _normalize_url + +class TestNormalizeUrl: + def test_normal(self): + assert_equal(_normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices'), + '/dataset/weekly_fuel_prices') + + def test_www_dot(self): + assert_equal(_normalize_url('http://www.data.gov.uk/dataset/weekly_fuel_prices'), + '/dataset/weekly_fuel_prices') + + def test_https(self): + assert_equal(_normalize_url('https://data.gov.uk/dataset/weekly_fuel_prices'), + '/dataset/weekly_fuel_prices') + +
    DatasetVisits Views
    ${h.link_to(package.title or package.name, h.url_for(controller='package', action='read', id=package.name))} ${visits} ${views}