Avoid problem when testing.
--- a/README.rst
+++ b/README.rst
@@ -32,6 +32,7 @@
googleanalytics.id = UA-1010101-1
googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics)
+ googleanalytics.token.filepath = ~/pyenv/token.dat
ga-report.period = monthly
ga-report.bounce_url = /
@@ -82,13 +83,17 @@
$ paster getauthtoken --config=../ckan/development.ini
+Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini)::
+
+ googleanalytics.token.filepath = ~/pyenv/token.dat
+
Tutorial
--------
Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step::
- $ paster loadanalytics token.dat latest --config=../ckan/development.ini
+ $ paster loadanalytics latest --config=../ckan/development.ini
The value after the token file is how much data you want to retrieve, this can be
--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -1,5 +1,8 @@
import logging
import datetime
+import os
+
+from pylons import config
from ckan.lib.cli import CkanCommand
# No other CKAN imports allowed until _load_config is run,
@@ -58,20 +61,17 @@
"""Get data from Google Analytics API and save it
in the ga_model
- Usage: paster loadanalytics <tokenfile> <time-period>
+ Usage: paster loadanalytics <time-period>
- Where <tokenfile> is the name of the auth token file from
- the getauthtoken step.
-
- And where <time-period> is:
+ Where <time-period> is:
all - data for all time
latest - (default) just the 'latest' data
YYYY-MM - just data for the specific month
"""
summary = __doc__.split('\n')[0]
usage = __doc__
- max_args = 2
- min_args = 1
+ max_args = 1
+ min_args = 0
def __init__(self, name):
super(LoadAnalytics, self).__init__(name)
@@ -80,7 +80,7 @@
default=False,
dest='delete_first',
help='Delete data for the period first')
- self.parser.add_option('-s', '--slip_url_stats',
+ self.parser.add_option('-s', '--skip_url_stats',
action='store_true',
default=False,
dest='skip_url_stats',
@@ -92,19 +92,25 @@
from download_analytics import DownloadAnalytics
from ga_auth import (init_service, get_profile_id)
+ ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
+ if not ga_token_filepath:
+ print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
+ 'Google Analytics token file under key: googleanalytics.token.filepath'
+ return
+
try:
- svc = init_service(self.args[0], None)
+ svc = init_service(ga_token_filepath, None)
except TypeError:
print ('Have you correctly run the getauthtoken task and '
- 'specified the correct token file?')
+ 'specified the correct token file in the CKAN config under '
+ '"googleanalytics.token.filepath"?')
return
downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc),
delete_first=self.options.delete_first,
skip_url_stats=self.options.skip_url_stats)
- time_period = self.args[1] if self.args and len(self.args) > 1 \
- else 'latest'
+ time_period = self.args[0] if self.args else 'latest'
if time_period == 'all':
downloader.all_()
elif time_period == 'latest':
--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -71,13 +71,13 @@
entries = q.order_by('ga_stat.key').all()
def clean_key(key, val):
- if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']:
+ if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']:
val = "%.2f" % round(float(val), 2)
if key == 'Average time on site':
mins, secs = divmod(float(val), 60)
hours, mins = divmod(mins, 60)
val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val)
- if key in ['New visits','Bounces']:
+ if key in ['New visits','Bounce rate (home page)']:
val = "%s%%" % val
if key in ['Total page views', 'Total visits']:
val = int(val)
@@ -232,7 +232,7 @@
return render('ga_report/publisher/index.html')
def _get_packages(self, publisher=None, count=-1):
- '''Returns the datasets in order of visits'''
+ '''Returns the datasets in order of views'''
if count == -1:
count = sys.maxint
@@ -244,7 +244,7 @@
if publisher:
q = q.filter(GA_Url.department_id==publisher.name)
q = q.filter(GA_Url.period_name==month)
- q = q.order_by('ga_url.visits::int desc')
+ q = q.order_by('ga_url.pageviews::int desc')
top_packages = []
for entry,package in q.limit(count):
if package:
@@ -312,7 +312,7 @@
and package_id <> ''
and url like '/dataset/%%'
and period_name=%s
- group by department_id order by visits desc
+ group by department_id order by views desc
"""
if limit:
q = q + " limit %s;" % (limit)
@@ -329,7 +329,7 @@
def _get_publishers():
'''
Returns a list of all publishers. Each item is a tuple:
- (names, title)
+ (name, title)
'''
publishers = []
for pub in model.Session.query(model.Group).\
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -253,7 +253,7 @@
max_results=10000,
end_date=end_date).execute()
result_data = results.get('rows')
- if len(result_data) != 1:
+ if not result_data or len(result_data) != 1:
log.error('Could not pinpoint the bounces for path: %s. Got results: %r',
path, result_data)
return
@@ -261,7 +261,7 @@
bounces, total = [float(x) for x in result_data[0][1:]]
pct = 100 * bounces/total
log.info('%d bounces from %d total == %s', bounces, total, pct)
- ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate': pct})
+ ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': pct})
def _locale_stats(self, start_date, end_date, period_name):
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -9,6 +9,8 @@
import ckan.model as model
from ckan.lib.base import *
+
+log = __import__('logging').getLogger(__name__)
def make_uuid():
return unicode(uuid.uuid4())
@@ -212,7 +214,7 @@
'period_complete_day': 0,
'url': url,
'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews,
- 'visits': sum([int(e.visits) for e in entries]) + old_visits,
+ 'visits': sum([int(e.visits or 0) for e in entries]) + old_visits,
'department_id': publisher,
'package_id': package
}
@@ -343,3 +345,34 @@
q.delete()
model.Session.commit()
+def get_score_for_dataset(dataset_name):
+    '''
+    Returns a "current popularity" score for a dataset: 100 x (views per
+    day this month + half the views per day of the previous month).
+    '''
+    import datetime
+    now = datetime.datetime.now()
+    # Step back from the 1st so we always land in the previous calendar
+    # month (a fixed 30 days can repeat this month or skip February).
+    last_month = now.replace(day=1) - datetime.timedelta(days=1)
+    period_names = ['%s-%02d' % (d.year, d.month) for d in (last_month, now)]
+
+    score = 0
+    for period_name in period_names:
+        score /= 2 # previous periods are discounted by 50%
+        entry = model.Session.query(GA_Url)\
+            .filter(GA_Url.period_name==period_name)\
+            .filter(GA_Url.package_id==dataset_name).first()
+        # No GA_Url row for this period/dataset adds nothing to the score
+        if entry:
+            views = float(entry.pageviews)
+            if entry.period_complete_day:
+                views_per_day = views / entry.period_complete_day
+            else:
+                views_per_day = views / 15 # guess
+            score += views_per_day
+
+    score = int(score * 100)
+    log.debug('Popularity %s: %s', score, dataset_name)
+    return score
+
--- a/ckanext/ga_report/helpers.py
+++ b/ckanext/ga_report/helpers.py
@@ -50,9 +50,12 @@
dataset = model.Package.get(ga_url.url[len('/dataset/'):])
if dataset and not dataset.state == 'active':
dataset = None
- count += 1
- if count > 10:
- break
+ # When testing, it is possible that top datasets are not available
+ # so only go round this loop a few times before falling back on
+ # a random dataset.
+ count += 1
+ if count > 10:
+ break
if not dataset:
# fallback
dataset = model.Session.query(model.Package)\
--- a/ckanext/ga_report/templates/ga_report/notes.html
+++ b/ckanext/ga_report/templates/ga_report/notes.html
@@ -6,8 +6,8 @@
<li class="widget-container boxed widget_text">
<h4>Notes</h4>
<ul>
- <li>"Views" is the number of sessions during which the page was viewed one or more times (technically known as "unique pageviews").</li>
- <li>"Visits" is the number of unique user visits to a page, counted once for each visitor for each session.</li>
+ <li>"Views" is the number of times a page was loaded in users' browsers.</li>
+ <li>"Visits" is the number of unique user visits to a page, counted once for each visitor for each of their browsing sessions.</li>
<li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li>
<li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li>
</ul>
--- a/ckanext/ga_report/templates/ga_report/publisher/index.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/index.html
@@ -41,14 +41,14 @@
<table class="table table-condensed table-bordered table-striped">
<tr>
<th>Publisher</th>
- <th>Dataset Visits</th>
+<!-- <th>Dataset Visits</th>-->
<th>Dataset Views</th>
</tr>
<py:for each="publisher, views, visits in c.top_publishers">
<tr>
<td>${h.link_to(publisher.title, h.url_for(controller='ckanext.ga_report.controller:GaDatasetReport', action='read_publisher', id=publisher.name))}
</td>
- <td>${visits}</td>
+<!-- <td>${visits}</td> -->
<td>${views}</td>
</tr>
</py:for>
--- a/ckanext/ga_report/templates/ga_report/publisher/read.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/read.html
@@ -47,14 +47,14 @@
<table py:if="c.top_packages" class="table table-condensed table-bordered table-striped">
<tr>
<th>Dataset</th>
- <th>Visits</th>
+<!-- <th>Visits</th> -->
<th>Views</th>
</tr>
<py:for each="package, views, visits in c.top_packages">
<tr>
<td>${h.link_to(package.title or package.name, h.url_for(controller='package', action='read', id=package.name))}
</td>
- <td>${visits}</td>
+<!-- <td>${visits}</td> -->
<td>${views}</td>
</tr>
</py:for>