Fixed merge conflict
--- a/README.rst
+++ b/README.rst
@@ -33,10 +33,9 @@
googleanalytics.id = UA-1010101-1
googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics)
ga-report.period = monthly
- ga-report.bounce_url = /data
+ ga-report.bounce_url = /
- The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data
- but you may want to set this to /.
+ The ga-report.bounce_url specifies the page path whose bounce rate is recorded. Typically this is / (the home page).
3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file)::
--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -80,6 +80,11 @@
default=False,
dest='delete_first',
help='Delete data for the period first')
+ self.parser.add_option('-s', '--skip_url_stats',
+ action='store_true',
+ default=False,
+ dest='skip_url_stats',
+ help='Skip the download of URL data - just do site-wide stats')
def command(self):
self._load_config()
@@ -95,7 +100,8 @@
return
downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc),
- delete_first=self.options.delete_first)
+ delete_first=self.options.delete_first,
+ skip_url_stats=self.options.skip_url_stats)
time_period = self.args[1] if self.args and len(self.args) > 1 \
else 'latest'
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -17,11 +17,13 @@
class DownloadAnalytics(object):
'''Downloads and stores analytics info'''
- def __init__(self, service=None, profile_id=None, delete_first=False):
+ def __init__(self, service=None, profile_id=None, delete_first=False,
+ skip_url_stats=False):
self.period = config['ga-report.period']
self.service = service
self.profile_id = profile_id
self.delete_first = delete_first
+ self.skip_url_stats = skip_url_stats
def specific_month(self, date):
import calendar
@@ -102,25 +104,26 @@
period_name)
ga_model.delete(period_name)
- # Clean up the entries before we run this
- ga_model.pre_update_url_stats(period_name)
-
- accountName = config.get('googleanalytics.account')
-
- log.info('Downloading analytics for dataset views')
- data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
-
- log.info('Storing dataset views (%i rows)', len(data.get('url')))
- self.store(period_name, period_complete_day, data, )
-
- log.info('Downloading analytics for publisher views')
- data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
-
- log.info('Storing publisher views (%i rows)', len(data.get('url')))
- self.store(period_name, period_complete_day, data,)
-
- log.info('Aggregating datasets by publisher')
- ga_model.update_publisher_stats(period_name) # about 30 seconds.
+ if not self.skip_url_stats:
+ # Clean out old url data before storing the new
+ ga_model.pre_update_url_stats(period_name)
+
+ accountName = config.get('googleanalytics.account')
+
+ log.info('Downloading analytics for dataset views')
+ data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
+
+ log.info('Storing dataset views (%i rows)', len(data.get('url')))
+ self.store(period_name, period_complete_day, data, )
+
+ log.info('Downloading analytics for publisher views')
+ data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
+
+ log.info('Storing publisher views (%i rows)', len(data.get('url')))
+ self.store(period_name, period_complete_day, data,)
+
+ log.info('Aggregating datasets by publisher')
+ ga_model.update_publisher_stats(period_name) # about 30 seconds.
log.info('Downloading and storing analytics for site-wide stats')
self.sitewide_stats( period_name )
@@ -177,8 +180,12 @@
packages = []
for entry in results.get('rows'):
(loc,pageviews,visits) = entry
- url = _normalize_url('http:/' + loc)
+ url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
+
if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
+ # filter out strays like:
+ # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
+ # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
continue
packages.append( (url, pageviews, visits,) ) # Temporary hack
return dict(url=packages)
@@ -234,25 +241,27 @@
}
ga_model.update_sitewide_stats(period_name, "Totals", data)
- # Bounces from /data. This url is specified in configuration because
- # for DGU we don't want /.
- path = config.get('ga-report.bounce_url','/')
- print path
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- filters='ga:pagePath=~%s$' % (path,),
+ # Bounces from / or another configurable page.
+ path = '/%s%s' % (config.get('googleanalytics.account'),
+ config.get('ga-report.bounce_url', '/'))
+ results = self.service.data().ga().get(
+ ids='ga:' + self.profile_id,
+ filters='ga:pagePath==%s' % (path,),
start_date=start_date,
metrics='ga:bounces,ga:pageviews',
dimensions='ga:pagePath',
max_results=10000,
end_date=end_date).execute()
result_data = results.get('rows')
- for results in result_data:
- if results[0] == path:
- bounce, total = [float(x) for x in results[1:]]
- pct = 100 * bounce/total
- print "%d bounces from %d total == %s" % (bounce, total, pct)
- ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct})
+ if len(result_data) != 1:
+ log.error('Could not pinpoint the bounces for path: %s. Got results: %r',
+ path, result_data)
+ return
+ results = result_data[0]
+ bounces, total = [float(x) for x in result_data[0][1:]]
+ pct = 100 * bounces/total
+ log.info('%d bounces from %d total == %s', bounces, total, pct)
+ ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate': pct})
def _locale_stats(self, start_date, end_date, period_name):
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -111,9 +111,7 @@
>>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
'/dataset/weekly_fuel_prices'
'''
- # Deliberately leaving a /
- url = url.replace('http:/','')
- return '/' + '/'.join(url.split('/')[2:])
+ return '/' + '/'.join(url.split('/')[3:])
def _get_package_and_publisher(url):
@@ -165,7 +163,11 @@
def update_url_stats(period_name, period_complete_day, url_data):
-
+ '''
+ Given a list of urls and number of hits for each during a given period,
+ stores them in GA_Url under the period and recalculates the totals for
+ the 'All' period.
+ '''
for url, views, visits in url_data:
package, publisher = _get_package_and_publisher(url)
--- a/ckanext/ga_report/helpers.py
+++ b/ckanext/ga_report/helpers.py
@@ -60,7 +60,8 @@
if not dataset:
return None
dataset_dict = get_action('package_show')({'model': model,
- 'session': model.Session},
+ 'session': model.Session,
+ 'validate': False},
{'id':dataset.id})
return dataset_dict
--- /dev/null
+++ b/ckanext/ga_report/tests/test_model.py
@@ -1,1 +1,18 @@
+from nose.tools import assert_equal
+from ckanext.ga_report.ga_model import _normalize_url
+
+class TestNormalizeUrl:
+ def test_normal(self):
+ assert_equal(_normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices'),
+ '/dataset/weekly_fuel_prices')
+
+ def test_www_dot(self):
+ assert_equal(_normalize_url('http://www.data.gov.uk/dataset/weekly_fuel_prices'),
+ '/dataset/weekly_fuel_prices')
+
+ def test_https(self):
+ assert_equal(_normalize_url('https://data.gov.uk/dataset/weekly_fuel_prices'),
+ '/dataset/weekly_fuel_prices')
+
+