From: David Read Date: Wed, 07 Nov 2012 18:18:56 +0000 Subject: Improved bounce rate stuff. X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=49fcdd2a96f6999b669ac4306a4810d9e0ff0d58 --- Improved bounce rate stuff. --- --- a/README.rst +++ b/README.rst @@ -33,10 +33,9 @@ googleanalytics.id = UA-1010101-1 googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics) ga-report.period = monthly - ga-report.bounce_url = /data + ga-report.bounce_url = / - The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data - but you may want to set this to /. + The ga-report.bounce_url specifies a particular path to record the bounce rate for. Typically it is / (the home page). 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file):: --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -80,6 +80,11 @@ default=False, dest='delete_first', help='Delete data for the period first') + self.parser.add_option('-s', '--slip_url_stats', + action='store_true', + default=False, + dest='skip_url_stats', + help='Skip the download of URL data - just do site-wide stats') def command(self): self._load_config() @@ -95,7 +100,8 @@ return downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc), - delete_first=self.options.delete_first) + delete_first=self.options.delete_first, + skip_url_stats=self.options.skip_url_stats) time_period = self.args[1] if self.args and len(self.args) > 1 \ else 'latest' --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -309,6 +309,8 @@ select department_id, sum(pageviews::int) views, sum(visitors::int) visits from ga_url where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' and period_name=%s group by department_id order by visits desc """ --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -17,11 +17,13 @@ class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None, delete_first=False): + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id self.delete_first = delete_first + self.skip_url_stats = skip_url_stats def specific_month(self, date): import calendar @@ -92,33 +94,41 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: + log.info('Period "%s" (%s - %s)', + self.get_full_period_name(period_name, period_complete_day), + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + if self.delete_first: - log.info('Deleting existing Analytics for period "%s"', + log.info('Deleting existing Analytics for this period "%s"', period_name) ga_model.delete(period_name) - log.info('Downloading Analytics for period "%s" (%s - %s)', - self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) - - # Clean up the entries before we run this - ga_model.pre_update_url_stats(period_name) - - accountName = config.get('googleanalytics.account') - - data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data, ) - - data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data,) - - ga_model.update_publisher_stats(period_name) # about 30 seconds. + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + log.info('Aggregating datasets by publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + log.info('Downloading and storing analytics for site-wide stats') self.sitewide_stats( period_name ) + log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) @@ -170,8 +180,12 @@ packages = [] for entry in results.get('rows'): (loc,pageviews,visits) = entry - url = _normalize_url('http:/' + loc) + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate continue packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) @@ -187,12 +201,10 @@ start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', '_locale_stats', '_browser_stats', '_mobile_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] + log.info('Downloading analytics for %s' % f.split('_')[1]) getattr(self, f)(start_date, end_date, period_name) def _get_results(result_data, f): @@ -229,25 +241,27 @@ } ga_model.update_sitewide_stats(period_name, "Totals", data) - # Bounces from /data. This url is specified in configuration because - # for DGU we don't want /. - path = config.get('ga-report.bounce_url','/') - print path - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters='ga:pagePath=~%s$' % (path,), + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), start_date=start_date, metrics='ga:bounces,ga:uniquePageviews', dimensions='ga:pagePath', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - for results in result_data: - if results[0] == path: - bounce, total = [float(x) for x in results[1:]] - pct = 100 * bounce/total - print "%d bounces from %d total == %s" % (bounce, total, pct) - ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct}) + if len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces, total = [float(x) for x in result_data[0][1:]] + pct = 100 * bounces/total + log.info('%d bounces from %d total == %s', bounces, total, pct) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate': pct}) def _locale_stats(self, start_date, end_date, period_name): --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -111,12 +111,10 @@ >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') '/dataset/weekly_fuel_prices' ''' - # Deliberately leaving a / - url = url.replace('http:/','') - return '/' + '/'.join(url.split('/')[2:]) - - -def _get_department_id_of_url(url): + return '/' + '/'.join(url.split('/')[3:]) + + +def _get_package_and_publisher(url): # e.g. /dataset/fuel_prices # e.g. /dataset/fuel_prices/resource/e63380d4 dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) @@ -126,12 +124,13 @@ if dataset: publisher_groups = dataset.get_groups('publisher') if publisher_groups: - return publisher_groups[0].name + return dataset_ref,publisher_groups[0].name + return dataset_ref, None else: publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) if publisher_match: - return publisher_match.groups()[0] - + return None, publisher_match.groups()[0] + return None, None def update_sitewide_stats(period_name, stat_name, data): for k,v in data.iteritems(): @@ -183,27 +182,47 @@ def update_url_stats(period_name, period_complete_day, url_data): - + ''' + Given a list of urls and number of hits for each during a given period, + stores them in GA_Url under the period and recalculates the totals for + the 'All' period. + ''' for url, views, visitors in url_data: - department_id = _get_department_id_of_url(url) - - package = None - if url.startswith('/dataset/'): - package = url[len('/dataset/'):] - - values = {'id': make_uuid(), - 'period_name': period_name, - 'period_complete_day': period_complete_day, - 'url': url, - 'pageviews': views, - 'visitors': visitors, - 'department_id': department_id, - 'package_id': package - } - model.Session.add(GA_Url(**values)) + package, publisher = _get_package_and_publisher(url) + + + item = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name).\ + filter(GA_Url.url==url).first() + if item: + item.pageviews = item.pageviews + views + item.visitors = item.visitors + visitors + if not item.package_id: + item.package_id = package + if not item.department_id: + item.department_id = publisher + model.Session.add(item) + else: + values = {'id': make_uuid(), + 'period_name': period_name, + 'period_complete_day': period_complete_day, + 'url': url, + 'pageviews': views, + 'visitors': visitors, + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) model.Session.commit() if package: + old_pageviews, old_visits = 0, 0 + old = model.Session.query(GA_Url).\ + filter(GA_Url.period_name=='All').\ + filter(GA_Url.url==url).all() + old_pageviews = sum([int(o.pageviews) for o in old]) + old_visits = sum([int(o.visitors) for o in old]) + entries = model.Session.query(GA_Url).\ filter(GA_Url.period_name!='All').\ filter(GA_Url.url==url).all() @@ -211,14 +230,14 @@ 'period_name': 'All', 'period_complete_day': 0, 'url': url, - 'pageviews': sum([int(e.pageviews) for e in entries]), - 'visitors': sum([int(e.visitors) for e in entries]), - 'department_id': department_id, + 'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews, + 'visitors': sum([int(e.visitors) for e in entries]) + old_visits, + 'department_id': publisher, 'package_id': package } + model.Session.add(GA_Url(**values)) model.Session.commit() - --- a/ckanext/ga_report/helpers.py +++ b/ckanext/ga_report/helpers.py @@ -60,7 +60,8 @@ if not dataset: return None dataset_dict = get_action('package_show')({'model': model, - 'session': model.Session}, + 'session': model.Session, + 'validate': False}, {'id':dataset.id}) return dataset_dict --- a/ckanext/ga_report/templates/ga_report/notes.html +++ b/ckanext/ga_report/templates/ga_report/notes.html @@ -6,11 +6,11 @@
  • Notes

  • --- /dev/null +++ b/ckanext/ga_report/tests/test_model.py @@ -1,1 +1,18 @@ +from nose.tools import assert_equal +from ckanext.ga_report.ga_model import _normalize_url + +class TestNormalizeUrl: + def test_normal(self): + assert_equal(_normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices'), + '/dataset/weekly_fuel_prices') + + def test_www_dot(self): + assert_equal(_normalize_url('http://www.data.gov.uk/dataset/weekly_fuel_prices'), + '/dataset/weekly_fuel_prices') + + def test_https(self): + assert_equal(_normalize_url('https://data.gov.uk/dataset/weekly_fuel_prices'), + '/dataset/weekly_fuel_prices') + +