--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -1,9 +1,9 @@ import os import logging import datetime - +import collections from pylons import config - +from ga_model import _normalize_url import ga_model #from ga_client import GA @@ -11,15 +11,19 @@ log = logging.getLogger('ckanext.ga-report') FORMAT_MONTH = '%Y-%m' +MIN_VIEWS = 50 +MIN_VISITS = 20 class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None): + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id - + self.delete_first = delete_first + self.skip_url_stats = skip_url_stats def specific_month(self, date): import calendar @@ -90,32 +94,77 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: - log.info('Downloading Analytics for period "%s" (%s - %s)', + log.info('Period "%s" (%s - %s)', self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) - - data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+') - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data, ) - - data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+') - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data,) - - ga_model.update_publisher_stats(period_name) # about 30 seconds. - self.sitewide_stats( period_name ) - - - def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'): + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + + if self.delete_first: + log.info('Deleting existing Analytics for this period "%s"', + period_name) + ga_model.delete(period_name) + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + log.info('Aggregating datasets by publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + log.info('Downloading and storing analytics for site-wide stats') + self.sitewide_stats( period_name, period_complete_day ) + + log.info('Downloading and storing analytics for social networks') + self.update_social_info(period_name, start_date, end_date) + + + def update_social_info(self, period_name, start_date, end_date): + start_date = start_date.strftime('%Y-%m-%d') + end_date = end_date.strftime('%Y-%m-%d') + query = 'ga:hasSocialSourceReferral=~Yes$' + metrics = 'ga:entrances' + sort = '-ga:entrances' + + # Supported query params at + # https://developers.google.com/analytics/devguides/reporting/core/v3/reference + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters=query, + start_date=start_date, + metrics=metrics, + sort=sort, + dimensions="ga:landingPagePath,ga:socialNetwork", + max_results=10000, + end_date=end_date).execute() + data = collections.defaultdict(list) + rows = results.get('rows',[]) + for row in rows: + url = _normalize_url('http:/' + row[0]) + data[url].append( (row[1], int(row[2]),) ) + ga_model.update_social(period_name, data) + + + def download(self, start_date, end_date, path=None): '''Get data from GA for a given time period''' start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visitors' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -129,35 +178,35 @@ max_results=10000, end_date=end_date).execute() - if os.getenv('DEBUG'): - import pprint - pprint.pprint(results) - print 'Total results: %s' % results.get('totalResults') - packages = [] for entry in results.get('rows'): (loc,pageviews,visits) = entry - packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate + continue + packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) def store(self, period_name, period_complete_day, data): if 'url' in data: ga_model.update_url_stats(period_name, period_complete_day, data['url']) - def sitewide_stats(self, period_name): + def sitewide_stats(self, period_name, period_complete_day): import calendar year, month = period_name.split('-') _, last_day_of_month = calendar.monthrange(int(year), int(month)) start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', '_locale_stats', '_browser_stats', '_mobile_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] - getattr(self, f)(start_date, end_date, period_name) + log.info('Downloading analytics for %s' % f.split('_')[1]) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) def _get_results(result_data, f): data = {} @@ -166,41 +215,65 @@ data[key] = data.get(key,0) + result[1] return data - def _totals_stats(self, start_date, end_date, period_name): + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches distinct totals, total pageviews etc """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - ga_model.update_sitewide_stats(period_name, "Totals", {'Total pageviews': result_data[0][0]}) - - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits', + metrics='ga:pageviews', + sort='-ga:pageviews', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') data = { 'Pages per visit': result_data[0][0], - 'Bounces': result_data[0][1], - 'Average time on site': result_data[0][2], - 'Percent new visits': result_data[0][3], + 'Average time on site': result_data[0][1], + 'New visits': result_data[0][2], + 'Total visits': result_data[0][3], } - ga_model.update_sitewide_stats(period_name, "Totals", data) - - - def _locale_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) + + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), + start_date=start_date, + metrics='ga:bounces,ga:pageviews', + dimensions='ga:pagePath', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces, total = [float(x) for x in result_data[0][1:]] + pct = 100 * bounces/total + log.info('%d bounces from %d total == %s', bounces, total, pct) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': pct}, + period_complete_day) + + + def _locale_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -208,42 +281,42 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Languages", data) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Country", data) - - - def _social_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day) + + + def _social_stats(self, start_date, end_date, period_name, period_complete_day): """ Finds out which social sites people are referred from """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:socialNetwork,ga:referralPath", max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - twitter_links = [] data = {} for result in result_data: if not result[0] == '(not set)': data[result[0]] = data.get(result[0], 0) + int(result[2]) - if result[0] == 'Twitter': - twitter_links.append(result[1]) - ga_model.update_sitewide_stats(period_name, "Social sources", data) - - - def _os_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, 3) + ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day) + + + def _os_stats(self, start_date, end_date, period_name, period_complete_day): """ Operating system stats """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -251,46 +324,73 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Operating Systems", data) - - data = {} - for result in result_data: - key = "%s (%s)" % (result[0],result[1]) - data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) - - - def _browser_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day) + + data = {} + for result in result_data: + if int(result[2]) >= MIN_VIEWS: + key = "%s %s" % (result[0],result[1]) + data[key] = result[2] + ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day) + + + def _browser_stats(self, start_date, end_date, period_name, period_complete_day): """ Information about browsers and browser versions """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() result_data = results.get('rows') + # e.g. [u'Firefox', u'19.0', u'20'] + data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Browsers", data) - - data = {} - for result in result_data: - key = "%s (%s)" % (result[0], result[1]) - data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Browser versions", data) - - - def _mobile_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) + + data = {} + for result in result_data: + key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) + data[key] = data.get(key, 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day) + + @classmethod + def _filter_browser_version(cls, browser, version_str): + ''' + Simplifies a browser version string if it is detailed. + i.e. groups together Firefox 3.5.1 and 3.5.2 to be just 3. + This is helpful when viewing stats and good to protect privacy. + ''' + ver = version_str + parts = ver.split('.') + if len(parts) > 1: + if parts[1][0] == '0': + ver = parts[0] + else: + ver = "%s" % (parts[0]) + # Special case complex version nums + if browser in ['Safari', 'Android Browser']: + ver = parts[0] + if len(ver) > 2: + num_hidden_digits = len(ver) - 2 + ver = ver[0] + ver[1] + 'X' * num_hidden_digits + return ver + + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): """ Info about mobile devices """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() @@ -299,10 +399,23 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Mobile brands", data) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Mobile devices", data) - + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) + + @classmethod + def _filter_out_long_tail(cls, data, threshold=10): + ''' + Given data which is a frequency distribution, filter out + results which are below a threshold count. This is good to protect + privacy. + ''' + for key, value in data.items(): + if value < threshold: + del data[key] +