--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -3,7 +3,7 @@ import datetime import collections from pylons import config - +from ga_model import _normalize_url import ga_model #from ga_client import GA @@ -11,15 +11,17 @@ log = logging.getLogger('ckanext.ga-report') FORMAT_MONTH = '%Y-%m' +MIN_VIEWS = 50 +MIN_VISITS = 20 class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None): + def __init__(self, service=None, profile_id=None, delete_first=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id - + self.delete_first = delete_first def specific_month(self, date): import calendar @@ -90,16 +92,26 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: + if self.delete_first: + log.info('Deleting existing Analytics for period "%s"', + period_name) + ga_model.delete(period_name) log.info('Downloading Analytics for period "%s" (%s - %s)', self.get_full_period_name(period_name, period_complete_day), start_date.strftime('%Y %m %d'), end_date.strftime('%Y %m %d')) - data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+') + + # Clean up the entries before we run this + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) log.info('Storing Dataset Analytics for period "%s"', self.get_full_period_name(period_name, period_complete_day)) self.store(period_name, period_complete_day, data, ) - data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) log.info('Storing Publisher Analytics for period "%s"', self.get_full_period_name(period_name, period_complete_day)) self.store(period_name, period_complete_day, data,) @@ -108,6 +120,7 @@ self.sitewide_stats( period_name ) self.update_social_info(period_name, start_date, end_date) + def update_social_info(self, period_name, start_date, end_date): start_date = start_date.strftime('%Y-%m-%d') @@ -130,17 +143,16 @@ data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - from ga_model import _normalize_url data[_normalize_url(row[0])].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) - def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'): + def download(self, start_date, end_date, path=None): '''Get data from GA for a given time period''' start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visitors' + metrics = 'ga:uniquePageviews, ga:visits' sort = '-ga:uniquePageviews' # Supported query params at @@ -155,15 +167,13 @@ max_results=10000, end_date=end_date).execute() - if os.getenv('DEBUG'): - import pprint - pprint.pprint(results) - print 'Total results: %s' % results.get('totalResults') - packages = [] for entry in results.get('rows'): (loc,pageviews,visits) = entry - packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack + url = _normalize_url('http:/' + loc) + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + continue + packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) def store(self, period_name, period_complete_day, data): @@ -207,19 +217,38 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits,ga:visitors', + metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') data = { 'Pages per visit': result_data[0][0], - 'Bounces': result_data[0][1], - 'Average time on site': result_data[0][2], - 'New visits': result_data[0][3], - 'Total visits': result_data[0][4], + 'Average time on site': result_data[0][1], + 'New visits': result_data[0][2], + 'Total visits': result_data[0][3], } ga_model.update_sitewide_stats(period_name, "Totals", data) + # Bounces from /data. This url is specified in configuration because + # for DGU we don't want /. + path = config.get('ga-report.bounce_url','/') + print path + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath=~%s$' % (path,), + start_date=start_date, + metrics='ga:bounces,ga:uniquePageviews', + dimensions='ga:pagePath', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + for results in result_data: + if results[0] == path: + bounce, total = [float(x) for x in results[1:]] + pct = 100 * bounce/total + print "%d bounces from %d total == %s" % (bounce, total, pct) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct}) + def _locale_stats(self, start_date, end_date, period_name): """ Fetches stats about language and country """ @@ -235,11 +264,13 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Languages", data) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Country", data) @@ -254,13 +285,11 @@ max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - twitter_links = [] data = {} for result in result_data: if not result[0] == '(not set)': data[result[0]] = data.get(result[0], 0) + int(result[2]) - if result[0] == 'Twitter': - twitter_links.append(result[1]) + self._filter_out_long_tail(data, 3) ga_model.update_sitewide_stats(period_name, "Social sources", data) @@ -278,12 +307,14 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Operating Systems", data) data = {} for result in result_data: - key = "%s (%s)" % (result[0],result[1]) - data[key] = result[2] + if int(result[2]) >= MIN_VIEWS: + key = "%s %s" % (result[0],result[1]) + data[key] = result[2] ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) @@ -298,17 +329,42 @@ max_results=10000, end_date=end_date).execute() result_data = results.get('rows') + # e.g. [u'Firefox', u'19.0', u'20'] + data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Browsers", data) data = {} for result in result_data: - key = "%s (%s)" % (result[0], result[1]) - data[key] = result[2] + key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) + data[key] = data.get(key, 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Browser versions", data) + @classmethod + def _filter_browser_version(cls, browser, version_str): + ''' + Simplifies a browser version string if it is detailed. + i.e. groups together Firefox 3.5.1 and 3.5.2 to be just 3. + This is helpful when viewing stats and good to protect privacy. + ''' + ver = version_str + parts = ver.split('.') + if len(parts) > 1: + if parts[1][0] == '0': + ver = parts[0] + else: + ver = "%s" % (parts[0]) + # Special case complex version nums + if browser in ['Safari', 'Android Browser']: + ver = parts[0] + if len(ver) > 2: + num_hidden_digits = len(ver) - 2 + ver = ver[0] + ver[1] + 'X' * num_hidden_digits + return ver def _mobile_stats(self, start_date, end_date, period_name): """ Info about mobile devices """ @@ -326,10 +382,23 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Mobile brands", data) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) ga_model.update_sitewide_stats(period_name, "Mobile devices", data) + @classmethod + def _filter_out_long_tail(cls, data, threshold=10): + ''' + Given data which is a frequency distribution, filter out + results which are below a threshold count. This is good to protect + privacy. + ''' + for key, value in data.items(): + if value < threshold: + del data[key] +