--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -3,7 +3,7 @@ import datetime import collections from pylons import config - +from ga_model import _normalize_url import ga_model #from ga_client import GA @@ -11,15 +11,20 @@ log = logging.getLogger('ckanext.ga-report') FORMAT_MONTH = '%Y-%m' +MIN_VIEWS = 50 +MIN_VISITS = 20 +MIN_DOWNLOADS = 10 class DownloadAnalytics(object): '''Downloads and stores analytics info''' - def __init__(self, service=None, profile_id=None): + def __init__(self, service=None, profile_id=None, delete_first=False, + skip_url_stats=False): self.period = config['ga-report.period'] self.service = service self.profile_id = profile_id - + self.delete_first = delete_first + self.skip_url_stats = skip_url_stats def specific_month(self, date): import calendar @@ -27,6 +32,11 @@ first_of_this_month = datetime.datetime(date.year, date.month, 1) _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) + # if this is the latest month, note that it is only up until today + now = datetime.datetime.now() + if now.year == date.year and now.month == date.month: + last_day_of_month = now.day + last_of_this_month = now periods = ((date.strftime(FORMAT_MONTH), last_day_of_month, first_of_this_month, last_of_this_month),) @@ -90,24 +100,47 @@ def download_and_store(self, periods): for period_name, period_complete_day, start_date, end_date in periods: - log.info('Downloading Analytics for period "%s" (%s - %s)', + log.info('Period "%s" (%s - %s)', self.get_full_period_name(period_name, period_complete_day), - start_date.strftime('%Y %m %d'), - end_date.strftime('%Y %m %d')) - data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+') - log.info('Storing Dataset Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data, ) - - data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+') - log.info('Storing Publisher Analytics for period "%s"', - self.get_full_period_name(period_name, period_complete_day)) - self.store(period_name, period_complete_day, data,) - - ga_model.update_publisher_stats(period_name) # about 30 seconds. - self.sitewide_stats( period_name ) - + start_date.strftime('%Y-%m-%d'), + end_date.strftime('%Y-%m-%d')) + + if self.delete_first: + log.info('Deleting existing Analytics for this period "%s"', + period_name) + ga_model.delete(period_name) + + if not self.skip_url_stats: + # Clean out old url data before storing the new + ga_model.pre_update_url_stats(period_name) + + accountName = config.get('googleanalytics.account') + + log.info('Downloading analytics for dataset views') + data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + + log.info('Storing dataset views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data, ) + + log.info('Downloading analytics for publisher views') + data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + + log.info('Storing publisher views (%i rows)', len(data.get('url'))) + self.store(period_name, period_complete_day, data,) + + # Make sure the All records are correct. + ga_model.post_update_url_stats() + + log.info('Aggregating datasets by publisher') + ga_model.update_publisher_stats(period_name) # about 30 seconds. + + + log.info('Downloading and storing analytics for site-wide stats') + self.sitewide_stats( period_name, period_complete_day ) + + log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) + def update_social_info(self, period_name, start_date, end_date): start_date = start_date.strftime('%Y-%m-%d') @@ -130,18 +163,18 @@ data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - from ga_model import _normalize_url - data[_normalize_url(row[0])].append( (row[1], int(row[2]),) ) + url = _normalize_url('http:/' + row[0]) + data[url].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) - def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'): + def download(self, start_date, end_date, path=None): '''Get data from GA for a given time period''' start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visitors' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -155,35 +188,36 @@ max_results=10000, end_date=end_date).execute() - if os.getenv('DEBUG'): - import pprint - pprint.pprint(results) - print 'Total results: %s' % results.get('totalResults') - packages = [] + log.info("There are %d results" % results['totalResults']) for entry in results.get('rows'): (loc,pageviews,visits) = entry - packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack + url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + + if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + # filter out strays like: + # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open + # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate + continue + packages.append( (url, pageviews, visits,) ) # Temporary hack return dict(url=packages) def store(self, period_name, period_complete_day, data): if 'url' in data: ga_model.update_url_stats(period_name, period_complete_day, data['url']) - def sitewide_stats(self, period_name): + def sitewide_stats(self, period_name, period_complete_day): import calendar year, month = period_name.split('-') _, last_day_of_month = calendar.monthrange(int(year), int(month)) start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) - print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date) - funcs = ['_totals_stats', '_social_stats', '_os_stats', - '_locale_stats', '_browser_stats', '_mobile_stats'] + '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] for f in funcs: - print ' + Fetching %s stats' % f.split('_')[1] - getattr(self, f)(start_date, end_date, period_name) + log.info('Downloading analytics for %s' % f.split('_')[1]) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) def _get_results(result_data, f): data = {} @@ -192,42 +226,65 @@ data[key] = data.get(key,0) + result[1] return data - def _totals_stats(self, start_date, end_date, period_name): + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches distinct totals, total pageviews etc """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}) - - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - start_date=start_date, - metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits,ga:visitors', + metrics='ga:pageviews', + sort='-ga:pageviews', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') data = { 'Pages per visit': result_data[0][0], - 'Bounces': result_data[0][1], - 'Average time on site': result_data[0][2], - 'New visits': result_data[0][3], - 'Total visits': result_data[0][4], + 'Average time on site': result_data[0][1], + 'New visits': result_data[0][2], + 'Total visits': result_data[0][3], } - ga_model.update_sitewide_stats(period_name, "Totals", data) - - - def _locale_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) + + # Bounces from / or another configurable page. + path = '/%s%s' % (config.get('googleanalytics.account'), + config.get('ga-report.bounce_url', '/')) + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters='ga:pagePath==%s' % (path,), + start_date=start_date, + metrics='ga:visitBounceRate', + dimensions='ga:pagePath', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data or len(result_data) != 1: + log.error('Could not pinpoint the bounces for path: %s. Got results: %r', + path, result_data) + return + results = result_data[0] + bounces = float(results[1]) + # visitBounceRate is already a % + log.info('Google reports visitBounceRate as %s', bounces) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': float(bounces)}, + period_complete_day) + + + def _locale_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -235,42 +292,98 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Languages", data) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Country", data) - - - def _social_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day) + + + def _download_stats(self, start_date, end_date, period_name, period_complete_day): + """ Fetches stats about language and country """ + import ckan.model as model + + data = {} + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + if not result_data: + # We may not have data for this time period, so we need to bail + # early. + log.info("There is no download data for this time period") + return + + def process_result_data(result_data, cached=False): + for result in result_data: + url = result[0].strip() + + # Get package id associated with the resource that has this URL. + q = model.Session.query(model.Resource) + if cached: + r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() + else: + r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + package_name = r.resource_group.package.name if r else "" + if package_name: + data[package_name] = data.get(package_name, 0) + int(result[1]) + else: + log.warning(u"Could not find resource for URL: {url}".format(url=url)) + continue + + process_result_data(results.get('rows')) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download-cache', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + process_result_data(results.get('rows'), cached=False) + + self._filter_out_long_tail(data, MIN_DOWNLOADS) + ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day) + + def _social_stats(self, start_date, end_date, period_name, period_complete_day): """ Finds out which social sites people are referred from """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:socialNetwork,ga:referralPath", max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - twitter_links = [] data = {} for result in result_data: if not result[0] == '(not set)': data[result[0]] = data.get(result[0], 0) + int(result[2]) - if result[0] == 'Twitter': - twitter_links.append(result[1]) - ga_model.update_sitewide_stats(period_name, "Social sources", data) - - - def _os_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, 3) + ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day) + + + def _os_stats(self, start_date, end_date, period_name, period_complete_day): """ Operating system stats """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -278,46 +391,73 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Operating Systems", data) - - data = {} - for result in result_data: - key = "%s (%s)" % (result[0],result[1]) - data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) - - - def _browser_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day) + + data = {} + for result in result_data: + if int(result[2]) >= MIN_VIEWS: + key = "%s %s" % (result[0],result[1]) + data[key] = result[2] + ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day) + + + def _browser_stats(self, start_date, end_date, period_name, period_complete_day): """ Information about browsers and browser versions """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() result_data = results.get('rows') + # e.g. [u'Firefox', u'19.0', u'20'] + data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Browsers", data) - - data = {} - for result in result_data: - key = "%s (%s)" % (result[0], result[1]) - data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Browser versions", data) - - - def _mobile_stats(self, start_date, end_date, period_name): + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) + + data = {} + for result in result_data: + key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) + data[key] = data.get(key, 0) + int(result[2]) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day) + + @classmethod + def _filter_browser_version(cls, browser, version_str): + ''' + Simplifies a browser version string if it is detailed. + i.e. groups together Firefox 3.5.1 and 3.5.2 to be just 3. + This is helpful when viewing stats and good to protect privacy. + ''' + ver = version_str + parts = ver.split('.') + if len(parts) > 1: + if parts[1][0] == '0': + ver = parts[0] + else: + ver = "%s" % (parts[0]) + # Special case complex version nums + if browser in ['Safari', 'Android Browser']: + ver = parts[0] + if len(ver) > 2: + num_hidden_digits = len(ver) - 2 + ver = ver[0] + ver[1] + 'X' * num_hidden_digits + return ver + + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): """ Info about mobile devices """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() @@ -326,10 +466,23 @@ data = {} for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Mobile brands", data) + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) - ga_model.update_sitewide_stats(period_name, "Mobile devices", data) - + self._filter_out_long_tail(data, MIN_VIEWS) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) + + @classmethod + def _filter_out_long_tail(cls, data, threshold=10): + ''' + Given data which is a frequency distribution, filter out + results which are below a threshold count. This is good to protect + privacy. + ''' + for key, value in data.items(): + if value < threshold: + del data[key] +