--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -13,6 +13,7 @@ FORMAT_MONTH = '%Y-%m' MIN_VIEWS = 50 MIN_VISITS = 20 +MIN_DOWNLOADS = 10 class DownloadAnalytics(object): '''Downloads and stores analytics info''' @@ -31,6 +32,11 @@ first_of_this_month = datetime.datetime(date.year, date.month, 1) _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) + # if this is the latest month, note that it is only up until today + now = datetime.datetime.now() + if now.year == date.year and now.month == date.month: + last_day_of_month = now.day + last_of_this_month = now periods = ((date.strftime(FORMAT_MONTH), last_day_of_month, first_of_this_month, last_of_this_month),) @@ -98,7 +104,7 @@ self.get_full_period_name(period_name, period_complete_day), start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')) - + if self.delete_first: log.info('Deleting existing Analytics for this period "%s"', period_name) @@ -122,11 +128,15 @@ log.info('Storing publisher views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data,) + # Make sure the All records are correct. + ga_model.post_update_url_stats() + log.info('Aggregating datasets by publisher') ga_model.update_publisher_stats(period_name) # about 30 seconds. 
+ log.info('Downloading and storing analytics for site-wide stats') - self.sitewide_stats( period_name ) + self.sitewide_stats( period_name, period_complete_day ) log.info('Downloading and storing analytics for social networks') self.update_social_info(period_name, start_date, end_date) @@ -153,7 +163,8 @@ data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - data[_normalize_url(row[0])].append( (row[1], int(row[2]),) ) + url = _normalize_url('http:/' + row[0]) + data[url].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) @@ -162,8 +173,8 @@ start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visits' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -178,6 +189,7 @@ end_date=end_date).execute() packages = [] + log.info("There are %d results" % results['totalResults']) for entry in results.get('rows'): (loc,pageviews,visits) = entry url = _normalize_url('http:/' + loc) # strips off domain e.g. 
www.data.gov.uk or data.gov.uk @@ -194,7 +206,7 @@ if 'url' in data: ga_model.update_url_stats(period_name, period_complete_day, data['url']) - def sitewide_stats(self, period_name): + def sitewide_stats(self, period_name, period_complete_day): import calendar year, month = period_name.split('-') _, last_day_of_month = calendar.monthrange(int(year), int(month)) @@ -202,10 +214,10 @@ start_date = '%s-01' % period_name end_date = '%s-%s' % (period_name, last_day_of_month) funcs = ['_totals_stats', '_social_stats', '_os_stats', - '_locale_stats', '_browser_stats', '_mobile_stats'] + '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] for f in funcs: log.info('Downloading analytics for %s' % f.split('_')[1]) - getattr(self, f)(start_date, end_date, period_name) + getattr(self, f)(start_date, end_date, period_name, period_complete_day) def _get_results(result_data, f): data = {} @@ -214,17 +226,18 @@ data[key] = data.get(key,0) + result[1] return data - def _totals_stats(self, start_date, end_date, period_name): + def _totals_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches distinct totals, total pageviews etc """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', - max_results=10000, - end_date=end_date).execute() - result_data = results.get('rows') - ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}) + metrics='ga:pageviews', + sort='-ga:pageviews', + max_results=10000, + end_date=end_date).execute() + result_data = results.get('rows') + ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, + period_complete_day) results = self.service.data().ga().get( ids='ga:' + self.profile_id, @@ -239,7 +252,7 @@ 'New visits': result_data[0][2], 'Total visits': result_data[0][3], } - ga_model.update_sitewide_stats(period_name, "Totals", data) + 
ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) # Bounces from / or another configurable page. path = '/%s%s' % (config.get('googleanalytics.account'), @@ -248,29 +261,30 @@ ids='ga:' + self.profile_id, filters='ga:pagePath==%s' % (path,), start_date=start_date, - metrics='ga:bounces,ga:uniquePageviews', + metrics='ga:visitBounceRate', dimensions='ga:pagePath', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - if len(result_data) != 1: + if not result_data or len(result_data) != 1: log.error('Could not pinpoint the bounces for path: %s. Got results: %r', path, result_data) return results = result_data[0] - bounces, total = [float(x) for x in result_data[0][1:]] - pct = 100 * bounces/total - log.info('%d bounces from %d total == %s', bounces, total, pct) - ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate': pct}) - - - def _locale_stats(self, start_date, end_date, period_name): + bounces = float(results[1]) + # visitBounceRate is already a % + log.info('Google reports visitBounceRate as %s', bounces) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': float(bounces)}, + period_complete_day) + + + def _locale_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -279,22 +293,78 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Languages", data) + ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day) data = {} for result in result_data: 
data[result[1]] = data.get(result[1], 0) + int(result[2])
         self._filter_out_long_tail(data, MIN_VIEWS)
-        ga_model.update_sitewide_stats(period_name, "Country", data)
-
-
-    def _social_stats(self, start_date, end_date, period_name):
+        ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day)
+
+
+    def _download_stats(self, start_date, end_date, period_name, period_complete_day):
+        """ Fetches per-package download counts from the 'download' and 'download-cache' events """
+        import ckan.model as model
+
+        data = {}  # package name -> summed ga:totalEvents
+
+        results = self.service.data().ga().get(
+            ids='ga:' + self.profile_id,
+            start_date=start_date,
+            filters='ga:eventAction==download',
+            metrics='ga:totalEvents',
+            sort='-ga:totalEvents',
+            dimensions="ga:eventLabel",
+            max_results=10000,
+            end_date=end_date).execute()
+        result_data = results.get('rows')
+        if not result_data:
+            # No 'download' events came back for this period, so nothing is
+            # stored (the 'download-cache' query below is skipped as well).
+            log.info("There is no download data for this time period")
+            return
+
+        def process_result_data(result_data, cached=False):
+            for result in result_data:
+                url = result[0].strip()  # result is (ga:eventLabel, ga:totalEvents)
+
+                # Find the resource whose URL (or cache URL, when cached) starts with this label.
+                q = model.Session.query(model.Resource)
+                if cached:
+                    r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first()
+                else:
+                    r = q.filter(model.Resource.url.like("%s%%" % url)).first()
+
+                package_name = r.resource_group.package.name if r else ""
+                if package_name:
+                    data[package_name] = data.get(package_name, 0) + int(result[1])
+                else:
+                    log.warning(u"Could not find resource for URL: {url}".format(url=url))
+                    continue
+
+        process_result_data(results.get('rows'))
+
+        results = self.service.data().ga().get(
+            ids='ga:' + self.profile_id,
+            start_date=start_date,
+            filters='ga:eventAction==download-cache',
+            metrics='ga:totalEvents',
+            sort='-ga:totalEvents',
+            dimensions="ga:eventLabel",
+            max_results=10000,
+            end_date=end_date).execute()
+        process_result_data(results.get('rows') or [], cached=True)  # match on cache_url; rows may be absent
+
+        self._filter_out_long_tail(data, MIN_DOWNLOADS)
+        ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)
+
+    def _social_stats(self, start_date, end_date, period_name, period_complete_day):
         """ Finds out which social sites people are referred from """
         results = self.service.data().ga().get(
             ids='ga:' + self.profile_id,
             start_date=start_date,
-            metrics='ga:uniquePageviews',
-            sort='-ga:uniquePageviews',
+            metrics='ga:pageviews',
+            sort='-ga:pageviews',
             dimensions="ga:socialNetwork,ga:referralPath",
             max_results=10000,
             end_date=end_date).execute()
@@ -304,16 +374,16 @@
             if not result[0] == '(not set)':
                 data[result[0]] = data.get(result[0], 0) + int(result[2])
         self._filter_out_long_tail(data, 3)
-        ga_model.update_sitewide_stats(period_name, "Social sources", data)
-
-
-    def _os_stats(self, start_date, end_date, period_name):
+        ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day)
+
+
+    def _os_stats(self, start_date, end_date, period_name, period_complete_day):
         """ Operating system stats """
         results = self.service.data().ga().get(
             ids='ga:' + self.profile_id,
             start_date=start_date,
-
metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -322,23 +392,23 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Operating Systems", data) + ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day) data = {} for result in result_data: if int(result[2]) >= MIN_VIEWS: key = "%s %s" % (result[0],result[1]) data[key] = result[2] - ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data) - - - def _browser_stats(self, start_date, end_date, period_name): + ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day) + + + def _browser_stats(self, start_date, end_date, period_name, period_complete_day): """ Information about browsers and browser versions """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() @@ -349,14 +419,14 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Browsers", data) + ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day) data = {} for result in result_data: key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1])) data[key] = data.get(key, 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Browser versions", data) + ga_model.update_sitewide_stats(period_name, "Browser versions", 
data, period_complete_day) @classmethod def _filter_browser_version(cls, browser, version_str): @@ -380,14 +450,14 @@ ver = ver[0] + ver[1] + 'X' * num_hidden_digits return ver - def _mobile_stats(self, start_date, end_date, period_name): + def _mobile_stats(self, start_date, end_date, period_name, period_complete_day): """ Info about mobile devices """ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() @@ -397,13 +467,13 @@ for result in result_data: data[result[0]] = data.get(result[0], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Mobile brands", data) + ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day) data = {} for result in result_data: data[result[1]] = data.get(result[1], 0) + int(result[2]) self._filter_out_long_tail(data, MIN_VIEWS) - ga_model.update_sitewide_stats(period_name, "Mobile devices", data) + ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day) @classmethod def _filter_out_long_tail(cls, data, threshold=10):