--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -1,6 +1,7 @@ import os import logging import datetime +import httplib import collections from pylons import config from ga_model import _normalize_url @@ -131,7 +132,7 @@ # Make sure the All records are correct. ga_model.post_update_url_stats() - log.info('Aggregating datasets by publisher') + log.info('Associating datasets with their publisher') ga_model.update_publisher_stats(period_name) # about 30 seconds. @@ -178,15 +179,20 @@ # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference - results = self.service.data().ga().get( - ids='ga:' + self.profile_id, - filters=query, - start_date=start_date, - metrics=metrics, - sort=sort, - dimensions="ga:pagePath", - max_results=10000, - end_date=end_date).execute() + try: + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + filters=query, + start_date=start_date, + metrics=metrics, + sort=sort, + dimensions="ga:pagePath", + max_results=10000, + end_date=end_date).execute() + except httplib.BadStatusLine: + log.error(u"Failed to download data=> ids: ga:{0}, filters: {1}, start_date: {2}, end_date: {3}, metrics: {4}, sort: {5}, dimensions: ga:pagePath".format( + self.profile_id, query, start_date, end_date, metrics, sort )) + return dict(url=[]) packages = [] log.info("There are %d results" % results['totalResults']) @@ -303,7 +309,7 @@ def _download_stats(self, start_date, end_date, period_name, period_complete_day): - """ Fetches stats about language and country """ + """ Fetches stats about data downloads """ import ckan.model as model data = {} @@ -325,7 +331,14 @@ return def process_result_data(result_data, cached=False): + progress_total = len(result_data) + progress_count = 0 + resources_not_matched = [] for result in result_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + url = result[0].strip() # Get package id associated with the resource that has this URL. @@ -339,9 +352,13 @@ if package_name: data[package_name] = data.get(package_name, 0) + int(result[1]) else: - log.warning(u"Could not find resource for URL: {url}".format(url=url)) + resources_not_matched.append(url) continue - + if resources_not_matched: + log.debug('Could not match %i or %i resource URLs to datasets. e.g. %r', + len(resources_not_matched), progress_total, resources_not_matched[:3]) + + log.info('Associating downloads of resource URLs with their respective datasets') process_result_data(results.get('rows')) results = self.service.data().ga().get( @@ -353,6 +370,7 @@ dimensions="ga:eventLabel", max_results=10000, end_date=end_date).execute() + log.info('Associating downloads of cache resource URLs with their respective datasets') process_result_data(results.get('rows'), cached=False) self._filter_out_long_tail(data, MIN_DOWNLOADS)