When downloading this month, and specifying this month (rather than "latest"), note that it only goes up to today (instead of the end of the month).
[ckanext-ga-report.git] / ckanext / ga_report / download_analytics.py
blob:a/ckanext/ga_report/download_analytics.py -> blob:b/ckanext/ga_report/download_analytics.py
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -1,6 +1,7 @@
 import os
 import logging
 import datetime
+import httplib
 import collections
 from pylons import config
 from ga_model import _normalize_url
@@ -131,7 +132,7 @@
                 # Make sure the All records are correct.
                 ga_model.post_update_url_stats()
 
-                log.info('Aggregating datasets by publisher')
+                log.info('Associating datasets with their publisher')
                 ga_model.update_publisher_stats(period_name) # about 30 seconds.
 
 
@@ -178,15 +179,20 @@
 
         # Supported query params at
         # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 filters=query,
-                                 start_date=start_date,
-                                 metrics=metrics,
-                                 sort=sort,
-                                 dimensions="ga:pagePath",
-                                 max_results=10000,
-                                 end_date=end_date).execute()
+        try:
+            results = self.service.data().ga().get(
+                                     ids='ga:' + self.profile_id,
+                                     filters=query,
+                                     start_date=start_date,
+                                     metrics=metrics,
+                                     sort=sort,
+                                     dimensions="ga:pagePath",
+                                     max_results=10000,
+                                     end_date=end_date).execute()
+        except httplib.BadStatusLine:
+            log.error(u"Failed to download data=> ids: ga:{0}, filters: {1}, start_date: {2}, end_date: {3}, metrics: {4}, sort: {5}, dimensions: ga:pagePath".format(
+                self.profile_id, query, start_date, end_date, metrics, sort ))
+            return dict(url=[])
 
         packages = []
         log.info("There are %d results" % results['totalResults'])
@@ -303,7 +309,7 @@
 
 
     def _download_stats(self, start_date, end_date, period_name, period_complete_day):
-        """ Fetches stats about language and country """
+        """ Fetches stats about data downloads """
         import ckan.model as model
 
         data = {}
@@ -325,7 +331,14 @@
             return
 
         def process_result_data(result_data, cached=False):
+            progress_total = len(result_data)
+            progress_count = 0
+            resources_not_matched = []
             for result in result_data:
+                progress_count += 1
+                if progress_count % 100 == 0:
+                    log.debug('.. %d/%d done so far', progress_count, progress_total)
+
                 url = result[0].strip()
 
                 # Get package id associated with the resource that has this URL.
@@ -339,9 +352,13 @@
                 if package_name:
                     data[package_name] = data.get(package_name, 0) + int(result[1])
                 else:
-                    log.warning(u"Could not find resource for URL: {url}".format(url=url))
+                    resources_not_matched.append(url)
                     continue
-
+            if resources_not_matched:
+                log.debug('Could not match %i or %i resource URLs to datasets. e.g. %r',
+                          len(resources_not_matched), progress_total, resources_not_matched[:3])
+
+        log.info('Associating downloads of resource URLs with their respective datasets')
         process_result_data(results.get('rows'))
 
         results = self.service.data().ga().get(
@@ -353,6 +370,7 @@
                                  dimensions="ga:eventLabel",
                                  max_results=10000,
                                  end_date=end_date).execute()
+        log.info('Associating downloads of cache resource URLs with their respective datasets')
         process_result_data(results.get('rows'), cached=False)
 
         self._filter_out_long_tail(data, MIN_DOWNLOADS)