Force a refresh of the OAuth token before every request so that long-running downloads do not fail on an expired token
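
Every Analytics request now goes through _get_json(), which re-runs
init_service() from ga_auth to obtain a fresh token before issuing the
request, so a token that expires mid-run no longer aborts the download.

A minimal sketch of the pattern (the helper name here is illustrative;
init_service and the endpoint are the ones this module already uses):

    import json
    import requests
    from ga_auth import init_service

    def get_json_with_fresh_token(ga_token_filepath, params):
        # Refresh the OAuth token on every call so a long run never
        # hits an expired token.
        token, svc = init_service(ga_token_filepath, None)
        headers = {'authorization': 'Bearer ' + token}
        r = requests.get("https://www.googleapis.com/analytics/v3/data/ga",
                         params=params, headers=headers)
        if r.status_code != 200:
            raise Exception("Request with params: %s failed" % params)
        return json.loads(r.content)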
diff --git a/ckanext/ga_report/download_analytics.py b/ckanext/ga_report/download_analytics.py
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -2,9 +2,11 @@
 import logging
 import datetime
 import httplib
+import urllib
 import collections
 import requests
 import json
+import re
 from pylons import config
 from ga_model import _normalize_url
 import ga_model
@@ -121,13 +123,13 @@
                 accountName = config.get('googleanalytics.account')
 
                 log.info('Downloading analytics for dataset views')
-                data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
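+                # the regex filter is now anchored to the request path instead of being prefixed with the account name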
+                data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+')
 
                 log.info('Storing dataset views (%i rows)', len(data.get('url')))
                 self.store(period_name, period_complete_day, data, )
 
                 log.info('Downloading analytics for publisher views')
-                data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
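+                # publisher pages now live under /organization/ in CKAN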
+                data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+')
 
                 log.info('Storing publisher views (%i rows)', len(data.get('url')))
                 self.store(period_name, period_complete_day, data,)
@@ -177,7 +179,7 @@
         data = collections.defaultdict(list)
         rows = results.get('rows',[])
         for row in rows:
-            url = _normalize_url('http:/' + row[0])
+            url = row[0]
             data[url].append( (row[1], int(row[2]),) )
         ga_model.update_social(period_name, data)
 
@@ -192,11 +194,8 @@
 
         # Supported query params at
         # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
-        try:
-            # Because of issues of invalid responses, we are going to make these requests
-            # ourselves.
-            headers = {'authorization': 'Bearer ' + self.token}
-
+        # https://ga-dev-tools.appspot.com/explorer/
+        try:
             args = {}
             args["sort"] = "-ga:pageviews"
             args["max-results"] = 100000
@@ -207,25 +206,22 @@
             args["ids"] = "ga:" + self.profile_id
             args["filters"] = query
             args["alt"] = "json"
-
-            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=args, headers=headers)
-            if r.status_code != 200:
-              raise Exception("Request with params: %s failed" % args)
-
-            results = json.loads(r.content)
-            print len(results.keys())
-        except Exception, e:
-            log.exception(e)
-            #return dict(url=[])
-            raise e
+            log.debug("GA query args: %r", args)
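+            # _get_json refreshes the OAuth token before making the request (see below)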
+            results = self._get_json(args)
+
+        except Exception, e:
+            log.exception(e)
+            return dict(url=[])
 
         packages = []
         log.info("There are %d results" % results['totalResults'])
-        for entry in results.get('rows'):
+        if results['totalResults'] > 0:
+          for entry in results.get('rows'):
             (loc,pageviews,visits) = entry
-            url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
-
-            if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
+            #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
+            url = loc
+            if not url.startswith('/dataset/') and not url.startswith('/organization/'):
                 # filter out strays like:
                 # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
                 # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
@@ -262,13 +258,14 @@
         if not ga_token_filepath:
             print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
                 'Google Analytics token file under key: googleanalytics.token.filepath'
-        return
-
-        try:
-            log.info("Trying to refresh our OAuth token")
+            return
+
+        log.info("Trying to refresh our OAuth token")
+        try:
+            from ga_auth import init_service
             self.token, svc = init_service(ga_token_filepath, None)
             log.info("OAuth token refreshed")
-        except Exception auth_exception:
+        except Exception, auth_exception:
             log.error("Oauth refresh failed")
             log.exception(auth_exception)
             return
@@ -277,9 +274,9 @@
             headers = {'authorization': 'Bearer ' + self.token}
             r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers)
             if r.status_code != 200:
-              log.info("STATUS: %s" % (r.status_code,))
-              log.info("CONTENT: %s" % (r.content,))
-              raise Exception("Request with params: %s failed" % params)
+                log.info("STATUS: %s" % (r.status_code,))
+                log.info("CONTENT: %s" % (r.content,))
+                raise Exception("Request with params: %s failed" % params)
 
             return json.loads(r.content)
         except Exception, e:
@@ -338,8 +335,7 @@
         ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day)
 
         # Bounces from / or another configurable page.
-        path = '/%s%s' % (config.get('googleanalytics.account'),
-                          config.get('ga-report.bounce_url', '/'))
+        path = '/'  # was: '/%s%s' % (config.get('googleanalytics.account'), config.get('ga-report.bounce_url', '/'))
 
         try:
             # Because of issues of invalid responses, we are going to make these requests
@@ -430,7 +426,7 @@
             args["end-date"] = end_date
             args["ids"] = "ga:" + self.profile_id
 
-            args["filters"] = 'ga:eventAction==download'
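+            # GA filters match case-sensitively; the tracked event action is 'Download'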
+            args["filters"] = 'ga:eventAction==Download'
             args["dimensions"] = "ga:eventLabel"
             args["metrics"] = "ga:totalEvents"
             args["alt"] = "json"
@@ -456,7 +452,7 @@
                 if progress_count % 100 == 0:
                     log.debug('.. %d/%d done so far', progress_count, progress_total)
 
-                url = result[0].strip()
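+                # decode percent-encoded characters so the label matches the stored resource URL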
+                url = urllib.unquote(result[0].strip())
 
                 # Get package id associated with the resource that has this URL.
                 q = model.Session.query(model.Resource)
@@ -464,8 +460,15 @@
                     r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first()
                 else:
                     r = q.filter(model.Resource.url.like("%s%%" % url)).first()
+
+                # new style internal download links embed the resource id:
+                # /resource/<id>/download/ - match on Resource.id instead of the URL
+                match = re.search('(?:/resource/)(.*)(?:/download/)', url)
+                if match:
+                    r = q.filter(model.Resource.id.like("%s%%" % match.group(1))).first()
 
                 package_name = r.resource_group.package.name if r else ""
                 if package_name:
                     data[package_name] = data.get(package_name, 0) + int(result[1])
                 else:
@@ -478,7 +481,7 @@
         log.info('Associating downloads of resource URLs with their respective datasets')
         process_result_data(results.get('rows'))
 
-        try:
+        '''try:
             # Because of issues of invalid responses, we are going to make these requests
             # ourselves.
             headers = {'authorization': 'Bearer ' + self.token}
@@ -498,7 +501,7 @@
             results = dict(url=[])
 
         log.info('Associating downloads of cache resource URLs with their respective datasets')
-        process_result_data(results.get('rows'), cached=False)
+        process_result_data(results.get('rows'), cached=False)'''
 
         self._filter_out_long_tail(data, MIN_DOWNLOADS)
         ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)