--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -3,15 +3,13 @@
import datetime
import httplib
import collections
+import requests
+import json
from pylons import config
from ga_model import _normalize_url
import ga_model
#from ga_client import GA
-
-import logging
-logger.setLevel(logging.DEBUG)
-
log = logging.getLogger('ckanext.ga-report')
@@ -23,13 +21,14 @@
class DownloadAnalytics(object):
'''Downloads and stores analytics info'''
- def __init__(self, service=None, profile_id=None, delete_first=False,
+ def __init__(self, service=None, token=None, profile_id=None, delete_first=False,  # NOTE(review): token now precedes profile_id, so a caller passing profile_id positionally under the old signature would bind it to token -- confirm all call sites
skip_url_stats=False):
self.period = config['ga-report.period']
self.service = service
self.profile_id = profile_id
self.delete_first = delete_first
self.skip_url_stats = skip_url_stats
+ self.token = token  # OAuth token; used to build the 'Bearer ...' authorization header and reassigned by _get_json
def specific_month(self, date):
import calendar
@@ -122,13 +121,13 @@
accountName = config.get('googleanalytics.account')
log.info('Downloading analytics for dataset views')
- data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
+ data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+')
log.info('Storing dataset views (%i rows)', len(data.get('url')))
self.store(period_name, period_complete_day, data, )
log.info('Downloading analytics for publisher views')
- data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
+ data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+')
log.info('Storing publisher views (%i rows)', len(data.get('url')))
self.store(period_name, period_complete_day, data,)
@@ -154,21 +153,31 @@
metrics = 'ga:entrances'
sort = '-ga:entrances'
- # Supported query params at
- # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- filters=query,
- start_date=start_date,
- metrics=metrics,
- sort=sort,
- dimensions="ga:landingPagePath,ga:socialNetwork",
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = dict(ids='ga:' + self.profile_id,
+ filters=query,
+ metrics=metrics,
+ sort=sort,
+ dimensions="ga:landingPagePath,ga:socialNetwork",
+ max_results=10000)
+
+ args['start-date'] = start_date
+ args['end-date'] = end_date
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
+
data = collections.defaultdict(list)
rows = results.get('rows',[])
for row in rows:
- url = _normalize_url('http:/' + row[0])
+ url = row[0]
data[url].append( (row[1], int(row[2]),) )
ga_model.update_social(period_name, data)
@@ -183,28 +192,34 @@
# Supported query params at
# https://developers.google.com/analytics/devguides/reporting/core/v3/reference
- try:
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- filters=query,
- start_date=start_date,
- metrics=metrics,
- sort=sort,
- dimensions="ga:pagePath",
- max_results=10000,
- end_date=end_date).execute()
- except httplib.BadStatusLine:
- log.error(u"Failed to download data=> ids: ga:{0}, filters: {1}, start_date: {2}, end_date: {3}, metrics: {4}, sort: {5}, dimensions: ga:pagePath".format(
- self.profile_id, query, start_date, end_date, metrics, sort ))
+ # https://ga-dev-tools.appspot.com/explorer/
+ try:
+ args = {}
+ args["sort"] = "-ga:pageviews"
+ args["max-results"] = 100000
+ args["dimensions"] = "ga:pagePath"
+ args["start-date"] = start_date
+ args["end-date"] = end_date
+ args["metrics"] = metrics
+ args["ids"] = "ga:" + self.profile_id
+ args["filters"] = query
+ args["alt"] = "json"
+ print args
+ results = self._get_json(args)
+
+ except Exception, e:
+ log.exception(e)
return dict(url=[])
packages = []
log.info("There are %d results" % results['totalResults'])
- for entry in results.get('rows'):
+ if results['totalResults'] > 0:
+ for entry in results.get('rows'):
(loc,pageviews,visits) = entry
- url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
-
- if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
+ #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
+ url = loc
+ #print url
+ if not url.startswith('/dataset/') and not url.startswith('/organization/'):
# filter out strays like:
# /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
# /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
@@ -236,25 +251,78 @@
data[key] = data.get(key,0) + result[1]
return data
+    def _get_json(self, params, prev_fail=False):
+        """Refresh the OAuth token, GET the GA v3 data endpoint with ``params``
+        and return the decoded JSON body.  Any failure (missing config, auth
+        error, non-200 reply, undecodable body) is logged and ``dict(url=[])``
+        is returned instead of raising.  ``prev_fail`` is accepted but unused.
+        """
+        import os  # local import: a module-level `import os` is not visible in this patch
+        ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
+        if not ga_token_filepath:
+            log.error('In the CKAN config you need to specify the filepath of the '
+                      'Google Analytics token file under key: googleanalytics.token.filepath')
+            return dict(url=[])  # never None: callers call .get() on the result
+        try:
+            log.info("Trying to refresh our OAuth token")
+            from ga_auth import init_service
+            self.token, _ = init_service(ga_token_filepath, None)  # runs on every call, replacing self.token
+            log.info("OAuth token refreshed")
+        except Exception:
+            log.exception("Oauth refresh failed")
+            return dict(url=[])
+        try:
+            headers = {'authorization': 'Bearer ' + self.token}
+            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers)
+            if r.status_code != 200:
+                log.error("Request with params: %s failed. STATUS: %s CONTENT: %s", params, r.status_code, r.content)
+                return dict(url=[])
+            return json.loads(r.content)
+        except Exception as e:
+            log.exception(e)  # network error or undecodable body; fall through to the empty result
+        return dict(url=[])
+
def _totals_stats(self, start_date, end_date, period_name, period_complete_day):
""" Fetches distinct totals, total pageviews etc """
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviews',
- sort='-ga:pageviews',
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ args = {}
+ args["max-results"] = 100000
+ args["start-date"] = start_date
+ args["end-date"] = end_date
+ args["ids"] = "ga:" + self.profile_id
+
+ args["metrics"] = "ga:pageviews"
+ args["sort"] = "-ga:pageviews"
+ args["alt"] = "json"
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]},
period_complete_day)
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits',
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = {}
+ args["max-results"] = 100000
+ args["start-date"] = start_date
+ args["end-date"] = end_date
+ args["ids"] = "ga:" + self.profile_id
+
+ args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits"
+ args["alt"] = "json"
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
data = {
'Pages per visit': result_data[0][0],
@@ -265,16 +333,29 @@
ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day)
# Bounces from / or another configurable page.
- path = '/%s%s' % (config.get('googleanalytics.account'),
- config.get('ga-report.bounce_url', '/'))
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- filters='ga:pagePath==%s' % (path,),
- start_date=start_date,
- metrics='ga:visitBounceRate',
- dimensions='ga:pagePath',
- max_results=10000,
- end_date=end_date).execute()
+ path = '/' #% (config.get('googleanalytics.account'), config.get('ga-report.bounce_url', '/'))
+
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = {}
+ args["max-results"] = 100000
+ args["start-date"] = start_date
+ args["end-date"] = end_date
+ args["ids"] = "ga:" + self.profile_id
+
+ args["filters"] = 'ga:pagePath==%s' % (path,)
+ args["dimensions"] = 'ga:pagePath'
+ args["metrics"] = "ga:visitBounceRate"
+ args["alt"] = "json"
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
if not result_data or len(result_data) != 1:
log.error('Could not pinpoint the bounces for path: %s. Got results: %r',
@@ -290,14 +371,28 @@
def _locale_stats(self, start_date, end_date, period_name, period_complete_day):
""" Fetches stats about language and country """
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviews',
- sort='-ga:pageviews',
- dimensions="ga:language,ga:country",
- max_results=10000,
- end_date=end_date).execute()
+
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = {}
+ args["max-results"] = 100000
+ args["start-date"] = start_date
+ args["end-date"] = end_date
+ args["ids"] = "ga:" + self.profile_id
+
+ args["dimensions"] = "ga:language,ga:country"
+ args["metrics"] = "ga:pageviews"
+ args["sort"] = "-ga:pageviews"
+ args["alt"] = "json"
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
data = {}
for result in result_data:
@@ -318,15 +413,27 @@
data = {}
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- filters='ga:eventAction==download',
- metrics='ga:totalEvents',
- sort='-ga:totalEvents',
- dimensions="ga:eventLabel",
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = {}
+ args["max-results"] = 100000
+ args["start-date"] = start_date
+ args["end-date"] = end_date
+ args["ids"] = "ga:" + self.profile_id
+
+ args["filters"] = 'ga:eventAction==download'
+ args["dimensions"] = "ga:eventLabel"
+ args["metrics"] = "ga:totalEvents"
+ args["alt"] = "json"
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
if not result_data:
# We may not have data for this time period, so we need to bail
@@ -365,15 +472,25 @@
log.info('Associating downloads of resource URLs with their respective datasets')
process_result_data(results.get('rows'))
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- filters='ga:eventAction==download-cache',
- metrics='ga:totalEvents',
- sort='-ga:totalEvents',
- dimensions="ga:eventLabel",
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = dict( ids='ga:' + self.profile_id,
+ filters='ga:eventAction==download-cache',
+ metrics='ga:totalEvents',
+ sort='-ga:totalEvents',
+ dimensions="ga:eventLabel",
+ max_results=10000)
+ args['start-date'] = start_date
+ args['end-date'] = end_date
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
log.info('Associating downloads of cache resource URLs with their respective datasets')
process_result_data(results.get('rows'), cached=False)
@@ -382,14 +499,25 @@
def _social_stats(self, start_date, end_date, period_name, period_complete_day):
""" Finds out which social sites people are referred from """
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviews',
- sort='-ga:pageviews',
- dimensions="ga:socialNetwork,ga:referralPath",
- max_results=10000,
- end_date=end_date).execute()
+
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = dict( ids='ga:' + self.profile_id,
+ metrics='ga:pageviews',
+ sort='-ga:pageviews',
+ dimensions="ga:socialNetwork,ga:referralPath",
+ max_results=10000)
+ args['start-date'] = start_date
+ args['end-date'] = end_date
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
data = {}
for result in result_data:
@@ -401,14 +529,24 @@
def _os_stats(self, start_date, end_date, period_name, period_complete_day):
""" Operating system stats """
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviews',
- sort='-ga:pageviews',
- dimensions="ga:operatingSystem,ga:operatingSystemVersion",
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = dict( ids='ga:' + self.profile_id,
+ metrics='ga:pageviews',
+ sort='-ga:pageviews',
+ dimensions="ga:operatingSystem,ga:operatingSystemVersion",
+ max_results=10000)
+ args['start-date'] = start_date
+ args['end-date'] = end_date
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
data = {}
for result in result_data:
@@ -426,14 +564,27 @@
def _browser_stats(self, start_date, end_date, period_name, period_complete_day):
""" Information about browsers and browser versions """
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviews',
- sort='-ga:pageviews',
- dimensions="ga:browser,ga:browserVersion",
- max_results=10000,
- end_date=end_date).execute()
+
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = dict( ids='ga:' + self.profile_id,
+ metrics='ga:pageviews',
+ sort='-ga:pageviews',
+ dimensions="ga:browser,ga:browserVersion",
+ max_results=10000)
+
+ args['start-date'] = start_date
+ args['end-date'] = end_date
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
+
result_data = results.get('rows')
# e.g. [u'Firefox', u'19.0', u'20']
@@ -475,14 +626,24 @@
def _mobile_stats(self, start_date, end_date, period_name, period_complete_day):
""" Info about mobile devices """
- results = self.service.data().ga().get(
- ids='ga:' + self.profile_id,
- start_date=start_date,
- metrics='ga:pageviews',
- sort='-ga:pageviews',
- dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo",
- max_results=10000,
- end_date=end_date).execute()
+ try:
+ # Because of issues of invalid responses, we are going to make these requests
+ # ourselves.
+ headers = {'authorization': 'Bearer ' + self.token}
+
+ args = dict( ids='ga:' + self.profile_id,
+ metrics='ga:pageviews',
+ sort='-ga:pageviews',
+ dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo",
+ max_results=10000)
+ args['start-date'] = start_date
+ args['end-date'] = end_date
+
+ results = self._get_json(args)
+ except Exception, e:
+ log.exception(e)
+ results = dict(url=[])
+
result_data = results.get('rows')
data = {}