# fix new style download link stats

import collections
import datetime
import httplib
import json
import logging
import os
import re
import urllib

import requests
from pylons import config

import ga_model
from ga_model import _normalize_url
   
#from ga_client import GA #from ga_client import GA
   
log = logging.getLogger('ckanext.ga-report') log = logging.getLogger('ckanext.ga-report')
   
FORMAT_MONTH = '%Y-%m' FORMAT_MONTH = '%Y-%m'
MIN_VIEWS = 50 MIN_VIEWS = 50
MIN_VISITS = 20 MIN_VISITS = 20
MIN_DOWNLOADS = 10 MIN_DOWNLOADS = 10
   
class DownloadAnalytics(object): class DownloadAnalytics(object):
'''Downloads and stores analytics info''' '''Downloads and stores analytics info'''
   
def __init__(self, service=None, token=None, profile_id=None, delete_first=False, def __init__(self, service=None, token=None, profile_id=None, delete_first=False,
skip_url_stats=False): skip_url_stats=False):
self.period = config['ga-report.period'] self.period = config['ga-report.period']
self.service = service self.service = service
self.profile_id = profile_id self.profile_id = profile_id
self.delete_first = delete_first self.delete_first = delete_first
self.skip_url_stats = skip_url_stats self.skip_url_stats = skip_url_stats
self.token = token self.token = token
   
def specific_month(self, date): def specific_month(self, date):
import calendar import calendar
   
first_of_this_month = datetime.datetime(date.year, date.month, 1) first_of_this_month = datetime.datetime(date.year, date.month, 1)
_, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month))
last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month)
# if this is the latest month, note that it is only up until today # if this is the latest month, note that it is only up until today
now = datetime.datetime.now() now = datetime.datetime.now()
if now.year == date.year and now.month == date.month: if now.year == date.year and now.month == date.month:
last_day_of_month = now.day last_day_of_month = now.day
last_of_this_month = now last_of_this_month = now
periods = ((date.strftime(FORMAT_MONTH), periods = ((date.strftime(FORMAT_MONTH),
last_day_of_month, last_day_of_month,
first_of_this_month, last_of_this_month),) first_of_this_month, last_of_this_month),)
self.download_and_store(periods) self.download_and_store(periods)
   
   
def latest(self): def latest(self):
if self.period == 'monthly': if self.period == 'monthly':
# from first of this month to today # from first of this month to today
now = datetime.datetime.now() now = datetime.datetime.now()
first_of_this_month = datetime.datetime(now.year, now.month, 1) first_of_this_month = datetime.datetime(now.year, now.month, 1)
periods = ((now.strftime(FORMAT_MONTH), periods = ((now.strftime(FORMAT_MONTH),
now.day, now.day,
first_of_this_month, now),) first_of_this_month, now),)
else: else:
raise NotImplementedError raise NotImplementedError
self.download_and_store(periods) self.download_and_store(periods)
   
   
def for_date(self, for_date): def for_date(self, for_date):
assert isinstance(since_date, datetime.datetime) assert isinstance(since_date, datetime.datetime)
periods = [] # (period_name, period_complete_day, start_date, end_date) periods = [] # (period_name, period_complete_day, start_date, end_date)
if self.period == 'monthly': if self.period == 'monthly':
first_of_the_months_until_now = [] first_of_the_months_until_now = []
year = for_date.year year = for_date.year
month = for_date.month month = for_date.month
now = datetime.datetime.now() now = datetime.datetime.now()
first_of_this_month = datetime.datetime(now.year, now.month, 1) first_of_this_month = datetime.datetime(now.year, now.month, 1)
while True: while True:
first_of_the_month = datetime.datetime(year, month, 1) first_of_the_month = datetime.datetime(year, month, 1)
if first_of_the_month == first_of_this_month: if first_of_the_month == first_of_this_month:
periods.append((now.strftime(FORMAT_MONTH), periods.append((now.strftime(FORMAT_MONTH),
now.day, now.day,
first_of_this_month, now)) first_of_this_month, now))
break break
elif first_of_the_month < first_of_this_month: elif first_of_the_month < first_of_this_month:
in_the_next_month = first_of_the_month + datetime.timedelta(40) in_the_next_month = first_of_the_month + datetime.timedelta(40)
last_of_the_month = datetime.datetime(in_the_next_month.year, last_of_the_month = datetime.datetime(in_the_next_month.year,
in_the_next_month.month, 1)\ in_the_next_month.month, 1)\
- datetime.timedelta(1) - datetime.timedelta(1)
periods.append((now.strftime(FORMAT_MONTH), 0, periods.append((now.strftime(FORMAT_MONTH), 0,
first_of_the_month, last_of_the_month)) first_of_the_month, last_of_the_month))
else: else:
# first_of_the_month has got to the future somehow # first_of_the_month has got to the future somehow
break break
month += 1 month += 1
if month > 12: if month > 12:
year += 1 year += 1
month = 1 month = 1
else: else:
raise NotImplementedError raise NotImplementedError
self.download_and_store(periods) self.download_and_store(periods)
   
@staticmethod @staticmethod
def get_full_period_name(period_name, period_complete_day): def get_full_period_name(period_name, period_complete_day):
if period_complete_day: if period_complete_day:
return period_name + ' (up to %ith)' % period_complete_day return period_name + ' (up to %ith)' % period_complete_day
else: else:
return period_name return period_name
   
   
def download_and_store(self, periods): def download_and_store(self, periods):
for period_name, period_complete_day, start_date, end_date in periods: for period_name, period_complete_day, start_date, end_date in periods:
log.info('Period "%s" (%s - %s)', log.info('Period "%s" (%s - %s)',
self.get_full_period_name(period_name, period_complete_day), self.get_full_period_name(period_name, period_complete_day),
start_date.strftime('%Y-%m-%d'), start_date.strftime('%Y-%m-%d'),
end_date.strftime('%Y-%m-%d')) end_date.strftime('%Y-%m-%d'))
   
if self.delete_first: if self.delete_first:
log.info('Deleting existing Analytics for this period "%s"', log.info('Deleting existing Analytics for this period "%s"',
period_name) period_name)
ga_model.delete(period_name) ga_model.delete(period_name)
   
if not self.skip_url_stats: if not self.skip_url_stats:
# Clean out old url data before storing the new # Clean out old url data before storing the new
ga_model.pre_update_url_stats(period_name) ga_model.pre_update_url_stats(period_name)
   
accountName = config.get('googleanalytics.account') accountName = config.get('googleanalytics.account')
   
log.info('Downloading analytics for dataset views') log.info('Downloading analytics for dataset views')
data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+') data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+')
   
log.info('Storing dataset views (%i rows)', len(data.get('url'))) log.info('Storing dataset views (%i rows)', len(data.get('url')))
self.store(period_name, period_complete_day, data, ) self.store(period_name, period_complete_day, data, )
   
log.info('Downloading analytics for publisher views') log.info('Downloading analytics for publisher views')
data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+') data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+')
   
log.info('Storing publisher views (%i rows)', len(data.get('url'))) log.info('Storing publisher views (%i rows)', len(data.get('url')))
self.store(period_name, period_complete_day, data,) self.store(period_name, period_complete_day, data,)
   
# Make sure the All records are correct. # Make sure the All records are correct.
ga_model.post_update_url_stats() ga_model.post_update_url_stats()
   
log.info('Associating datasets with their publisher') log.info('Associating datasets with their publisher')
ga_model.update_publisher_stats(period_name) # about 30 seconds. ga_model.update_publisher_stats(period_name) # about 30 seconds.
   
   
log.info('Downloading and storing analytics for site-wide stats') log.info('Downloading and storing analytics for site-wide stats')
self.sitewide_stats( period_name, period_complete_day ) self.sitewide_stats( period_name, period_complete_day )
   
log.info('Downloading and storing analytics for social networks') log.info('Downloading and storing analytics for social networks')
self.update_social_info(period_name, start_date, end_date) self.update_social_info(period_name, start_date, end_date)
   
   
def update_social_info(self, period_name, start_date, end_date): def update_social_info(self, period_name, start_date, end_date):
start_date = start_date.strftime('%Y-%m-%d') start_date = start_date.strftime('%Y-%m-%d')
end_date = end_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d')
query = 'ga:hasSocialSourceReferral=~Yes$' query = 'ga:hasSocialSourceReferral=~Yes$'
metrics = 'ga:entrances' metrics = 'ga:entrances'
sort = '-ga:entrances' sort = '-ga:entrances'
   
try: try:
# Because of issues of invalid responses, we are going to make these requests # Because of issues of invalid responses, we are going to make these requests
# ourselves. # ourselves.
headers = {'authorization': 'Bearer ' + self.token} headers = {'authorization': 'Bearer ' + self.token}
   
args = dict(ids='ga:' + self.profile_id, args = dict(ids='ga:' + self.profile_id,
filters=query, filters=query,
metrics=metrics, metrics=metrics,
sort=sort, sort=sort,
dimensions="ga:landingPagePath,ga:socialNetwork", dimensions="ga:landingPagePath,ga:socialNetwork",
max_results=10000) max_results=10000)
   
args['start-date'] = start_date args['start-date'] = start_date
args['end-date'] = end_date args['end-date'] = end_date
   
results = self._get_json(args) results = self._get_json(args)
except Exception, e: except Exception, e:
log.exception(e) log.exception(e)
results = dict(url=[]) results = dict(url=[])
   
   
data = collections.defaultdict(list) data = collections.defaultdict(list)
rows = results.get('rows',[]) rows = results.get('rows',[])
for row in rows: for row in rows:
url = row[0] url = row[0]
data[url].append( (row[1], int(row[2]),) ) data[url].append( (row[1], int(row[2]),) )
ga_model.update_social(period_name, data) ga_model.update_social(period_name, data)
   
   
def download(self, start_date, end_date, path=None): def download(self, start_date, end_date, path=None):
'''Get data from GA for a given time period''' '''Get data from GA for a given time period'''
start_date = start_date.strftime('%Y-%m-%d') start_date = start_date.strftime('%Y-%m-%d')
end_date = end_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d')
query = 'ga:pagePath=%s$' % path query = 'ga:pagePath=%s$' % path
metrics = 'ga:pageviews, ga:visits' metrics = 'ga:pageviews, ga:visits'
sort = '-ga:pageviews' sort = '-ga:pageviews'
   
# Supported query params at # Supported query params at
# https://developers.google.com/analytics/devguides/reporting/core/v3/reference # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
# https://ga-dev-tools.appspot.com/explorer/ # https://ga-dev-tools.appspot.com/explorer/
try: try:
args = {} args = {}
args["sort"] = "-ga:pageviews" args["sort"] = "-ga:pageviews"
args["max-results"] = 100000 args["max-results"] = 100000
args["dimensions"] = "ga:pagePath" args["dimensions"] = "ga:pagePath"
args["start-date"] = start_date args["start-date"] = start_date
args["end-date"] = end_date args["end-date"] = end_date
args["metrics"] = metrics args["metrics"] = metrics
args["ids"] = "ga:" + self.profile_id args["ids"] = "ga:" + self.profile_id
args["filters"] = query args["filters"] = query
args["alt"] = "json" args["alt"] = "json"
print args print args
results = self._get_json(args) results = self._get_json(args)
   
except Exception, e: except Exception, e:
log.exception(e) log.exception(e)
return dict(url=[]) return dict(url=[])
   
packages = [] packages = []
log.info("There are %d results" % results['totalResults']) log.info("There are %d results" % results['totalResults'])
if results['totalResults'] > 0: if results['totalResults'] > 0:
for entry in results.get('rows'): for entry in results.get('rows'):
(loc,pageviews,visits) = entry (loc,pageviews,visits) = entry
#url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
url = loc url = loc
#print url #print url
if not url.startswith('/dataset/') and not url.startswith('/organization/'): if not url.startswith('/dataset/') and not url.startswith('/organization/'):
# filter out strays like: # filter out strays like:
# /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
# /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
continue continue
packages.append( (url, pageviews, visits,) ) # Temporary hack packages.append( (url, pageviews, visits,) ) # Temporary hack
return dict(url=packages) return dict(url=packages)
   
def store(self, period_name, period_complete_day, data): def store(self, period_name, period_complete_day, data):
if 'url' in data: if 'url' in data:
ga_model.update_url_stats(period_name, period_complete_day, data['url']) ga_model.update_url_stats(period_name, period_complete_day, data['url'])
   
def sitewide_stats(self, period_name, period_complete_day): def sitewide_stats(self, period_name, period_complete_day):
import calendar import calendar
year, month = period_name.split('-') year, month = period_name.split('-')
_, last_day_of_month = calendar.monthrange(int(year), int(month)) _, last_day_of_month = calendar.monthrange(int(year), int(month))
   
start_date = '%s-01' % period_name start_date = '%s-01' % period_name
end_date = '%s-%s' % (period_name, last_day_of_month) end_date = '%s-%s' % (period_name, last_day_of_month)
funcs = ['_totals_stats', '_social_stats', '_os_stats', funcs = ['_totals_stats', '_social_stats', '_os_stats',
'_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats']
for f in funcs: for f in funcs:
log.info('Downloading analytics for %s' % f.split('_')[1]) log.info('Downloading analytics for %s' % f.split('_')[1])
getattr(self, f)(start_date, end_date, period_name, period_complete_day) getattr(self, f)(start_date, end_date, period_name, period_complete_day)
   
def _get_results(result_data, f): def _get_results(result_data, f):
data = {} data = {}
for result in result_data: for result in result_data:
key = f(result) key = f(result)
data[key] = data.get(key,0) + result[1] data[key] = data.get(key,0) + result[1]
return data return data
   
def _get_json(self, params, prev_fail=False): def _get_json(self, params, prev_fail=False):
ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', '')) ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
if not ga_token_filepath: if not ga_token_filepath:
print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
'Google Analytics token file under key: googleanalytics.token.filepath' 'Google Analytics token file under key: googleanalytics.token.filepath'
return return
   
log.info("Trying to refresh our OAuth token") log.info("Trying to refresh our OAuth token")
try: try:
from ga_auth import init_service from ga_auth import init_service
self.token, svc = init_service(ga_token_filepath, None) self.token, svc = init_service(ga_token_filepath, None)
log.info("OAuth token refreshed") log.info("OAuth token refreshed")
except Exception, auth_exception: except Exception, auth_exception:
log.error("Oauth refresh failed") log.error("Oauth refresh failed")
log.exception(auth_exception) log.exception(auth_exception)
return return
   
try: try:
headers = {'authorization': 'Bearer ' + self.token} headers = {'authorization': 'Bearer ' + self.token}
r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers) r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers)
if r.status_code != 200: if r.status_code != 200:
log.info("STATUS: %s" % (r.status_code,)) log.info("STATUS: %s" % (r.status_code,))
log.info("CONTENT: %s" % (r.content,)) log.info("CONTENT: %s" % (r.content,))
raise Exception("Request with params: %s failed" % params) raise Exception("Request with params: %s failed" % params)
   
return json.loads(r.content) return json.loads(r.content)
except Exception, e: except Exception, e:
log.exception(e) log.exception(e)
   
return dict(url=[]) return dict(url=[])
   
def _totals_stats(self, start_date, end_date, period_name, period_complete_day): def _totals_stats(self, start_date, end_date, period_name, period_complete_day):
""" Fetches distinct totals, total pageviews etc """ """ Fetches distinct totals, total pageviews etc """
try: try:
args = {} args = {}
args["max-results"] = 100000 args["max-results"] = 100000
args["start-date"] = start_date args["start-date"] = start_date
args["end-date"] = end_date args["end-date"] = end_date
args["ids"] = "ga:" + self.profile_id args["ids"] = "ga:" + self.profile_id
   
args["metrics"] = "ga:pageviews" args["metrics"] = "ga:pageviews"
args["sort"] = "-ga:pageviews" args["sort"] = "-ga:pageviews"
args["alt"] = "json" args["alt"] = "json"
   
results = self._get_json(args) results = self._get_json(args)
except Exception, e: except Exception, e:
log.exception(e) log.exception(e)
results = dict(url=[]) results = dict(url=[])
   
result_data = results.get('rows') result_data = results.get('rows')
ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]},
period_complete_day) period_complete_day)
   
try: try:
# Because of issues of invalid responses, we are going to make these requests # Because of issues of invalid responses, we are going to make these requests
# ourselves. # ourselves.
headers = {'authorization': 'Bearer ' + self.token} headers = {'authorization': 'Bearer ' + self.token}
   
args = {} args = {}
args["max-results"] = 100000 args["max-results"] = 100000
args["start-date"] = start_date args["start-date"] = start_date
args["end-date"] = end_date args["end-date"] = end_date
args["ids"] = "ga:" + self.profile_id args["ids"] = "ga:" + self.profile_id
   
args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits" args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits"
args["alt"] = "json" args["alt"] = "json"
   
results = self._get_json(args) results = self._get_json(args)
except Exception, e: except Exception, e:
log.exception(e) log.exception(e)
results = dict(url=[]) results = dict(url=[])
   
result_data = results.get('rows') result_data = results.get('rows')
data = { data = {
'Pages per visit': result_data[0][0], 'Pages per visit': result_data[0][0],
'Average time on site': result_data[0][1], 'Average time on site': result_data[0][1],
'New visits': result_data[0][2], 'New visits': result_data[0][2],