Remove prettyprinting unless DEBUG is set in env and add a new
[ckanext-ga-report.git] / ckanext / ga_report / download_analytics.py
blob:a/ckanext/ga_report/download_analytics.py -> blob:b/ckanext/ga_report/download_analytics.py
import os import os
import logging import logging
import datetime import datetime
  import httplib
  import urllib
  import collections
  import requests
  import json
  import re
from pylons import config from pylons import config
  from ga_model import _normalize_url
import ga_model import ga_model
   
#from ga_client import GA #from ga_client import GA
   
log = logging.getLogger('ckanext.ga-report') log = logging.getLogger('ckanext.ga-report')
   
FORMAT_MONTH = '%Y-%m' FORMAT_MONTH = '%Y-%m'
  MIN_VIEWS = 50
  MIN_VISITS = 20
  MIN_DOWNLOADS = 10
   
class DownloadAnalytics(object):
    '''Downloads and stores analytics info'''

    def __init__(self, service=None, token=None, profile_id=None, delete_first=False,
                 skip_url_stats=False):
        # service: Google Analytics API client object (may be None when the
        #          raw-HTTP path via _get_json is used instead)
        # token: OAuth bearer token for the raw HTTP requests
        # profile_id: GA profile (view) id to query
        # delete_first: wipe any stats already stored for a period before
        #               downloading it again
        # skip_url_stats: skip the per-URL dataset/publisher stats pass
        self.period = config['ga-report.period']
        self.service = service
        self.token = token
        self.profile_id = profile_id
        self.delete_first = delete_first
        self.skip_url_stats = skip_url_stats
  def specific_month(self, date):
  import calendar
   
  first_of_this_month = datetime.datetime(date.year, date.month, 1)
  _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month))
  last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month)
  # if this is the latest month, note that it is only up until today
  now = datetime.datetime.now()
  if now.year == date.year and now.month == date.month:
  last_day_of_month = now.day
  last_of_this_month = now
  periods = ((date.strftime(FORMAT_MONTH),
  last_day_of_month,
  first_of_this_month, last_of_this_month),)
  self.download_and_store(periods)
   
   
def latest(self): def latest(self):
if self.period == 'monthly': if self.period == 'monthly':
# from first of this month to today # from first of this month to today
now = datetime.datetime.now() now = datetime.datetime.now()
first_of_this_month = datetime.datetime(now.year, now.month, 1) first_of_this_month = datetime.datetime(now.year, now.month, 1)
periods = ((now.strftime(FORMAT_MONTH), periods = ((now.strftime(FORMAT_MONTH),
now.day, now.day,
first_of_this_month, now),) first_of_this_month, now),)
else: else:
raise NotImplementedError raise NotImplementedError
self.download_and_store(periods) self.download_and_store(periods)
   
   
def since_date(self, since_date): def for_date(self, for_date):
assert isinstance(since_date, datetime.datetime) assert isinstance(since_date, datetime.datetime)
periods = [] # (period_name, period_complete_day, start_date, end_date) periods = [] # (period_name, period_complete_day, start_date, end_date)
if self.period == 'monthly': if self.period == 'monthly':
first_of_the_months_until_now = [] first_of_the_months_until_now = []
year = since_date.year year = for_date.year
month = since_date.month month = for_date.month
now = datetime.datetime.now() now = datetime.datetime.now()
first_of_this_month = datetime.datetime(now.year, now.month, 1) first_of_this_month = datetime.datetime(now.year, now.month, 1)
while True: while True:
first_of_the_month = datetime.datetime(year, month, 1) first_of_the_month = datetime.datetime(year, month, 1)
if first_of_the_month == first_of_this_month: if first_of_the_month == first_of_this_month:
periods.append((now.strftime(FORMAT_MONTH), periods.append((now.strftime(FORMAT_MONTH),
now.day, now.day,
first_of_this_month, now)) first_of_this_month, now))
break break
elif first_of_the_month < first_of_this_month: elif first_of_the_month < first_of_this_month:
in_the_next_month = first_of_the_month + datetime.timedelta(40) in_the_next_month = first_of_the_month + datetime.timedelta(40)
last_of_the_month = datetime.datetime(in_the_next_month.year, last_of_the_month = datetime.datetime(in_the_next_month.year,
in_the_next_month.month, 1)\ in_the_next_month.month, 1)\
- datetime.timedelta(1) - datetime.timedelta(1)
periods.append((now.strftime(FORMAT_MONTH), 0, periods.append((now.strftime(FORMAT_MONTH), 0,
first_of_the_month, last_of_the_month)) first_of_the_month, last_of_the_month))
else: else:
# first_of_the_month has got to the future somehow # first_of_the_month has got to the future somehow
break break
month += 1 month += 1
if month > 12: if month > 12:
year += 1 year += 1
month = 1 month = 1
else: else:
raise NotImplementedError raise NotImplementedError
self.download_and_store(periods) self.download_and_store(periods)
   
@staticmethod @staticmethod
def get_full_period_name(period_name, period_complete_day): def get_full_period_name(period_name, period_complete_day):
if period_complete_day: if period_complete_day:
return period_name + ' (up to %ith)' % period_complete_day return period_name + ' (up to %ith)' % period_complete_day
else: else:
return period_name return period_name
   
   
def download_and_store(self, periods): def download_and_store(self, periods):
for period_name, period_complete_day, start_date, end_date in periods: for period_name, period_complete_day, start_date, end_date in periods:
log.info('Downloading Analytics for period "%s" (%s - %s)', log.info('Period "%s" (%s - %s)',
self.get_full_period_name(period_name, period_complete_day), self.get_full_period_name(period_name, period_complete_day),
start_date.strftime('%Y %m %d'), start_date.strftime('%Y-%m-%d'),
end_date.strftime('%Y %m %d')) end_date.strftime('%Y-%m-%d'))
data = self.download(start_date, end_date)  
log.info('Storing Analytics for period "%s"', if self.delete_first:
self.get_full_period_name(period_name, period_complete_day)) log.info('Deleting existing Analytics for this period "%s"',
self.store(period_name, period_complete_day, data) period_name)
  ga_model.delete(period_name)
   
def download(self, start_date, end_date): if not self.skip_url_stats:
  # Clean out old url data before storing the new
  ga_model.pre_update_url_stats(period_name)
   
  accountName = config.get('googleanalytics.account')
   
  log.info('Downloading analytics for dataset views')
  data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+')
   
  log.info('Storing dataset views (%i rows)', len(data.get('url')))
  self.store(period_name, period_complete_day, data, )
   
  log.info('Downloading analytics for publisher views')
  data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+')
   
  log.info('Storing publisher views (%i rows)', len(data.get('url')))
  self.store(period_name, period_complete_day, data,)
   
  # Make sure the All records are correct.
  ga_model.post_update_url_stats()
   
  log.info('Associating datasets with their publisher')
  ga_model.update_publisher_stats(period_name) # about 30 seconds.
   
   
  log.info('Downloading and storing analytics for site-wide stats')
  self.sitewide_stats( period_name, period_complete_day )
   
  log.info('Downloading and storing analytics for social networks')
  self.update_social_info(period_name, start_date, end_date)
   
   
  def update_social_info(self, period_name, start_date, end_date):
  start_date = start_date.strftime('%Y-%m-%d')
  end_date = end_date.strftime('%Y-%m-%d')
  query = 'ga:hasSocialSourceReferral=~Yes$'
  metrics = 'ga:entrances'
  sort = '-ga:entrances'
   
  try:
  # Because of issues of invalid responses, we are going to make these requests
  # ourselves.
  headers = {'authorization': 'Bearer ' + self.token}
   
  args = dict(ids='ga:' + self.profile_id,
  filters=query,
  metrics=metrics,
  sort=sort,
  dimensions="ga:landingPagePath,ga:socialNetwork",
  max_results=10000)
   
  args['start-date'] = start_date
  args['end-date'] = end_date
   
  results = self._get_json(args)
  except Exception, e:
  log.exception(e)
  results = dict(url=[])
   
   
  data = collections.defaultdict(list)
  rows = results.get('rows',[])
  for row in rows:
  url = row[0]
  data[url].append( (row[1], int(row[2]),) )
  ga_model.update_social(period_name, data)
   
   
  def download(self, start_date, end_date, path=None):
'''Get data from GA for a given time period''' '''Get data from GA for a given time period'''
start_date = start_date.strftime('%Y-%m-%d') start_date = start_date.strftime('%Y-%m-%d')
end_date = end_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d')
query = 'ga:pagePath=~/dataset/[a-z0-9-]+$' query = 'ga:pagePath=%s$' % path
metrics = 'ga:uniquePageviews' metrics = 'ga:pageviews, ga:visits'
sort = '-ga:uniquePageviews' sort = '-ga:pageviews'
   
# Supported query params at # Supported query params at
# https://developers.google.com/analytics/devguides/reporting/core/v3/reference # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
results = self.service.data().ga().get( # https://ga-dev-tools.appspot.com/explorer/
ids='ga:' + self.profile_id, try:
filters=query, args = {}
start_date=start_date, args["sort"] = "-ga:pageviews"
metrics=metrics, args["max-results"] = 100000
sort=sort, args["dimensions"] = "ga:pagePath"
dimensions="ga:pagePath", args["start-date"] = start_date
max_results=10000, args["end-date"] = end_date
end_date=end_date).execute() args["metrics"] = metrics
  args["ids"] = "ga:" + self.profile_id
if os.getenv('DEBUG'): args["filters"] = query
import pprint args["alt"] = "json"
pprint.pprint(results) print args
print 'Total results: %s' % results.get('totalResults') results = self._get_json(args)
   
  except Exception, e:
  log.exception(e)
  return dict(url=[])
   
packages = [] packages = []
for entry in results.get('rows'): log.info("There are %d results" % results['totalResults'])
(loc,size,) = entry if results['totalResults'] > 0:
packages.append( ('http:/' + loc,size, '',) ) # Temporary hack for entry in results.get('rows'):
  (loc,pageviews,visits) = entry
  #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
  url = loc
  #print url
  if not url.startswith('/dataset/') and not url.startswith('/organization/'):
  # filter out strays like:
  # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
  # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
  continue
  packages.append( (url, pageviews, visits,) ) # Temporary hack
return dict(url=packages) return dict(url=packages)
   
def store(self, period_name, period_complete_day, data): def store(self, period_name, period_complete_day, data):
if 'url' in data: if 'url' in data:
ga_model.update_url_stats(period_name, period_complete_day, data['url']) ga_model.update_url_stats(period_name, period_complete_day, data['url'])
   
  def sitewide_stats(self, period_name, period_complete_day):
  import calendar
  year, month = period_name.split('-')
  _, last_day_of_month = calendar.monthrange(int(year), int(month))
   
  start_date = '%s-01' % period_name
  end_date = '%s-%s' % (period_name, last_day_of_month)
  funcs = ['_totals_stats', '_social_stats', '_os_stats',
  '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats']
  for f in funcs:
  log.info('Downloading analytics for %s' % f.split('_')[1])
  getattr(self, f)(start_date, end_date, period_name, period_complete_day)
   
  def _get_results(result_data, f):
  data = {}
  for result in result_data:
  key = f(result)
  data[key] = data.get(key,0) + result[1]
  return data
   
  def _get_json(self, params, prev_fail=False):
  ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
  if not ga_token_filepath:
  print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
  'Google Analytics token file under key: googleanalytics.token.filepath'
  return
   
  log.info("Trying to refresh our OAuth token")
  try:
  from ga_auth import init_service
  self.token, svc = init_service(ga_token_filepath, None)
  log.info("OAuth token refreshed")
  except Exception, auth_exception:
  log.error("Oauth refresh failed")
  log.exception(auth_exception)
  return
   
  try:
  headers = {'authorization': 'Bearer ' + self.token}
  r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers)
  if r.status_code != 200:
  log.info("STATUS: %s" % (r.status_code,))
  log.info("CONTENT: %s" % (r.content,))
  raise Exception("Request with params: %s failed" % params)
   
  return json.loads(r.content)
  except Exception, e:
  log.exception(e)
   
  return dict(url=[])
   
  def _totals_stats(self, start_date, end_date, period_name, period_complete_day):
  """ Fetches distinct totals, total pageviews etc """
  try:
  args = {}
  args["max-results"] = 100000
  args["start-date"] = start_date
  args["end-date"] = end_date
  args["ids"] = "ga:" + self.profile_id
   
  args["metrics"] = "ga:pageviews"
  args["sort"] = "-ga:pageviews"
  args["alt"] = "json"
   
  results = self._get_json(args)
  except Exception, e:
  log.exception(e)
  results = dict(url=[])
   
  result_data = results.get('rows')
  ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]},
  period_complete_day)
   
  try:
  # Because of issues of invalid responses, we are going to make these requests
  # ourselves.
  headers = {'authorization': 'Bearer ' + self.token}
   
  args = {}
  args["max-results"] = 100000
  args["start-date"] = start_date
  args["end-date"] = end_date
  args["ids"] = "ga:" + self.profile_id
   
  args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits"
  args["alt"] = "json"
   
  results = self._get_json(args)
  except Exception, e:
  log.exception(e)
  results = dict(url=[])
   
  result_data = results.get('rows')
  data = {
  'Pages per visit': result_data[0][0],
  'Average time on site': result_data[0][1],
  'New visits': result_data[0][2],
  'Total visits': result_data[0][3],
  }
  ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day)
   
  # Bounces from / or another configurable page.
  path = '/' #% (config.get('googleanalytics.account'), config.get('ga-report.bounce_url', '/'))
   
  try:
  # Because of issues of invalid responses, we ar