Remove prettyprinting unless DEBUG is set in env and add a new
[ckanext-ga-report.git] / ckanext / ga_report / download_analytics.py
blob:a/ckanext/ga_report/download_analytics.py -> blob:b/ckanext/ga_report/download_analytics.py
import os import os
import logging import logging
import datetime import datetime
  import collections
from pylons import config from pylons import config
  from ga_model import _normalize_url
import ga_model import ga_model
   
#from ga_client import GA #from ga_client import GA
   
log = logging.getLogger('ckanext.ga-report') log = logging.getLogger('ckanext.ga-report')
   
# strftime() pattern used in this file to turn a date into a period name.
FORMAT_MONTH = '%Y-%m' FORMAT_MONTH = '%Y-%m'
  # Threshold passed to _filter_out_long_tail() by the site-wide stats methods
  # and compared against each row's page views in _os_stats().
  MIN_VIEWS = 50
  # NOTE(review): not referenced in the visible part of this file; presumably
  # a minimum-visits cut-off used elsewhere - confirm.
  MIN_VISITS = 20
   
class DownloadAnalytics(object): class DownloadAnalytics(object):
'''Downloads and stores analytics info''' '''Downloads and stores analytics info'''
   
def __init__(self, service=None, profile_id=None, delete_first=False):
    '''Remember the GA connection details and the reporting options.

    service      -- object whose data().ga().get(...).execute() chain is
                    used to run the analytics queries
    profile_id   -- profile identifier; queries address it as
                    'ga:' + profile_id
    delete_first -- when true, download_and_store() deletes a period's
                    existing analytics before storing the new download
    '''
    # The reporting period is read from the 'ga-report.period' config key.
    self.period = config['ga-report.period']
    self.service, self.profile_id = service, profile_id
    self.delete_first = delete_first
   
def specific_month(self, date):
    '''Download and store the analytics for the calendar month that
    contains `date`.

    date -- object with .year, .month and .strftime() (e.g. a datetime)
    '''
    import calendar

    year, month = date.year, date.month
    # monthrange() returns (weekday of the 1st, number of days in the month)
    days_in_month = calendar.monthrange(int(year), int(month))[1]
    month_start = datetime.datetime(year, month, 1)
    month_end = datetime.datetime(year, month, days_in_month)
    # A single period: (period_name, period_complete_day, start_date, end_date)
    whole_month = (date.strftime(FORMAT_MONTH), days_in_month,
                   month_start, month_end)
    self.download_and_store((whole_month,))
   
   
def latest(self):
    '''Download and store the analytics for the current period so far.

    Only the 'monthly' period is implemented; any other value of
    self.period raises NotImplementedError.
    '''
    if self.period != 'monthly':
        raise NotImplementedError
    # from first of this month to today
    today = datetime.datetime.now()
    month_start = datetime.datetime(today.year, today.month, 1)
    # (period_name, period_complete_day, start_date, end_date)
    so_far = (today.strftime(FORMAT_MONTH), today.day, month_start, today)
    self.download_and_store((so_far,))
   
   
def for_date(self, for_date):
    '''Download and store analytics for every month from the month of
    `for_date` up to and including the current month.

    for_date -- datetime.datetime; only its year and month are used.

    Completed months are passed on with period_complete_day 0 and named
    after themselves; the current month runs from its first day to now and
    carries today's day number. A `for_date` in a future month yields no
    periods. Raises NotImplementedError unless self.period is 'monthly'.
    '''
    # The parameter was renamed from since_date; the assert must check the
    # new name, otherwise it raises NameError before doing anything.
    assert isinstance(for_date, datetime.datetime)
    if self.period != 'monthly':
        raise NotImplementedError

    periods = []  # (period_name, period_complete_day, start_date, end_date)
    now = datetime.datetime.now()
    first_of_this_month = datetime.datetime(now.year, now.month, 1)
    year, month = for_date.year, for_date.month
    while True:
        first_of_the_month = datetime.datetime(year, month, 1)
        if first_of_the_month > first_of_this_month:
            # first_of_the_month has got to the future somehow
            break
        if first_of_the_month == first_of_this_month:
            periods.append((now.strftime(FORMAT_MONTH),
                            now.day,
                            first_of_this_month, now))
            break
        # A completed month: step to the first of the following month and
        # go back one day to find its last day.
        month += 1
        if month > 12:
            year += 1
            month = 1
        last_of_the_month = (datetime.datetime(year, month, 1)
                             - datetime.timedelta(1))
        # Name the period after the month it covers, not the current month.
        periods.append((first_of_the_month.strftime(FORMAT_MONTH), 0,
                        first_of_the_month, last_of_the_month))
    self.download_and_store(periods)
   
@staticmethod @staticmethod
def get_full_period_name(period_name, period_complete_day): def get_full_period_name(period_name, period_complete_day):
if period_complete_day: if period_complete_day:
return period_name + ' (up to %ith)' % period_complete_day return period_name + ' (up to %ith)' % period_complete_day
else: else:
return period_name return period_name
   
   
def download_and_store(self, periods):
    '''Fetch the analytics for each period from GA and store them.

    periods -- iterable of (period_name, period_complete_day, start_date,
               end_date) tuples; the dates must support strftime().

    Per period: optionally delete the existing data (self.delete_first),
    run ga_model.pre_update_url_stats(), download and store dataset and
    publisher page views, aggregate by publisher, then update the
    site-wide and social-network stats.
    '''
    # (download log message, store log message, pagePath filter template);
    # the template is filled with the 'googleanalytics.account' config value.
    page_groups = (
        ('Downloading analytics for dataset views',
         'Storing dataset views (%i rows)',
         '~/%s/dataset/[a-z0-9-_]+'),
        ('Downloading analytics for publisher views',
         'Storing publisher views (%i rows)',
         '~/%s/publisher/[a-z0-9-_]+'),
    )

    for period_name, period_complete_day, start_date, end_date in periods:
        full_name = self.get_full_period_name(period_name,
                                              period_complete_day)
        log.info('Period "%s" (%s - %s)',
                 full_name,
                 start_date.strftime('%Y-%m-%d'),
                 end_date.strftime('%Y-%m-%d'))

        if self.delete_first:
            log.info('Deleting existing Analytics for this period "%s"',
                     period_name)
            ga_model.delete(period_name)

        # Clean up the entries before we run this
        ga_model.pre_update_url_stats(period_name)

        account_name = config.get('googleanalytics.account')

        for downloading_msg, storing_msg, path_template in page_groups:
            log.info(downloading_msg)
            data = self.download(start_date, end_date,
                                 path_template % account_name)

            log.info(storing_msg, len(data.get('url')))
            self.store(period_name, period_complete_day, data)

        log.info('Aggregating datasets by publisher')
        ga_model.update_publisher_stats(period_name)  # about 30 seconds.

        log.info('Downloading and storing analytics for site-wide stats')
        self.sitewide_stats(period_name)

        log.info('Downloading and storing analytics for social networks')
        self.update_social_info(period_name, start_date, end_date)
   
   
  def update_social_info(self, period_name, start_date, end_date):
  start_date = start_date.strftime('%Y-%m-%d')
  end_date = end_date.strftime('%Y-%m-%d')
  query = 'ga:hasSocialSourceReferral=~Yes$'
  metrics = 'ga:entrances'
  sort = '-ga:entrances'
   
  # Supported query params at
  # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  filters=query,
  start_date=start_date,
  metrics=metrics,
  sort=sort,
  dimensions="ga:landingPagePath,ga:socialNetwork",
  max_results=10000,
  end_date=end_date).execute()
  data = collections.defaultdict(list)
  rows = results.get('rows',[])
  for row in rows:
  data[_normalize_url(row[0])].append( (row[1], int(row[2]),) )
  ga_model.update_social(period_name, data)
   
   
  def download(self, start_date, end_date, path=None):
'''Get data from GA for a given time period''' '''Get data from GA for a given time period'''
start_date = start_date.strftime('%Y-%m-%d') start_date = start_date.strftime('%Y-%m-%d')
end_date = end_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d')
query = 'ga:pagePath=~/dataset/[a-z0-9-]+$' query = 'ga:pagePath=%s$' % path
metrics = 'ga:uniquePageviews' metrics = 'ga:uniquePageviews, ga:visits'
sort = '-ga:uniquePageviews' sort = '-ga:uniquePageviews'
   
# Supported query params at # Supported query params at
# https://developers.google.com/analytics/devguides/reporting/core/v3/reference # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
results = self.service.data().ga().get( results = self.service.data().ga().get(
ids='ga:' + self.profile_id, ids='ga:' + self.profile_id,
filters=query, filters=query,
start_date=start_date, start_date=start_date,
metrics=metrics, metrics=metrics,
sort=sort, sort=sort,
dimensions="ga:pagePath", dimensions="ga:pagePath",
max_results=10000, max_results=10000,
end_date=end_date).execute() end_date=end_date).execute()
   
if os.getenv('DEBUG'):  
import pprint  
pprint.pprint(results)  
print 'Total results: %s' % results.get('totalResults')  
   
packages = [] packages = []
for entry in results.get('rows'): for entry in results.get('rows'):
(loc,size,) = entry (loc,pageviews,visits) = entry
packages.append( ('http:/' + loc,size, '',) ) # Temporary hack url = _normalize_url('http:/' + loc)
  if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
  continue
  packages.append( (url, pageviews, visits,) ) # Temporary hack
return dict(url=packages) return dict(url=packages)
   
def store(self, period_name, period_complete_day, data):
    '''Store the downloaded URL stats for a period.

    data -- mapping as returned by download(); nothing is stored when it
            has no 'url' key.
    '''
    if 'url' not in data:
        return
    ga_model.update_url_stats(period_name, period_complete_day, data['url'])
   
  def sitewide_stats(self, period_name):
  import calendar
  year, month = period_name.split('-')
  _, last_day_of_month = calendar.monthrange(int(year), int(month))
   
  start_date = '%s-01' % period_name
  end_date = '%s-%s' % (period_name, last_day_of_month)
  funcs = ['_totals_stats', '_social_stats', '_os_stats',
  '_locale_stats', '_browser_stats', '_mobile_stats']
  for f in funcs:
  log.info('Downloading analytics for %s' % f.split('_')[1])
  getattr(self, f)(start_date, end_date, period_name)
   
  def _get_results(result_data, f):
  data = {}
  for result in result_data:
  key = f(result)
  data[key] = data.get(key,0) + result[1]
  return data
   
  def _totals_stats(self, start_date, end_date, period_name):
  """ Fetches distinct totals, total pageviews etc """
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  start_date=start_date,
  metrics='ga:uniquePageviews',
  sort='-ga:uniquePageviews',
  max_results=10000,
  end_date=end_date).execute()
  result_data = results.get('rows')
  ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]})
   
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  start_date=start_date,
  metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits',
  max_results=10000,
  end_date=end_date).execute()
  result_data = results.get('rows')
  data = {
  'Pages per visit': result_data[0][0],
  'Average time on site': result_data[0][1],
  'New visits': result_data[0][2],
  'Total visits': result_data[0][3],
  }
  ga_model.update_sitewide_stats(period_name, "Totals", data)
   
  # Bounces from /data. This url is specified in configuration because
  # for DGU we don't want /.
  path = config.get('ga-report.bounce_url','/')
  print path
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  filters='ga:pagePath=~%s$' % (path,),
  start_date=start_date,
  metrics='ga:bounces,ga:uniquePageviews',
  dimensions='ga:pagePath',
  max_results=10000,
  end_date=end_date).execute()
  result_data = results.get('rows')
  for results in result_data:
  if results[0] == path:
  bounce, total = [float(x) for x in results[1:]]
  pct = 100 * bounce/total
  print "%d bounces from %d total == %s" % (bounce, total, pct)
  ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct})
   
   
  def _locale_stats(self, start_date, end_date, period_name):
  """ Fetches stats about language and country """
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  start_date=start_date,
  metrics='ga:uniquePageviews',
  sort='-ga:uniquePageviews',
  dimensions="ga:language,ga:country",
  max_results=10000,
  end_date=end_date).execute()
  result_data = results.get('rows')
  data = {}
  for result in result_data:
  data[result[0]] = data.get(result[0], 0) + int(result[2])
  self._filter_out_long_tail(data, MIN_VIEWS)
  ga_model.update_sitewide_stats(period_name, "Languages", data)
   
  data = {}
  for result in result_data:
  data[result[1]] = data.get(result[1], 0) + int(result[2])
  self._filter_out_long_tail(data, MIN_VIEWS)
  ga_model.update_sitewide_stats(period_name, "Country", data)
   
   
  def _social_stats(self, start_date, end_date, period_name):
  """ Finds out which social sites people are referred from """
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  start_date=start_date,
  metrics='ga:uniquePageviews',
  sort='-ga:uniquePageviews',
  dimensions="ga:socialNetwork,ga:referralPath",
  max_results=10000,
  end_date=end_date).execute()
  result_data = results.get('rows')
  data = {}
  for result in result_data:
  if not result[0] == '(not set)':
  data[result[0]] = data.get(result[0], 0) + int(result[2])
  self._filter_out_long_tail(data, 3)
  ga_model.update_sitewide_stats(period_name, "Social sources", data)
   
   
  def _os_stats(self, start_date, end_date, period_name):
  """ Operating system stats """
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,
  start_date=start_date,
  metrics='ga:uniquePageviews',
  sort='-ga:uniquePageviews',
  dimensions="ga:operatingSystem,ga:operatingSystemVersion",
  max_results=10000,
  end_date=end_date).execute()
  result_data = results.get('rows')
  data = {}
  for result in result_data:
  data[result[0]] = data.get(result[0], 0) + int(result[2])
  self._filter_out_long_tail(data, MIN_VIEWS)
  ga_model.update_sitewide_stats(period_name, "Operating Systems", data)
   
  data = {}
  for result in result_data:
  if int(result[2]) >= MIN_VIEWS:
  key = "%s %s" % (result[0],result[1])
  data[key] = result[2]
  ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data)
   
   
  def _browser_stats(self, start_date, end_date, period_name):
  """ Information about browsers and browser versions """
  results = self.service.data().ga().get(
  ids='ga:' + self.profile_id,