import os
import logging
import datetime
import httplib
import collections
import json

import requests

from pylons import config

from ga_auth import init_service  # used by _get_json() to refresh the OAuth token
from ga_model import _normalize_url
import ga_model


#from ga_client import GA


log = logging.getLogger('ckanext.ga-report')


FORMAT_MONTH = '%Y-%m'
MIN_VIEWS = 50
MIN_VISITS = 20
MIN_DOWNLOADS = 10
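# Illustrative usage sketch (not part of this module's API). The service, OAuth
# token and profile id are normally obtained via ga_auth (init_service /
# get_profile_id) by the paster command that drives this class:
#
#   token, svc = init_service(ga_token_filepath, None)
#   downloader = DownloadAnalytics(svc, token=token, profile_id=get_profile_id(svc))
#   downloader.latest()                                  # the current month so far
#   downloader.for_date(datetime.datetime(2013, 1, 1))   # backfill from Jan 2013 onwards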
class DownloadAnalytics(object):
    '''Downloads and stores analytics info'''

    def __init__(self, service=None, token=None, profile_id=None,
                 delete_first=False, skip_url_stats=False):
        self.period = config['ga-report.period']
        self.service = service
        self.profile_id = profile_id
        self.delete_first = delete_first
        self.skip_url_stats = skip_url_stats
        self.token = token
    def specific_month(self, date):
        import calendar

        first_of_this_month = datetime.datetime(date.year, date.month, 1)
        _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month))
        last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month)
        # if this is the latest month, note that it is only up until today
        now = datetime.datetime.now()
        if now.year == date.year and now.month == date.month:
            last_day_of_month = now.day
            last_of_this_month = now
        periods = ((date.strftime(FORMAT_MONTH),
                    last_day_of_month,
                    first_of_this_month, last_of_this_month),)
        self.download_and_store(periods)
    def latest(self):
        if self.period == 'monthly':
            # from first of this month to today
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            periods = ((now.strftime(FORMAT_MONTH),
                        now.day,
                        first_of_this_month, now),)
        else:
            raise NotImplementedError
        self.download_and_store(periods)
    def for_date(self, for_date):
        assert isinstance(for_date, datetime.datetime)
        periods = []  # (period_name, period_complete_day, start_date, end_date)
        if self.period == 'monthly':
            year = for_date.year
            month = for_date.month
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            while True:
                first_of_the_month = datetime.datetime(year, month, 1)
                if first_of_the_month == first_of_this_month:
                    periods.append((now.strftime(FORMAT_MONTH),
                                    now.day,
                                    first_of_this_month, now))
                    break
                elif first_of_the_month < first_of_this_month:
                    in_the_next_month = first_of_the_month + datetime.timedelta(40)
                    last_of_the_month = datetime.datetime(in_the_next_month.year,
                                                          in_the_next_month.month, 1) \
                                        - datetime.timedelta(1)
                    periods.append((first_of_the_month.strftime(FORMAT_MONTH), 0,
                                    first_of_the_month, last_of_the_month))
                else:
                    # first_of_the_month has got to the future somehow
                    break
                month += 1
                if month > 12:
                    year += 1
                    month = 1
        else:
            raise NotImplementedError
        self.download_and_store(periods)
    @staticmethod
    def get_full_period_name(period_name, period_complete_day):
        if period_complete_day:
            return period_name + ' (up to %ith)' % period_complete_day
        else:
            return period_name
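    # download_and_store() below drives one run: for each period it optionally
    # deletes the existing figures, refreshes dataset and publisher page-view stats
    # (unless skip_url_stats is set), recalculates the publisher totals, and then
    # gathers the site-wide and social-network stats.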
    def download_and_store(self, periods):
        for period_name, period_complete_day, start_date, end_date in periods:
            log.info('Period "%s" (%s - %s)',
                     self.get_full_period_name(period_name, period_complete_day),
                     start_date.strftime('%Y-%m-%d'),
                     end_date.strftime('%Y-%m-%d'))

            if self.delete_first:
                log.info('Deleting existing Analytics for this period "%s"',
                         period_name)
                ga_model.delete(period_name)

            if not self.skip_url_stats:
                # Clean out old url data before storing the new
                ga_model.pre_update_url_stats(period_name)

                accountName = config.get('googleanalytics.account')

                log.info('Downloading analytics for dataset views')
                data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)

                log.info('Storing dataset views (%i rows)', len(data.get('url')))
                self.store(period_name, period_complete_day, data)

                log.info('Downloading analytics for publisher views')
                data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)

                log.info('Storing publisher views (%i rows)', len(data.get('url')))
                self.store(period_name, period_complete_day, data)

                # Make sure the All records are correct.
                ga_model.post_update_url_stats()

                log.info('Associating datasets with their publisher')
                ga_model.update_publisher_stats(period_name)  # about 30 seconds.

            log.info('Downloading and storing analytics for site-wide stats')
            self.sitewide_stats(period_name, period_complete_day)

            log.info('Downloading and storing analytics for social networks')
            self.update_social_info(period_name, start_date, end_date)
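    # update_social_info() below records, for each landing page, which social
    # networks referred visitors and how many entrances they produced, and stores
    # the result via ga_model.update_social().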
    def update_social_info(self, period_name, start_date, end_date):
        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        query = 'ga:hasSocialSourceReferral=~Yes$'
        metrics = 'ga:entrances'
        sort = '-ga:entrances'

        try:
            # Because of issues of invalid responses, we make these requests
            # ourselves (via _get_json) rather than through the Google API client.
            args = dict(ids='ga:' + self.profile_id,
                        filters=query,
                        metrics=metrics,
                        sort=sort,
                        dimensions="ga:landingPagePath,ga:socialNetwork")
            args['max-results'] = 10000
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        data = collections.defaultdict(list)
        rows = results.get('rows', [])
        for row in rows:
            url = _normalize_url('http:/' + row[0])
            data[url].append((row[1], int(row[2]),))
        ga_model.update_social(period_name, data)
    def download(self, start_date, end_date, path=None):
        '''Get data from GA for a given time period'''
        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        query = 'ga:pagePath=%s$' % path
        metrics = 'ga:pageviews, ga:visits'
        sort = '-ga:pageviews'

        # Supported query params at
        # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
        try:
            # Because of issues of invalid responses, we make this request
            # ourselves rather than through the Google API client.
            headers = {'authorization': 'Bearer ' + self.token}

            args = {}
            args["sort"] = sort
            args["max-results"] = 100000
            args["dimensions"] = "ga:pagePath"
            args["start-date"] = start_date
            args["end-date"] = end_date
            args["metrics"] = metrics
            args["ids"] = "ga:" + self.profile_id
            args["filters"] = query
            args["alt"] = "json"

            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga",
                             params=args, headers=headers)
            if r.status_code != 200:
                raise Exception("Request with params: %s failed" % args)

            results = json.loads(r.content)
            log.debug("GA response contained %d top-level keys", len(results.keys()))
        except Exception, e:
            log.exception(e)
            raise

        packages = []
        log.info("There are %d results" % results['totalResults'])
        for entry in results.get('rows'):
            (loc, pageviews, visits) = entry
            url = _normalize_url('http:/' + loc)  # strips off domain e.g. www.data.gov.uk or data.gov.uk

            if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
                # filter out strays like:
                # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
                # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
                continue
            packages.append((url, pageviews, visits,))  # Temporary hack
        return dict(url=packages)
    def store(self, period_name, period_complete_day, data):
        if 'url' in data:
            ga_model.update_url_stats(period_name, period_complete_day, data['url'])
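    # sitewide_stats() below derives the whole-month date range from period_name
    # and calls each of the _*_stats helpers in turn.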
    def sitewide_stats(self, period_name, period_complete_day):
        import calendar
        year, month = period_name.split('-')
        _, last_day_of_month = calendar.monthrange(int(year), int(month))

        start_date = '%s-01' % period_name
        end_date = '%s-%s' % (period_name, last_day_of_month)
        funcs = ['_totals_stats', '_social_stats', '_os_stats',
                 '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats']
        for f in funcs:
            log.info('Downloading analytics for %s' % f.split('_')[1])
            getattr(self, f)(start_date, end_date, period_name, period_complete_day)
    @staticmethod
    def _get_results(result_data, f):
        data = {}
        for result in result_data:
            key = f(result)
            data[key] = data.get(key, 0) + result[1]
        return data
    def _get_json(self, params, prev_fail=False):
        ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
        if not ga_token_filepath:
            print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
                  'Google Analytics token file under key: googleanalytics.token.filepath'
            return

        try:
            log.info("Trying to refresh our OAuth token")
            self.token, svc = init_service(ga_token_filepath, None)
            log.info("OAuth token refreshed")
        except Exception, auth_exception:
            log.error("OAuth refresh failed")
            log.exception(auth_exception)
            return

        try:
            headers = {'authorization': 'Bearer ' + self.token}
            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga",
                             params=params, headers=headers)
            if r.status_code != 200:
                log.info("STATUS: %s" % (r.status_code,))
                log.info("CONTENT: %s" % (r.content,))
                raise Exception("Request with params: %s failed" % params)

            return json.loads(r.content)
        except Exception, e:
            log.exception(e)

        return dict(url=[])
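    # The _*_stats helpers below all follow the same pattern: build a Core
    # Reporting API (v3) query dict, fetch it via _get_json(), aggregate the
    # returned rows, filter out the long tail where appropriate, and store the
    # result with ga_model.update_sitewide_stats(). A typical query looks like
    # this (values illustrative):
    #
    #   args = {"ids": "ga:" + self.profile_id,
    #           "metrics": "ga:pageviews",
    #           "sort": "-ga:pageviews",
    #           "dimensions": "ga:browser,ga:browserVersion",
    #           "start-date": "2013-01-01", "end-date": "2013-01-31",
    #           "max-results": 100000, "alt": "json"}
    #   results = self._get_json(args)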
    def _totals_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Fetches distinct totals, total pageviews etc """
        try:
            args = {}
            args["max-results"] = 100000
            args["start-date"] = start_date
            args["end-date"] = end_date
            args["ids"] = "ga:" + self.profile_id
            args["metrics"] = "ga:pageviews"
            args["sort"] = "-ga:pageviews"
            args["alt"] = "json"

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        ga_model.update_sitewide_stats(period_name, "Totals",
                                       {'Total page views': result_data[0][0]},
                                       period_complete_day)

        try:
            args = {}
            args["max-results"] = 100000
            args["start-date"] = start_date
            args["end-date"] = end_date
            args["ids"] = "ga:" + self.profile_id
            args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits"
            args["alt"] = "json"

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        data = {
            'Pages per visit': result_data[0][0],
            'Average time on site': result_data[0][1],
            'New visits': result_data[0][2],
            'Total visits': result_data[0][3],
        }
        ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day)

        # Bounces from / or another configurable page.
        path = '/%s%s' % (config.get('googleanalytics.account'),
                          config.get('ga-report.bounce_url', '/'))

        try:
            args = {}
            args["max-results"] = 100000
            args["start-date"] = start_date
            args["end-date"] = end_date
            args["ids"] = "ga:" + self.profile_id
            args["filters"] = 'ga:pagePath==%s' % (path,)
            args["dimensions"] = 'ga:pagePath'
            args["metrics"] = "ga:visitBounceRate"
            args["alt"] = "json"

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        if not result_data or len(result_data) != 1:
            log.error('Could not pinpoint the bounces for path: %s. Got results: %r',
                      path, result_data)
            return
        results = result_data[0]
        bounces = float(results[1])
        # visitBounceRate is already a %
        log.info('Google reports visitBounceRate as %s', bounces)
        ga_model.update_sitewide_stats(period_name, "Totals",
                                       {'Bounce rate (home page)': float(bounces)},
                                       period_complete_day)
    def _locale_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Fetches stats about language and country """
        try:
            args = {}
            args["max-results"] = 100000
            args["start-date"] = start_date
            args["end-date"] = end_date
            args["ids"] = "ga:" + self.profile_id
            args["dimensions"] = "ga:language,ga:country"
            args["metrics"] = "ga:pageviews"
            args["sort"] = "-ga:pageviews"
            args["alt"] = "json"

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        data = {}
        for result in result_data:
            data[result[0]] = data.get(result[0], 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Languages", data, period_complete_day)

        data = {}
        for result in result_data:
            data[result[1]] = data.get(result[1], 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day)
    def _download_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Fetches stats about data downloads """
        import ckan.model as model

        data = {}

        try:
            args = {}
            args["max-results"] = 100000
            args["start-date"] = start_date
            args["end-date"] = end_date
            args["ids"] = "ga:" + self.profile_id
            args["filters"] = 'ga:eventAction==download'
            args["dimensions"] = "ga:eventLabel"
            args["metrics"] = "ga:totalEvents"
            args["alt"] = "json"

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        if not result_data:
            # We may not have data for this time period, so we need to bail
            # early.
            log.info("There is no download data for this time period")
            return

        def process_result_data(result_data, cached=False):
            progress_total = len(result_data)
            progress_count = 0
            resources_not_matched = []
            for result in result_data:
                progress_count += 1
                if progress_count % 100 == 0:
                    log.debug('.. %d/%d done so far', progress_count, progress_total)

                url = result[0].strip()

                # Get package id associated with the resource that has this URL.
                q = model.Session.query(model.Resource)
                if cached:
                    r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first()
                else:
                    r = q.filter(model.Resource.url.like("%s%%" % url)).first()

                package_name = r.resource_group.package.name if r else ""
                if package_name:
                    data[package_name] = data.get(package_name, 0) + int(result[1])
                else:
                    resources_not_matched.append(url)
                    continue
            if resources_not_matched:
                log.debug('Could not match %i of %i resource URLs to datasets. e.g. %r',
                          len(resources_not_matched), progress_total,
                          resources_not_matched[:3])

        log.info('Associating downloads of resource URLs with their respective datasets')
        process_result_data(results.get('rows'))

        try:
            args = dict(ids='ga:' + self.profile_id,
                        filters='ga:eventAction==download-cache',
                        metrics='ga:totalEvents',
                        sort='-ga:totalEvents',
                        dimensions="ga:eventLabel")
            args['max-results'] = 10000
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        log.info('Associating downloads of cache resource URLs with their respective datasets')
        process_result_data(results.get('rows'), cached=False)

        self._filter_out_long_tail(data, MIN_DOWNLOADS)
        ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)
    def _social_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Finds out which social sites people are referred from """
        try:
            args = dict(ids='ga:' + self.profile_id,
                        metrics='ga:pageviews',
                        sort='-ga:pageviews',
                        dimensions="ga:socialNetwork,ga:referralPath")
            args['max-results'] = 10000
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        data = {}
        for result in result_data:
            if not result[0] == '(not set)':
                data[result[0]] = data.get(result[0], 0) + int(result[2])
        self._filter_out_long_tail(data, 3)
        ga_model.update_sitewide_stats(period_name, "Social sources", data, period_complete_day)
    def _os_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Operating system stats """
        try:
            args = dict(ids='ga:' + self.profile_id,
                        metrics='ga:pageviews',
                        sort='-ga:pageviews',
                        dimensions="ga:operatingSystem,ga:operatingSystemVersion")
            args['max-results'] = 10000
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        data = {}
        for result in result_data:
            data[result[0]] = data.get(result[0], 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Operating Systems", data, period_complete_day)

        data = {}
        for result in result_data:
            if int(result[2]) >= MIN_VIEWS:
                key = "%s %s" % (result[0], result[1])
                data[key] = result[2]
        ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data, period_complete_day)
    def _browser_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Information about browsers and browser versions """
        try:
            args = dict(ids='ga:' + self.profile_id,
                        metrics='ga:pageviews',
                        sort='-ga:pageviews',
                        dimensions="ga:browser,ga:browserVersion")
            args['max-results'] = 10000
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        # e.g. [u'Firefox', u'19.0', u'20']

        data = {}
        for result in result_data:
            data[result[0]] = data.get(result[0], 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Browsers", data, period_complete_day)

        data = {}
        for result in result_data:
            key = "%s %s" % (result[0], self._filter_browser_version(result[0], result[1]))
            data[key] = data.get(key, 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Browser versions", data, period_complete_day)
    @classmethod
    def _filter_browser_version(cls, browser, version_str):
        '''
        Simplifies a browser version string if it is detailed.
        i.e. groups together Firefox 3.5.1 and 3.5.2 to be just 3.
        This is helpful when viewing stats and good to protect privacy.
        '''
        ver = version_str
        parts = ver.split('.')
        if len(parts) > 1:
            if parts[1][0] == '0':
                ver = parts[0]
            else:
                ver = "%s" % (parts[0])
        # Special case complex version nums
        if browser in ['Safari', 'Android Browser']:
            ver = parts[0]
            if len(ver) > 2:
                num_hidden_digits = len(ver) - 2
                ver = ver[0] + ver[1] + 'X' * num_hidden_digits
        return ver
    def _mobile_stats(self, start_date, end_date, period_name, period_complete_day):
        """ Info about mobile devices """
        try:
            args = dict(ids='ga:' + self.profile_id,
                        metrics='ga:pageviews',
                        sort='-ga:pageviews',
                        dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo")
            args['max-results'] = 10000
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        result_data = results.get('rows')
        data = {}
        for result in result_data:
            data[result[0]] = data.get(result[0], 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Mobile brands", data, period_complete_day)

        data = {}
        for result in result_data:
            data[result[1]] = data.get(result[1], 0) + int(result[2])
        self._filter_out_long_tail(data, MIN_VIEWS)
        ga_model.update_sitewide_stats(period_name, "Mobile devices", data, period_complete_day)
    @classmethod
    def _filter_out_long_tail(cls, data, threshold=10):
        '''
        Given data which is a frequency distribution, filter out
        results which are below a threshold count. This is good to protect
        privacy.
        '''
        for key, value in data.items():
            if value < threshold:
                del data[key]