| import os | import os |
| import logging | import logging |
| import datetime | import datetime |
| import httplib | import httplib |
| import urllib | |
| import collections | import collections |
| import requests | import requests |
| import json | import json |
| import re | |
| from pylons import config | from pylons import config |
| from ga_model import _normalize_url | from ga_model import _normalize_url |
| import ga_model | import ga_model |
| #from ga_client import GA | #from ga_client import GA |
# Module-level logger for the ga-report extension.
log = logging.getLogger('ckanext.ga-report')

# strftime format used to name monthly reporting periods, e.g. '2013-02'.
FORMAT_MONTH = '%Y-%m'
# Minimum-count thresholds; presumably used to filter low-traffic entries in
# reporting code elsewhere — not referenced in this part of the module.
MIN_VIEWS = 50
MIN_VISITS = 20
MIN_DOWNLOADS = 10
| class DownloadAnalytics(object): | class DownloadAnalytics(object): |
| '''Downloads and stores analytics info''' | '''Downloads and stores analytics info''' |
| def __init__(self, service=None, token=None, profile_id=None, delete_first=False, | def __init__(self, service=None, token=None, profile_id=None, delete_first=False, |
| skip_url_stats=False): | skip_url_stats=False): |
| self.period = config['ga-report.period'] | self.period = config['ga-report.period'] |
| self.service = service | self.service = service |
| self.profile_id = profile_id | self.profile_id = profile_id |
| self.delete_first = delete_first | self.delete_first = delete_first |
| self.skip_url_stats = skip_url_stats | self.skip_url_stats = skip_url_stats |
| self.token = token | self.token = token |
| def specific_month(self, date): | def specific_month(self, date): |
| import calendar | import calendar |
| first_of_this_month = datetime.datetime(date.year, date.month, 1) | first_of_this_month = datetime.datetime(date.year, date.month, 1) |
| _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) | _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month)) |
| last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) | last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month) |
| # if this is the latest month, note that it is only up until today | # if this is the latest month, note that it is only up until today |
| now = datetime.datetime.now() | now = datetime.datetime.now() |
| if now.year == date.year and now.month == date.month: | if now.year == date.year and now.month == date.month: |
| last_day_of_month = now.day | last_day_of_month = now.day |
| last_of_this_month = now | last_of_this_month = now |
| periods = ((date.strftime(FORMAT_MONTH), | periods = ((date.strftime(FORMAT_MONTH), |
| last_day_of_month, | last_day_of_month, |
| first_of_this_month, last_of_this_month),) | first_of_this_month, last_of_this_month),) |
| self.download_and_store(periods) | self.download_and_store(periods) |
| def latest(self): | def latest(self): |
| if self.period == 'monthly': | if self.period == 'monthly': |
| # from first of this month to today | # from first of this month to today |
| now = datetime.datetime.now() | now = datetime.datetime.now() |
| first_of_this_month = datetime.datetime(now.year, now.month, 1) | first_of_this_month = datetime.datetime(now.year, now.month, 1) |
| periods = ((now.strftime(FORMAT_MONTH), | periods = ((now.strftime(FORMAT_MONTH), |
| now.day, | now.day, |
| first_of_this_month, now),) | first_of_this_month, now),) |
| else: | else: |
| raise NotImplementedError | raise NotImplementedError |
| self.download_and_store(periods) | self.download_and_store(periods) |
| def for_date(self, for_date): | def for_date(self, for_date): |
| assert isinstance(since_date, datetime.datetime) | assert isinstance(since_date, datetime.datetime) |
| periods = [] # (period_name, period_complete_day, start_date, end_date) | periods = [] # (period_name, period_complete_day, start_date, end_date) |
| if self.period == 'monthly': | if self.period == 'monthly': |
| first_of_the_months_until_now = [] | first_of_the_months_until_now = [] |
| year = for_date.year | year = for_date.year |
| month = for_date.month | month = for_date.month |
| now = datetime.datetime.now() | now = datetime.datetime.now() |
| first_of_this_month = datetime.datetime(now.year, now.month, 1) | first_of_this_month = datetime.datetime(now.year, now.month, 1) |
| while True: | while True: |
| first_of_the_month = datetime.datetime(year, month, 1) | first_of_the_month = datetime.datetime(year, month, 1) |
| if first_of_the_month == first_of_this_month: | if first_of_the_month == first_of_this_month: |
| periods.append((now.strftime(FORMAT_MONTH), | periods.append((now.strftime(FORMAT_MONTH), |
| now.day, | now.day, |
| first_of_this_month, now)) | first_of_this_month, now)) |
| break | break |
| elif first_of_the_month < first_of_this_month: | elif first_of_the_month < first_of_this_month: |
| in_the_next_month = first_of_the_month + datetime.timedelta(40) | in_the_next_month = first_of_the_month + datetime.timedelta(40) |
| last_of_the_month = datetime.datetime(in_the_next_month.year, | last_of_the_month = datetime.datetime(in_the_next_month.year, |
| in_the_next_month.month, 1)\ | in_the_next_month.month, 1)\ |
| - datetime.timedelta(1) | - datetime.timedelta(1) |
| periods.append((now.strftime(FORMAT_MONTH), 0, | periods.append((now.strftime(FORMAT_MONTH), 0, |
| first_of_the_month, last_of_the_month)) | first_of_the_month, last_of_the_month)) |
| else: | else: |
| # first_of_the_month has got to the future somehow | # first_of_the_month has got to the future somehow |
| break | break |
| month += 1 | month += 1 |
| if month > 12: | if month > 12: |
| year += 1 | year += 1 |
| month = 1 | month = 1 |
| else: | else: |
| raise NotImplementedError | raise NotImplementedError |
| self.download_and_store(periods) | self.download_and_store(periods) |
| @staticmethod | @staticmethod |
| def get_full_period_name(period_name, period_complete_day): | def get_full_period_name(period_name, period_complete_day): |
| if period_complete_day: | if period_complete_day: |
| return period_name + ' (up to %ith)' % period_complete_day | return period_name + ' (up to %ith)' % period_complete_day |
| else: | else: |
| return period_name | return period_name |
| def download_and_store(self, periods): | def download_and_store(self, periods): |
| for period_name, period_complete_day, start_date, end_date in periods: | for period_name, period_complete_day, start_date, end_date in periods: |
| log.info('Period "%s" (%s - %s)', | log.info('Period "%s" (%s - %s)', |
| self.get_full_period_name(period_name, period_complete_day), | self.get_full_period_name(period_name, period_complete_day), |
| start_date.strftime('%Y-%m-%d'), | start_date.strftime('%Y-%m-%d'), |
| end_date.strftime('%Y-%m-%d')) | end_date.strftime('%Y-%m-%d')) |
| if self.delete_first: | if self.delete_first: |
| log.info('Deleting existing Analytics for this period "%s"', | log.info('Deleting existing Analytics for this period "%s"', |
| period_name) | period_name) |
| ga_model.delete(period_name) | ga_model.delete(period_name) |
| if not self.skip_url_stats: | if not self.skip_url_stats: |
| # Clean out old url data before storing the new | # Clean out old url data before storing the new |
| ga_model.pre_update_url_stats(period_name) | ga_model.pre_update_url_stats(period_name) |
| accountName = config.get('googleanalytics.account') | accountName = config.get('googleanalytics.account') |
| log.info('Downloading analytics for dataset views') | log.info('Downloading analytics for dataset views') |
| data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+') | data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+') |
| log.info('Storing dataset views (%i rows)', len(data.get('url'))) | log.info('Storing dataset views (%i rows)', len(data.get('url'))) |
| self.store(period_name, period_complete_day, data, ) | self.store(period_name, period_complete_day, data, ) |
| log.info('Downloading analytics for publisher views') | log.info('Downloading analytics for publisher views') |
| data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+') | data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+') |
| log.info('Storing publisher views (%i rows)', len(data.get('url'))) | log.info('Storing publisher views (%i rows)', len(data.get('url'))) |
| self.store(period_name, period_complete_day, data,) | self.store(period_name, period_complete_day, data,) |
| # Make sure the All records are correct. | # Make sure the All records are correct. |
| ga_model.post_update_url_stats() | ga_model.post_update_url_stats() |
| log.info('Associating datasets with their publisher') | log.info('Associating datasets with their publisher') |
| ga_model.update_publisher_stats(period_name) # about 30 seconds. | ga_model.update_publisher_stats(period_name) # about 30 seconds. |
| log.info('Downloading and storing analytics for site-wide stats') | log.info('Downloading and storing analytics for site-wide stats') |
| self.sitewide_stats( period_name, period_complete_day ) | self.sitewide_stats( period_name, period_complete_day ) |
| log.info('Downloading and storing analytics for social networks') | log.info('Downloading and storing analytics for social networks') |
| self.update_social_info(period_name, start_date, end_date) | self.update_social_info(period_name, start_date, end_date) |
| def update_social_info(self, period_name, start_date, end_date): | def update_social_info(self, period_name, start_date, end_date): |
| start_date = start_date.strftime('%Y-%m-%d') | start_date = start_date.strftime('%Y-%m-%d') |
| end_date = end_date.strftime('%Y-%m-%d') | end_date = end_date.strftime('%Y-%m-%d') |
| query = 'ga:hasSocialSourceReferral=~Yes$' | query = 'ga:hasSocialSourceReferral=~Yes$' |
| metrics = 'ga:entrances' | metrics = 'ga:entrances' |
| sort = '-ga:entrances' | sort = '-ga:entrances' |
| try: | try: |
| # Because of issues of invalid responses, we are going to make these requests | # Because of issues of invalid responses, we are going to make these requests |
| # ourselves. | # ourselves. |
| headers = {'authorization': 'Bearer ' + self.token} | headers = {'authorization': 'Bearer ' + self.token} |
| args = dict(ids='ga:' + self.profile_id, | args = dict(ids='ga:' + self.profile_id, |
| filters=query, | filters=query, |
| metrics=metrics, | metrics=metrics, |
| sort=sort, | sort=sort, |
| dimensions="ga:landingPagePath,ga:socialNetwork", | dimensions="ga:landingPagePath,ga:socialNetwork", |
| max_results=10000) | max_results=10000) |
| args['start-date'] = start_date | args['start-date'] = start_date |
| args['end-date'] = end_date | args['end-date'] = end_date |
| results = self._get_json(args) | results = self._get_json(args) |
| except Exception, e: | except Exception, e: |
| log.exception(e) | log.exception(e) |
| results = dict(url=[]) | results = dict(url=[]) |
| data = collections.defaultdict(list) | data = collections.defaultdict(list) |
| rows = results.get('rows',[]) | rows = results.get('rows',[]) |
| for row in rows: | for row in rows: |
| url = row[0] | url = row[0] |
| data[url].append( (row[1], int(row[2]),) ) | data[url].append( (row[1], int(row[2]),) ) |
| ga_model.update_social(period_name, data) | ga_model.update_social(period_name, data) |
| def download(self, start_date, end_date, path=None): | def download(self, start_date, end_date, path=None): |
| '''Get data from GA for a given time period''' | '''Get data from GA for a given time period''' |
| start_date = start_date.strftime('%Y-%m-%d') | start_date = start_date.strftime('%Y-%m-%d') |
| end_date = end_date.strftime('%Y-%m-%d') | end_date = end_date.strftime('%Y-%m-%d') |
| query = 'ga:pagePath=%s$' % path | query = 'ga:pagePath=%s$' % path |
| metrics = 'ga:pageviews, ga:visits' | metrics = 'ga:pageviews, ga:visits' |
| sort = '-ga:pageviews' | sort = '-ga:pageviews' |
| # Supported query params at | # Supported query params at |
| # https://developers.google.com/analytics/devguides/reporting/core/v3/reference | # https://developers.google.com/analytics/devguides/reporting/core/v3/reference |
| # https://ga-dev-tools.appspot.com/explorer/ | # https://ga-dev-tools.appspot.com/explorer/ |
| try: | try: |
| args = {} | args = {} |
| args["sort"] = "-ga:pageviews" | args["sort"] = "-ga:pageviews" |
| args["max-results"] = 100000 | args["max-results"] = 100000 |
| args["dimensions"] = "ga:pagePath" | args["dimensions"] = "ga:pagePath" |
| args["start-date"] = start_date | args["start-date"] = start_date |
| args["end-date"] = end_date | args["end-date"] = end_date |
| args["metrics"] = metrics | args["metrics"] = metrics |
| args["ids"] = "ga:" + self.profile_id | args["ids"] = "ga:" + self.profile_id |
| args["filters"] = query | args["filters"] = query |
| args["alt"] = "json" | args["alt"] = "json" |
| print args | print args |
| results = self._get_json(args) | results = self._get_json(args) |
| except Exception, e: | except Exception, e: |
| log.exception(e) | log.exception(e) |
| return dict(url=[]) | return dict(url=[]) |
| packages = [] | packages = [] |
| log.info("There are %d results" % results['totalResults']) | log.info("There are %d results" % results['totalResults']) |
| if results['totalResults'] > 0: | if results['totalResults'] > 0: |
| for entry in results.get('rows'): | for entry in results.get('rows'): |
| (loc,pageviews,visits) = entry | (loc,pageviews,visits) = entry |
| #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk | #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk |
| url = loc | url = loc |
| #print url | #print url |
| if not url.startswith('/dataset/') and not url.startswith('/organization/'): | if not url.startswith('/dataset/') and not url.startswith('/organization/'): |
| # filter out strays like: | # filter out strays like: |
| # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open | # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open |
| # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate | # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate |
| continue | continue |
| packages.append( (url, pageviews, visits,) ) # Temporary hack | packages.append( (url, pageviews, visits,) ) # Temporary hack |
| return dict(url=packages) | return dict(url=packages) |
| def store(self, period_name, period_complete_day, data): | def store(self, period_name, period_complete_day, data): |
| if 'url' in data: | if 'url' in data: |
| ga_model.update_url_stats(period_name, period_complete_day, data['url']) | ga_model.update_url_stats(period_name, period_complete_day, data['url']) |
| def sitewide_stats(self, period_name, period_complete_day): | def sitewide_stats(self, period_name, period_complete_day): |
| import calendar | import calendar |
| year, month = period_name.split('-') | year, month = period_name.split('-') |
| _, last_day_of_month = calendar.monthrange(int(year), int(month)) | _, last_day_of_month = calendar.monthrange(int(year), int(month)) |
| start_date = '%s-01' % period_name | start_date = '%s-01' % period_name |
| end_date = '%s-%s' % (period_name, last_day_of_month) | end_date = '%s-%s' % (period_name, last_day_of_month) |
| funcs = ['_totals_stats', '_social_stats', '_os_stats', | funcs = ['_totals_stats', '_social_stats', '_os_stats', |
| '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] | '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats'] |
| for f in funcs: | for f in funcs: |
| log.info('Downloading analytics for %s' % f.split('_')[1]) | log.info('Downloading analytics for %s' % f.split('_')[1]) |
| getattr(self, f)(start_date, end_date, period_name, period_complete_day) | getattr(self, f)(start_date, end_date, period_name, period_complete_day) |
| def _get_results(result_data, f): | def _get_results(result_data, f): |
| data = {} | data = {} |
| for result in result_data: | for result in result_data: |
| key = f(result) | key = f(result) |
| data[key] = data.get(key,0) + result[1] | data[key] = data.get(key,0) + result[1] |
| return data | return data |
    def _get_json(self, params, prev_fail=False):
        # Fetch one GA v3 Core Reporting API response as parsed JSON.
        # The OAuth token is refreshed before every request (workaround for
        # invalid-response issues noted by callers).
        # Returns: None when config/auth fails, dict(url=[]) when the HTTP
        # request fails, otherwise the decoded API response dict.
        # NOTE(review): `prev_fail` is accepted but never used in this body.
        ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
        if not ga_token_filepath:
            print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
                'Google Analytics token file under key: googleanalytics.token.filepath'
            return
        log.info("Trying to refresh our OAuth token")
        try:
            # Re-initialise the service purely to obtain a fresh token;
            # the returned service handle (`svc`) is discarded.
            from ga_auth import init_service
            self.token, svc = init_service(ga_token_filepath, None)
            log.info("OAuth token refreshed")
        except Exception, auth_exception:
            log.error("Oauth refresh failed")
            log.exception(auth_exception)
            return
        try:
            headers = {'authorization': 'Bearer ' + self.token}
            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers)
            if r.status_code != 200:
                # Log the failure detail, then raise so the handler below
                # converts it into the empty-result sentinel.
                log.info("STATUS: %s" % (r.status_code,))
                log.info("CONTENT: %s" % (r.content,))
                raise Exception("Request with params: %s failed" % params)
            return json.loads(r.content)
        except Exception, e:
            log.exception(e)
            return dict(url=[])
| def _totals_stats(self, start_date, end_date, period_name, period_complete_day): | def _totals_stats(self, start_date, end_date, period_name, period_complete_day): |
| """ Fetches distinct totals, total pageviews etc """ | """ Fetches distinct totals, total pageviews etc """ |
| try: | try: |
| args = {} | args = {} |
| args["max-results"] = 100000 | args["max-results"] = 100000 |
| args["start-date"] = start_date | args["start-date"] = start_date |
| args["end-date"] = end_date | args["end-date"] = end_date |
| args["ids"] = "ga:" + self.profile_id | args["ids"] = "ga:" + self.profile_id |
| args["metrics"] = "ga:pageviews" | args["metrics"] = "ga:pageviews" |
| args["sort"] = "-ga:pageviews" | args["sort"] = "-ga:pageviews" |
| args["alt"] = "json" | args["alt"] = "json" |
| results = self._get_json(args) | results = self._get_json(args) |
| except Exception, e: | except Exception, e: |
| log.exception(e) | log.exception(e) |
| results = dict(url=[]) | results = dict(url=[]) |
| result_data = results.get('rows') | result_data = results.get('rows') |
| ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, | ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]}, |
| period_complete_day) | period_complete_day) |
| try: | try: |
| # Because of issues of invalid responses, we are going to make these requests | # Because of issues of invalid responses, we are going to make these requests |
| # ourselves. | # ourselves. |
| headers = {'authorization': 'Bearer ' + self.token} | headers = {'authorization': 'Bearer ' + self.token} |
| args = {} | args = {} |
| args["max-results"] = 100000 | args["max-results"] = 100000 |
| args["start-date"] = start_date | args["start-date"] = start_date |
| args["end-date"] = end_date | args["end-date"] = end_date |
| args["ids"] = "ga:" + self.profile_id | args["ids"] = "ga:" + self.profile_id |
| args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits" | args["metrics"] = "ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits" |
| args["alt"] = "json" | args["alt"] = "json" |
| results = self._get_json(args) | results = self._get_json(args) |
| except Exception, e: | except Exception, e: |
| log.exception(e) | log.exception(e) |
| results = dict(url=[]) | results = dict(url=[]) |
| result_data = results.get('rows') | result_data = results.get('rows') |
| data = { | data = { |
| 'Pages per visit': result_data[0][0], | 'Pages per visit': result_data[0][0], |
| 'Average time on site': result_data[0][1], | 'Average time on site': result_data[0][1], |