--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -2,9 +2,11 @@ import logging import datetime import httplib +import urllib import collections import requests import json +import re from pylons import config from ga_model import _normalize_url import ga_model @@ -121,13 +123,13 @@ accountName = config.get('googleanalytics.account') log.info('Downloading analytics for dataset views') - data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+') log.info('Storing dataset views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data, ) log.info('Downloading analytics for publisher views') - data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+') log.info('Storing publisher views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data,) @@ -177,7 +179,7 @@ data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - url = _normalize_url('http:/' + row[0]) + url = row[0] data[url].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) @@ -192,11 +194,8 @@ # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference - try: - # Because of issues of invalid responses, we are going to make these requests - # ourselves. - headers = {'authorization': 'Bearer ' + self.token} - + # https://ga-dev-tools.appspot.com/explorer/ + try: args = {} args["sort"] = "-ga:pageviews" args["max-results"] = 100000 @@ -207,25 +206,22 @@ args["ids"] = "ga:" + self.profile_id args["filters"] = query args["alt"] = "json" - - r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=args, headers=headers) - if r.status_code != 200: - raise Exception("Request with params: %s failed" % args) - - results = json.loads(r.content) - print len(results.keys()) - except Exception, e: - log.exception(e) - #return dict(url=[]) - raise e + print args + results = self._get_json(args) + + except Exception, e: + log.exception(e) + return dict(url=[]) packages = [] log.info("There are %d results" % results['totalResults']) - for entry in results.get('rows'): + if results['totalResults'] > 0: + for entry in results.get('rows'): (loc,pageviews,visits) = entry - url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk - - if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + url = loc + #print url + if not url.startswith('/dataset/') and not url.startswith('/organization/'): # filter out strays like: # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate @@ -262,13 +258,14 @@ if not ga_token_filepath: print 'ERROR: In the CKAN config you need to specify the filepath of the ' \ 'Google Analytics token file under key: googleanalytics.token.filepath' - return - - try: - log.info("Trying to refresh our OAuth token") + return + + log.info("Trying to refresh our OAuth token") + try: + from ga_auth import init_service self.token, svc = init_service(ga_token_filepath, None) log.info("OAuth token refreshed") - except Exception auth_exception: + except Exception, auth_exception: log.error("Oauth refresh failed") log.exception(auth_exception) return @@ -277,9 +274,9 @@ headers = {'authorization': 'Bearer ' + self.token} r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers) if r.status_code != 200: - log.info("STATUS: %s" % (r.status_code,)) - log.info("CONTENT: %s" % (r.content,)) - raise Exception("Request with params: %s failed" % params) + log.info("STATUS: %s" % (r.status_code,)) + log.info("CONTENT: %s" % (r.content,)) + raise Exception("Request with params: %s failed" % params) return json.loads(r.content) except Exception, e: @@ -338,8 +335,7 @@ ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) # Bounces from / or another configurable page. - path = '/%s%s' % (config.get('googleanalytics.account'), - config.get('ga-report.bounce_url', '/')) + path = '/' #% (config.get('googleanalytics.account'), config.get('ga-report.bounce_url', '/')) try: # Because of issues of invalid responses, we are going to make these requests @@ -430,7 +426,7 @@ args["end-date"] = end_date args["ids"] = "ga:" + self.profile_id - args["filters"] = 'ga:eventAction==download' + args["filters"] = 'ga:eventAction==Download' args["dimensions"] = "ga:eventLabel" args["metrics"] = "ga:totalEvents" args["alt"] = "json" @@ -456,7 +452,7 @@ if progress_count % 100 == 0: log.debug('.. %d/%d done so far', progress_count, progress_total) - url = result[0].strip() + url = urllib.unquote(result[0].strip()) # Get package id associated with the resource that has this URL. q = model.Session.query(model.Resource) @@ -464,8 +460,15 @@ r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() else: r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + # new style internal download links + if re.search('(?:/resource/)(.*)(?:/download/)',url): + resource_id = re.search('(?:/resource/)(.*)(?:/download/)',url) + r = q.filter(model.Resource.id.like("%s%%" % resource_id.group(1))).first() package_name = r.resource_group.package.name if r else "" + + if package_name: data[package_name] = data.get(package_name, 0) + int(result[1]) else: @@ -478,7 +481,7 @@ log.info('Associating downloads of resource URLs with their respective datasets') process_result_data(results.get('rows')) - try: + '''try: # Because of issues of invalid responses, we are going to make these requests # ourselves. headers = {'authorization': 'Bearer ' + self.token} @@ -498,7 +501,7 @@ results = dict(url=[]) log.info('Associating downloads of cache resource URLs with their respective datasets') - process_result_data(results.get('rows'), cached=False) + process_result_data(results.get('rows'), cached=False)''' self._filter_out_long_tail(data, MIN_DOWNLOADS) ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)