--- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -2,9 +2,11 @@ import logging import datetime import httplib +import urllib import collections import requests import json +import re from pylons import config from ga_model import _normalize_url import ga_model @@ -121,13 +123,13 @@ accountName = config.get('googleanalytics.account') log.info('Downloading analytics for dataset views') - data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName) + data = self.download(start_date, end_date, '~^/dataset/[a-z0-9-_]+') log.info('Storing dataset views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data, ) log.info('Downloading analytics for publisher views') - data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName) + data = self.download(start_date, end_date, '~^/organization/[a-z0-9-_]+') log.info('Storing publisher views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data,) @@ -177,7 +179,7 @@ data = collections.defaultdict(list) rows = results.get('rows',[]) for row in rows: - url = _normalize_url('http:/' + row[0]) + url = row[0] data[url].append( (row[1], int(row[2]),) ) ga_model.update_social(period_name, data) @@ -192,6 +194,7 @@ # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference + # https://ga-dev-tools.appspot.com/explorer/ try: args = {} args["sort"] = "-ga:pageviews" @@ -203,7 +206,7 @@ args["ids"] = "ga:" + self.profile_id args["filters"] = query args["alt"] = "json" - + print args results = self._get_json(args) except Exception, e: @@ -212,11 +215,13 @@ packages = [] log.info("There are %d results" % results['totalResults']) - for entry in results.get('rows'): + if results['totalResults'] > 0: + for entry in results.get('rows'): (loc,pageviews,visits) = entry - url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk - - if not url.startswith('/dataset/') and not url.startswith('/publisher/'): + #url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk + url = loc + #print url + if not url.startswith('/dataset/') and not url.startswith('/organization/'): # filter out strays like: # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate @@ -330,8 +335,7 @@ ga_model.update_sitewide_stats(period_name, "Totals", data, period_complete_day) # Bounces from / or another configurable page. - path = '/%s%s' % (config.get('googleanalytics.account'), - config.get('ga-report.bounce_url', '/')) + path = '/' #% (config.get('googleanalytics.account'), config.get('ga-report.bounce_url', '/')) try: # Because of issues of invalid responses, we are going to make these requests @@ -422,7 +426,7 @@ args["end-date"] = end_date args["ids"] = "ga:" + self.profile_id - args["filters"] = 'ga:eventAction==download' + args["filters"] = 'ga:eventAction==Download' args["dimensions"] = "ga:eventLabel" args["metrics"] = "ga:totalEvents" args["alt"] = "json" @@ -448,7 +452,7 @@ if progress_count % 100 == 0: log.debug('.. %d/%d done so far', progress_count, progress_total) - url = result[0].strip() + url = urllib.unquote(result[0].strip()) # Get package id associated with the resource that has this URL. q = model.Session.query(model.Resource) @@ -456,8 +460,15 @@ r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() else: r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + # new style internal download links + if re.search('(?:/resource/)(.*)(?:/download/)',url): + resource_id = re.search('(?:/resource/)(.*)(?:/download/)',url) + r = q.filter(model.Resource.id.like("%s%%" % resource_id.group(1))).first() package_name = r.resource_group.package.name if r else "" + + if package_name: data[package_name] = data.get(package_name, 0) + int(result[1]) else: @@ -470,7 +481,7 @@ log.info('Associating downloads of resource URLs with their respective datasets') process_result_data(results.get('rows')) - try: + '''try: # Because of issues of invalid responses, we are going to make these requests # ourselves. headers = {'authorization': 'Bearer ' + self.token} @@ -490,7 +501,7 @@ results = dict(url=[]) log.info('Associating downloads of cache resource URLs with their respective datasets') - process_result_data(results.get('rows'), cached=False) + process_result_data(results.get('rows'), cached=False)''' self._filter_out_long_tail(data, MIN_DOWNLOADS) ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)