[854] Fixes unexplained failure with fetching publisher information.
[854] Fixes unexplained failure with fetching publisher information.

Whilst we are still using the google lib we only let it do the oauth dance
before we take over making the requests using the python-requests library.

In cases where it fails with a 401 we will retry the request ONCE after
trying to obtain a new token.

Re-enabled all of the relevant links and html blocks that were commented out.

file:a/.gitignore -> file:b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.py[co]
 *.py~
 .gitignore
+ckan.log
 
 # Packages
 *.egg

file:a/README.rst -> file:b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -32,6 +32,7 @@
 
       googleanalytics.id = UA-1010101-1
       googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics)
+      googleanalytics.token.filepath = ~/pyenv/token.dat
       ga-report.period = monthly
       ga-report.bounce_url = /
 
@@ -82,13 +83,17 @@
 
     $ paster getauthtoken --config=../ckan/development.ini
 
+Now ensure you reference the correct path to your token.dat in your CKAN config file (e.g. development.ini)::
+
+    googleanalytics.token.filepath = ~/pyenv/token.dat
+
 
 Tutorial
 --------
 
 Download some GA data and store it in CKAN's database. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file) and specifying the name of your auth file (token.dat by default) from the previous step::
 
-    $ paster loadanalytics token.dat latest --config=../ckan/development.ini
+    $ paster loadanalytics latest --config=../ckan/development.ini
 
 The value after the token file is how much data you want to retrieve, this can be
 

--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -1,5 +1,8 @@
 import logging
 import datetime
+import os
+
+from pylons import config
 
 from ckan.lib.cli import CkanCommand
 # No other CKAN imports allowed until _load_config is run,
@@ -20,7 +23,7 @@
         import ckan.model as model
         model.Session.remove()
         model.Session.configure(bind=model.meta.engine)
-        log = logging.getLogger('ckanext.ga-report')
+        log = logging.getLogger('ckanext.ga_report')
 
         import ga_model
         ga_model.init_tables()
@@ -53,25 +56,52 @@
                       self.args[0] if self.args
                                    else 'credentials.json')
 
+class FixTimePeriods(CkanCommand):
+    """
+    Fixes the 'All' records for GA_Urls
+
+    It is possible that older urls that haven't recently been visited
+    do not have All records.  This command will traverse through those
+    records and generate valid All records for them.
+    """
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    max_args = 0
+    min_args = 0
+
+    def __init__(self, name):
+        super(FixTimePeriods, self).__init__(name)
+
+    def command(self):
+        import ckan.model as model
+        from ga_model import post_update_url_stats
+        self._load_config()
+        model.Session.remove()
+        model.Session.configure(bind=model.meta.engine)
+
+        log = logging.getLogger('ckanext.ga_report')
+
+        log.info("Updating 'All' records for old URLs")
+        post_update_url_stats()
+        log.info("Processing complete")
+
+
 
 class LoadAnalytics(CkanCommand):
     """Get data from Google Analytics API and save it
     in the ga_model
 
-    Usage: paster loadanalytics <tokenfile> <time-period>
+    Usage: paster loadanalytics <time-period>
 
-    Where <tokenfile> is the name of the auth token file from
-    the getauthtoken step.
-
-    And where <time-period> is:
+    Where <time-period> is:
         all         - data for all time
         latest      - (default) just the 'latest' data
         YYYY-MM     - just data for the specific month
     """
     summary = __doc__.split('\n')[0]
     usage = __doc__
-    max_args = 2
-    min_args = 1
+    max_args = 1
+    min_args = 0
 
     def __init__(self, name):
         super(LoadAnalytics, self).__init__(name)
@@ -85,6 +115,7 @@
                                default=False,
                                dest='skip_url_stats',
                                help='Skip the download of URL data - just do site-wide stats')
+        self.token = ""
 
     def command(self):
         self._load_config()
@@ -92,19 +123,25 @@
         from download_analytics import DownloadAnalytics
         from ga_auth import (init_service, get_profile_id)
 
+        ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
+        if not ga_token_filepath:
+            print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
+                  'Google Analytics token file under key: googleanalytics.token.filepath'
+            return
+
         try:
-            svc = init_service(self.args[0], None)
+            self.token, svc = init_service(ga_token_filepath, None)
         except TypeError:
             print ('Have you correctly run the getauthtoken task and '
-                   'specified the correct token file?')
+                   'specified the correct token file in the CKAN config under '
+                   '"googleanalytics.token.filepath"?')
             return
 
-        downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc),
+        downloader = DownloadAnalytics(svc, self.token, profile_id=get_profile_id(svc),
                                        delete_first=self.options.delete_first,
                                        skip_url_stats=self.options.skip_url_stats)
 
-        time_period = self.args[1] if self.args and len(self.args) > 1 \
-            else 'latest'
+        time_period = self.args[0] if self.args else 'latest'
         if time_period == 'all':
             downloader.all_()
         elif time_period == 'latest':

--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -1,6 +1,7 @@
 import re
 import csv
 import sys
+import json
 import logging
 import operator
 import collections
@@ -13,6 +14,7 @@
 
 log = logging.getLogger('ckanext.ga-report')
 
+DOWNLOADS_AVAILABLE_FROM = '2012-12'
 
 def _get_month_name(strdate):
     import calendar
@@ -20,14 +22,39 @@
     d = strptime(strdate, '%Y-%m')
     return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year)
 
-
-def _month_details(cls):
-    '''Returns a list of all the month names'''
+def _get_unix_epoch(strdate):
+    from time import strptime,mktime
+    d = strptime(strdate, '%Y-%m')
+    return int(mktime(d))
+
+def _month_details(cls, stat_key=None):
+    '''
+    Returns a list of all the periods for which we have data, unfortunately
+    knows too much about the type of the cls being passed as GA_Url has a
+    more complex query
+
+    This may need extending if we add a period_name to the stats
+    '''
     months = []
-    vals = model.Session.query(cls.period_name).filter(cls.period_name!='All').distinct().all()
+    day = None
+
+    q = model.Session.query(cls.period_name,cls.period_complete_day)\
+        .filter(cls.period_name!='All').distinct(cls.period_name)
+    if stat_key:
+        q=  q.filter(cls.stat_name==stat_key)
+
+    vals = q.order_by("period_name desc").all()
+
+    if vals and vals[0][1]:
+        day = int(vals[0][1])
+        ordinal = 'th' if 11 <= day <= 13 \
+            else {1:'st',2:'nd',3:'rd'}.get(day % 10, 'th')
+        day = "{day}{ordinal}".format(day=day, ordinal=ordinal)
+
     for m in vals:
         months.append( (m[0], _get_month_name(m[0])))
-    return sorted(months, key=operator.itemgetter(0), reverse=True)
+
+    return months, day
 
 
 class GaReport(BaseController):
@@ -35,7 +62,7 @@
     def csv(self, month):
         import csv
 
-        q = model.Session.query(GA_Stat)
+        q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name!='Downloads')
         if month != 'all':
             q = q.filter(GA_Stat.period_name==month)
         entries = q.order_by('GA_Stat.period_name, GA_Stat.stat_name, GA_Stat.key').all()
@@ -52,11 +79,12 @@
                              entry.key.encode('utf-8'),
                              entry.value.encode('utf-8')])
 
+
     def index(self):
 
         # Get the month details by fetching distinct values and determining the
         # month names from the values.
-        c.months = _month_details(GA_Stat)
+        c.months, c.day = _month_details(GA_Stat)
 
         # Work out which month to show, based on query params of the first item
         c.month_desc = 'all months'
@@ -84,11 +112,26 @@
 
             return key, val
 
+        # Query historic values for sparkline rendering
+        sparkline_query = model.Session.query(GA_Stat)\
+                .filter(GA_Stat.stat_name=='Totals')\
+                .order_by(GA_Stat.period_name)
+        sparkline_data = {}
+        for x in sparkline_query:
+            sparkline_data[x.key] = sparkline_data.get(x.key,[])
+            key, val = clean_key(x.key,float(x.value))
+            tooltip = '%s: %s' % (_get_month_name(x.period_name), val)
+            sparkline_data[x.key].append( (tooltip,x.value) )
+        # Trim the latest month, as it looks like a huge dropoff
+        for key in sparkline_data:
+            sparkline_data[key] = sparkline_data[key][:-1]
+
         c.global_totals = []
         if c.month:
             for e in entries:
                 key, val = clean_key(e.key, e.value)
-                c.global_totals.append((key, val))
+                sparkline = sparkline_data[e.key]
+                c.global_totals.append((key, val, sparkline))
         else:
             d = collections.defaultdict(list)
             for e in entries:
@@ -97,11 +140,19 @@
                 if k in ['Total page views', 'Total visits']:
                     v = sum(v)
                 else:
-                    v = float(sum(v))/len(v)
+                    v = float(sum(v))/float(len(v))
+                sparkline = sparkline_data[k]
                 key, val = clean_key(k,v)
 
-                c.global_totals.append((key, val))
-                c.global_totals = sorted(c.global_totals, key=operator.itemgetter(0))
+                c.global_totals.append((key, val, sparkline))
+        # Sort the global totals into a more pleasant order
+        def sort_func(x):
+            key = x[0]
+            total_order = ['Total page views','Total visits','Pages per visit']
+            if key in total_order:
+                return total_order.index(key)
+            return 999
+        c.global_totals = sorted(c.global_totals, key=sort_func)
 
         keys = {
             'Browser versions': 'browser_versions',
@@ -138,12 +189,13 @@
 
         for k, v in keys.iteritems():
             q = model.Session.query(GA_Stat).\
-                filter(GA_Stat.stat_name==k)
+                filter(GA_Stat.stat_name==k).\
+                order_by(GA_Stat.period_name)
+            # Buffer the tabular data
             if c.month:
                 entries = []
                 q = q.filter(GA_Stat.period_name==c.month).\
                           order_by('ga_stat.value::int desc')
-
             d = collections.defaultdict(int)
             for e in q.all():
                 d[e.key] += int(e.value)
@@ -152,10 +204,27 @@
                 entries.append((key,val,))
             entries = sorted(entries, key=operator.itemgetter(1), reverse=True)
 
+            # Run a query on all months to gather graph data
+            graph_query = model.Session.query(GA_Stat).\
+                filter(GA_Stat.stat_name==k).\
+                order_by(GA_Stat.period_name)
+            graph_dict = {}
+            for stat in graph_query:
+                graph_dict[ stat.key ] = graph_dict.get(stat.key,{
+                    'name':stat.key,
+                    'raw': {}
+                    })
+                graph_dict[ stat.key ]['raw'][stat.period_name] = float(stat.value)
+            stats_in_table = [x[0] for x in entries]
+            stats_not_in_table = set(graph_dict.keys()) - set(stats_in_table)
+            stats = stats_in_table + sorted(list(stats_not_in_table))
+            graph = [graph_dict[x] for x in stats]
+            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph,percentageMode=True) ))
+
             # Get the total for each set of values and then set the value as
             # a percentage of the total
             if k == 'Social sources':
-                total = sum([x for n,x in c.global_totals if n == 'Total visits'])
+                total = sum([x for n,x,graph in c.global_totals if n == 'Total visits'])
             else:
                 total = sum([num for _,num in entries])
             setattr(c, v, [(k,_percent(v,total)) for k,v in entries ])
@@ -180,7 +249,9 @@
         writer = csv.writer(response)
         writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"])
 
-        for publisher,view,visit in _get_top_publishers(None):
+        top_publishers = _get_top_publishers(limit=None)
+
+        for publisher,view,visit in top_publishers:
             writer.writerow([publisher.title.encode('utf-8'),
                              publisher.name.encode('utf-8'),
                              view,
@@ -200,19 +271,20 @@
             if not c.publisher:
                 abort(404, 'A publisher with that name could not be found')
 
-        packages = self._get_packages(c.publisher)
+        packages = self._get_packages(publisher=c.publisher, month=c.month)
         response.headers['Content-Type'] = "text/csv; charset=utf-8"
         response.headers['Content-Disposition'] = \
             str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,))
 
         writer = csv.writer(response)
-        writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"])
-
-        for package,view,visit in packages:
+        writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"])
+
+        for package,view,visit,downloads in packages:
             writer.writerow([package.title.encode('utf-8'),
                              package.name.encode('utf-8'),
                              view,
                              visit,
+                             downloads,
                              month])
 
     def publishers(self):
@@ -220,7 +292,7 @@
 
         # Get the month details by fetching distinct values and determining the
         # month names from the values.
-        c.months = _month_details(GA_Url)
+        c.months, c.day = _month_details(GA_Url)
 
         # Work out which month to show, based on query params of the first item
         c.month = request.params.get('month', '')
@@ -229,14 +301,19 @@
             c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
 
         c.top_publishers = _get_top_publishers()
-        return render('ga_report/publisher/index.html')
-
-    def _get_packages(self, publisher=None, count=-1):
-        '''Returns the datasets in order of visits'''
-        if count == -1:
-            count = sys.maxint
-
-        month = c.month or 'All'
+        graph_data = _get_top_publishers_graph()
+        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data) )
+
+        x =  render('ga_report/publisher/index.html')
+
+        return x
+
+    def _get_packages(self, publisher=None, month='', count=-1):
+        '''Returns the datasets in order of views'''
+        have_download_data = True
+        month = month or 'All'
+        if month != 'All':
+            have_download_data = month >= DOWNLOADS_AVAILABLE_FROM
 
         q = model.Session.query(GA_Url,model.Package)\
             .filter(model.Package.name==GA_Url.package_id)\
@@ -244,11 +321,28 @@
         if publisher:
             q = q.filter(GA_Url.department_id==publisher.name)
         q = q.filter(GA_Url.period_name==month)
-        q = q.order_by('ga_url.visits::int desc')
+        q = q.order_by('ga_url.pageviews::int desc')
         top_packages = []
-        for entry,package in q.limit(count):
+        if count == -1:
+            entries = q.all()
+        else:
+            entries = q.limit(count)
+
+        for entry,package in entries:
             if package:
-                top_packages.append((package, entry.pageviews, entry.visits))
+                # Downloads ....
+                if have_download_data:
+                    dls = model.Session.query(GA_Stat).\
+                        filter(GA_Stat.stat_name=='Downloads').\
+                        filter(GA_Stat.key==package.name)
+                    if month != 'All':  # Fetch everything unless the month is specific
+                        dls = dls.filter(GA_Stat.period_name==month)
+                    downloads = 0
+                    for x in dls:
+                        downloads += int(x.value)
+                else:
+                    downloads = 'No data'
+                top_packages.append((package, entry.pageviews, entry.visits, downloads))
             else:
                 log.warning('Could not find package associated package')
 
@@ -278,7 +372,7 @@
 
         # Get the month details by fetching distinct values and determining the
         # month names from the values.
-        c.months = _month_details(GA_Url)
+        c.months, c.day = _month_details(GA_Url)
 
         # Work out which month to show, based on query params of the first item
         c.month = request.params.get('month', '')
@@ -294,9 +388,73 @@
         entry = q.filter(GA_Url.period_name==c.month).first()
         c.publisher_page_views = entry.pageviews if entry else 0
 
-        c.top_packages = self._get_packages(c.publisher, 20)
+        c.top_packages = self._get_packages(publisher=c.publisher, count=20, month=c.month)
+
+        # Graph query
+        top_packages_all_time = self._get_packages(publisher=c.publisher, count=20, month='All')
+        top_package_names = [ x[0].name for x in top_packages_all_time ]
+        graph_query = model.Session.query(GA_Url,model.Package)\
+            .filter(model.Package.name==GA_Url.package_id)\
+            .filter(GA_Url.url.like('/dataset/%'))\
+            .filter(GA_Url.package_id.in_(top_package_names))
+        all_series = {}
+        for entry,package in graph_query:
+            if not package: continue
+            if entry.period_name=='All': continue
+            all_series[package.name] = all_series.get(package.name,{
+                'name':package.title,
+                'raw': {}
+                })
+            all_series[package.name]['raw'][entry.period_name] = int(entry.pageviews)
+        graph = [ all_series[series_name] for series_name in top_package_names ]
+        c.graph_data = json.dumps( _to_rickshaw(graph) )
 
         return render('ga_report/publisher/read.html')
+
+def _to_rickshaw(data, percentageMode=False):
+    if data==[]:
+        return data
+    # x-axis is every month in c.months. Note that data might not exist
+    # for entire history, eg. for recently-added datasets
+    x_axis = [x[0] for x in c.months]
+    x_axis.reverse() # Ascending order
+    x_axis = x_axis[:-1] # Remove latest month
+    totals = {}
+    for series in data:
+        series['data'] = []
+        for x_string in x_axis:
+            x = _get_unix_epoch( x_string )
+            y = series['raw'].get(x_string,0)
+            series['data'].append({'x':x,'y':y})
+            totals[x] = totals.get(x,0)+y
+    if not percentageMode:
+        return data
+    # Turn all data into percentages
+    # Roll insignificant series into a catch-all
+    THRESHOLD = 1
+    raw_data = data
+    data = []
+    for series in raw_data:
+        for point in series['data']:
+            percentage = (100*float(point['y'])) / totals[point['x']]
+            if not (series in data) and percentage>THRESHOLD:
+                data.append(series)
+            point['y'] = percentage
+    others = [ x for x in raw_data if not (x in data) ]
+    if len(others):
+        data_other = []
+        for i in range(len(x_axis)):
+            x = _get_unix_epoch(x_axis[i])
+            y = 0
+            for series in others:
+                y += series['data'][i]['y']
+            data_other.append({'x':x,'y':y})
+        data.append({
+            'name':'Other',
+            'data': data_other
+            })
+    return data
+
 
 def _get_top_publishers(limit=20):
     '''
@@ -326,10 +484,50 @@
     return top_publishers
 
 
+def _get_top_publishers_graph(limit=20):
+    '''
+    Returns a list of the top 20 publishers by dataset visits.
+    (The number to show can be varied with 'limit')
+    '''
+    connection = model.Session.connection()
+    q = """
+        select department_id, sum(pageviews::int) views
+        from ga_url
+        where department_id <> ''
+          and package_id <> ''
+          and url like '/dataset/%%'
+          and period_name='All'
+        group by department_id order by views desc
+        """
+    if limit:
+        q = q + " limit %s;" % (limit)
+
+    res = connection.execute(q)
+    department_ids = [ row[0] for row in res ]
+
+    # Query for a history graph of these department ids
+    q = model.Session.query(
+            GA_Url.department_id,
+            GA_Url.period_name,
+            func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\
+        .filter( GA_Url.department_id.in_(department_ids) )\
+        .filter( GA_Url.url.like('/dataset/%') )\
+        .filter( GA_Url.package_id!='' )\
+        .group_by( GA_Url.department_id, GA_Url.period_name )
+    graph_dict = {}
+    for dept_id,period_name,views in q:
+        graph_dict[dept_id] = graph_dict.get( dept_id, {
+            'name' : model.Group.get(dept_id).title,
+            'raw' : {}
+            })
+        graph_dict[dept_id]['raw'][period_name] = views
+    return [ graph_dict[id] for id in department_ids ]
+
+
 def _get_publishers():
     '''
     Returns a list of all publishers. Each item is a tuple:
-      (names, title)
+      (name, title)
     '''
     publishers = []
     for pub in model.Session.query(model.Group).\

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -1,7 +1,10 @@
 import os
 import logging
 import datetime
+import httplib
 import collections
+import requests
+import json
 from pylons import config
 from ga_model import _normalize_url
 import ga_model
@@ -13,17 +16,19 @@
 FORMAT_MONTH = '%Y-%m'
 MIN_VIEWS = 50
 MIN_VISITS = 20
+MIN_DOWNLOADS = 10
 
 class DownloadAnalytics(object):
     '''Downloads and stores analytics info'''
 
-    def __init__(self, service=None, profile_id=None, delete_first=False,
+    def __init__(self, service=None, token=None, profile_id=None, delete_first=False,
                  skip_url_stats=False):
         self.period = config['ga-report.period']
         self.service = service
         self.profile_id = profile_id
         self.delete_first = delete_first
         self.skip_url_stats = skip_url_stats
+        self.token = token
 
     def specific_month(self, date):
         import calendar
@@ -31,6 +36,11 @@
         first_of_this_month = datetime.datetime(date.year, date.month, 1)
         _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month))
         last_of_this_month =  datetime.datetime(date.year, date.month, last_day_of_month)
+        # if this is the latest month, note that it is only up until today
+        now = datetime.datetime.now()
+        if now.year == date.year and now.month == date.month:
+            last_day_of_month = now.day
+            last_of_this_month = now
         periods = ((date.strftime(FORMAT_MONTH),
                     last_day_of_month,
                     first_of_this_month, last_of_this_month),)
@@ -122,11 +132,15 @@
                 log.info('Storing publisher views (%i rows)', len(data.get('url')))
                 self.store(period_name, period_complete_day, data,)
 
-                log.info('Aggregating datasets by publisher')
+                # Make sure the All records are correct.
+                ga_model.post_update_url_stats()
+
+                log.info('Associating datasets with their publisher')
                 ga_model.update_publisher_stats(period_name) # about 30 seconds.
 
+
             log.info('Downloading and storing analytics for site-wide stats')
-            self.sitewide_stats( period_name )
+            self.sitewide_stats( period_name, period_complete_day )
 
             log.info('Downloading and storing analytics for social networks')
             self.update_social_info(period_name, start_date, end_date)
@@ -139,21 +153,32 @@
         metrics = 'ga:entrances'
         sort = '-ga:entrances'
 
-        # Supported query params at
-        # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 filters=query,
-                                 start_date=start_date,
-                                 metrics=metrics,
-                                 sort=sort,
-                                 dimensions="ga:landingPagePath,ga:socialNetwork",
-                                 max_results=10000,
-                                 end_date=end_date).execute()
+        try:
+            # Because of issues of invalid responses, we are going to make these requests
+            # ourselves.
+            headers = {'authorization': 'Bearer ' + self.token}
+
+            args = dict(ids='ga:' + self.profile_id,
+                       filters=query,
+                       metrics=metrics,
+                       sort=sort,
+                       dimensions="ga:landingPagePath,ga:socialNetwork",
+                       max_results=10000)
+
+            args['start-date'] = start_date
+            args['end-date'] = end_date
+
+            results = self._get_json(args)
+        except Exception, e:
+            log.exception(e)
+            results = dict(url=[])
+
+
         data = collections.defaultdict(list)
         rows = results.get('rows',[])
         for row in rows:
-            data[_normalize_url(row[0])].append( (row[1], int(row[2]),) )
+            url = _normalize_url('http:/' + row[0])
+            data[url].append( (row[1], int(row[2]),) )
         ga_model.update_social(period_name, data)
 
 
@@ -167,17 +192,35 @@
 
         # Supported query params at
         # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 filters=query,
-                                 start_date=start_date,
-                                 metrics=metrics,
-                                 sort=sort,
-                                 dimensions="ga:pagePath",
-                                 max_results=10000,
-                                 end_date=end_date).execute()
+        try:
+            # Because of issues of invalid responses, we are going to make these requests
+            # ourselves.
+            headers = {'authorization': 'Bearer ' + self.token}
+
+            args = {}
+            args["sort"] = "-ga:pageviews"
+            args["max-results"] = 100000
+            args["dimensions"] = "ga:pagePath"
+            args["start-date"] = start_date
+            args["end-date"] = end_date
+            args["metrics"] = metrics
+            args["ids"] = "ga:" + self.profile_id
+            args["filters"] = query
+            args["alt"] = "json"
+
+            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=args, headers=headers)
+            if r.status_code != 200:
+              raise Exception("Request with params: %s failed" % args)
+
+            results = json.loads(r.content)
+            print len(results.keys())
+        except Exception, e:
+            log.exception(e)
+            #return dict(url=[])
+            raise e
 
         packages = []
+        log.info("There are %d results" % results['totalResults'])
         for entry in results.get('rows'):
             (loc,pageviews,visits) = entry
             url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
@@ -194,7 +237,7 @@
         if 'url' in data:
             ga_model.update_url_stats(period_name, period_complete_day, data['url'])
 
-    def sitewide_stats(self, period_name):
+    def sitewide_stats(self, period_name, period_complete_day):
         import calendar
         year, month = period_name.split('-')
         _, last_day_of_month = calendar.monthrange(int(year), int(month))
@@ -202,10 +245,10 @@
         start_date = '%s-01' % period_name
         end_date = '%s-%s' % (period_name, last_day_of_month)
         funcs = ['_totals_stats', '_social_stats', '_os_stats',
-                 '_locale_stats', '_browser_stats', '_mobile_stats']
+                 '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats']
         for f in funcs:
             log.info('Downloading analytics for %s' % f.split('_')[1])
-            getattr(self, f)(start_date, end_date, period_name)
+            getattr(self, f)(start_date, end_date, period_name, period_complete_day)
 
     def _get_results(result_data, f):
         data = {}
@@ -214,24 +257,83 @@
             data[key] = data.get(key,0) + result[1]
         return data
 
-    def _totals_stats(self, start_date, end_date, period_name):
+    def _get_json(self, params, prev_fail=False):
+        if prev_fail:
+          import os
+          ga_token_filepath = os.path.expanduser(config.get('googleanalytics.token.filepath', ''))
+          if not ga_token_filepath:
+              print 'ERROR: In the CKAN config you need to specify the filepath of the ' \
+                    'Google Analytics token file under key: googleanalytics.token.filepath'
+              return
+
+          try:
+              self.token, svc = init_service(ga_token_filepath, None)
+          except TypeError:
+              print ('Have you correctly run the getauthtoken task and '
+                     'specified the correct token file in the CKAN config under '
+                     '"googleanalytics.token.filepath"?')
+
+        try:
+            # Because of issues of invalid responses, we are going to make these requests
+            # ourselves.
+            headers = {'authorization': 'Bearer ' + self.token}
+            r = requests.get("https://www.googleapis.com/analytics/v3/data/ga", params=params, headers=headers)
+            if r.status_code != 200:
+              log.info("STATUS: %s" % (r.status_code,))
+              log.info("CONTENT: %s" % (r.content,))
+              raise Exception("Request with params: %s failed" % params)
+
+            return json.loads(r.content)
+        except Exception, e:
+            if not prev_fail:
+              print e
+              results = self._get_json(self, params, prev_fail=True)
+            else:
+              log.exception(e)
+
+        return dict(url=[])
+
+    def _totals_stats(self, start_date, end_date, period_name, period_complete_day):
         """ Fetches distinct totals, total pageviews etc """
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 start_date=start_date,
-                                 metrics='ga:pageviews',
-                                 sort='-ga:pageviews',
-                                 max_results=10000,
-                                 end_date=end_date).execute()
-        result_data = results.get('rows')
-        ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]})
-
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 start_date=start_date,
-                                 metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits',
-                                 max_results=10000,
-                                 end_date=end_date).execute()
+        try:
+            args = {}
+            args["max-results"] = 100000
+            args["start-date"] = start_date
+            args["end-date"] = end_date
+            args["ids"] = "ga:" + self.profile_id
+
+            args["metrics"] = "ga:pageviews"
+            args["sort"] = "-ga:pageviews"
+            args["alt"] = "json"
+
+            results = self._get_json(args)
+        except Exception, e:
+            log.exception(e)
+            results = dict(url=[])
+
+        result_data = results.get('rows')
+        ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]},
+            period_complete_day)
+
+        try:
+