Implemented further analytics by fetching more specific items.
Extended the Google Analytics download to collect per-publisher page statistics and sitewide statistics in addition to the existing per-dataset figures.

Now fetches:
[x] Pages per visit
[x] Average visit duration
[x] % new visits
[x] Bounce rate (currently stored as the raw `ga:bounces` count)
[x] Browser distribution
[x] IE version distribution (can be worked out from the browser version stats)
[x] OS distribution
[x] Language distribution
[x] Location distribution
[x] Mobile device distribution
[x] Top Twitter referrals (referral paths are collected but not yet stored)

--- a/ckanext/ga_report/
+++ b/ckanext/ga_report/
@@ -1,3 +1,4 @@
+import os
 import logging
 import datetime
@@ -84,18 +85,27 @@
                      self.get_full_period_name(period_name, period_complete_day),
                      start_date.strftime('%Y %m %d'),
                      end_date.strftime('%Y %m %d'))
-            data =, end_date)
-  'Storing Analytics for period "%s"',
+            data =, end_date, '~/dataset/[a-z0-9-_]+')
+  'Storing Dataset Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
-  , period_complete_day, data)
-    def download(self, start_date, end_date):
+  , period_complete_day, data, )
+            data =, end_date, '~/publisher/[a-z0-9-_]+')
+  'Storing Publisher Analytics for period "%s"',
+                     self.get_full_period_name(period_name, period_complete_day))
+  , period_complete_day, data,)
+            ga_model.update_publisher_stats(period_name)
+            self.sitewide_stats( period_name )
+    def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'):
         '''Get data from GA for a given time period'''
         start_date = start_date.strftime('%Y-%m-%d')
         end_date = end_date.strftime('%Y-%m-%d')
-        query = 'ga:pagePath=~/dataset/[a-z0-9-]+$'
-        metrics = 'ga:uniquePageviews'
+        query = 'ga:pagePath=%s$' % path
+        metrics = 'ga:uniquePageviews, ga:visits'
         sort = '-ga:uniquePageviews'
         # Supported query params at
@@ -110,18 +120,175 @@
-        import pprint
-        pprint.pprint(results)
-        print 'Total results: %s' % results.get('totalResults')
+        if os.getenv('DEBUG'):
+            import pprint
+            pprint.pprint(results)
+            print 'Total results: %s' % results.get('totalResults')
         packages = []
         for entry in results.get('rows'):
-            (loc,size,) = entry
-            packages.append( ('http:/' + loc,size, '',) ) # Temporary hack
+            (loc,pageviews,visits) = entry
+            packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack
         return dict(url=packages)
     def store(self, period_name, period_complete_day, data):
         if 'url' in data:
             ga_model.update_url_stats(period_name, period_complete_day, data['url'])
+    def sitewide_stats(self, period_name):
+        import calendar
+        year, month = period_name.split('-')
+        _, last_day_of_month = calendar.monthrange(int(year), int(month))
+        start_date = '%s-01' % period_name
+        end_date = '%s-%s' % (period_name, last_day_of_month)
+        print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date)
+        funcs = ['_totals_stats', '_social_stats', '_os_stats',
+                 '_locale_stats', '_browser_stats', '_mobile_stats']
+        for f in funcs:
+            print ' + Fetching %s stats' % f.split('_')[1]
+            getattr(self, f)(start_date, end_date, period_name)
+    def _totals_stats(self, start_date, end_date, period_name):
+        """ Fetches distinct totals, total pageviews etc """
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        ga_model.update_sitewide_stats(period_name, "Totals", {'Total pageviews': result_data[0][0]})
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {
+            'Pages per visit': result_data[0][0],
+            'Bounces': result_data[0][1],
+            'Average time on site': result_data[0][2],
+            'Percent new visits': result_data[0][3],
+        }
+        ga_model.update_sitewide_stats(period_name, "Totals", data)
+    def _locale_stats(self, start_date, end_date, period_name):
+        """ Fetches stats about language and country """
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:language,ga:country",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Languages", data)
+        data = {}
+        for result in result_data:
+            data[result[1]] = data.get(result[1], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Country", data)
+    def _social_stats(self, start_date, end_date, period_name):
+        """ Finds out which social sites people are referred from """
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:socialNetwork,ga:referralPath",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        twitter_links = []
+        data = {}
+        for result in result_data:
+            if not result[0] == '(not set)':
+                data[result[0]] = data.get(result[0], 0) + int(result[2])
+                if result[0] == 'Twitter':
+                    twitter_links.append(result[1])
+        ga_model.update_sitewide_stats(period_name, "Social sources", data)
+    def _os_stats(self, start_date, end_date, period_name):
+        """ Operating system stats """
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:operatingSystem,ga:operatingSystemVersion",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Operating Systems", data)
+        data = {}
+        for result in result_data:
+            key = "%s (%s)" % (result[0],result[1])
+            data[key] = result[2]
+        ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data)
+    def _browser_stats(self, start_date, end_date, period_name):
+        """ Information about browsers and browser versions """
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:browser,ga:browserVersion",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Browsers", data)
+        data = {}
+        for result in result_data:
+            key = "%s (%s)" % (result[0],result[1])
+            data[key] = result[2]
+        ga_model.update_sitewide_stats(period_name, "Browser versions", data)
+    def _mobile_stats(self, start_date, end_date, period_name):
+        """ Info about mobile devices """
+        results =
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Mobile brands", data)
+        data = {}
+        for result in result_data:
+            data[result[1]] = data.get(result[1], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Mobile devices", data)

--- a/ckanext/ga_report/
+++ b/ckanext/ga_report/
@@ -16,6 +16,18 @@
 class GA_Url(object):
+    def __init__(self, **kwargs):
+        for k,v in kwargs.items():
+            setattr(self, k, v)
+class GA_Stat(object):
+    def __init__(self, **kwargs):
+        for k,v in kwargs.items():
+            setattr(self, k, v)
+class GA_Publisher(object):
     def __init__(self, **kwargs):
         for k,v in kwargs.items():
@@ -28,12 +40,32 @@
                       Column('period_name', types.UnicodeText),
                       Column('period_complete_day', types.Integer),
-                      Column('metric', types.UnicodeText),
-                      Column('value', types.UnicodeText),
+                      Column('pageviews', types.UnicodeText),
+                      Column('visits', types.UnicodeText),
                       Column('url', types.UnicodeText),
                       Column('department_id', types.UnicodeText),
 mapper(GA_Url, url_table)
+stat_table = Table('ga_stat', metadata,
+                  Column('id', types.UnicodeText, primary_key=True,
+                         default=make_uuid),
+                  Column('period_name', types.UnicodeText),
+                  Column('stat_name', types.UnicodeText),
+                  Column('key', types.UnicodeText),
+                  Column('value', types.UnicodeText), )
+mapper(GA_Stat, stat_table)
+pub_table = Table('ga_publisher', metadata,
+                  Column('id', types.UnicodeText, primary_key=True,
+                         default=make_uuid),
+                  Column('period_name', types.UnicodeText),
+                  Column('publisher_name', types.UnicodeText),
+                  Column('views', types.UnicodeText),
+                  Column('visits', types.UnicodeText),
+mapper(GA_Publisher, pub_table)
 def init_tables():
@@ -73,22 +105,49 @@
             publisher_groups = dataset.get_groups('publisher')
             if publisher_groups:
                 return publisher_groups[0].name
+    else:
+        publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
+        if publisher_match:
+            return publisher_match.groups()[0]
+def update_sitewide_stats(period_name, stat_name, data):
+    for k,v in data.iteritems():
+        item = model.Session.query(GA_Stat).\
+            filter(GA_Stat.period_name==period_name).\
+            filter(GA_Stat.key==k).\
+            filter(GA_Stat.stat_name==stat_name).first()
+        if item:
+            item.period_name = period_name
+            item.key = k
+            item.value = v
+            model.Session.add(item)
+        else:
+            # create the row
+            values = {'id': make_uuid(),
+                     'period_name': period_name,
+                     'key': k,
+                     'value': v,
+                     'stat_name': stat_name
+                     }
+            model.Session.add(GA_Stat(**values))
+        model.Session.commit()
 def update_url_stats(period_name, period_complete_day, url_data):
-    table = get_table('ga_url')
-    for url, views, next_page in url_data:
+    for url, views, visits in url_data:
         url = _normalize_url(url)
         department_id = _get_department_id_of_url(url)
         # see if the row for this url & month is in the table already
         item = model.Session.query(GA_Url).\
-            filter(GA_Url.url==url).\
-            filter(GA_Url.metric == 'Total views').first()
+            filter(GA_Url.url==url).first()
         if item:
-            item.period_name = period_complete_day = period_complete_day
-            item.value = views
+            item.period_name = period_name
+            item.pageviews = views
+            item.visits = visits
             item.department_id = department_id
@@ -97,10 +156,74 @@
                       'period_name': period_name,
                       'period_complete_day': period_complete_day,
                       'url': url,
-                      'value': views,
-                      'metric': 'Total views',
+                      'pageviews': views,
+                      'visits': visits,
                       'department_id': department_id
+def update_publisher_stats(period_name):
+    publishers = get_top_level()
+    for publisher in publishers:
+        views, visits = update_publisher(period_name, publisher,
+        item = model.Session.query(GA_Publisher).\
+            filter(GA_Publisher.period_name==period_name).\
+            filter(
+        if item:
+            item.views = views
+            item.visits = visits
+            item.publisher_name =
+            model.Session.add(item)
+        else:
+            # create the row
+            values = {'id': make_uuid(),
+                     'period_name': period_name,
+                     'publisher_name':,
+                     'views': views,
+                     'visits': visits,
+                     }
+            model.Session.add(GA_Publisher(**values))
+        model.Session.commit()
+def update_publisher(period_name, pub, part=''):
+    views,visits = 0, 0
+    for publisher in go_down_tree(pub):
+        f = model.Session.query(GA_Url).\
+                filter(GA_Url.period_name==period_name).\
+                filter(GA_Url.url=='/publisher/' +
+        if f:
+            views = views + int(f.pageviews)
+            visits = visits + int(f.visits)
+    return views, visits
+def get_top_level():
+    '''Returns the top level publishers.'''
+    return model.Session.query(model.Group).\
+           outerjoin(model.Member, model.Member.table_id == and \
+                     model.Member.table_name == 'group' and \
+                     model.Member.state == 'active').\
+           filter(\
+           filter(model.Group.type=='publisher').\
+           order_by(
+def get_children(publisher):
+    '''Finds child publishers for the given publisher (object). (Not recursive)'''
+    from import HIERARCHY_CTE
+    return model.Session.query(model.Group).\
+           from_statement(HIERARCHY_CTE).params(, type='publisher').\
+           all()
+def go_down_tree(publisher):
+    '''Provided with a publisher object, it walks down the hierarchy and yields each publisher,
+    including the one you supply.'''
+    yield publisher
+    for child in get_children(publisher):
+        for grandchild in go_down_tree(child):
+            yield grandchild

--- /dev/null
+++ b/ckanext/ga_report/tests/
@@ -1,1 +1,38 @@
+import os
+import datetime
+from import assert_equal
+from ckanext.ga_report.download_analytics import DownloadAnalytics
+from ckanext.ga_report.ga_auth import (init_service, get_profile_id)
+from ckanext.ga_report.ga_model import init_tables
+class TestAPI:
+    @classmethod
+    def setup_class(cls):
+        if not os.path.exists("token.dat") or not os.path.exists("credentials.json"):
+            print '*' * 60
+            print "Tests may not run without first having run the auth process"
+            print '*' * 60
+        init_tables()
+    @classmethod
+    def teardown_class(cls):
+        pass
+    def test_latest(self):
+        svc = init_service("token.dat", "credentials.json")
+        try:
+            downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc))
+            downloader.latest()
+        except Exception as e:
+            assert False, e
+    def test_since(self):
+        svc = init_service("token.dat", "credentials.json")
+        downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc))
+        try:
+            downloader.since_date( - datetime.timedelta(days=-30))
+        except Exception as e:
+            assert False, e