Fixed the publishr information (to enable leaderboard) to show the number
Fixed the publishr information (to enable leaderboard) to show the number
of children publishers and the total for all sub-publishers

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -85,18 +85,27 @@
                      self.get_full_period_name(period_name, period_complete_day),
                      start_date.strftime('%Y %m %d'),
                      end_date.strftime('%Y %m %d'))
-            data = self.download(start_date, end_date)
-            log.info('Storing Analytics for period "%s"',
+            """
+            data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+')
+            log.info('Storing Dataset Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
-            self.store(period_name, period_complete_day, data)
-
-
-    def download(self, start_date, end_date):
+            self.store(period_name, period_complete_day, data, )
+
+            data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+')
+            log.info('Storing Publisher Analytics for period "%s"',
+                     self.get_full_period_name(period_name, period_complete_day))
+            self.store(period_name, period_complete_day, data,)
+            """
+            ga_model.update_publisher_stats(period_name) # about 30 seconds.
+            self.sitewide_stats( period_name )
+
+
+    def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'):
         '''Get data from GA for a given time period'''
         start_date = start_date.strftime('%Y-%m-%d')
         end_date = end_date.strftime('%Y-%m-%d')
-        query = 'ga:pagePath=~/dataset/[a-z0-9-]+$'
-        metrics = 'ga:uniquePageviews'
+        query = 'ga:pagePath=%s$' % path
+        metrics = 'ga:uniquePageviews, ga:visitors'
         sort = '-ga:uniquePageviews'
 
         # Supported query params at
@@ -118,11 +127,173 @@
 
         packages = []
         for entry in results.get('rows'):
-            (loc,size,) = entry
-            packages.append( ('http:/' + loc,size, '',) ) # Temporary hack
+            (loc,pageviews,visits) = entry
+            packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack
         return dict(url=packages)
 
     def store(self, period_name, period_complete_day, data):
         if 'url' in data:
             ga_model.update_url_stats(period_name, period_complete_day, data['url'])
 
+    def sitewide_stats(self, period_name):
+        import calendar
+        year, month = period_name.split('-')
+        _, last_day_of_month = calendar.monthrange(int(year), int(month))
+
+        start_date = '%s-01' % period_name
+        end_date = '%s-%s' % (period_name, last_day_of_month)
+        print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date)
+
+        funcs = ['_totals_stats', '_social_stats', '_os_stats',
+                 '_locale_stats', '_browser_stats', '_mobile_stats']
+        for f in funcs:
+            print ' + Fetching %s stats' % f.split('_')[1]
+            getattr(self, f)(start_date, end_date, period_name)
+
+    def _get_results(result_data, f):
+        data = {}
+        for result in result_data:
+            key = f(result)
+            data[key] = data.get(key,0) + result[1]
+        return data
+
+    def _totals_stats(self, start_date, end_date, period_name):
+        """ Fetches distinct totals, total pageviews etc """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        ga_model.update_sitewide_stats(period_name, "Totals", {'Total pageviews': result_data[0][0]})
+
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {
+            'Pages per visit': result_data[0][0],
+            'Bounces': result_data[0][1],
+            'Average time on site': result_data[0][2],
+            'Percent new visits': result_data[0][3],
+        }
+        ga_model.update_sitewide_stats(period_name, "Totals", data)
+
+
+    def _locale_stats(self, start_date, end_date, period_name):
+        """ Fetches stats about language and country """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:language,ga:country",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Languages", data)
+
+        data = {}
+        for result in result_data:
+            data[result[1]] = data.get(result[1], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Country", data)
+
+
+    def _social_stats(self, start_date, end_date, period_name):
+        """ Finds out which social sites people are referred from """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:socialNetwork,ga:referralPath",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        twitter_links = []
+        data = {}
+        for result in result_data:
+            if not result[0] == '(not set)':
+                data[result[0]] = data.get(result[0], 0) + int(result[2])
+                if result[0] == 'Twitter':
+                    twitter_links.append(result[1])
+        ga_model.update_sitewide_stats(period_name, "Social sources", data)
+
+
+    def _os_stats(self, start_date, end_date, period_name):
+        """ Operating system stats """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:operatingSystem,ga:operatingSystemVersion",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Operating Systems", data)
+
+        data = {}
+        for result in result_data:
+            key = "%s (%s)" % (result[0],result[1])
+            data[key] = result[2]
+        ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data)
+
+
+    def _browser_stats(self, start_date, end_date, period_name):
+        """ Information about browsers and browser versions """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:browser,ga:browserVersion",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Browsers", data)
+
+        data = {}
+        for result in result_data:
+            key = "%s (%s)" % (result[0], result[1])
+            data[key] = result[2]
+        ga_model.update_sitewide_stats(period_name, "Browser versions", data)
+
+
+    def _mobile_stats(self, start_date, end_date, period_name):
+        """ Info about mobile devices """
+
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+
+        result_data = results.get('rows')
+        data = {}
+        for result in result_data:
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Mobile brands", data)
+
+        data = {}
+        for result in result_data:
+            data[result[1]] = data.get(result[1], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Mobile devices", data)
+

--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -16,6 +16,18 @@
 
 
 class GA_Url(object):
+
+    def __init__(self, **kwargs):
+        for k,v in kwargs.items():
+            setattr(self, k, v)
+
+class GA_Stat(object):
+
+    def __init__(self, **kwargs):
+        for k,v in kwargs.items():
+            setattr(self, k, v)
+
+class GA_Publisher(object):
 
     def __init__(self, **kwargs):
         for k,v in kwargs.items():
@@ -28,12 +40,35 @@
                              default=make_uuid),
                       Column('period_name', types.UnicodeText),
                       Column('period_complete_day', types.Integer),
-                      Column('metric', types.UnicodeText),
-                      Column('value', types.UnicodeText),
+                      Column('pageviews', types.UnicodeText),
+                      Column('visitors', types.UnicodeText),
                       Column('url', types.UnicodeText),
                       Column('department_id', types.UnicodeText),
                 )
 mapper(GA_Url, url_table)
+
+stat_table = Table('ga_stat', metadata,
+                  Column('id', types.UnicodeText, primary_key=True,
+                         default=make_uuid),
+                  Column('period_name', types.UnicodeText),
+                  Column('stat_name', types.UnicodeText),
+                  Column('key', types.UnicodeText),
+                  Column('value', types.UnicodeText), )
+mapper(GA_Stat, stat_table)
+
+
+pub_table = Table('ga_publisher', metadata,
+                  Column('id', types.UnicodeText, primary_key=True,
+                         default=make_uuid),
+                  Column('period_name', types.UnicodeText),
+                  Column('publisher_name', types.UnicodeText),
+                  Column('views', types.UnicodeText),
+                  Column('visitors', types.UnicodeText),
+                  Column('toplevel', types.Boolean, default=False),
+                  Column('subpublishercount', types.Integer, default=0),
+                  Column('parent', types.UnicodeText),
+)
+mapper(GA_Publisher, pub_table)
 
 
 def init_tables():
@@ -73,21 +108,49 @@
             publisher_groups = dataset.get_groups('publisher')
             if publisher_groups:
                 return publisher_groups[0].name
+    else:
+        publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
+        if publisher_match:
+            return publisher_match.groups()[0]
+
+
+def update_sitewide_stats(period_name, stat_name, data):
+    for k,v in data.iteritems():
+        item = model.Session.query(GA_Stat).\
+            filter(GA_Stat.period_name==period_name).\
+            filter(GA_Stat.key==k).\
+            filter(GA_Stat.stat_name==stat_name).first()
+        if item:
+            item.period_name = period_name
+            item.key = k
+            item.value = v
+            model.Session.add(item)
+        else:
+            # create the row
+            values = {'id': make_uuid(),
+                     'period_name': period_name,
+                     'key': k,
+                     'value': v,
+                     'stat_name': stat_name
+                     }
+            model.Session.add(GA_Stat(**values))
+        model.Session.commit()
+
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
-    for url, views, next_page in url_data:
+    for url, views, visitors in url_data:
         url = _normalize_url(url)
         department_id = _get_department_id_of_url(url)
 
         # see if the row for this url & month is in the table already
         item = model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).\
-            filter(GA_Url.url==url).\
-            filter(GA_Url.metric == 'Total views').first()
+            filter(GA_Url.url==url).first()
         if item:
-            item.period_name = period_complete_day = period_complete_day
-            item.value = views
+            item.period_name = period_name
+            item.pageviews = views
+            item.visitors = visitors
             item.department_id = department_id
             model.Session.add(item)
         else:
@@ -96,10 +159,92 @@
                       'period_name': period_name,
                       'period_complete_day': period_complete_day,
                       'url': url,
-                      'value': views,
-                      'metric': 'Total views',
+                      'pageviews': views,
+                      'visitors': visitors,
                       'department_id': department_id
                      }
             model.Session.add(GA_Url(**values))
         model.Session.commit()
 
+
+
+def update_publisher_stats(period_name):
+    """
+    Updates the publisher stats from the data retrieved for /dataset/*
+    and /publisher/*. Will run against each dataset and generates the
+    totals for the entire tree beneath each publisher.
+    """
+    toplevel = get_top_level()
+    publishers = model.Session.query(model.Group).\
+        filter(model.Group.type=='publisher').\
+        filter(model.Group.state=='active').all()
+    for publisher in publishers:
+        views, visitors, subpub = update_publisher(period_name, publisher, publisher.name)
+        parent, parents = '', publisher.get_groups('publisher')
+        if parents:
+            parent = parents[0].name
+        item = model.Session.query(GA_Publisher).\
+            filter(GA_Publisher.period_name==period_name).\
+            filter(GA_Publisher.publisher_name==publisher.name).first()
+        if item:
+            item.views = views
+            item.visitors = visitors
+            item.publisher_name = publisher.name
+            item.toplevel = publisher in toplevel
+            item.subpublishercount = subpub
+            item.parent = parent
+            model.Session.add(item)
+        else:
+            # create the row
+            values = {'id': make_uuid(),
+                     'period_name': period_name,
+                     'publisher_name': publisher.name,
+                     'views': views,
+                     'visitors': visitors,
+                     'toplevel': publisher in toplevel,
+                     'subpublishercount': subpub,
+                     'parent': parent
+                     }
+            model.Session.add(GA_Publisher(**values))
+        model.Session.commit()
+
+
+def update_publisher(period_name, pub, part=''):
+    views,visitors,subpub = 0, 0, 0
+    for publisher in go_down_tree(pub):
+        subpub = subpub + 1
+        items = model.Session.query(GA_Url).\
+                filter(GA_Url.period_name==period_name).\
+                filter(GA_Url.department_id==publisher.name).all()
+        for item in items:
+            views = views + int(item.pageviews)
+            visitors = visitors + int(item.visitors)
+
+    return views, visitors, (subpub-1)
+
+
+def get_top_level():
+    '''Returns the top level publishers.'''
+    return model.Session.query(model.Group).\
+           outerjoin(model.Member, model.Member.table_id == model.Group.id and \
+                     model.Member.table_name == 'group' and \
+                     model.Member.state == 'active').\
+           filter(model.Member.id==None).\
+           filter(model.Group.type=='publisher').\
+           order_by(model.Group.name).all()
+
+def get_children(publisher):
+    '''Finds child publishers for the given publisher (object). (Not recursive)'''
+    from ckan.model.group import HIERARCHY_CTE
+    return model.Session.query(model.Group).\
+           from_statement(HIERARCHY_CTE).params(id=publisher.id, type='publisher').\
+           all()
+
+def go_down_tree(publisher):
+    '''Provided with a publisher object, it walks down the hierarchy and yields each publisher,
+    including the one you supply.'''
+    yield publisher
+    for child in get_children(publisher):
+        for grandchild in go_down_tree(child):
+            yield grandchild
+