Fixes to index for publishers
Fixes to index for publishers

file:a/README.rst -> file:b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -33,6 +33,10 @@
       googleanalytics.id = UA-1010101-1
       googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics)
       ga-report.period = monthly
+      ga-report.bounce_url = /data
+
+   The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data
+   but you may want to set this to /.
 
 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file)::
 

--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -23,7 +23,7 @@
 
 def _month_details(cls):
     months = []
-    vals = model.Session.query(cls.period_name).distinct().all()
+    vals = model.Session.query(cls.period_name).filter(cls.period_name!='All').distinct().all()
     for m in vals:
         months.append( (m[0], _get_month_name(m[0])))
     return sorted(months, key=operator.itemgetter(0), reverse=True)
@@ -70,15 +70,15 @@
         entries = q.order_by('ga_stat.key').all()
 
         def clean_key(key, val):
-            if key in ['Average time on site', 'Pages per visit', 'New visits']:
+            if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']:
                 val =  "%.2f" % round(float(val), 2)
                 if key == 'Average time on site':
                     mins, secs = divmod(float(val), 60)
                     hours, mins = divmod(mins, 60)
                     val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val)
-                if key == 'New visits':
+                if key in ['New visits','Bounces']:
                     val = "%s%%" % val
-            if key in ['Bounces', 'Total page views', 'Total visits']:
+            if key in ['Total page views', 'Total visits']:
                 val = int(val)
 
             return key, val
@@ -93,11 +93,12 @@
             for e in entries:
                 d[e.key].append(float(e.value))
             for k, v in d.iteritems():
-                if k in ['Bounces', 'Total page views', 'Total visits']:
+                if k in ['Total page views', 'Total visits']:
                     v = sum(v)
                 else:
                     v = float(sum(v))/len(v)
                 key, val = clean_key(k,v)
+
                 c.global_totals.append((key, val))
                 c.global_totals = sorted(c.global_totals, key=operator.itemgetter(0))
 
@@ -134,29 +135,7 @@
             c.social_referrer_totals.append((shorten_name(entry[0]), fill_out_url(entry[0]),'',
                                             entry[1]))
 
-
-        browser_version_re = re.compile("(.*)\((.*)\)")
         for k, v in keys.iteritems():
-
-            def clean_field(key):
-                if k != 'Browser versions':
-                    return key
-                m = browser_version_re.match(key)
-                browser = m.groups()[0].strip()
-                ver = m.groups()[1]
-                parts = ver.split('.')
-                if len(parts) > 1:
-                    if parts[1][0] == '0':
-                        ver = parts[0]
-                    else:
-                        ver = "%s.%s" % (parts[0],parts[1])
-                if browser in ['Safari','Android Browser']:  # Special case complex version nums
-                    ver = parts[0]
-                    if len(ver) > 2:
-                        ver = "%s%sX" % (ver[0], ver[1])
-
-                return "%s (%s)" % (browser, ver,)
-
             q = model.Session.query(GA_Stat).\
                 filter(GA_Stat.stat_name==k)
             if c.month:
@@ -172,17 +151,13 @@
                 entries.append((key,val,))
             entries = sorted(entries, key=operator.itemgetter(1), reverse=True)
 
-            def percent(num, total):
-                p = 100 * float(num)/float(total)
-                return "%.2f%%" % round(p, 2)
-
             # Get the total for each set of values and then set the value as
             # a percentage of the total
             if k == 'Social sources':
                 total = sum([x for n,x in c.global_totals if n == 'Total visits'])
             else:
                 total = sum([num for _,num in entries])
-            setattr(c, v, [(k,percent(v,total)) for k,v in entries ])
+            setattr(c, v, [(k,_percent(v,total)) for k,v in entries ])
 
         return render('ga_report/site/index.html')
 
@@ -261,43 +236,23 @@
         if count == -1:
             count = sys.maxint
 
-        q = model.Session.query(GA_Url)\
+        month = c.month or 'All'
+
+        q = model.Session.query(GA_Url,model.Package)\
+            .filter(model.Package.name==GA_Url.package_id)\
             .filter(GA_Url.url.like('/dataset/%'))
         if publisher:
             q = q.filter(GA_Url.department_id==publisher.name)
-        if c.month:
-            q = q.filter(GA_Url.period_name==c.month)
+        q = q.filter(GA_Url.period_name==month)
         q = q.order_by('ga_url.visitors::int desc')
-
-        if c.month:
-            top_packages = []
-            for entry in q.limit(count):
-                package_name = entry.url[len('/dataset/'):]
-                p = model.Package.get(package_name)
-                if p:
-                    top_packages.append((p, entry.pageviews, entry.visitors))
-                else:
-                    log.warning('Could not find package "%s"', package_name)
-        else:
-            ds = {}
-            for entry in q:
-                if len(ds) >= count:
-                    break
-                package_name = entry.url[len('/dataset/'):]
-                p = model.Package.get(package_name)
-                if p:
-                    if not p in ds:
-                        ds[p] = {'views': 0, 'visits': 0}
-                    ds[p]['views'] = ds[p]['views'] + int(entry.pageviews)
-                    ds[p]['visits'] = ds[p]['visits'] + int(entry.visitors)
-                else:
-                    log.warning('Could not find package "%s"', package_name)
-
-            results = []
-            for k, v in ds.iteritems():
-                results.append((k,v['views'],v['visits']))
-
-            top_packages = sorted(results, key=operator.itemgetter(1), reverse=True)
+        top_packages = []
+
+        for entry,package in q.limit(count):
+            if package:
+                top_packages.append((package, entry.pageviews, entry.visitors))
+            else:
+                log.warning('Could not find package associated package')
+
         return top_packages
 
     def read(self):
@@ -333,15 +288,12 @@
         else:
             c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
 
+        month = c.mnth or 'All'
         c.publisher_page_views = 0
         q = model.Session.query(GA_Url).\
             filter(GA_Url.url=='/publisher/%s' % c.publisher_name)
-        if c.month:
-            entry = q.filter(GA_Url.period_name==c.month).first()
-            c.publisher_page_views = entry.pageviews if entry else 0
-        else:
-            for e in q.all():
-                c.publisher_page_views = c.publisher_page_views  + int(e.pageviews)
+        entry = q.filter(GA_Url.period_name==c.month).first()
+        c.publisher_page_views = entry.pageviews if entry else 0
 
         c.top_packages = self._get_packages(c.publisher, 20)
 
@@ -393,3 +345,7 @@
         publishers.append((pub.name, pub.title))
     return publishers
 
+def _percent(num, total):
+    p = 100 * float(num)/float(total)
+    return "%.2f%%" % round(p, 2)
+

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -100,6 +100,7 @@
                      self.get_full_period_name(period_name, period_complete_day),
                      start_date.strftime('%Y %m %d'),
                      end_date.strftime('%Y %m %d'))
+
             data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+')
             log.info('Storing Dataset Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
@@ -146,7 +147,7 @@
         start_date = start_date.strftime('%Y-%m-%d')
         end_date = end_date.strftime('%Y-%m-%d')
         query = 'ga:pagePath=%s$' % path
-        metrics = 'ga:uniquePageviews, ga:visitors'
+        metrics = 'ga:uniquePageviews, ga:visits'
         sort = '-ga:uniquePageviews'
 
         # Supported query params at
@@ -160,11 +161,6 @@
                                  dimensions="ga:pagePath",
                                  max_results=10000,
                                  end_date=end_date).execute()
-
-        if os.getenv('DEBUG'):
-            import pprint
-            pprint.pprint(results)
-            print 'Total results: %s' % results.get('totalResults')
 
         packages = []
         for entry in results.get('rows'):
@@ -213,18 +209,37 @@
         results = self.service.data().ga().get(
                                  ids='ga:' + self.profile_id,
                                  start_date=start_date,
-                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits,ga:visitors',
+                                 metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits',
                                  max_results=10000,
                                  end_date=end_date).execute()
         result_data = results.get('rows')
         data = {
             'Pages per visit': result_data[0][0],
-            'Bounces': result_data[0][1],
-            'Average time on site': result_data[0][2],
-            'New visits': result_data[0][3],
-            'Total visits': result_data[0][4],
+            'Average time on site': result_data[0][1],
+            'New visits': result_data[0][2],
+            'Total visits': result_data[0][3],
         }
         ga_model.update_sitewide_stats(period_name, "Totals", data)
+
+        # Bounces from /data. This url is specified in configuration because
+        # for DGU we don't want /.
+        path = config.get('ga-report.bounce_url','/')
+        print path
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 filters='ga:pagePath=~%s$' % (path,),
+                                 start_date=start_date,
+                                 metrics='ga:bounces,ga:uniquePageviews',
+                                 dimensions='ga:pagePath',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        for results in result_data:
+            if results[0] == path:
+                bounce, total = [float(x) for x in results[1:]]
+                pct = 100 * bounce/total
+                print "%d bounces from %d total == %s" % (bounce, total, pct)
+                ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct})
 
 
     def _locale_stats(self, start_date, end_date, period_name):

--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -1,10 +1,10 @@
 import re
 import uuid
 
-from sqlalchemy import Table, Column, MetaData
+from sqlalchemy import Table, Column, MetaData, ForeignKey
 from sqlalchemy import types
 from sqlalchemy.sql import select
-from sqlalchemy.orm import mapper
+from sqlalchemy.orm import mapper, relation
 from sqlalchemy import func
 
 import ckan.model as model
@@ -14,8 +14,6 @@
     return unicode(uuid.uuid4())
 
 metadata = MetaData()
-
-
 
 class GA_Url(object):
 
@@ -32,6 +30,7 @@
                       Column('visitors', types.UnicodeText),
                       Column('url', types.UnicodeText),
                       Column('department_id', types.UnicodeText),
+                      Column('package_id', types.UnicodeText),
                 )
 mapper(GA_Url, url_table)
 
@@ -163,6 +162,10 @@
         url = _normalize_url(url)
         department_id = _get_department_id_of_url(url)
 
+        package = None
+        if url.startswith('/dataset/'):
+            package = url[len('/dataset/'):]
+
         # see if the row for this url & month is in the table already
         item = model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).\
@@ -172,6 +175,7 @@
             item.pageviews = views
             item.visitors = visitors
             item.department_id = department_id
+            item.package_id = package
             model.Session.add(item)
         else:
             # create the row
@@ -181,9 +185,31 @@
                       'url': url,
                       'pageviews': views,
                       'visitors': visitors,
-                      'department_id': department_id
+                      'department_id': department_id,
+                      'package_id': package
                      }
             model.Session.add(GA_Url(**values))
+
+        # We now need to recaculate the ALL time_period from the data we have
+        # Delete the old 'All'
+        old = model.Session.query(GA_Url).\
+            filter(GA_Url.period_name == "All").\
+            filter(GA_Url.url==url).delete()
+
+        items = model.Session.query(GA_Url).\
+            filter(GA_Url.period_name != "All").\
+            filter(GA_Url.url==url).all()
+        values = {'id': make_uuid(),
+                  'period_name': "All",
+                  'period_complete_day': "0",
+                  'url': url,
+                  'pageviews': sum([int(x.pageviews) for x in items]),
+                  'visitors': sum([int(x.visitors) for x in items]),
+                  'department_id': department_id,
+                  'package_id': package
+                 }
+        model.Session.add(GA_Url(**values))
+
         model.Session.commit()