Improved bounce rate stuff.
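Record the bounce rate for a single configurable page (ga-report.bounce_url, typically /), add a --skip_url_stats option to the download command, and replace print statements with logging.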

file:a/README.rst -> file:b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -33,10 +33,9 @@
       googleanalytics.id = UA-1010101-1
       googleanalytics.account = Account name (e.g. data.gov.uk, see top level item at https://www.google.com/analytics)
       ga-report.period = monthly
-      ga-report.bounce_url = /data
+      ga-report.bounce_url = /
 
-   The ga-report.bounce_url specifies the path to use when calculating bounces. For DGU this is /data
-   but you may want to set this to /.
+   The ga-report.bounce_url option specifies the single page path whose bounce rate is recorded. Typically this is / (the home page).
 
 3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file)::
 

--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -80,6 +80,11 @@
                                default=False,
                                dest='delete_first',
                                help='Delete data for the period first')
+        self.parser.add_option('-s', '--skip_url_stats',
+                               action='store_true',
+                               default=False,
+                               dest='skip_url_stats',
+                               help='Skip the download of URL data - just do site-wide stats')
 
     def command(self):
         self._load_config()
@@ -95,7 +100,8 @@
             return
 
         downloader = DownloadAnalytics(svc, profile_id=get_profile_id(svc),
-                                       delete_first=self.options.delete_first)
+                                       delete_first=self.options.delete_first,
+                                       skip_url_stats=self.options.skip_url_stats)
 
         time_period = self.args[1] if self.args and len(self.args) > 1 \
             else 'latest'

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -17,11 +17,13 @@
 class DownloadAnalytics(object):
     '''Downloads and stores analytics info'''
 
-    def __init__(self, service=None, profile_id=None, delete_first=False):
+    def __init__(self, service=None, profile_id=None, delete_first=False,
+                 skip_url_stats=False):
         self.period = config['ga-report.period']
         self.service = service
         self.profile_id = profile_id
         self.delete_first = delete_first
+        self.skip_url_stats = skip_url_stats
 
     def specific_month(self, date):
         import calendar
@@ -92,33 +94,41 @@
 
     def download_and_store(self, periods):
         for period_name, period_complete_day, start_date, end_date in periods:
+            log.info('Period "%s" (%s - %s)',
+                     self.get_full_period_name(period_name, period_complete_day),
+                     start_date.strftime('%Y-%m-%d'),
+                     end_date.strftime('%Y-%m-%d'))
+ 
             if self.delete_first:
-                log.info('Deleting existing Analytics for period "%s"',
+                log.info('Deleting existing Analytics for this period "%s"',
                          period_name)
                 ga_model.delete(period_name)
-            log.info('Downloading Analytics for period "%s" (%s - %s)',
-                     self.get_full_period_name(period_name, period_complete_day),
-                     start_date.strftime('%Y %m %d'),
-                     end_date.strftime('%Y %m %d'))
-
-            # Clean up the entries before we run this
-            ga_model.pre_update_url_stats(period_name)
-
-            accountName = config.get('googleanalytics.account')
-
-            data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
-            log.info('Storing Dataset Analytics for period "%s"',
-                     self.get_full_period_name(period_name, period_complete_day))
-            self.store(period_name, period_complete_day, data, )
-
-            data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
-            log.info('Storing Publisher Analytics for period "%s"',
-                     self.get_full_period_name(period_name, period_complete_day))
-            self.store(period_name, period_complete_day, data,)
-
-            ga_model.update_publisher_stats(period_name) # about 30 seconds.
+
+            if not self.skip_url_stats:
+                # Clean out old url data before storing the new
+                ga_model.pre_update_url_stats(period_name)
+
+                accountName = config.get('googleanalytics.account')
+
+                log.info('Downloading analytics for dataset views')
+                data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
+
+                log.info('Storing dataset views (%i rows)', len(data.get('url')))
+                self.store(period_name, period_complete_day, data, )
+
+                log.info('Downloading analytics for publisher views')
+                data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
+
+                log.info('Storing publisher views (%i rows)', len(data.get('url')))
+                self.store(period_name, period_complete_day, data,)
+
+                log.info('Aggregating datasets by publisher')
+                ga_model.update_publisher_stats(period_name) # about 30 seconds.
+
+            log.info('Downloading and storing analytics for site-wide stats')
             self.sitewide_stats( period_name )
 
+            log.info('Downloading and storing analytics for social networks')
             self.update_social_info(period_name, start_date, end_date)
 
 
@@ -170,8 +180,12 @@
         packages = []
         for entry in results.get('rows'):
             (loc,pageviews,visits) = entry
-            url = _normalize_url('http:/' + loc)
+            url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
+
             if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
+                # filter out strays like:
+                # /data/user/login?came_from=http://data.gov.uk/dataset/os-code-point-open
+                # /403.html?page=/about&from=http://data.gov.uk/publisher/planning-inspectorate
                 continue
             packages.append( (url, pageviews, visits,) ) # Temporary hack
         return dict(url=packages)
@@ -187,12 +201,10 @@
 
         start_date = '%s-01' % period_name
         end_date = '%s-%s' % (period_name, last_day_of_month)
-        print 'Sitewide_stats for %s (%s -> %s)' % (period_name, start_date, end_date)
-
         funcs = ['_totals_stats', '_social_stats', '_os_stats',
                  '_locale_stats', '_browser_stats', '_mobile_stats']
         for f in funcs:
-            print ' + Fetching %s stats' % f.split('_')[1]
+            log.info('Downloading analytics for %s' % f.split('_')[1])
             getattr(self, f)(start_date, end_date, period_name)
 
     def _get_results(result_data, f):
@@ -229,25 +241,27 @@
         }
         ga_model.update_sitewide_stats(period_name, "Totals", data)
 
-        # Bounces from /data. This url is specified in configuration because
-        # for DGU we don't want /.
-        path = config.get('ga-report.bounce_url','/')
-        print path
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 filters='ga:pagePath=~%s$' % (path,),
+        # Bounces from / or another configurable page.
+        path = '/%s%s' % (config.get('googleanalytics.account'),
+                          config.get('ga-report.bounce_url', '/'))
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 filters='ga:pagePath==%s' % (path,),
                                  start_date=start_date,
                                  metrics='ga:bounces,ga:uniquePageviews',
                                  dimensions='ga:pagePath',
                                  max_results=10000,
                                  end_date=end_date).execute()
         result_data = results.get('rows')
-        for results in result_data:
-            if results[0] == path:
-                bounce, total = [float(x) for x in results[1:]]
-                pct = 100 * bounce/total
-                print "%d bounces from %d total == %s" % (bounce, total, pct)
-                ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct})
+        if len(result_data) != 1:
+            log.error('Could not pinpoint the bounces for path: %s. Got results: %r',
+                      path, result_data)
+            return
+        results = result_data[0]
+        bounces, total = [float(x) for x in result_data[0][1:]]
+        pct = 100 * bounces/total
+        log.info('%d bounces from %d total == %s', bounces, total, pct)
+        ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate': pct})
 
 
     def _locale_stats(self, start_date, end_date, period_name):

--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -111,9 +111,7 @@
     >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
     '/dataset/weekly_fuel_prices'
     '''
-    # Deliberately leaving a /
-    url = url.replace('http:/','')
-    return '/' + '/'.join(url.split('/')[2:])
+    return '/' + '/'.join(url.split('/')[3:])
 
 
 def _get_package_and_publisher(url):
@@ -184,9 +182,14 @@
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
-
+    '''
+    Given a list of urls and number of hits for each during a given period,
+    stores them in GA_Url under the period and recalculates the totals for
+    the 'All' period.
+    '''
     for url, views, visitors in url_data:
         package, publisher = _get_package_and_publisher(url)
+
 
         item = model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).\
@@ -194,6 +197,10 @@
         if item:
             item.pageviews = item.pageviews + views
             item.visitors = item.visitors + visitors
+            if not item.package_id:
+                item.package_id = package
+            if not item.department_id:
+                item.department_id = publisher
             model.Session.add(item)
         else:
             values = {'id': make_uuid(),
@@ -209,6 +216,13 @@
         model.Session.commit()
 
         if package:
+            old_pageviews, old_visits = 0, 0
+            old = model.Session.query(GA_Url).\
+                filter(GA_Url.period_name=='All').\
+                filter(GA_Url.url==url).all()
+            old_pageviews = sum([int(o.pageviews) for o in old])
+            old_visits = sum([int(o.visitors) for o in old])
+
             entries = model.Session.query(GA_Url).\
                 filter(GA_Url.period_name!='All').\
                 filter(GA_Url.url==url).all()
@@ -216,15 +230,14 @@
                       'period_name': 'All',
                       'period_complete_day': 0,
                       'url': url,
-                      'pageviews': sum([int(e.pageviews) for e in entries]),
-                      'visitors': sum([int(e.visitors) for e in entries]),
+                      'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews,
+                      'visitors': sum([int(e.visitors) for e in entries]) + old_visits,
                       'department_id': publisher,
                       'package_id': package
                      }
 
             model.Session.add(GA_Url(**values))
             model.Session.commit()
-
 
 
 

--- a/ckanext/ga_report/helpers.py
+++ b/ckanext/ga_report/helpers.py
@@ -60,7 +60,8 @@
         if not dataset:
             return None
     dataset_dict = get_action('package_show')({'model': model,
-                                               'session': model.Session},
+                                               'session': model.Session,
+                                               'validate': False},
                                               {'id':dataset.id})
     return dataset_dict
 

--- a/ckanext/ga_report/templates/ga_report/notes.html
+++ b/ckanext/ga_report/templates/ga_report/notes.html
@@ -6,11 +6,11 @@
     <li class="widget-container boxed widget_text">
       <h4>Notes</h4>
       <ul>
-          <li>'Views' is the number of sessions during which that page was viewed one or more times ('Unique Pageviews').</li>
-<!--          <li>'Visits' is the number of individual sessions initiated by all the visitors to your site, counted once for each visitor for each session.</li>-->
-          <li>'Visitors' is the number of unique users visiting the site (whether once or more times).</li>
+          <li>"Views" is the number of sessions during which the page was viewed one or more times (technically known as "unique pageviews").</li>
+          <li>"Visits" is the number of unique user visits to a page, counted once for each visitor for each session.</li>
+<!--!          <li>"Visitors" is the number of unique users visiting the site (whether once or more times).</li> -->
           <li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li>
-          <li>The results for only small numbers of views/visits are not shown. Where these relate to site pages, then they are available in full in the CSV download. Where these relate to users' web browser information, they are not disclosed, for privacy reasons.</li>
+          <li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li>
       </ul>
     </li>
 </html>

--- /dev/null
+++ b/ckanext/ga_report/tests/test_model.py
@@ -1,1 +1,18 @@
+from nose.tools import assert_equal
 
+from ckanext.ga_report.ga_model import _normalize_url
+
+class TestNormalizeUrl:
+    def test_normal(self):
+        assert_equal(_normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices'),
+                     '/dataset/weekly_fuel_prices')
+
+    def test_www_dot(self):
+        assert_equal(_normalize_url('http://www.data.gov.uk/dataset/weekly_fuel_prices'),
+                     '/dataset/weekly_fuel_prices')
+
+    def test_https(self):
+        assert_equal(_normalize_url('https://data.gov.uk/dataset/weekly_fuel_prices'),
+                     '/dataset/weekly_fuel_prices')
+
+