Implements downloads counts (for dataset resources) and fixes an issue with 'All' records.
Implements downloads counts (for dataset resources) and fixes an issue with 'All' records.

- Fetches the data for downloads (either direct or cached) from when a user clicked on a
Download button. We lookup the resource for this url (based on .url or .cache_url) and
then associate the download count with the package it belongs to.
- Fixes a bug (#211) where the All records are deleted for every url, even if that url was
not fetched (and therefore won't get a new All record).

file:a/.gitignore -> file:b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.py[co]
 *.py~
 .gitignore
+ckan.log
 
 # Packages
 *.egg

--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -55,6 +55,36 @@
         init_service('token.dat',
                       self.args[0] if self.args
                                    else 'credentials.json')
+
+class FixTimePeriods(CkanCommand):
+    """
+    Fixes the 'All' records for GA_Urls
+
+    It is possible that older urls that haven't recently been visited
+    do not have All records.  This command will traverse through those
+    records and generate valid All records for them.
+    """
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    max_args = 0
+    min_args = 0
+
+    def __init__(self, name):
+        super(FixTimePeriods, self).__init__(name)
+
+    def command(self):
+        import ckan.model as model
+        from ga_model import post_update_url_stats
+        self._load_config()
+        model.Session.remove()
+        model.Session.configure(bind=model.meta.engine)
+
+        log = logging.getLogger('ckanext.ga_report')
+
+        log.info("Updating 'All' records for old URLs")
+        post_update_url_stats()
+        log.info("Processing complete")
+
 
 
 class LoadAnalytics(CkanCommand):

--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -13,6 +13,7 @@
 
 log = logging.getLogger('ckanext.ga-report')
 
+DOWNLOADS_AVAILABLE_FROM = '2012-12'
 
 def _get_month_name(strdate):
     import calendar
@@ -21,7 +22,7 @@
     return '%s %s' % (calendar.month_name[d.tm_mon], d.tm_year)
 
 
-def _month_details(cls):
+def _month_details(cls, stat_key=None):
     '''
     Returns a list of all the periods for which we have data, unfortunately
     knows too much about the type of the cls being passed as GA_Url has a
@@ -32,9 +33,13 @@
     months = []
     day = None
 
-    vals = model.Session.query(cls.period_name,cls.period_complete_day)\
-        .filter(cls.period_name!='All').distinct(cls.period_name)\
-        .order_by("period_name desc").all()
+    q = model.Session.query(cls.period_name,cls.period_complete_day)\
+        .filter(cls.period_name!='All').distinct(cls.period_name)
+    if stat_key:
+        q=  q.filter(cls.stat_name==stat_key)
+
+    vals = q.order_by("period_name desc").all()
+
     if vals and vals[0][1]:
         day = int(vals[0][1])
         ordinal = 'th' if 11 <= day <= 13 \
@@ -52,7 +57,7 @@
     def csv(self, month):
         import csv
 
-        q = model.Session.query(GA_Stat)
+        q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name!='Downloads')
         if month != 'all':
             q = q.filter(GA_Stat.period_name==month)
         entries = q.order_by('GA_Stat.period_name, GA_Stat.stat_name, GA_Stat.key').all()
@@ -68,6 +73,7 @@
                              entry.stat_name.encode('utf-8'),
                              entry.key.encode('utf-8'),
                              entry.value.encode('utf-8')])
+
 
     def index(self):
 
@@ -114,7 +120,7 @@
                 if k in ['Total page views', 'Total visits']:
                     v = sum(v)
                 else:
-                    v = float(sum(v))/len(v)
+                    v = float(sum(v))/float(len(v))
                 key, val = clean_key(k,v)
 
                 c.global_totals.append((key, val))
@@ -223,13 +229,14 @@
             str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,))
 
         writer = csv.writer(response)
-        writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"])
-
-        for package,view,visit in packages:
+        writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"])
+
+        for package,view,visit,downloads in packages:
             writer.writerow([package.title.encode('utf-8'),
                              package.name.encode('utf-8'),
                              view,
                              visit,
+                             downloads,
                              month])
 
     def publishers(self):
@@ -250,10 +257,10 @@
 
     def _get_packages(self, publisher=None, count=-1):
         '''Returns the datasets in order of views'''
-        if count == -1:
-            count = sys.maxint
-
+        have_download_data = True
         month = c.month or 'All'
+        if month != 'All':
+            have_download_data = month >= DOWNLOADS_AVAILABLE_FROM
 
         q = model.Session.query(GA_Url,model.Package)\
             .filter(model.Package.name==GA_Url.package_id)\
@@ -263,9 +270,25 @@
         q = q.filter(GA_Url.period_name==month)
         q = q.order_by('ga_url.pageviews::int desc')
         top_packages = []
-        for entry,package in q.limit(count):
+        if count == -1:
+            entries = q.all()
+        else:
+            entries = q.limit(count)
+
+        for entry,package in entries:
             if package:
-                top_packages.append((package, entry.pageviews, entry.visits))
+                # Downloads ....
+                if have_download_data:
+                    dls = model.Session.query(GA_Stat).\
+                        filter(GA_Stat.stat_name=='Downloads').\
+                        filter(GA_Stat.key==package.name)
+                    if month != 'All':  # Fetch everything unless the month is specific
+                        dls = dls.filter(GA_Stat.period_name==month)
+
+                    downloads = sum(int(d.value) for d in dls.all())
+                else:
+                    downloads = 'No data'
+                top_packages.append((package, entry.pageviews, entry.visits, downloads))
             else:
                 log.warning('Could not find package associated package')
 

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -13,6 +13,7 @@
 FORMAT_MONTH = '%Y-%m'
 MIN_VIEWS = 50
 MIN_VISITS = 20
+MIN_DOWNLOADS = 10
 
 class DownloadAnalytics(object):
     '''Downloads and stores analytics info'''
@@ -122,8 +123,12 @@
                 log.info('Storing publisher views (%i rows)', len(data.get('url')))
                 self.store(period_name, period_complete_day, data,)
 
+                # Make sure the All records are correct.
+                ga_model.post_update_url_stats()
+
                 log.info('Aggregating datasets by publisher')
                 ga_model.update_publisher_stats(period_name) # about 30 seconds.
+
 
             log.info('Downloading and storing analytics for site-wide stats')
             self.sitewide_stats( period_name, period_complete_day )
@@ -179,6 +184,7 @@
                                  end_date=end_date).execute()
 
         packages = []
+        log.info("There are %d results" % results['totalResults'])
         for entry in results.get('rows'):
             (loc,pageviews,visits) = entry
             url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
@@ -203,7 +209,7 @@
         start_date = '%s-01' % period_name
         end_date = '%s-%s' % (period_name, last_day_of_month)
         funcs = ['_totals_stats', '_social_stats', '_os_stats',
-                 '_locale_stats', '_browser_stats', '_mobile_stats']
+                 '_locale_stats', '_browser_stats', '_mobile_stats', '_download_stats']
         for f in funcs:
             log.info('Downloading analytics for %s' % f.split('_')[1])
             getattr(self, f)(start_date, end_date, period_name, period_complete_day)
@@ -250,7 +256,7 @@
                                  ids='ga:' + self.profile_id,
                                  filters='ga:pagePath==%s' % (path,),
                                  start_date=start_date,
-                                 metrics='ga:bounces,ga:pageviews',
+                                 metrics='ga:visitBounceRate',
                                  dimensions='ga:pagePath',
                                  max_results=10000,
                                  end_date=end_date).execute()
@@ -260,10 +266,10 @@
                       path, result_data)
             return
         results = result_data[0]
-        bounces, total = [float(x) for x in result_data[0][1:]]
-        pct = 100 * bounces/total
-        log.info('%d bounces from %d total == %s', bounces, total, pct)
-        ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': pct},
+        bounces = float(results[1])
+        # visitBounceRate is already a %
+        log.info('Google reports visitBounceRate as %s', bounces)
+        ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': float(bounces)},
             period_complete_day)
 
 
@@ -290,6 +296,62 @@
         self._filter_out_long_tail(data, MIN_VIEWS)
         ga_model.update_sitewide_stats(period_name, "Country", data, period_complete_day)
 
+
+    def _download_stats(self, start_date, end_date, period_name, period_complete_day):
+        """ Fetches stats about language and country """
+        import ckan.model as model
+
+        data = {}
+
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 filters='ga:eventAction==download',
+                                 metrics='ga:totalEvents',
+                                 sort='-ga:totalEvents',
+                                 dimensions="ga:eventLabel",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        if not result_data:
+            # We may not have data for this time period, so we need to bail
+            # early.
+            log.info("There is no download data for this time period")
+            return
+
+        def process_result_data(result_data, cached=False):
+            for result in result_data:
+                url = result[0].strip()
+
+                # Get package id associated with the resource that has this URL.
+                q = model.Session.query(model.Resource)
+                if cached:
+                    r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first()
+                else:
+                    r = q.filter(model.Resource.url.like("%s%%" % url)).first()
+
+                package_name = r.resource_group.package.name if r else ""
+                if package_name:
+                    data[package_name] = data.get(package_name, 0) + int(result[1])
+                else:
+                    log.warning(u"Could not find resource for URL: {url}".format(url=url))
+                    continue
+
+        process_result_data(results.get('rows'))
+
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 filters='ga:eventAction==download-cache',
+                                 metrics='ga:totalEvents',
+                                 sort='-ga:totalEvents',
+                                 dimensions="ga:eventLabel",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        process_result_data(results.get('rows'), cached=False)
+
+        self._filter_out_long_tail(data, MIN_DOWNLOADS)
+        ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)
 
     def _social_stats(self, start_date, end_date, period_name, period_complete_day):
         """ Finds out which social sites people are referred from """

--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -161,10 +161,56 @@
 
 
 def pre_update_url_stats(period_name):
+    log.debug("Deleting '%s' records" % period_name)
     model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).delete()
-    model.Session.query(GA_Url).\
-            filter(GA_Url.period_name=='All').delete()
+
+    count = model.Session.query(GA_Url).\
+            filter(GA_Url.period_name == 'All').count()
+    log.debug("Deleting %d 'All' records" % count)
+    count = model.Session.query(GA_Url).\
+            filter(GA_Url.period_name == 'All').delete()
+    log.debug("Deleted %d 'All' records" % count)
+
+    model.Session.flush()
+    model.Session.commit()
+    model.repo.commit_and_remove()
+
+def post_update_url_stats():
+
+    """ Check the distinct url field in ga_url and make sure
+        it has an All record.  If not then create one.
+
+        After running this then every URL should have an All
+        record regardless of whether the URL has an entry for
+        the month being currently processed.
+    """
+    query = """select url, pageviews::int, visits::int
+               from ga_url
+               where url not in (select url from ga_url where period_name ='All')"""
+    connection = model.Session.connection()
+    res = connection.execute(query)
+
+    views, visits = {}, {}
+    # url, views, visits
+    for row in res:
+        views[row[0]] = views.get(row[0], 0) + row[1]
+        visits[row[0]] = visits.get(row[0], 0) + row[2]
+
+    for key in views.keys():
+        package, publisher = _get_package_and_publisher(key)
+
+        values = {'id': make_uuid(),
+                  'period_name': "All",
+                  'period_complete_day': 0,
+                  'url': key,
+                  'pageviews': views[key],
+                  'visits': visits[key],
+                  'department_id': publisher,
+                  'package_id': publisher
+                  }
+        model.Session.add(GA_Url(**values))
+    model.Session.commit()
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
@@ -216,8 +262,8 @@
                       'period_name': 'All',
                       'period_complete_day': 0,
                       'url': url,
-                      'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews,
-                      'visits': sum([int(e.visits or 0) for e in entries]) + old_visits,
+                      'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews),
+                      'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits),
                       'department_id': publisher,
                       'package_id': package
                      }
@@ -343,10 +389,10 @@
     '''
     for object_type in (GA_Url, GA_Stat, GA_Publisher, GA_ReferralStat):
         q = model.Session.query(object_type)
-        if period_name != 'all':
+        if period_name != 'All':
             q = q.filter_by(period_name=period_name)
         q.delete()
-    model.Session.commit()
+    model.repo.commit_and_remove()
 
 def get_score_for_dataset(dataset_name):
     '''

--- a/ckanext/ga_report/plugin.py
+++ b/ckanext/ga_report/plugin.py
@@ -42,6 +42,16 @@
             controller='ckanext.ga_report.controller:GaReport',
             action='csv'
         )
+        map.connect(
+            '/data/site-usage/downloads',
+            controller='ckanext.ga_report.controller:GaReport',
+            action='downloads'
+        )
+        map.connect(
+            '/data/site-usage/downloads_{month}.csv',
+            controller='ckanext.ga_report.controller:GaReport',
+            action='csv_downloads'
+        )
 
         # GaDatasetReport
         map.connect(

--- a/ckanext/ga_report/templates/ga_report/notes.html
+++ b/ckanext/ga_report/templates/ga_report/notes.html
@@ -7,7 +7,7 @@
       <h4>Notes</h4>
       <ul>
           <li>"Views" is the number of times a page was loaded in users' browsers.</li>
-          <li>"Visits" is the number of unique user visits to a page, counted once for each visitor for each of their browsing sessions.</li>
+          <li>"Downloads" is the number of times a user has clicked to download either an original or cached resource for a particular dataset since December 2012</li>
           <li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li>
           <li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li>
       </ul>

--- a/ckanext/ga_report/templates/ga_report/publisher/read.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/read.html
@@ -44,15 +44,15 @@
      <table py:if="c.top_packages" class="table table-condensed table-bordered table-striped">
 	 <tr>
 	   <th>Dataset</th>
-<!--	   <th>Visits</th> -->
 	   <th>Views</th>
+     <th>Downloads</th>
 	 </tr>
-        <py:for each="package, views, visits in c.top_packages">
+        <py:for each="package, views, visits,downloads in c.top_packages">
 	  <tr>
 	    <td>${h.link_to(package.title or package.name, h.url_for(controller='package', action='read', id=package.name))}
 	    </td>
-<!--	    <td>${visits}</td> -->
 	    <td>${views}</td>
+      <td>${downloads}</td>
 	  </tr>
         </py:for>
      </table>

--- /dev/null
+++ b/ckanext/ga_report/templates/ga_report/site/downloads.html
@@ -1,1 +1,59 @@
+<html xmlns:py="http://genshi.edgewall.org/"
+  xmlns:i18n="http://genshi.edgewall.org/i18n"
+  xmlns:xi="http://www.w3.org/2001/XInclude"
+  py:strip="">
 
+  <xi:include href="../ga_util.html" />
+
+  <py:def function="page_title">Downloads</py:def>
+
+  <py:match path="primarysidebar">
+    <li py:if="c.downloads" class="widget-container boxed widget_text">
+      <h4>Download</h4>
+      <p><center>
+          <a class="btn button btn-primary" href="${h.url_for(controller='ckanext.ga_report.controller:GaReport',action='csv_downloads',month=c.month or 'all')}">Download as CSV</a></center>
+      </p>
+    </li>
+    <xi:include href="../notes.html" />
+
+  </py:match>
+
+  <div py:match="content">
+      <h1>Downloads</h1>
+      ${usage_nav('Downloads')}
+
+      <form class="form-inline" action="${h.url_for(controller='ckanext.ga_report.controller:GaReport',action='downloads')}" method="get">
+          <div class="controls">
+
+          ${month_selector(c.month, c.months, c.day)}
+
+           <input class="btn button btn-primary" type='submit' value="Update"/>
+          </div>
+       </form>
+
+       <py:if test="c.downloads">
+         ${downloads_table(c.downloads)}
+       </py:if>
+       <py:if test="not c.downloads">
+         <h4>No data</h4>
+         <p>There is no download data available for this month</p>
+       </py:if>
+  </div>
+
+  <xi:include href="../../layout.html" />
+
+  <py:def function="optional_footer">
+    <script type='text/javascript'>
+        $('.dropdown-toggle').dropdown();
+        $('.nav-tabs li a').click(function (e) {
+          e.preventDefault();
+          $(this).tab('show');
+        })
+        alert(window.location.hash);
+    </script>
+  </py:def>
+</html>
+
+
+
+

file:a/setup.py -> file:b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@
         loadanalytics = ckanext.ga_report.command:LoadAnalytics
         initdb = ckanext.ga_report.command:InitDB
         getauthtoken = ckanext.ga_report.command:GetAuthToken
+        fixtimeperiods = ckanext.ga_report.command:FixTimePeriods
 	""",
 )