Implements downloads counts (for dataset resources) and fixes an issue with 'All' records.
Implements downloads counts (for dataset resources) and fixes an issue with 'All' records.

- Fetches the download events (for either the original or the cached copy of a resource)
recorded when a user clicks a Download button. We look up the resource for each url (based
on .url or .cache_url) and then associate the download count with the package it belongs to.
- Fixes a bug (#211) where the All records are deleted for every url, even if that url was
not fetched (and therefore won't get a new All record).

--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -55,6 +55,36 @@
         init_service('token.dat',
                       self.args[0] if self.args
                                    else 'credentials.json')
+
+class FixTimePeriods(CkanCommand):
+    """Fixes the 'All' records for GA_Urls
+
+    It is possible that older urls that haven't recently been visited
+    do not have All records.  This command will traverse through those
+    records and generate valid All records for them.
+    """
+    # summary is the docstring's first line, so that line must not be empty.
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    max_args = 0
+    min_args = 0
+
+    def __init__(self, name):
+        super(FixTimePeriods, self).__init__(name)
+
+    def command(self):
+        import ckan.model as model
+        from ga_model import post_update_url_stats
+        self._load_config()
+        model.Session.remove()
+        model.Session.configure(bind=model.meta.engine)
+
+        log = logging.getLogger('ckanext.ga_report')
+
+        log.info("Updating 'All' records for old URLs")
+        post_update_url_stats()
+        log.info("Processing complete")
+
 
 
 class LoadAnalytics(CkanCommand):

--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -13,6 +13,7 @@
 
 log = logging.getLogger('ckanext.ga-report')
 
+DOWNLOADS_AVAILABLE_FROM = '2012-12'
 
 def _get_month_name(strdate):
     import calendar
@@ -38,6 +39,7 @@
         q=  q.filter(cls.stat_name==stat_key)
 
     vals = q.order_by("period_name desc").all()
+
     if vals and vals[0][1]:
         day = int(vals[0][1])
         ordinal = 'th' if 11 <= day <= 13 \
@@ -69,25 +71,6 @@
         for entry in entries:
             writer.writerow([entry.period_name.encode('utf-8'),
                              entry.stat_name.encode('utf-8'),
-                             entry.key.encode('utf-8'),
-                             entry.value.encode('utf-8')])
-
-    def csv_downloads(self, month):
-        import csv
-
-        q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name=='Downloads')
-        if month != 'all':
-            q = q.filter(GA_Stat.period_name==month)
-        entries = q.order_by('GA_Stat.period_name, GA_Stat.key').all()
-
-        response.headers['Content-Type'] = "text/csv; charset=utf-8"
-        response.headers['Content-Disposition'] = str('attachment; filename=downloads_%s.csv' % (month,))
-
-        writer = csv.writer(response)
-        writer.writerow(["Period", "Resource URL", "Count"])
-
-        for entry in entries:
-            writer.writerow([entry.period_name.encode('utf-8'),
                              entry.key.encode('utf-8'),
                              entry.value.encode('utf-8')])
 
@@ -202,35 +185,6 @@
 
         return render('ga_report/site/index.html')
 
-    def downloads(self):
-
-        # Get the month details by fetching distinct values and determining the
-        # month names from the values.
-        c.months, c.day = _month_details(GA_Stat, "Downloads")
-
-        # Work out which month to show, based on query params of the first item
-        c.month_desc = 'all months'
-        c.month = request.params.get('month', '')
-        if c.month:
-            c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
-
-        c.downloads = []
-        q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name=='Downloads')
-        q = q.filter(GA_Stat.period_name==c.month) if c.month else q
-        q = q.order_by("ga_stat.value::int desc")
-
-        data = collections.defaultdict(int)
-        for entry in q.all():
-            r = model.Session.query(model.Resource).filter(model.Resource.url==entry.key).first()
-            if not r:
-                continue
-            data[r] += int(entry.value)
-
-        c.downloads = [(k,v,) for k,v in data.iteritems()]
-        c.downloads = sorted(c.downloads, key=operator.itemgetter(1), reverse=True)
-
-        return render('ga_report/site/downloads.html')
-
 
 class GaDatasetReport(BaseController):
     """
@@ -275,13 +229,14 @@
             str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,))
 
         writer = csv.writer(response)
-        writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"])
-
-        for package,view,visit in packages:
+        writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"])
+
+        for package,view,visit,downloads in packages:
             writer.writerow([package.title.encode('utf-8'),
                              package.name.encode('utf-8'),
                              view,
                              visit,
+                             downloads,
                              month])
 
     def publishers(self):
@@ -302,10 +257,10 @@
 
     def _get_packages(self, publisher=None, count=-1):
         '''Returns the datasets in order of views'''
-        if count == -1:
-            count = sys.maxint
-
+        have_download_data = True
         month = c.month or 'All'
+        if month != 'All':
+            have_download_data = month >= DOWNLOADS_AVAILABLE_FROM
 
         q = model.Session.query(GA_Url,model.Package)\
             .filter(model.Package.name==GA_Url.package_id)\
@@ -315,9 +270,25 @@
         q = q.filter(GA_Url.period_name==month)
         q = q.order_by('ga_url.pageviews::int desc')
         top_packages = []
-        for entry,package in q.limit(count):
+        if count == -1:
+            entries = q.all()
+        else:
+            entries = q.limit(count)
+
+        for entry,package in entries:
             if package:
-                top_packages.append((package, entry.pageviews, entry.visits))
+                # Downloads ....
+                if have_download_data:
+                    dls = model.Session.query(GA_Stat).\
+                        filter(GA_Stat.stat_name=='Downloads').\
+                        filter(GA_Stat.key==package.name)
+                    if month != 'All':  # Fetch everything unless the month is specific
+                        dls = dls.filter(GA_Stat.period_name==month)
+
+                    downloads = sum(int(d.value) for d in dls.all())
+                else:
+                    downloads = 'No data'
+                top_packages.append((package, entry.pageviews, entry.visits, downloads))
             else:
                 log.warning('Could not find package associated package')
 

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -123,8 +123,12 @@
                 log.info('Storing publisher views (%i rows)', len(data.get('url')))
                 self.store(period_name, period_complete_day, data,)
 
+                # Make sure the All records are correct.
+                ga_model.post_update_url_stats()
+
                 log.info('Aggregating datasets by publisher')
                 ga_model.update_publisher_stats(period_name) # about 30 seconds.
+
 
             log.info('Downloading and storing analytics for site-wide stats')
             self.sitewide_stats( period_name, period_complete_day )
@@ -180,6 +184,7 @@
                                  end_date=end_date).execute()
 
         packages = []
+        log.info("There are %d results" % results['totalResults'])
         for entry in results.get('rows'):
             (loc,pageviews,visits) = entry
             url = _normalize_url('http:/' + loc) # strips off domain e.g. www.data.gov.uk or data.gov.uk
@@ -294,6 +299,10 @@
 
     def _download_stats(self, start_date, end_date, period_name, period_complete_day):
         """ Fetches stats about language and country """
+        import ckan.model as model
+
+        data = {}
+
         results = self.service.data().ga().get(
                                  ids='ga:' + self.profile_id,
                                  start_date=start_date,
@@ -310,10 +319,37 @@
             log.info("There is no download data for this time period")
             return
 
-        # [[url, count], [url],count]
-        data = {}
-        for result in result_data:
-            data[result[0]] = data.get(result[0], 0) + int(result[1])
+        def process_result_data(result_data, cached=False):
+            # Adds each row's event count to data[] under its package name.
+            for result in result_data or []:  # .get('rows') may be None
+                url = result[0].strip()
+
+                # Get package id associated with the resource that has this URL.
+                q = model.Session.query(model.Resource)
+                if cached:
+                    r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first()
+                else:
+                    r = q.filter(model.Resource.url.like("%s%%" % url)).first()
+
+                package_name = r.resource_group.package.name if r else ""
+                if package_name:
+                    data[package_name] = data.get(package_name, 0) + int(result[1])
+                else:
+                    log.warning(u"Could not find resource for URL: {url}".format(url=url))
+
+        process_result_data(results.get('rows'))
+        # Second pass: 'download-cache' events, matched on Resource.cache_url.
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 filters='ga:eventAction==download-cache',
+                                 metrics='ga:totalEvents',
+                                 sort='-ga:totalEvents',
+                                 dimensions="ga:eventLabel",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        process_result_data(results.get('rows'), cached=True)
+
         self._filter_out_long_tail(data, MIN_DOWNLOADS)
         ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day)
 

--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -175,6 +175,42 @@
     model.Session.flush()
     model.Session.commit()
     model.repo.commit_and_remove()
+
+def post_update_url_stats():
+    """ Check the distinct url field in ga_url and make sure
+        it has an All record.  If not then create one.
+
+        After running this then every URL should have an All
+        record regardless of whether the URL has an entry for
+        the month being currently processed.
+    """
+    query = """select url, pageviews::int, visits::int
+               from ga_url
+               where url not in (select url from ga_url where period_name ='All')"""
+    connection = model.Session.connection()
+    res = connection.execute(query)
+
+    views, visits = {}, {}
+    # Sum the per-period rows for each url; a row is (url, views, visits).
+    for row in res:
+        views[row[0]] = views.get(row[0], 0) + row[1]
+        visits[row[0]] = visits.get(row[0], 0) + row[2]
+
+    for key in views:
+        package, publisher = _get_package_and_publisher(key)
+
+        # package_id holds the package, not the publisher (reports match on it).
+        values = {'id': make_uuid(),
+                  'period_name': "All",
+                  'period_complete_day': 0,
+                  'url': key,
+                  'pageviews': views[key],
+                  'visits': visits[key],
+                  'department_id': publisher,
+                  'package_id': package
+                  }
+        model.Session.add(GA_Url(**values))
+    model.Session.commit()
 
 
 def update_url_stats(period_name, period_complete_day, url_data):

--- a/ckanext/ga_report/templates/ga_report/ga_util.html
+++ b/ckanext/ga_report/templates/ga_report/ga_util.html
@@ -44,23 +44,6 @@
  </table>
 
 
-<table py:def="downloads_table(items)" class="table table-condensed table-bordered table-striped">
-    <tr>
-        <th>Dataset and resource</th>
-        <th>Downloads</th>
-    </tr>
-    <py:for each="resource, value in items">
-    <tr>
-        <td>
-          <strong>${resource.resource_group.package.title}</strong><br/>
-          ${h.link_to((resource.name or resource.description).strip() or "No name", h.url_for(controller='package', action='resource_read', id=resource.resource_group.package.name, resource_id=resource.id))}<br/>
-        </td>
-        <td>${value}</td>
-    </tr>
-    </py:for>
- </table>
-
-
 <div py:def="usage_nav(active_name)" id="minornavigation">
     <div id="minornavigation-bg-left">
     <div id="minornavigation-bg-right">
@@ -72,9 +55,6 @@
         <li py:attrs="{'class': 'active' if active_name=='Datasets' else None}">
                 <a py:attrs="{'class': 'active' if active_name=='Datasets' else None}"  href="${h.url_for(controller='ckanext.ga_report.controller:GaDatasetReport',action='read')}"><img src="/images/icons/page_white.png" height="16px" width="16px" alt="None" class="inline-icon "/> Datasets</a>
         </li>
-        <li py:attrs="{'class': 'active' if active_name=='Downloads' else None}">
-                <a py:attrs="{'class': 'active' if active_name=='Downloads' else None}"  href="${h.url_for(controller='ckanext.ga_report.controller:GaReport',action='downloads')}"><img src="/images/icons/page_white.png" height="16px" width="16px" alt="None" class="inline-icon "/> Downloads</a>
-        </li>
       </ul>
     </div>
     </div>

--- a/ckanext/ga_report/templates/ga_report/notes.html
+++ b/ckanext/ga_report/templates/ga_report/notes.html
@@ -7,7 +7,7 @@
       <h4>Notes</h4>
       <ul>
           <li>"Views" is the number of times a page was loaded in users' browsers.</li>
-          <li>"Visits" is the number of unique user visits to a page, counted once for each visitor for each of their browsing sessions.</li>
+          <li>"Downloads" is the number of times a user has clicked to download either an original or cached resource for a particular dataset since December 2012</li>
           <li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li>
           <li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li>
       </ul>

--- a/ckanext/ga_report/templates/ga_report/publisher/read.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/read.html
@@ -44,15 +44,15 @@
      <table py:if="c.top_packages" class="table table-condensed table-bordered table-striped">
 	 <tr>
 	   <th>Dataset</th>
-<!--	   <th>Visits</th> -->
 	   <th>Views</th>
+     <th>Downloads</th>
 	 </tr>
-        <py:for each="package, views, visits in c.top_packages">
+        <py:for each="package, views, visits,downloads in c.top_packages">
 	  <tr>
 	    <td>${h.link_to(package.title or package.name, h.url_for(controller='package', action='read', id=package.name))}
 	    </td>
-<!--	    <td>${visits}</td> -->
 	    <td>${views}</td>
+      <td>${downloads}</td>
 	  </tr>
         </py:for>
      </table>

file:a/setup.py -> file:b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@
         loadanalytics = ckanext.ga_report.command:LoadAnalytics
         initdb = ckanext.ga_report.command:InitDB
         getauthtoken = ckanext.ga_report.command:GetAuthToken
+        fixtimeperiods = ckanext.ga_report.command:FixTimePeriods
 	""",
 )