Makes sure we don't add (and log) when we can't find the dataset for popular, fixes #242
Makes sure we don't add (and log) when we can't find the dataset for popular, fixes #242

--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -23,7 +23,7 @@
         import ckan.model as model
         model.Session.remove()
         model.Session.configure(bind=model.meta.engine)
-        log = logging.getLogger('ckanext.ga-report')
+        log = logging.getLogger('ckanext.ga_report')
 
         import ga_model
         ga_model.init_tables()

--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -192,24 +192,17 @@
                 filter(GA_Stat.stat_name==k).\
                 order_by(GA_Stat.period_name)
             # Run the query on all months to gather graph data
-            series = {}
-            x_axis = set()
+            graph = {}
             for stat in q:
-                x_val = _get_unix_epoch(stat.period_name)
-                series[ stat.key ] = series.get(stat.key,{})
-                series[ stat.key ][x_val] = float(stat.value)
-                x_axis.add(x_val)
-            # Common x-axis for all series. Exclude this month (incomplete data)
-            x_axis = sorted(list(x_axis))[:-1]
-            # Buffer a rickshaw dataset from the series
-            def create_graph(series_name, series_data):
-                return { 
-                    'name':series_name, 
-                    'data':[ {'x':x,'y':series_data.get(x,0)} for x in x_axis ]
-                    }
-            rickshaw = [ create_graph(name,data) for name, data in series.items() ]
-            rickshaw = sorted(rickshaw,key=lambda x:x['data'][-1]['y'])
-            setattr(c, v+'_graph', json.dumps(rickshaw))
+                graph[ stat.key ] = graph.get(stat.key,{
+                    'name':stat.key, 
+                    'data': []
+                    })
+                graph[ stat.key ]['data'].append({
+                    'x':_get_unix_epoch(stat.period_name),
+                    'y':float(stat.value)
+                    })
+            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph.values(),percentageMode=True) ))
 
             # Buffer the tabular data
             if c.month:
@@ -253,7 +246,9 @@
         writer = csv.writer(response)
         writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"])
 
-        for publisher,view,visit in _get_top_publishers(None):
+        top_publishers, top_publishers_graph = _get_top_publishers(None)
+
+        for publisher,view,visit in top_publishers:
             writer.writerow([publisher.title.encode('utf-8'),
                              publisher.name.encode('utf-8'),
                              view,
@@ -302,7 +297,9 @@
         if c.month:
             c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
 
-        c.top_publishers = _get_top_publishers()
+        c.top_publishers, graph_data = _get_top_publishers()
+        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data.values()) )
+
         return render('ga_report/publisher/index.html')
 
     def _get_packages(self, publisher=None, count=-1):
@@ -334,8 +331,9 @@
                         filter(GA_Stat.key==package.name)
                     if month != 'All':  # Fetch everything unless the month is specific
                         dls = dls.filter(GA_Stat.period_name==month)
-
-                    downloads = sum(int(d.value) for d in dls.all())
+                    downloads = 0
+                    for x in dls:
+                        downloads += int(x.value)
                 else:
                     downloads = 'No data'
                 top_packages.append((package, entry.pageviews, entry.visits, downloads))
@@ -386,7 +384,78 @@
 
         c.top_packages = self._get_packages(c.publisher, 20)
 
+        # Graph query
+        top_package_names = [ x[0].name for x in c.top_packages ]
+        graph_query = model.Session.query(GA_Url,model.Package)\
+            .filter(model.Package.name==GA_Url.package_id)\
+            .filter(GA_Url.url.like('/dataset/%'))\
+            .filter(GA_Url.package_id.in_(top_package_names))
+        graph_data = {}
+        for entry,package in graph_query:
+            if not package: continue
+            if entry.period_name=='All': continue
+            graph_data[package.id] = graph_data.get(package.id,{
+                'name':package.title,
+                'data':[]
+                })
+            graph_data[package.id]['data'].append({
+                'x':_get_unix_epoch(entry.period_name),
+                'y':int(entry.pageviews),
+                })
+                    
+        c.graph_data = json.dumps( _to_rickshaw(graph_data.values()) )
+
         return render('ga_report/publisher/read.html')
+
+def _to_rickshaw(data, percentageMode=False):
+    if data==[]:
+        return data
+    # Create a consistent x-axis
+    num_points = [ len(package['data']) for package in data ]
+    ideal_index = num_points.index( max(num_points) )
+    x_axis = [ point['x'] for point in data[ideal_index]['data'] ]
+    for package in data:
+        xs = [ point['x'] for point in package['data'] ]
+        assert set(xs).issubset( set(x_axis) ), (xs, x_axis)
+        # Zero pad any missing values
+        for x in set(x_axis).difference(set(xs)):
+            package['data'].append( {'x':x, 'y':0} )
+        assert len(package['data'])==len(x_axis), (len(package['data']),len(x_axis),package['data'],x_axis,set(x_axis).difference(set(xs)))
+    if percentageMode:
+        # Transform data into percentage stacks
+        totals = {}
+        for x in x_axis:
+            for package in data:
+                for point in package['data']:
+                    totals[ point['x'] ] = totals.get(point['x'],0) + point['y']
+        # Roll insignificant series into a catch-all
+        THRESHOLD = 0.01
+        significant_series = []
+        for package in data:
+            for point in package['data']:
+                fraction = float(point['y']) / totals[point['x']]
+                if fraction>THRESHOLD and not (package in significant_series):
+                    significant_series.append(package)
+        temp = {}
+        for package in data:
+            if package in significant_series: continue
+            for point in package['data']:
+                temp[point['x']] = temp.get(point['x'],0) + point['y']
+        catch_all = { 'name':'Other','data': [ {'x':x,'y':y} for x,y in temp.items() ] }
+        # Roll insignificant series into one
+        data = significant_series
+        data.append(catch_all)
+        # Turn each point into a percentage
+        for package in data:
+            for point in package['data']:
+                point['y'] = (point['y']*100) / totals[point['x']]
+    # Sort the points
+    for package in data:
+        package['data'] = sorted( package['data'], key=lambda x:x['x'] )
+        # Strip the latest month's incomplete analytics
+        package['data'] = package['data'][:-1]
+    return data
+
 
 def _get_top_publishers(limit=20):
     '''
@@ -409,11 +478,35 @@
 
     top_publishers = []
     res = connection.execute(q, month)
+    department_ids = []
     for row in res:
         g = model.Group.get(row[0])
         if g:
+            department_ids.append(row[0])
             top_publishers.append((g, row[1], row[2]))
-    return top_publishers
+
+    graph = {}
+    if limit is not None:
+        # Query for a history graph of these publishers
+        q = model.Session.query(
+                GA_Url.department_id, 
+                GA_Url.period_name, 
+                func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\
+            .filter( GA_Url.department_id.in_(department_ids) )\
+            .filter( GA_Url.period_name!='All' )\
+            .filter( GA_Url.url.like('/dataset/%') )\
+            .filter( GA_Url.package_id!='' )\
+            .group_by( GA_Url.department_id, GA_Url.period_name )
+        for dept_id,period_name,views in q:
+            graph[dept_id] = graph.get( dept_id, {
+                'name' : model.Group.get(dept_id).title,
+                'data' : []
+                })
+            graph[dept_id]['data'].append({
+                'x': _get_unix_epoch(period_name),
+                'y': views
+                })
+    return top_publishers, graph
 
 
 def _get_publishers():

--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -32,6 +32,11 @@
         first_of_this_month = datetime.datetime(date.year, date.month, 1)
         _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month))
         last_of_this_month =  datetime.datetime(date.year, date.month, last_day_of_month)
+        # if this is the latest month, note that it is only up until today
+        now = datetime.datetime.now()
+        if now.year == date.year and now.month == date.month:
+            last_day_of_month = now.day
+            last_of_this_month = now
         periods = ((date.strftime(FORMAT_MONTH),
                     last_day_of_month,
                     first_of_this_month, last_of_this_month),)
@@ -126,7 +131,7 @@
                 # Make sure the All records are correct.
                 ga_model.post_update_url_stats()
 
-                log.info('Aggregating datasets by publisher')
+                log.info('Associating datasets with their publisher')
                 ga_model.update_publisher_stats(period_name) # about 30 seconds.
 
 
@@ -298,7 +303,7 @@
 
 
     def _download_stats(self, start_date, end_date, period_name, period_complete_day):
-        """ Fetches stats about language and country """
+        """ Fetches stats about data downloads """
         import ckan.model as model
 
         data = {}
@@ -320,7 +325,14 @@
             return
 
         def process_result_data(result_data, cached=False):
+            progress_total = len(result_data)
+            progress_count = 0
+            resources_not_matched = []
             for result in result_data:
+                progress_count += 1
+                if progress_count % 100 == 0:
+                    log.debug('.. %d/%d done so far', progress_count, progress_total)
+
                 url = result[0].strip()
 
                 # Get package id associated with the resource that has this URL.
@@ -334,9 +346,13 @@
                 if package_name:
                     data[package_name] = data.get(package_name, 0) + int(result[1])
                 else:
-                    log.warning(u"Could not find resource for URL: {url}".format(url=url))
+                    resources_not_matched.append(url)
                     continue
-
+            if resources_not_matched:
+                log.debug('Could not match %i of %i resource URLs to datasets. e.g. %r',
+                          len(resources_not_matched), progress_total, resources_not_matched[:3])
+
+        log.info('Associating downloads of resource URLs with their respective datasets')
         process_result_data(results.get('rows'))
 
         results = self.service.data().ga().get(
@@ -348,6 +364,7 @@
                                  dimensions="ga:eventLabel",
                                  max_results=10000,
                                  end_date=end_date).execute()
+        log.info('Associating downloads of cache resource URLs with their respective datasets')
         process_result_data(results.get('rows'), cached=False)
 
         self._filter_out_long_tail(data, MIN_DOWNLOADS)

--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -161,20 +161,20 @@
 
 
 def pre_update_url_stats(period_name):
-    log.debug("Deleting '%s' records" % period_name)
-    model.Session.query(GA_Url).\
-            filter(GA_Url.period_name==period_name).delete()
-
-    count = model.Session.query(GA_Url).\
-            filter(GA_Url.period_name == 'All').count()
-    log.debug("Deleting %d 'All' records" % count)
-    count = model.Session.query(GA_Url).\
-            filter(GA_Url.period_name == 'All').delete()
-    log.debug("Deleted %d 'All' records" % count)
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name==period_name)
+    log.debug("Deleting %d '%s' records" % (q.count(), period_name))
+    q.delete()
+
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name == 'All')
+    log.debug("Deleting %d 'All' records..." % q.count())
+    q.delete()
 
     model.Session.flush()
     model.Session.commit()
     model.repo.commit_and_remove()
+    log.debug('...done')
 
 def post_update_url_stats():
 
@@ -185,6 +185,7 @@
         record regardless of whether the URL has an entry for
         the month being currently processed.
     """
+    log.debug('Post-processing "All" records...')
     query = """select url, pageviews::int, visits::int
                from ga_url
                where url not in (select url from ga_url where period_name ='All')"""
@@ -197,7 +198,13 @@
         views[row[0]] = views.get(row[0], 0) + row[1]
         visits[row[0]] = visits.get(row[0], 0) + row[2]
 
+    progress_total = len(views.keys())
+    progress_count = 0
     for key in views.keys():
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
         package, publisher = _get_package_and_publisher(key)
 
         values = {'id': make_uuid(),
@@ -211,6 +218,7 @@
                   }
         model.Session.add(GA_Url(**values))
     model.Session.commit()
+    log.debug('..done')
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
@@ -219,9 +227,14 @@
     stores them in GA_Url under the period and recalculates the totals for
     the 'All' period.
     '''
+    progress_total = len(progress_data)
+    progress_count = 0
     for url, views, visits in url_data:
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
         package, publisher = _get_package_and_publisher(url)
-
 
         item = model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).\

--- a/ckanext/ga_report/helpers.py
+++ b/ckanext/ga_report/helpers.py
@@ -106,6 +106,10 @@
     for entry in entries:
         if len(datasets) < count:
             p = model.Package.get(entry.url[len('/dataset/'):])
+            if not p:
+                _log.warning("Could not find Package for {url}".format(url=entry.url))
+                continue
+
             if not p in datasets:
                 datasets[p] = {'views':0, 'visits': 0}
             datasets[p]['views'] = datasets[p]['views'] + int(entry.pageviews)

--- a/ckanext/ga_report/public/css/ga_report.css
+++ b/ckanext/ga_report/public/css/ga_report.css
@@ -5,7 +5,7 @@
 }
 .rickshaw_chart_container {
   position: relative;
-  height: 300px;
+  height: 350px;
   margin: 0 auto 20px auto;
 }
 .rickshaw_chart {
@@ -20,9 +20,12 @@
   right: 0;
   top: 0;
   margin-left: 15px;
+  padding: 0 5px;
   background: transparent;
   max-width: 150px;
   overflow: hidden;
+  background: rgba(0,0,0,0.05);
+  border-radius:5px;
 }
 .rickshaw_y_axis {
   position: absolute;
@@ -30,4 +33,9 @@
   bottom: 0;
   width: 40px;
 }
+.rickshaw_legend .label {
+  background: transparent !important;
+  color: #000000 !important;
+  font-weight: normal !important;
+}
 

--- /dev/null
+++ b/ckanext/ga_report/public/scripts/ckanext_ga_reports.js
@@ -1,1 +1,42 @@
 
+var CKAN = CKAN || {};
+CKAN.GA_Reports = {};
+
+CKAN.GA_Reports.render_rickshaw = function( css_name, data, mode, colorscheme ) {
+    var palette = new Rickshaw.Color.Palette( { scheme: colorscheme } );
+    $.each(data, function(i, object) {
+        object['color'] = palette.color();
+    });
+
+    var graphElement =  document.querySelector("#chart_"+css_name);
+
+    var graph = new Rickshaw.Graph( {
+        element: document.querySelector("#chart_"+css_name),
+        renderer: mode,
+        series: data ,
+        height: 328
+    });
+    var x_axis = new Rickshaw.Graph.Axis.Time( { graph: graph } );
+    var y_axis = new Rickshaw.Graph.Axis.Y( {
+        graph: graph,
+        orientation: 'left',
+        tickFormat: Rickshaw.Fixtures.Number.formatKMBT,
+        element: document.getElementById('y_axis_'+css_name),
+    } );
+    var legend = new Rickshaw.Graph.Legend( {
+        element: document.querySelector('#legend_'+css_name),
+        graph: graph
+    } );
+    var hoverDetail = new Rickshaw.Graph.HoverDetail( {
+      graph: graph,
+      formatter: function(series, x, y) {
+        var date = '<span class="date">' + new Date(x * 1000).toUTCString() + '</span>';
+        var swatch = '<span class="detail_swatch" style="background-color: ' + series.color + '"></span>';
+        var content = swatch + series.name + ": " + parseInt(y) + '<br>' + date;
+        return content;
+      }
+    } );
+    graph.render();
+};
+
+

--- a/ckanext/ga_report/templates/ga_report/ga_util.html
+++ b/ckanext/ga_report/templates/ga_report/ga_util.html
@@ -30,36 +30,14 @@
  </table>
 
 
-<div py:def="rickshaw_graph(items_json,id,debug=False)">
+<div py:def="rickshaw_graph(items_json,id,mode='line',colorscheme='munin')">
   <div id="chart_container_$id" class="rickshaw_chart_container">
     <div id="y_axis_$id" class="rickshaw_y_axis"></div>
     <div id="chart_$id" class="rickshaw_chart"></div>
     <div id="legend_$id" class="rickshaw_legend"></div>
     <script type="text/javascript">
       $(function() {
-          var series = ${items_json};
-          <py:if test="debug">console.log(series);</py:if>
-          var palette = new Rickshaw.Color.Palette( { scheme: 'spectrum2001' } );
-          $.each(series, function(i, object) {
-              object['color'] = palette.color();
-          });
-          var graph = new Rickshaw.Graph( {
-              element: document.querySelector("#chart_$id"),
-              renderer: 'line',
-              series: series 
-          });
-          var x_axis = new Rickshaw.Graph.Axis.Time( { graph: graph } );
-          var y_axis = new Rickshaw.Graph.Axis.Y( {
-              graph: graph,
-              orientation: 'left',
-              tickFormat: Rickshaw.Fixtures.Number.formatKMBT,
-              element: document.getElementById('y_axis_$id'),
-          } );
-          var legend = new Rickshaw.Graph.Legend( {
-              element: document.querySelector('#legend_$id'),
-              graph: graph
-          } );
-          graph.render();
+          CKAN.GA_Reports.render_rickshaw('$id', $items_json, '$mode', '$colorscheme');
       });
     </script>
   </div>

--- a/ckanext/ga_report/templates/ga_report/notes.html
+++ b/ckanext/ga_report/templates/ga_report/notes.html
@@ -7,7 +7,7 @@
       <h4>Notes</h4>
       <ul>
           <li>"Views" is the number of times a page was loaded in users' browsers.</li>
-          <li>"Downloads" is the number of times a user has clicked to download either an original or cached resource for a particular dataset since December 2012</li>
+          <li>"Downloads" is the number of times a user has clicked to download either an original or cached resource for a particular dataset. Download information is only available from 2nd December 2012; 'No data' is shown for records before that date.</li>
           <li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li>
           <li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li>
       </ul>

--- a/ckanext/ga_report/templates/ga_report/publisher/index.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/index.html
@@ -20,6 +20,7 @@
   <py:def function="optional_head">
     <link rel="stylesheet" type="text/css" href="/scripts/vendor/rickshaw.min.css"/>
     <link rel="stylesheet" type="text/css" href="/css/ga_report.css"/>
+    <script type="text/javascript" src="/scripts/ckanext_ga_reports.js"></script>
     <script type="text/javascript" src="/scripts/vendor/jquery.sparkline.modified.js"></script>
     <script type="text/javascript" src="/scripts/vendor/d3.v2.js"></script>
     <script type="text/javascript" src="/scripts/vendor/d3.layout.min.js"></script>
@@ -42,7 +43,7 @@
           </div>
        </form>
 
-       ${rickshaw_graph('[{name:"test series",data:[{x:1,y:200},{x:3,y:300},{x:5,y:100}]}]','dataset-downloads',debug=True)}
+       ${rickshaw_graph(c.top_publishers_graph,'publishers')}
        <table class="table table-condensed table-bordered table-striped">
 	 <tr>
 	   <th>Publisher</th>

--- a/ckanext/ga_report/templates/ga_report/publisher/read.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/read.html
@@ -10,6 +10,7 @@
   <py:def function="optional_head">
     <link rel="stylesheet" type="text/css" href="/scripts/vendor/rickshaw.min.css"/>
     <link rel="stylesheet" type="text/css" href="/css/ga_report.css"/>
+    <script type="text/javascript" src="/scripts/ckanext_ga_reports.js"></script>
     <script type="text/javascript" src="/scripts/vendor/jquery.sparkline.modified.js"></script>
     <script type="text/javascript" src="/scripts/vendor/d3.v2.js"></script>
     <script type="text/javascript" src="/scripts/vendor/d3.layout.min.js"></script>
@@ -52,7 +53,7 @@
      <p py:if="not c.top_packages">No page views in this period</p>
 
      <py:if test="c.top_packages">
-       ${rickshaw_graph('[{name:"test series",data:[{x:1,y:200},{x:3,y:300},{x:5,y:100}]}]','dataset-downloads',debug=True)}
+       ${rickshaw_graph(c.graph_data,'dataset-downloads')}
        <table class="table table-condensed table-bordered table-striped">
          <tr>
            <th>Dataset</th>

--- a/ckanext/ga_report/templates/ga_report/site/index.html
+++ b/ckanext/ga_report/templates/ga_report/site/index.html
@@ -10,6 +10,7 @@
   <py:def function="optional_head">
     <link rel="stylesheet" type="text/css" href="/scripts/vendor/rickshaw.min.css"/>
     <link rel="stylesheet" type="text/css" href="/css/ga_report.css"/>
+    <script type="text/javascript" src="/scripts/ckanext_ga_reports.js"></script>
     <script type="text/javascript" src="/scripts/vendor/jquery.sparkline.modified.js"></script>
     <script type="text/javascript" src="/scripts/vendor/d3.v2.js"></script>
     <script type="text/javascript" src="/scripts/vendor/d3.layout.min.js"></script>
@@ -94,20 +95,20 @@
                </table>
         </div>
          <div class="tab-pane" id="browsers_versions">
-             ${rickshaw_graph(c.browser_versions_graph,'browser-versions')}
+             ${rickshaw_graph(c.browser_versions_graph,'browser-versions',mode='stack')}
              <p>Note: Where a browser has a large number of versions, these have been grouped together.</p>
              ${stat_table(c.browser_versions)}
          </div>
          <div class="tab-pane" id="browsers_names">
-             ${rickshaw_graph(c.browsers_graph,'browsers')}
+             ${rickshaw_graph(c.browsers_graph,'browsers',mode='stack')}
              ${stat_table(c.browsers)}
          </div>
          <div class="tab-pane" id="os">
-             ${rickshaw_graph(c.os_graph,'os')}
+             ${rickshaw_graph(c.os_graph,'os',mode='stack')}
              ${stat_table(c.os)}
          </div>
          <div class="tab-pane" id="os_versions">
-             ${rickshaw_graph(c.os_versions_graph,'os_versions')}
+             ${rickshaw_graph(c.os_versions_graph,'os_versions',mode='stack')}
              ${stat_table(c.os_versions)}
          </div>
         <div class="tab-pane" id="social_referrals_totals">
@@ -115,16 +116,16 @@
             ${social_table(c.social_referrer_totals)}
         </div>
         <div class="tab-pane" id="social_networks">
-             ${rickshaw_graph(c.social_networks_graph, 'social_networks')}
+             ${rickshaw_graph(c.social_networks_graph, 'social_networks',mode='stack')}
             <p>Percentage of visits that were referred from these social networks</p>
              ${stat_table(c.social_networks, 'Visits')}
         </div>
         <div class="tab-pane" id="languages">
-             ${rickshaw_graph(c.languages_graph,'languages')}
+             ${rickshaw_graph(c.languages_graph,'languages',mode='stack')}
              ${stat_table(c.languages)}
         </div>
         <div class="tab-pane" id="country">
-             ${rickshaw_graph(c.country_graph,'country')}
+             ${rickshaw_graph(c.country_graph,'country',mode='stack')}
              ${stat_table(c.country)}
         </div>
        </div>