Stacked graphs with sane server-side data wrangling working. Percentage mode half-impemented.
[ckanext-ga-report.git] / ckanext / ga_report / controller.py
blob:a/ckanext/ga_report/controller.py -> blob:b/ckanext/ga_report/controller.py
--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -191,25 +191,11 @@
             q = model.Session.query(GA_Stat).\
                 filter(GA_Stat.stat_name==k).\
                 order_by(GA_Stat.period_name)
-            # Run the query on all months to gather graph data
-            graph = {}
-            for stat in q:
-                graph[ stat.key ] = graph.get(stat.key,{
-                    'name':stat.key, 
-                    'data': []
-                    })
-                graph[ stat.key ]['data'].append({
-                    'x':_get_unix_epoch(stat.period_name),
-                    'y':float(stat.value)
-                    })
-            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph.values(),percentageMode=True) ))
-
             # Buffer the tabular data
             if c.month:
                 entries = []
                 q = q.filter(GA_Stat.period_name==c.month).\
                           order_by('ga_stat.value::int desc')
-
             d = collections.defaultdict(int)
             for e in q.all():
                 d[e.key] += int(e.value)
@@ -218,6 +204,23 @@
                 entries.append((key,val,))
             entries = sorted(entries, key=operator.itemgetter(1), reverse=True)
 
+            # Run a query on all months to gather graph data
+            graph_query = model.Session.query(GA_Stat).\
+                filter(GA_Stat.stat_name==k).\
+                order_by(GA_Stat.period_name)
+            graph_dict = {}
+            for stat in graph_query:
+                graph_dict[ stat.key ] = graph_dict.get(stat.key,{
+                    'name':stat.key, 
+                    'data': []
+                    })
+                graph_dict[ stat.key ]['data'].append({
+                    'x':_get_unix_epoch(stat.period_name),
+                    'y':float(stat.value)
+                    })
+            graph = [ graph_dict[x[0]] for x in entries ]
+            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph,percentageMode=True) ))
+
             # Get the total for each set of values and then set the value as
             # a percentage of the total
             if k == 'Social sources':
@@ -298,7 +301,7 @@
             c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
 
         c.top_publishers, graph_data = _get_top_publishers()
-        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data.values()) )
+        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data) )
 
         return render('ga_report/publisher/index.html')
 
@@ -390,64 +393,73 @@
             .filter(model.Package.name==GA_Url.package_id)\
             .filter(GA_Url.url.like('/dataset/%'))\
             .filter(GA_Url.package_id.in_(top_package_names))
-        graph_data = {}
+        graph_dict = {}
         for entry,package in graph_query:
             if not package: continue
             if entry.period_name=='All': continue
-            graph_data[package.id] = graph_data.get(package.id,{
+            graph_dict[package.name] = graph_dict.get(package.name,{
                 'name':package.title,
                 'data':[]
                 })
-            graph_data[package.id]['data'].append({
+            graph_dict[package.name]['data'].append({
                 'x':_get_unix_epoch(entry.period_name),
                 'y':int(entry.pageviews),
                 })
-                    
-        c.graph_data = json.dumps( _to_rickshaw(graph_data.values()) )
+        graph = [ graph_dict[x] for x in top_package_names ]
+
+        c.graph_data = json.dumps( _to_rickshaw(graph) )
 
         return render('ga_report/publisher/read.html')
 
 def _to_rickshaw(data, percentageMode=False):
     if data==[]:
         return data
-    # Create a consistent x-axis
-    num_points = [ len(package['data']) for package in data ]
+    # Create a consistent x-axis between all series
+    num_points = [ len(series['data']) for series in data ]
     ideal_index = num_points.index( max(num_points) )
     x_axis = [ point['x'] for point in data[ideal_index]['data'] ]
-    for package in data:
-        xs = [ point['x'] for point in package['data'] ]
+    for series in data:
+        xs = [ point['x'] for point in series['data'] ]
         assert set(xs).issubset( set(x_axis) ), (xs, x_axis)
         # Zero pad any missing values
         for x in set(x_axis).difference(set(xs)):
-            package['data'].append( {'x':x, 'y':0} )
-        assert len(package['data'])==len(x_axis), (len(package['data']),len(x_axis),package['data'],x_axis,set(x_axis).difference(set(xs)))
+            series['data'].append( {'x':x, 'y':0} )
     if percentageMode:
+        def get_totals(series_list):
+            totals = {}
+            for series in series_list:
+                for point in series['data']:
+                    totals[point['x']] = totals.get(point['x'],0) + point['y']
+            lengths = [ len(series['data']) for series in series_list ]
+            assert len(set(lengths))==1
+            assert lengths[0] == len(totals)
+            return totals
         # Transform data into percentage stacks
-        totals = {}
-        for x in x_axis:
-            for package in data:
-                for point in package['data']:
-                    totals[ point['x'] ] = totals.get(point['x'],0) + point['y']
+        totals = get_totals(data)
         # Roll insignificant series into a catch-all
         THRESHOLD = 0.01
-        significant_series = []
-        for package in data:
-            for point in package['data']:
+        raw_data = data
+        data = []
+        for series in raw_data:
+            for point in series['data']:
                 fraction = float(point['y']) / totals[point['x']]
-                if fraction>THRESHOLD and not (package in significant_series):
-                    significant_series.append(package)
-        temp = {}
-        for package in data:
-            if package in significant_series: continue
-            for point in package['data']:
-                temp[point['x']] = temp.get(point['x'],0) + point['y']
-        catch_all = { 'name':'Other','data': [ {'x':x,'y':y} for x,y in temp.items() ] }
-        # Roll insignificant series into one
-        data = significant_series
-        data.append(catch_all)
+                if not (series in data) and fraction>THRESHOLD:
+                    data.append(series)
+        # Overwrite data with a set of intereting series
+        others = [ x for x in raw_data if not (x in data) ]
+        data.append({ 
+            'name':'Other',
+            'data': [ {'x':x,'y':y} for x,y in get_totals(others).items() ] 
+            })
+        # Turn each point into a percentage
+        for series in data:
+            for point in series['data']:
+                point['y'] = (point['y']*100) / totals[point['x']]
     # Sort the points
-    for package in data:
-        package['data'] = sorted( package['data'], key=lambda x:x['x'] )
+    for series in data:
+        series['data'] = sorted( series['data'], key=lambda x:x['x'] )
+        # Strip the latest month's incomplete analytics
+        series['data'] = series['data'][:-1]
     return data
 
 
@@ -479,7 +491,7 @@
             department_ids.append(row[0])
             top_publishers.append((g, row[1], row[2]))
 
-    graph = {}
+    graph = []
     if limit is not None:
         # Query for a history graph of these publishers
         q = model.Session.query(
@@ -491,15 +503,19 @@
             .filter( GA_Url.url.like('/dataset/%') )\
             .filter( GA_Url.package_id!='' )\
             .group_by( GA_Url.department_id, GA_Url.period_name )
+        graph_dict = {}
         for dept_id,period_name,views in q:
-            graph[dept_id] = graph.get( dept_id, {
+            graph_dict[dept_id] = graph_dict.get( dept_id, {
                 'name' : model.Group.get(dept_id).title,
                 'data' : []
                 })
-            graph[dept_id]['data'].append({
+            graph_dict[dept_id]['data'].append({
                 'x': _get_unix_epoch(period_name),
                 'y': views
                 })
+        # Sort dict into ordered list
+        for id in department_ids:
+            graph.append( graph_dict[id] )
     return top_publishers, graph