#167 Refactoring controller's logic to create better Percentage graphs
[ckanext-ga-report.git] / ckanext / ga_report / controller.py
blob:a/ckanext/ga_report/controller.py -> blob:b/ckanext/ga_report/controller.py
--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -191,25 +191,11 @@
             q = model.Session.query(GA_Stat).\
                 filter(GA_Stat.stat_name==k).\
                 order_by(GA_Stat.period_name)
-            # Run the query on all months to gather graph data
-            graph = {}
-            for stat in q:
-                graph[ stat.key ] = graph.get(stat.key,{
-                    'name':stat.key, 
-                    'data': []
-                    })
-                graph[ stat.key ]['data'].append({
-                    'x':_get_unix_epoch(stat.period_name),
-                    'y':float(stat.value)
-                    })
-            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph.values(),percentageMode=True) ))
-
             # Buffer the tabular data
             if c.month:
                 entries = []
                 q = q.filter(GA_Stat.period_name==c.month).\
                           order_by('ga_stat.value::int desc')
-
             d = collections.defaultdict(int)
             for e in q.all():
                 d[e.key] += int(e.value)
@@ -218,6 +204,23 @@
                 entries.append((key,val,))
             entries = sorted(entries, key=operator.itemgetter(1), reverse=True)
 
+            # Run a query on all months to gather graph data
+            graph_query = model.Session.query(GA_Stat).\
+                filter(GA_Stat.stat_name==k).\
+                order_by(GA_Stat.period_name)
+            graph_dict = {}
+            for stat in graph_query:
+                graph_dict[ stat.key ] = graph_dict.get(stat.key,{
+                    'name':stat.key,
+                    'raw': {}
+                    })
+                graph_dict[ stat.key ]['raw'][stat.period_name] = float(stat.value)
+            stats_in_table = [x[0] for x in entries]
+            stats_not_in_table = set(graph_dict.keys()) - set(stats_in_table)
+            stats = stats_in_table + sorted(list(stats_not_in_table))
+            graph = [graph_dict[x] for x in stats]
+            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph,percentageMode=True) ))
+
             # Get the total for each set of values and then set the value as
             # a percentage of the total
             if k == 'Social sources':
@@ -246,7 +249,7 @@
         writer = csv.writer(response)
         writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"])
 
-        top_publishers, top_publishers_graph = _get_top_publishers(None)
+        top_publishers = _get_top_publishers(limit=None)
 
         for publisher,view,visit in top_publishers:
             writer.writerow([publisher.title.encode('utf-8'),
@@ -268,7 +271,7 @@
             if not c.publisher:
                 abort(404, 'A publisher with that name could not be found')
 
-        packages = self._get_packages(c.publisher)
+        packages = self._get_packages(publisher=c.publisher, month=c.month)
         response.headers['Content-Type'] = "text/csv; charset=utf-8"
         response.headers['Content-Disposition'] = \
             str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,))
@@ -297,15 +300,18 @@
         if c.month:
             c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
 
-        c.top_publishers, graph_data = _get_top_publishers()
-        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data.values()) )
-
-        return render('ga_report/publisher/index.html')
-
-    def _get_packages(self, publisher=None, count=-1):
+        c.top_publishers = _get_top_publishers()
+        graph_data = _get_top_publishers_graph()
+        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data) )
+
+        x =  render('ga_report/publisher/index.html')
+
+        return x
+
+    def _get_packages(self, publisher=None, month='', count=-1):
         '''Returns the datasets in order of views'''
         have_download_data = True
-        month = c.month or 'All'
+        month = month or 'All'
         if month != 'All':
             have_download_data = month >= DOWNLOADS_AVAILABLE_FROM
 
@@ -336,7 +342,8 @@
                         downloads += int(x.value)
                 else:
                     downloads = 'No data'
-                top_packages.append((package, entry.pageviews, entry.visits, downloads))
+                if package.private == False:
+                    top_packages.append((package, entry.pageviews, entry.visits, downloads))
             else:
                 log.warning('Could not find package associated package')
 
@@ -382,81 +389,71 @@
         entry = q.filter(GA_Url.period_name==c.month).first()
         c.publisher_page_views = entry.pageviews if entry else 0
 
-        c.top_packages = self._get_packages(c.publisher, 20)
+        c.top_packages = self._get_packages(publisher=c.publisher, count=20, month=c.month)
 
         # Graph query
-        top_package_names = [ x[0].name for x in c.top_packages ]
+        top_packages_all_time = self._get_packages(publisher=c.publisher, count=20, month='All')
+        top_package_names = [ x[0].name for x in top_packages_all_time ]
         graph_query = model.Session.query(GA_Url,model.Package)\
             .filter(model.Package.name==GA_Url.package_id)\
             .filter(GA_Url.url.like('/dataset/%'))\
             .filter(GA_Url.package_id.in_(top_package_names))
-        graph_data = {}
+        all_series = {}
         for entry,package in graph_query:
             if not package: continue
             if entry.period_name=='All': continue
-            graph_data[package.id] = graph_data.get(package.id,{
+            all_series[package.name] = all_series.get(package.name,{
                 'name':package.title,
-                'data':[]
+                'raw': {}
                 })
-            graph_data[package.id]['data'].append({
-                'x':_get_unix_epoch(entry.period_name),
-                'y':int(entry.pageviews),
-                })
-                    
-        c.graph_data = json.dumps( _to_rickshaw(graph_data.values()) )
+            all_series[package.name]['raw'][entry.period_name] = int(entry.pageviews)
+        graph = [ all_series[series_name] for series_name in top_package_names ]
+        c.graph_data = json.dumps( _to_rickshaw(graph) )
 
         return render('ga_report/publisher/read.html')
 
 def _to_rickshaw(data, percentageMode=False):
     if data==[]:
         return data
-    # Create a consistent x-axis
-    num_points = [ len(series['data']) for series in data ]
-    ideal_index = num_points.index( max(num_points) )
-    x_axis = [ point['x'] for point in data[ideal_index]['data'] ]
+    # x-axis is every month in c.months. Note that data might not exist
+    # for entire history, eg. for recently-added datasets
+    x_axis = [x[0] for x in c.months]
+    x_axis.reverse() # Ascending order
+    x_axis = x_axis[:-1] # Remove latest month
+    totals = {}
     for series in data:
-        xs = [ point['x'] for point in series['data'] ]
-        assert set(xs).issubset( set(x_axis) ), (xs, x_axis)
-        # Zero pad any missing values
-        for x in set(x_axis).difference(set(xs)):
-            series['data'].append( {'x':x, 'y':0} )
-        assert len(series['data'])==len(x_axis), (len(series['data']),len(x_axis),series['data'],x_axis,set(x_axis).difference(set(xs)))
-    if percentageMode:
-        def get_totals(series_list):
-            totals = {}
-            for series in series_list:
-                for point in series['data']:
-                    totals[point['x']] = totals.get(point['x'],0) + point['y']
-            lengths = [ len(series['data']) for series in series_list ]
-            assert len(set(lengths))==1
-            assert lengths[0] == len(totals)
-            return totals
-        # Transform data into percentage stacks
-        totals = get_totals(data)
-        # Roll insignificant series into a catch-all
-        THRESHOLD = 0.01
-        raw_data = data
-        data = []
-        for series in raw_data:
-            for point in series['data']:
-                fraction = float(point['y']) / totals[point['x']]
-                if not (series in data) and fraction>THRESHOLD:
-                    data.append(series)
-        # Overwrite data with a set of intereting series
-        others = [ x for x in raw_data if not (x in data) ]
-        data.append({ 
+        series['data'] = []
+        for x_string in x_axis:
+            x = _get_unix_epoch( x_string )
+            y = series['raw'].get(x_string,0)
+            series['data'].append({'x':x,'y':y})
+            totals[x] = totals.get(x,0)+y
+    if not percentageMode:
+        return data
+    # Turn all data into percentages
+    # Roll insignificant series into a catch-all
+    THRESHOLD = 1
+    raw_data = data
+    data = []
+    for series in raw_data:
+        for point in series['data']:
+            percentage = (100*float(point['y'])) / totals[point['x']]
+            if not (series in data) and percentage>THRESHOLD:
+                data.append(series)
+            point['y'] = percentage
+    others = [ x for x in raw_data if not (x in data) ]
+    if len(others):
+        data_other = []
+        for i in range(len(x_axis)):
+            x = _get_unix_epoch(x_axis[i])
+            y = 0
+            for series in others:
+                y += series['data'][i]['y']
+            data_other.append({'x':x,'y':y})
+        data.append({
             'name':'Other',
-            'data': [ {'x':x,'y':y} for x,y in get_totals(others).items() ] 
+            'data': data_other
             })
-        # Turn each point into a percentage
-        for series in data:
-            for point in series['data']:
-                point['y'] = (point['y']*100) / totals[point['x']]
-    # Sort the points
-    for series in data:
-        series['data'] = sorted( series['data'], key=lambda x:x['x'] )
-        # Strip the latest month's incomplete analytics
-        series['data'] = series['data'][:-1]
     return data
 
 
@@ -481,35 +478,51 @@
 
     top_publishers = []
     res = connection.execute(q, month)
-    department_ids = []
     for row in res:
         g = model.Group.get(row[0])
         if g:
-            department_ids.append(row[0])
             top_publishers.append((g, row[1], row[2]))
-
-    graph = {}
-    if limit is not None:
-        # Query for a history graph of these publishers
-        q = model.Session.query(
-                GA_Url.department_id, 
-                GA_Url.period_name, 
-                func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\
-            .filter( GA_Url.department_id.in_(department_ids) )\
-            .filter( GA_Url.period_name!='All' )\
-            .filter( GA_Url.url.like('/dataset/%') )\
-            .filter( GA_Url.package_id!='' )\
-            .group_by( GA_Url.department_id, GA_Url.period_name )
-        for dept_id,period_name,views in q:
-            graph[dept_id] = graph.get( dept_id, {
-                'name' : model.Group.get(dept_id).title,
-                'data' : []
-                })
-            graph[dept_id]['data'].append({
-                'x': _get_unix_epoch(period_name),
-                'y': views
-                })
-    return top_publishers, graph
+    return top_publishers
+
+
+def _get_top_publishers_graph(limit=20):
+    '''
+    Returns a list of the top 20 publishers by dataset visits.
+    (The number to show can be varied with 'limit')
+    '''
+    connection = model.Session.connection()
+    q = """
+        select department_id, sum(pageviews::int) views
+        from ga_url
+        where department_id <> ''
+          and package_id <> ''
+          and url like '/dataset/%%'
+          and period_name='All'
+        group by department_id order by views desc
+        """
+    if limit:
+        q = q + " limit %s;" % (limit)
+
+    res = connection.execute(q)
+    department_ids = [ row[0] for row in res ]
+
+    # Query for a history graph of these department ids
+    q = model.Session.query(
+            GA_Url.department_id,
+            GA_Url.period_name,
+            func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\
+        .filter( GA_Url.department_id.in_(department_ids) )\
+        .filter( GA_Url.url.like('/dataset/%') )\
+        .filter( GA_Url.package_id!='' )\
+        .group_by( GA_Url.department_id, GA_Url.period_name )
+    graph_dict = {}
+    for dept_id,period_name,views in q:
+        graph_dict[dept_id] = graph_dict.get( dept_id, {
+            'name' : model.Group.get(dept_id).title,
+            'raw' : {}
+            })
+        graph_dict[dept_id]['raw'][period_name] = views
+    return [ graph_dict[id] for id in department_ids ]
 
 
 def _get_publishers():
@@ -519,7 +532,7 @@
     '''
     publishers = []
     for pub in model.Session.query(model.Group).\
-               filter(model.Group.type=='publisher').\
+               filter(model.Group.type=='organization').\
                filter(model.Group.state=='active').\
                order_by(model.Group.name):
         publishers.append((pub.name, pub.title))