From: David Read
Date: Tue, 22 Jan 2013 17:08:05 +0000
Subject: Merge commit 'd0db210'
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=794c62a0f973c97269fef4f7a9988d2bb3957021
---
Merge commit 'd0db210'
---
--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -23,7 +23,7 @@ import ckan.model as model
         model.Session.remove()
         model.Session.configure(bind=model.meta.engine)
-        log = logging.getLogger('ckanext.ga-report')
+        log = logging.getLogger('ckanext.ga_report')
 
         import ga_model
         ga_model.init_tables()
--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -192,24 +192,17 @@
                 filter(GA_Stat.stat_name==k).\
                 order_by(GA_Stat.period_name)
             # Run the query on all months to gather graph data
-            series = {}
-            x_axis = set()
+            graph = {}
            for stat in q:
-                x_val = _get_unix_epoch(stat.period_name)
-                series[ stat.key ] = series.get(stat.key,{})
-                series[ stat.key ][x_val] = float(stat.value)
-                x_axis.add(x_val)
-            # Common x-axis for all series. Exclude this month (incomplete data)
-            x_axis = sorted(list(x_axis))[:-1]
-            # Buffer a rickshaw dataset from the series
-            def create_graph(series_name, series_data):
-                return {
-                    'name':series_name,
-                    'data':[ {'x':x,'y':series_data.get(x,0)} for x in x_axis ]
-                    }
-            rickshaw = [ create_graph(name,data) for name, data in series.items() ]
-            rickshaw = sorted(rickshaw,key=lambda x:x['data'][-1]['y'])
-            setattr(c, v+'_graph', json.dumps(rickshaw))
+                graph[ stat.key ] = graph.get(stat.key,{
+                    'name':stat.key,
+                    'data': []
+                    })
+                graph[ stat.key ]['data'].append({
+                    'x':_get_unix_epoch(stat.period_name),
+                    'y':float(stat.value)
+                    })
+            setattr(c, v+'_graph', json.dumps( _to_rickshaw(graph.values(),percentageMode=True) ))
 
             # Buffer the tabular data
             if c.month:
@@ -253,7 +246,9 @@
 
         writer = csv.writer(response)
         writer.writerow(["Publisher Title", "Publisher Name", "Views", "Visits", "Period Name"])
-        for publisher,view,visit in _get_top_publishers(None):
+        top_publishers, top_publishers_graph = _get_top_publishers(None)
+
+        for publisher,view,visit in top_publishers:
             writer.writerow([publisher.title.encode('utf-8'),
                              publisher.name.encode('utf-8'),
                              view,
@@ -273,7 +268,7 @@
         if not c.publisher:
             abort(404, 'A publisher with that name could not be found')
 
-        packages, graph_data = self._get_packages(c.publisher)
+        packages = self._get_packages(c.publisher)
         response.headers['Content-Type'] = "text/csv; charset=utf-8"
         response.headers['Content-Disposition'] = \
             str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,))
@@ -302,7 +297,9 @@
         if c.month:
             c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month])
 
-        c.top_publishers = _get_top_publishers()
+        c.top_publishers, graph_data = _get_top_publishers()
+        c.top_publishers_graph = json.dumps( _to_rickshaw(graph_data.values()) )
+
         return render('ga_report/publisher/index.html')
 
     def _get_packages(self, publisher=None, count=-1):
@@ -319,7 +316,6 @@
             q = q.filter(GA_Url.department_id==publisher.name)
         q = q.filter(GA_Url.period_name==month)
         q = q.order_by('ga_url.pageviews::int desc')
-        graph_data = []
         top_packages = []
         if count == -1:
             entries = q.all()
@@ -328,7 +324,6 @@
 
         for entry,package in entries:
            if package:
-                graph = []
                # Downloads ....
                if have_download_data:
                    dls = model.Session.query(GA_Stat).\
@@ -338,16 +333,14 @@
                    dls = dls.filter(GA_Stat.period_name==month)
                    downloads = 0
                    for x in dls:
-                        graph.append({ 'x': _get_unix_epoch(d.period_name), 'y': int(d.value)})
-                        downloads += int(d.value)
+                        downloads += int(x.value)
                else:
                    downloads = 'No data'
-                top_packages.append((package, entry.pageviews, entry.visits, downloads, graph_data))
-                graph_data.append({'name':package.title, 'data':graph})
+                top_packages.append((package, entry.pageviews, entry.visits, downloads))
            else:
                log.warning('Could not find package associated package')
 
-        return top_packages,graph_data
+        return top_packages
 
     def read(self):
         '''
@@ -389,10 +382,80 @@
             entry = q.filter(GA_Url.period_name==c.month).first()
             c.publisher_page_views = entry.pageviews if entry else 0
 
-        c.top_packages, graph_data = self._get_packages(c.publisher, 20)
-        c.graph_data = json.dumps(graph_data)
+        c.top_packages = self._get_packages(c.publisher, 20)
+
+        # Graph query
+        top_package_names = [ x[0].name for x in c.top_packages ]
+        graph_query = model.Session.query(GA_Url,model.Package)\
+            .filter(model.Package.name==GA_Url.package_id)\
+            .filter(GA_Url.url.like('/dataset/%'))\
+            .filter(GA_Url.package_id.in_(top_package_names))
+        graph_data = {}
+        for entry,package in graph_query:
+            if not package: continue
+            if entry.period_name=='All': continue
+            graph_data[package.id] = graph_data.get(package.id,{
+                'name':package.title,
+                'data':[]
+                })
+            graph_data[package.id]['data'].append({
+                'x':_get_unix_epoch(entry.period_name),
+                'y':int(entry.pageviews),
+                })
+
+        c.graph_data = json.dumps( _to_rickshaw(graph_data.values()) )
+
         return render('ga_report/publisher/read.html')
 
+
+def _to_rickshaw(data, percentageMode=False):
+    if data==[]:
+        return data
+    # Create a consistent x-axis
+    num_points = [ len(package['data']) for package in data ]
+    ideal_index = num_points.index( max(num_points) )
+    x_axis = [ point['x'] for point in data[ideal_index]['data'] ]
+    for package in data:
+        xs = [ point['x'] for point in package['data'] ]
+        assert set(xs).issubset( set(x_axis) ), (xs, x_axis)
+        # Zero pad any missing values
+        for x in set(x_axis).difference(set(xs)):
+            package['data'].append( {'x':x, 'y':0} )
+        assert len(package['data'])==len(x_axis), (len(package['data']),len(x_axis),package['data'],x_axis,set(x_axis).difference(set(xs)))
+    if percentageMode:
+        # Transform data into percentage stacks
+        totals = {}
+        for x in x_axis:
+            for package in data:
+                for point in package['data']:
+                    totals[ point['x'] ] = totals.get(point['x'],0) + point['y']
+        # Roll insignificant series into a catch-all
+        THRESHOLD = 0.01
+        significant_series = []
+        for package in data:
+            for point in package['data']:
+                fraction = float(point['y']) / totals[point['x']]
+                if fraction>THRESHOLD and not (package in significant_series):
+                    significant_series.append(package)
+        temp = {}
+        for package in data:
+            if package in significant_series: continue
+            for point in package['data']:
+                temp[point['x']] = temp.get(point['x'],0) + point['y']
+        catch_all = { 'name':'Other','data': [ {'x':x,'y':y} for x,y in temp.items() ] }
+        # Roll insignificant series into one
+        data = significant_series
+        data.append(catch_all)
+        # Turn each point into a percentage
+        for package in data:
+            for point in package['data']:
+                point['y'] = (point['y']*100) / totals[point['x']]
+    # Sort the points
+    for package in data:
+        package['data'] = sorted( package['data'], key=lambda x:x['x'] )
+        # Strip the latest month's incomplete analytics
+        package['data'] = package['data'][:-1]
+    return data
+
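
For context, the _to_rickshaw() helper added above normalises a list of
series dicts before they are JSON-encoded for the Rickshaw charts: every
series ends up with one point per month on a shared x-axis, and in
percentageMode the points are rescaled against column totals, with minor
series rolled into an 'Other' catch-all. Below is a minimal standalone
sketch of just the zero-padding step; the series names and epoch values are
invented and nothing is imported from the extension.

    # Two series as the controller builds them. 'Opera' has no point for the
    # second month, so it must be zero-padded before Rickshaw can stack it.
    series = [
        {'name': 'Firefox', 'data': [{'x': 1349049600, 'y': 120},
                                     {'x': 1351728000, 'y': 180}]},
        {'name': 'Opera',   'data': [{'x': 1349049600, 'y': 1}]},
    ]

    # Common x-axis: the x values of the longest series.
    x_axis = [p['x'] for p in max((s['data'] for s in series), key=len)]

    for s in series:
        present = set(p['x'] for p in s['data'])
        # Zero-pad the months this series is missing, then sort by x.
        s['data'] += [{'x': x, 'y': 0} for x in x_axis if x not in present]
        s['data'].sort(key=lambda p: p['x'])

    print(series)  # every series now has one point per month, oldest first
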
 
 def _get_top_publishers(limit=20):
     '''
@@ -415,11 +478,35 @@
 
     top_publishers = []
     res = connection.execute(q, month)
+    department_ids = []
     for row in res:
         g = model.Group.get(row[0])
         if g:
+            department_ids.append(row[0])
             top_publishers.append((g, row[1], row[2]))
-    return top_publishers
+
+    graph = {}
+    if limit is not None:
+        # Query for a history graph of these publishers
+        q = model.Session.query(
+                GA_Url.department_id,
+                GA_Url.period_name,
+                func.sum(cast(GA_Url.pageviews,sqlalchemy.types.INT)))\
+            .filter( GA_Url.department_id.in_(department_ids) )\
+            .filter( GA_Url.period_name!='All' )\
+            .filter( GA_Url.url.like('/dataset/%') )\
+            .filter( GA_Url.package_id!='' )\
+            .group_by( GA_Url.department_id, GA_Url.period_name )
+        for dept_id,period_name,views in q:
+            graph[dept_id] = graph.get( dept_id, {
+                'name' : model.Group.get(dept_id).title,
+                'data' : []
+                })
+            graph[dept_id]['data'].append({
+                'x': _get_unix_epoch(period_name),
+                'y': views
+                })
+    return top_publishers, graph
 
 
 def _get_publishers():
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -32,6 +32,11 @@
             first_of_this_month = datetime.datetime(date.year, date.month, 1)
             _, last_day_of_month = calendar.monthrange(int(date.year), int(date.month))
             last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month)
+            # if this is the latest month, note that it is only up until today
+            now = datetime.datetime.now()
+            if now.year == date.year and now.month == date.month:
+                last_day_of_month = now.day
+                last_of_this_month = now
             periods = ((date.strftime(FORMAT_MONTH),
                         last_day_of_month,
                         first_of_this_month, last_of_this_month),)
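
The hunk above clamps the period for the month currently in progress:
previously the day count and end date placed in the periods tuple always
described the full calendar month, even when only part of it had elapsed. A
small standalone illustration with hard-coded dates (the real code uses
datetime.datetime.now() rather than a fixed "today"):

    import calendar
    import datetime

    date = datetime.datetime(2013, 1, 1)          # month being downloaded
    _, last_day_of_month = calendar.monthrange(date.year, date.month)
    last_of_this_month = datetime.datetime(date.year, date.month, last_day_of_month)

    now = datetime.datetime(2013, 1, 22)          # pretend "today" is 22 Jan 2013
    if now.year == date.year and now.month == date.month:
        # The month is still in progress, so the period ends today,
        # not on the 31st.
        last_day_of_month = now.day
        last_of_this_month = now

    print(last_of_this_month)  # 2013-01-22 00:00:00, and last_day_of_month is 22
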
@@ -126,7 +131,7 @@
 
         # Make sure the All records are correct.
         ga_model.post_update_url_stats()
 
-        log.info('Aggregating datasets by publisher')
+        log.info('Associating datasets with their publisher')
         ga_model.update_publisher_stats(period_name) # about 30 seconds.
@@ -298,7 +303,7 @@
 
     def _download_stats(self, start_date, end_date, period_name, period_complete_day):
-        """ Fetches stats about language and country """
+        """ Fetches stats about data downloads """
         import ckan.model as model
 
         data = {}
@@ -320,7 +325,14 @@
             return
 
         def process_result_data(result_data, cached=False):
+            progress_total = len(result_data)
+            progress_count = 0
+            resources_not_matched = []
             for result in result_data:
+                progress_count += 1
+                if progress_count % 100 == 0:
+                    log.debug('.. %d/%d done so far', progress_count, progress_total)
+
                 url = result[0].strip()
 
                 # Get package id associated with the resource that has this URL.
@@ -334,9 +346,13 @@
                 if package_name:
                     data[package_name] = data.get(package_name, 0) + int(result[1])
                 else:
-                    log.warning(u"Could not find resource for URL: {url}".format(url=url))
+                    resources_not_matched.append(url)
                     continue
-
+            if resources_not_matched:
+                log.debug('Could not match %i of %i resource URLs to datasets. e.g. %r',
+                          len(resources_not_matched), progress_total, resources_not_matched[:3])
+
+        log.info('Associating downloads of resource URLs with their respective datasets')
         process_result_data(results.get('rows'))
 
         results = self.service.data().ga().get(
@@ -348,6 +364,7 @@
                                  dimensions="ga:eventLabel",
                                  max_results=10000,
                                  end_date=end_date).execute()
+        log.info('Associating downloads of cache resource URLs with their respective datasets')
         process_result_data(results.get('rows'), cached=False)
 
         self._filter_out_long_tail(data, MIN_DOWNLOADS)
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -161,20 +161,20 @@
 
 
 def pre_update_url_stats(period_name):
-    log.debug("Deleting '%s' records" % period_name)
-    model.Session.query(GA_Url).\
-        filter(GA_Url.period_name==period_name).delete()
-
-    count = model.Session.query(GA_Url).\
-        filter(GA_Url.period_name == 'All').count()
-    log.debug("Deleting %d 'All' records" % count)
-    count = model.Session.query(GA_Url).\
-        filter(GA_Url.period_name == 'All').delete()
-    log.debug("Deleted %d 'All' records" % count)
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name==period_name)
+    log.debug("Deleting %d '%s' records" % (q.count(), period_name))
+    q.delete()
+
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name == 'All')
+    log.debug("Deleting %d 'All' records..." % q.count())
+    q.delete()
 
     model.Session.flush()
     model.Session.commit()
     model.repo.commit_and_remove()
+    log.debug('...done')
 
 
 def post_update_url_stats():
@@ -185,6 +185,7 @@
        record regardless of whether the URL has an entry for
        the month being currently processed.
     """
+    log.debug('Post-processing "All" records...')
     query = """select url, pageviews::int, visits::int
                from ga_url
                where url not in (select url from ga_url where period_name ='All')"""
@@ -197,7 +198,13 @@
             views[row[0]] = views.get(row[0], 0) + row[1]
             visits[row[0]] = visits.get(row[0], 0) + row[2]
 
+    progress_total = len(views.keys())
+    progress_count = 0
     for key in views.keys():
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
         package, publisher = _get_package_and_publisher(key)
 
         values = {'id': make_uuid(),
@@ -211,6 +218,7 @@
                   }
         model.Session.add(GA_Url(**values))
     model.Session.commit()
+    log.debug('..done')
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
@@ -219,9 +227,14 @@
     stores them in GA_Url under the period and recalculates the
     totals for the 'All' period.
     '''
+    progress_total = len(url_data)
+    progress_count = 0
     for url, views, visits in url_data:
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
         package, publisher = _get_package_and_publisher(url)
-
         item = model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).\
--- a/ckanext/ga_report/public/css/ga_report.css
+++ b/ckanext/ga_report/public/css/ga_report.css
@@ -5,7 +5,7 @@
 }
 .rickshaw_chart_container {
     position: relative;
-    height: 300px;
+    height: 350px;
     margin: 0 auto 20px auto;
 }
 .rickshaw_chart {
@@ -20,9 +20,12 @@
     right: 0;
     top: 0;
     margin-left: 15px;
+    padding: 0 5px;
     background: transparent;
     max-width: 150px;
     overflow: hidden;
+    background: rgba(0,0,0,0.05);
+    border-radius: 5px;
 }
 .rickshaw_y_axis {
     position: absolute;
@@ -30,4 +33,9 @@
     bottom: 0;
     width: 40px;
 }
+.rickshaw_legend .label {
+    background: transparent !important;
+    color: #000000 !important;
+    font-weight: normal !important;
+}
--- /dev/null
+++ b/ckanext/ga_report/public/scripts/ckanext_ga_reports.js
@@ -1,1 +1,42 @@
+var CKAN = CKAN || {};
+CKAN.GA_Reports = {};
+
+CKAN.GA_Reports.render_rickshaw = function( css_name, data, mode, colorscheme ) {
+    var palette = new Rickshaw.Color.Palette( { scheme: colorscheme } );
+    $.each(data, function(i, object) {
+        object['color'] = palette.color();
+    });
+
+    var graphElement = document.querySelector("#chart_"+css_name);
+
+    var graph = new Rickshaw.Graph( {
+        element: document.querySelector("#chart_"+css_name),
+        renderer: mode,
+        series: data,
+        height: 328
+    });
+    var x_axis = new Rickshaw.Graph.Axis.Time( { graph: graph } );
+    var y_axis = new Rickshaw.Graph.Axis.Y( {
+        graph: graph,
+        orientation: 'left',
+        tickFormat: Rickshaw.Fixtures.Number.formatKMBT,
+        element: document.getElementById('y_axis_'+css_name)
+    } );
+    var legend = new Rickshaw.Graph.Legend( {
+        element: document.querySelector('#legend_'+css_name),
+        graph: graph
+    } );
+    var hoverDetail = new Rickshaw.Graph.HoverDetail( {
+        graph: graph,
+        formatter: function(series, x, y) {
+            var date = '<span class="date">' + new Date(x * 1000).toUTCString() + '</span>';
+            var swatch = '<span class="detail_swatch" style="background-color: ' + series.color + '"></span>';
+            var content = swatch + series.name + ": " + parseInt(y) + '<br/>' + date;
+            return content;
+        }
+    } );
+    graph.render();
+};
+
--- a/ckanext/ga_report/templates/ga_report/ga_util.html
+++ b/ckanext/ga_report/templates/ga_report/ga_util.html
@@ -30,36 +30,14 @@
-
+
--- a/ckanext/ga_report/templates/ga_report/publisher/index.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/index.html
@@ -20,6 +20,7 @@
+
@@ -42,7 +43,7 @@
-            ${rickshaw_graph('[{name:"test series",data:[{x:1,y:200},{x:3,y:300},{x:5,y:100}]}]','dataset-downloads',debug=True)}
+            ${rickshaw_graph(c.top_publishers_graph,'publishers')}
--- a/ckanext/ga_report/templates/ga_report/publisher/read.html
+++ b/ckanext/ga_report/templates/ga_report/publisher/read.html
@@ -10,6 +10,7 @@
+
--- a/ckanext/ga_report/templates/ga_report/site/index.html
+++ b/ckanext/ga_report/templates/ga_report/site/index.html
@@ -10,6 +10,7 @@
+
@@ -94,20 +95,20 @@
Publisher
-            ${rickshaw_graph(c.browser_versions_graph,'browser-versions')}
+            ${rickshaw_graph(c.browser_versions_graph,'browser-versions',mode='stack')}

Note: Where a browser has a large number of versions, these have been grouped together.

${stat_table(c.browser_versions)}
-            ${rickshaw_graph(c.browsers_graph,'browsers')}
+            ${rickshaw_graph(c.browsers_graph,'browsers',mode='stack')}
             ${stat_table(c.browsers)}
-            ${rickshaw_graph(c.os_graph,'os')}
+            ${rickshaw_graph(c.os_graph,'os',mode='stack')}
             ${stat_table(c.os)}
-            ${rickshaw_graph(c.os_versions_graph,'os_versions')}
+            ${rickshaw_graph(c.os_versions_graph,'os_versions',mode='stack')}
             ${stat_table(c.os_versions)}
@@ -115,16 +116,16 @@
             ${social_table(c.social_referrer_totals)}
-            ${rickshaw_graph(c.social_networks_graph, 'social_networks')}
+            ${rickshaw_graph(c.social_networks_graph, 'social_networks',mode='stack')}

Percentage of visits that were referred from these social networks

${stat_table(c.social_networks, 'Visits')}
-            ${rickshaw_graph(c.languages_graph,'languages')}
+            ${rickshaw_graph(c.languages_graph,'languages',mode='stack')}
             ${stat_table(c.languages)}
-            ${rickshaw_graph(c.country_graph,'country')}
+            ${rickshaw_graph(c.country_graph,'country',mode='stack')}
             ${stat_table(c.country)}
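
Taken together, the graph data now follows a single path: the controller
builds a dict of series, _to_rickshaw() pads (and optionally
percentage-scales) them, json.dumps() embeds the result in the template via
the rickshaw_graph macro (whose markup is not shown in this diff and is
presumed to call the new JS), and CKAN.GA_Reports.render_rickshaw draws it
into the chart_<name>, y_axis_<name> and legend_<name> elements. A sketch of
the JSON a template ends up carrying, with invented publisher names and
pageview counts standing in for real GA_Url rows:

    import json

    top_publishers_graph = [
        {'name': 'Department of Health',
         'data': [{'x': 1349049600, 'y': 500}, {'x': 1351728000, 'y': 650}]},
        {'name': 'Department for Transport',
         'data': [{'x': 1349049600, 'y': 90},  {'x': 1351728000, 'y': 120}]},
    ]

    # This string is what ends up in the page and is handed to
    # CKAN.GA_Reports.render_rickshaw as its 'data' argument.
    print(json.dumps(top_publishers_graph))
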