Makes sure we don't add (and log) when we can't find the dataset for popular, fixes #242
Makes sure we don't add (and log) when we can't find the dataset for popular, fixes #242

import logging import logging
import operator import operator
   
import ckan.lib.base as base import ckan.lib.base as base
import ckan.model as model import ckan.model as model
from ckan.logic import get_action from ckan.logic import get_action
   
from ckanext.ga_report.ga_model import GA_Url, GA_Publisher from ckanext.ga_report.ga_model import GA_Url, GA_Publisher
from ckanext.ga_report.controller import _get_publishers from ckanext.ga_report.controller import _get_publishers
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
   
def popular_datasets(count=10):
    '''Renders the popular datasets snippet for one randomly chosen publisher.

    Candidates come from ``_get_publishers(30)``. Publishers whose state is
    not 'active', or for which no popular datasets are found, are skipped.

    :param count: the maximum number of datasets to pass to the template
    :returns: the rendered 'ga_report/ga_popular_datasets.html' snippet, or
        an empty string when no publisher qualifies
    '''
    import random

    # Visit each candidate at most once, in random order.  This picks
    # uniformly among the eligible publishers (as repeated random draws
    # would) but cannot fail on an empty list or spin forever when no
    # candidate has any datasets.
    candidates = list(_get_publishers(30))
    random.shuffle(candidates)

    for candidate in candidates:
        publisher = candidate[0]
        if publisher.state != 'active':
            continue
        datasets = _datasets_for_publisher(publisher, 10)[:count]
        if datasets:
            ctx = {
                'datasets': datasets,
                'publisher': publisher
            }
            return base.render_snippet('ga_report/ga_popular_datasets.html',
                                       **ctx)

    _log.warning("No active publisher with popular datasets found "
                 "for 'popular_datasets'")
    return ""
   
def single_popular_dataset(top=20):
    '''Picks one dataset at random from the most viewed ones.

    :param top: how many of the most viewed datasets to choose between
    :returns: the 'package_show' dict for the chosen dataset, or None when
        not even the fallback query finds an active dataset
    '''
    import random

    ranked_urls = model.Session.query(GA_Url)
    ranked_urls = ranked_urls.filter(GA_Url.url.like('/dataset/%'))
    ranked_urls = ranked_urls.order_by('ga_url.pageviews::int desc')
    available = ranked_urls.count()

    prefix_len = len('/dataset/')
    chosen = None
    if available:
        # The recorded top datasets may not exist locally (e.g. when
        # testing), so give up after a fixed number of random draws and
        # let the fallback below take over.
        for _attempt in range(11):
            index = random.randrange(0, min(top, available))
            hit = ranked_urls[index]
            chosen = model.Package.get(hit.url[prefix_len:])
            if chosen and chosen.state == 'active':
                break
            chosen = None

    if not chosen:
        # Fallback: any active dataset at all.
        active_packages = model.Session.query(model.Package)
        chosen = active_packages.filter_by(state='active').first()
    if not chosen:
        return None

    show_context = {
        'model': model,
        'session': model.Session,
        'validate': False,
    }
    return get_action('package_show')(show_context, {'id': chosen.id})
   
def single_popular_dataset_html(top=20):
    '''Renders the snippet for one randomly chosen popular dataset.

    :param top: the number of top datasets to select from
    :returns: the rendered 'ga_report/ga_popular_single.html' snippet, or an
        empty string when ``single_popular_dataset`` finds no dataset
    '''
    dataset_dict = single_popular_dataset(top)
    if not dataset_dict:
        # single_popular_dataset returns None when there is no active
        # dataset at all; there is nothing to render in that case.
        return ""

    groups = dataset_dict.get('groups', [])
    publishers = [g for g in groups if g.get('type') == 'publisher']
    # Fall back to a blank publisher so the template always gets the keys
    # 'name' and 'title'.
    publisher = publishers[0] if publishers else {'name': '', 'title': ''}
    context = {
        'dataset': dataset_dict,
        'publisher': publisher
    }
    return base.render_snippet('ga_report/ga_popular_single.html', **context)
   
   
def most_popular_datasets(publisher, count=20):
    '''Renders the most popular datasets snippet for a publisher.

    :param publisher: the publisher to report on; a falsy value is logged
        as an error
    :param count: passed to ``_datasets_for_publisher`` as the number of
        datasets to collect
    :returns: the rendered 'ga_report/publisher/popular.html' snippet, or an
        empty string when no publisher was given
    '''
    if publisher:
        top_entries = _datasets_for_publisher(publisher, count)
        return base.render_snippet(
            'ga_report/publisher/popular.html',
            dataset_count=len(top_entries),
            datasets=top_entries,
            publisher=publisher)

    _log.error("No valid publisher passed to 'most_popular_datasets'")
    return ""
   
def _datasets_for_publisher(publisher, count):
    '''Returns the most viewed datasets recorded for a publisher.

    Reads the GA_Url rows whose department_id equals the publisher's name
    and whose url starts with '/dataset/', ordered by pageviews descending,
    and sums views and visits per package until ``count`` distinct packages
    have been collected.  Rows whose package cannot be found are logged and
    skipped.

    :param publisher: object whose ``name`` is matched against
        GA_Url.department_id
    :param count: the maximum number of distinct packages to collect
    :returns: a list of (package, views, visits) tuples sorted by views,
        highest first
    '''
    prefix_len = len('/dataset/')
    datasets = {}
    entries = model.Session.query(GA_Url).\
        filter(GA_Url.department_id == publisher.name).\
        filter(GA_Url.url.like('/dataset/%')).\
        order_by('ga_url.pageviews::int desc').all()
    for entry in entries:
        if len(datasets) >= count:
            # Nothing is added or accumulated once the limit is reached,
            # so there is no point in looking at the remaining rows.
            break
        p = model.Package.get(entry.url[prefix_len:])
        if not p:
            _log.warning("Could not find Package for %s", entry.url)
            continue

        totals = datasets.setdefault(p, {'views': 0, 'visits': 0})
        totals['views'] += int(entry.pageviews)
        totals['visits'] += int(entry.visits)

    results = [(package, totals['views'], totals['visits'])
               for package, totals in datasets.items()]
    return sorted(results, key=operator.itemgetter(1), reverse=True)
   
<html xmlns:py="http://genshi.edgewall.org/" <html xmlns:py="http://genshi.edgewall.org/"
xmlns:i18n="http://genshi.edgewall.org/i18n" xmlns:i18n="http://genshi.edgewall.org/i18n"
xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:xi="http://www.w3.org/2001/XInclude"
py:strip=""> py:strip="">
   
<li class="widget-container boxed widget_text"> <li class="widget-container boxed widget_text">
<h4>Notes</h4> <h4>Notes</h4>
<ul> <ul>
<li>"Views" is the number of times a page was loaded in users' browsers.</li> <li>"Views" is the number of times a page was loaded in users' browsers.</li>
<li>"Downloads" is the number of times a user has clicked to download either an original or cached resource for a particular dataset. Download information is only available from 2nd December 2012; 'No data' is shown for records before that date.</li>
<li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li> <li>These usage statistics are confined to users with javascript enabled, which excludes web crawlers and API calls.</li>
<li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li> <li>The results are not shown when the number of views/visits is tiny. Where these relate to site pages, results are available in full in the CSV download. Where these relate to users' web browser information, results are not disclosed, for privacy reasons.</li>
</ul> </ul>
</li> </li>
</html> </html>