Handle duplicate URLs in ga_url processing
--- a/ckanext/ga_report/controller.py
+++ b/ckanext/ga_report/controller.py
@@ -22,6 +22,7 @@
def _month_details(cls):
+ '''Returns a list of all the month names'''
months = []
vals = model.Session.query(cls.period_name).filter(cls.period_name!='All').distinct().all()
for m in vals:
@@ -308,6 +309,8 @@
select department_id, sum(pageviews::int) views, sum(visitors::int) visits
from ga_url
where department_id <> ''
+ and package_id <> ''
+ and url like '/dataset/%%'
and period_name=%s
group by department_id order by visits desc
"""
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -116,7 +116,7 @@
return '/' + '/'.join(url.split('/')[2:])
-def _get_department_id_of_url(url):
+def _get_package_and_publisher(url):
# e.g. /dataset/fuel_prices
# e.g. /dataset/fuel_prices/resource/e63380d4
dataset_match = re.match('/dataset/([^/]+)(/.*)?', url)
@@ -126,12 +126,13 @@
if dataset:
publisher_groups = dataset.get_groups('publisher')
if publisher_groups:
- return publisher_groups[0].name
+ return dataset_ref,publisher_groups[0].name
+ return dataset_ref, None
else:
publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
if publisher_match:
- return publisher_match.groups()[0]
-
+ return None, publisher_match.groups()[0]
+ return None, None
def update_sitewide_stats(period_name, stat_name, data):
for k,v in data.iteritems():
@@ -185,22 +186,26 @@
def update_url_stats(period_name, period_complete_day, url_data):
for url, views, visitors in url_data:
- department_id = _get_department_id_of_url(url)
-
- package = None
- if url.startswith('/dataset/'):
- package = url[len('/dataset/'):]
-
- values = {'id': make_uuid(),
- 'period_name': period_name,
- 'period_complete_day': period_complete_day,
- 'url': url,
- 'pageviews': views,
- 'visitors': visitors,
- 'department_id': department_id,
- 'package_id': package
- }
- model.Session.add(GA_Url(**values))
+ package, publisher = _get_package_and_publisher(url)
+
+ item = model.Session.query(GA_Url).\
+ filter(GA_Url.period_name==period_name).\
+ filter(GA_Url.url==url).first()
+ if item:
+ item.pageviews = item.pageviews + views
+ item.visitors = item.visitors + visitors
+ model.Session.add(item)
+ else:
+ values = {'id': make_uuid(),
+ 'period_name': period_name,
+ 'period_complete_day': period_complete_day,
+ 'url': url,
+ 'pageviews': views,
+ 'visitors': visitors,
+ 'department_id': publisher,
+ 'package_id': package
+ }
+ model.Session.add(GA_Url(**values))
model.Session.commit()
if package:
@@ -213,9 +218,10 @@
'url': url,
'pageviews': sum([int(e.pageviews) for e in entries]),
'visitors': sum([int(e.visitors) for e in entries]),
- 'department_id': department_id,
+ 'department_id': publisher,
'package_id': package
}
+
model.Session.add(GA_Url(**values))
model.Session.commit()
--- a/ckanext/ga_report/helpers.py
+++ b/ckanext/ga_report/helpers.py
@@ -1,7 +1,9 @@
import logging
import operator
+
import ckan.lib.base as base
import ckan.model as model
+from ckan.logic import get_action
from ckanext.ga_report.ga_model import GA_Url, GA_Publisher
from ckanext.ga_report.controller import _get_publishers
@@ -39,25 +41,38 @@
order_by('ga_url.pageviews::int desc')
num_top_datasets = top_datasets.count()
+ dataset = None
if num_top_datasets:
- dataset = None
+ count = 0
while not dataset:
rand = random.randrange(0, min(top, num_top_datasets))
ga_url = top_datasets[rand]
dataset = model.Package.get(ga_url.url[len('/dataset/'):])
if dataset and not dataset.state == 'active':
dataset = None
- else:
+ count += 1
+ if count > 10:
+ break
+ if not dataset:
+ # fallback
dataset = model.Session.query(model.Package)\
.filter_by(state='active').first()
- publisher = dataset.get_groups('publisher')[0]
- return {
- 'dataset': dataset,
- 'publisher': publisher
- }
+ if not dataset:
+ return None
+ dataset_dict = get_action('package_show')({'model': model,
+ 'session': model.Session},
+ {'id':dataset.id})
+ return dataset_dict
def single_popular_dataset_html(top=20):
- context = single_popular_dataset(top)
+ dataset_dict = single_popular_dataset(top)  # package_show dict; NOTE(review): None when no active dataset exists - confirm callers guard this
+ groups = dataset_dict.get('groups', [])  # read the groups from the dict fetched above
+ publishers = [ g for g in groups if g.get('type') == 'publisher' ]
+ publisher = publishers[0] if publishers else {'name':'', 'title': ''}  # blank placeholder keeps the template's publisher['name'] / publisher['title'] lookups valid
+ context = {
+ 'dataset': dataset_dict,
+ 'publisher': publisher
+ }
return base.render_snippet('ga_report/ga_popular_single.html', **context)
--- a/ckanext/ga_report/templates/ga_report/ga_popular_single.html
+++ b/ckanext/ga_report/templates/ga_report/ga_popular_single.html
@@ -8,14 +8,14 @@
<h2>Featured dataset</h2>
<div class="dataset-summary boxed">
- <a class="dataset-header" href="${h.url_for(controller='package', action='read', id=dataset.name)}">
- <h3>${dataset.title}</h3>
+ <a class="dataset-header" href="${h.url_for(controller='package', action='read', id=dataset['name'])}">
+ <h3>${dataset['title']}</h3>
</a>
<h4>
<strong>Publisher</strong> :
- <a href="/publisher/${publisher.name}">${publisher.title}</a>
+ <a href="/publisher/${publisher['name']}">${publisher['title']}</a>
</h4>
- <div>${h.truncate(dataset.notes, length=200, whole_word=True)}</div>
+ <div>${h.truncate(dataset['notes_rendered'], length=200, whole_word=True)}</div>
</div>
<div>
<a href="${h.url_for(controller='ckanext.ga_report.controller:GaDatasetReport',action='read')}" class="btn">Other popular datasets</a>