From: Ross Jones Date: Wed, 07 Nov 2012 14:11:35 +0000 Subject: Handling duplicate urls in ga_url processing X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=6a953b2c1630aabdd8d7ea629b3f06c8c2f6eb47 --- Handling duplicate urls in ga_url processing --- --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -309,6 +309,8 @@ select department_id, sum(pageviews::int) views, sum(visitors::int) visits from ga_url where department_id <> '' + and package_id <> '' + and url like '/dataset/%%' and period_name=%s group by department_id order by visits desc """ --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -116,7 +116,7 @@ return '/' + '/'.join(url.split('/')[2:]) -def _get_department_id_of_url(url): +def _get_package_and_publisher(url): # e.g. /dataset/fuel_prices # e.g. /dataset/fuel_prices/resource/e63380d4 dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) @@ -126,12 +126,13 @@ if dataset: publisher_groups = dataset.get_groups('publisher') if publisher_groups: - return publisher_groups[0].name + return dataset_ref,publisher_groups[0].name + return dataset_ref, None else: publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) if publisher_match: - return publisher_match.groups()[0] - + return None, publisher_match.groups()[0] + return None, None def update_sitewide_stats(period_name, stat_name, data): for k,v in data.iteritems(): @@ -185,22 +186,26 @@ def update_url_stats(period_name, period_complete_day, url_data): for url, views, visitors in url_data: - department_id = _get_department_id_of_url(url) - - package = None - if url.startswith('/dataset/'): - package = url[len('/dataset/'):] - - values = {'id': make_uuid(), - 'period_name': period_name, - 'period_complete_day': period_complete_day, - 'url': url, - 'pageviews': views, - 'visitors': visitors, - 'department_id': department_id, - 'package_id': package - } - model.Session.add(GA_Url(**values)) + package, publisher = _get_package_and_publisher(url) + + item = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name).\ + filter(GA_Url.url==url).first() + if item: + item.pageviews = item.pageviews + views + item.visitors = item.visitors + visitors + model.Session.add(item) + else: + values = {'id': make_uuid(), + 'period_name': period_name, + 'period_complete_day': period_complete_day, + 'url': url, + 'pageviews': views, + 'visitors': visitors, + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) model.Session.commit() if package: @@ -213,9 +218,10 @@ 'url': url, 'pageviews': sum([int(e.pageviews) for e in entries]), 'visitors': sum([int(e.visitors) for e in entries]), - 'department_id': department_id, + 'department_id': publisher, 'package_id': package } + model.Session.add(GA_Url(**values)) model.Session.commit()