--- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -114,7 +114,7 @@ >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') '/dataset/weekly_fuel_prices' ''' - return '/' + '/'.join(url.split('/')[3:]) + return url #'/' + '/'.join(url.split('/')[3:]) def _get_package_and_publisher(url): @@ -125,12 +125,12 @@ dataset_ref = dataset_match.groups()[0] dataset = model.Package.get(dataset_ref) if dataset: - publisher_groups = dataset.get_groups('publisher') + publisher_groups = dataset.get_groups('organization') if publisher_groups: return dataset_ref,publisher_groups[0].name return dataset_ref, None else: - publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) + publisher_match = re.match('/organization/([^/]+)(/.*)?', url) if publisher_match: return None, publisher_match.groups()[0] return None, None @@ -161,10 +161,64 @@ def pre_update_url_stats(period_name): - model.Session.query(GA_Url).\ - filter(GA_Url.period_name==period_name).delete() - model.Session.query(GA_Url).\ - filter(GA_Url.period_name=='All').delete() + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name) + log.debug("Deleting %d '%s' records" % (q.count(), period_name)) + q.delete() + + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All') + log.debug("Deleting %d 'All' records..." % q.count()) + q.delete() + + model.Session.flush() + model.Session.commit() + model.repo.commit_and_remove() + log.debug('...done') + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. If not then create one. + + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. + """ + log.debug('Post-processing "All" records...') + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + progress_total = len(views.keys()) + progress_count = 0 + for key in views.keys(): + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(key) + + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': 0, + 'url': key, + 'pageviews': views[key], + 'visits': visits[key], + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) + model.Session.commit() + log.debug('..done') def update_url_stats(period_name, period_complete_day, url_data): @@ -173,9 +227,14 @@ stores them in GA_Url under the period and recalculates the totals for the 'All' period. ''' + progress_total = len(url_data) + progress_count = 0 for url, views, visits in url_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + package, publisher = _get_package_and_publisher(url) - item = model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).\ @@ -216,8 +275,8 @@ 'period_name': 'All', 'period_complete_day': 0, 'url': url, - 'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews, - 'visits': sum([int(e.visits or 0) for e in entries]) + old_visits, + 'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews), + 'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits), 'department_id': publisher, 'package_id': package } @@ -264,11 +323,11 @@ """ toplevel = get_top_level() publishers = model.Session.query(model.Group).\ - filter(model.Group.type=='publisher').\ + filter(model.Group.type=='organization').\ filter(model.Group.state=='active').all() for publisher in publishers: views, visits, subpub = update_publisher(period_name, publisher, publisher.name) - parent, parents = '', publisher.get_groups('publisher') + parent, parents = '', publisher.get_parent_groups(type='organization') if parents: parent = parents[0].name item = model.Session.query(GA_Publisher).\ @@ -318,15 +377,12 @@ model.Member.table_name == 'group' and \ model.Member.state == 'active').\ filter(model.Member.id==None).\ - filter(model.Group.type=='publisher').\ + filter(model.Group.type=='organization').\ order_by(model.Group.name).all() def get_children(publisher): - '''Finds child publishers for the given publisher (object). (Not recursive)''' - from ckan.model.group import HIERARCHY_CTE - return model.Session.query(model.Group).\ - from_statement(HIERARCHY_CTE).params(id=publisher.id, type='publisher').\ - all() + '''Finds child publishers for the given publisher (object). (Not recursive i.e. returns one level)''' + return publisher.get_children_groups(type='organization') def go_down_tree(publisher): '''Provided with a publisher object, it walks down the hierarchy and yields each publisher, @@ -346,7 +402,7 @@ if period_name != 'All': q = q.filter_by(period_name=period_name) q.delete() - model.Session.commit() + model.repo.commit_and_remove() def get_score_for_dataset(dataset_name): '''