--- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -161,21 +161,64 @@ def pre_update_url_stats(period_name): - log.debug("Deleting '%s' records" % period_name) - model.Session.query(GA_Url).\ - filter(GA_Url.period_name==period_name).delete() - - count = model.Session.query(GA_Url).\ - filter(GA_Url.period_name == 'All').count() - log.debug("Deleting %d 'All' records" % count) - model.Session.query(GA_Url).\ - filter(GA_Url.period_name == 'All').delete() - log.debug("Query: %s" % model.Session.query(GA_Url). - filter(GA_Url.period_name == 'All')) - - model.Session.flush() + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name==period_name) + log.debug("Deleting %d '%s' records" % (q.count(), period_name)) + q.delete() + + q = model.Session.query(GA_Url).\ + filter(GA_Url.period_name == 'All') + log.debug("Deleting %d 'All' records..." % q.count()) + q.delete() + + model.Session.flush() model.Session.commit() model.repo.commit_and_remove() + log.debug('...done') + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. If not then create one. + + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. + """ + log.debug('Post-processing "All" records...') + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + progress_total = len(views.keys()) + progress_count = 0 + for key in views.keys(): + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. 
%d/%d done so far', progress_count, progress_total) + + package, publisher = _get_package_and_publisher(key) + + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': 0, + 'url': key, + 'pageviews': views[key], + 'visits': visits[key], + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) + model.Session.commit() + log.debug('..done') def update_url_stats(period_name, period_complete_day, url_data): @@ -184,9 +227,14 @@ stores them in GA_Url under the period and recalculates the totals for the 'All' period. ''' + progress_total = len(url_data) + progress_count = 0 for url, views, visits in url_data: + progress_count += 1 + if progress_count % 100 == 0: + log.debug('.. %d/%d done so far', progress_count, progress_total) + package, publisher = _get_package_and_publisher(url) - item = model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).\