Improved debug logging.
--- a/ckanext/ga_report/command.py
+++ b/ckanext/ga_report/command.py
@@ -23,7 +23,7 @@
import ckan.model as model
model.Session.remove()
model.Session.configure(bind=model.meta.engine)
- log = logging.getLogger('ckanext.ga-report')
+ log = logging.getLogger('ckanext.ga_report')
import ga_model
ga_model.init_tables()
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -131,7 +131,7 @@
# Make sure the All records are correct.
ga_model.post_update_url_stats()
- log.info('Aggregating datasets by publisher')
+ log.info('Associating datasets with their publisher')
ga_model.update_publisher_stats(period_name) # about 30 seconds.
@@ -303,7 +303,7 @@
def _download_stats(self, start_date, end_date, period_name, period_complete_day):
- """ Fetches stats about language and country """
+ """ Fetches stats about data downloads """
import ckan.model as model
data = {}
@@ -325,7 +325,14 @@
return
def process_result_data(result_data, cached=False):
+ progress_total = len(result_data)
+ progress_count = 0
+ resources_not_matched = []
for result in result_data:
+ progress_count += 1
+ if progress_count % 100 == 0:
+ log.debug('.. %d/%d done so far', progress_count, progress_total)
+
url = result[0].strip()
# Get package id associated with the resource that has this URL.
@@ -339,9 +346,13 @@
if package_name:
data[package_name] = data.get(package_name, 0) + int(result[1])
else:
- log.warning(u"Could not find resource for URL: {url}".format(url=url))
+ resources_not_matched.append(url)
continue
-
+ if resources_not_matched:
+ log.debug('Could not match %i of %i resource URLs to datasets. e.g. %r',
+ len(resources_not_matched), progress_total, resources_not_matched[:3])
+
+ log.info('Associating downloads of resource URLs with their respective datasets')
process_result_data(results.get('rows'))
results = self.service.data().ga().get(
@@ -353,6 +364,7 @@
dimensions="ga:eventLabel",
max_results=10000,
end_date=end_date).execute()
+ log.info('Associating downloads of cache resource URLs with their respective datasets')
process_result_data(results.get('rows'), cached=False)
self._filter_out_long_tail(data, MIN_DOWNLOADS)
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -161,20 +161,20 @@
def pre_update_url_stats(period_name):
- log.debug("Deleting '%s' records" % period_name)
- model.Session.query(GA_Url).\
- filter(GA_Url.period_name==period_name).delete()
-
- count = model.Session.query(GA_Url).\
- filter(GA_Url.period_name == 'All').count()
- log.debug("Deleting %d 'All' records" % count)
- count = model.Session.query(GA_Url).\
- filter(GA_Url.period_name == 'All').delete()
- log.debug("Deleted %d 'All' records" % count)
+ q = model.Session.query(GA_Url).\
+ filter(GA_Url.period_name==period_name)
+ log.debug("Deleting %d '%s' records" % (q.count(), period_name))
+ q.delete()
+
+ q = model.Session.query(GA_Url).\
+ filter(GA_Url.period_name == 'All')
+ log.debug("Deleting %d 'All' records..." % q.count())
+ q.delete()
model.Session.flush()
model.Session.commit()
model.repo.commit_and_remove()
+ log.debug('...done')
def post_update_url_stats():
@@ -185,6 +185,7 @@
record regardless of whether the URL has an entry for
the month being currently processed.
"""
+ log.debug('Post-processing "All" records...')
query = """select url, pageviews::int, visits::int
from ga_url
where url not in (select url from ga_url where period_name ='All')"""
@@ -197,7 +198,13 @@
views[row[0]] = views.get(row[0], 0) + row[1]
visits[row[0]] = visits.get(row[0], 0) + row[2]
+ progress_total = len(views.keys())
+ progress_count = 0
for key in views.keys():
+ progress_count += 1
+ if progress_count % 100 == 0:
+ log.debug('.. %d/%d done so far', progress_count, progress_total)
+
package, publisher = _get_package_and_publisher(key)
values = {'id': make_uuid(),
@@ -211,6 +218,7 @@
}
model.Session.add(GA_Url(**values))
model.Session.commit()
+ log.debug('..done')
def update_url_stats(period_name, period_complete_day, url_data):
@@ -219,9 +227,14 @@
stores them in GA_Url under the period and recalculates the totals for
the 'All' period.
'''
+ progress_total = len(url_data)
+ progress_count = 0
for url, views, visits in url_data:
+ progress_count += 1
+ if progress_count % 100 == 0:
+ log.debug('.. %d/%d done so far', progress_count, progress_total)
+
package, publisher = _get_package_and_publisher(url)
-
item = model.Session.query(GA_Url).\
filter(GA_Url.period_name==period_name).\