From: Ross Jones Date: Fri, 11 Jan 2013 14:52:07 +0000 Subject: Implements downloads counts (for dataset resources) and fixes an issue with 'All' records. X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=fb4307a6de326e9aa113465bcabe674fd68d8775 --- Implements downloads counts (for dataset resources) and fixes an issue with 'All' records. - Fetches the data for downloads (either direct or cached) from when a user clicked on a Download button. We look up the resource for this url (based on .url or .cache_url) and then associate the download count with the package it belongs to. - Fixes a bug (#211) where the All records are deleted for every url, even if that url was not fetched (and therefore won't get a new All record). --- --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -55,6 +55,36 @@ init_service('token.dat', self.args[0] if self.args else 'credentials.json') + +class FixTimePeriods(CkanCommand): + """ + Fixes the 'All' records for GA_Urls + + It is possible that older urls that haven't recently been visited + do not have All records. This command will traverse through those + records and generate valid All records for them. 
+ """ + summary = __doc__.strip().split('\n')[0] + usage = __doc__ + max_args = 0 + min_args = 0 + + def __init__(self, name): + super(FixTimePeriods, self).__init__(name) + + def command(self): + import ckan.model as model + from ga_model import post_update_url_stats + self._load_config() + model.Session.remove() + model.Session.configure(bind=model.meta.engine) + + log = logging.getLogger('ckanext.ga_report') + + log.info("Updating 'All' records for old URLs") + post_update_url_stats() + log.info("Processing complete") + class LoadAnalytics(CkanCommand): --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -13,6 +13,7 @@ log = logging.getLogger('ckanext.ga-report') +DOWNLOADS_AVAILABLE_FROM = '2012-12' def _get_month_name(strdate): import calendar @@ -38,6 +39,7 @@ q= q.filter(cls.stat_name==stat_key) vals = q.order_by("period_name desc").all() + if vals and vals[0][1]: day = int(vals[0][1]) ordinal = 'th' if 11 <= day <= 13 \ @@ -69,25 +71,6 @@ for entry in entries: writer.writerow([entry.period_name.encode('utf-8'), entry.stat_name.encode('utf-8'), - entry.key.encode('utf-8'), - entry.value.encode('utf-8')]) - - def csv_downloads(self, month): - import csv - - q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name=='Downloads') - if month != 'all': - q = q.filter(GA_Stat.period_name==month) - entries = q.order_by('GA_Stat.period_name, GA_Stat.key').all() - - response.headers['Content-Type'] = "text/csv; charset=utf-8" - response.headers['Content-Disposition'] = str('attachment; filename=downloads_%s.csv' % (month,)) - - writer = csv.writer(response) - writer.writerow(["Period", "Resource URL", "Count"]) - - for entry in entries: - writer.writerow([entry.period_name.encode('utf-8'), entry.key.encode('utf-8'), entry.value.encode('utf-8')]) @@ -202,35 +185,6 @@ return render('ga_report/site/index.html') - def downloads(self): - - # Get the month details by fetching distinct values and determining the - # month names from the values. 
- c.months, c.day = _month_details(GA_Stat, "Downloads") - - # Work out which month to show, based on query params of the first item - c.month_desc = 'all months' - c.month = request.params.get('month', '') - if c.month: - c.month_desc = ''.join([m[1] for m in c.months if m[0]==c.month]) - - c.downloads = [] - q = model.Session.query(GA_Stat).filter(GA_Stat.stat_name=='Downloads') - q = q.filter(GA_Stat.period_name==c.month) if c.month else q - q = q.order_by("ga_stat.value::int desc") - - data = collections.defaultdict(int) - for entry in q.all(): - r = model.Session.query(model.Resource).filter(model.Resource.url==entry.key).first() - if not r: - continue - data[r] += int(entry.value) - - c.downloads = [(k,v,) for k,v in data.iteritems()] - c.downloads = sorted(c.downloads, key=operator.itemgetter(1), reverse=True) - - return render('ga_report/site/downloads.html') - class GaDatasetReport(BaseController): """ @@ -275,13 +229,14 @@ str('attachment; filename=datasets_%s_%s.csv' % (c.publisher_name, month,)) writer = csv.writer(response) - writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Period Name"]) - - for package,view,visit in packages: + writer.writerow(["Dataset Title", "Dataset Name", "Views", "Visits", "Resource downloads", "Period Name"]) + + for package,view,visit,downloads in packages: writer.writerow([package.title.encode('utf-8'), package.name.encode('utf-8'), view, visit, + downloads, month]) def publishers(self): @@ -302,10 +257,10 @@ def _get_packages(self, publisher=None, count=-1): '''Returns the datasets in order of views''' - if count == -1: - count = sys.maxint - + have_download_data = True month = c.month or 'All' + if month != 'All': + have_download_data = month >= DOWNLOADS_AVAILABLE_FROM q = model.Session.query(GA_Url,model.Package)\ .filter(model.Package.name==GA_Url.package_id)\ @@ -315,9 +270,25 @@ q = q.filter(GA_Url.period_name==month) q = q.order_by('ga_url.pageviews::int desc') top_packages = [] - for 
entry,package in q.limit(count): + if count == -1: + entries = q.all() + else: + entries = q.limit(count) + + for entry,package in entries: if package: - top_packages.append((package, entry.pageviews, entry.visits)) + # Downloads .... + if have_download_data: + dls = model.Session.query(GA_Stat).\ + filter(GA_Stat.stat_name=='Downloads').\ + filter(GA_Stat.key==package.name) + if month != 'All': # Fetch everything unless the month is specific + dls = dls.filter(GA_Stat.period_name==month) + + downloads = sum(int(d.value) for d in dls.all()) + else: + downloads = 'No data' + top_packages.append((package, entry.pageviews, entry.visits, downloads)) else: log.warning('Could not find package associated package') --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -123,8 +123,12 @@ log.info('Storing publisher views (%i rows)', len(data.get('url'))) self.store(period_name, period_complete_day, data,) + # Make sure the All records are correct. + ga_model.post_update_url_stats() + log.info('Aggregating datasets by publisher') ga_model.update_publisher_stats(period_name) # about 30 seconds. + log.info('Downloading and storing analytics for site-wide stats') self.sitewide_stats( period_name, period_complete_day ) @@ -180,6 +184,7 @@ end_date=end_date).execute() packages = [] + log.info("There are %d results" % results['totalResults']) for entry in results.get('rows'): (loc,pageviews,visits) = entry url = _normalize_url('http:/' + loc) # strips off domain e.g. 
www.data.gov.uk or data.gov.uk @@ -294,6 +299,10 @@ def _download_stats(self, start_date, end_date, period_name, period_complete_day): """ Fetches stats about language and country """ + import ckan.model as model + + data = {} + results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, @@ -310,10 +319,37 @@ log.info("There is no download data for this time period") return - # [[url, count], [url],count] - data = {} - for result in result_data: - data[result[0]] = data.get(result[0], 0) + int(result[1]) + def process_result_data(result_data, cached=False): + for result in result_data or []: + url = result[0].strip() + + # Get package id associated with the resource that has this URL. + q = model.Session.query(model.Resource) + if cached: + r = q.filter(model.Resource.cache_url.like("%s%%" % url)).first() + else: + r = q.filter(model.Resource.url.like("%s%%" % url)).first() + + package_name = r.resource_group.package.name if r else "" + if package_name: + data[package_name] = data.get(package_name, 0) + int(result[1]) + else: + log.warning(u"Could not find resource for URL: {url}".format(url=url)) + continue + + process_result_data(results.get('rows')) + + results = self.service.data().ga().get( + ids='ga:' + self.profile_id, + start_date=start_date, + filters='ga:eventAction==download-cache', + metrics='ga:totalEvents', + sort='-ga:totalEvents', + dimensions="ga:eventLabel", + max_results=10000, + end_date=end_date).execute() + process_result_data(results.get('rows'), cached=True) + self._filter_out_long_tail(data, MIN_DOWNLOADS) ga_model.update_sitewide_stats(period_name, "Downloads", data, period_complete_day) --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -175,6 +175,42 @@ model.Session.flush() model.Session.commit() model.repo.commit_and_remove() + +def post_update_url_stats(): + + """ Check the distinct url field in ga_url and make sure + it has an All record. If not then create one. 
+ + After running this then every URL should have an All + record regardless of whether the URL has an entry for + the month being currently processed. + """ + query = """select url, pageviews::int, visits::int + from ga_url + where url not in (select url from ga_url where period_name ='All')""" + connection = model.Session.connection() + res = connection.execute(query) + + views, visits = {}, {} + # url, views, visits + for row in res: + views[row[0]] = views.get(row[0], 0) + row[1] + visits[row[0]] = visits.get(row[0], 0) + row[2] + + for key in views.keys(): + package, publisher = _get_package_and_publisher(key) + + values = {'id': make_uuid(), + 'period_name': "All", + 'period_complete_day': 0, + 'url': key, + 'pageviews': views[key], + 'visits': visits[key], + 'department_id': publisher, + 'package_id': package + } + model.Session.add(GA_Url(**values)) + model.Session.commit() def update_url_stats(period_name, period_complete_day, url_data): --- a/ckanext/ga_report/templates/ga_report/ga_util.html +++ b/ckanext/ga_report/templates/ga_report/ga_util.html @@ -44,23 +44,6 @@ - - - - - - - - - - - -
Dataset and resourceDownloads
- ${resource.resource_group.package.title}
- ${h.link_to((resource.name or resource.description).strip() or "No name", h.url_for(controller='package', action='resource_read', id=resource.resource_group.package.name, resource_id=resource.id))}
-
${value}
- -
@@ -72,9 +55,6 @@
  • None Datasets
  • -
  • - None Downloads -
  • --- a/ckanext/ga_report/templates/ga_report/notes.html +++ b/ckanext/ga_report/templates/ga_report/notes.html @@ -7,7 +7,7 @@

    Notes

    --- a/ckanext/ga_report/templates/ga_report/publisher/read.html +++ b/ckanext/ga_report/templates/ga_report/publisher/read.html @@ -44,15 +44,15 @@ - + - + - +
    Dataset ViewsDownloads
    ${h.link_to(package.title or package.name, h.url_for(controller='package', action='read', id=package.name))} ${views}${downloads}
    --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ loadanalytics = ckanext.ga_report.command:LoadAnalytics initdb = ckanext.ga_report.command:InitDB getauthtoken = ckanext.ga_report.command:GetAuthToken + fixtimeperiods = ckanext.ga_report.command:FixTimePeriods """, )