From: David Read Date: Fri, 09 Nov 2012 16:15:35 +0000 Subject: Adjust popularity score to take account of number of days in the month. X-Git-Url: http://maxious.lambdacomplex.org/git/?p=ckanext-ga-report.git&a=commitdiff&h=20b6eca0a538a77122ce85cf588045784fa9b67e --- Adjust popularity score to take account of number of days in the month. --- --- a/ckanext/ga_report/command.py +++ b/ckanext/ga_report/command.py @@ -80,7 +80,7 @@ default=False, dest='delete_first', help='Delete data for the period first') - self.parser.add_option('-s', '--slip_url_stats', + self.parser.add_option('-s', '--skip_url_stats', action='store_true', default=False, dest='skip_url_stats', --- a/ckanext/ga_report/controller.py +++ b/ckanext/ga_report/controller.py @@ -71,13 +71,13 @@ entries = q.order_by('ga_stat.key').all() def clean_key(key, val): - if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounces']: + if key in ['Average time on site', 'Pages per visit', 'New visits', 'Bounce rate (home page)']: val = "%.2f" % round(float(val), 2) if key == 'Average time on site': mins, secs = divmod(float(val), 60) hours, mins = divmod(mins, 60) val = '%02d:%02d:%02d (%s seconds) ' % (hours, mins, secs, val) - if key in ['New visits','Bounces']: + if key in ['New visits','Bounce rate (home page)']: val = "%s%%" % val if key in ['Total page views', 'Total visits']: val = int(val) @@ -232,7 +232,7 @@ return render('ga_report/publisher/index.html') def _get_packages(self, publisher=None, count=-1): - '''Returns the datasets in order of visits''' + '''Returns the datasets in order of views''' if count == -1: count = sys.maxint @@ -244,11 +244,11 @@ if publisher: q = q.filter(GA_Url.department_id==publisher.name) q = q.filter(GA_Url.period_name==month) - q = q.order_by('ga_url.visitors::int desc') + q = q.order_by('ga_url.pageviews::int desc') top_packages = [] for entry,package in q.limit(count): if package: - top_packages.append((package, entry.pageviews, entry.visitors)) + 
top_packages.append((package, entry.pageviews, entry.visits)) else: log.warning('Could not find package associated package') @@ -306,13 +306,13 @@ month = c.month or 'All' connection = model.Session.connection() q = """ - select department_id, sum(pageviews::int) views, sum(visitors::int) visits + select department_id, sum(pageviews::int) views, sum(visits::int) visits from ga_url where department_id <> '' and package_id <> '' and url like '/dataset/%%' and period_name=%s - group by department_id order by visits desc + group by department_id order by views desc """ if limit: q = q + " limit %s;" % (limit) @@ -329,7 +329,7 @@ def _get_publishers(): ''' Returns a list of all publishers. Each item is a tuple: - (names, title) + (name, title) ''' publishers = [] for pub in model.Session.query(model.Group).\ --- a/ckanext/ga_report/download_analytics.py +++ b/ckanext/ga_report/download_analytics.py @@ -98,7 +98,7 @@ self.get_full_period_name(period_name, period_complete_day), start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')) - + if self.delete_first: log.info('Deleting existing Analytics for this period "%s"', period_name) @@ -162,8 +162,8 @@ start_date = start_date.strftime('%Y-%m-%d') end_date = end_date.strftime('%Y-%m-%d') query = 'ga:pagePath=%s$' % path - metrics = 'ga:uniquePageviews, ga:visits' - sort = '-ga:uniquePageviews' + metrics = 'ga:pageviews, ga:visits' + sort = '-ga:pageviews' # Supported query params at # https://developers.google.com/analytics/devguides/reporting/core/v3/reference @@ -219,8 +219,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') @@ -248,12 +248,12 @@ ids='ga:' + self.profile_id, filters='ga:pagePath==%s' % (path,), start_date=start_date, - metrics='ga:bounces,ga:uniquePageviews', + 
metrics='ga:bounces,ga:pageviews', dimensions='ga:pagePath', max_results=10000, end_date=end_date).execute() result_data = results.get('rows') - if len(result_data) != 1: + if not result_data or len(result_data) != 1: log.error('Could not pinpoint the bounces for path: %s. Got results: %r', path, result_data) return @@ -261,7 +261,7 @@ bounces, total = [float(x) for x in result_data[0][1:]] pct = 100 * bounces/total log.info('%d bounces from %d total == %s', bounces, total, pct) - ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate': pct}) + ga_model.update_sitewide_stats(period_name, "Totals", {'Bounce rate (home page)': pct}) def _locale_stats(self, start_date, end_date, period_name): @@ -269,8 +269,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:language,ga:country", max_results=10000, end_date=end_date).execute() @@ -293,8 +293,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:socialNetwork,ga:referralPath", max_results=10000, end_date=end_date).execute() @@ -312,8 +312,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:operatingSystem,ga:operatingSystemVersion", max_results=10000, end_date=end_date).execute() @@ -337,8 +337,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:browser,ga:browserVersion", max_results=10000, end_date=end_date).execute() @@ -386,8 
+386,8 @@ results = self.service.data().ga().get( ids='ga:' + self.profile_id, start_date=start_date, - metrics='ga:uniquePageviews', - sort='-ga:uniquePageviews', + metrics='ga:pageviews', + sort='-ga:pageviews', dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo", max_results=10000, end_date=end_date).execute() --- a/ckanext/ga_report/ga_model.py +++ b/ckanext/ga_report/ga_model.py @@ -9,6 +9,8 @@ import ckan.model as model from ckan.lib.base import * + +log = __import__('logging').getLogger(__name__) def make_uuid(): return unicode(uuid.uuid4()) @@ -27,7 +29,7 @@ Column('period_name', types.UnicodeText), Column('period_complete_day', types.Integer), Column('pageviews', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('url', types.UnicodeText), Column('department_id', types.UnicodeText), Column('package_id', types.UnicodeText), @@ -63,7 +65,7 @@ Column('period_name', types.UnicodeText), Column('publisher_name', types.UnicodeText), Column('views', types.UnicodeText), - Column('visitors', types.UnicodeText), + Column('visits', types.UnicodeText), Column('toplevel', types.Boolean, default=False), Column('subpublishercount', types.Integer, default=0), Column('parent', types.UnicodeText), @@ -155,25 +157,6 @@ model.Session.commit() -def update_url_stat_totals(period_name): - - """ - items = model.Session.query(GA_Url).\ - filter(GA_Url.period_name != "All").\ - filter(GA_Url.url==url).all() - values = {'id': make_uuid(), - 'period_name': "All", - 'period_complete_day': "0", - 'url': url, - 'pageviews': sum([int(x.pageviews) for x in items]), - 'visitors': sum([int(x.visitors) for x in items]), - 'department_id': department_id, - 'package_id': package - } - model.Session.add(GA_Url(**values)) - model.Session.commit() - """ - def pre_update_url_stats(period_name): model.Session.query(GA_Url).\ filter(GA_Url.period_name==period_name).delete() @@ -187,7 +170,7 @@ stores them in GA_Url under the period and 
recalculates the totals for the 'All' period. ''' - for url, views, visitors in url_data: + for url, views, visits in url_data: package, publisher = _get_package_and_publisher(url) @@ -196,7 +179,7 @@ filter(GA_Url.url==url).first() if item: item.pageviews = item.pageviews + views - item.visitors = item.visitors + visitors + item.visits = item.visits + visits if not item.package_id: item.package_id = package if not item.department_id: @@ -208,7 +191,7 @@ 'period_complete_day': period_complete_day, 'url': url, 'pageviews': views, - 'visitors': visitors, + 'visits': visits, 'department_id': publisher, 'package_id': package } @@ -221,7 +204,7 @@ filter(GA_Url.period_name=='All').\ filter(GA_Url.url==url).all() old_pageviews = sum([int(o.pageviews) for o in old]) - old_visits = sum([int(o.visitors) for o in old]) + old_visits = sum([int(o.visits) for o in old]) entries = model.Session.query(GA_Url).\ filter(GA_Url.period_name!='All').\ @@ -231,7 +214,7 @@ 'period_complete_day': 0, 'url': url, 'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews, - 'visitors': sum([int(e.visitors) for e in entries]) + old_visits, + 'visits': sum([int(e.visits or 0) for e in entries]) + old_visits, 'department_id': publisher, 'package_id': package } @@ -281,7 +264,7 @@ filter(model.Group.type=='publisher').\ filter(model.Group.state=='active').all() for publisher in publishers: - views, visitors, subpub = update_publisher(period_name, publisher, publisher.name) + views, visits, subpub = update_publisher(period_name, publisher, publisher.name) parent, parents = '', publisher.get_groups('publisher') if parents: parent = parents[0].name @@ -290,7 +273,7 @@ filter(GA_Publisher.publisher_name==publisher.name).first() if item: item.views = views - item.visitors = visitors + item.visits = visits item.publisher_name = publisher.name item.toplevel = publisher in toplevel item.subpublishercount = subpub @@ -302,7 +285,7 @@ 'period_name': period_name, 'publisher_name': 
publisher.name, 'views': views, - 'visitors': visitors, + 'visits': visits, 'toplevel': publisher in toplevel, 'subpublishercount': subpub, 'parent': parent @@ -312,7 +295,7 @@ def update_publisher(period_name, pub, part=''): - views,visitors,subpub = 0, 0, 0 + views,visits,subpub = 0, 0, 0 for publisher in go_down_tree(pub): subpub = subpub + 1 items = model.Session.query(GA_Url).\ @@ -320,9 +303,9 @@ filter(GA_Url.department_id==publisher.name).all() for item in items: views = views + int(item.pageviews) - visitors = visitors + int(item.visitors) - - return views, visitors, (subpub-1) + visits = visits + int(item.visits) + + return views, visits, (subpub-1) def get_top_level(): @@ -362,3 +345,34 @@ q.delete() model.Session.commit() +def get_score_for_dataset(dataset_name): + ''' + Returns a "current popularity" score for a dataset, + based on how many views it has had recently. + ''' + import datetime + now = datetime.datetime.now() + last_month = now - datetime.timedelta(days=30) + period_names = ['%s-%02d' % (last_month.year, last_month.month), + '%s-%02d' % (now.year, now.month), + ] + + score = 0 + for period_name in period_names: + score /= 2 # previous periods are discounted by 50% + entry = model.Session.query(GA_Url)\ + .filter(GA_Url.period_name==period_name)\ + .filter(GA_Url.package_id==dataset_name).first() + # score + if entry: + views = float(entry.pageviews) + if entry.period_complete_day: + views_per_day = views / entry.period_complete_day + else: + views_per_day = views / 15 # guess + score += views_per_day + + score = int(score * 100) + log.debug('Popularity %s: %s', score, dataset_name) + return score + --- a/ckanext/ga_report/helpers.py +++ b/ckanext/ga_report/helpers.py @@ -106,7 +106,7 @@ if not p in datasets: datasets[p] = {'views':0, 'visits': 0} datasets[p]['views'] = datasets[p]['views'] + int(entry.pageviews) - datasets[p]['visits'] = datasets[p]['visits'] + int(entry.visitors) + datasets[p]['visits'] = datasets[p]['visits'] + 
int(entry.visits) results = [] for k, v in datasets.iteritems(): --- a/ckanext/ga_report/templates/ga_report/notes.html +++ b/ckanext/ga_report/templates/ga_report/notes.html @@ -6,9 +6,8 @@
  • Notes

    --- a/ckanext/ga_report/templates/ga_report/publisher/index.html +++ b/ckanext/ga_report/templates/ga_report/publisher/index.html @@ -41,14 +41,14 @@ - + - + --- a/ckanext/ga_report/templates/ga_report/publisher/read.html +++ b/ckanext/ga_report/templates/ga_report/publisher/read.html @@ -47,14 +47,14 @@
    Publisher | Dataset Visits | Dataset Views
    ${h.link_to(publisher.title, h.url_for(controller='ckanext.ga_report.controller:GaDatasetReport', action='read_publisher', id=publisher.name))} ${visits} ${views}
    - + - +
    Dataset | Visits | Views
    ${h.link_to(package.title or package.name, h.url_for(controller='package', action='read', id=package.name))} ${visits} ${views}