Fix display of publisher in Featured Dataset.
[ckanext-ga-report.git] / ckanext / ga_report / ga_model.py
blob:a/ckanext/ga_report/ga_model.py -> blob:b/ckanext/ga_report/ga_model.py
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -9,6 +9,8 @@
 
 import ckan.model as model
 from ckan.lib.base import *
+
+log = __import__('logging').getLogger(__name__)
 
 def make_uuid():
     return unicode(uuid.uuid4())
@@ -27,7 +29,7 @@
                       Column('period_name', types.UnicodeText),
                       Column('period_complete_day', types.Integer),
                       Column('pageviews', types.UnicodeText),
-                      Column('visitors', types.UnicodeText),
+                      Column('visits', types.UnicodeText),
                       Column('url', types.UnicodeText),
                       Column('department_id', types.UnicodeText),
                       Column('package_id', types.UnicodeText),
@@ -45,6 +47,7 @@
                   Column('id', types.UnicodeText, primary_key=True,
                          default=make_uuid),
                   Column('period_name', types.UnicodeText),
+                  Column('period_complete_day', types.UnicodeText),
                   Column('stat_name', types.UnicodeText),
                   Column('key', types.UnicodeText),
                   Column('value', types.UnicodeText), )
@@ -63,7 +66,7 @@
                   Column('period_name', types.UnicodeText),
                   Column('publisher_name', types.UnicodeText),
                   Column('views', types.UnicodeText),
-                  Column('visitors', types.UnicodeText),
+                  Column('visits', types.UnicodeText),
                   Column('toplevel', types.Boolean, default=False),
                   Column('subpublishercount', types.Integer, default=0),
                   Column('parent', types.UnicodeText),
@@ -111,12 +114,10 @@
     >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
     '/dataset/weekly_fuel_prices'
     '''
-    # Deliberately leaving a /
-    url = url.replace('http:/','')
-    return '/' + '/'.join(url.split('/')[2:])
-
-
-def _get_department_id_of_url(url):
+    return url #'/' + '/'.join(url.split('/')[3:])
+
+
+def _get_package_and_publisher(url):
     # e.g. /dataset/fuel_prices
     # e.g. /dataset/fuel_prices/resource/e63380d4
     dataset_match = re.match('/dataset/([^/]+)(/.*)?', url)
@@ -124,16 +125,17 @@
         dataset_ref = dataset_match.groups()[0]
         dataset = model.Package.get(dataset_ref)
         if dataset:
-            publisher_groups = dataset.get_groups('publisher')
+            publisher_groups = dataset.get_groups('organization')
             if publisher_groups:
-                return publisher_groups[0].name
+                return dataset_ref,publisher_groups[0].name
+        return dataset_ref, None
     else:
-        publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
+        publisher_match = re.match('/organization/([^/]+)(/.*)?', url)
         if publisher_match:
-            return publisher_match.groups()[0]
-
-
-def update_sitewide_stats(period_name, stat_name, data):
+            return None, publisher_match.groups()[0]
+    return None, None
+
+def update_sitewide_stats(period_name, stat_name, data, period_complete_day):
     for k,v in data.iteritems():
         item = model.Session.query(GA_Stat).\
             filter(GA_Stat.period_name==period_name).\
@@ -143,11 +145,13 @@
             item.period_name = period_name
             item.key = k
             item.value = v
+            item.period_complete_day = period_complete_day
             model.Session.add(item)
         else:
             # create the row
             values = {'id': make_uuid(),
                      'period_name': period_name,
+                     'period_complete_day': period_complete_day,
                      'key': k,
                      'value': v,
                      'stat_name': stat_name
@@ -156,30 +160,65 @@
         model.Session.commit()
 
 
-def update_url_stat_totals(period_name):
-
+def pre_update_url_stats(period_name):
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name==period_name)
+    log.debug("Deleting %d '%s' records" % (q.count(), period_name))
+    q.delete()
+
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name == 'All')
+    log.debug("Deleting %d 'All' records..." % q.count())
+    q.delete()
+
+    model.Session.flush()
+    model.Session.commit()
+    model.repo.commit_and_remove()
+    log.debug('...done')
+
+def post_update_url_stats():
+
+    """ Check the distinct url field in ga_url and make sure
+        it has an All record.  If not then create one.
+
+        After running this then every URL should have an All
+        record regardless of whether the URL has an entry for
+        the month being currently processed.
     """
-        items = model.Session.query(GA_Url).\
-            filter(GA_Url.period_name != "All").\
-            filter(GA_Url.url==url).all()
+    log.debug('Post-processing "All" records...')
+    query = """select url, pageviews::int, visits::int
+               from ga_url
+               where url not in (select url from ga_url where period_name ='All')"""
+    connection = model.Session.connection()
+    res = connection.execute(query)
+
+    views, visits = {}, {}
+    # url, views, visits
+    for row in res:
+        views[row[0]] = views.get(row[0], 0) + row[1]
+        visits[row[0]] = visits.get(row[0], 0) + row[2]
+
+    progress_total = len(views.keys())
+    progress_count = 0
+    for key in views.keys():
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
+        package, publisher = _get_package_and_publisher(key)
+
         values = {'id': make_uuid(),
                   'period_name': "All",
-                  'period_complete_day': "0",
-                  'url': url,
-                  'pageviews': sum([int(x.pageviews) for x in items]),
-                  'visitors': sum([int(x.visitors) for x in items]),
-                  'department_id': department_id,
+                  'period_complete_day': 0,
+                  'url': key,
+                  'pageviews': views[key],
+                  'visits': visits[key],
+                  'department_id': publisher,
                   'package_id': package
-                 }
+                  }
         model.Session.add(GA_Url(**values))
-        model.Session.commit()
-    """
-
-def pre_update_url_stats(period_name):
-    model.Session.query(GA_Url).\
-            filter(GA_Url.period_name==period_name).delete()
-    model.Session.query(GA_Url).\
-            filter(GA_Url.period_name=='All').delete()
+    model.Session.commit()
+    log.debug('..done')
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
@@ -188,26 +227,47 @@
     stores them in GA_Url under the period and recalculates the totals for
     the 'All' period.
     '''
-    for url, views, visitors in url_data:
-        department_id = _get_department_id_of_url(url)
-
-        package = None
-        if url.startswith('/dataset/'):
-            package = url[len('/dataset/'):]
-
-        values = {'id': make_uuid(),
-                  'period_name': period_name,
-                  'period_complete_day': period_complete_day,
-                  'url': url,
-                  'pageviews': views,
-                  'visitors': visitors,
-                  'department_id': department_id,
-                  'package_id': package
-                 }
-        model.Session.add(GA_Url(**values))
+    progress_total = len(url_data)
+    progress_count = 0
+    for url, views, visits in url_data:
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
+        package, publisher = _get_package_and_publisher(url)
+
+        item = model.Session.query(GA_Url).\
+            filter(GA_Url.period_name==period_name).\
+            filter(GA_Url.url==url).first()
+        if item:
+            item.pageviews = item.pageviews + views
+            item.visits = item.visits + visits
+            if not item.package_id:
+                item.package_id = package
+            if not item.department_id:
+                item.department_id = publisher
+            model.Session.add(item)
+        else:
+            values = {'id': make_uuid(),
+                      'period_name': period_name,
+                      'period_complete_day': period_complete_day,
+                      'url': url,
+                      'pageviews': views,
+                      'visits': visits,
+                      'department_id': publisher,
+                      'package_id': package
+                     }
+            model.Session.add(GA_Url(**values))
         model.Session.commit()
 
         if package:
+            old_pageviews, old_visits = 0, 0
+            old = model.Session.query(GA_Url).\
+                filter(GA_Url.period_name=='All').\
+                filter(GA_Url.url==url).all()
+            old_pageviews = sum([int(o.pageviews) for o in old])
+            old_visits = sum([int(o.visits) for o in old])
+
             entries = model.Session.query(GA_Url).\
                 filter(GA_Url.period_name!='All').\
                 filter(GA_Url.url==url).all()
@@ -215,14 +275,14 @@
                       'period_name': 'All',
                       'period_complete_day': 0,
                       'url': url,
-                      'pageviews': sum([int(e.pageviews) for e in entries]),
-                      'visitors': sum([int(e.visitors) for e in entries]),
-                      'department_id': department_id,
+                      'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews),
+                      'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits),
+                      'department_id': publisher,
                       'package_id': package
                      }
+
             model.Session.add(GA_Url(**values))
             model.Session.commit()
-
 
 
 
@@ -263,11 +323,11 @@
     """
     toplevel = get_top_level()
     publishers = model.Session.query(model.Group).\
-        filter(model.Group.type=='publisher').\
+        filter(model.Group.type=='organization').\
         filter(model.Group.state=='active').all()
     for publisher in publishers:
-        views, visitors, subpub = update_publisher(period_name, publisher, publisher.name)
-        parent, parents = '', publisher.get_groups('publisher')
+        views, visits, subpub = update_publisher(period_name, publisher, publisher.name)
+        parent, parents = '', publisher.get_parent_groups(type='organization')
         if parents:
             parent = parents[0].name
         item = model.Session.query(GA_Publisher).\
@@ -275,7 +335,7 @@
             filter(GA_Publisher.publisher_name==publisher.name).first()
         if item:
             item.views = views
-            item.visitors = visitors
+            item.visits = visits
             item.publisher_name = publisher.name
             item.toplevel = publisher in toplevel
             item.subpublishercount = subpub
@@ -287,7 +347,7 @@
                      'period_name': period_name,
                      'publisher_name': publisher.name,
                      'views': views,
-                     'visitors': visitors,
+                     'visits': visits,
                      'toplevel': publisher in toplevel,
                      'subpublishercount': subpub,
                      'parent': parent
@@ -297,7 +357,7 @@
 
 
 def update_publisher(period_name, pub, part=''):
-    views,visitors,subpub = 0, 0, 0
+    views,visits,subpub = 0, 0, 0
     for publisher in go_down_tree(pub):
         subpub = subpub + 1
         items = model.Session.query(GA_Url).\
@@ -305,9 +365,9 @@
                 filter(GA_Url.department_id==publisher.name).all()
         for item in items:
             views = views + int(item.pageviews)
-            visitors = visitors + int(item.visitors)
-
-    return views, visitors, (subpub-1)
+            visits = visits + int(item.visits)
+
+    return views, visits, (subpub-1)
 
 
 def get_top_level():
@@ -317,15 +377,12 @@
                      model.Member.table_name == 'group' and \
                      model.Member.state == 'active').\
            filter(model.Member.id==None).\
-           filter(model.Group.type=='publisher').\
+           filter(model.Group.type=='organization').\
            order_by(model.Group.name).all()
 
 def get_children(publisher):
-    '''Finds child publishers for the given publisher (object). (Not recursive)'''
-    from ckan.model.group import HIERARCHY_CTE
-    return model.Session.query(model.Group).\
-           from_statement(HIERARCHY_CTE).params(id=publisher.id, type='publisher').\
-           all()
+    '''Finds child publishers for the given publisher (object). (Not recursive i.e. returns one level)'''
+    return publisher.get_children_groups(type='organization')
 
 def go_down_tree(publisher):
     '''Provided with a publisher object, it walks down the hierarchy and yields each publisher,
@@ -342,8 +399,39 @@
     '''
     for object_type in (GA_Url, GA_Stat, GA_Publisher, GA_ReferralStat):
         q = model.Session.query(object_type)
-        if period_name != 'all':
+        if period_name != 'All':
             q = q.filter_by(period_name=period_name)
         q.delete()
-    model.Session.commit()
-
+    model.repo.commit_and_remove()
+
+def get_score_for_dataset(dataset_name):
+    '''
+    Returns a "current popularity" score for a dataset,
+    based on how many views it has had recently.
+    '''
+    import datetime
+    now = datetime.datetime.now()
+    last_month = now - datetime.timedelta(days=30)
+    period_names = ['%s-%02d' % (last_month.year, last_month.month),
+                    '%s-%02d' % (now.year, now.month),
+                    ]
+
+    score = 0
+    for period_name in period_names:
+        score /= 2 # previous periods are discounted by 50%
+        entry = model.Session.query(GA_Url)\
+                .filter(GA_Url.period_name==period_name)\
+                .filter(GA_Url.package_id==dataset_name).first()
+        # score
+        if entry:
+            views = float(entry.pageviews)
+            if entry.period_complete_day:
+                views_per_day = views / entry.period_complete_day
+            else:
+                views_per_day = views / 15 # guess
+            score += views_per_day
+
+    score = int(score * 100)
+    log.debug('Popularity %s: %s', score, dataset_name)
+    return score
+