Changes to tidy up the handling of all months;
[ckanext-ga-report.git] / ckanext / ga_report / ga_model.py
blob:a/ckanext/ga_report/ga_model.py -> blob:b/ckanext/ga_report/ga_model.py
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -9,6 +9,8 @@
 
 import ckan.model as model
 from ckan.lib.base import *
+
+log = __import__('logging').getLogger(__name__)
 
 def make_uuid():
     return unicode(uuid.uuid4())
@@ -27,7 +29,7 @@
                       Column('period_name', types.UnicodeText),
                       Column('period_complete_day', types.Integer),
                       Column('pageviews', types.UnicodeText),
-                      Column('visitors', types.UnicodeText),
+                      Column('visits', types.UnicodeText),
                       Column('url', types.UnicodeText),
                       Column('department_id', types.UnicodeText),
                       Column('package_id', types.UnicodeText),
@@ -45,6 +47,7 @@
                   Column('id', types.UnicodeText, primary_key=True,
                          default=make_uuid),
                   Column('period_name', types.UnicodeText),
+                  Column('period_complete_day', types.UnicodeText),
                   Column('stat_name', types.UnicodeText),
                   Column('key', types.UnicodeText),
                   Column('value', types.UnicodeText), )
@@ -63,7 +66,7 @@
                   Column('period_name', types.UnicodeText),
                   Column('publisher_name', types.UnicodeText),
                   Column('views', types.UnicodeText),
-                  Column('visitors', types.UnicodeText),
+                  Column('visits', types.UnicodeText),
                   Column('toplevel', types.Boolean, default=False),
                   Column('subpublishercount', types.Integer, default=0),
                   Column('parent', types.UnicodeText),
@@ -111,12 +114,10 @@
     >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
     '/dataset/weekly_fuel_prices'
     '''
-    # Deliberately leaving a /
-    url = url.replace('http:/','')
-    return '/' + '/'.join(url.split('/')[2:])
-
-
-def _get_department_id_of_url(url):
+    return '/' + '/'.join(url.split('/')[3:])
+
+
+def _get_package_and_publisher(url):
     # e.g. /dataset/fuel_prices
     # e.g. /dataset/fuel_prices/resource/e63380d4
     dataset_match = re.match('/dataset/([^/]+)(/.*)?', url)
@@ -126,14 +127,15 @@
         if dataset:
             publisher_groups = dataset.get_groups('publisher')
             if publisher_groups:
-                return publisher_groups[0].name
+                return dataset_ref,publisher_groups[0].name
+        return dataset_ref, None
     else:
         publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
         if publisher_match:
-            return publisher_match.groups()[0]
-
-
-def update_sitewide_stats(period_name, stat_name, data):
+            return None, publisher_match.groups()[0]
+    return None, None
+
+def update_sitewide_stats(period_name, stat_name, data, period_complete_day):
     for k,v in data.iteritems():
         item = model.Session.query(GA_Stat).\
             filter(GA_Stat.period_name==period_name).\
@@ -143,11 +145,13 @@
             item.period_name = period_name
             item.key = k
             item.value = v
+            item.period_complete_day = period_complete_day
             model.Session.add(item)
         else:
             # create the row
             values = {'id': make_uuid(),
                      'period_name': period_name,
+                     'period_complete_day': period_complete_day,
                      'key': k,
                      'value': v,
                      'stat_name': stat_name
@@ -156,25 +160,6 @@
         model.Session.commit()
 
 
-def update_url_stat_totals(period_name):
-
-    """
-        items = model.Session.query(GA_Url).\
-            filter(GA_Url.period_name != "All").\
-            filter(GA_Url.url==url).all()
-        values = {'id': make_uuid(),
-                  'period_name': "All",
-                  'period_complete_day': "0",
-                  'url': url,
-                  'pageviews': sum([int(x.pageviews) for x in items]),
-                  'visitors': sum([int(x.visitors) for x in items]),
-                  'department_id': department_id,
-                  'package_id': package
-                 }
-        model.Session.add(GA_Url(**values))
-        model.Session.commit()
-    """
-
 def pre_update_url_stats(period_name):
     model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).delete()
@@ -183,27 +168,47 @@
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
-
-    for url, views, visitors in url_data:
-        department_id = _get_department_id_of_url(url)
-
-        package = None
-        if url.startswith('/dataset/'):
-            package = url[len('/dataset/'):]
-
-        values = {'id': make_uuid(),
-                  'period_name': period_name,
-                  'period_complete_day': period_complete_day,
-                  'url': url,
-                  'pageviews': views,
-                  'visitors': visitors,
-                  'department_id': department_id,
-                  'package_id': package
-                 }
-        model.Session.add(GA_Url(**values))
+    '''
+    Given a list of urls and number of hits for each during a given period,
+    stores them in GA_Url under the period and recalculates the totals for
+    the 'All' period.
+    '''
+    for url, views, visits in url_data:
+        package, publisher = _get_package_and_publisher(url)
+
+
+        item = model.Session.query(GA_Url).\
+            filter(GA_Url.period_name==period_name).\
+            filter(GA_Url.url==url).first()
+        if item:
+            item.pageviews = item.pageviews + views
+            item.visits = item.visits + visits
+            if not item.package_id:
+                item.package_id = package
+            if not item.department_id:
+                item.department_id = publisher
+            model.Session.add(item)
+        else:
+            values = {'id': make_uuid(),
+                      'period_name': period_name,
+                      'period_complete_day': period_complete_day,
+                      'url': url,
+                      'pageviews': views,
+                      'visits': visits,
+                      'department_id': publisher,
+                      'package_id': package
+                     }
+            model.Session.add(GA_Url(**values))
         model.Session.commit()
 
         if package:
+            old_pageviews, old_visits = 0, 0
+            old = model.Session.query(GA_Url).\
+                filter(GA_Url.period_name=='All').\
+                filter(GA_Url.url==url).all()
+            old_pageviews = sum([int(o.pageviews) for o in old])
+            old_visits = sum([int(o.visits) for o in old])
+
             entries = model.Session.query(GA_Url).\
                 filter(GA_Url.period_name!='All').\
                 filter(GA_Url.url==url).all()
@@ -211,14 +216,14 @@
                       'period_name': 'All',
                       'period_complete_day': 0,
                       'url': url,
-                      'pageviews': sum([int(e.pageviews) for e in entries]),
-                      'visitors': sum([int(e.visitors) for e in entries]),
-                      'department_id': department_id,
+                      'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews,
+                      'visits': sum([int(e.visits or 0) for e in entries]) + old_visits,
+                      'department_id': publisher,
                       'package_id': package
                      }
+
             model.Session.add(GA_Url(**values))
             model.Session.commit()
-
 
 
 
@@ -262,7 +267,7 @@
         filter(model.Group.type=='publisher').\
         filter(model.Group.state=='active').all()
     for publisher in publishers:
-        views, visitors, subpub = update_publisher(period_name, publisher, publisher.name)
+        views, visits, subpub = update_publisher(period_name, publisher, publisher.name)
         parent, parents = '', publisher.get_groups('publisher')
         if parents:
             parent = parents[0].name
@@ -271,7 +276,7 @@
             filter(GA_Publisher.publisher_name==publisher.name).first()
         if item:
             item.views = views
-            item.visitors = visitors
+            item.visits = visits
             item.publisher_name = publisher.name
             item.toplevel = publisher in toplevel
             item.subpublishercount = subpub
@@ -283,7 +288,7 @@
                      'period_name': period_name,
                      'publisher_name': publisher.name,
                      'views': views,
-                     'visitors': visitors,
+                     'visits': visits,
                      'toplevel': publisher in toplevel,
                      'subpublishercount': subpub,
                      'parent': parent
@@ -293,7 +298,7 @@
 
 
 def update_publisher(period_name, pub, part=''):
-    views,visitors,subpub = 0, 0, 0
+    views,visits,subpub = 0, 0, 0
     for publisher in go_down_tree(pub):
         subpub = subpub + 1
         items = model.Session.query(GA_Url).\
@@ -301,9 +306,9 @@
                 filter(GA_Url.department_id==publisher.name).all()
         for item in items:
             views = views + int(item.pageviews)
-            visitors = visitors + int(item.visitors)
-
-    return views, visitors, (subpub-1)
+            visits = visits + int(item.visits)
+
+    return views, visits, (subpub-1)
 
 
 def get_top_level():
@@ -343,3 +348,34 @@
         q.delete()
     model.Session.commit()
 
+def get_score_for_dataset(dataset_name):
+    '''
+    Returns an integer "current popularity" score for a dataset: pageviews
+    per day this month plus half of last month's rate, multiplied by 100.
+    '''
+    import datetime
+    today = datetime.date.today()
+    # Day 1 minus one day is always last month; a 30-day step can miss it.
+    last_month = today.replace(day=1) - datetime.timedelta(days=1)
+    period_names = ['%s-%02d' % (last_month.year, last_month.month),
+                    '%s-%02d' % (today.year, today.month)]
+
+    score = 0
+    for period_name in period_names:
+        score /= 2 # previous periods are discounted by 50%
+        entry = model.Session.query(GA_Url)\
+                .filter(GA_Url.period_name==period_name)\
+                .filter(GA_Url.package_id==dataset_name).first()
+        # Only the first matching row is scored; a missing row adds nothing.
+        if entry:
+            views = float(entry.pageviews)
+            if entry.period_complete_day:
+                views_per_day = views / entry.period_complete_day
+            else:
+                views_per_day = views / 15 # day count unknown; guess 15
+            score += views_per_day
+
+    score = int(score * 100)
+    log.debug('Popularity %s: %s', score, dataset_name)
+    return score
+