Modified model for delete() to check != 'All' instead of 'all'
[ckanext-ga-report.git] / ckanext / ga_report / ga_model.py
blob:a/ckanext/ga_report/ga_model.py -> blob:b/ckanext/ga_report/ga_model.py
--- a/ckanext/ga_report/ga_model.py
+++ b/ckanext/ga_report/ga_model.py
@@ -114,7 +114,7 @@
     >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
     '/dataset/weekly_fuel_prices'
     '''
-    return '/' + '/'.join(url.split('/')[3:])
+    return url #'/' + '/'.join(url.split('/')[3:])
 
 
 def _get_package_and_publisher(url):
@@ -125,12 +125,12 @@
         dataset_ref = dataset_match.groups()[0]
         dataset = model.Package.get(dataset_ref)
         if dataset:
-            publisher_groups = dataset.get_groups('publisher')
+            publisher_groups = dataset.get_groups('organization')
             if publisher_groups:
                 return dataset_ref,publisher_groups[0].name
         return dataset_ref, None
     else:
-        publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
+        publisher_match = re.match('/organization/([^/]+)(/.*)?', url)
         if publisher_match:
             return None, publisher_match.groups()[0]
     return None, None
@@ -161,10 +161,64 @@
 
 
 def pre_update_url_stats(period_name):
-    model.Session.query(GA_Url).\
-            filter(GA_Url.period_name==period_name).delete()
-    model.Session.query(GA_Url).\
-            filter(GA_Url.period_name=='All').delete()
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name==period_name)  # GA_Url rows stored under the given period
+    log.debug("Deleting %d '%s' records" % (q.count(), period_name))  # count is computed only for this log line
+    q.delete()
+    # The 'All' rows are deleted as well, whatever period_name was passed in.
+    q = model.Session.query(GA_Url).\
+        filter(GA_Url.period_name == 'All')
+    log.debug("Deleting %d 'All' records..." % q.count())
+    q.delete()
+    # NOTE(review): flush() + commit() immediately followed by commit_and_remove() looks redundant - confirm.
+    model.Session.flush()
+    model.Session.commit()
+    model.repo.commit_and_remove()
+    log.debug('...done')
+
+def post_update_url_stats():
+
+    """ Check the distinct url field in ga_url and make sure
+        it has an All record.  If not then create one.
+
+        After running this then every URL should have an All
+        record regardless of whether the URL has an entry for
+        the month being currently processed.
+    """
+    log.debug('Post-processing "All" records...')
+    query = """select url, pageviews::int, visits::int
+               from ga_url
+               where url not in (select url from ga_url where period_name ='All')"""
+    connection = model.Session.connection()
+    res = connection.execute(query)  # every ga_url row whose url has no 'All' row yet
+    # Sum pageviews and visits per url over all of the returned rows.
+    views, visits = {}, {}
+    # row layout: (url, pageviews, visits)
+    for row in res:
+        views[row[0]] = views.get(row[0], 0) + row[1]
+        visits[row[0]] = visits.get(row[0], 0) + row[2]
+    # Add one new 'All' GA_Url row for each url collected above.
+    progress_total = len(views.keys())
+    progress_count = 0
+    for key in views.keys():
+        progress_count += 1
+        if progress_count % 100 == 0:  # log progress once per 100 urls
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
+        package, publisher = _get_package_and_publisher(key)  # either value may be None
+
+        values = {'id': make_uuid(),
+                  'period_name': "All",
+                  'period_complete_day': 0,
+                  'url': key,
+                  'pageviews': views[key],
+                  'visits': visits[key],
+                  'department_id': publisher,  # publisher name returned for this url, stored as department_id
+                  'package_id': package
+                  }
+        model.Session.add(GA_Url(**values))
+    model.Session.commit()  # one commit after all rows have been added
+    log.debug('..done')
 
 
 def update_url_stats(period_name, period_complete_day, url_data):
@@ -173,9 +227,14 @@
     stores them in GA_Url under the period and recalculates the totals for
     the 'All' period.
     '''
+    progress_total = len(url_data)
+    progress_count = 0
     for url, views, visits in url_data:
+        progress_count += 1
+        if progress_count % 100 == 0:
+            log.debug('.. %d/%d done so far', progress_count, progress_total)
+
         package, publisher = _get_package_and_publisher(url)
-
 
         item = model.Session.query(GA_Url).\
             filter(GA_Url.period_name==period_name).\
@@ -216,8 +275,8 @@
                       'period_name': 'All',
                       'period_complete_day': 0,
                       'url': url,
-                      'pageviews': sum([int(e.pageviews) for e in entries]) + old_pageviews,
-                      'visits': sum([int(e.visits or 0) for e in entries]) + old_visits,
+                      'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews),
+                      'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits),
                       'department_id': publisher,
                       'package_id': package
                      }
@@ -264,11 +323,11 @@
     """
     toplevel = get_top_level()
     publishers = model.Session.query(model.Group).\
-        filter(model.Group.type=='publisher').\
+        filter(model.Group.type=='organization').\
         filter(model.Group.state=='active').all()
     for publisher in publishers:
         views, visits, subpub = update_publisher(period_name, publisher, publisher.name)
-        parent, parents = '', publisher.get_groups('publisher')
+        parent, parents = '', publisher.get_parent_groups(type='organization')
         if parents:
             parent = parents[0].name
         item = model.Session.query(GA_Publisher).\
@@ -318,15 +377,12 @@
                      model.Member.table_name == 'group' and \
                      model.Member.state == 'active').\
            filter(model.Member.id==None).\
-           filter(model.Group.type=='publisher').\
+           filter(model.Group.type=='organization').\
            order_by(model.Group.name).all()
 
 def get_children(publisher):
-    '''Finds child publishers for the given publisher (object). (Not recursive)'''
-    from ckan.model.group import HIERARCHY_CTE
-    return model.Session.query(model.Group).\
-           from_statement(HIERARCHY_CTE).params(id=publisher.id, type='publisher').\
-           all()
+    '''Finds child publishers for the given publisher (object). (Not recursive i.e. returns one level)'''
+    return publisher.get_children_groups(type='organization')  # delegates to the publisher object, asking only for type 'organization'
 
 def go_down_tree(publisher):
     '''Provided with a publisher object, it walks down the hierarchy and yields each publisher,
@@ -346,7 +402,7 @@
         if period_name != 'All':
             q = q.filter_by(period_name=period_name)
         q.delete()
-    model.Session.commit()
+    model.repo.commit_and_remove()
 
 def get_score_for_dataset(dataset_name):
     '''