Show top datasets cross-publisher. Drop-down for the publisher. Browser version numbers filtered on download, so you get this version in the CSV too - for privacy. single_popular_dataset now copes when not much data, and can return the figures so DGU can reskin it in its own repo. Notes about usage stats centralised to notes.html.
[ckanext-ga-report.git] / ckanext / ga_report / download_analytics.py
blob:a/ckanext/ga_report/download_analytics.py -> blob:b/ckanext/ga_report/download_analytics.py
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -3,7 +3,7 @@
 import datetime
 import collections
 from pylons import config
-
+from ga_model import _normalize_url
 import ga_model
 
 #from ga_client import GA
@@ -100,12 +100,18 @@
                      self.get_full_period_name(period_name, period_complete_day),
                      start_date.strftime('%Y %m %d'),
                      end_date.strftime('%Y %m %d'))
-            data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+')
+
+            # Clean up the entries before we run this
+            ga_model.pre_update_url_stats(period_name)
+
+            accountName = config.get('googleanalytics.account')
+
+            data = self.download(start_date, end_date, '~/%s/dataset/[a-z0-9-_]+' % accountName)
             log.info('Storing Dataset Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
             self.store(period_name, period_complete_day, data, )
 
-            data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+')
+            data = self.download(start_date, end_date, '~/%s/publisher/[a-z0-9-_]+' % accountName)
             log.info('Storing Publisher Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
             self.store(period_name, period_complete_day, data,)
@@ -114,6 +120,7 @@
             self.sitewide_stats( period_name )
 
             self.update_social_info(period_name, start_date, end_date)
+
 
     def update_social_info(self, period_name, start_date, end_date):
         start_date = start_date.strftime('%Y-%m-%d')
@@ -136,17 +143,16 @@
         data = collections.defaultdict(list)
         rows = results.get('rows',[])
         for row in rows:
-            from ga_model import _normalize_url
             data[_normalize_url(row[0])].append( (row[1], int(row[2]),) )
         ga_model.update_social(period_name, data)
 
 
-    def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'):
+    def download(self, start_date, end_date, path=None):
         '''Get data from GA for a given time period'''
         start_date = start_date.strftime('%Y-%m-%d')
         end_date = end_date.strftime('%Y-%m-%d')
         query = 'ga:pagePath=%s$' % path
-        metrics = 'ga:uniquePageviews, ga:visitors'
+        metrics = 'ga:uniquePageviews, ga:visits'
         sort = '-ga:uniquePageviews'
 
         # Supported query params at
@@ -161,15 +167,13 @@
                                  max_results=10000,
                                  end_date=end_date).execute()
 
-        if os.getenv('DEBUG'):
-            import pprint
-            pprint.pprint(results)
-            print 'Total results: %s' % results.get('totalResults')
-
         packages = []
         for entry in results.get('rows'):
             (loc,pageviews,visits) = entry
-            packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack
+            url = _normalize_url('http:/' + loc)
+            if not url.startswith('/dataset/') and not url.startswith('/publisher/'):
+                continue
+            packages.append( (url, pageviews, visits,) ) # Temporary hack
         return dict(url=packages)
 
     def store(self, period_name, period_complete_day, data):
@@ -213,18 +217,37 @@
         results = self.service.data().ga().get(
                                  ids='ga:' + self.profile_id,
                                  start_date=start_date,
-                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits,ga:visitors',
+                                 metrics='ga:pageviewsPerVisit,ga:avgTimeOnSite,ga:percentNewVisits,ga:visits',
                                  max_results=10000,
                                  end_date=end_date).execute()
         result_data = results.get('rows')
         data = {
             'Pages per visit': result_data[0][0],
-            'Bounces': result_data[0][1],
-            'Average time on site': result_data[0][2],
-            'New visits': result_data[0][3],
-            'Total visits': result_data[0][4],
+            'Average time on site': result_data[0][1],
+            'New visits': result_data[0][2],
+            'Total visits': result_data[0][3],
         }
         ga_model.update_sitewide_stats(period_name, "Totals", data)
+
+        # Bounces from /data. This url is specified in configuration because
+        # for DGU we don't want /.
+        path = config.get('ga-report.bounce_url','/')
+        print path
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 filters='ga:pagePath=~%s$' % (path,),
+                                 start_date=start_date,
+                                 metrics='ga:bounces,ga:uniquePageviews',
+                                 dimensions='ga:pagePath',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows')
+        for results in result_data:
+            if results[0] == path:
+                bounce, total = [float(x) for x in results[1:]]
+                pct = 100 * bounce/total
+                print "%d bounces from %d total == %s" % (bounce, total, pct)
+                ga_model.update_sitewide_stats(period_name, "Totals", {'Bounces': pct})
 
 
     def _locale_stats(self, start_date, end_date, period_name):