Implemented further analytics by fetching more specific items.
[ckanext-ga-report.git] / ckanext / ga_report / download_analytics.py
blob:a/ckanext/ga_report/download_analytics.py -> blob:b/ckanext/ga_report/download_analytics.py
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -1,7 +1,7 @@
 import os
 import logging
 import datetime
-
+import collections
 from pylons import config
 
 import ga_model
@@ -21,8 +21,17 @@
         self.profile_id = profile_id
 
 
-    def all_(self):
-        self.since_date(datetime.datetime(2010, 1, 1))
+    def specific_month(self, date):
+        """Pass the calendar month containing `date` to download_and_store as one period."""
+        import calendar
+
+        # monthrange() returns (weekday of the 1st, number of days); only the latter is used.
+        days_in_month = calendar.monthrange(int(date.year), int(date.month))[1]
+        month_start = datetime.datetime(date.year, date.month, 1)
+        month_end = datetime.datetime(date.year, date.month, days_in_month)
+        period = (date.strftime(FORMAT_MONTH), days_in_month, month_start, month_end)
+        self.download_and_store((period,))
+
 
     def latest(self):
         if self.period == 'monthly':
@@ -37,13 +46,13 @@
         self.download_and_store(periods)
 
 
-    def since_date(self, since_date):
+    def for_date(self, for_date):
         assert isinstance(since_date, datetime.datetime)
         periods = [] # (period_name, period_complete_day, start_date, end_date)
         if self.period == 'monthly':
             first_of_the_months_until_now = []
-            year = since_date.year
-            month = since_date.month
+            year = for_date.year
+            month = for_date.month
             now = datetime.datetime.now()
             first_of_this_month = datetime.datetime(now.year, now.month, 1)
             while True:
@@ -85,7 +94,6 @@
                      self.get_full_period_name(period_name, period_complete_day),
                      start_date.strftime('%Y %m %d'),
                      end_date.strftime('%Y %m %d'))
-
             data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+')
             log.info('Storing Dataset Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
@@ -95,9 +103,36 @@
             log.info('Storing Publisher Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
             self.store(period_name, period_complete_day, data,)
-            ga_model.update_publisher_stats(period_name)
-
+
+            ga_model.update_publisher_stats(period_name) # about 30 seconds.
             self.sitewide_stats( period_name )
+
+            self.update_social_info(period_name, start_date, end_date)
+
+    def update_social_info(self, period_name, start_date, end_date):
+        """Fetch social-referral entrances and pass them to ga_model.update_social."""
+        # Imported once up front instead of on every row of the result.
+        from ga_model import _normalize_url
+        start_date = start_date.strftime('%Y-%m-%d')
+        end_date = end_date.strftime('%Y-%m-%d')
+
+        # Supported query params at
+        # https://developers.google.com/analytics/devguides/reporting/core/v3/reference
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 filters='ga:hasSocialSourceReferral=~Yes$',
+                                 start_date=start_date,
+                                 metrics='ga:entrances',
+                                 sort='-ga:entrances',
+                                 dimensions="ga:landingPagePath,ga:socialNetwork",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+
+        # row[0] and row[1] are read as the two dimensions, row[2] as the entrances count.
+        data = collections.defaultdict(list)
+        for row in results.get('rows', []):
+            data[_normalize_url(row[0])].append((row[1], int(row[2])))
+        ga_model.update_social(period_name, data)
 
 
     def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'):
@@ -105,7 +140,7 @@
         start_date = start_date.strftime('%Y-%m-%d')
         end_date = end_date.strftime('%Y-%m-%d')
         query = 'ga:pagePath=%s$' % path
-        metrics = 'ga:uniquePageviews, ga:visits'
+        metrics = 'ga:uniquePageviews, ga:visitors'
         sort = '-ga:uniquePageviews'
 
         # Supported query params at
@@ -135,7 +170,6 @@
         if 'url' in data:
             ga_model.update_url_stats(period_name, period_complete_day, data['url'])
 
-
     def sitewide_stats(self, period_name):
         import calendar
         year, month = period_name.split('-')
@@ -151,6 +185,12 @@
             print ' + Fetching %s stats' % f.split('_')[1]
             getattr(self, f)(start_date, end_date, period_name)
 
+    @staticmethod  # the signature takes no `self`, so it must not be bound to the instance
+    def _get_results(result_data, f):  # -> {f(result): sum of result[1] for that key}
+        data = {}
+        for key, result in ((f(row), row) for row in result_data):
+            data[key] = data.get(key, 0) + result[1]
+        return data
 
     def _totals_stats(self, start_date, end_date, period_name):
         """ Fetches distinct totals, total pageviews etc """
@@ -162,12 +202,12 @@
                                  max_results=10000,
                                  end_date=end_date).execute()
         result_data = results.get('rows')
-        ga_model.update_sitewide_stats(period_name, "Totals", {'Total pageviews': result_data[0][0]})
-
-        results = self.service.data().ga().get(
-                                 ids='ga:' + self.profile_id,
-                                 start_date=start_date,
-                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits',
+        ga_model.update_sitewide_stats(period_name, "Totals", {'Total page views': result_data[0][0]})
+
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits,ga:visitors',
                                  max_results=10000,
                                  end_date=end_date).execute()
         result_data = results.get('rows')
@@ -175,7 +215,8 @@
             'Pages per visit': result_data[0][0],
             'Bounces': result_data[0][1],
             'Average time on site': result_data[0][2],
-            'Percent new visits': result_data[0][3],
+            'New visits': result_data[0][3],
+            'Total visits': result_data[0][4],
         }
         ga_model.update_sitewide_stats(period_name, "Totals", data)
 
@@ -264,7 +305,7 @@
 
         data = {}
         for result in result_data:
-            key = "%s (%s)" % (result[0],result[1])
+            key = "%s (%s)" % (result[0], result[1])
             data[key] = result[2]
         ga_model.update_sitewide_stats(period_name, "Browser versions", data)