Remove prettyprinting unless DEBUG is set in env and add a new
[ckanext-ga-report.git] / ckanext / ga_report / download_analytics.py
blob:a/ckanext/ga_report/download_analytics.py -> blob:b/ckanext/ga_report/download_analytics.py
--- a/ckanext/ga_report/download_analytics.py
+++ b/ckanext/ga_report/download_analytics.py
@@ -85,18 +85,27 @@
                      self.get_full_period_name(period_name, period_complete_day),
                      start_date.strftime('%Y %m %d'),
                      end_date.strftime('%Y %m %d'))
-            data = self.download(start_date, end_date)
-            log.info('Storing Analytics for period "%s"',
+
+            data = self.download(start_date, end_date, '~/dataset/[a-z0-9-_]+')
+            log.info('Storing Dataset Analytics for period "%s"',
                      self.get_full_period_name(period_name, period_complete_day))
-            self.store(period_name, period_complete_day, data)
-
-
-    def download(self, start_date, end_date):
+            self.store(period_name, period_complete_day, data, )
+
+            data = self.download(start_date, end_date, '~/publisher/[a-z0-9-_]+')
+            log.info('Storing Publisher Analytics for period "%s"',
+                     self.get_full_period_name(period_name, period_complete_day))
+            self.store(period_name, period_complete_day, data,)
+
+            ga_model.update_publisher_stats(period_name) # about 30 seconds.
+            self.sitewide_stats( period_name )
+
+
+    def download(self, start_date, end_date, path='~/dataset/[a-z0-9-_]+'):
         '''Get data from GA for a given time period'''
         start_date = start_date.strftime('%Y-%m-%d')
         end_date = end_date.strftime('%Y-%m-%d')
-        query = 'ga:pagePath=~/dataset/[a-z0-9-]+$'
-        metrics = 'ga:uniquePageviews'
+        query = 'ga:pagePath=%s$' % path
+        metrics = 'ga:uniquePageviews, ga:visitors'
         sort = '-ga:uniquePageviews'
 
         # Supported query params at
@@ -118,11 +127,173 @@
 
         packages = []
         for entry in results.get('rows'):
-            (loc,size,) = entry
-            packages.append( ('http:/' + loc,size, '',) ) # Temporary hack
+            (loc,pageviews,visits) = entry
+            packages.append( ('http:/' + loc, pageviews, visits,) ) # Temporary hack
         return dict(url=packages)
 
     def store(self, period_name, period_complete_day, data):
         if 'url' in data:
             ga_model.update_url_stats(period_name, period_complete_day, data['url'])
 
+    def sitewide_stats(self, period_name):
+        '''Fetch and store every site-wide statistic for the month `period_name` ("YYYY-MM").'''
+        import calendar
+        year, month = period_name.split('-')
+        _, last_day_of_month = calendar.monthrange(int(year), int(month))
+        start_date = '%s-01' % period_name
+        end_date = '%s-%s' % (period_name, last_day_of_month)
+        log.info('Sitewide_stats for %s (%s -> %s)', period_name, start_date, end_date)
+        # Each name below is a method called as method(start_date, end_date, period_name).
+        funcs = ['_totals_stats', '_social_stats', '_os_stats',
+                 '_locale_stats', '_browser_stats', '_mobile_stats']
+        for f in funcs:
+            log.info(' + Fetching %s stats', f.split('_')[1])
+            getattr(self, f)(start_date, end_date, period_name)
+
+    def _get_results(self, result_data, f):
+        '''Return a dict mapping f(row) to the sum of row[1] over the rows of result_data.'''
+        data = {}
+        for key, value in ((f(row), row[1]) for row in result_data):
+            data[key] = data.get(key, 0) + value
+        return data
+
+    def _totals_stats(self, start_date, end_date, period_name):
+        """ Stores the site-wide "Totals" figures (pageviews, bounces etc) for the period """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows') or []  # 'rows' can be missing from the response
+        if result_data:
+            ga_model.update_sitewide_stats(period_name, "Totals", {'Total pageviews': result_data[0][0]})
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:pageviewsPerVisit,ga:bounces,ga:avgTimeOnSite,ga:percentNewVisits',
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows') or []
+        if result_data:
+            # The first row is read in the same order as the metrics requested above.
+            data = {'Pages per visit': result_data[0][0],
+                    'Bounces': result_data[0][1],
+                    'Average time on site': result_data[0][2],
+                    'Percent new visits': result_data[0][3]}
+            ga_model.update_sitewide_stats(period_name, "Totals", data)
+
+
+    def _locale_stats(self, start_date, end_date, period_name):
+        """ Stores unique pageviews totalled per language and per country """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:language,ga:country",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows') or []  # 'rows' can be missing; then both loops are no-ops
+        data = {}
+        for result in result_data:  # row = [language, country, uniquePageviews]
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Languages", data)
+        # Second pass over the same rows, this time totalled per country.
+        data = {}
+        for result in result_data:
+            data[result[1]] = data.get(result[1], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Country", data)
+
+
+    def _social_stats(self, start_date, end_date, period_name):
+        """ Finds out which social sites people are referred from """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:socialNetwork,ga:referralPath",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        # 'rows' can be missing from the response; treat that as "no data".
+        result_data = results.get('rows') or []
+        data = {}
+        for result in result_data:
+            # Each row is [socialNetwork, referralPath, uniquePageviews].
+            # Rows whose network is '(not set)' are left out of the totals.
+            if not result[0] == '(not set)':
+                data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Social sources", data)
+
+
+    def _os_stats(self, start_date, end_date, period_name):
+        """ Stores unique pageviews per operating system and per OS version """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:operatingSystem,ga:operatingSystemVersion",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows') or []  # 'rows' can be missing; then both loops are no-ops
+        data = {}
+        for result in result_data:  # row = [operatingSystem, operatingSystemVersion, uniquePageviews]
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Operating Systems", data)
+        # Per-version figures, keyed "<os> (<version>)"; stored as int like the totals above.
+        data = {}
+        for result in result_data:
+            key = "%s (%s)" % (result[0],result[1])
+            data[key] = int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Operating Systems versions", data)
+
+
+    def _browser_stats(self, start_date, end_date, period_name):
+        """ Stores unique pageviews per browser and per browser version """
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:browser,ga:browserVersion",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        result_data = results.get('rows') or []  # 'rows' can be missing; then both loops are no-ops
+        data = {}
+        for result in result_data:  # row = [browser, browserVersion, uniquePageviews]
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Browsers", data)
+        # Per-version figures, keyed "<browser> (<version>)"; stored as int like the totals above.
+        data = {}
+        for result in result_data:
+            key = "%s (%s)" % (result[0], result[1])
+            data[key] = int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Browser versions", data)
+
+
+    def _mobile_stats(self, start_date, end_date, period_name):
+        """ Stores unique pageviews per mobile brand and per mobile device """
+        # NOTE(review): the dimensions string has a space after the comma, unlike the other queries - confirm GA accepts it.
+        results = self.service.data().ga().get(
+                                 ids='ga:' + self.profile_id,
+                                 start_date=start_date,
+                                 metrics='ga:uniquePageviews',
+                                 sort='-ga:uniquePageviews',
+                                 dimensions="ga:mobileDeviceBranding, ga:mobileDeviceInfo",
+                                 max_results=10000,
+                                 end_date=end_date).execute()
+        # 'rows' can be missing from the response; fall back to an empty list so both loops are no-ops.
+        result_data = results.get('rows') or []
+        data = {}
+        for result in result_data:  # row = [mobileDeviceBranding, mobileDeviceInfo, uniquePageviews]
+            data[result[0]] = data.get(result[0], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Mobile brands", data)
+        # Same rows again, totalled per device instead of per brand.
+        data = {}
+        for result in result_data:
+            data[result[1]] = data.get(result[1], 0) + int(result[2])
+        ga_model.update_sitewide_stats(period_name, "Mobile devices", data)
+