Initial iteration
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
*.py[co]
+*.py~
+.gitignore
# Packages
*.egg
--- a/README.md
+++ /dev/null
@@ -1,4 +1,1 @@
-ckanext-ga-report
-=================
-For creating detailed reports of CKAN analytics, sliced by group
--- /dev/null
+++ b/README.rst
@@ -1,1 +1,63 @@
+ckanext-ga-report
+=================
+**Status:** Development
+
+**CKAN Version:** 1.7.1+
+
+
+Overview
+--------
+
+For creating detailed reports of CKAN analytics, including totals per group.
+
+Whereas ckanext-googleanalytics focusses on providing page view stats for a recent period and for all time (aimed at end users), ckanext-ga-report is more interested in building regular periodic reports (more for site managers to monitor).
+
+Contents of this extension:
+
+ * Use the CLI tool to download Google Analytics data for each time period into this extension's database tables
+
+ * Users can view the data as web page reports
+
+
+Installation
+------------
+
+1. Activate your CKAN python environment and install this extension's software::
+
+ $ pyenv/bin/activate
+ $ pip install -e git+https://github.com/okfn/ckanext-ga-report.git#egg=ckanext-ga-report
+
+2. Ensure your development.ini (or similar) contains the info about your Google Analytics account and configuration::
+
+ googleanalytics.id = UA-1010101-1
+ googleanalytics.username = googleaccount@gmail.com
+ googleanalytics.password = googlepassword
+ ga-report.period = monthly
+
+ Note that your password will be readable by system administrators on your server. Rather than use sensitive account details, it is suggested you give access to the GA account to a new Google account that you create just for this purpose.
+
+3. Set up this extension's database tables using a paster command. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file)::
+
+ $ paster initdb --config=../ckan/development.ini
+
+4. Enable the extension in your CKAN config file by adding it to ``ckan.plugins``::
+
+ ckan.plugins = ga-report
+
+
+Tutorial
+--------
+
+Download some GA data and store it in CKAN's db. (Ensure your CKAN pyenv is still activated, run the command from ``src/ckanext-ga-report``, alter the ``--config`` option to point to your site config file)::
+
+ $ paster loadanalytics latest --config=../ckan/development.ini
+
+
+Software Licence
+================
+
+This software is developed by Cabinet Office. It is Crown Copyright and opened up under the Open Government Licence (OGL) (which is compatible with Creative Commons Attribution License).
+
+OGL terms: http://www.nationalarchives.gov.uk/doc/open-government-licence/
+
--- /dev/null
+++ b/ckanext/__init__.py
@@ -1,1 +1,8 @@
+# This is a namespace package: the module's only job is to declare
+# ``ckanext`` (``__name__``) as a namespace.
+try:
+ # First choice: register the namespace through pkg_resources.
+ import pkg_resources
+ pkg_resources.declare_namespace(__name__)
+except ImportError:
+ # pkg_resources could not be imported; fall back to the stdlib pkgutil
+ # helper, which extends this package's ``__path__`` instead.
+ import pkgutil
+ __path__ = pkgutil.extend_path(__path__, __name__)
--- /dev/null
+++ b/ckanext/ga_report/__init__.py
@@ -1,1 +1,8 @@
+# This is a namespace package: the module's only job is to declare
+# ``ckanext.ga_report`` (``__name__``) as a namespace.
+try:
+ # First choice: register the namespace through pkg_resources.
+ import pkg_resources
+ pkg_resources.declare_namespace(__name__)
+except ImportError:
+ # pkg_resources could not be imported; fall back to the stdlib pkgutil
+ # helper, which extends this package's ``__path__`` instead.
+ import pkgutil
+ __path__ = pkgutil.extend_path(__path__, __name__)
--- /dev/null
+++ b/ckanext/ga_report/command.py
@@ -1,1 +1,58 @@
+import logging
+from ckan.lib.cli import CkanCommand
+# No other CKAN imports allowed until _load_config is run, or logging is disabled
+
+class InitDB(CkanCommand):
+    """Initialise the extension's database tables
+    """
+    # The docstring above doubles as the paster help text: the full text is
+    # the usage message and its first line is the one-line summary.
+    usage = __doc__
+    summary = usage.split('\n')[0]
+    # This command takes no positional arguments.
+    min_args = 0
+    max_args = 0
+
+    def command(self):
+        # Config has to be loaded before any further CKAN import happens.
+        self._load_config()
+
+        import ckan.model as model
+        session = model.Session
+        session.remove()
+        session.configure(bind=model.meta.engine)
+        logger = logging.getLogger('ckanext.ga-report')
+
+        # Imported late for the same reason as ckan.model above.
+        import ga_model
+        ga_model.init_tables()
+        logger.info("DB tables are setup")
+
+class LoadAnalytics(CkanCommand):
+    """Get data from Google Analytics API and save it
+    in the ga_model
+
+    Usage: paster loadanalytics <time-period>
+
+    Where <time-period> is:
+        all         - data for all time
+        latest      - (default) just the 'latest' data
+        YYYY-MM-DD  - just data for all time periods going
+                      back to (and including) this date
+    """
+    # The docstring above doubles as the paster help text: the full text is
+    # the usage message and its first line is the one-line summary.
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    # At most one positional argument: the optional <time-period>.
+    max_args = 1
+    min_args = 0
+
+    def command(self):
+        """Run the download selected by the optional <time-period> argument.
+
+        Raises ValueError (from ``strptime``) when the argument is neither
+        ``all`` nor ``latest`` and is not a valid ``YYYY-MM-DD`` date.
+        """
+        # Needed for parsing a YYYY-MM-DD argument below; imported here so
+        # the method does not rely on a module-level import being present.
+        import datetime
+
+        # Config has to be loaded before any further CKAN import happens.
+        self._load_config()
+
+        from download_analytics import DownloadAnalytics
+        downloader = DownloadAnalytics()
+
+        time_period = self.args[0] if self.args else 'latest'
+        if time_period == 'all':
+            downloader.all_()
+        elif time_period == 'latest':
+            downloader.latest()
+        else:
+            since_date = datetime.datetime.strptime(time_period, '%Y-%m-%d')
+            downloader.since_date(since_date)
+
+
--- /dev/null
+++ b/ckanext/ga_report/controller.py
@@ -1,1 +1,10 @@
+import logging
+from ckan.lib.base import BaseController, c, render
+import report_model
+log = logging.getLogger('ckanext.ga-report')
+
+class GaReport(BaseController):
+    # Controller behind the analytics report pages.
+
+    def index(self):
+        # Render the report landing page from the ``index.html`` template.
+        page = render('index.html')
+        return page
+
--- /dev/null
+++ b/ckanext/ga_report/download_analytics.py
@@ -1,1 +1,116 @@
+import logging
+import datetime
+from pylons import config
+
+import ga_model
+from ga_client import GA
+
+log = logging.getLogger('ckanext.ga-report')
+
+FORMAT_MONTH = '%Y-%m'
+
+class DownloadAnalytics(object):
+    '''Downloads and stores analytics info'''
+
+    def __init__(self):
+        # Reporting granularity, read from the site config. Only 'monthly'
+        # is handled by latest() and since_date().
+        self.period = config['ga-report.period']
+
+    def all_(self):
+        '''Placeholder for downloading data for all time; does nothing yet.'''
+        pass
+
+    def latest(self):
+        '''Download and store data for the period currently in progress.
+
+        Raises NotImplementedError for any period other than 'monthly'.
+        '''
+        if self.period == 'monthly':
+            # from first of this month to today
+            now = datetime.datetime.now()
+            first_of_this_month = datetime.datetime(now.year, now.month, 1)
+            periods = ((now.strftime(FORMAT_MONTH),
+                        now.day,
+                        first_of_this_month, now),)
+        else:
+            raise NotImplementedError
+        self.download_and_store(periods)
+
+    def since_date(self, since_date):
+        '''Download and store data for every period from the one containing
+        ``since_date`` up to and including the period in progress.
+
+        :param since_date: ``datetime.datetime`` inside the earliest period
+            wanted. A date in a future month yields no periods.
+
+        Raises NotImplementedError for any period other than 'monthly'.
+        '''
+        assert isinstance(since_date, datetime.datetime)
+        periods = []  # (period_name, period_complete_day, start_date, end_date)
+        if self.period == 'monthly':
+            now = datetime.datetime.now()
+            first_of_this_month = datetime.datetime(now.year, now.month, 1)
+            first_of_the_month = datetime.datetime(since_date.year,
+                                                   since_date.month, 1)
+            # Completed months: period_complete_day is 0 and the period is
+            # named after the month it covers.
+            while first_of_the_month < first_of_this_month:
+                # 40 days past the 1st always lands inside the next month.
+                in_the_next_month = first_of_the_month + \
+                    datetime.timedelta(40)
+                first_of_next_month = datetime.datetime(
+                    in_the_next_month.year, in_the_next_month.month, 1)
+                last_of_the_month = first_of_next_month - \
+                    datetime.timedelta(1)
+                periods.append((first_of_the_month.strftime(FORMAT_MONTH), 0,
+                                first_of_the_month, last_of_the_month))
+                first_of_the_month = first_of_next_month
+            # The month in progress, complete only up to today. Skipped when
+            # since_date lies in a future month.
+            if first_of_the_month == first_of_this_month:
+                periods.append((now.strftime(FORMAT_MONTH),
+                                now.day,
+                                first_of_this_month, now))
+        else:
+            raise NotImplementedError
+        self.download_and_store(periods)
+
+    @staticmethod
+    def _ordinal(day):
+        '''Return ``day`` with its English ordinal suffix, e.g. 1 -> "1st".'''
+        if 10 <= day % 100 <= 20:
+            suffix = 'th'
+        else:
+            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
+        return '%i%s' % (day, suffix)
+
+    @staticmethod
+    def get_full_period_name(period_name, period_complete_day):
+        '''Return the display name of a period.
+
+        A falsy ``period_complete_day`` means the period is complete and the
+        bare ``period_name`` is returned; otherwise the day the data runs up
+        to is appended, e.g. "2012-10 (up to 21st)".
+        '''
+        if period_complete_day:
+            return period_name + ' (up to %s)' % \
+                DownloadAnalytics._ordinal(period_complete_day)
+        return period_name
+
+    def download_and_store(self, periods):
+        '''Download and then store the data for each period in turn.
+
+        :param periods: iterable of
+            (period_name, period_complete_day, start_date, end_date) tuples.
+        '''
+        for period_name, period_complete_day, start_date, end_date in periods:
+            full_name = self.get_full_period_name(period_name,
+                                                  period_complete_day)
+            log.info('Downloading Analytics for period "%s" (%s - %s)',
+                     full_name,
+                     start_date.strftime('%Y %m %d'),
+                     end_date.strftime('%Y %m %d'))
+            data = self.download(start_date, end_date)
+            log.info('Storing Analytics for period "%s"', full_name)
+            self.store(period_name, period_complete_day, data)
+
+    @classmethod
+    def download(cls, start_date, end_date):
+        '''Get data from GA for a given time period
+
+        Returns a dict mapping each ``ga:pagePath`` value under /dataset/
+        to its ``ga:uniquePageviews`` count as an int.
+        '''
+        start_date = start_date.strftime('%Y-%m-%d')
+        end_date = end_date.strftime('%Y-%m-%d')
+        query = 'ga:pagePath=~^/dataset/'
+        metrics = 'ga:uniquePageviews'
+        sort = '-ga:uniquePageviews'
+        packages = {}
+        for entry in GA.ga_query(query_filter=query,
+                                 from_date=start_date,
+                                 metrics=metrics,
+                                 sort=sort,
+                                 to_date=end_date):
+            for dim in entry.dimension:
+                if dim.name == "ga:pagePath":
+                    package = dim.value
+                    count = entry.get_metric(
+                        'ga:uniquePageviews').value or 0
+                    packages[package] = int(count)
+        return packages
+
+    def store(self, period_name, period_complete_day, data):
+        '''Save downloaded data for one period via ga_model.
+
+        Only the 'url' entry of ``data`` is stored; anything else is ignored.
+        '''
+        # NOTE(review): download() returns a dict keyed by page path, so the
+        # 'url' key looked up here would normally be absent -- confirm the
+        # intended shape of ``data``.
+        if 'url' in data:
+            ga_model.update_url_stats(period_name, period_complete_day,
+                                      data['url'])
+
--- /dev/null
+++ b/ckanext/ga_report/ga_model.py
@@ -1,1 +1,91 @@
+import re
+import uuid
+from sqlalchemy import Table, Column, MetaData
+from sqlalchemy import types
+from sqlalchemy.sql import select, text
+from sqlalchemy import func
+
+import ckan.model as model
+from ckan.model.types import JsonType
+from ckan.lib.base import *
+
+def make_uuid():
+    '''Return a newly generated ``uuid.uuid4()`` value as a unicode string.'''
+    new_id = uuid.uuid4()
+    return unicode(new_id)
+
+def init_tables():
+    '''Create the ``ga_url`` table on CKAN's database engine.
+
+    The column names match what ``update_url_stats`` reads and writes
+    (``url``, ``views``, ``department_id``), so rows it builds can be stored.
+    '''
+    metadata = MetaData()
+    # Constructing the Table registers it on ``metadata``; no local
+    # reference is needed.
+    Table('ga_url', metadata,
+          Column('id', types.UnicodeText, primary_key=True,
+                 default=make_uuid),
+          Column('period_name', types.UnicodeText),
+          Column('period_complete_day', types.Integer),
+          Column('url', types.UnicodeText),
+          Column('views', types.Integer),
+          Column('department_id', types.String(60)),
+          Column('next_page', JsonType),
+          )
+    metadata.create_all(model.meta.engine)
+
+
+# Reflected Table objects keyed by table name, filled in by get_table().
+cached_tables = {}
+
+
+def get_table(name):
+    '''Return the reflected table called ``name``.
+
+    The first request for a name reflects the database schema and stores the
+    table in ``cached_tables``; later requests are served from that dict.
+    Raises KeyError if the database has no such table.
+    '''
+    try:
+        return cached_tables[name]
+    except KeyError:
+        pass
+    reflected = MetaData()
+    reflected.reflect(bind=model.meta.engine)
+    cached_tables[name] = reflected.tables[name]
+    return cached_tables[name]
+
+
+def _normalize_url(url):
+    '''Strip off the scheme and data.gov.uk hostname. Do this before
+    storing it.
+
+    URLs that do not contain that hostname are returned unchanged.
+
+    >>> _normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
+    '/dataset/weekly_fuel_prices'
+    '''
+    # Dots are escaped so that only the literal hostname matches.
+    return re.sub(r'https?://(www\.)?data\.gov\.uk', '', url)
+
+def _get_department_id_of_url(url):
+    '''Return the id of the first 'publisher' group of the dataset that
+    ``url`` points at.
+
+    Returns None when the url is not a dataset url, the dataset cannot be
+    found, or the dataset has no 'publisher' groups.
+    '''
+    # e.g. /dataset/fuel_prices
+    # e.g. /dataset/fuel_prices/resource/e63380d4
+    matched = re.match('/dataset/([^/]+)(/.*)?', url)
+    if not matched:
+        return None
+    dataset = model.Package.get(matched.group(1))
+    if not dataset:
+        return None
+    publishers = dataset.get_groups('publisher')
+    if not publishers:
+        return None
+    return publishers[0].id
+
+def update_url_stats(period_name, period_complete_day, url_data):
+    '''Insert or update one ``ga_url`` row per url for the given period.
+
+    :param period_name: name of the period the figures belong to
+    :param period_complete_day: stored as-is in the column of that name
+    :param url_data: iterable of (url, views, next_page) tuples
+
+    A row is identified by (period_name, normalized url): if one exists its
+    figures are overwritten, otherwise a new row is inserted.
+    '''
+    # Local import: combines the two row-identifying conditions into a
+    # single WHERE clause.
+    from sqlalchemy import and_
+
+    table = get_table('ga_url')
+    # NOTE(review): nothing here commits the session's transaction --
+    # confirm the caller does so.
+    connection = model.Session.connection()
+    for url, views, next_page in url_data:
+        url = _normalize_url(url)
+        department_id = _get_department_id_of_url(url)
+        row_filter = and_(table.c.period_name == period_name,
+                          table.c.url == url)
+        stats = {'period_complete_day': period_complete_day,
+                 'views': views,
+                 'department_id': department_id,
+                 'next_page': next_page}
+        # see if the row for this url & month is in the table already
+        s = select([func.count(table.c.id)], row_filter)
+        count = connection.execute(s).fetchone()
+        if count and count[0]:
+            # update the row
+            connection.execute(table.update()
+                               .where(row_filter)
+                               .values(**stats))
+        else:
+            # create the row
+            values = dict(stats, period_name=period_name, url=url)
+            connection.execute(table.insert()
+                               .values(**values))
+
--- /dev/null
+++ b/ckanext/ga_report/plugin.py
@@ -1,1 +1,26 @@
+import logging
+import ckan.lib.helpers as h
+import ckan.plugins as p
+from ckan.plugins import implements, toolkit
+import gasnippet
+import commands
+import dbutil
+log = logging.getLogger('ckanext.ga-report')
+
+class GoogleAnalyticsPlugin(p.SingletonPlugin):
+    '''CKAN plugin that registers this extension's templates, public files
+    and the analytics report route.
+    '''
+    implements(p.IConfigurer, inherit=True)
+    implements(p.IRoutes, inherit=True)
+
+    def update_config(self, config):
+        '''Add this extension's ``templates`` and ``public`` directories to
+        the CKAN config.'''
+        toolkit.add_template_directory(config, 'templates')
+        toolkit.add_public_directory(config, 'public')
+
+    def after_map(self, map):
+        '''Connect the report index url to the GaReport controller and
+        return the route map.'''
+        map.connect(
+            '/data/analytics/index',
+            # Must be the importable module path (underscore, not hyphen).
+            controller='ckanext.ga_report.controller:GaReport',
+            action='index'
+        )
+        return map
+
+
+# setup.py registers the ``ga-report`` plugin entry point under the name
+# ``GaReportPlugin``; expose the class under that name as well.
+GaReportPlugin = GoogleAnalyticsPlugin
+
+
--- /dev/null
+++ b/ckanext/ga_report/report_model.py
--- /dev/null
+++ b/setup.py
@@ -1,1 +1,36 @@
+from setuptools import setup, find_packages
+import sys, os
+# Package version, passed to setup() below.
+version = '0.1'
+
+# Distribution metadata, dependencies and entry points for the
+# ckanext-ga-report extension.
+setup(
+ name='ckanext-ga-report',
+ version=version,
+ description="GA reporting for CKAN",
+ long_description="""\
+ """,
+ classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+ keywords='',
+ author='David Read',
+ author_email='david.read@hackneyworkshop.com',
+ url='',
+ license='',
+ packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+ # Both levels are declared as namespace packages.
+ namespace_packages=['ckanext', 'ckanext.ga_report'],
+ include_package_data=True,
+ zip_safe=False,
+ install_requires=[
+ 'gdata'
+ ],
+ # Registers the CKAN plugin and the two paster commands.
+ # NOTE(review): the plugin entry point names a class ``GaReportPlugin``;
+ # confirm ckanext/ga_report/plugin.py exposes that name.
+ entry_points=\
+ """
+ [ckan.plugins]
+ # Add plugins here, eg
+ ga-report=ckanext.ga_report.plugin:GaReportPlugin
+
+ [paste.paster_command]
+ loadanalytics = ckanext.ga_report.command:LoadAnalytics
+ initdb = ckanext.ga_report.command:InitDB
+ """,
+)
+