Remove prettyprinting unless DEBUG is set in env and add a new
[ckanext-ga-report.git] / ckanext / ga_report / ga_model.py
blob:a/ckanext/ga_report/ga_model.py -> blob:b/ckanext/ga_report/ga_model.py
import re import re
import uuid import uuid
   
from sqlalchemy import Table, Column, MetaData from sqlalchemy import Table, Column, MetaData, ForeignKey
from sqlalchemy import types from sqlalchemy import types
from sqlalchemy.sql import select from sqlalchemy.sql import select
from sqlalchemy.orm import mapper from sqlalchemy.orm import mapper, relation
from sqlalchemy import func from sqlalchemy import func
   
import ckan.model as model import ckan.model as model
from ckan.lib.base import * from ckan.lib.base import *
   
  log = __import__('logging').getLogger(__name__)
   
def make_uuid(): def make_uuid():
return unicode(uuid.uuid4()) return unicode(uuid.uuid4())
   
  metadata = MetaData()
   
class GA_Url(object): class GA_Url(object):
   
def __init__(self, **kwargs): def __init__(self, **kwargs):
for k,v in kwargs.items(): for k,v in kwargs.items():
setattr(self, k, v) setattr(self, k, v)
   
   
metadata = MetaData()  
url_table = Table('ga_url', metadata, url_table = Table('ga_url', metadata,
Column('id', types.UnicodeText, primary_key=True, Column('id', types.UnicodeText, primary_key=True,
default=make_uuid), default=make_uuid),
Column('period_name', types.UnicodeText), Column('period_name', types.UnicodeText),
Column('period_complete_day', types.Integer), Column('period_complete_day', types.Integer),
Column('metric', types.UnicodeText), Column('pageviews', types.UnicodeText),
Column('value', types.UnicodeText), Column('visits', types.UnicodeText),
Column('url', types.UnicodeText), Column('url', types.UnicodeText),
Column('department_id', types.UnicodeText), Column('department_id', types.UnicodeText),
  Column('package_id', types.UnicodeText),
) )
mapper(GA_Url, url_table) mapper(GA_Url, url_table)
   
   
  class GA_Stat(object):
   
  def __init__(self, **kwargs):
  for k,v in kwargs.items():
  setattr(self, k, v)
   
  stat_table = Table('ga_stat', metadata,
  Column('id', types.UnicodeText, primary_key=True,
  default=make_uuid),
  Column('period_name', types.UnicodeText),
  Column('period_complete_day', types.UnicodeText),
  Column('stat_name', types.UnicodeText),
  Column('key', types.UnicodeText),
  Column('value', types.UnicodeText), )
  mapper(GA_Stat, stat_table)
   
   
  class GA_Publisher(object):
   
  def __init__(self, **kwargs):
  for k,v in kwargs.items():
  setattr(self, k, v)
   
  pub_table = Table('ga_publisher', metadata,
  Column('id', types.UnicodeText, primary_key=True,
  default=make_uuid),
  Column('period_name', types.UnicodeText),
  Column('publisher_name', types.UnicodeText),
  Column('views', types.UnicodeText),
  Column('visits', types.UnicodeText),
  Column('toplevel', types.Boolean, default=False),
  Column('subpublishercount', types.Integer, default=0),
  Column('parent', types.UnicodeText),
  )
  mapper(GA_Publisher, pub_table)
   
   
  class GA_ReferralStat(object):
   
  def __init__(self, **kwargs):
  for k,v in kwargs.items():
  setattr(self, k, v)
   
  referrer_table = Table('ga_referrer', metadata,
  Column('id', types.UnicodeText, primary_key=True,
  default=make_uuid),
  Column('period_name', types.UnicodeText),
  Column('source', types.UnicodeText),
  Column('url', types.UnicodeText),
  Column('count', types.Integer),
  )
  mapper(GA_ReferralStat, referrer_table)
   
   
   
def init_tables(): def init_tables():
metadata.create_all(model.meta.engine) metadata.create_all(model.meta.engine)
   
   
cached_tables = {} cached_tables = {}
   
   
def get_table(name): def get_table(name):
if name not in cached_tables: if name not in cached_tables:
meta = MetaData() meta = MetaData()
meta.reflect(bind=model.meta.engine) meta.reflect(bind=model.meta.engine)
table = meta.tables[name] table = meta.tables[name]
cached_tables[name] = table cached_tables[name] = table
return cached_tables[name] return cached_tables[name]
   
   
def _normalize_url(url): def _normalize_url(url):
'''Strip off the hostname etc. Do this before storing it. '''Strip off the hostname etc. Do this before storing it.
   
>>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
'/dataset/weekly_fuel_prices' '/dataset/weekly_fuel_prices'
''' '''
url = re.sub('https?://(www\.)?data.gov.uk', '', url) return url #'/' + '/'.join(url.split('/')[3:])
return url  
   
  def _get_package_and_publisher(url):
def _get_department_id_of_url(url):  
# e.g. /dataset/fuel_prices # e.g. /dataset/fuel_prices
# e.g. /dataset/fuel_prices/resource/e63380d4 # e.g. /dataset/fuel_prices/resource/e63380d4
dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) dataset_match = re.match('/dataset/([^/]+)(/.*)?', url)
if dataset_match: if dataset_match:
dataset_ref = dataset_match.groups()[0] dataset_ref = dataset_match.groups()[0]
dataset = model.Package.get(dataset_ref) dataset = model.Package.get(dataset_ref)
if dataset: if dataset:
publisher_groups = dataset.get_groups('publisher') publisher_groups = dataset.get_groups('organization')
if publisher_groups: if publisher_groups:
return publisher_groups[0].name return dataset_ref,publisher_groups[0].name
  return dataset_ref, None
  else:
def update_url_stats(period_name, period_complete_day, url_data): publisher_match = re.match('/organization/([^/]+)(/.*)?', url)
for url, views, next_page in url_data: if publisher_match:
url = _normalize_url(url) return None, publisher_match.groups()[0]
department_id = _get_department_id_of_url(url) return None, None
   
# see if the row for this url & month is in the table already def update_sitewide_stats(period_name, stat_name, data, period_complete_day):
item = model.Session.query(GA_Url).\ for k,v in data.iteritems():
filter(GA_Url.period_name==period_name).\ item = model.Session.query(GA_Stat).\
filter(GA_Url.url==url).\ filter(GA_Stat.period_name==period_name).\
filter(GA_Url.metric == 'Total views').first() filter(GA_Stat.key==k).\
  filter(GA_Stat.stat_name==stat_name).first()
if item: if item:
item.period_name = period_complete_day = period_complete_day item.period_name = period_name
item.value = views item.key = k
item.department_id = department_id item.value = v
  item.period_complete_day = period_complete_day
model.Session.add(item) model.Session.add(item)
else: else:
# create the row # create the row
values = {'id': make_uuid(), values = {'id': make_uuid(),
  'period_name': period_name,
  'period_complete_day': period_complete_day,
  'key': k,
  'value': v,
  'stat_name': stat_name
  }
  model.Session.add(GA_Stat(**values))
  model.Session.commit()
   
   
  def pre_update_url_stats(period_name):
  q = model.Session.query(GA_Url).\
  filter(GA_Url.period_name==period_name)
  log.debug("Deleting %d '%s' records" % (q.count(), period_name))
  q.delete()
   
  q = model.Session.query(GA_Url).\
  filter(GA_Url.period_name == 'All')
  log.debug("Deleting %d 'All' records..." % q.count())
  q.delete()
   
  model.Session.flush()
  model.Session.commit()
  model.repo.commit_and_remove()
  log.debug('...done')
   
  def post_update_url_stats():
   
  """ Check the distinct url field in ga_url and make sure
  it has an All record. If not then create one.
   
  After running this then every URL should have an All
  record regardless of whether the URL has an entry for
  the month being currently processed.
  """
  log.debug('Post-processing "All" records...')
  query = """select url, pageviews::int, visits::int
  from ga_url
  where url not in (select url from ga_url where period_name ='All')"""
  connection = model.Session.connection()
  res = connection.execute(query)
   
  views, visits = {}, {}
  # url, views, visits
  for row in res:
  views[row[0]] = views.get(row[0], 0) + row[1]
  visits[row[0]] = visits.get(row[0], 0) + row[2]
   
  progress_total = len(views.keys())
  progress_count = 0
  for key in views.keys():
  progress_count += 1
  if progress_count % 100 == 0:
  log.debug('.. %d/%d done so far', progress_count, progress_total)
   
  package, publisher = _get_package_and_publisher(key)
   
  values = {'id': make_uuid(),
  'period_name': "All",
  'period_complete_day': 0,
  'url': key,
  'pageviews': views[key],
  'visits': visits[key],
  'department_id': publisher,
  'package_id': package
  }
  model.Session.add(GA_Url(**values))
  model.Session.commit()
  log.debug('..done')
   
   
  def update_url_stats(period_name, period_complete_day, url_data):
  '''
  Given a list of urls and number of hits for each during a given period,
  stores them in GA_Url under the period and recalculates the totals for
  the 'All' period.
  '''
  progress_total = len(url_data)
  progress_count = 0
  for url, views, visits in url_data:
  progress_count += 1
  if progress_count % 100 == 0:
  log.debug('.. %d/%d done so far', progress_count, progress_total)
   
  package, publisher = _get_package_and_publisher(url)
   
  item = model.Session.query(GA_Url).\
  filter(GA_Url.period_name==period_name).\
  filter(GA_Url.url==url).first()
  if item:
  item.pageviews = item.pageviews + views
  item.visits = item.visits + visits
  if not item.package_id:
  item.package_id = package
  if not item.department_id:
  item.department_id = publisher
  model.Session.add(item)
  else:
  values = {'id': make_uuid(),
'period_name': period_name, 'period_name': period_name,
'period_complete_day': period_complete_day, 'period_complete_day': period_complete_day,
'url': url, 'url': url,
'value': views, 'pageviews': views,
'metric': 'Total views', 'visits': visits,
'department_id': department_id 'department_id': publisher,
  'package_id': package
} }
model.Session.add(GA_Url(**values)) model.Session.add(GA_Url(**values))
model.Session.commit() model.Session.commit()
   
  if package:
  old_pageviews, old_visits = 0, 0
  old = model.Session.query(GA_Url).\
  filter(GA_Url.period_name=='All').\
  filter(GA_Url.url==url).all()
  old_pageviews = sum([int(o.pageviews) for o in old])
  old_visits = sum([int(o.visits) for o in old])
   
  entries = model.Session.query(GA_Url).\
  filter(GA_Url.period_name!='All').\
  filter(GA_Url.url==url).all()
  values = {'id': make_uuid(),
  'period_name': 'All',
  'period_complete_day': 0,
  'url': url,
  'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews),
  'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits),
  'department_id': publisher,
  'package_id': package
  }
   
  model.Session.add(GA_Url(**values))
  model.Session.commit()
   
   
   
   
  def update_social(period_name, data):
  # Clean up first.
  model.Session.query(GA_ReferralStat).\
  filter(GA_ReferralStat.period_name==period_name).delete()
   
  for url,data in data.iteritems():
  for entry in data:
  source = entry[0]
  count = entry[1]
   
  item = model.Session.query(GA_ReferralStat).\
  filter(GA_ReferralStat.period_name==period_name).\
  filter(GA_ReferralStat.source==source).\
  filter(GA_ReferralStat.url==url).first()
  if item:
  item.count = item.count + count
  model.Session.add(item)
  else:
  # create the row
  values = {'id': make_uuid(),
  'period_name': period_name,
  'source': source,
  'url': url,
  'count': count,
  }
  model.Session.add(GA_ReferralStat(**values))
  model.Session.commit()
   
  def update_publisher_stats(period_name):
  """
  Updates the publisher stats from the data retrieved for /dataset/*
  and /publisher/*. Will run against each dataset and generates the
  totals for the entire tree beneath each publisher.
  """
  toplevel = get_top_level()
  publishers = model.Session.query(model.Group).\
  filter(model.Group.type=='organization').\
  filter(model.Group.state=='active').all()
  for publisher in publishers:
  views, visits, subpub = update_publisher(period_name, publisher, publisher.name)
  parent, parents = '', publisher.get_parent_groups(type='organization')
  if parents:
  parent = parents[0].name
  item = model.Session.query(GA_Publisher).\
  filter(GA_Publisher.period_name==period_name).\
  filter(GA_Publisher.publisher_name==publisher.name).first()
  if item:
  item.views = views
  item.visits = visits
  item.publisher_name = publisher.name
  item.toplevel = publisher in toplevel
  item.subpublishercount = subpub
  item.parent = parent
  model.Session.add(item)
  else:
  # create the row
  values = {'id': make_uuid(),
  'period_name': period_name,
  'publisher_name': publisher.name,
  'views': views,
  'visits': visits,
  'toplevel': publisher in toplevel,
  'subpublishercount': subpub,
  'parent': parent
  }
  model.Session.add(GA_Publisher(**values))
  model.Session.commit()
   
   
  def update_publisher(period_name, pub, part=''):
  vie