import re
import uuid

from sqlalchemy import Table, Column, MetaData
from sqlalchemy import and_
from sqlalchemy import func
from sqlalchemy import types
from sqlalchemy.orm import mapper
from sqlalchemy.sql import select

import ckan.model as model
from ckan.model.types import JsonType
from ckan.lib.base import *
|
|
|
|
def make_uuid():
    '''Return a fresh random UUID as a unicode string.

    Used as the `default` for the ga_* tables' primary key columns.
    '''
    return unicode(uuid.uuid4())
|
|
|
# Module-level MetaData that the ga_* tables defined below are attached to.
metadata = MetaData()
|
|
|
|
|
|
|
class GA_Url(object):
    '''Mapped row of the ga_url table: per-URL analytics figures for a
    reporting period. Attributes are set straight from the kwargs.'''

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)


url_table = Table('ga_url', metadata,
                  Column('id', types.UnicodeText, primary_key=True,
                         default=make_uuid),
                  Column('period_name', types.UnicodeText),
                  Column('period_complete_day', types.Integer),
                  Column('pageviews', types.UnicodeText),
                  Column('visitors', types.UnicodeText),
                  Column('url', types.UnicodeText),
                  Column('department_id', types.UnicodeText),
                  )
mapper(GA_Url, url_table)
|
|
|
|
|
class GA_Stat(object):
    '''Mapped row of the ga_stat table: one site-wide statistic value
    (keyed by stat_name/key) for a reporting period.'''

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)


stat_table = Table('ga_stat', metadata,
                   Column('id', types.UnicodeText, primary_key=True,
                          default=make_uuid),
                   Column('period_name', types.UnicodeText),
                   Column('stat_name', types.UnicodeText),
                   Column('key', types.UnicodeText),
                   Column('value', types.UnicodeText),
                   )
mapper(GA_Stat, stat_table)
|
|
|
|
|
class GA_Publisher(object):
    '''Mapped row of the ga_publisher table: aggregated analytics totals
    for one publisher in a reporting period.'''

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)


pub_table = Table('ga_publisher', metadata,
                  Column('id', types.UnicodeText, primary_key=True,
                         default=make_uuid),
                  Column('period_name', types.UnicodeText),
                  Column('publisher_name', types.UnicodeText),
                  Column('views', types.UnicodeText),
                  Column('visitors', types.UnicodeText),
                  Column('toplevel', types.Boolean, default=False),
                  Column('subpublishercount', types.Integer, default=0),
                  Column('parent', types.UnicodeText),
                  )
mapper(GA_Publisher, pub_table)
|
|
|
|
|
class GA_ReferralStat(object):
    '''Mapped row of the ga_referrer table: referral counts per
    (source, url) for a reporting period.'''

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)


referrer_table = Table('ga_referrer', metadata,
                       Column('id', types.UnicodeText, primary_key=True,
                              default=make_uuid),
                       Column('period_name', types.UnicodeText),
                       Column('source', types.UnicodeText),
                       Column('url', types.UnicodeText),
                       Column('count', types.Integer),
                       )
mapper(GA_ReferralStat, referrer_table)
|
|
|
|
|
|
def init_tables():
    '''Create the ga_* tables in the CKAN database if they do not exist.

    Uses the module-level `metadata` that url_table, stat_table, pub_table
    and referrer_table are attached to. (An earlier version shadowed it
    with a fresh, empty MetaData and a stale table definition, so none of
    the mapped tables were actually created.)
    '''
    metadata.create_all(model.meta.engine)
|
|
|
|
# Cache of reflected Table objects keyed by table name; filled lazily by
# get_table().
cached_tables = {}
|
|
|
|
def get_table(name):
    '''Return the SQLAlchemy Table called `name`.

    Reflects the whole database schema on first use and caches every
    table in `cached_tables`, so later lookups are dictionary hits.
    Raises KeyError if no such table exists in the database.
    '''
    if name not in cached_tables:
        meta = MetaData()
        meta.reflect(bind=model.meta.engine)
        cached_tables[name] = meta.tables[name]
    return cached_tables[name]
|
|
|
|
def _normalize_url(url): |
def _normalize_url(url): |
'''Strip off the hostname etc. Do this before storing it. |
'''Strip off the hostname etc. Do this before storing it. |
|
|
>>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') |
>>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') |
'/dataset/weekly_fuel_prices' |
'/dataset/weekly_fuel_prices' |
''' |
''' |
url = re.sub('https?://(www\.)?data.gov.uk', '', url) |
# Deliberately leaving a / |
return url |
url = url.replace('http:/','') |
|
return '/' + '/'.join(url.split('/')[2:]) |
|
|
|
|
def _get_department_id_of_url(url): |
def _get_department_id_of_url(url): |
# e.g. /dataset/fuel_prices |
# e.g. /dataset/fuel_prices |
# e.g. /dataset/fuel_prices/resource/e63380d4 |
# e.g. /dataset/fuel_prices/resource/e63380d4 |
dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) |
dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) |
if dataset_match: |
if dataset_match: |
dataset_ref = dataset_match.groups()[0] |
dataset_ref = dataset_match.groups()[0] |
dataset = model.Package.get(dataset_ref) |
dataset = model.Package.get(dataset_ref) |
if dataset: |
if dataset: |
publisher_groups = dataset.get_groups('publisher') |
publisher_groups = dataset.get_groups('publisher') |
if publisher_groups: |
if publisher_groups: |
return publisher_groups[0].id |
return publisher_groups[0].name |
|
else: |
|
publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) |
|
if publisher_match: |
|
return publisher_match.groups()[0] |
|
|
|
|
|
def update_sitewide_stats(period_name, stat_name, data):
    '''Upsert one GA_Stat row per entry of `data` (a key -> value mapping)
    for the given period and statistic name, then commit.'''
    for key, value in data.iteritems():
        existing = model.Session.query(GA_Stat).\
            filter(GA_Stat.period_name==period_name).\
            filter(GA_Stat.key==key).\
            filter(GA_Stat.stat_name==stat_name).first()
        if existing is None:
            # No row yet for this period/stat/key - insert a fresh one.
            model.Session.add(GA_Stat(id=make_uuid(),
                                      period_name=period_name,
                                      key=key,
                                      value=value,
                                      stat_name=stat_name))
        else:
            existing.period_name = period_name
            existing.key = key
            existing.value = value
            model.Session.add(existing)
    model.Session.commit()
|
|
|
|
|
|
def update_url_stats(period_name, period_complete_day, url_data):
    '''Insert or update GA_Url rows for one reporting period, then commit.

    `url_data` is an iterable of (url, views, visitors) tuples. Each URL
    is normalized and attributed to a publisher before being stored.
    '''
    for url, views, visitors in url_data:
        url = _normalize_url(url)
        department_id = _get_department_id_of_url(url)

        # See if the row for this url & month is in the table already.
        item = model.Session.query(GA_Url).\
            filter(GA_Url.period_name==period_name).\
            filter(GA_Url.url==url).first()
        if item:
            # Update the existing row in place. NB period_complete_day is
            # refreshed too - the previous code dropped it on updates.
            item.period_name = period_name
            item.period_complete_day = period_complete_day
            item.pageviews = views
            item.visitors = visitors
            item.department_id = department_id
            model.Session.add(item)
        else:
            # Create the row.
            values = {'id': make_uuid(),
                      'period_name': period_name,
                      'period_complete_day': period_complete_day,
                      'url': url,
                      'pageviews': views,
                      'visitors': visitors,
                      'department_id': department_id,
                      }
            model.Session.add(GA_Url(**values))
    model.Session.commit()
|
|
|
|
|
def update_social(period_name, data):
    '''Replace the referral (social) stats for `period_name`, then commit.

    `data` maps url -> list of (source, count) entries.
    '''
    # Wipe the period's existing rows first.
    model.Session.query(GA_ReferralStat).\
        filter(GA_ReferralStat.period_name==period_name).delete()

    for url, entries in data.iteritems():
        for entry in entries:
            source, count = entry[0], entry[1]

            row = model.Session.query(GA_ReferralStat).\
                filter(GA_ReferralStat.period_name==period_name).\
                filter(GA_ReferralStat.source==source).\
                filter(GA_ReferralStat.url==url).first()
            if row:
                # Same source+url seen again within this batch - accumulate.
                row.count = row.count + count
                model.Session.add(row)
            else:
                model.Session.add(GA_ReferralStat(id=make_uuid(),
                                                  period_name=period_name,
                                                  source=source,
                                                  url=url,
                                                  count=count))
    model.Session.commit()
|
|
|
def update_publisher_stats(period_name):
    """
    Updates the publisher stats from the data retrieved for /dataset/*
    and /publisher/*. Runs against each active publisher and generates
    the totals for the entire tree beneath it, then commits.
    """
    top_level = get_top_level()
    publishers = model.Session.query(model.Group).\
        filter(model.Group.type=='publisher').\
        filter(model.Group.state=='active').all()
    for publisher in publishers:
        views, visitors, subpub = update_publisher(period_name, publisher,
                                                   publisher.name)
        parents = publisher.get_groups('publisher')
        parent = parents[0].name if parents else ''

        item = model.Session.query(GA_Publisher).\
            filter(GA_Publisher.period_name==period_name).\
            filter(GA_Publisher.publisher_name==publisher.name).first()
        if item:
            item.views = views
            item.visitors = visitors
            item.publisher_name = publisher.name
            item.toplevel = publisher in top_level
            item.subpublishercount = subpub
            item.parent = parent
            model.Session.add(item)
        else:
            # No row for this period/publisher yet - create it.
            model.Session.add(GA_Publisher(id=make_uuid(),
                                           period_name=period_name,
                                           publisher_name=publisher.name,
                                           views=views,
                                           visitors=visitors,
                                           toplevel=publisher in top_level,
                                           subpublishercount=subpub,
                                           parent=parent))
    model.Session.commit()
|
|
|
|
|
def update_publisher(period_name, pub, part=''):
    '''Sum the stored GA_Url figures over `pub` and every publisher
    beneath it in the hierarchy.

    Returns (views, visitors, sub_publisher_count) where the count
    excludes `pub` itself.
    '''
    views = 0
    visitors = 0
    descendants = 0
    for publisher in go_down_tree(pub):
        descendants += 1
        rows = model.Session.query(GA_Url).\
            filter(GA_Url.period_name==period_name).\
            filter(GA_Url.department_id==publisher.name).all()
        for row in rows:
            # pageviews/visitors are stored as text columns - coerce.
            views += int(row.pageviews)
            visitors += int(row.visitors)

    return views, visitors, descendants - 1
|
|
|
|
|
def get_top_level():
    '''Returns the top level publishers (groups with no parent membership).'''
    # NB the previous code joined the three conditions with Python `and`,
    # which does not build a SQL AND - it evaluates to a single operand -
    # so the join condition must be combined with and_().
    return model.Session.query(model.Group).\
           outerjoin(model.Member,
                     and_(model.Member.table_id == model.Group.id,
                          model.Member.table_name == 'group',
                          model.Member.state == 'active')).\
           filter(model.Member.id==None).\
           filter(model.Group.type=='publisher').\
           order_by(model.Group.name).all()
|
|
|
def get_children(publisher):
    '''Finds child publishers for the given publisher (object). (Not recursive)'''
    from ckan.model.group import HIERARCHY_CTE
    query = model.Session.query(model.Group).\
        from_statement(HIERARCHY_CTE).params(id=publisher.id,
                                             type='publisher')
    return query.all()
|
|
|
def go_down_tree(publisher):
    '''Walk the publisher hierarchy depth-first, yielding each publisher
    encountered - starting with (and including) the one supplied.'''
    yield publisher
    for child in get_children(publisher):
        for descendant in go_down_tree(child):
            yield descendant
|
|
|
def delete(period_name):
    '''
    Deletes table data for the specified period, or specify 'all'
    for all periods.
    '''
    for model_class in (GA_Url, GA_Stat, GA_Publisher, GA_ReferralStat):
        query = model.Session.query(model_class)
        if period_name != 'all':
            query = query.filter_by(period_name=period_name)
        query.delete()
    model.Session.commit()
|
|