import re |
import re |
import uuid |
import uuid |
|
|
from sqlalchemy import Table, Column, MetaData, ForeignKey |
from sqlalchemy import Table, Column, MetaData, ForeignKey |
from sqlalchemy import types |
from sqlalchemy import types |
from sqlalchemy.sql import select |
from sqlalchemy.sql import select |
from sqlalchemy.orm import mapper, relation |
from sqlalchemy.orm import mapper, relation |
from sqlalchemy import func |
from sqlalchemy import func |
|
|
import ckan.model as model |
import ckan.model as model |
from ckan.lib.base import * |
from ckan.lib.base import * |
|
|
log = __import__('logging').getLogger(__name__) |
log = __import__('logging').getLogger(__name__) |
|
|
def make_uuid(): |
def make_uuid(): |
return unicode(uuid.uuid4()) |
return unicode(uuid.uuid4()) |
|
|
metadata = MetaData() |
metadata = MetaData() |
|
|
class GA_Url(object): |
class GA_Url(object): |
|
|
def __init__(self, **kwargs): |
def __init__(self, **kwargs): |
for k,v in kwargs.items(): |
for k,v in kwargs.items(): |
setattr(self, k, v) |
setattr(self, k, v) |
|
|
url_table = Table('ga_url', metadata, |
url_table = Table('ga_url', metadata, |
Column('id', types.UnicodeText, primary_key=True, |
Column('id', types.UnicodeText, primary_key=True, |
default=make_uuid), |
default=make_uuid), |
Column('period_name', types.UnicodeText), |
Column('period_name', types.UnicodeText), |
Column('period_complete_day', types.Integer), |
Column('period_complete_day', types.Integer), |
Column('pageviews', types.UnicodeText), |
Column('pageviews', types.UnicodeText), |
Column('visits', types.UnicodeText), |
Column('visits', types.UnicodeText), |
Column('url', types.UnicodeText), |
Column('url', types.UnicodeText), |
Column('department_id', types.UnicodeText), |
Column('department_id', types.UnicodeText), |
Column('package_id', types.UnicodeText), |
Column('package_id', types.UnicodeText), |
) |
) |
mapper(GA_Url, url_table) |
mapper(GA_Url, url_table) |
|
|
|
|
class GA_Stat(object): |
class GA_Stat(object): |
|
|
def __init__(self, **kwargs): |
def __init__(self, **kwargs): |
for k,v in kwargs.items(): |
for k,v in kwargs.items(): |
setattr(self, k, v) |
setattr(self, k, v) |
|
|
stat_table = Table('ga_stat', metadata, |
stat_table = Table('ga_stat', metadata, |
Column('id', types.UnicodeText, primary_key=True, |
Column('id', types.UnicodeText, primary_key=True, |
default=make_uuid), |
default=make_uuid), |
Column('period_name', types.UnicodeText), |
Column('period_name', types.UnicodeText), |
Column('period_complete_day', types.UnicodeText), |
Column('period_complete_day', types.UnicodeText), |
Column('stat_name', types.UnicodeText), |
Column('stat_name', types.UnicodeText), |
Column('key', types.UnicodeText), |
Column('key', types.UnicodeText), |
Column('value', types.UnicodeText), ) |
Column('value', types.UnicodeText), ) |
mapper(GA_Stat, stat_table) |
mapper(GA_Stat, stat_table) |
|
|
|
|
class GA_Publisher(object): |
class GA_Publisher(object): |
|
|
def __init__(self, **kwargs): |
def __init__(self, **kwargs): |
for k,v in kwargs.items(): |
for k,v in kwargs.items(): |
setattr(self, k, v) |
setattr(self, k, v) |
|
|
pub_table = Table('ga_publisher', metadata, |
pub_table = Table('ga_publisher', metadata, |
Column('id', types.UnicodeText, primary_key=True, |
Column('id', types.UnicodeText, primary_key=True, |
default=make_uuid), |
default=make_uuid), |
Column('period_name', types.UnicodeText), |
Column('period_name', types.UnicodeText), |
Column('publisher_name', types.UnicodeText), |
Column('publisher_name', types.UnicodeText), |
Column('views', types.UnicodeText), |
Column('views', types.UnicodeText), |
Column('visits', types.UnicodeText), |
Column('visits', types.UnicodeText), |
Column('toplevel', types.Boolean, default=False), |
Column('toplevel', types.Boolean, default=False), |
Column('subpublishercount', types.Integer, default=0), |
Column('subpublishercount', types.Integer, default=0), |
Column('parent', types.UnicodeText), |
Column('parent', types.UnicodeText), |
) |
) |
mapper(GA_Publisher, pub_table) |
mapper(GA_Publisher, pub_table) |
|
|
|
|
class GA_ReferralStat(object): |
class GA_ReferralStat(object): |
|
|
def __init__(self, **kwargs): |
def __init__(self, **kwargs): |
for k,v in kwargs.items(): |
for k,v in kwargs.items(): |
setattr(self, k, v) |
setattr(self, k, v) |
|
|
referrer_table = Table('ga_referrer', metadata, |
referrer_table = Table('ga_referrer', metadata, |
Column('id', types.UnicodeText, primary_key=True, |
Column('id', types.UnicodeText, primary_key=True, |
default=make_uuid), |
default=make_uuid), |
Column('period_name', types.UnicodeText), |
Column('period_name', types.UnicodeText), |
Column('source', types.UnicodeText), |
Column('source', types.UnicodeText), |
Column('url', types.UnicodeText), |
Column('url', types.UnicodeText), |
Column('count', types.Integer), |
Column('count', types.Integer), |
) |
) |
mapper(GA_ReferralStat, referrer_table) |
mapper(GA_ReferralStat, referrer_table) |
|
|
|
|
|
|
def init_tables(): |
def init_tables(): |
metadata.create_all(model.meta.engine) |
metadata.create_all(model.meta.engine) |
|
|
|
|
cached_tables = {} |
cached_tables = {} |
|
|
|
|
def get_table(name): |
def get_table(name): |
if name not in cached_tables: |
if name not in cached_tables: |
meta = MetaData() |
meta = MetaData() |
meta.reflect(bind=model.meta.engine) |
meta.reflect(bind=model.meta.engine) |
table = meta.tables[name] |
table = meta.tables[name] |
cached_tables[name] = table |
cached_tables[name] = table |
return cached_tables[name] |
return cached_tables[name] |
|
|
|
|
def _normalize_url(url): |
def _normalize_url(url): |
'''Strip off the hostname etc. Do this before storing it. |
'''Strip off the hostname etc. Do this before storing it. |
|
|
>>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') |
>>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices') |
'/dataset/weekly_fuel_prices' |
'/dataset/weekly_fuel_prices' |
''' |
''' |
return '/' + '/'.join(url.split('/')[3:]) |
return '/' + '/'.join(url.split('/')[3:]) |
|
|
|
|
def _get_package_and_publisher(url): |
def _get_package_and_publisher(url): |
# e.g. /dataset/fuel_prices |
# e.g. /dataset/fuel_prices |
# e.g. /dataset/fuel_prices/resource/e63380d4 |
# e.g. /dataset/fuel_prices/resource/e63380d4 |
dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) |
dataset_match = re.match('/dataset/([^/]+)(/.*)?', url) |
if dataset_match: |
if dataset_match: |
dataset_ref = dataset_match.groups()[0] |
dataset_ref = dataset_match.groups()[0] |
dataset = model.Package.get(dataset_ref) |
dataset = model.Package.get(dataset_ref) |
if dataset: |
if dataset: |
publisher_groups = dataset.get_groups('publisher') |
publisher_groups = dataset.get_groups('publisher') |
if publisher_groups: |
if publisher_groups: |
return dataset_ref,publisher_groups[0].name |
return dataset_ref,publisher_groups[0].name |
return dataset_ref, None |
return dataset_ref, None |
else: |
else: |
publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) |
publisher_match = re.match('/publisher/([^/]+)(/.*)?', url) |
if publisher_match: |
if publisher_match: |
return None, publisher_match.groups()[0] |
return None, publisher_match.groups()[0] |
return None, None |
return None, None |
|
|
def update_sitewide_stats(period_name, stat_name, data, period_complete_day): |
def update_sitewide_stats(period_name, stat_name, data, period_complete_day): |
for k,v in data.iteritems(): |
for k,v in data.iteritems(): |
item = model.Session.query(GA_Stat).\ |
item = model.Session.query(GA_Stat).\ |
filter(GA_Stat.period_name==period_name).\ |
filter(GA_Stat.period_name==period_name).\ |
filter(GA_Stat.key==k).\ |
filter(GA_Stat.key==k).\ |
filter(GA_Stat.stat_name==stat_name).first() |
filter(GA_Stat.stat_name==stat_name).first() |
if item: |
if item: |
item.period_name = period_name |
item.period_name = period_name |
item.key = k |
item.key = k |
item.value = v |
item.value = v |
item.period_complete_day = period_complete_day |
item.period_complete_day = period_complete_day |
model.Session.add(item) |
model.Session.add(item) |
else: |
else: |
# create the row |
# create the row |
values = {'id': make_uuid(), |
values = {'id': make_uuid(), |
'period_name': period_name, |
'period_name': period_name, |
'period_complete_day': period_complete_day, |
'period_complete_day': period_complete_day, |
'key': k, |
'key': k, |
'value': v, |
'value': v, |
'stat_name': stat_name |
'stat_name': stat_name |
} |
} |
model.Session.add(GA_Stat(**values)) |
model.Session.add(GA_Stat(**values)) |
model.Session.commit() |
model.Session.commit() |
|
|
|
|
def pre_update_url_stats(period_name): |
def pre_update_url_stats(period_name): |
q = model.Session.query(GA_Url).\ |
q = model.Session.query(GA_Url).\ |
filter(GA_Url.period_name==period_name) |
filter(GA_Url.period_name==period_name) |
log.debug("Deleting %d '%s' records" % (q.count(), period_name)) |
log.debug("Deleting %d '%s' records" % (q.count(), period_name)) |
q.delete() |
q.delete() |
|
|
q = model.Session.query(GA_Url).\ |
q = model.Session.query(GA_Url).\ |
filter(GA_Url.period_name == 'All') |
filter(GA_Url.period_name == 'All') |
log.debug("Deleting %d 'All' records..." % q.count()) |
log.debug("Deleting %d 'All' records..." % q.count()) |
q.delete() |
q.delete() |
|
|
model.Session.flush() |
model.Session.flush() |
model.Session.commit() |
model.Session.commit() |
model.repo.commit_and_remove() |
model.repo.commit_and_remove() |
log.debug('...done') |
log.debug('...done') |
|
|
def post_update_url_stats(): |
def post_update_url_stats(): |
|
|
""" Check the distinct url field in ga_url and make sure |
""" Check the distinct url field in ga_url and make sure |
it has an All record. If not then create one. |
it has an All record. If not then create one. |
|
|
After running this then every URL should have an All |
After running this then every URL should have an All |
record regardless of whether the URL has an entry for |
record regardless of whether the URL has an entry for |
the month being currently processed. |
the month being currently processed. |
""" |
""" |
log.debug('Post-processing "All" records...') |
log.debug('Post-processing "All" records...') |
query = """select url, pageviews::int, visits::int |
query = """select url, pageviews::int, visits::int |
from ga_url |
from ga_url |
where url not in (select url from ga_url where period_name ='All')""" |
where url not in (select url from ga_url where period_name ='All')""" |
connection = model.Session.connection() |
connection = model.Session.connection() |
res = connection.execute(query) |
res = connection.execute(query) |
|
|
views, visits = {}, {} |
views, visits = {}, {} |
# url, views, visits |
# url, views, visits |
for row in res: |
for row in res: |
views[row[0]] = views.get(row[0], 0) + row[1] |
views[row[0]] = views.get(row[0], 0) + row[1] |
visits[row[0]] = visits.get(row[0], 0) + row[2] |
visits[row[0]] = visits.get(row[0], 0) + row[2] |
|
|
progress_total = len(views.keys()) |
progress_total = len(views.keys()) |
progress_count = 0 |
progress_count = 0 |
for key in views.keys(): |
for key in views.keys(): |
progress_count += 1 |
progress_count += 1 |
if progress_count % 100 == 0: |
if progress_count % 100 == 0: |
log.debug('.. %d/%d done so far', progress_count, progress_total) |
log.debug('.. %d/%d done so far', progress_count, progress_total) |
|
|
package, publisher = _get_package_and_publisher(key) |
package, publisher = _get_package_and_publisher(key) |
|
|
values = {'id': make_uuid(), |
values = {'id': make_uuid(), |
'period_name': "All", |
'period_name': "All", |
'period_complete_day': 0, |
'period_complete_day': 0, |
'url': key, |
'url': key, |
'pageviews': views[key], |
'pageviews': views[key], |
'visits': visits[key], |
'visits': visits[key], |
'department_id': publisher, |
'department_id': publisher, |
'package_id': publisher |
'package_id': publisher |
} |
} |
model.Session.add(GA_Url(**values)) |
model.Session.add(GA_Url(**values)) |
model.Session.commit() |
model.Session.commit() |
log.debug('..done') |
log.debug('..done') |
|
|
|
|
def update_url_stats(period_name, period_complete_day, url_data): |
def update_url_stats(period_name, period_complete_day, url_data): |
''' |
''' |
Given a list of urls and number of hits for each during a given period, |
Given a list of urls and number of hits for each during a given period, |
stores them in GA_Url under the period and recalculates the totals for |
stores them in GA_Url under the period and recalculates the totals for |
the 'All' period. |
the 'All' period. |
''' |
''' |
progress_total = len(progress_data) |
progress_total = len(url_data) |
progress_count = 0 |
progress_count = 0 |
for url, views, visits in url_data: |
for url, views, visits in url_data: |
progress_count += 1 |
progress_count += 1 |
if progress_count % 100 == 0: |
if progress_count % 100 == 0: |
log.debug('.. %d/%d done so far', progress_count, progress_total) |
log.debug('.. %d/%d done so far', progress_count, progress_total) |
|
|
package, publisher = _get_package_and_publisher(url) |
package, publisher = _get_package_and_publisher(url) |
|
|
item = model.Session.query(GA_Url).\ |
item = model.Session.query(GA_Url).\ |
filter(GA_Url.period_name==period_name).\ |
filter(GA_Url.period_name==period_name).\ |
filter(GA_Url.url==url).first() |
filter(GA_Url.url==url).first() |
if item: |
if item: |
item.pageviews = item.pageviews + views |
item.pageviews = item.pageviews + views |
item.visits = item.visits + visits |
item.visits = item.visits + visits |
if not item.package_id: |
if not item.package_id: |
item.package_id = package |
item.package_id = package |
if not item.department_id: |
if not item.department_id: |
item.department_id = publisher |
item.department_id = publisher |
model.Session.add(item) |
model.Session.add(item) |
else: |
else: |
values = {'id': make_uuid(), |
values = {'id': make_uuid(), |
'period_name': period_name, |
'period_name': period_name, |
'period_complete_day': period_complete_day, |
'period_complete_day': period_complete_day, |
'url': url, |
'url': url, |
'pageviews': views, |
'pageviews': views, |
'visits': visits, |
'visits': visits, |
'department_id': publisher, |
'department_id': publisher, |
'package_id': package |
'package_id': package |
} |
} |
model.Session.add(GA_Url(**values)) |
model.Session.add(GA_Url(**values)) |
model.Session.commit() |
model.Session.commit() |
|
|
if package: |
if package: |
old_pageviews, old_visits = 0, 0 |
old_pageviews, old_visits = 0, 0 |
old = model.Session.query(GA_Url).\ |
old = model.Session.query(GA_Url).\ |
filter(GA_Url.period_name=='All').\ |
filter(GA_Url.period_name=='All').\ |
filter(GA_Url.url==url).all() |
filter(GA_Url.url==url).all() |
old_pageviews = sum([int(o.pageviews) for o in old]) |
old_pageviews = sum([int(o.pageviews) for o in old]) |
old_visits = sum([int(o.visits) for o in old]) |
old_visits = sum([int(o.visits) for o in old]) |
|
|
entries = model.Session.query(GA_Url).\ |
entries = model.Session.query(GA_Url).\ |
filter(GA_Url.period_name!='All').\ |
filter(GA_Url.period_name!='All').\ |
filter(GA_Url.url==url).all() |
filter(GA_Url.url==url).all() |
values = {'id': make_uuid(), |
values = {'id': make_uuid(), |
'period_name': 'All', |
'period_name': 'All', |
'period_complete_day': 0, |
'period_complete_day': 0, |
'url': url, |
'url': url, |
'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews), |
'pageviews': sum([int(e.pageviews) for e in entries]) + int(old_pageviews), |
'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits), |
'visits': sum([int(e.visits or 0) for e in entries]) + int(old_visits), |
'department_id': publisher, |
'department_id': publisher, |
'package_id': package |
'package_id': package |
} |
} |
|
|
model.Session.add(GA_Url(**values)) |
model.Session.add(GA_Url(**values)) |
model.Session.commit() |
model.Session.commit() |
|
|
|
|
|
|
|
|
def update_social(period_name, data): |
def update_social(period_name, data): |
# Clean up first. |
# Clean up first. |
model.Session.query(GA_ReferralStat).\ |
model.Session.query(GA_ReferralStat).\ |
filter(GA_ReferralStat.period_name==period_name).delete() |
filter(GA_ReferralStat.period_name==period_name).delete() |
|
|
for url,data in data.iteritems(): |
for url,data in data.iteritems(): |
for entry in data: |
for entry in data: |
source = entry[0] |
source = entry[0] |
count = entry[1] |
count = entry[1] |
|
|
item = model.Session.query(GA_ReferralStat).\ |
item = model.Session.query(GA_ReferralStat).\ |
filter(GA_ReferralStat.period_name==period_name).\ |
filter(GA_ReferralStat.period_name==period_name).\ |
filter(GA_ReferralStat.source==source).\ |
filter(GA_ReferralStat.source==source).\ |
filter(GA_ReferralStat.url==url).first() |
filter(GA_ReferralStat.url==url).first() |
if item: |
if item: |
item.count = item.count + count |
item.count = item.count + count |
model.Session.add(item) |
model.Session.add(item) |
else: |
else: |
# create the row |
# create the row |
values = {'id': make_uuid(), |
values = {'id': make_uuid(), |
'period_name': period_name, |
'period_name': period_name, |
'source': source, |
'source': source, |
'url': url, |
'url': url, |
'count': count, |
'count': count, |
} |
} |
model.Session.add(GA_ReferralStat(**values)) |
model.Session.add(GA_ReferralStat(**values)) |
model.Session.commit() |
model.Session.commit() |
|
|
def update_publisher_stats(period_name): |
def update_publisher_stats(period_name): |
""" |
""" |
Updates the publisher stats from the data retrieved for /dataset/* |
Updates the publisher stats from the data retrieved for /dataset/* |
and /publisher/*. Will run against each dataset and generates the |
and /publisher/*. Will run against each dataset and generates the |
totals for the entire tree beneath each publisher. |
totals for the entire tree beneath each publisher. |
""" |
""" |
toplevel = get_top_level() |
toplevel = get_top_level() |
publishers = model.Session.query(model.Group).\ |
publishers = model.Session.query(model.Group).\ |
filter(model.Group.type=='publisher').\ |
filter(model.Group.type=='publisher').\ |
filter(model.Group.state=='active').all() |
filter(model.Group.state=='active').all() |
for publisher in publishers: |
for publisher in publishers: |
views, visits, subpub = update_publisher(period_name, publisher, publisher.name) |
views, visits, subpub = update_publisher(period_name, publisher, publisher.name) |
parent, parents = '', publisher.get_groups('publisher') |
parent, parents = '', publisher.get_groups('publisher') |
if parents: |
if parents: |
parent = parents[0].name |
parent = parents[0].name |
item = model.Session.query(GA_Publisher).\ |
item = model.Session.query(GA_Publisher).\ |
filter(GA_Publisher.period_name==period_name).\ |
filter(GA_Publisher.period_name==period_name).\ |
filter(GA_Publisher.publisher_name==publisher.name).first() |
filter(GA_Publisher.publisher_name==publisher.name).first() |
if item: |
if item: |
item.views = views |
item.views = views |
item.visits = visits |
item.visits = visits |
item.publisher_name = publisher.name |
item.publisher_name = publisher.name |
item.toplevel = publisher in toplevel |
item.toplevel = publisher in toplevel |
item.subpublishercount = subpub |
item.subpublishercount = subpub |
item.parent = parent |
item.parent = parent |
model.Session.add(item) |
model.Session.add(item) |
else: |
else: |
# create the row |
# create the row |
values = {'id': make_uuid(), |
values = {'id': make_uuid(), |
'period_name': period_name, |
'period_name': period_name, |
'publisher_name': publisher.name, |
'publisher_name': publisher.name, |
'views': views, |
'views': views, |
|