[854] Fixes unexplained failure with fetching publisher information.
[ckanext-ga-report.git] / ckanext / ga_report / ga_model.py
David Read


Ross Jones
David Read
Ross Jones
Ross Jones
David Read




David Read

David Read


Ross Jones
Ross Jones
Ross Jones
Ross Jones




Ross Jones




Ross Jones
Ross Jones
Ross Jones
Ross Jones
Ross Jones
Ross Jones


Ross Jones






Ross Jones



Ross Jones
Ross Jones





Ross Jones





Ross Jones





Ross Jones
Ross Jones


Ross Jones


Ross Jones
Ross Jones

















David Read




David Read
Ross Jones
David Read














David Read
David Read
Ross Jones
Ross Jones
David Read








Ross Jones

Ross Jones


Ross Jones

Ross Jones
Ross Jones
Ross Jones








Ross Jones
Ross Jones




Ross Jones
Ross Jones






David Read
Ross Jones
David Read



Ross Jones
David Read



Ross Jones
Ross Jones
Ross Jones
Ross Jones
David Read
Ross Jones
Ross Jones








David Read
Ross Jones











David Read

Ross Jones
David Read



Ross Jones








Ross Jones
Ross Jones


David Read
Ross Jones
Ross Jones
David Read
David Read




Ross Jones
David Read
Ross Jones
David Read


Ross Jones
David Read
Ross Jones
Ross Jones




Ross Jones
Ross Jones



Ross Jones






Ross Jones
Ross Jones



Ross Jones


Ross Jones




Ross Jones
Ross Jones
Ross Jones


Ross Jones
Ross Jones

David Read
Ross Jones

Ross Jones
Ross Jones
Ross Jones
Ross Jones
Ross Jones
Ross Jones
Ross Jones


Ross Jones
Ross Jones


























Ross Jones

Ross Jones








Ross Jones
Ross Jones
Ross Jones


Ross Jones




Ross Jones
Ross Jones
Ross Jones


Ross Jones






Ross Jones
Ross Jones


Ross Jones





Ross Jones
Ross Jones
Ross Jones

Ross Jones
Ross Jones


Ross Jones
Ross Jones
Ross Jones
Ross Jones












Ross Jones

Ross Jones







David Read







Ross Jones
David Read

Ross Jones
Ross Jones

David Read



Ross Jones

David Read


















Ross Jones




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
import re
import uuid
 
from sqlalchemy import Table, Column, MetaData, ForeignKey
from sqlalchemy import types
from sqlalchemy.sql import select
from sqlalchemy.orm import mapper, relation
from sqlalchemy import func
 
import ckan.model as model
from ckan.lib.base import *
 
# Module-level logger named after this module; the logging module is pulled
# in via __import__ so no separate top-level import statement is needed.
log = __import__('logging').getLogger(__name__)
 
def make_uuid():
    '''Return a newly generated random (version 4) UUID as a unicode string.

    Used as the default for the ``id`` primary key of every table in this
    module.
    '''
    fresh_id = uuid.uuid4()
    return unicode(fresh_id)
 
# Metadata collection that the ga_* tables defined below are registered on;
# init_tables() creates the tables from it.
metadata = MetaData()
 
class GA_Url(object):
    '''Per-URL page view / visit figures for one period (a ``ga_url`` row).'''

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])
 
# Table behind GA_Url: one row per (period_name, url).
# - period_name is either a specific period or the aggregate 'All'.
# - pageviews and visits are declared as text, so the rest of this module
#   casts them (int(...) in Python, ::int in SQL) before doing arithmetic.
# - department_id is filled with a publisher name and package_id with the
#   dataset reference taken from the URL (see _get_package_and_publisher).
url_table = Table('ga_url', metadata,
                      Column('id', types.UnicodeText, primary_key=True,
                             default=make_uuid),
                      Column('period_name', types.UnicodeText),
                      Column('period_complete_day', types.Integer),
                      Column('pageviews', types.UnicodeText),
                      Column('visits', types.UnicodeText),
                      Column('url', types.UnicodeText),
                      Column('department_id', types.UnicodeText),
                      Column('package_id', types.UnicodeText),
                )
# Map the plain GA_Url class onto the table.
mapper(GA_Url, url_table)
 
 
class GA_Stat(object):
    '''A single site-wide statistic for one period (a ``ga_stat`` row).'''

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])
 
# Table behind GA_Stat: one row per (period_name, stat_name, key), which is
# the combination update_sitewide_stats() looks rows up by.
# NOTE(review): period_complete_day is text here but Integer in ga_url -
# confirm whether the difference is intentional.
stat_table = Table('ga_stat', metadata,
                  Column('id', types.UnicodeText, primary_key=True,
                         default=make_uuid),
                  Column('period_name', types.UnicodeText),
                  Column('period_complete_day', types.UnicodeText),
                  Column('stat_name', types.UnicodeText),
                  Column('key', types.UnicodeText),
                  Column('value', types.UnicodeText), )
# Map the plain GA_Stat class onto the table.
mapper(GA_Stat, stat_table)
 
 
class GA_Publisher(object):
    '''Aggregated figures for one publisher and period (a ``ga_publisher`` row).'''

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])
 
# Table behind GA_Publisher: one row per (period_name, publisher_name),
# written by update_publisher_stats().
# - views / visits are totals over the publisher and the publishers beneath
#   it; they are declared as text.
# - toplevel records whether the publisher was in get_top_level()'s result.
# - subpublishercount is the number of publishers beneath this one.
# - parent is the name of the first parent publisher, or '' when there is none.
pub_table = Table('ga_publisher', metadata,
                  Column('id', types.UnicodeText, primary_key=True,
                         default=make_uuid),
                  Column('period_name', types.UnicodeText),
                  Column('publisher_name', types.UnicodeText),
                  Column('views', types.UnicodeText),
                  Column('visits', types.UnicodeText),
                  Column('toplevel', types.Boolean, default=False),
                  Column('subpublishercount', types.Integer, default=0),
                  Column('parent', types.UnicodeText),
)
# Map the plain GA_Publisher class onto the table.
mapper(GA_Publisher, pub_table)
 
 
class GA_ReferralStat(object):
    '''Referral count for one URL, source and period (a ``ga_referrer`` row).'''

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])
 
# Table behind GA_ReferralStat: one row per (period_name, source, url), which
# is the combination update_social() looks rows up by; count is an integer
# that update_social() accumulates into.
referrer_table = Table('ga_referrer', metadata,
                      Column('id', types.UnicodeText, primary_key=True,
                             default=make_uuid),
                      Column('period_name', types.UnicodeText),
                      Column('source', types.UnicodeText),
                      Column('url', types.UnicodeText),
                      Column('count', types.Integer),
                )
# Map the plain GA_ReferralStat class onto the table.
mapper(GA_ReferralStat, referrer_table)
 
 
 
def init_tables():
    '''Create the tables registered on the module-level ``metadata`` using
    CKAN's database engine.'''
    engine = model.meta.engine
    metadata.create_all(engine)
 
 
# Reflected Table objects keyed by table name, filled lazily by get_table().
cached_tables = {}


def get_table(name):
    '''Return the reflected Table called *name*, reflecting it on first use.

    The database is only reflected when *name* is not yet in
    ``cached_tables``; later calls for the same name are served from the
    cache. Raises KeyError if reflection finds no table of that name.
    '''
    try:
        return cached_tables[name]
    except KeyError:
        pass

    reflected = MetaData()
    reflected.reflect(bind=model.meta.engine)
    cached_tables[name] = reflected.tables[name]
    return cached_tables[name]
 
 
def _normalize_url(url):
    '''Strip off the hostname etc. Do this before storing it.
 
    >>> normalize_url('http://data.gov.uk/dataset/weekly_fuel_prices')
    '/dataset/weekly_fuel_prices'
    '''
    return '/' + '/'.join(url.split('/')[3:])
 
 
def _get_package_and_publisher(url):
    # e.g. /dataset/fuel_prices
    # e.g. /dataset/fuel_prices/resource/e63380d4
    dataset_match = re.match('/dataset/([^/]+)(/.*)?', url)
    if dataset_match:
        dataset_ref = dataset_match.groups()[0]
        dataset = model.Package.get(dataset_ref)
        if dataset:
            publisher_groups = dataset.get_groups('publisher')
            if publisher_groups:
                return dataset_ref,publisher_groups[0].name
        return dataset_ref, None
    else:
        publisher_match = re.match('/publisher/([^/]+)(/.*)?', url)
        if publisher_match:
            return None, publisher_match.groups()[0]
    return None, None
 
def update_sitewide_stats(period_name, stat_name, data, period_complete_day):
    '''Store site-wide statistics for a period, one GA_Stat row per key.

    :param period_name: period the figures belong to
    :param stat_name: name of the statistic group the keys belong to
    :param data: dict mapping each key to its value
    :param period_complete_day: recorded on every row written

    A row matching (period_name, stat_name, key) is updated in place if it
    exists, otherwise a new one is created. The session is committed after
    every key.
    '''
    for key, value in data.iteritems():
        lookup = model.Session.query(GA_Stat)
        lookup = lookup.filter(GA_Stat.period_name==period_name)
        lookup = lookup.filter(GA_Stat.key==key)
        lookup = lookup.filter(GA_Stat.stat_name==stat_name)
        record = lookup.first()

        if not record:
            # create the row
            record = GA_Stat(id=make_uuid(),
                             period_name=period_name,
                             period_complete_day=period_complete_day,
                             key=key,
                             value=value,
                             stat_name=stat_name)
        else:
            record.period_name = period_name
            record.key = key
            record.value = value
            record.period_complete_day = period_complete_day

        model.Session.add(record)
        model.Session.commit()
 
 
def pre_update_url_stats(period_name):
    '''Clear out GA_Url rows ahead of re-importing a period.

    Deletes the rows for *period_name* and every row of the aggregate 'All'
    period, then flushes and commits the session and finally calls
    ``model.repo.commit_and_remove()``.
    '''
    period_rows = model.Session.query(GA_Url).filter(
        GA_Url.period_name==period_name)
    log.debug("Deleting %d '%s' records", period_rows.count(), period_name)
    period_rows.delete()

    all_rows = model.Session.query(GA_Url).filter(
        GA_Url.period_name == 'All')
    log.debug("Deleting %d 'All' records...", all_rows.count())
    all_rows.delete()

    model.Session.flush()
    model.Session.commit()
    model.repo.commit_and_remove()
    log.debug('...done')
 
def post_update_url_stats():

    """ Give every URL in ga_url that lacks an 'All' record one.

        The pageviews and visits of all rows belonging to such a URL are
        summed and stored as a new 'All' row, so afterwards every URL has
        an 'All' record whether or not it had an entry for the month being
        processed. A single commit is made once all rows have been added.
    """
    log.debug('Post-processing "All" records...')
    query = """select url, pageviews::int, visits::int
               from ga_url
               where url not in (select url from ga_url where period_name ='All')"""
    res = model.Session.connection().execute(query)

    # url -> (total pageviews, total visits)
    totals = {}
    for url, row_views, row_visits in res:
        seen_views, seen_visits = totals.get(url, (0, 0))
        totals[url] = (seen_views + row_views, seen_visits + row_visits)

    progress_total = len(totals)
    for progress_count, url in enumerate(totals, 1):
        if progress_count % 100 == 0:
            log.debug('.. %d/%d done so far', progress_count, progress_total)

        package, publisher = _get_package_and_publisher(url)
        total_views, total_visits = totals[url]

        model.Session.add(GA_Url(id=make_uuid(),
                                 period_name="All",
                                 period_complete_day=0,
                                 url=url,
                                 pageviews=total_views,
                                 visits=total_visits,
                                 department_id=publisher,
                                 package_id=package))
    model.Session.commit()
    log.debug('..done')
 
 
def update_url_stats(period_name, period_complete_day, url_data):
    '''
    Given a list of urls and number of hits for each during a given period,
    stores them in GA_Url under the period and recalculates the totals for
    the 'All' period.

    :param period_name: name of the period the figures belong to
    :param period_complete_day: stored on rows newly created for the period
    :param url_data: sized sequence of (url, views, visits) tuples; the
        counts may be ints or numeric strings

    If a row for (period_name, url) already exists the new counts are added
    to it numerically, otherwise a row is created. An 'All' total is only
    written for urls that resolve to a dataset. The session is committed
    after each url.
    '''
    def _as_int(value):
        # pageviews/visits are text columns, so values read back from the
        # database are strings (or None); normalise before doing arithmetic.
        return int(value or 0)

    progress_total = len(url_data)
    for progress_count, (url, views, visits) in enumerate(url_data, 1):
        if progress_count % 100 == 0:
            log.debug('.. %d/%d done so far', progress_count, progress_total)

        package, publisher = _get_package_and_publisher(url)

        item = model.Session.query(GA_Url).\
            filter(GA_Url.period_name==period_name).\
            filter(GA_Url.url==url).first()
        if item:
            # Add as numbers: with text values a plain '+' would concatenate
            # ('12' + '3' == '123') or raise TypeError on mixed types.
            item.pageviews = _as_int(item.pageviews) + _as_int(views)
            item.visits = _as_int(item.visits) + _as_int(visits)
            if not item.package_id:
                item.package_id = package
            if not item.department_id:
                item.department_id = publisher
            model.Session.add(item)
        else:
            values = {'id': make_uuid(),
                      'period_name': period_name,
                      'period_complete_day': period_complete_day,
                      'url': url,
                      'pageviews': views,
                      'visits': visits,
                      'department_id': publisher,
                      'package_id': package
                     }
            model.Session.add(GA_Url(**values))
        model.Session.commit()

        if package:
            # NOTE(review): existing 'All' rows for this url are folded into
            # the new total but not removed here, and a further 'All' row is
            # added each time - confirm this is intended (the 'All' rows are
            # cleared by pre_update_url_stats()).
            old = model.Session.query(GA_Url).\
                filter(GA_Url.period_name=='All').\
                filter(GA_Url.url==url).all()
            old_pageviews = sum([_as_int(o.pageviews) for o in old])
            old_visits = sum([_as_int(o.visits) for o in old])

            entries = model.Session.query(GA_Url).\
                filter(GA_Url.period_name!='All').\
                filter(GA_Url.url==url).all()
            values = {'id': make_uuid(),
                      'period_name': 'All',
                      'period_complete_day': 0,
                      'url': url,
                      'pageviews': sum([_as_int(e.pageviews) for e in entries]) + old_pageviews,
                      'visits': sum([_as_int(e.visits) for e in entries]) + old_visits,
                      'department_id': publisher,
                      'package_id': package
                     }

            model.Session.add(GA_Url(**values))
            model.Session.commit()
 
 
 
 
def update_social(period_name, data):
    '''Replace the referral statistics stored for a period.

    :param period_name: period the figures belong to
    :param data: dict mapping each url to a sequence of entries whose first
        element is the referral source and whose second is the count

    Existing GA_ReferralStat rows for the period are deleted first. Counts
    for a repeated (source, url) pair are added together. The session is
    committed after every entry.
    '''
    # Clean up first.
    stale = model.Session.query(GA_ReferralStat).filter(
        GA_ReferralStat.period_name==period_name)
    stale.delete()

    for url, referrals in data.iteritems():
        for entry in referrals:
            source, count = entry[0], entry[1]

            lookup = model.Session.query(GA_ReferralStat)
            lookup = lookup.filter(GA_ReferralStat.period_name==period_name)
            lookup = lookup.filter(GA_ReferralStat.source==source)
            lookup = lookup.filter(GA_ReferralStat.url==url)
            record = lookup.first()

            if not record:
                # create the row
                record = GA_ReferralStat(id=make_uuid(),
                                         period_name=period_name,
                                         source=source,
                                         url=url,
                                         count=count)
            else:
                record.count = record.count + count

            model.Session.add(record)
            model.Session.commit()
 
def update_publisher_stats(period_name):
    """
    Rebuilds the GA_Publisher rows for a period from the GA_Url data.

    For every active group of type 'publisher' the views and visits of the
    publisher and of the whole tree beneath it are totalled (see
    update_publisher) and stored, together with whether the publisher is
    top level, how many publishers sit beneath it and the name of its first
    parent ('' if it has none). The session is committed after each
    publisher.
    """
    toplevel = get_top_level()

    active_publishers = model.Session.query(model.Group)
    active_publishers = active_publishers.filter(model.Group.type=='publisher')
    active_publishers = active_publishers.filter(model.Group.state=='active')

    for publisher in active_publishers.all():
        views, visits, subpub = update_publisher(period_name, publisher, publisher.name)

        parents = publisher.get_groups('publisher')
        parent = parents[0].name if parents else ''

        existing = model.Session.query(GA_Publisher).\
            filter(GA_Publisher.period_name==period_name).\
            filter(GA_Publisher.publisher_name==publisher.name).first()
        is_toplevel = publisher in toplevel

        if not existing:
            # create the row
            existing = GA_Publisher(id=make_uuid(),
                                    period_name=period_name,
                                    publisher_name=publisher.name,
                                    views=views,
                                    visits=visits,
                                    toplevel=is_toplevel,
                                    subpublishercount=subpub,
                                    parent=parent)
        else:
            existing.views = views
            existing.visits = visits
            existing.publisher_name = publisher.name
            existing.toplevel = is_toplevel
            existing.subpublishercount = subpub
            existing.parent = parent

        model.Session.add(existing)
        model.Session.commit()
 
 
def update_publisher(period_name, pub, part=''):
    '''Total the GA_Url figures for a publisher and everything beneath it.

    :param period_name: period to total
    :param pub: publisher at the top of the tree to walk
    :param part: not used by this function
    :returns: ``(views, visits, subpublisher_count)`` where the count is the
        number of publishers in the tree excluding *pub* itself
    '''
    views = visits = 0
    tree_size = 0
    for publisher in go_down_tree(pub):
        tree_size += 1
        rows = model.Session.query(GA_Url).\
            filter(GA_Url.period_name==period_name).\
            filter(GA_Url.department_id==publisher.name).all()
        for row in rows:
            views += int(row.pageviews)
            visits += int(row.visits)

    # go_down_tree yields pub itself too, so take one off.
    return views, visits, tree_size - 1
 
 
def get_top_level():
    '''Returns the top level publishers.

    A publisher counts as top level when no active Member row of table_name
    'group' points at it, i.e. it is not an active member of another group.
    The result is a list of Group objects of type 'publisher' ordered by
    name.
    '''
    from sqlalchemy import and_

    # The join condition has to be built with and_(): Python's ``and``
    # cannot combine SQL clauses, it just yields one of its operands, so the
    # three conditions would not all reach the query.
    child_membership = and_(model.Member.table_id == model.Group.id,
                            model.Member.table_name == 'group',
                            model.Member.state == 'active')
    # "== None" is deliberate: SQLAlchemy turns it into IS NULL, which keeps
    # only the groups the outer join found no membership for.
    return model.Session.query(model.Group).\
           outerjoin(model.Member, child_membership).\
           filter(model.Member.id==None).\
           filter(model.Group.type=='publisher').\
           order_by(model.Group.name).all()
 
def get_children(publisher):
    '''Return the direct child groups of the given publisher object (one
    level only, not recursive).

    NOTE(review): children are requested with type 'organization' while the
    rest of this module filters on type 'publisher' - confirm this is
    intended.
    '''
    children = publisher.get_children_groups(type='organization')
    return children
 
def go_down_tree(publisher):
    '''Generator over a publisher hierarchy: yields the publisher supplied,
    then each of its descendants, with every publisher coming before its own
    children (depth first).'''
    # Stack of iterators, one per level currently being walked; the top of
    # the stack is the level we are in the middle of.
    pending = [iter((publisher,))]
    while pending:
        for node in pending[-1]:
            yield node
            # Descend into this node's children before its siblings.
            pending.append(iter(get_children(node)))
            break
        else:
            # Level exhausted: go back up to the parent's remaining siblings.
            pending.pop()
 
def delete(period_name):
    '''
    Deletes the rows for the specified period from every GA table (GA_Url,
    GA_Stat, GA_Publisher and GA_ReferralStat). Passing 'All' - spelt
    exactly like that, the comparison is case sensitive - deletes the rows
    for all periods instead.
    '''
    every_period = (period_name == 'All')
    for object_type in (GA_Url, GA_Stat, GA_Publisher, GA_ReferralStat):
        rows = model.Session.query(object_type)
        if not every_period:
            rows = rows.filter_by(period_name=period_name)
        rows.delete()
    model.repo.commit_and_remove()
 
def get_score_for_dataset(dataset_name):
    '''
    Returns a "current popularity" score for a dataset,
    based on how many views it has had recently.

    The score is the average views per day in the current calendar month
    plus half the average views per day in the previous calendar month,
    multiplied by 100 and truncated to an int. Periods with no GA_Url entry
    for the dataset contribute nothing.

    :param dataset_name: value matched against GA_Url.package_id
    '''
    import datetime
    now = datetime.datetime.now()
    # Step back one day from the first of this month to land in the previous
    # calendar month. Subtracting a fixed 30 days is wrong at month edges: on
    # the 31st it stays in the current month and on 1-2 March it can skip
    # February altogether.
    last_month = now.replace(day=1) - datetime.timedelta(days=1)
    period_names = ['%s-%02d' % (last_month.year, last_month.month),
                    '%s-%02d' % (now.year, now.month),
                    ]

    score = 0.0
    for period_name in period_names:
        score /= 2 # previous periods are discounted by 50%
        entry = model.Session.query(GA_Url)\
                .filter(GA_Url.period_name==period_name)\
                .filter(GA_Url.package_id==dataset_name).first()
        # score
        if entry:
            views = float(entry.pageviews)
            if entry.period_complete_day:
                views_per_day = views / entry.period_complete_day
            else:
                # No completed-day figure recorded for the period.
                views_per_day = views / 15 # guess
            score += views_per_day

    score = int(score * 100)
    log.debug('Popularity %s: %s', score, dataset_name)
    return score