remove top_tags stats for DB load
[ckanext-dga-stats.git] / ckanext / dga_stats / stats.py




Alex Sadleir




Alex Sadleir
























Alex Sadleir












Alex Sadleir












CKAN data.gov.au


Alex Sadleir





Alex Sadleir























Alex Sadleir
CKAN data.gov.au










Alex Sadleir




Alex Sadleir








Alex Sadleir





Alex Sadleir









Alex Sadleir




































































CKAN data.gov.au
Alex Sadleir


























Alex Sadleir
































































































































































Alex Sadleir

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
import datetime
 
from pylons import config
from sqlalchemy import Table, select, func, and_
from sqlalchemy.sql.expression import text
 
import ckan.plugins as p
import ckan.model as model
 
import re
 
cache_enabled = p.toolkit.asbool(config.get('ckanext.stats.cache_enabled', 'True'))
 
if cache_enabled:
    from pylons import cache
    our_cache = cache.get_cache('stats', type='dbm')
 
DATE_FORMAT = '%Y-%m-%d'
 
def table(name):
    return Table(name, model.meta.metadata, autoload=True)
 
def datetime2date(datetime_):
    return datetime.date(datetime_.year, datetime_.month, datetime_.day)
 
 
class Stats(object):
    @classmethod
    def top_rated_packages(cls, limit=10):
        # NB Not using sqlalchemy as sqla 0.4 doesn't work using both group_by
        # and apply_avg
        package = table('package')
        rating = table('rating')
        sql = select([package.c.id, func.avg(rating.c.rating), func.count(rating.c.rating)], from_obj=[package.join(rating)]).\
              where(package.c.private == 'f').\
              group_by(package.c.id).\
              order_by(func.avg(rating.c.rating).desc(), func.count(rating.c.rating).desc()).\
              limit(limit)
        res_ids = model.Session.execute(sql).fetchall()
        res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), avg, num) for pkg_id, avg, num in res_ids]
        return res_pkgs
 
    @classmethod
    def most_edited_packages(cls, limit=10):
        package_revision = table('package_revision')
        package = table('package')
        s = select([package_revision.c.id, func.count(package_revision.c.revision_id)], from_obj=[package_revision.join(package)]).\
            where(package.c.private == 'f').\
            group_by(package_revision.c.id).\
            order_by(func.count(package_revision.c.revision_id).desc()).\
            limit(limit)
        res_ids = model.Session.execute(s).fetchall()
        res_pkgs = [(model.Session.query(model.Package).get(unicode(pkg_id)), val) for pkg_id, val in res_ids]
        return res_pkgs
 
    @classmethod
    def largest_groups(cls, limit=10):
         member = table('member')
         s = select([member.c.group_id, func.count(member.c.table_id)]).\
            group_by(member.c.group_id).\
            where(member.c.group_id!=None).\
            where(member.c.table_name=='package').\
            where(member.c.capacity=='public').\
            order_by(func.count(member.c.table_id).desc())
            #limit(limit)
 
         res_ids = model.Session.execute(s).fetchall()
         res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res_ids]
         return res_groups
 
    @classmethod
    def by_org(cls, limit=10):
        connection = model.Session.connection()
        res = connection.execute("select package.owner_org, package.private, count(*) from package \
                inner join \"group\" on package.owner_org = \"group\".id \
                where package.state='active'\
                group by package.owner_org,\"group\".name, package.private \
                order by \"group\".name, package.private;").fetchall();
        res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), private, val) for group_id, private, val in res]
        return res_groups
 
    @classmethod
    def res_by_org(cls, limit=10):
        connection = model.Session.connection()
        reses = connection.execute("select owner_org,format,count(*) from \
                resource inner join resource_group on resource.resource_group_id = resource_group.id \
                inner join package on resource_group.package_id = package.id group by owner_org,format order by count desc;").fetchall();
        group_ids = []
        group_tab = {}
        group_spatial = {}
        group_other = {}
        for group_id,format,count in reses:
                if group_id not in group_ids:
                        group_ids.append(group_id) 
                        group_tab[group_id] = 0
                        group_spatial[group_id] = 0 
                        group_other[group_id] = 0
                if re.search('xls|csv|ms-excel|spreadsheetml.sheet|zip|netcdf',format, re.IGNORECASE):
                        group_tab[group_id] = group_tab[group_id] + count
                elif re.search('wms|wfs|wcs|shp|kml|kmz',format, re.IGNORECASE):
                        group_spatial[group_id] = group_spatial[group_id] + count
                else:
                        group_other[group_id] = group_other[group_id] + count
        return [(model.Session.query(model.Group).get(unicode(group_id)), group_tab[group_id],group_spatial[group_id],group_other[group_id], group_tab[group_id]+group_spatial[group_id]+group_other[group_id]) for group_id in group_ids]
 
    @classmethod
    def top_active_orgs(cls, limit=10):
        connection = model.Session.connection()
        res = connection.execute("select package.owner_org, count(*) from package \
                inner join \"group\" on package.owner_org = \"group\".id \
                inner join (select distinct object_id from activity where activity.timestamp > (now() - interval '60 day')) \
                latestactivities on latestactivities.object_id = package.id \
                where package.state='active' \
                and package.private = 'f' \
                group by package.owner_org \
                order by count(*) desc;").fetchall();
        res_groups = [(model.Session.query(model.Group).get(unicode(group_id)), val) for group_id, val in res]
        return res_groups
 
    @classmethod
    def top_package_owners(cls, limit=10):
        package_role = table('package_role')
        user_object_role = table('user_object_role')
        package = table('package')
        s = select([user_object_role.c.user_id, func.count(user_object_role.c.role)], from_obj=[user_object_role.join(package_role).join(package, package_role.c.package_id == package.c.id)]).\
            where(user_object_role.c.role==model.authz.Role.ADMIN).\
            where(package.c.private == 'f').\
            where(user_object_role.c.user_id!=None).\
            group_by(user_object_role.c.user_id).\
            order_by(func.count(user_object_role.c.role).desc()).\
            limit(limit)
        res_ids = model.Session.execute(s).fetchall()
        res_users = [(model.Session.query(model.User).get(unicode(user_id)), val) for user_id, val in res_ids]
        return res_users
 
    @classmethod
    def summary_stats(cls):
       connection = model.Session.connection()
 
       res = connection.execute("SELECT 'Total Organisations', count(*) from \"group\" where type = 'organization' and state = 'active' union \
                                select 'Total Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 'f' union \
                                select 'Total Archived Datasets', count(*) from package where (state='active' or state='draft' or state='draft-complete') and private = 't' union \
                                select 'Total Data Files/Resources', count(*) from resource where state='active' union \
                                select 'Total Machine Readable/Data API Resources', count(*) from resource where state='active' and webstore_url = 'active'\
                                ").fetchall();
       return res
 
 
    @classmethod
    def activity_counts(cls):
       connection = model.Session.connection()
       res = connection.execute("select to_char(timestamp, 'YYYY-MM') as month,activity_type, count(*) from activity group by month, activity_type order by month;").fetchall();
       return res
 
    @classmethod
    def user_access_list(cls):
       connection = model.Session.connection()
       res = connection.execute("select name,sysadmin,role from user_object_role right outer join \"user\" on user_object_role.user_id = \"user\".id where name not in ('logged_in','visitor') group by name,sysadmin,role order by sysadmin desc, role asc;").fetchall();
       return res
 
    @classmethod
    def recent_datasets(cls):
        activity = table('activity')
        package = table('package')
        s = select([func.max(activity.c.timestamp),package.c.id, activity.c.activity_type], from_obj=[activity.join(package,activity.c.object_id == package.c.id)]).where(package.c.private == 'f').\
            where(activity.c.timestamp > func.now() - text("interval '60 day'")).group_by(package.c.id,activity.c.activity_type).order_by(func.max(activity.c.timestamp))
        result = model.Session.execute(s).fetchall()
        return [(datetime2date(timestamp), model.Session.query(model.Package).get(unicode(package_id)), activity_type) for timestamp,package_id,activity_type in result]
 
 
 
class RevisionStats(object):
    @classmethod
    def package_addition_rate(cls, weeks_ago=0):
        week_commenced = cls.get_date_weeks_ago(weeks_ago)
        return cls.get_objects_in_a_week(week_commenced,
                                          type_='package_addition_rate')
 
    @classmethod
    def package_revision_rate(cls, weeks_ago=0):
        week_commenced = cls.get_date_weeks_ago(weeks_ago)
        return cls.get_objects_in_a_week(week_commenced,
                                          type_='package_revision_rate')
 
    @classmethod
    def get_date_weeks_ago(cls, weeks_ago):
        '''
        @param weeks_ago: specify how many weeks ago to give count for
                          (0 = this week so far)
        '''
        date_ = datetime.date.today()
        return date_ - datetime.timedelta(days=
                             datetime.date.weekday(date_) + 7 * weeks_ago)
 
    @classmethod
    def get_week_dates(cls, weeks_ago):
        '''
        @param weeks_ago: specify how many weeks ago to give count for
                          (0 = this week so far)
        '''
        package_revision = table('package_revision')
        revision = table('revision')
        today = datetime.date.today()
        date_from = datetime.datetime(today.year, today.month, today.day) -\
                    datetime.timedelta(days=datetime.date.weekday(today) + \
                                       7 * weeks_ago)
        date_to = date_from + datetime.timedelta(days=7)
        return (date_from, date_to)
 
    @classmethod
    def get_date_week_started(cls, date_):
        assert isinstance(date_, datetime.date)
        if isinstance(date_, datetime.datetime):
            date_ = datetime2date(date_)<