be permissive with datagov resource file size
be permissive with datagov resource file size


Former-commit-id: c0a71c3c09369965e8748c2a8f7062eb2abbe01a

<?php <?php
   
require_once '../include/common.inc.php'; require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() { //function createFOIDocumentsDesignDoc() {
   
$foidb = $server->get_db('disclosr-foidocuments'); $foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count"; $obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count"; $obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = ' $obj->views->fieldNames->map = '
function(doc) { function(doc) {
for(var propName in doc) { for(var propName in doc) {
emit(propName, doc._id); emit(propName, doc._id);
} }
   
}'; }';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
return values.length; return values.length;
}'; }';
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true); $foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() { //function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents'); $docdb = $server->get_db('disclosr-documents');
   
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}"; $obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum"; $obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}"; $obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum"; $obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; $obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; $obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; $obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
   
$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}"; $obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->datasetGroups->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n doc.metadata[\"data.gov.au Category\"] && doc.metadata[\"data.gov.au Category\"].forEach(function(tag) {\n emit(tag, doc.url); \n });\n}\n}";  
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; $obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true); $docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() { //function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) { $obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc); emit(doc.name, doc);
} }
};"; };";
$obj->views->byDeptStateName->map = "function(doc) { $obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') { if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id); emit(doc.name, doc._id);
} }
};"; };";
$obj->views->parentOrgs->map = "function(doc) { $obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) { if (doc.parentOrg) {
emit(doc._id, doc.parentOrg); emit(doc._id, doc.parentOrg);
} }
};"; };";
$obj->views->byName->map = 'function(doc) { $obj->views->byName->map = 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id); emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
emit(doc.shortName, doc._id); emit(doc.shortName, doc._id);
} }
for (name in doc.otherNames) { for (name in doc.otherNames) {
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) { if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id); emit(doc.otherNames[name], doc._id);
} }
} }
for (name in doc.foiBodies) { for (name in doc.foiBodies) {
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) { if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
emit(doc.foiBodies[name], doc._id); emit(doc.foiBodies[name], doc._id);
} }
} }
for (name in doc.positions) { for (name in doc.positions) {
if (doc.positions[name] != "" && doc.positions[name] != doc.name) { if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
emit(doc.positions[name], doc._id); emit(doc.positions[name], doc._id);
} }
} }
} }
};'; };';
   
$obj->views->foiEmails->map = "function(doc) { $obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail); emit(doc._id, doc.foiEmail);
};"; };";
   
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) { $obj->views->getScrapeRequired->map = "function(doc) {
   
var lastScrape = Date.parse(doc.metadata.lastScraped); var lastScrape = Date.parse(doc.metadata.lastScraped);
   
var today = new Date(); var today = new Date();
   
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) { if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc); emit(doc._id, doc);
} }
   
};"; };";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) { $obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) { if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts)); emit(null, [doc._rev].concat(doc._conflicts));
} }
}"; }";
$obj->views->getStatistics->map = $obj->views->getStatistics->map =
"function(doc) { "function(doc) {
if (doc.statistics) { if (doc.statistics) {
for (var statisticSet in doc.statistics) { for (var statisticSet in doc.statistics) {
for (var statisticPeriod in doc.statistics[statisticSet]) { for (var statisticPeriod in doc.statistics[statisticSet]) {
emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']); emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
} }
} }
} }
}"; }";
$obj->views->getStatistics->reduce = '_sum'; $obj->views->getStatistics->reduce = '_sum';
// http://stackoverflow.com/questions/646628/javascript-startswith // http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){ $obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
   
function(doc) { function(doc) {
count = 0; count = 0;
if (doc["status"] != "suspended") { if (doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") { if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
count++; count++;
} }
} }
portfolio = doc.parentOrg; portfolio = doc.parentOrg;
if (doc.orgType == "FMA-DepartmentOfState") { if (doc.orgType == "FMA-DepartmentOfState") {
portfolio = doc._id; portfolio = doc._id;
} }
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") { if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
portfolio = doc.orgType; portfolio = doc.orgType;
} }
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
} }
}'; }';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
if(!String.prototype.endsWith){ if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) { String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;     return this.indexOf(suffix, this.length - suffix.length) !== -1;
}; };
} }
function(doc) { function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1); emit(propName, 1);
} }
} }
emit("total", 1); emit("total", 1);
} }
}'; }';
$obj->views->scoreHas->reduce = '_sum'; $obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = ' $obj->views->fieldNames->map = '
function(doc) { function(doc) {
for(var propName in doc) { for(var propName in doc) {
emit(propName, doc._id); emit(propName, doc._id);
} }
}'; }';
$obj->views->fieldNames->reduce = '_count'; $obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true); $db->save($obj, true);
?> ?>
   
file:a/disclosr.iml (deleted)
<?xml version="1.0" encoding="UTF-8"?>  
<module type="WEB_MODULE" version="4">  
<component name="FacetManager">  
<facet type="Python" name="Python">  
<configuration sdkName="" />  
</facet>  
</component>  
<component name="NewModuleRootManager" inherit-compiler-output="true">  
<exclude-output />  
<content url="file://$MODULE_DIR$" />  
<orderEntry type="inheritedJdk" />  
<orderEntry type="sourceFolder" forTests="false" />  
</component>  
</module>  
 
 
import ckanclient  
import couchdb  
from ckanclient import CkanApiError  
import re  
 
 
class LoaderError(Exception):  
pass  
 
# Instantiate the CKAN client.  
#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')  
api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'  
ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',  
api_key=api_key)  
couch = couchdb.Server('http://127.0.0.1:5984/')  
#couch = couchdb.Server('http://192.168.1.113:5984/')  
 
# https://github.com/okfn/ckanext-importlib  
def munge(name):  
# convert spaces to underscores  
name = re.sub(' ', '_', name).lower()  
# convert symbols to dashes  
name = re.sub('[:]', '_-', name).lower()  
name = re.sub('[/]', '-', name).lower()  
# take out not-allowed characters  
name = re.sub('[^a-zA-Z0-9-_]', '', name).lower()  
# remove double underscores  
name = re.sub('__', '_', name).lower()  
return name  
 
 
def name_munge(input_name):  
return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))  
 
 
docsdb = couch['disclosr-documents']  
 
if __name__ == "__main__":  
groups = {}  
for doc in docsdb.view('app/datasetGroups'):  
group_name = doc.key  
pkg_name = filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyz-_',  
doc.value.replace("http://data.gov.au/dataset/", '').replace('/', '')[:100]);  
if group_name in groups.keys():  
groups[group_name] = list(set(groups[group_name] + [pkg_name]))  
else:  
groups[group_name] = [pkg_name]  
 
# add dataset to group(s)  
for group_name in groups.keys():  
group_url = name_munge(group_name[:100])  
print group_name  
print groups[group_name]  
try:  
# Update the group details  
group_entity = ckan.group_entity_get(group_url)  
print "group "+group_name+" exists"  
if 'packages' in group_entity.keys():  
group_entity['packages'] = list(set(group_entity['packages'] + groups[group_name]))  
else:  
group_entity['packages'] = groups[group_name]  
ckan.group_entity_put(group_entity)  
except CkanApiError, e:  
if ckan.last_status == 404:  
print "group "+group_name+" does not exist, creating"  
group_entity = {  
'name': group_url,  
'title': group_name,  
'description': group_name,  
'packages': groups[group_name]  
}  
#print group_entity  
ckan.group_register_post(group_entity)  
elif ckan.last_status == 409:  
print "group already exists"  
else:  
raise LoaderError('Unexpected status %s adding to group under \'%s\': %r' % (  
ckan.last_status, pkg_name, e.args))  
 
import ckanclient import ckanclient
import couchdb import couchdb
from ckanclient import CkanApiError from ckanclient import CkanApiError
import re import re
import html2text # aaronsw :( import html2text # aaronsw :(
import ckanapi # https://github.com/open-data/ckanapi import ckanapi # https://github.com/open-data/ckanapi
   
   
class LoaderError(Exception): class LoaderError(Exception):
pass pass
   
# Instantiate the CKAN client. # Instantiate the CKAN client.
#ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api', api_key='b47b24cd-591d-40c1-8677-d73101d56d1b') api_key = 'ff34526e-f794-4068-8235-fcbba38cd8bc'
api_key = 'b3ab75e4-afbb-465b-a09d-8171c8c69a7a'  
ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api', ckan = ckanclient.CkanClient(base_location='http://data.disclosurelo.gs/api',
api_key=api_key) api_key=api_key)
ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key) ckandirect = ckanapi.RemoteCKAN('http://data.disclosurelo.gs', api_key=api_key)
couch = couchdb.Server('http://127.0.0.1:5984/') couch = couchdb.Server('http://127.0.0.1:5984/')
#couch = couchdb.Server('http://192.168.1.113:5984/') #couch = couchdb.Server('http://192.168.1.113:5984/')
   
# http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/ # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
SYMBOLS = { SYMBOLS = {
'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'), 'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
'zetta', 'iotta'), 'zetta', 'iotta'),
'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), 'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
'zebi', 'yobi'), 'zebi', 'yobi'),
} }
   
   
def human2bytes(s): def human2bytes(s):
""" """
Attempts to guess the string format based on default symbols Attempts to guess the string format based on default symbols
set and return the corresponding bytes as an integer. set and return the corresponding bytes as an integer.
When unable to recognize the format ValueError is raised. When unable to recognize the format ValueError is raised.
   
>>> human2bytes('0 B') >>> human2bytes('0 B')
0 0
>>> human2bytes('1 K') >>> human2bytes('1 K')
1024 1024
>>> human2bytes('1 M') >>> human2bytes('1 M')
1048576 1048576
>>> human2bytes('1 Gi') >>> human2bytes('1 Gi')
1073741824 1073741824
>>> human2bytes('1 tera') >>> human2bytes('1 tera')
1099511627776 1099511627776
   
>>> human2bytes('0.5kilo') >>> human2bytes('0.5kilo')
512 512
>>> human2bytes('0.1 byte') >>> human2bytes(�