import
import


Former-commit-id: 997dc9ece51a37dc25779ca4125d0960cdc195c9

<?php <?php
   
require_once '../include/common.inc.php'; require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() {
// Builds the "_design/app" design document for the 'disclosr-foidocuments'
// database and saves it.
// NOTE(review): $server is not defined in this file; presumably it is provided
// by ../include/common.inc.php -- confirm.
$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
// Create every nested level explicitly. Assigning through a property that does
// not exist yet ($obj->views->all->map = ...) relies on PHP auto-creating the
// intermediate objects, which raises a warning on PHP 5/7 and throws an Error
// on PHP 8.
$obj->views = new stdClass();
$obj->views->all = (object) array(
    'map' => "function(doc) { emit(doc._id, doc); };",
);
$obj->views->byDate = (object) array(
    'map' => "function(doc) { emit(doc.date, doc); };",
);
// NOTE(review): emits the same key as byDate (doc.date); only the reduce differs.
$obj->views->byDateMonthYear = (object) array(
    'map' => "function(doc) { emit(doc.date, doc); };",
    'reduce' => "_count",
);
$obj->views->byAgencyID = (object) array(
    'map' => "function(doc) { emit(doc.agencyID, doc); };",
    'reduce' => "_count",
);
// The reduce used to be a hand-written function returning values.length, which
// is wrong on rereduce (it counts partial results instead of summing them).
// The built-in _count handles rereduce correctly.
$obj->views->fieldNames = (object) array(
    'map' => '
function(doc) {
for(var propName in doc) {
emit(propName, doc._id);
}

}',
    'reduce' => '_count',
);
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() {
// Builds the "_design/app" design document for the 'disclosr-documents'
// database and saves it.
// NOTE(review): $server is not defined in this file; presumably it is provided
// by ../include/common.inc.php -- confirm.
$docdb = $server->get_db('disclosr-documents');

$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
// Create every nested level explicitly. Assigning through a property that does
// not exist yet ($obj->views->web_server->map = ...) relies on PHP auto-creating
// the intermediate objects, which raises a warning on PHP 5/7 and throws an
// Error on PHP 8.
$obj->views = new stdClass();
// Counting views: emit 1 per document and total with the built-in _sum.
$obj->views->web_server = (object) array(
    'map' => "function(doc) {\n emit(doc.web_server, 1);\n}",
    'reduce' => "_sum",
);
$obj->views->byAgency = (object) array(
    'map' => "function(doc) {\n emit(doc.agencyID, 1);\n}",
    'reduce' => "_sum",
);
// Lookup views: emit the whole document under the given key.
$obj->views->byURL = (object) array(
    'map' => "function(doc) {\n emit(doc.url, doc);\n}",
);
$obj->views->agency = (object) array(
    'map' => "function(doc) {\n emit(doc.agencyID, doc);\n}",
);
$obj->views->byWebServer = (object) array(
    'map' => "function(doc) {\n emit(doc.web_server, doc);\n}",
);

// Documents whose fieldName is "data".
$obj->views->datasets = (object) array(
    'map' => "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}",
);
// HTML documents that have no "validation" field yet.
$obj->views->getValidationRequired = (object) array(
    'map' => "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}",
);
$docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() {
// Builds the "_design/app" design document for the 'disclosr-agencies'
// database and saves it.
// NOTE(review): $server is not defined in this file; presumably it is provided
// by ../include/common.inc.php -- confirm.
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
// Create every nested level explicitly. Assigning through a property that does
// not exist yet ($obj->views->all->map = ...) relies on PHP auto-creating the
// intermediate objects, which raises a warning on PHP 5/7 and throws an Error
// on PHP 8.
$obj->views = new stdClass();
$obj->views->all = (object) array(
    'map' => "function(doc) { emit(doc._id, doc); };",
);
$obj->views->byABN = (object) array(
    'map' => "function(doc) { emit(doc.abn, doc); };",
);
$obj->views->byCanonicalName = (object) array(
    'map' => "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc);
}
};",
);
$obj->views->byDeptStateName = (object) array(
    'map' => "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id);
}
};",
);
$obj->views->parentOrgs = (object) array(
    'map' => "function(doc) {
if (doc.parentOrg) {
emit(doc._id, doc.parentOrg);
}
};",
);
// Every name a non-suspended agency is known by (name, shortName, otherNames,
// foiBodies, positions), each mapped to the agency's _id.
$obj->views->byName = (object) array(
    'map' => 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
emit(doc.shortName, doc._id);
}
for (name in doc.otherNames) {
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id);
}
}
for (name in doc.foiBodies) {
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
emit(doc.foiBodies[name], doc._id);
}
}
for (name in doc.positions) {
if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
emit(doc.positions[name], doc._id);
}
}
}
};',
);

$obj->views->foiEmails = (object) array(
    'map' => "function(doc) {
emit(doc._id, doc.foiEmail);
};",
);

$obj->views->byLastModified = (object) array(
    'map' => "function(doc) { emit(doc.metadata.lastModified, doc); }",
);
$obj->views->getActive = (object) array(
    'map' => 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };',
);
$obj->views->getSuspended = (object) array(
    'map' => 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };',
);
// NOTE(review): Date.parse() returns a number, so lastScrape.getTime() fails
// whenever lastScraped parses successfully, and the comparison against
// "now + 1000ms" is almost never equal. The map also depends on the current
// time. The view source is left unchanged because the intended rule is not
// clear from this file -- confirm and rewrite.
$obj->views->getScrapeRequired = (object) array(
    'map' => "function(doc) {

var lastScrape = Date.parse(doc.metadata.lastScraped);

var today = new Date();

if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc);
}

};",
);
$obj->views->showNamesABNs = (object) array(
    'map' => "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };",
);
$obj->views->getConflicts = (object) array(
    'map' => "function(doc) {
if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts));
}
}",
);
// Sums the 'value' of each statistic, keyed by [statisticSet, statisticPeriod].
$obj->views->getStatistics = (object) array(
    'map' => "function(doc) {
if (doc.statistics) {
for (var statisticSet in doc.statistics) {
for (var statisticPeriod in doc.statistics[statisticSet]) {
emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
}
}
}
}",
    'reduce' => '_sum',
);
// http://stackoverflow.com/questions/646628/javascript-startswith
// Score = number of non-empty properties on a non-suspended agency document.
$obj->views->score = (object) array(
    'map' => 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}

function(doc) {
count = 0;
if (doc["status"] != "suspended") {
for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
count++;
}
}
portfolio = doc.parentOrg;
if (doc.orgType == "FMA-DepartmentOfState") {
portfolio = doc._id;
}
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
portfolio = doc.orgType;
}
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
}
}',
);
// Counts, across non-suspended agencies, how many documents carry each
// "has*" / "*URL" property, plus a "total" row.
$obj->views->scoreHas = (object) array(
    'map' => 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}
if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;
};
}
function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1);
}
}
emit("total", 1);
}
}',
    'reduce' => '_sum',
);
$obj->views->fieldNames = (object) array(
    'map' => '
function(doc) {
for(var propName in doc) {
emit(propName, doc._id);
}
}',
    'reduce' => '_count',
);
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true);
?> ?>
   
  import ckanclient
  import couchdb
  from ckanclient import CkanApiError
  import re
 
  class LoaderError(Exception):
  pass
 
  # Instantiate the CKAN client.
  ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
  api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
  # (use your own api_key from http://thedatahub.org/user/me )
  # http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/
  SYMBOLS = {
  'customary': ('B', 'KB', 'MB', 'GB', 'T', 'P', 'E', 'Z', 'Y'),
  'customary_ext': ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
  'zetta', 'iotta'),
  'iec': ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
  'iec_ext': ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
  'zebi', 'yobi'),
  }
 
  def human2bytes(s):
  """
  Attempts to guess the string format based on default symbols
  set and return the corresponding bytes as an integer.
  When unable to recognize the format ValueError is raised.
 
  >>> human2bytes('0 B')
  0
  >>> human2bytes('1 K')
  1024
  >>> human2bytes('1 M')
  1048576
  >>> human2bytes('1 Gi')
  1073741824
  >>> human2bytes('1 tera')
  1099511627776
 
  >>> human2bytes('0.5kilo')
  512
  >>> human2bytes('0.1 byte')
  0
  >>> human2bytes('1 k') # k is an alias for K
  1024
  >>> human2bytes('12 foo')
  Traceback (most recent call last):
  ...
  ValueError: can't interpret '12 foo'
  """
  init = s
  num = ""
  while s and s[0:1].isdigit() or s[0:1] == '.':
  num += s[0]
  s = s[1:]
  num = float(num)
  letter = s.strip()
  for name, sset in SYMBOLS.items():
  if letter in sset:
  break
  else:
  if letter == 'k':
  # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs
  sset = SYMBOLS['customary']
  letter = letter.upper()
  else:
  raise ValueError("can't interpret %r" % init)
  prefix = {sset[0]: 1}
  for i, s in enumerate(sset[1:]):
  prefix[s] = 1 << (i + 1) * 10
  return int(num * prefix[letter])
 
  # https://github.com/okfn/ckanext-importlib
  def munge(name):
  # convert spaces to underscores
  name = re.sub(' ', '_', name).lower()
  # convert symbols to dashes
  name = re.sub('[:]', '_-', name).lower()
  name = re.sub('[/]', '-', name).lower()
  # take out not-allowed characters
  name = re.sub('[^a-zA-Z0-9-_]', '', name).lower()
  # remove double underscores
  name = re.sub('__', '_', name).lower()
  return name
 
 
  def name_munge(input_name):
  return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
  #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')
 
  couch = couchdb.Server('http://127.0.0.1:5984/')
  docsdb = couch['disclosr-documents']
 
  if __name__ == "__main__":
  for doc in docsdb.view('app/datasets'):
  print doc.id
  if doc.value['url'] != "http://data.gov.au/data/":
  # Collect the package metadata.
  pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100])
  tags = doc.value['metadata']["Keywords / Tags"]
  if not hasattr(tags, '__iter__'):
  tags = [tags]
  [re.sub('[^a-zA-Z0-9-_]', '', tag).lower() for tag in tags]
  package_entity = {
  'name': pkg_name,
  'title': doc.value['metadata']['DCTERMS.Title'],
  'url': doc.value['metadata']['DCTERMS.Source.URI'],
  'tags': tags,
  'author': doc.value['metadata']["DCTERMS.Creator"],
  'maintainer': doc.value['metadata']["DCTERMS.Creator"],
  'licence_id': doc.value['metadata']['DCTERMS.License'], #todo licence id mapping
  'notes': doc.value['metadata']['Description'],
  }
  try:
  #print doc.id
  ckan.package_register_post(package_entity)
  except CkanApiError, e:
  if ckan.last_status == 409:
  print "already exists"
  else:
  raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
  ckan.last_status, pkg_name, e.args))
 
  print package_entity
  #todo add to organisation (author/creator/maintainer)
  #if 'data.gov.au Category' in doc.value['metadata'].keys(): #todo add to group
  if 'Download' in doc.value['metadata'].keys():
  try:
  pkg = ckan.package_entity_get(pkg_name)
  resources = pkg.get('resources', [])
  if len(resources) < len(doc.value['metadata']['Download']):
  for resource in doc.value['metadata']['Download']:
  #print resource
  # http://docs.ckan.org/en/ckan-1.7/domain-model-resource.html
  # (KML/KMZ) / (Shapefile) /(Other)
  format = "plain"
  if resource['format'] == '(XML)':
  format = 'xml'
  if resource['format'] == '(CSV/XLS)':
  format = 'csv'
  name = resource['href']
  if 'name' in resource.keys():
  name = resource['name']
  ckan.add_package_resource(pkg_name, resource['href'], name=name, resource_type='data',
  format=format, size=human2bytes(resource['size'].replace(',', '')))
  else:
  print "resources already exist"
  except CkanApiError, e:
  if ckan.last_status == 404:
  print "parent dataset does not exist"
  else:
  raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (
  ckan.last_status, pkg_name, e.args))
 
import sys, os import sys, os
import time import time
import scrape import scrape
from bs4 import BeautifulSoup