<?php

require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() {

$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = '
function(doc) {
    for (var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
    return values.length;
}';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');

$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";

$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
    if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc);
    }
};";
$obj->views->byDeptStateName->map = "function(doc) {
    if (doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc._id);
    }
};";
$obj->views->parentOrgs->map = "function(doc) {
    if (doc.parentOrg) {
        emit(doc._id, doc.parentOrg);
    }
};";
$obj->views->byName->map = 'function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        emit(doc.name, doc._id);
        if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
            emit(doc.shortName, doc._id);
        }
        for (name in doc.otherNames) {
            if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                emit(doc.otherNames[name], doc._id);
            }
        }
        for (name in doc.foiBodies) {
            if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
                emit(doc.foiBodies[name], doc._id);
            }
        }
        for (name in doc.positions) {
            if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
                emit(doc.positions[name], doc._id);
            }
        }
    }
};';

$obj->views->foiEmails->map = "function(doc) {
    emit(doc._id, doc.foiEmail);
};";

$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) {
    // Date.parse() returns a millisecond timestamp (or NaN), so compare it as a number
    var lastScrape = Date.parse(doc.metadata.lastScraped);
    var today = new Date();
    if (!lastScrape || lastScrape + 1000 != today.getTime()) {
        emit(doc._id, doc);
    }
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
    if (doc._conflicts) {
        emit(null, [doc._rev].concat(doc._conflicts));
    }
}";
$obj->views->getStatistics->map =
    "function(doc) {
    if (doc.statistics) {
        for (var statisticSet in doc.statistics) {
            for (var statisticPeriod in doc.statistics[statisticSet]) {
                emit([statisticSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
            }
        }
    }
}";
$obj->views->getStatistics->reduce = '_sum';
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}

function(doc) {
    count = 0;
    if (doc["status"] != "suspended") {
        for (var propName in doc) {
            if (typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                count++;
            }
        }
        portfolio = doc.parentOrg;
        if (doc.orgType == "FMA-DepartmentOfState") {
            portfolio = doc._id;
        }
        if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
            portfolio = doc.orgType;
        }
        emit(count + doc._id, {id: doc._id, name: doc.name, score: count, orgType: doc.orgType, portfolio: portfolio});
    }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}
function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for (var propName in doc) {
            if (typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}';
$obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = '
function(doc) {
    for (var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true);
?>
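# Load scraped dataset records from the disclosr-documents CouchDB database into a
# CKAN instance via ckanclient; package names are munged to fit CKAN's naming rules
# before registration.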
   
import ckanclient
import couchdb
from ckanclient import CkanApiError
import re


class LoaderError(Exception):
    pass

# https://github.com/okfn/ckanext-importlib
# Instantiate the CKAN client.
ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
                             api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
# (use your own api_key from http://thedatahub.org/user/me )


def munge(name):
    # convert spaces to underscores
    name = re.sub(' ', '_', name).lower()
    # convert symbols to dashes
    name = re.sub('[:]', '_-', name).lower()
    name = re.sub('[/]', '-', name).lower()
    # take out not-allowed characters
    name = re.sub('[^a-zA-Z0-9-_]', '', name).lower()
    # remove double underscores
    name = re.sub('__', '_', name).lower()
    return name


def name_munge(input_name):
    return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
    #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')


couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']

if __name__ == "__main__":
    for doc in docsdb.view('app/datasets'):
        print doc.id
        if doc.value['url'] != "http://data.gov.au/data/":
            # Collect the package metadata.
            pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100])
            package_entity = {
                'name': pkg_name,
                'title': doc.value['metadata']['DCTERMS.Title'],
                'url': doc.value['metadata']['DCTERMS.Source.URI'],
                'tags': doc.value['metadata']["Keywords / Tags"],  #todo must be alphanumeric characters or symbols
                'author': doc.value['metadata']["DCTERMS.Creator"],
                'maintainer': doc.value['metadata']["DCTERMS.Creator"],
                'licence_id': doc.value['metadata']['DCTERMS.License'],
                'notes': doc.value['metadata']['Description'],
            }
            try:
                ckan.package_register_post(package_entity)
            except CkanApiError, e:
                if ckan.last_status == 409:
                    print "already exists"
                else:
                    raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (ckan.last_status, doc.id, e.args))
            print package_entity
            ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo', resource_type='data', format='csv')
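# Scrape dataset pages linked from the data.gov.au catalogue listing and store each
# page's <meta> tags and definition-list metadata into the corresponding CouchDB
# document (doc['type'] = "dataset").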
 
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode

listurl = "http://data.gov.au/data/"
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)
for atag in soup.find_all(class_='result-title'):
    if atag.has_key('href'):
        url = scrape.fullurl(listurl, atag['href'])
        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
            url, "data", "AGIMO")
        hash = scrape.mkhash(scrape.canonurl(url))
        doc = scrape.docsdb.get(hash)
        if "metadata" not in doc.keys() or True:
            doc['type'] = "dataset"
            doc['metadata'] = {}
            soup = BeautifulSoup(html)
            for metatag in soup.find_all('meta'):
                if metatag.has_key('name'):
                    doc['metadata'][metatag['name']] = metatag['content']
            for list in soup.find_all('dl'):
                last_title = ""
                for child in list.children:
                    if str(type(child)) != "<class 'bs4.element.NavigableString'>":
                        if child.name == 'dt' and child.string != None:
                            last_title = child.string.strip()
                        if child.name == 'dd':
                            #print last_title
                            if last_title == "Description":
                                doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
                            elif last_title == "Download":
                                doc['metadata'][last_title] = []
                                for item in child.find_all("li"):
                                    link = item.find("a")
                                    format = item.find(property="dc:format")
                                    linkobj = {"href": link['href'].replace("/bye?", "").strip(),
                                        "format": format.string.strip(), "size": format.next_sibling.string.strip()}
                                    if link.string != None:
                                        linkobj["name"] = link.string.strip()
                                    doc['metadata'][last_title].append(linkobj)
                            else:
                                atags = child.find_all('a')
                                if len(atags) < 2:
                                    [s.extract() for s in child(class_='viewAll')]
                                    doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
                                else:
                                    doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
            print doc['metadata']
            scrape.docsdb.save(doc)
    #time.sleep(2)
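# Scraper stub for the Commonwealth gazettes listing: fetches the publications index
# and currently just prints each table cell; per-page fetching is still commented out.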
   
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode

listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "gazette", "AGD")
soup = BeautifulSoup(listhtml)
for row in soup.find_all('tr'):
    if row.has_key('valign'):
        for col in row.find_all('td'):
            print col.string
        #url = scrape.fullurl(listurl, atag['href'])
        #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
        #    url, "data", "AGIMO")
        #has