<?php

require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() {

$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = '
function(doc) {
    for (var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
    return values.length;
}';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');

$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";

$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
    if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc);
    }
};";
$obj->views->byDeptStateName->map = "function(doc) {
    if (doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc._id);
    }
};";
$obj->views->parentOrgs->map = "function(doc) {
    if (doc.parentOrg) {
        emit(doc._id, doc.parentOrg);
    }
};";
$obj->views->byName->map = 'function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        emit(doc.name, doc._id);
        if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
            emit(doc.shortName, doc._id);
        }
        for (name in doc.otherNames) {
            if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                emit(doc.otherNames[name], doc._id);
            }
        }
        for (name in doc.foiBodies) {
            if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
                emit(doc.foiBodies[name], doc._id);
            }
        }
        for (name in doc.positions) {
            if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
                emit(doc.positions[name], doc._id);
            }
        }
    }
};';

$obj->views->foiEmails->map = "function(doc) {
    emit(doc._id, doc.foiEmail);
};";

$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) {
    // Date.parse() returns a millisecond timestamp (or NaN), so compare it as a number
    var lastScrape = Date.parse(doc.metadata.lastScraped);
    var today = new Date();
    if (!lastScrape || lastScrape + 1000 != today.getTime()) {
        emit(doc._id, doc);
    }
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
    if (doc._conflicts) {
        emit(null, [doc._rev].concat(doc._conflicts));
    }
}";
$obj->views->getStatistics->map =
    "function(doc) {
    if (doc.statistics) {
        for (var statisticSet in doc.statistics) {
            for (var statisticPeriod in doc.statistics[statisticSet]) {
                emit([statisticSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
            }
        }
    }
}";
$obj->views->getStatistics->reduce = '_sum';
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}

function(doc) {
    count = 0;
    if (doc["status"] != "suspended") {
        for (var propName in doc) {
            if (typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                count++;
            }
        }
        portfolio = doc.parentOrg;
        if (doc.orgType == "FMA-DepartmentOfState") {
            portfolio = doc._id;
        }
        if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
            portfolio = doc.orgType;
        }
        emit(count + doc._id, {id: doc._id, name: doc.name, score: count, orgType: doc.orgType, portfolio: portfolio});
    }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}
function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for (var propName in doc) {
            if (typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}';
$obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = '
function(doc) {
    for (var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true);
?>
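# Load scraped dataset records from the disclosr-documents CouchDB database into a
# CKAN instance via ckanclient; package names are munged to fit CKAN's naming rules
# before registration.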
   
import ckanclient
import couchdb
from ckanclient import CkanApiError
import re


class LoaderError(Exception):
    pass

# https://github.com/okfn/ckanext-importlib
# Instantiate the CKAN client.
ckan = ckanclient.CkanClient(base_location='http://localhost:5000/api',
                             api_key='b47b24cd-591d-40c1-8677-d73101d56d1b')
# (use your own api_key from http://thedatahub.org/user/me )


def munge(name):
    # convert spaces to underscores
    name = re.sub(' ', '_', name).lower()
    # convert symbols to dashes
    name = re.sub('[:]', '_-', name).lower()
    name = re.sub('[/]', '-', name).lower()
    # take out not-allowed characters
    name = re.sub('[^a-zA-Z0-9-_]', '', name).lower()
    # remove double underscores
    name = re.sub('__', '_', name).lower()
    return name


def name_munge(input_name):
    return munge(input_name.replace(' ', '').replace('.', '_').replace('&', 'and'))
    #return input_name.replace(' ', '').replace('.', '_').replace('&', 'and')


couch = couchdb.Server('http://127.0.0.1:5984/')
docsdb = couch['disclosr-documents']

if __name__ == "__main__":
    for doc in docsdb.view('app/datasets'):
        print doc.id
        if doc.value['url'] != "http://data.gov.au/data/":
            # Collect the package metadata.
            pkg_name = name_munge(doc.value['metadata']['DCTERMS.Title'][:100])
            package_entity = {
                'name': pkg_name,
                'title': doc.value['metadata']['DCTERMS.Title'],
                'url': doc.value['metadata']['DCTERMS.Source.URI'],
                'tags': doc.value['metadata']["Keywords / Tags"],  #todo must be alphanumeric characters or symbols
                'author': doc.value['metadata']["DCTERMS.Creator"],
                'maintainer': doc.value['metadata']["DCTERMS.Creator"],
                'licence_id': doc.value['metadata']['DCTERMS.License'],
                'notes': doc.value['metadata']['Description'],
            }
            try:
                ckan.package_register_post(package_entity)
            except CkanApiError, e:
                if ckan.last_status == 409:
                    print "already exists"
                else:
                    raise LoaderError('Unexpected status %s checking for package under \'%s\': %r' % (ckan.last_status, doc.id, e.args))
            print package_entity
            ckan.add_package_resource(pkg_name, 'http://example.org/', name='Foo', resource_type='data', format='csv')
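# Scrape dataset pages linked from the data.gov.au catalogue listing and store each
# page's <meta> tags and definition-list metadata into the corresponding CouchDB
# document (doc['type'] = "dataset").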
 
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode

listurl = "http://data.gov.au/data/"
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)
for atag in soup.find_all(class_='result-title'):
    if atag.has_key('href'):
        url = scrape.fullurl(listurl, atag['href'])
        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
            url, "data", "AGIMO")
        hash = scrape.mkhash(scrape.canonurl(url))
        doc = scrape.docsdb.get(hash)
        if "metadata" not in doc.keys() or True:
            doc['type'] = "dataset"
            doc['metadata'] = {}
            soup = BeautifulSoup(html)
            for metatag in soup.find_all('meta'):
                if metatag.has_key('name'):
                    doc['metadata'][metatag['name']] = metatag['content']
            for list in soup.find_all('dl'):
                last_title = ""
                for child in list.children:
                    if str(type(child)) != "<class 'bs4.element.NavigableString'>":
                        if child.name == 'dt' and child.string != None:
                            last_title = child.string.strip()
                        if child.name == 'dd':
                            #print last_title
                            if last_title == "Description":
                                doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
                            elif last_title == "Download":
                                doc['metadata'][last_title] = []
                                for item in child.find_all("li"):
                                    link = item.find("a")
                                    format = item.find(property="dc:format")
                                    linkobj = {"href": link['href'].replace("/bye?", "").strip(),
                                        "format": format.string.strip(), "size": format.next_sibling.string.strip()}
                                    if link.string != None:
                                        linkobj["name"] = link.string.strip()
                                    doc['metadata'][last_title].append(linkobj)
                            else:
                                atags = child.find_all('a')
                                if len(atags) < 2:
                                    [s.extract() for s in child(class_='viewAll')]
                                    doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
                                else:
                                    doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
            print doc['metadata']
            scrape.docsdb.save(doc)
    #time.sleep(2)
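# Scraper stub for the Commonwealth gazettes listing: fetches the publications index
# and currently just prints each table cell; per-page fetching is still commented out.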
   
import sys, os
import time
import scrape
from bs4 import BeautifulSoup

from unidecode import unidecode

listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3960"
(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "gazette", "AGD")
soup = BeautifulSoup(listhtml)
for row in soup.find_all('tr'):
    if row.has_key('valign'):
        for col in row.find_all('td'):
            print col.string
        #url = scrape.fullurl(listurl, atag['href'])
        #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
        #    url, "data", "AGIMO")
        #has