Add validation script
Former-commit-id: 8f0da9f23f1e65c57b3e158ef358630a33f10141
--- a/admin/importGov2RegisterRSSFacebookTwitter.php
+++ b/admin/importGov2RegisterRSSFacebookTwitter.php
@@ -1,7 +1,7 @@
<?php
require_once '../include/common.inc.php';
-require($basePath.'lib/phpquery/phpQuery/phpQuery.php');
+require($basePath . 'lib/phpquery/phpQuery/phpQuery.php');
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
@@ -10,41 +10,40 @@
foreach ($rows as $row) {
$nametoid[trim($row->key)] = $row->value;
}
+// Scrape the HTML table published at $url and file each agency's account link under $accounts[<agency id>][$accountType].
function extractHTMLAccounts($url, $accountType) {
global $accounts, $nametoid;
$request = Requests::get($url);
$doc = phpQuery::newDocumentHTML($request->body);
phpQuery::selectDocument($doc);
foreach (pq('tr')->elements as $tr) {
- //echo $tr->nodeValue.PHP_EOL;
- $agency = "";
- $url = "";
- foreach ($tr->childNodes as $td) {
- $class = $td->getAttribute("class");
- //echo "cccc $class ".$td->nodeValue.PHP_EOL;
- if ($class == "s11" || $class == "s10" || $class == "s7") {
- $agency = $td->nodeValue;
- } else if ($class == "s6" || $class == "s9"){
- $url = $td->nodeValue;
- foreach($td->childNodes as $a) {
- $href = $a->getAttribute("href");
- if ($href != "") {
- $url = $href;
- }
- }
- }
- }
- if ($agency != "" && $url != "") {
- if (!in_array(trim($agency), array_keys($nametoid))) {
- echo trim($agency)." missing" . PHP_EOL;
- } else {
- // echo $agency." = ".$url.PHP_EOL;
- $accounts[$nametoid[trim($agency)]][$accountType][] = $url;
+ // A row is only recorded when it yields both an agency-name cell and an account cell.
+ $agency = "";
+ $url = "";
+ foreach ($tr->childNodes as $td) {
+ $class = ($td instanceof DOMElement) ? $td->getAttribute("class") : ""; // text/comment nodes have no attributes
+ // Cells classed s7/s10/s11 carry the agency name; s6/s9 carry the account (presumably classes generated by the sheet export).
+ if ($class == "s11" || $class == "s10" || $class == "s7") {
+ $agency = $td->nodeValue;
+ } else if ($class == "s6" || $class == "s9") {
+ $url = $td->nodeValue;
+ foreach ($td->childNodes as $a) {
+ $href = ($a instanceof DOMElement) ? $a->getAttribute("href") : ""; // a link target, when present, overrides the cell text
+ if ($href != "") {
+ $url = $href;
}
-
- }
+ }
+ }
+ }
+ if ($agency != "" && $url != "") {
+ if (!in_array(trim($agency), array_keys($nametoid))) {
+ echo trim($agency) . " missing" . PHP_EOL;
+ } else {
+ // Known agency: append the account under its document id, grouped by account type.
+ $accounts[$nametoid[trim($agency)]][$accountType][] = $url;
+ }
+ }
}
-
}
function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) {
@@ -53,7 +52,7 @@
$Data = str_getcsv($request->body, "\n"); //parse the rows
$headers = Array();
foreach ($Data as $num => $line) {
- $Row = str_getcsv($line, ",",'"');
+ $Row = str_getcsv($line, ",", '"');
if ($num == 0) {
} else if ($num == 1) {
@@ -64,7 +63,7 @@
$agencyName = $Row[array_search($nameField, $headers)];
if (!$filter || $Row[array_search("State", $headers)] == "NAT") {
if (!in_array(trim($agencyName), array_keys($nametoid))) {
- echo trim($agencyName)." missing" . PHP_EOL;
+ echo trim($agencyName) . " missing" . PHP_EOL;
} else {
// echo $Row[array_search($nameField, $headers)] . PHP_EOL;
$accounts[$nametoid[trim($agencyName)]][$accountType][] = $Row[array_search($accountField, $headers)];
@@ -84,6 +83,18 @@
extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS");
// facebook
extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook");
+foreach ($accounts as $id => $accountTypes) {
+ echo $id . "<br>" . PHP_EOL;
+ $doc = object_to_array($db->get($id));
+ // Merge the scraped accounts into the stored "has<Type>" lists; the inner variable must not reuse the name $accounts.
+ foreach ($accountTypes as $accountType => $newAccounts) {
+ if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) {
+ $doc["has" . $accountType] = Array();
+ }
+ $doc["has" . $accountType] = array_values(array_unique(array_merge($doc["has" . $accountType], $newAccounts))); // reindex: array_unique leaves key gaps, which json_encode would emit as an object instead of a list
+ }
+ $db->save($doc);
+}
?>
--- /dev/null
+++ b/admin/validation.py
@@ -1,1 +1,30 @@
+#http://packages.python.org/CouchDB/client.html
+import couchdb
+import json
+import pprint
+import re
+from tidylib import tidy_document
+couch = couchdb.Server('http://127.0.0.1:5984/')
+
+# select database
+docsdb = couch['disclosr-documents']
+
+def f(x):  # filter predicate for tidy's output lines: truthy only for messages worth storing
+ invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized")  # any match drops the message
+ valid = re.compile(r"line")  # a kept message must contain "line"
+ return (not invalid.search(x)) and valid.search(x) and x != ''
+
+for row in docsdb.view('app/getValidationRequired'):
+    print row.id
+    if not row.value:  # the view emits doc._attachments, which is empty/None when the document has no attachment
+        continue
+    html = docsdb.get_attachment(row.id,row.value.iterkeys().next()).read()
+    # Run tidy with accessibility checks on; only the diagnostics are stored, the tidied markup is unused.
+    document, errors = tidy_document(html,options={'accessibility-check':1,'show-warnings':0,'markup':0},keep_doc=True)
+    #http://www.aprompt.ca/Tidy/accessibilitychecks.html
+    errors = '\n'.join(filter(f,errors.split('\n')))
+    doc = docsdb.get(row.id)
+    doc['validation'] = errors
+    docsdb.save(doc)
+
--- a/couchdb/settee
+++ b/couchdb/settee
--- a/getAgency.php
+++ b/getAgency.php
@@ -86,9 +86,14 @@
} else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
if (is_array($row[$defaultField])) {
$row[$defaultField][] = "";
+ $row[$defaultField][] = "";
+ $row[$defaultField][] = "";
} else {
$value = $row[$defaultField];
$row[$defaultField] = Array($value);
+ $row[$defaultField][] = "";
+ $row[$defaultField][] = "";
+
}
}
}
@@ -102,7 +107,7 @@
// by name = startkey="Ham"&endkey="Ham\ufff0"
// edit?
- $row = $db->get($_REQUEST['id']);
+ $obj = $db->get($_REQUEST['id']);
//print_r($row);
if (sizeof($_POST) > 0) {
//print_r($_POST);
@@ -126,17 +131,19 @@
echo "Edited version was latest version, continue saving";
$newdoc = $_POST;
$newdoc['metadata']['lastModified'] = time();
- $row = $db->save($newdoc);
+ $obj = $db->save($newdoc);
} else {
echo "ALERT doc revised by someone else while editing. Document not saved.";
}
}
$mode = "edit";
+ $rowArray = object_to_array($obj);
+ksort($rowArray);
if ($mode == "edit") {
- $row = addDefaultFields(object_to_array($row));
+ $row = addDefaultFields($rowArray);
} else {
- $row = object_to_array($row);
+ $row = $rowArray;
}
if ($mode == "view") {
@@ -188,14 +195,14 @@
(isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
. '</a></li>';
} */
- $rows = $db->get_view("app", "byName")->rows;
+ $rows = $db->get_view("app", "byCanonicalName")->rows;
//print_r($rows);
echo '<ul>';
foreach ($rows as $row) {
// print_r($row);
- echo '<li typeof="schema:GovernmentOrganisation foaf:Organization" about="getAgency.php?id=' . $row->value . '">
-<a href="getAgency.php?id=' . $row->value . '" rel="schema:url foaf:page" property="schema:name foaf:name">' .
- $row->key
+ echo '<li typeof="schema:GovernmentOrganisation foaf:Organization" about="getAgency.php?id=' . urlencode($row->value->_id) . '">
+<a href="getAgency.php?id=' . urlencode($row->value->_id) . '" rel="schema:url foaf:page" property="schema:name foaf:name">' .
+ htmlspecialchars($row->value->name, ENT_QUOTES, 'UTF-8') // document fields can originate from form input, so escape before embedding in HTML
. '</a></li>';
}
echo "</ul>";
--- a/include/couchdb.inc.php
+++ b/include/couchdb.inc.php
@@ -22,7 +22,10 @@
},
"byWebServer": {
"map": "function(doc) {\n emit(doc.web_server, doc);\n}"
- }
+ },
+ "getValidationRequired": {
+ "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"
+ }
}*/
}
@@ -167,3 +170,4 @@
function setteErrorHandler($e) {
echo $e->getMessage() . "<br>" . PHP_EOL;
}
+
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -17,6 +17,7 @@
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
+ "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>" ),
"budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
"grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
@@ -49,6 +50,8 @@
"items" => Array("type" => "string")),
"hasRestrictiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright",
"items" => Array("type" => "string")),
+ "hasPermissiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Permissive Licence", "description" => "Has any page licenced under terms more permissive than Crown Copyright but not clear CCBY",
+ "items" => Array("type" => "string")),
"hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
"items" => Array("type" => "string")),
),