<?php | <?php |
include_once('include/common.inc.php'); | include_once('include/common.inc.php'); |
include_header(); | include_header(); |
/**
 * Escapes a value for safe inclusion in HTML text or a quoted attribute.
 * Arrays and objects are rendered as their JSON encoding so that nested
 * document fields do not trigger "Array to string conversion".
 */
function displayValueEscape($value) {
    if (is_array($value) || is_object($value)) {
        $value = json_encode($value);
    }
    return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
}

/**
 * Echoes the HTML for one field of an agency document.
 *
 * In "view" mode a table row is written (title/description cell plus value
 * cell); in "edit" mode the matching form control is written. Any other mode
 * produces no output. Titles, descriptions and itemprop names are read from
 * $schemas['agency']['properties']; descriptions are emitted unescaped
 * because the schema deliberately stores HTML links in them.
 *
 * @param string $key   field name within the agency document
 * @param mixed  $value field value; either a scalar or an array of values
 * @param string $mode  "view" or "edit"
 * @return void output is echoed directly
 */
function displayValue($key, $value, $mode) {
    global $db, $schemas;

    // Fields such as _id/_rev/metadata are not described by the schema.
    $property = isset($schemas['agency']['properties'][$key]) ? $schemas['agency']['properties'][$key] : array();
    $safeKey = displayValueEscape($key);
    // Fields ending in/containing "URL" (not at position 0) and "website" get a link.
    $isLinkField = (strpos($key, "URL") > 0 || $key == 'website');

    if ($mode == "view") {
        $title = isset($property['x-title']) ? $property['x-title'] : $safeKey;
        $description = isset($property['description']) ? $property['description'] : "";
        $itemprop = isset($property['x-itemprop']) ? ' itemprop="' . displayValueEscape($property['x-itemprop']) . '"' : '';

        echo "<tr>";
        echo "<td>" . $title . "<br><small>" . $description . "</small></td><td>";
        if (is_array($value)) {
            echo "<ol>";
            foreach ($value as $subvalue) {
                echo '<li' . $itemprop . '>' . displayValueEscape($subvalue) . "</li>";
            }
            echo "</ol>";
        } else {
            echo '<span' . $itemprop . '>';
            if ($isLinkField && $value != "") {
                echo "<a href='" . displayValueEscape($value) . "'>view</a></span>";
            } else {
                echo displayValueEscape($value) . "</span>";
            }
        }
        // Close the cell and row exactly once for both branches.
        echo "</td></tr>";
    }

    if ($mode == "edit") {
        $safeValue = displayValueEscape($value);
        if (is_array($value)) {
            echo '<div class="row">
<div class="seven columns">
<fieldset>
<h5>' . $safeKey . '</h5>';
            foreach ($value as $subkey => $subvalue) {
                $safeSubkey = displayValueEscape($subkey);
                echo "<label>{$safeSubkey}</label><input class='input-text' type='text' id='{$safeKey}{$safeSubkey}' name='{$safeKey}[{$safeSubkey}]' value='" . displayValueEscape($subvalue) . "'/>";
            }
            echo "</fieldset>
</div>
</div>";
        } else if (strpos($key, "_") === 0) {
            // CouchDB bookkeeping fields (_id, _rev) travel as hidden inputs.
            echo "<input type='hidden' id='{$safeKey}' name='{$safeKey}' value='{$safeValue}'/>";
        } else if ($key == "parentOrg") {
            echo "<label for='{$safeKey}'>{$safeKey}</label><select id='{$safeKey}' name='{$safeKey}'><option value=''> Select... </option>";
            $rows = $db->get_view("app", "byDeptStateName")->rows;
            foreach ($rows as $row) {
                echo "<option value='" . displayValueEscape($row->value) . "'" . (($row->value == $value) ? " SELECTED" : "") . " >"
                    . displayValueEscape(str_replace("Department of ", "", $row->key)) . "</option>";
            }
            echo " </select>";
        } else if (strpos($key, "has") === 0) {
            $checked = ($value == 'on' || $value == 'true') ? "checked='checked'" : "";
            echo "<label for='{$safeKey}'><input type='checkbox' id='{$safeKey}' name='{$safeKey}' {$checked}> {$safeKey}</label>";
        } else {
            echo "<label>{$safeKey}</label><input class='input-text' type='text' id='{$safeKey}' name='{$safeKey}' value='{$safeValue}'/>";
            if ($isLinkField && $value != "") {
                echo "<a href='{$safeValue}'>view</a>";
            }
            if ($key == 'abn') {
                echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=" . urlencode((string) $value) . "'>view abn</a>";
            }
        }
    }
}
/**
 * Fills in every schema-defined agency field that the document lacks, so the
 * edit form shows an input for each known field.
 *
 * Missing "string" fields default to "" (or "false" when the field name
 * starts with "has"); missing "array" fields default to an array holding a
 * single empty string. Fields of any other type are left untouched.
 *
 * @param array $row agency document as an associative array
 * @return array the document with defaults added
 */
function addDefaultFields($row) {
    global $schemas;
    foreach ($schemas['agency']['properties'] as $fieldName => $definition) {
        if (isset($row[$fieldName])) {
            continue;
        }
        if ($definition['type'] == "string") {
            $row[$fieldName] = (strpos($fieldName, "has") === 0) ? "false" : "";
        } else if ($definition['type'] == "array") {
            $row[$fieldName] = array("");
        }
    }
    return $row;
}
// Page controller: with ?id=... show (and optionally save) one agency
// document; without it, list every agency from the "byName" view.
$db = $server->get_db('disclosr-agencies');
if (isset($_REQUEST['id'])) {
    $row = $db->get($_REQUEST['id']);
    if (sizeof($_POST) > 0) {
        // Drop empty fields and single-empty-element arrays so blank form
        // inputs are not persisted into the document.
        foreach ($_POST as $postkey => $postvalue) {
            $isBlankString = ($postvalue === "");
            $isBlankArray = is_array($postvalue) && count($postvalue) == 1
                    && isset($postvalue[0]) && $postvalue[0] == "";
            if ($isBlankString || $isBlankArray) {
                unset($_POST[$postkey]);
            }
        }
        // Only save when the submitted revision is still the latest one.
        if (isset($_POST['_id']) && isset($_POST['_rev']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
            echo "Edited version was latest version, continue saving";
            $newdoc = $_POST;
            $newdoc['metadata']['lastModified'] = time();
            $row = $db->save($newdoc);
        } else {
            echo "ALERT doc revised by someone else while editing. Document not saved.";
        }
    }
    // NOTE(review): mode is hard-coded; the "view" branches below only run if this is changed.
    $mode = "edit";
    if ($mode == "edit") {
        $row = addDefaultFields(object_to_array($row));
    } else {
        $row = object_to_array($row);
    }
    if ($mode == "view") {
        $agencyName = isset($row['name']) ? htmlspecialchars((string) $row['name'], ENT_QUOTES, 'UTF-8') : '';
        echo '<div itemscope itemtype="http://schema.org/GovernmentOrganization"><table width="100%">';
        echo '<tr> <td colspan="2"><h3>' . $agencyName . "</h3></td></tr>";
        echo "<tr><th>Field Name</th><th>Field Value</th></tr>";
    }
    if ($mode == "edit") {
        ?>
        <input id="addfield" type="button" value="Add Field"/>
        <script>
            window.onload = function() {
                $(document).ready(function() {
                    // http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/
                    $('#addfield').click(function() {
                        var field_name = window.prompt("fieldname?", "");
                        // prompt() yields null on cancel and "" on empty input; skip both.
                        if (field_name) {
                            $('#submitbutton').before($('<span></span>')
                                .append($('<label></label>').text(field_name))
                                .append($('<input/>', {'class': 'input-text', type: 'text', id: field_name, name: field_name}))
                            );
                        }
                    });
                });
            };
        </script>
        <form id="editform" class="nice" method="post">
        <?php
    }
    foreach ($row as $key => $value) {
        // displayValue() echoes its own output and returns nothing.
        displayValue($key, $value, $mode);
    }
    if ($mode == "view") {
        echo "</table></div>";
    }
    if ($mode == "edit") {
        echo '<input id="submitbutton" type="submit"/></form>';
    }
} else {
    try {
        $rows = $db->get_view("app", "byName")->rows;
        foreach ($rows as $row) {
            // View rows are keyed by agency name with the document id as value.
            echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization"><a href="getAgency.php?id='
                . urlencode((string) $row->value) . '" itemprop="url"><span itemprop="name">'
                . htmlspecialchars((string) $row->key, ENT_QUOTES, 'UTF-8')
                . '</span></a></li>';
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
}
include_footer();
?> | ?> |
<?php | <?php |
include $basePath . "schemas/schemas.inc.php"; | include $basePath . "schemas/schemas.inc.php"; |
require ($basePath . 'couchdb/settee/src/settee.php'); | require ($basePath . 'couchdb/settee/src/settee.php'); |
/**
 * Builds the "_design/app" design document for the agencies database and
 * saves it through the global $db handle.
 *
 * Each entry of $viewMaps becomes one view whose "map" member is the given
 * JavaScript source. The nested objects are created explicitly because
 * assigning through undefined properties is an error on current PHP.
 *
 * @return mixed whatever $db->save() returns
 */
function createAgencyDesignDoc() {
    global $db;

    $viewMaps = array(
        'all' => "function(doc) { emit(doc._id, doc); };",
        'byABN' => "function(doc) { emit(doc.abn, doc); };",
        'byCanonicalName' => "function(doc) {
        if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
            emit(doc.name, doc);
        }
};",
        'byDeptStateName' => "function(doc) {
        if (doc.orgType == 'FMA-DepartmentOfState') {
            emit(doc.name, doc._id);
        }
};",
        'parentOrgs' => "function(doc) {
        if (doc.parentOrg) {
            emit(doc._id, doc.parentOrg);
        }
};",
        // Emits the primary name plus every distinct non-empty alias, skipping suspended agencies.
        'byName' => 'function(doc) {
        if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
            emit(doc.name, doc._id);
            for (name in doc.otherNames) {
                if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                    emit(doc.otherNames[name], doc._id);
                }
            }
        }
};',
        'foiEmails' => "function(doc) {
        emit(doc._id, doc.foiEmail);
};",
        'byLastModified' => "function(doc) { emit(doc.metadata.lastModified, doc); }",
        'getActive' => 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };',
        'getSuspended' => 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };',
        // NOTE(review): Date.parse() returns a number, so lastScrape.getTime() looks
        // like it cannot succeed when lastScraped is set — confirm intended behaviour.
        'getScrapeRequired' => "function(doc) {
        var lastScrape = Date.parse(doc.metadata.lastScraped);
        var today = new Date();
        if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
            emit(doc._id, doc);
        }
};",
        'showNamesABNs' => "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };",
        'getConflicts' => "function(doc) {
        if (doc._conflicts) {
            emit(null, [doc._rev].concat(doc._conflicts));
        }
}",
        // http://stackoverflow.com/questions/646628/javascript-startswith
        'scoreHas' => 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}
function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}',
        'score' => 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
function(doc) {
    count = 0;
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && propName.startsWith("l")) {
                count++
            }
        }
        emit(count+doc._id, {id:doc._id, name: doc.name, score:count});
    }
}',
    );

    $obj = new stdClass();
    $obj->_id = "_design/" . urlencode("app");
    $obj->language = "javascript";
    $obj->views = new stdClass();
    foreach ($viewMaps as $viewName => $mapSource) {
        $obj->views->$viewName = new stdClass();
        $obj->views->$viewName->map = $mapSource;
    }

    // allow safe updates (even if slightly slower due to extra: rev-detection check).
    return $db->save($obj, true);
}
// Pick the CouchDB endpoint from this machine's host name; any host not
// listed here talks to a CouchDB on localhost.
$couchServersByHost = array(
    'vanille' => 'http://192.168.178.21:5984',
    'KYUUBEY' => 'http://192.168.1.148:5984',
);
$currentHostName = php_uname('n');
$server = new SetteeServer(
    isset($couchServersByHost[$currentHostName]) ? $couchServersByHost[$currentHostName] : 'http://127.0.0.1:5984'
);
/**
 * Prints an exception's message to the page, followed by an HTML line break.
 * The message is HTML-escaped because it is written straight into the page.
 *
 * @param Exception $e the exception to report
 * @return void
 */
function setteErrorHandler($e) {
    echo htmlspecialchars($e->getMessage(), ENT_QUOTES, 'UTF-8') . "<br>" . PHP_EOL;
}
<?php | <?php |
// JSON-schema-like description of an agency document. Besides the standard
// "type"/"required"/"description" members, each property may carry:
//   x-title    - human readable label used by the view table
//   x-itemprop - microdata itemprop emitted around the value
// Descriptions may contain HTML links and are output as-is.
$schemas['agency'] = array(
    "description" => "Representation of government agency and online transparency measures",
    "type" => "object",
    "properties" => array(
        "name" => array("type" => "string", "required" => true, "x-itemprop" => "name", "x-title" => "Name", "description" => "Name, most recent and broadest"),
        "shortName" => array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
        "foiEmail" => array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "description" => "FOI contact email if not foi@"),
        "sameAs" => array("type" => "array", "required" => false, "x-itemprop" => "http://www.w3.org/2002/07/owl#sameAs", "x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
            "items" => array("type" => "string")),
        "otherNames" => array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
            "items" => array("type" => "string")),
        "foiBodies" => array("type" => "array", "required" => true, "x-title" => "FOI Bodies", "x-itemprop" => "members", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
            "items" => array("type" => "string")),
        "orgType" => array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
        "parentOrg" => array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
        "website" => array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "description" => "Website URL"),
        "abn" => array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
        "contractListURL" => array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
        "grantsReportingURL" => array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
            "description" => "Departmental and agency grants <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> and <a href='http://www.finance.gov.au/publications/fmg-series/23-commonwealth-grant-guidelines.html'>Commonwealth grants guidelines</a> "),
        "annualReportURL" => array("type" => "string", "required" => true, "x-title" => "Annual Report(s)", "description" => ""),
        "consultanciesURL" => array("type" => "string", "required" => true, "x-title" => "Consultants Hired", "description" => ""),
        "legalExpenditureURL" => array("type" => "string", "required" => true, "x-title" => "Legal Services Expenditure", "description" => "Legal Services Expenditure mandated by Legal Services Directions 2005"),
        "recordsListURL" => array("type" => "string", "required" => true, "x-title" => "Files/Records Held", "description" => "Indexed lists of departmental and agency files, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
        "FOIDocumentsURL" => array("type" => "string", "required" => true, "x-title" => "FOI Documents Released", "description" => "FOI Disclosure Log URL"),
        "FOIDocumentsRSSURL" => array("type" => "string", "required" => false, "x-title" => "RSS Feed of FOI Documents Released", "description" => "FOI Disclosure Log in RSS format"),
        "hasFOIPDF" => array("type" => "string", "required" => false, "x-title" => "Has FOI Documents Released in PDF", "description" => "FOI Disclosure Log contains any PDFs"),
        "infoPublicationSchemeURL" => array("type" => "string", "required" => true, "x-title" => "Information Publication Scheme", "description" => ""),
        "appointmentsURL" => array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
        "advertisingURL" => array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "),
        "hasRSS" => array("type" => "string", "required" => true, "x-title" => "Has RSS", "description" => ""),
        "hasMailingList" => array("type" => "string", "required" => true, "x-title" => "Has Mailing List", "description" => ""),
        "hasTwitter" => array("type" => "string", "required" => true, "x-title" => "Has Twitter", "description" => ""),
        "hasFacebook" => array("type" => "string", "required" => true, "x-title" => "Has Facebook", "description" => ""),
        "hasYouTube" => array("type" => "string", "required" => true, "x-title" => "Has YouTube", "description" => ""),
        "hasFlickr" => array("type" => "string", "required" => true, "x-title" => "Has Flickr", "description" => ""),
        "hasCCBY" => array("type" => "string", "required" => true, "x-title" => "Has CC-BY", "description" => "Has any page licenced Creative Commons - Attribution"),
    ),
        /* "org":{"type":"object",
          "properties":{
          "organizationName":{"type":"string"},
          "organizationUnit":{"type":"string"}},
          }
          } */
);
?> | ?> |
#http://packages.python.org/CouchDB/client.html | #http://packages.python.org/CouchDB/client.html |
import couchdb | import couchdb |
import urllib2 | import urllib2 |
from BeautifulSoup import BeautifulSoup | from BeautifulSoup import BeautifulSoup |
import re | import re |
import hashlib | |
from urlparse import urljoin | |
import time | |
import os | |
#http://diveintopython.org/http_web_services/etags.html | #http://diveintopython.org/http_web_services/etags.html |
class NotModifiedHandler(urllib2.BaseHandler): | class NotModifiedHandler(urllib2.BaseHandler): |
def http_error_304(self, req, fp, code, message, headers): | def http_error_304(self, req, fp, code, message, headers): |
addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) | addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) |
addinfourl.code = code | addinfourl.code = code |
return addinfourl | return addinfourl |
def fetchURL(docsdb, url, agencyID):
    """Download ``url`` with conditional-GET headers and archive the body.

    A document keyed by the MD5 hex digest of the URL is looked up in
    ``docsdb`` (created with ``agencyID`` when absent). Its stored ETag and
    Last-Modified values, if any, are sent as If-None-Match /
    If-Modified-Since. On a fresh download the response headers are saved on
    the document and the body is attached as "<epoch>-<basename of url>".

    Returns a ``(mime_type, content)`` tuple; both members are ``None`` when
    the server answers 304 or the response carries no status code, so callers
    can always unpack the result.
    """
    url_hash = hashlib.md5(url).hexdigest()
    req = urllib2.Request(url)
    print("Fetching %s" % url)
    doc = docsdb.get(url_hash)
    if doc is None:
        doc = {'_id': url_hash, 'agencyID': agencyID}
    # if there is a previous version stored in couchdb, load caching helper tags
    # (skip values stored as None, which cannot be sent as header values)
    if doc.get('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.get('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])

    opener = urllib2.build_opener(NotModifiedHandler())
    url_handle = opener.open(req)
    headers = url_handle.info()  # the addinfourls have the .info() too
    doc['etag'] = headers.getheader("ETag")
    doc['last_modified'] = headers.getheader("Last-Modified")
    doc['web_server'] = headers.getheader("Server")
    doc['powered_by'] = headers.getheader("X-Powered-By")
    doc['file_size'] = headers.getheader("Content-Length")
    content_type = headers.getheader("Content-Type")
    # Keep only the media type, dropping parameters such as "; charset=...".
    doc['mime_type'] = content_type.split(";")[0].strip() if content_type else None

    status = getattr(url_handle, 'code', None)
    if status is None:
        print("error %s in downloading %s" % (status, url))
        # record/alert error to error database
        return (None, None)
    if status == 304:
        print("the web page has not been modified")
        return (None, None)

    content = url_handle.read()
    docsdb.save(doc)
    doc = docsdb.get(url_hash)  # need to get a _rev
    # store as attachment epoch-filename
    docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
    return (doc['mime_type'], content)
def scrapeAndStore(docsdb, url, depth, agencyID):
    """Fetch ``url`` and, for HTML/XML content, print every outgoing link.

    Elements whose id or class looks like navigation (nav/menu/bar) are
    removed before links are collected. ``depth`` is accepted for the planned
    recursive crawl but is not used yet.
    """
    fetched = fetchURL(docsdb, url, agencyID)
    if fetched is None:
        # Nothing new to process (e.g. the page was not modified).
        return
    (mime_type, content) = fetched
    if content is None:
        return
    if mime_type not in ("text/html", "application/xhtml+xml", "application/xml"):
        return

    # http://www.crummy.com/software/BeautifulSoup/documentation.html
    soup = BeautifulSoup(content)
    for nav in soup.findAll(id=re.compile('nav|Nav|menu|bar')):
        print("Removing element %s" % nav['id'])
        nav.extract()
    for nav in soup.findAll(attrs={'class': re.compile('nav|menu|bar')}):
        print("Removing element %s" % nav['class'])
        nav.extract()

    for link in soup.findAll('a'):  # soup.findAll('a', id=re.compile("^p-"))
        if link.has_key("href"):
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            linkurl = urljoin(url, link['href'])
            print(linkurl)
    # TODO for each unique link:
    #   if html mimetype: go down X levels,
    #     diff with last stored attachment, store in document
    #   if not: remember to save parentURL and title (link text that lead to document)
# Connect to CouchDB. The earlier 127.0.0.1 assignment was overwritten
# immediately by this one, so only the effective server is kept.
couch = couchdb.Server('http://192.168.1.148:5984/')
# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']
for row in agencydb.view('app/getScrapeRequired'):  # not recently scraped agencies view?
    agency = agencydb.get(row.id)
    print(agency['name'])
    website = agency.get('website')
    if not website:
        # Nothing to scrape for agencies without a recorded website.
        continue
    scrapeAndStore(docsdb, website, 1, agency['_id'])