Store scraper results to couchdb
Store scraper results to couchdb


Former-commit-id: 234bb19e5682c98cb4cbd9c6d6b1bf542ff16d50

<?php <?php
   
include_once('include/common.inc.php'); include_once('include/common.inc.php');
include_header(); include_header();
   
function displayValue($key, $value, $mode) { function displayValue($key, $value, $mode) {
global $db, $schemas; global $db, $schemas;
if ($mode == "view") { if ($mode == "view") {
   
echo "<tr>"; echo "<tr>";
   
echo "<td>" . $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small></td><td>"; echo "<td>" . $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small></td><td>";
if (is_array($value)) { if (is_array($value)) {
echo "<ol>"; echo "<ol>";
foreach ($value as $subkey => $subvalue) { foreach ($value as $subkey => $subvalue) {
if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) { if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
echo '<li itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '">'; echo '<li itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '">';
} else { } else {
echo "<li>"; echo "<li>";
} }
echo "$subvalue</li>"; echo "$subvalue</li>";
} }
echo "</ol></td></tr>"; echo "</ol></td></tr>";
} else { } else {
if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) { if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) {
echo '<span itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '">'; echo '<span itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '">';
} else { } else {
echo "<span>"; echo "<span>";
} }
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
echo "<a href='$value'>view</a></span>"; echo "<a href='$value'>view</a></span>";
} else { } else {
echo "$value</span>"; echo "$value</span>";
} }
} }
echo "</td></tr>"; echo "</td></tr>";
} }
if ($mode == "edit") { if ($mode == "edit") {
if (is_array($value)) { if (is_array($value)) {
echo '<div class="row"> echo '<div class="row">
<div class="seven columns"> <div class="seven columns">
<fieldset> <fieldset>
<h5>' . $key . '</h5>'; <h5>' . $key . '</h5>';
foreach ($value as $subkey => $subvalue) { foreach ($value as $subkey => $subvalue) {
echo "<label>$subkey</label><input class='input-text' type='text' id='$key$subkey' name='$key" . '[' . $subkey . "]' value='$subvalue'/></tr>"; echo "<label>$subkey</label><input class='input-text' type='text' id='$key$subkey' name='$key" . '[' . $subkey . "]' value='$subvalue'/></tr>";
} }
echo "</fieldset> echo "</fieldset>
</div> </div>
</div>"; </div>";
} else { } else {
if (strpos($key, "_") === 0) { if (strpos($key, "_") === 0) {
echo"<input type='hidden' id='$key' name='$key' value='$value'/>"; echo"<input type='hidden' id='$key' name='$key' value='$value'/>";
} else if ($key == "parentOrg") { } else if ($key == "parentOrg") {
echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>"; echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>";
$rows = $db->get_view("app", "byDeptStateName")->rows; $rows = $db->get_view("app", "byDeptStateName")->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>"; echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>";
} }
echo" </select>"; echo" </select>";
} else if (strpos($key, "has") === 0) { } else if (strpos($key, "has") === 0) {
echo "<label for='$key'><input type='checkbox' id='$key' name='$key' " . (($value == 'on' || $value == 'true') ? "checked='$value'" : "") . "> $key</label>"; echo "<label for='$key'><input type='checkbox' id='$key' name='$key' " . (($value == 'on' || $value == 'true') ? "checked='$value'" : "") . "> $key</label>";
} else { } else {
echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>"; echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
echo "<a href='$value'>view</a>"; echo "<a href='$value'>view</a>";
} }
if ($key == 'abn') { if ($key == 'abn') {
echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>"; echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
} }
} }
} }
} }
// //
} }
   
function addDefaultFields($row) { function addDefaultFields($row) {
global $schemas; global $schemas;
$defaultFields = array_keys($schemas['agency']['properties']); $defaultFields = array_keys($schemas['agency']['properties']);
foreach ($defaultFields as $defaultField) { foreach ($defaultFields as $defaultField) {
if (!isset($row[$defaultField])) { if (!isset($row[$defaultField])) {
if ($schemas['agency']['properties'][$defaultField]['type'] == "string") { if ($schemas['agency']['properties'][$defaultField]['type'] == "string") {
if (strpos($defaultField, "has") === 0) { if (strpos($defaultField, "has") === 0) {
$row[$defaultField] = "false"; $row[$defaultField] = "false";
} else { } else {
$row[$defaultField] = ""; $row[$defaultField] = "";
} }
} }
if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
   
$row[$defaultField] = Array(""); $row[$defaultField] = Array("");
} }
} }
} }
return $row; return $row;
} }
   
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
   
if (isset($_REQUEST['id'])) { if (isset($_REQUEST['id'])) {
//get an agency record as json/html, search by name/abn/id //get an agency record as json/html, search by name/abn/id
// by name = startkey="Ham"&endkey="Ham\ufff0" // by name = startkey="Ham"&endkey="Ham\ufff0"
// edit? // edit?
   
$row = $db->get($_REQUEST['id']); $row = $db->get($_REQUEST['id']);
//print_r($row); //print_r($row);
if (sizeof($_POST) > 0) { if (sizeof($_POST) > 0) {
//print_r($_POST); //print_r($_POST);
foreach ($_POST as $postkey => $postvalue) { foreach ($_POST as $postkey => $postvalue) {
if ($postvalue == "") { if ($postvalue == "") {
unset($_POST[$postkey]); unset($_POST[$postkey]);
} }
if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") { if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") {
unset($_POST[$postkey]); unset($_POST[$postkey]);
} }
} }
if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) { if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
echo "Edited version was latest version, continue saving"; echo "Edited version was latest version, continue saving";
$newdoc = $_POST; $newdoc = $_POST;
$newdoc['metadata']['lastModified'] = time(); $newdoc['metadata']['lastModified'] = time();
$row = $db->save($newdoc); $row = $db->save($newdoc);
} else { } else {
echo "ALERT doc revised by someone else while editing. Document not saved."; echo "ALERT doc revised by someone else while editing. Document not saved.";
} }
} }
   
$mode = "view"; $mode = "edit";
if ($mode == "edit") { if ($mode == "edit") {
$row = addDefaultFields(object_to_array($row)); $row = addDefaultFields(object_to_array($row));
} else { } else {
$row = object_to_array($row); $row = object_to_array($row);
} }
if ($mode == "view") { if ($mode == "view") {
echo '<div itemscope itemtype ="http://schema.org/GovernmentOrganisation"><table width="100%">'; echo '<div itemscope itemtype ="http://schema.org/GovernmentOrganisation"><table width="100%">';
echo '<tr> <td colspan="2"><h3>' . $row['name'] . "</h3></td></tr>"; echo '<tr> <td colspan="2"><h3>' . $row['name'] . "</h3></td></tr>";
echo "<tr><th>Field Name</th><th>Field Value</th></tr>"; echo "<tr><th>Field Name</th><th>Field Value</th></tr>";
} }
if ($mode == "edit") { if ($mode == "edit") {
?> ?>
<input id="addfield" type="button" value="Add Field"/> <input id="addfield" type="button" value="Add Field"/>
<script> <script>
window.onload = function() { window.onload = function() {
$(document).ready(function() { $(document).ready(function() {
// put all your jQuery goodness in here. // put all your jQuery goodness in here.
// http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/ // http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/
$('#addfield').click(function() { $('#addfield').click(function() {
var field_name=window.prompt("fieldname?",""); var field_name=window.prompt("fieldname?","");
if (field_name !="") { if (field_name !="") {
$('#submitbutton').before($('<span></span>') $('#submitbutton').before($('<span></span>')
.append("<label>"+field_name+"</label>") .append("<label>"+field_name+"</label>")
.append("<input class='input-text' type='text' id='"+field_name+"' name='"+field_name+"'/>") .append("<input class='input-text' type='text' id='"+field_name+"' name='"+field_name+"'/>")
); );
} }
}); });
}); });
}; };
</script> </script>
<form id="editform" class="nice" method="post"> <form id="editform" class="nice" method="post">
<?php <?php
   
} }
foreach ($row as $key => $value) { foreach ($row as $key => $value) {
echo displayValue($key, $value, $mode); echo displayValue($key, $value, $mode);
} }
if ($mode == "view") { if ($mode == "view") {
echo "</table></div>"; echo "</table></div>";
} }
if ($mode == "edit") { if ($mode == "edit") {
echo '<input id="submitbutton" type="submit"/></form>'; echo '<input id="submitbutton" type="submit"/></form>';
} }
} else { } else {
   
try { try {
/* $rows = $db->get_view("app", "showNamesABNs")->rows; /* $rows = $db->get_view("app", "showNamesABNs")->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
// print_r($row); // print_r($row);
echo '<li><a href="getAgency.php?id=' . $row->key . '">' . echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
(isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
. '</a></li>'; . '</a></li>';
} */ } */
$rows = $db->get_view("app", "byName")->rows; $rows = $db->get_view("app", "byName")->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
// print_r($row); // print_r($row);
echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization"><a href="getAgency.php?id=' . $row->value . '" itemprop="url"><span itemprop="name">' . echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization"><a href="getAgency.php?id=' . $row->value . '" itemprop="url"><span itemprop="name">' .
$row->key $row->key
. '</span></a></li>'; . '</span></a></li>';
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
} }
include_footer(); include_footer();
?> ?>
<?php <?php
   
include $basePath . "schemas/schemas.inc.php"; include $basePath . "schemas/schemas.inc.php";
   
require ($basePath . 'couchdb/settee/src/settee.php'); require ($basePath . 'couchdb/settee/src/settee.php');
   
function createAgencyDesignDoc() { function createAgencyDesignDoc() {
global $db; global $db;
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) { $obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc); emit(doc.name, doc);
} }
};"; };";
$obj->views->byDeptStateName->map = "function(doc) { $obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') { if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id); emit(doc.name, doc._id);
} }
};"; };";
$obj->views->parentOrgs->map = "function(doc) { $obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) { if (doc.parentOrg) {
emit(doc._id, doc.parentOrg); emit(doc._id, doc.parentOrg);
} }
};"; };";
$obj->views->byName->map = "function(doc) { $obj->views->byName->map = 'function(doc) {
  if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id); emit(doc.name, doc._id);
for (name in doc.otherNames) { for (name in doc.otherNames) {
if (doc.otherNames[name] != '' && doc.otherNames[name] != doc.name) { if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id); emit(doc.otherNames[name], doc._id);
} }
} }
};"; }
  };';
   
$obj->views->foiEmails->map = "function(doc) { $obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail); emit(doc._id, doc.foiEmail);
};"; };";
   
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) { $obj->views->getScrapeRequired->map = "function(doc) {
   
var lastScrape = Date.parse(doc.metadata.lastScraped); var lastScrape = Date.parse(doc.metadata.lastScraped);
   
var today = new Date(); var today = new Date();
   
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) { if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc); emit(doc._id, doc);
} }
   
};"; };";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) { $obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) { if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts)); emit(null, [doc._rev].concat(doc._conflicts));
} }
}"; }";
// http://stackoverflow.com/questions/646628/javascript-startswith // http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
if(!String.prototype.endsWith){ if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) { String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;     return this.indexOf(suffix, this.length - suffix.length) !== -1;
}; };
} }
function(doc) { function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1); emit(propName, 1);
} }
} }
emit("total", 1); emit("total", 1);
} }
}'; }';
$obj->views->score->map = 'if(!String.prototype.startsWith){ $obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
   
function(doc) { function(doc) {
count = 0; count = 0;
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && propName.startsWith("l")) { if(typeof(doc[propName]) != "undefined" && propName.startsWith("l")) {
count++ count++
} }
} }
emit(count+doc._id, {id:doc._id, name: doc.name, score:count}); emit(count+doc._id, {id:doc._id, name: doc.name, score:count});
} }
}'; }';
   
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
return $db->save($obj, true); return $db->save($obj, true);
} }
   
if (php_uname('n') == "vanille") { if (php_uname('n') == "vanille") {
   
$server = new SetteeServer('http://192.168.178.21:5984'); $server = new SetteeServer('http://192.168.178.21:5984');
} else } else
if (php_uname('n') == "KYUUBEY") { if (php_uname('n') == "KYUUBEY") {
   
$server = new SetteeServer('http://192.168.1.148:5984'); $server = new SetteeServer('http://192.168.1.148:5984');
} else { } else {
$server = new SetteeServer('http://127.0.0.1:5984'); $server = new SetteeServer('http://127.0.0.1:5984');
} }
   
function setteErrorHandler($e) { function setteErrorHandler($e) {
echo $e->getMessage() . "<br>" . PHP_EOL; echo $e->getMessage() . "<br>" . PHP_EOL;
} }
<?php <?php
   
$schemas['agency'] = Array( $schemas['agency'] = Array(
"description" => "Representation of government agency and online transparency measures", "description" => "Representation of government agency and online transparency measures",
"type" => "object", "type" => "object",
"properties" => Array( "properties" => Array(
"name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-title" => "Name", "description" => "Name, most recent and broadest"), "name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-title" => "Name", "description" => "Name, most recent and broadest"),
"shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"), "shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
"foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "description" => "FOI contact email if not foi@"), "foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "description" => "FOI contact email if not foi@"),
"sameAs" => Array("type" => "array", "required" => false, "x-itemprop"=>"http://www.w3.org/2002/07/owl#sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity", "sameAs" => Array("type" => "array", "required" => false, "x-itemprop"=>"http://www.w3.org/2002/07/owl#sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
"items" => Array("type" => "string")), "items" => Array("type" => "string")),
"otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation", "otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
"items" => Array("type" => "string")), "items" => Array("type" => "string")),
"foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-itemprop"=>"members", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous", "foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-itemprop"=>"members", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
"items" => Array("type" => "string")), "items" => Array("type" => "string")),
"orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."), "orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"), "parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "description" => "Website URL"), "website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"), "abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"), "contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded", "grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
"description" => "Departmental and agency grants <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> and <a href='http://www.finance.gov.au/publications/fmg-series/23-commonwealth-grant-guidelines.html'>Commonwealth grants guidelines</a> "), "description" => "Departmental and agency grants <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> and <a href='http://www.finance.gov.au/publications/fmg-series/23-commonwealth-grant-guidelines.html'>Commonwealth grants guidelines</a> "),
"annualReportURL" => Array("type" => "string", "required" => true, "x-title" => "Annual Report(s)", "description" => ""), "annualReportURL" => Array("type" => "string", "required" => true, "x-title" => "Annual Report(s)", "description" => ""),
"consultanciesURL" => Array("type" => "string", "required" => true, "x-title" => "Consultants Hired", "description" => ""), "consultanciesURL" => Array("type" => "string", "required" => true, "x-title" => "Consultants Hired", "description" => ""),
"legalExpenditureURL" => Array("type" => "string", "required" => true, "x-title" => "Legal Services Expenditure", "description" => "Legal Services Expenditure mandated by Legal Services Directions 2005"), "legalExpenditureURL" => Array("type" => "string", "required" => true, "x-title" => "Legal Services Expenditure", "description" => "Legal Services Expenditure mandated by Legal Services Directions 2005"),
"recordsListURL" => Array("type" => "string", "required" => true, "x-title" => "Files/Records Held", "description" => "Indexed lists of departmental and agency files, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"), "recordsListURL" => Array("type" => "string", "required" => true, "x-title" => "Files/Records Held", "description" => "Indexed lists of departmental and agency files, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"FOIDocumentsURL" => Array("type" => "string", "required" => true, "x-title" => "FOI Documents Released", "description" => ""), "FOIDocumentsURL" => Array("type" => "string", "required" => true, "x-title" => "FOI Documents Released", "description" => "FOI Disclosure Log URL"),
"infoPublicationSchemeURL" => Array("type" => "string", "required" => true, "x-title" => "Information Publication Scheme", "description" => ""), "FOIDocumentsRSSURL" => Array("type" => "string", "required" => false, "x-title" => "RSS Feed of FOI Documents Released", "description" => "FOI Disclosure Log in RSS format"),
  "hasFOIPDF" => Array("type" => "string", "required" => false, "x-title" => "Has FOI Documents Released in PDF", "description" => "FOI Disclosure Log contains any PDFs"),
  "infoPublicationSchemeURL" => Array("type" => "string", "required" => true, "x-title" => "Information Publication Scheme", "description" => ""),
"appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"), "appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "), "advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "),
"hasRSS" => Array("type" => "string", "required" => true, "x-title" => "Has RSS", "description" => ""), "hasRSS" => Array("type" => "string", "required" => true, "x-title" => "Has RSS", "description" => ""),
"hasMailingList" => Array("type" => "string", "required" => true, "x-title" => "Has Mailing List", "description" => ""), "hasMailingList" => Array("type" => "string", "required" => true, "x-title" => "Has Mailing List", "description" => ""),
"hasTwitter" => Array("type" => "string", "required" => true, "x-title" => "Has Twitter", "description" => ""), "hasTwitter" => Array("type" => "string", "required" => true, "x-title" => "Has Twitter", "description" => ""),
"hasFacebook" => Array("type" => "string", "required" => true, "x-title" => "Has Facebook", "description" => ""), "hasFacebook" => Array("type" => "string", "required" => true, "x-title" => "Has Facebook", "description" => ""),
"hasYouTube" => Array("type" => "string", "required" => true, "x-title" => "Has YouTube", "description" => ""), "hasYouTube" => Array("type" => "string", "required" => true, "x-title" => "Has YouTube", "description" => ""),
"hasFlickr" => Array("type" => "string", "required" => true, "x-title" => "Has Flickr", "description" => ""), "hasFlickr" => Array("type" => "string", "required" => true, "x-title" => "Has Flickr", "description" => ""),
"hasCCBY" => Array("type" => "string", "required" => true, "x-title" => "Has CC-BY", "description" => "Has any page licenced Creative Commons - Attribution"), "hasCCBY" => Array("type" => "string", "required" => true, "x-title" => "Has CC-BY", "description" => "Has any page licenced Creative Commons - Attribution"),
), ),
/* "org":{"type":"object", /* "org":{"type":"object",
"properties":{ "properties":{
"organizationName":{"type":"string"}, "organizationName":{"type":"string"},
"organizationUnit":{"type":"string"}}, "organizationUnit":{"type":"string"}},
} }
} */ } */
); );
?> ?>
   
file:a/scrape.py -> file:b/scrape.py
#http://packages.python.org/CouchDB/client.html #http://packages.python.org/CouchDB/client.html
import couchdb import couchdb
import urllib2 import urllib2
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import re import re
  import hashlib
  from urlparse import urljoin
  import time
  import os
   
#http://diveintopython.org/http_web_services/etags.html #http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler): class NotModifiedHandler(urllib2.BaseHandler):
def http_error_304(self, req, fp, code, message, headers): def http_error_304(self, req, fp, code, message, headers):
addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
addinfourl.code = code addinfourl.code = code
return addinfourl return addinfourl
   
def scrapeAndStore(URL, depth, agency): def fetchURL(docsdb, url, agencyID):
URL = "http://www.google.com" hash = hashlib.md5(url).hexdigest()
req = urllib2.Request(URL) req = urllib2.Request(url)
etag = 'y' print "Fetching %s", url
last_modified = 'y' doc = docsdb.get(hash)
#if there is a previous version sotred in couchdb, load caching helper tags if doc == None:
if etag: doc = {'_id': hash, 'agencyID': agencyID}
req.add_header("If-None-Match", etag) #if there is a previous version stored in couchdb, load caching helper tags
if last_modified: if doc.has_key('etag'):
req.add_header("If-Modified-Since", last_modified) req.add_header("If-None-Match", doc['etag'])
  if doc.has_key('last_modified'):
  req.add_header("If-Modified-Since", doc['last_modified'])
opener = urllib2.build_opener(NotModifiedHandler()) opener = urllib2.build_opener(NotModifiedHandler())
url_handle = opener.open(req) url_handle = opener.open(req)
headers = url_handle.info() # the addinfourls have the .info() too headers = url_handle.info() # the addinfourls have the .info() too
etag = headers.getheader("ETag") doc['etag'] = headers.getheader("ETag")
last_modified = headers.getheader("Last-Modified") doc['last_modified'] = headers.getheader("Last-Modified")
web_server = headers.getheader("Server") doc['web_server'] = headers.getheader("Server")
file_size = headers.getheader("Content-Length") doc['powered_by'] = headers.getheader("X-Powered-By")
mime_type = headers.getheader("Content-Type") doc['file_size'] = headers.getheader("Content-Length")
  doc['mime_type'] = headers.getheader("Content-Type").split(";")[0]
if hasattr(url_handle, 'code'): if hasattr(url_handle, 'code'):
if url_handle.code == 304: if url_handle.code == 304:
print "the web page has not been modified" print "the web page has not been modified"
  return None
else: else:
#do scraping content = url_handle.read()
html = url_handle.read() docsdb.save(doc)
# http://www.crummy.com/software/BeautifulSoup/documentation.html doc = docsdb.get(hash) # need to get a _rev
soup = BeautifulSoup(html) docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) return (doc['mime_type'], content)
for link in links:  
print link['href']  
#for each unique link  
#if html mimetype  
# go down X levels,  
# diff with last stored attachment, store in document  
#if not  
# remember to save parentURL and title (link text that lead to document)  
   
#store as attachment epoch-filename #store as attachment epoch-filename
else: else:
print "error %s in downloading %s", url_handle.code, URL print "error %s in downloading %s", url_handle.code, URL
#record/alert error to error database #record/alert error to error database
   
   
   
   
  def scrapeAndStore(docsdb, url, depth, agencyID):
  (mime_type,content) = fetchURL(docsdb, url, agencyID)
  if content != None:
  if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
  # http://www.crummy.com/software/BeautifulSoup/documentation.html
  soup = BeautifulSoup(content)
  navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
  for nav in navIDs:
  print "Removing element", nav['id']
  nav.extract()
  navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
  for nav in navClasses:
  print "Removing element", nav['class']
  nav.extract()
  links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
  for link in links:
  if link.has_key("href"):
  if link['href'].startswith("http"):
  linkurl = link['href']
  else:
  linkurl = urljoin(url,link['href'])
  print linkurl
  #for each unique link
  # if
  #if html mimetype
  # go down X levels,
  # diff with last stored attachment, store in document
  #if not
  # remember to save parentURL and title (link text that lead to document)
   
   
  couch = couchdb.Server('http://127.0.0.1:5984/')
   
   
   
   
couch = couchdb.Server('http://192.168.1.148:5984/')  
   
# select database # select database
agencydb = couch['disclosr-agencies'] agencydb = couch['disclosr-agencies']
  docsdb = couch['disclosr-documents']
   
for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
agency = agencydb.get(row.id) agency = agencydb.get(row.id)
print agency['name'] print agency['name']
scrapeAndStore("A",1,1) scrapeAndStore(docsdb, agency['website'],1,agency['_id'])