Use RDFa for semantic markup
Use RDFa for semantic markup


Former-commit-id: bff09f0408b049c7c7d7a6c1a35b890b9ed693e2

  <?php
 
  include_once("../include/common.inc.php");
 
 
  setlocale(LC_CTYPE, 'C');
 
  $headers = Array("#id", "name", "request_email", "short_name", "notes", "publication_scheme", "home_page", "tag_string");
 
  $db = $server->get_db('disclosr-agencies');
  $headers = Array();
  try {
  $rows = $db->get_view("app", "fieldNames?group=true", null, true)->rows;
 
  $dataValues = Array();
  foreach ($rows as $row) {
  $headers[] = $row->key;
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
 
  $fp = fopen('php://output', 'w');
  if ($fp && $db) {
  header('Content-Type: text/csv; charset=utf-8');
  header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
  header('Pragma: no-cache');
  header('Expires: 0');
  fputcsv($fp, $headers);
  try {
  $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
  //print_r($rows);
  foreach ($agencies as $agency) {
  // print_r($agency);
 
  if ( !isset($agency->value->status)) {
  $row = Array();
  $agencyArray = object_to_array($agency->value);
  foreach ($headers as $fieldName) {
  if (isset($agencyArray[$fieldName])) {
  if (is_array($agencyArray[$fieldName])) {
  $row[] = implode(";",$agencyArray[$fieldName]);
  } else {
  $row[] = $agencyArray[$fieldName];
  }
  } else {
  $row[] = "";
  }
  }
 
  fputcsv($fp, array_values($row));
 
 
  }
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
 
  die;
  }
  ?>
 
  <?php
 
  /**
  * Databaase class.
  */
  class SetteeDatabase {
 
  /**
  * Base URL of the CouchDB REST API
  */
  private $conn_url;
 
  /**
  * HTTP REST Client instance
  */
  protected $rest_client;
 
  /**
  * Name of the database
  */
  private $dbname;
 
  /**
  * Default constructor
  */
  function __construct($conn_url, $dbname) {
  $this->conn_url = $conn_url;
  $this->dbname = $dbname;
  $this->rest_client = SetteeRestClient::get_instance($this->conn_url);
  }
 
 
  /**
  * Get UUID from CouchDB
  *
  * @return
  * CouchDB-generated UUID string
  *
  */
  function gen_uuid() {
  $ret = $this->rest_client->http_get('_uuids');
  return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking
  }
 
  /**
  * Create or update a document database
  *
  * @param $document
  * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically.
  *
  * <p>If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation).
  * If "_id" is missing, CouchDB will be used to generate a UUID.
  *
  * <p>If $document has a "_rev" property (revision), document will be updated, rather than creating a new document.
  * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be
  * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but
  * not provide "_id" since that is an invalid input.
  *
  * @param $allowRevAutoDetection
  * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision
  * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and
  * therefore can make save() operation slightly slower if such auto-detection is not required.
  *
  * @return
  * document object with the database id (uuid) and revision attached;
  *
  * @throws SetteeCreateDatabaseException
  */
  function save($document, $allowRevAutoDetection = false) {
  if (is_string($document)) {
  $document = json_decode($document);
  }
 
  // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter)
  if(is_array($document)) {
  $document = (object) $document;
  }
 
  if (empty($document->_id) && empty($document->_rev)) {
  $id = $this->gen_uuid();
  }
  elseif (empty($document->_id) && !empty($document->_rev)) {
  throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id");
  }
  else {
  $id = $document->_id;
 
  if ($allowRevAutoDetection) {
  try {
  $rev = $this->get_rev($id);
  } catch (SetteeRestClientException $e) {
  // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error
  }
  if (!empty($rev)) {
  $document->_rev = $rev;
  }
  }
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $document_json = json_encode($document, JSON_NUMERIC_CHECK);
 
  $ret = $this->rest_client->http_put($full_uri, $document_json);
 
  $document->_id = $ret['decoded']->id;
  $document->_rev = $ret['decoded']->rev;
 
  return $document;
  }
 
  /**
  * @param $doc
  * @param $name
  * @param $content
  * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for
  * you, so you don't have to do it.
  * @param $mime_type
  * Optional. Will be auto-detected if not provided
  * @return void
  */
  public function add_attachment($doc, $name, $content, $mime_type = null) {
  if (empty($doc->_attachments) || !is_object($doc->_attachments)) {
  $doc->_attachments = new stdClass();
  }
 
  if (empty($mime_type)) {
  $mime_type = $this->rest_client->content_mime_type($content);
  }
 
  $doc->_attachments->$name = new stdClass();
  $doc->_attachments->$name->content_type = $mime_type;
  $doc->_attachments->$name->data = base64_encode($content);
  }
 
  /**
  * @param $doc
  * @param $name
  * @param $file
  * Full path to a file (e.g. as returned by PHP's realpath function).
  * @param $mime_type
  * Optional. Will be auto-detected if not provided
  * @return void
  */
  public function add_attachment_file($doc, $name, $file, $mime_type = null) {
  $content = file_get_contents($file);
  $this->add_attachment($doc, $name, $content, $mime_type);
  }
 
  /**
  *
  * Retrieve a document from CouchDB
  *
  * @throws SetteeWrongInputException
  *
  * @param $id
  * Unique ID (usually: UUID) of the document to be retrieved.
  * @return
  * database document in PHP object format.
  */
  function get($id) {
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $full_uri = str_replace("%3Frev%3D","?rev=",$full_uri);
  $ret = $this->rest_client->http_get($full_uri);
  return $ret['decoded'];
  }
 
  /**
  *
  * Get the latest revision of a document with document id: $id in CouchDB.
  *
  * @throws SetteeWrongInputException
  *
  * @param $id
  * Unique ID (usually: UUID) of the document to be retrieved.
  * @return
  * database document in PHP object format.
  */
  function get_rev($id) {
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't query a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $headers = $this->rest_client->http_head($full_uri);
  if (empty($headers['Etag'])) {
  throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag");
  }
  $etag = str_replace('"', '', $headers['Etag']);
  return $etag;
  }
 
  /**
  * Delete a document
  *
  * @param $document
  * a PHP object or JSON representation of the document that has _id and _rev fields.
  *
  * @return void
  */
  function delete($document) {
  if (!is_object($document)) {
  $document = json_decode($document);
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev;
  $this->rest_client->http_delete($full_uri);
  }
 
 
  /*----------------- View-related functions --------------*/
 
  /**
  * Create a new view or update an existing one.
  *
  * @param $design_doc
  * @param $view_name
  * @param $map_src
  * Source code of the map function in Javascript
  * @param $reduce_src
  * Source code of the reduce function in Javascript (optional)
  * @return void
  */
  function save_view($design_doc, $view_name, $map_src, $reduce_src = null) {
  $obj = new stdClass();
  $obj->_id = "_design/" . urlencode($design_doc);
  $view_name = urlencode($view_name);
  $obj->views->$view_name->map = $map_src;
  if (!empty($reduce_src)) {
  $obj->views->$view_name->reduce = $reduce_src;
  }
 
  // allow safe updates (even if slightly slower due to extra: rev-detection check).
  return $this->save($obj, true);
  }
 
  /**
  * Create a new view or update an existing one.
  *
  * @param $design_doc
  * @param $view_name
  * @param $key
  * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes
  * that first element is startkey, second: endkey.
  * @param $descending
  * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change
  * order you also need to swap startkey and endkey values!
  *
  * @return void
  */
  function get_view($design_doc, $view_name, $key = null, $descending = false) {
  $id = "_design/" . urlencode($design_doc);
  $view_name = urlencode($view_name);
  $id .= "/_view/$view_name";
 
  $data = array();
  if (!empty($key)) {
  if (is_string($key)) {
  $data = "key=" . '"' . $key . '"';
  }
  elseif (is_array($key)) {
  list($startkey, $endkey) = $key;
  $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"';
  }
 
  if ($descending) {
  $data .= "&descending=true";
  }
  }
 
 
 
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $full_uri = str_replace("%253Fgroup%253Dtrue","?group=true",$full_uri);
  $ret = $this->rest_client->http_get($full_uri, $data);
  return $ret['decoded'];
 
  }
 
  /**
  * @param $id
  * @return
  * return a properly url-encoded id.
  */
  private function safe_urlencode($id) {
  //-- System views like _design can have "/" in their URLs.
  $id = rawurlencode($id);
  if (substr($id, 0, 1) == '_') {
  $id = str_replace('%2F', '/', $id);
  }
  return $id;
  }
 
  /** Getter for a database name */
  function get_name() {
  return $this->dbname;
  }
 
  }
<?php <?php
   
include_once('include/common.inc.php'); include_once('include/common.inc.php');
include_header(); include_header();
   
function displayValue($key, $value, $mode) { function displayValue($key, $value, $mode) {
global $db, $schemas; global $db, $schemas;
if ($mode == "view") { if ($mode == "view") {
   
echo "<tr>"; echo "<tr>";
   
echo "<td>" . $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small></td><td>"; echo "<td>" . $schemas['agency']["properties"][$key]['x-title'] . "<br><small>" . $schemas['agency']["properties"][$key]['description'] . "</small></td><td>";
if (is_array($value)) { if (is_array($value)) {
echo "<ol>"; echo "<ol>";
foreach ($value as $subkey => $subvalue) { foreach ($value as $subkey => $subvalue) {
if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) { if (isset($schemas['agency']["properties"][$key]['x-property'])) {
echo '<li itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '">'; echo '<li property="' . $schemas['agency']["properties"][$key]['x-property'] . '">';
} else { } else {
echo "<li>"; echo "<li>";
} }
echo "$subvalue</li>"; echo "$subvalue</li>";
} }
echo "</ol></td></tr>"; echo "</ol></td></tr>";
} else { } else {
if (isset($schemas['agency']["properties"][$key]['x-itemprop'])) { if (isset($schemas['agency']["properties"][$key]['x-property'])) {
echo '<span itemprop="' . $schemas['agency']["properties"][$key]['x-itemprop'] . '">'; echo '<span property="' . $schemas['agency']["properties"][$key]['x-property'] . '">';
} else { } else {
echo "<span>"; echo "<span>";
} }
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
echo "<a href='$value'>view</a></span>"; echo "<a href='$value'>view</a></span>";
} else { } else {
echo "$value</span>"; echo "$value</span>";
} }
} }
echo "</td></tr>"; echo "</td></tr>";
} }
if ($mode == "edit") { if ($mode == "edit") {
if (is_array($value)) { if (is_array($value)) {
echo '<div class="row"> echo '<div class="row">
<div class="seven columns"> <div class="seven columns">
<fieldset> <fieldset>
<h5>' . $key . '</h5>'; <h5>' . $key . '</h5>';
foreach ($value as $subkey => $subvalue) { foreach ($value as $subkey => $subvalue) {
echo "<label>$subkey</label><input class='input-text' type='text' id='$key$subkey' name='$key" . '[' . $subkey . "]' value='$subvalue'/></tr>"; echo "<label>$subkey</label><input class='input-text' type='text' id='$key$subkey' name='$key" . '[' . $subkey . "]' value='$subvalue'/></tr>";
} }
echo "</fieldset> echo "</fieldset>
</div> </div>
</div>"; </div>";
} else { } else {
if (strpos($key, "_") === 0) { if (strpos($key, "_") === 0) {
echo"<input type='hidden' id='$key' name='$key' value='$value'/>"; echo"<input type='hidden' id='$key' name='$key' value='$value'/>";
} else if ($key == "parentOrg") { } else if ($key == "parentOrg") {
echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>"; echo "<label for='$key'>$key</label><select id='$key' name='$key'><option value=''> Select... </option>";
$rows = $db->get_view("app", "byDeptStateName")->rows; $rows = $db->get_view("app", "byDeptStateName")->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>"; echo "<option value='{$row->value}'" . (($row->value == $value) ? "SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>";
} }
echo" </select>"; echo" </select>";
} else if (strpos($key, "has") === 0) { } else {
echo "<label for='$key'><input type='checkbox' id='$key' name='$key' " . (($value == 'on' || $value == 'true') ? "checked='$value'" : "") . "> $key</label>";  
} else {  
echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>"; echo "<label>$key</label><input class='input-text' type='text' id='$key' name='$key' value='$value'/>";
if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") {
echo "<a href='$value'>view</a>"; echo "<a href='$value'>view</a>";
} }
if ($key == 'abn') { if ($key == 'abn') {
echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>"; echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=$value'>view abn</a>";
} }
} }
} }
} }
// //
} }
   
function addDefaultFields($row) { function addDefaultFields($row) {
global $schemas; global $schemas;
$defaultFields = array_keys($schemas['agency']['properties']); $defaultFields = array_keys($schemas['agency']['properties']);
foreach ($defaultFields as $defaultField) { foreach ($defaultFields as $defaultField) {
if (!isset($row[$defaultField])) { if (!isset($row[$defaultField])) {
if ($schemas['agency']['properties'][$defaultField]['type'] == "string") { if ($schemas['agency']['properties'][$defaultField]['type'] == "string") {
if (strpos($defaultField, "has") === 0) {  
$row[$defaultField] = "false";  
} else {  
$row[$defaultField] = ""; $row[$defaultField] = "";
}  
} }
if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { if ($schemas['agency']['properties'][$defaultField]['type'] == "array") {
   
$row[$defaultField] = Array(""); $row[$defaultField] = Array("");
} }
} }
} }
return $row; return $row;
} }
   
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
   
if (isset($_REQUEST['id'])) { if (isset($_REQUEST['id'])) {
//get an agency record as json/html, search by name/abn/id //get an agency record as json/html, search by name/abn/id
// by name = startkey="Ham"&endkey="Ham\ufff0" // by name = startkey="Ham"&endkey="Ham\ufff0"
// edit? // edit?
   
$row = $db->get($_REQUEST['id']); $row = $db->get($_REQUEST['id']);
//print_r($row); //print_r($row);
if (sizeof($_POST) > 0) { if (sizeof($_POST) > 0) {
//print_r($_POST); //print_r($_POST);
foreach ($_POST as $postkey => $postvalue) { foreach ($_POST as $postkey => $postvalue) {
if ($postvalue == "") { if ($postvalue == "") {
unset($_POST[$postkey]); unset($_POST[$postkey]);
} }
if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") { if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") {
unset($_POST[$postkey]); unset($_POST[$postkey]);
} }
} }
if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) { if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
echo "Edited version was latest version, continue saving"; echo "Edited version was latest version, continue saving";
$newdoc = $_POST; $newdoc = $_POST;
$newdoc['metadata']['lastModified'] = time(); $newdoc['metadata']['lastModified'] = time();
$row = $db->save($newdoc); $row = $db->save($newdoc);
} else { } else {
echo "ALERT doc revised by someone else while editing. Document not saved."; echo "ALERT doc revised by someone else while editing. Document not saved.";
} }
} }
   
$mode = "view"; $mode = "edit";
if ($mode == "edit") { if ($mode == "edit") {
$row = addDefaultFields(object_to_array($row)); $row = addDefaultFields(object_to_array($row));
} else { } else {
$row = object_to_array($row); $row = object_to_array($row);
} }
if ($mode == "view") { if ($mode == "view") {
echo '<div itemscope itemtype ="http://schema.org/GovernmentOrganisation"><table width="100%">'; echo '<div typeof="schema:GovernmentOrganisation" about="#' . $row['_id'] . '"><table width="100%">';
echo '<tr> <td colspan="2"><h3>' . $row['name'] . "</h3></td></tr>"; echo '<tr> <td colspan="2"><h3>' . $row['name'] . "</h3></td></tr>";
echo "<tr><th>Field Name</th><th>Field Value</th></tr>"; echo "<tr><th>Field Name</th><th>Field Value</th></tr>";
} }
if ($mode == "edit") { if ($mode == "edit") {
?> ?>
<input id="addfield" type="button" value="Add Field"/> <input id="addfield" type="button" value="Add Field"/>
<script> <script>
window.onload = function() { window.onload = function() {
$(document).ready(function() { $(document).ready(function() {
// put all your jQuery goodness in here. // put all your jQuery goodness in here.
// http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/ // http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/
$('#addfield').click(function() { $('#addfield').click(function() {
var field_name=window.prompt("fieldname?",""); var field_name=window.prompt("fieldname?","");
if (field_name !="") { if (field_name !="") {
$('#submitbutton').before($('<span></span>') $('#submitbutton').before($('<span></span>')
.append("<label>"+field_name+"</label>") .append("<label>"+field_name+"</label>")
.append("<input class='input-text' type='text' id='"+field_name+"' name='"+field_name+"'/>") .append("<input class='input-text' type='text' id='"+field_name+"' name='"+field_name+"'/>")
); );
} }
}); });
}); });
}; };
</script> </script>
<form id="editform" class="nice" method="post"> <form id="editform" class="nice" method="post">
<?php <?php
   
} }
foreach ($row as $key => $value) { foreach ($row as $key => $value) {
echo displayValue($key, $value, $mode); echo displayValue($key, $value, $mode);
} }
if ($mode == "view") { if ($mode == "view") {
echo "</table></div>"; echo "</table></div>";
} }
if ($mode == "edit") { if ($mode == "edit") {
echo '<input id="submitbutton" type="submit"/></form>'; echo '<input id="submitbutton" type="submit"/></form>';
} }
} else { } else {
   
try { try {
/* $rows = $db->get_view("app", "showNamesABNs")->rows; /* $rows = $db->get_view("app", "showNamesABNs")->rows;
//print_r($rows); //print_r($rows);
foreach ($rows as $row) { foreach ($rows as $row) {
// print_r($row); // print_r($row);
echo '<li><a href="getAgency.php?id=' . $row->key . '">' . echo '<li><a href="getAgency.php?id=' . $row->key . '">' .
(isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn)
. '</a></li>'; . '</a></li>';
} */ } */
$rows = $db->get_view("app", "byName")->rows; $rows = $db->get_view("app", "byName")->rows;
//print_r($rows); //print_r($rows);
  echo '<ul>';
foreach ($rows as $row) { foreach ($rows as $row) {
// print_r($row); // print_r($row);
echo '<li itemscope itemtype="http://schema.org/GovernmentOrganization"><a href="getAgency.php?id=' . $row->value . '" itemprop="url"><span itemprop="name">' . echo '<li typeof="schema:GovernmentOrganisation foaf:Organization" about="getAgency.php?id=' . $row->value . '">
  <a href="getAgency.php?id=' . $row->value . '" rel="schema:url foaf:page" property="schema:name foaf:name">' .
$row->key $row->key
. '</span></a></li>'; . '</a></li>';
} }
  echo "</ul>";
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
} }
include_footer(); include_footer();
?> ?>
   
<?php <?php
   
include $basePath . "schemas/schemas.inc.php"; include $basePath . "schemas/schemas.inc.php";
   
require ($basePath . 'couchdb/settee/src/settee.php'); require ($basePath . 'couchdb/settee/src/settee.php');
   
function createAgencyDesignDoc() { function createAgencyDesignDoc() {
global $db; global $db;
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) { $obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc); emit(doc.name, doc);
} }
};"; };";
$obj->views->byDeptStateName->map = "function(doc) { $obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') { if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id); emit(doc.name, doc._id);
} }
};"; };";
$obj->views->parentOrgs->map = "function(doc) { $obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) { if (doc.parentOrg) {
emit(doc._id, doc.parentOrg); emit(doc._id, doc.parentOrg);
} }
};"; };";
$obj->views->byName->map = "function(doc) { $obj->views->byName->map = 'function(doc) {
  if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id); emit(doc.name, doc._id);
for (name in doc.otherNames) { for (name in doc.otherNames) {
if (doc.otherNames[name] != '' && doc.otherNames[name] != doc.name) { if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id); emit(doc.otherNames[name], doc._id);
} }
} }
};"; }
  };';
   
$obj->views->foiEmails->map = "function(doc) { $obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail); emit(doc._id, doc.foiEmail);
};"; };";
   
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) { $obj->views->getScrapeRequired->map = "function(doc) {
   
var lastScrape = Date.parse(doc.metadata.lastScraped); var lastScrape = Date.parse(doc.metadata.lastScraped);
   
var today = new Date(); var today = new Date();
   
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) { if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc); emit(doc._id, doc);
} }
   
};"; };";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) { $obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) { if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts)); emit(null, [doc._rev].concat(doc._conflicts));
} }
}"; }";
// http://stackoverflow.com/questions/646628/javascript-startswith // http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
if(!String.prototype.endsWith){ if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) { String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;     return this.indexOf(suffix, this.length - suffix.length) !== -1;
}; };
} }
function(doc) { function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1); emit(propName, 1);
} }
} }
emit("total", 1); emit("total", 1);
} }
}'; }';
$obj->views->score->map = 'if(!String.prototype.startsWith){ $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
  if(!String.prototype.endsWith){
  String.prototype.endsWith = function(suffix) {
      return this.indexOf(suffix, this.length - suffix.length) !== -1;
  };
  }
function(doc) { function(doc) {
count = 0;  
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && propName.startsWith("l")) { if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
count++ emit(propName, 1);
} }
} }
emit(count+doc._id, {id:doc._id, name: doc.name, score:count}); emit("total", 1);
} }
}'; }';
  $obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
  return sum(values);
  }';
  $obj->views->fieldNames->map = '
  function(doc) {
  for(var propName in doc) {
  emit(propName, doc._id);
  }
   
  }';
  $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
  return values.length;
  }';
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
return $db->save($obj, true); return $db->save($obj, true);
} }
   
if (php_uname('n') == "vanille") { if (php_uname('n') == "vanille") {
   
$server = new SetteeServer('http://192.168.178.21:5984'); $server = new SetteeServer('http://192.168.178.21:5984');
} else } else
if (php_uname('n') == "KYUUBEY") { if (php_uname('n') == "KYUUBEY") {
   
$server = new SetteeServer('http://192.168.1.148:5984'); $server = new SetteeServer('http://192.168.1.148:5984');
} else { } else {
$server = new SetteeServer('http://127.0.0.1:5984'); $server = new SetteeServer('http://127.0.0.1:5984');
} }
   
function setteErrorHandler($e) { function setteErrorHandler($e) {
echo $e->getMessage() . "<br>" . PHP_EOL; echo $e->getMessage() . "<br>" . PHP_EOL;
} }
<?php <?php
   
function include_header() { function include_header() {
global $basePath; global $basePath;
?> ?>
<!DOCTYPE html> <!DOCTYPE html>
   
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ --> <!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--> <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--> <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]--> <!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]--> <!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]-->
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
   
<!-- Set the viewport width to device width for mobile --> <!-- Set the viewport width to device width for mobile -->
<meta name="viewport" content="width=device-width" /> <meta name="viewport" content="width=device-width" />
   
<title>Disclosr</title> <title>Disclosr</title>
   
<!-- Included CSS Files --> <!-- Included CSS Files -->
<link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/foundation.css"> <link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/foundation.css">
<link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/app.css"> <link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/app.css">
   
<!--[if lt IE 9]> <!--[if lt IE 9]>
<link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/ie.css"> <link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/ie.css">
<![endif]--> <![endif]-->
   
   
<!-- IE Fix for HTML5 Tags --> <!-- IE Fix for HTML5 Tags -->
<!--[if lt IE 9]> <!--[if lt IE 9]>
<script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script> <script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]--> <![endif]-->
   
</head> </head>
<body> <body xmlns:schema="http://schema.org/" xmlns:foaf="http://xmlns.com/foaf/0.1/">
   
<!-- navBar --> <!-- navBar -->
<div id="navbar" class="container"> <div id="navbar" class="container">
<div class="row"> <div class="row">
<div class="four columns"> <div class="four columns">
<h1><a href="/">Disclosr</a></h1> <h1><a href="/">Disclosr</a></h1>
</div> </div>
<div class="eight columns hide-on-phones"> <div class="eight columns hide-on-phones">
<strong class="right"> <strong class="right">
<a href="getAgency.php">Agencies</a> <a href="getAgency.php">Agencies</a>
<a href="about.php">About/FAQ</a> <a href="about.php">About/FAQ</a>
</strong> </strong>
</div> </div>
</div> </div>
</div> </div>
<!-- /navBar --> <!-- /navBar -->
   
<!-- container --> <!-- container -->
<div class="container"> <div class="container">
<?php } <?php }
   
function include_footer() { function include_footer() {
global $basePath; global $basePath;
?> ?>
</div> </div>
<!-- container --> <!-- container -->
   
   
   
   
<!-- Included JS Files --> <!-- Included JS Files -->
<script src="<?php echo $basePath; ?>javascripts/foundation.js"></script> <script src="<?php echo $basePath; ?>javascripts/foundation.js"></script>
<script src="<?php echo $basePath; ?>javascripts/app.js"></script> <script src="<?php echo $basePath; ?>javascripts/app.js"></script>
<script src="http://code.jquery.com/jquery-1.7.1.min.js"></script> <script src="http://code.jquery.com/jquery-1.7.1.min.js"></script>
<!--<script language="javascript" type="text/javascript" src="javascripts/jquery.js"></script>--> <!--<script type="text/javascript" src="javascripts/jquery.js"></script>-->
<script language="javascript" type="text/javascript" src="javascripts/flot/jquery.flot.js"></script> <script type="text/javascript" src="javascripts/flot/jquery.flot.js"></script>
   
</body> </body>
</html> </html>
   
<?php } <?php }
   
<?php <?php
   
$schemas['agency'] = Array( $schemas['agency'] = Array(
"description" => "Representation of government agency and online transparency measures", "description" => "Representation of government agency and online transparency measures",
"type" => "object", "type" => "object",
"properties" => Array( "properties" => Array(
"name" => Array("type" => "string", "required" => true, "x-itemprop" => "name", "x-title" => "Name", "description" => "Name, most recent and broadest"), "name" => Array("type" => "string", "required" => true, "x-property" => "schema:name foaf:name", "x-title" => "Name", "description" => "Name, most recent and broadest"),
"shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"), "shortName" => Array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
"foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "description" => "FOI contact email if not foi@"), "foiEmail" => Array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "description" => "FOI contact email if not foi@"),
"sameAs" => Array("type" => "array", "required" => false, "x-itemprop"=>"http://www.w3.org/2002/07/owl#sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity", "sameAs" => Array("type" => "array", "required" => false, "x-property"=>"owl:sameAs","x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
"items" => Array("type" => "string")), "items" => Array("type" => "string")),
"otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation", "otherNames" => Array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
"items" => Array("type" => "string")), "items" => Array("type" => "string")),
"foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-itemprop"=>"members", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous", "foiBodies" => Array("type" => "array", "required" => true, "x-title" => "FOI Bodies","x-property"=>"schema:members foaf:knows", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
"items" => Array("type" => "string")), "items" => Array("type" => "string")),
"orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."), "orgType" => Array("type" => "string", "required" => true, "x-title" => "Organisation Type", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
"parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"), "parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"),
"website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "description" => "Website URL"), "website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
"abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"), "abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"),
"contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"), "contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded", "grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
"description" => "Departmental and agency grants <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> and <a href='http://www.finance.gov.au/publications/fmg-series/23-commonwealth-grant-guidelines.html'>Commonwealth grants guidelines</a> "), "description" => "Departmental and agency grants <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> and <a href='http://www.finance.gov.au/publications/fmg-series/23-commonwealth-grant-guidelines.html'>Commonwealth grants guidelines</a> "),
"annualReportURL" => Array("type" => "string", "required" => true, "x-title" => "Annual Report(s)", "description" => ""), "annualReportURL" => Array("type" => "string", "required" => true, "x-title" => "Annual Report(s)", "description" => ""),
"consultanciesURL" => Array("type" => "string", "required" => true, "x-title" => "Consultants Hired", "description" => ""), "consultanciesURL" => Array("type" => "string", "required" => true, "x-title" => "Consultants Hired", "description" => ""),
"legalExpenditureURL" => Array("type" => "string", "required" => true, "x-title" => "Legal Services Expenditure", "description" => "Legal Services Expenditure mandated by Legal Services Directions 2005"), "legalExpenditureURL" => Array("type" => "string", "required" => true, "x-title" => "Legal Services Expenditure", "description" => "Legal Services Expenditure mandated by Legal Services Directions 2005"),
"recordsListURL" => Array("type" => "string", "required" => true, "x-title" => "Files/Records Held", "description" => "Indexed lists of departmental and agency files, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"), "recordsListURL" => Array("type" => "string", "required" => true, "x-title" => "Files/Records Held", "description" => "Indexed lists of departmental and agency files, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"FOIDocumentsURL" => Array("type" => "string", "required" => true, "x-title" => "FOI Documents Released", "description" => ""), "FOIDocumentsURL" => Array("type" => "string", "required" => true, "x-title" => "FOI Documents Released", "description" => "FOI Disclosure Log URL"),
"infoPublicationSchemeURL" => Array("type" => "string", "required" => true, "x-title" => "Information Publication Scheme", "description" => ""), "FOIDocumentsRSSURL" => Array("type" => "string", "required" => false, "x-title" => "RSS Feed of FOI Documents Released", "description" => "FOI Disclosure Log in RSS format"),
  "hasFOIPDF" => Array("type" => "array", "required" => false, "x-title" => "Has FOI Documents Released in PDF", "description" => "FOI Disclosure Log contains any PDFs",
  "items" => Array("type" => "string")),
  "infoPublicationSchemeURL" => Array("type" => "string", "required" => true, "x-title" => "Information Publication Scheme", "description" => ""),
"appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"), "appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
"advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "), "advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "),
"hasRSS" => Array("type" => "string", "required" => true, "x-title" => "Has RSS", "description" => ""), "hasRSS" => Array("type" => "array", "required" => true, "x-title" => "Has RSS", "description" => ""),
"hasMailingList" => Array("type" => "string", "required" => true, "x-title" => "Has Mailing List", "description" => ""), "hasMailingList" => Array("type" => "array", "required" => true, "x-title" => "Has Mailing List", "description" => "",
"hasTwitter" => Array("type" => "string", "required" => true, "x-title" => "Has Twitter", "description" => ""), "items" => Array("type" => "string")),
"hasFacebook" => Array("type" => "string", "required" => true, "x-title" => "Has Facebook", "description" => ""), "hasTwitter" => Array("type" => "array", "required" => true, "x-title" => "Has Twitter", "description" => "",
"hasYouTube" => Array("type" => "string", "required" => true, "x-title" => "Has YouTube", "description" => ""), "items" => Array("type" => "string")),
"hasFlickr" => Array("type" => "string", "required" => true, "x-title" => "Has Flickr", "description" => ""), "hasFacebook" => Array("type" => "array", "required" => true, "x-title" => "Has Facebook", "description" => "",
"hasCCBY" => Array("type" => "string", "required" => true, "x-title" => "Has CC-BY", "description" => "Has any page licenced Creative Commons - Attribution"), "items" => Array("type" => "string")),
  "hasYouTube" => Array("type" => "array", "required" => true, "x-title" => "Has YouTube", "description" => "",
  "items" => Array("type" => "string")),
  "hasFlickr" => Array("type" => "array", "required" => true, "x-title" => "Has Flickr", "description" => "",
  "items" => Array("type" => "string")),
  "hasCCBY" => Array("type" => "array", "required" => true, "x-title" => "Has CC-BY", "description" => "Has any page licenced Creative Commons - Attribution",
  "items" => Array("type" => "string")),
  "hasRestrictiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright",
  "items" => Array("type" => "string")),
  "hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
  "items" => Array("type" => "string")),
), ),
/* "org":{"type":"object", /* "org":{"type":"object",
"properties":{ "properties":{
"organizationName":{"type":"string"}, "organizationName":{"type":"string"},
"organizationUnit":{"type":"string"}}, "organizationUnit":{"type":"string"}},
} }
} */ } */
); );
?> ?>
   
file:a/scrape.py -> file:b/scrape.py
#http://packages.python.org/CouchDB/client.html #http://packages.python.org/CouchDB/client.html
import couchdb import couchdb
import urllib2 import urllib2
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import re import re
import hashlib import hashlib
  from urlparse import urljoin
  import time
  import os
  import mimetypes
  import re
  import urllib
  import urlparse
   
  def canonurl(url):
  r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
  if the URL looks invalid.
  >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
  'http://xn--hgi.ws/'
  """
  # strip spaces at the ends and ensure it's prefixed with 'scheme://'
  url = url.strip()
  if not url:
  return ''
  if not urlparse.urlsplit(url).scheme:
  url = 'http://' + url
   
  # turn it into Unicode
  #try:
  # url = unicode(url, 'utf-8')
  #except UnicodeDecodeError:
  # return '' # bad UTF-8 chars in URL
   
  # parse the URL into its components
  parsed = urlparse.urlsplit(url)
  scheme, netloc, path, query, fragment = parsed
   
  # ensure scheme is a letter followed by letters, digits, and '+-.' chars
  if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
  return ''
  scheme = str(scheme)
   
  # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
  match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
  if not match:
  return ''
  domain, port = match.groups()
  netloc = domain + (port if port else '')
  netloc = netloc.encode('idna')
   
  # ensure path is valid and convert Unicode chars to %-encoded
  if not path:
  path = '/' # eg: 'http://google.com' -> 'http://google.com/'
  path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
   
  # ensure query is valid
  query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
   
  # ensure fragment is valid
  fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
   
  # piece it all back together, truncating it to a maximum of 4KB
  url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
  return url[:4096]
   
#http://diveintopython.org/http_web_services/etags.html #http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler): class NotModifiedHandler(urllib2.BaseHandler):
def http_error_304(self, req, fp, code, message, headers): def http_error_304(self, req, fp, code, message, headers):
addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
addinfourl.code = code addinfourl.code = code
return addinfourl return addinfourl
   
def scrapeAndStore(docsdb, url, depth, agencyID): def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
hash = hashlib.md5(url).hexdigest() url = canonurl(url)
  hash = hashlib.md5(url).hexdigest().encode("utf-8")
req = urllib2.Request(url) req = urllib2.Request(url)
print "Fetching %s", url print "Fetching %s" % url
doc = docsdb['hash'] if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
  print "Not a valid HTTP url"
  return (None,None)
  doc = docsdb.get(hash)
  if doc == None:
  doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
  else:
  if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 999999):
  print "Uh oh, trying to scrape URL again too soon!"
  last_attachment_fname = doc["_attachments"].keys()[-1]
  last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
  return (doc['mime_type'],last_attachment)
  if scrape_again == False:
  print "Not scraping this URL again as requested"
  return (None,None)
   
  time.sleep(3) # wait 3 seconds to give webserver time to recover
   
#if there is a previous version stored in couchdb, load caching helper tags #if there is a previous version stored in couchdb, load caching helper tags
if doc.has_key('etag'): if doc.has_key('etag'):
req.add_header("If-None-Match", doc['etag']) req.add_header("If-None-Match", doc['etag'])
if doc.has_key('last_modified'): if doc.has_key('last_modified'):
req.add_header("If-Modified-Since", doc['last_modified']) req.add_header("If-Modified-Since", doc['last_modified'])
opener = urllib2.build_opener(NotModifiedHandler()) opener = urllib2.build_opener(NotModifiedHandler())
url_handle = opener.open(req) try:
headers = url_handle.info() # the addinfourls have the .info() too url_handle = opener.open(req)
doc['etag'] = headers.getheader("ETag") headers = url_handle.info() # the addinfourls have the .info() too
doc['last_modified'] = headers.getheader("Last-Modified") doc['etag'] = headers.getheader("ETag")
doc['web_server'] = headers.getheader("Server") doc['last_modified'] = headers.getheader("Last-Modified")
doc['file_size'] = headers.getheader("Content-Length") doc['date'] = headers.getheader("Date")
doc['mime_type'] = headers.getheader("Content-Type") doc['page_scraped'] = time.time()
  doc['web_server'] = headers.getheader("Server")
if hasattr(url_handle, 'code'): doc['powered_by'] = headers.getheader("X-Powered-By")
  doc['file_size'] = headers.getheader("Content-Length")
  content_type = headers.getheader("Content-Type")
  if content_type != None:
  doc['mime_type'] = content_type.split(";")[0]
  else:
  (type,encoding) = mimetypes.guess_type(url)
  doc['mime_type'] = type
  if hasattr(url_handle, 'code'):
if url_handle.code == 304: if url_handle.code == 304:
print "the web page has not been modified" print "the web page has not been modified"
  return (None,None)
else: else:
#do scraping content = url_handle.read()
html = url_handle.read() docsdb.save(doc)
# http://www.crummy.com/software/BeautifulSoup/documentation.html doc = docsdb.get(hash) # need to get a _rev
soup = BeautifulSoup(html) docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) return (doc['mime_type'], content)
for link in links: #store as attachment epoch-filename
if link.has_key("href"): except urllib2.URLError as e:
print link['href'] error = ""
#for each unique link if hasattr(e, 'reason'):
#if html mimetype error = "error %s in downloading %s" % (str(e.reason), url)
# go down X levels, elif hasattr(e, 'code'):
# diff with last stored attachment, store in document error = "error %s in downloading %s" % (e.code, url)
#if not print error
# remember to save parentURL and title (link text that lead to document) doc['error'] = error
  docsdb.save(doc)
#store as attachment epoch-filename return (None,None)
else:  
print "error %s in downloading %s", url_handle.code, URL  
#record/alert error to error database  
   
   
   
   
   
  def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
  (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
  if content != None and depth > 0:
  if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
  # http://www.crummy.com/software/BeautifulSoup/documentation.html
  soup = BeautifulSoup(content)
  navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
  for nav in navIDs:
  print "Removing element", nav['id']
  nav.extract()
  navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
  for nav in navClasses:
  print "Removing element", nav['class']
  nav.extract()
  links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
  linkurls = set([])
  for link in links:
  if link.has_key("href"):
  if link['href'].startswith("http"):
  # lets not do external links for now
  # linkurls.add(link['href'])
  None
  if link['href'].startswith("mailto"):
  # not http
  None
  if link['href'].startswith("javascript"):
  # not http
  None
  else:
  linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
  for linkurl in linkurls:
  #print linkurl
  scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
   
  couch = couchdb.Server('http://127.0.0.1:5984/')
   
   
   
   
couch = couchdb.Server('http://192.168.1.148:5984/')  
   
# select database # select database
agencydb = couch['disclosr-agencies'] agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents'] docsdb = couch['disclosr-documents']
   
for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
agency = agencydb.get(row.id) agency = agencydb.get(row.id)
print agency['name'] print agency['name']
scrapeAndStore(docsdb, agency['website'],1,agency['_id']) for key in agency.keys():
  if key == 'website':
  scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
  if key.endswith('URL'):
  print key
  depth = 1
  if 'scrapeDepth' in agency.keys():
  depth = agency['scrapeDepth']
  scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
   
  agency['metadata']['lastScraped'] = time.time()
  agencydb.save(agency)