prod fixes

<?php

require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() {

$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";

// allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true);


//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');

$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
// a view is an object with a "map" member, so assign to ->map here
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";

// allow safe updates (even if slightly slower due to extra: rev-detection check).
$docdb->save($obj, true);


//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc);
}
};";
$obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id);
}
};";
$obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) {
emit(doc._id, doc.parentOrg);
}
};";
$obj->views->byName->map = 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
emit(doc.shortName, doc._id);
}
for (name in doc.otherNames) {
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id);
}
}
for (name in doc.foiBodies) {
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
emit(doc.foiBodies[name], doc._id);
}
}
for (name in doc.positions) {
if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
emit(doc.positions[name], doc._id);
}
}
}
};';

$obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail);
};";

$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) {

// Date.parse() returns a timestamp in milliseconds, not a Date object
var lastScrape = Date.parse(doc.metadata.lastScraped);

var today = new Date();

if (!lastScrape || lastScrape + 1000 != today.getTime()) {
emit(doc._id, doc);
}

};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts));
}
}";
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}

function(doc) {
count = 0;
if (doc["status"] != "suspended") {
for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
count++;
}
}
portfolio = doc.parentOrg;
if (doc.orgType == "FMA-DepartmentOfState") {
portfolio = doc._id;
}
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
portfolio = doc.orgType;
}
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
}
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}
if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;
};
}
function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1);
}
}
emit("total", 1);
}
}';
$obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
return sum(values);
}';
$obj->views->fieldNames->map = '
function(doc) {
for(var propName in doc) {
emit(propName, doc._id);
}
}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
return values.length;
}';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true);
?>
   
directory:a/couchdb/settee (deleted)
 
  language: php
  phps:
  - 5.3
  - 5.4
  before_script: cd tests/
 
  (The MIT License)
 
  Copyright (c) 2011 Irakli Nadareishvili
 
  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
  The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  Inspired by: "CouchRest library for Ruby":http://jchrisa.net/drl/_design/sofa/_list/post/post-page?startkey=%5B%22couchrest__restful_ruby_client_%22%5D and the "couchdb-python":http://packages.python.org/CouchDB/client.html#document library.
 
  h3. Server Functions
 
  # Specify a server:
  @$server = new SetteeServer('http://127.0.0.1:5984');@
  # Database API
  ## Create a database:
  @$ret = $server->create_db('irakli_test');@
  ## Drop a database:
  @$ret = $server->drop_db('irakli_test');@
  ## List all databases:
  @$ret = $server->list_dbs();@
  ## Get a database object
  @$db = $server->get_db('irakli_test');@
  # Document API
  ## Create/Update a document:
  @$ret = $db->save($doc);@
  ## Retrieve a document:
  @$db_doc = $db->get($id);@
  ## Determine the latest revision_id for a document:
  @$rev = $db->get_rev($id);@
  ## Delete a document:
  @$db_doc = $db->delete($doc);@
  # Attachments API
  ## Add content as attachment:
  @$db->add_attachment($doc, "foo.txt", "Some text that will be base64 encoded", "text/plain");@
  ## Add a file path to be attached:
  @$db->add_attachment_file($doc, "foo.pdf", $file_path, "application/pdf");@
  ## Add a file path to be attached (mime-type is auto-detected):
  @$db->add_attachment_file($doc, "foo.pdf", $file_path);@
  ## Full attachment saving example:
  $doc = new stdClass();
  $doc->_id = "attachment_doc";
  $file_path = dirname(__FILE__) . "/resources/couch-logo.pdf";
  $this->db->add_attachment_file($doc, "foo.pdf", $file_path, "application/pdf");
  $db_doc = $this->db->save($doc);
  ## ATTENTION: there is no "load_attachments" method, because when you load a document, all its attachments get loaded with it, as well.
  # Views API
  ## Create a new view or save a view:
  @$view = $db->save_view("some_design_document_id", "a_view_name", $map_src);@
  @$view = $db->save_view("some_design_document_id", "a_view_name", $map_src, $reduce_src);@
  ## Get a view (run query and get results):
  @$view = $db->get_view("some_design_document_id", "a_view_name");@
  ## Parametrized view:
  @$view = $db->get_view("some_design_document_id", "a_view_name", "2009/02/17 21:13:39");@
  ## Parametrized view with key range:
  @$view = $db->get_view("some_design_document_id", "a_view_name", array("2009/01/30 18:04:11", "2009/02/17 21:13:39"));@
  ## Parametrized view with key range, ordered descending:
  @$view = $db->get_view("some_design_document_id", "a_view_name", array("2009/01/30 18:04:11", "2009/02/17 21:13:39"), true);@
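  ## Putting it together (an illustrative sketch, not from the original docs; it simply combines the calls above):
  $map_src = 'function(doc) { if (doc.date) { emit(doc.date, doc); } }';
  $db->save_view("some_design_document_id", "a_view_name", $map_src);
  $view = $db->get_view("some_design_document_id", "a_view_name", array("2009/01/30 18:04:11", "2009/02/17 21:13:39"));
  // $view->rows holds the emitted key/value pairs; $view->total_rows is the row count.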
 
 
  h3. Requirements
  # PHP 5.2 or newer
 
  h3. Recommended
  # PHP 5.3 or newer. With PHP 5.2 the following functionality will not work:
  ## Some unit-tests
  ## Mime type auto-detection.
  # pecl_http
  #!/usr/bin/env php
 
  <?php
 
  require (realpath(dirname(__FILE__) . '/../src/settee.php'));
 
  $server = new SetteeServer('http://127.0.0.1:5984');
 
 
  $dbs = array (
  1 => "settee_test_perf_01",
  2 => "settee_test_perf_02",
  3 => "settee_test_perf_03",
  );
 
  print ("creating databases: \n");
 
  foreach ($dbs as $db) {
  $start = microtime(true);
  try {
  $ret = $server->create_db($db);
  } catch (Exception $e) {
  //-- re-throw. this is just for demo
  throw $e;
  }
  $elapsed = microtime(true) - $start;
  print("Time elapsed: $elapsed \n");
  }
 
  $ret = $server->list_dbs();
  print_r($ret);
  print ("\n");
 
  print ("dropping databases: \n");
 
  foreach ($dbs as $db) {
  $start = microtime(true);
  try {
  $ret = $server->drop_db($db);
  } catch (Exception $e) {
  //-- re-throw. this is just for demo
  throw $e;
  }
  $elapsed = microtime(true) - $start;
  print("Time elapsed: $elapsed \n");
  }
 
  $ret = $server->list_dbs();
  print_r($ret);
 
  #!/usr/bin/env php
 
  <?php
 
  require (realpath(dirname(__FILE__) . '/../src/settee.php'));
 
  $server = new SetteeServer('http://127.0.0.1:5984');
  $dname = 'irakli';
  $db = $server->get_db('irakli');
 
  try {
  $server->create_db($db);
  } catch (Exception $e) {
  print_r("database irakli already exists! \n");
  }
 
  $doc = new StdClass();
  $doc->firstName = "Irakli";
  $doc->lastName = "Nadareishvili";
  $doc->IQ = 200;
  $doc->hobbies = array("skiing", "swimming");
  $doc->pets = array ("whitey" => "labrador", "mikey" => "pug");
 
  // Should work with json string as well:
  //$doc = '{"firstName":"irakli","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}';
 
  $doc = $db->save($doc);
  print_r($doc);
 
  $doc = $db->get($doc->_id);
  print_r($doc);
 
  $doc->firstName = "Ika";
  $doc = $db->save($doc);
  print_r($doc);
 
  $db->delete($doc);
 
 
 
  <?php
 
  /**
   * Database class.
  */
  class SetteeDatabase {
 
  /**
  * Base URL of the CouchDB REST API
  */
  private $conn_url;
 
  /**
  * HTTP REST Client instance
  */
  protected $rest_client;
 
  /**
  * Name of the database
  */
  private $dbname;
 
  /**
  * Default constructor
  */
  function __construct($conn_url, $dbname) {
  $this->conn_url = $conn_url;
  $this->dbname = $dbname;
  $this->rest_client = SetteeRestClient::get_instance($this->conn_url);
  }
 
 
  /**
  * Get UUID from CouchDB
  *
  * @return
  * CouchDB-generated UUID string
  *
  */
  function gen_uuid() {
  $ret = $this->rest_client->http_get('_uuids');
  return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking
  }
 
  /**
   * Create or update a document in the database
  *
  * @param $document
  * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically.
  *
   * <p>If $document has an "_id" property set, it will be used as the document's unique id (even for a "create" operation).
  * If "_id" is missing, CouchDB will be used to generate a UUID.
  *
  * <p>If $document has a "_rev" property (revision), document will be updated, rather than creating a new document.
   * You have to provide "_rev" if you want to update an existing document; otherwise the operation is assumed to be
   * a creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev"
   * without providing "_id", since that is an invalid input.
  *
  * @param $allowRevAutoDetection
  * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision
  * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and
  * therefore can make save() operation slightly slower if such auto-detection is not required.
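   *
   * <p>Illustrative sketch (not from the original docs; assumes $db is a SetteeDatabase instance):
   *   $doc = new stdClass();
   *   $doc->title = "hello";
   *   $doc = $db->save($doc);  // create: _id and _rev get attached
   *   $doc->title = "hello again";
   *   $doc = $db->save($doc);  // update, because _rev is now present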
  *
  * @return
  * document object with the database id (uuid) and revision attached;
  *
  * @throws SetteeCreateDatabaseException
  */
  function save($document, $allowRevAutoDetection = false) {
  if (is_string($document)) {
  $document = json_decode($document);
  }
 
  // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter)
  if(is_array($document)) {
  $document = (object) $document;
  }
 
  if (empty($document->_id) && empty($document->_rev)) {
  $id = $this->gen_uuid();
  }
  elseif (empty($document->_id) && !empty($document->_rev)) {
  throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id");
  }
  else {
  $id = $document->_id;
 
  if ($allowRevAutoDetection) {
  try {
  $rev = $this->get_rev($id);
  } catch (SetteeRestClientException $e) {
  // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error
  }
  if (!empty($rev)) {
  $document->_rev = $rev;
  }
  }
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $document_json = json_encode($document, JSON_NUMERIC_CHECK);
 
  $ret = $this->rest_client->http_put($full_uri, $document_json);
 
  $document->_id = $ret['decoded']->id;
  $document->_rev = $ret['decoded']->rev;
 
  return $document;
  }
 
  /**
  * @param $doc
  * @param $name
  * @param $content
  * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for
  * you, so you don't have to do it.
  * @param $mime_type
  * Optional. Will be auto-detected if not provided
  * @return void
  */
  public function add_attachment($doc, $name, $content, $mime_type = null) {
  if (empty($doc->_attachments) || !is_object($doc->_attachments)) {
  $doc->_attachments = new stdClass();
  }
 
  if (empty($mime_type)) {
  $mime_type = $this->rest_client->content_mime_type($content);
  }
 
  $doc->_attachments->$name = new stdClass();
  $doc->_attachments->$name->content_type = $mime_type;
  $doc->_attachments->$name->data = base64_encode($content);
  }
 
  /**
  * @param $doc
  * @param $name
  * @param $file
  * Full path to a file (e.g. as returned by PHP's realpath function).
  * @param $mime_type
  * Optional. Will be auto-detected if not provided
  * @return void
  */
  public function add_attachment_file($doc, $name, $file, $mime_type = null) {
  $content = file_get_contents($file);
  $this->add_attachment($doc, $name, $content, $mime_type);
  }
 
  /**
  *
  * Retrieve a document from CouchDB
  *
  * @throws SetteeWrongInputException
  *
  * @param $id
  * Unique ID (usually: UUID) of the document to be retrieved.
  * @return
  * database document in PHP object format.
  */
  function get($id) {
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $full_uri = str_replace("%3Frev%3D","?rev=",$full_uri);
  $ret = $this->rest_client->http_get($full_uri);
  return $ret['decoded'];
  }
 
  /**
  *
  * Get the latest revision of a document with document id: $id in CouchDB.
  *
  * @throws SetteeWrongInputException
  *
  * @param $id
  * Unique ID (usually: UUID) of the document to be retrieved.
  * @return
  * database document in PHP object format.
  */
  function get_rev($id) {
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't query a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
  $headers = $this->rest_client->http_head($full_uri);
  if (empty($headers['Etag'])) {
  throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag");
  }
  $etag = str_replace('"', '', $headers['Etag']);
  return $etag;
  }
 
  /**
  * Delete a document
  *
  * @param $document
  * a PHP object or JSON representation of the document that has _id and _rev fields.
  *
  * @return void
  */
  function delete($document) {
  if (!is_object($document)) {
  $document = json_decode($document);
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev;
  $this->rest_client->http_delete($full_uri);
  }
 
 
  /*----------------- View-related functions --------------*/
 
  /**
  * Create a new view or update an existing one.
  *
  * @param $design_doc
  * @param $view_name
  * @param $map_src
  * Source code of the map function in Javascript
  * @param $reduce_src
  * Source code of the reduce function in Javascript (optional)
   * @return
   *     the saved design document, with _id and _rev attached.
  */
  function save_view($design_doc, $view_name, $map_src, $reduce_src = null) {
  $obj = new stdClass();
  $obj->_id = "_design/" . urlencode($design_doc);
  $view_name = urlencode($view_name);
  $obj->views->$view_name->map = $map_src;
  if (!empty($reduce_src)) {
  $obj->views->$view_name->reduce = $reduce_src;
  }
 
  // allow safe updates (even if slightly slower due to extra: rev-detection check).
  return $this->save($obj, true);
  }
 
  /**
  * Create a new view or update an existing one.
  *
  * @param $design_doc
  * @param $view_name
  * @param $key
  * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes
  * that first element is startkey, second: endkey.
  * @param $descending
  * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change
  * order you also need to swap startkey and endkey values!
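   *
   * <p>Illustrative sketch (mirrors the README examples; "app" and "byDate" are just sample names):
   *   $view = $db->get_view("app", "byDate", array("2009-01-01", "2009-12-31"));        // ascending range
   *   $view = $db->get_view("app", "byDate", array("2009-12-31", "2009-01-01"), true);  // descending: keys swapped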
  *
   * @return
   *     decoded PHP object with the view results.
  */
  function get_view($design_doc, $view_name, $key = null, $descending = false, $limit = false, $reduce=null) {
  $id = "_design/" . urlencode($design_doc);
  $view_name = urlencode($view_name);
  $id .= "/_view/$view_name";
 
  $data = array();
  if (!empty($key)) {
  if (is_string($key)) {
  $data = "key=" . '"' . $key . '"';
  }
  elseif (is_array($key)) {
  list($startkey, $endkey) = $key;
  $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"';
  }
 
  if ($descending) {
  $data .= "&descending=true";
  }
  if ($reduce != null) {
  if ($reduce == true) {
  $data .= "&reduce=true";
  } else {
  $data .= "&reduce=false";
  }
  }
  if ($limit) {
  $data .= "&limit=".$limit;
  }
  }
 
 
 
  if (empty($id)) {
  throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
  }
 
  $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
 
  $full_uri = str_replace("%253Fgroup%253D","?group=",$full_uri);
  $full_uri = str_replace("%253Flimit%253D","?limit=",$full_uri);
  $ret = $this->rest_client->http_get($full_uri, $data);
  //$ret['decoded'] = str_replace("?k","&k",$ret['decoded']);
  return $ret['decoded'];
 
  }
 
  /**
  * @param $id
  * @return
  * return a properly url-encoded id.
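   * e.g. "_design/app" keeps its slash and stays "_design/app", while a
   * plain id such as "a/b" is encoded to "a%2Fb".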
  */
  private function safe_urlencode($id) {
  //-- System views like _design can have "/" in their URLs.
  $id = rawurlencode($id);
  if (substr($id, 0, 1) == '_') {
  $id = str_replace('%2F', '/', $id);
  }
  return $id;
  }
 
  /** Getter for a database name */
  function get_name() {
  return $this->dbname;
  }
 
  }
  <?php
 
  /**
  * HTTP REST Client for CouchDB API
  */
  class SetteeRestClient {
 
  /**
  * HTTP Timeout in Milliseconds
  */
  const HTTP_TIMEOUT = 2000;
 
  private $base_url;
  private $curl;
 
  private static $curl_workers = array();
 
  /**
  * Singleton factory method
  */
  static function get_instance($base_url) {
 
  if (empty(self::$curl_workers[$base_url])) {
  self::$curl_workers[$base_url] = new SetteeRestClient($base_url);
  }
 
  return self::$curl_workers[$base_url];
  }
 
  /**
  * Class constructor
  */
  private function __construct($base_url) {
  $this->base_url = $base_url;
 
  $curl = curl_init();
  curl_setopt($curl, CURLOPT_USERAGENT, "Settee CouchDB Client/1.0");
  curl_setopt($curl, CURLOPT_HTTPHEADER, array('Content-Type: application/json'));
  curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($curl, CURLOPT_HEADER, 0);
  curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
  curl_setopt($curl, CURLOPT_TIMEOUT_MS, self::HTTP_TIMEOUT);
  curl_setopt($curl, CURLOPT_FORBID_REUSE, false); // Connection-pool for CURL
 
  $this->curl = $curl;
 
  }
 
  /**
  * Class destructor cleans up any resources
  */
  function __destruct() {
  curl_close($this->curl);
  }
 
  /**
  * HTTP HEAD
  *
  * @return
  * Raw HTTP Headers of the response.
  *
  * @see: http://www.php.net/manual/en/context.params.php
  *
  */
  function http_head($uri) {
  curl_setopt($this->curl, CURLOPT_HEADER, 1);
 
  $full_url = $this->get_full_url($uri);
  curl_setopt($this->curl, CURLOPT_URL, $full_url);
  curl_setopt($this->curl, CURLOPT_CUSTOMREQUEST, 'HEAD');
  curl_setopt($this->curl, CURLOPT_NOBODY, true);
 
 
  $response = curl_exec($this->curl);
  // Restore default values
  curl_setopt($this->curl, CURLOPT_NOBODY, false);
  curl_setopt($this->curl, CURLOPT_HEADER, false);
 
  $resp_code = curl_getinfo($this->curl, CURLINFO_HTTP_CODE);
  if ($resp_code == 404 ) {
  throw new SetteeRestClientException("Couch document not found at: '$full_url'");
  }
 
  if (function_exists('http_parse_headers')) {
  $headers = http_parse_headers($response);
  }
  else {
  $headers = $this->_http_parse_headers($response);
  }
 
  return $headers;
  }
 
  /**
  * Backup PHP impl. for when PECL http_parse_headers() function is not available
  *
  * @param $header
  * @return array
  * @source http://www.php.net/manual/en/function.http-parse-headers.php#77241
  */
  private function _http_parse_headers( $header ) {
  $retVal = array();
  $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $header));
  foreach( $fields as $field ) {
  if( preg_match('/([^:]+): (.+)/m', $field, $match) ) {
  $match[1] = preg_replace('/(?<=^|[\x09\x20\x2D])./e', 'strtoupper("\0")', strtolower(trim($match[1])));
  if( isset($retVal[$match[1]]) ) {
  $retVal[$match[1]] = array($retVal[$match[1]], $match[2]);
  } else {
  $retVal[$match[1]] = trim($match[2]);
  }
  }
  }
  return $retVal;
  }
 
  /**
  * HTTP GET
  */
  function http_get($uri, $data = array()) {
  $data = (is_array($data)) ? http_build_query($data) : $data;
  if (!empty($data)) {
  $uri .= "?$data";
  }
  return $this->http_request('GET', $uri);
  }
 
  /**
  * HTTP PUT
  */
  function http_put($uri, $data = array()) {
  return $this->http_request('PUT', $uri, $data);
  }
 
  /**
  * HTTP DELETE
  */
  function http_delete($uri, $data = array()) {
  return $this->http_request('DELETE', $uri, $data);
  }
 
  /**
  * Generic implementation of a HTTP Request.
  *
  * @param $http_method
  * @param $uri
  * @param array $data
  * @return
  * an array containing json and decoded versions of the response.
  */
  private function http_request($http_method, $uri, $data = array()) {
  $data = (is_array($data)) ? http_build_query($data) : $data;
 
  if (!empty($data)) {
  curl_setopt($this->curl, CURLOPT_HTTPHEADER, array('Content-Length: ' . strlen($data)));
  curl_setopt($this->curl, CURLOPT_POSTFIELDS, $data);
  }
 
  curl_setopt($this->curl, CURLOPT_URL, $this->get_full_url($uri));
  curl_setopt($this->curl, CURLOPT_CUSTOMREQUEST, $http_method);
 
  $response = curl_exec($this->curl);
  $response_decoded = $this->decode_response($response);
  $response = array('json' => $response, 'decoded'=>$response_decoded);
 
  $this->check_status($response,$uri);
 
  return $response;
  }
 
  /**
  * Check http status for safe return codes
  *
  * @throws SetteeRestClientException
  */
  private function check_status($response,$uri) {
  $resp_code = curl_getinfo($this->curl, CURLINFO_HTTP_CODE);
 
  if ($resp_code < 199 || $resp_code > 399 || !empty($response['decoded']->error)) {
  $msg = "CouchDB returned: \"HTTP 1.1. $resp_code\". ERROR: " . $response['json'] . $uri;
  throw new SetteeRestClientException($msg);
  }
  }
 
  /**
  * @param $path
  * Full path to a file (e.g. as returned by PHP's realpath function).
  * @return void
  */
  public function file_mime_type ($path) {
  $ftype = 'application/octet-stream';
 
  if (function_exists("finfo_file")) {
  $finfo = new finfo(FILEINFO_MIME_TYPE | FILEINFO_SYMLINK);
  $fres = $finfo->file($path);
  if (is_string($fres) && !empty($fres)) {
  $ftype = $fres;
  }
  }
 
  return $ftype;
  }
 
  /**
  * @param $content
  * content of a file in a string buffer format.
  * @return void
  */
  public function content_mime_type ($content) {
  $ftype = 'application/octet-stream';
 
  if (function_exists("finfo_file")) {
  $finfo = new finfo(FILEINFO_MIME_TYPE | FILEINFO_SYMLINK);
  $fres = $finfo->buffer($content);
  if (is_string($fres) && !empty($fres)) {
  $ftype = $fres;
  }
  }
 
  return $ftype;
  }
 
 
  /**
  *
  * @param $json
  * json-encoded response from CouchDB
  *
  * @return
  * decoded PHP object
  */
  private function decode_response($json) {
  return json_decode($json);
  }
 
  /**
  * Get full URL from a partial one
  */
  private function get_full_url($uri) {
  // We do not want "/", "?", "&" and "=" separators to be encoded!!!
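   // e.g. "dbname/docid?rev=1-abc" must keep "/", "?" and "=" intact (see SetteeRestClientTest).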
  $uri = str_replace(array('%2F', '%3F', '%3D', '%26'), array('/', '?', '=', '&'), urlencode($uri));
  return $this->base_url . '/' . $uri;
  }
  }
 
  class SetteeRestClientException extends Exception {}
  <?php
 
  /**
  * CouchDB Server Manager
  */
  class SetteeServer {
 
  /**
  * Base URL of the CouchDB REST API
  */
  private $conn_url;
 
  /**
  * HTTP REST Client instance
  */
  protected $rest_client;
 
 
  /**
  * Class constructor
  *
  * @param $conn_url
  * (optional) URL of the CouchDB server to connect to. Default value: http://127.0.0.1:5984
  */
  function __construct($conn_url = "http://127.0.0.1:5984") {
  $this->conn_url = rtrim($conn_url, ' /');
  $this->rest_client = SetteeRestClient::get_instance($this->conn_url);
  }
 
  /**
  * Create database
  *
  * @param $db
  * Either a database object or a String name of the database.
  *
  * @return
  * json string from the server.
  *
  * @throws SetteeCreateDatabaseException
  */
  function create_db($db) {
  if ($db instanceof SetteeDatabase) {
  $db = $db->get_name();
  }
  $ret = $this->rest_client->http_put($db);
  if (!empty($ret['decoded']->error)) {
  throw new SetteeDatabaseException("Could not create database: " . $ret["json"]);
  }
  return $ret['decoded'];
  }
 
  /**
  * Drop database
  *
  * @param $db
  * Either a database object or a String name of the database.
  *
  * @return
  * json string from the server.
  *
  * @throws SetteeDropDatabaseException
  */
  function drop_db($db) {
  if ($db instanceof SetteeDatabase) {
  $db = $db->get_name();
  }
  $ret = $this->rest_client->http_delete($db);
  if (!empty($ret['decoded']->error)) {
  throw new SetteeDatabaseException("Could not create database: " . $ret["json"]);
  }
  return $ret['decoded'];
  }
 
  /**
  * Instantiate a database object
  *
  * @param $dbname
  * name of the newly created database
  *
  * @return SetteeDatabase
  * new SetteeDatabase instance.
  */
  function get_db($dbname) {
  return new SetteeDatabase($this->conn_url, $dbname);
  }
 
 
  /**
  * Return an array containing all databases
  *
  * @return Array
  * an array of database names in the CouchDB instance
  */
  function list_dbs() {
  $ret = $this->rest_client->http_get('_all_dbs');
  if (!empty($ret['decoded']["error"])) {
  throw new SetteeDatabaseException("Could not get list of databases: " . $ret["json"]);
  }
  return $ret['decoded'];
  }
 
  }
 
  class SetteeServerErrorException extends Exception {}
  class SetteeDatabaseException extends Exception {}
  class SetteeWrongInputException extends Exception {}
  <?php
 
  require(dirname(__FILE__) . '/classes/SetteeRestClient.class.php');
 
  require(dirname(__FILE__) . '/classes/SetteeServer.class.php');
  require(dirname(__FILE__) . '/classes/SetteeDatabase.class.php');
  1. Make sure you have the latest PEAR PHPUnit installed:
  > sudo pear upgrade
  > sudo pear channel-discover pear.phpunit.de
  > sudo pear install phpunit/PHPUnit
 
  2. You need PHP 5.3.2 or later to run some tests that deal with private or protected methods. If you use an earlier
  version of PHP, these tests will be skipped.
 
  3. Run all tests with:
  > phpunit .
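 
  4. To run a single test class, point phpunit at its file (assuming, as in this repo,
  each test file is named after the class it contains), e.g.:
  > phpunit SetteeServerTest.php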
  <?php
 
  require_once (realpath(dirname(__FILE__) . '/../src/settee.php'));
  require_once (dirname(__FILE__) . '/SetteeTestCase.class.php');
 
  class SetteeDatabaseTest extends SetteeTestCase {
 
  private $db;
 
  public function setUp() {
  parent::setUp();
  $dbname = "settee_tests_" . md5(microtime(true));
  $this->db = $this->server->get_db($dbname);
  $this->server->create_db($this->db);
  }
 
  public function test_document_lifecycle_objectbased() {
  $doc = new StdClass();
  $doc->firstName = "Irakli";
  $doc->lastName = "Nadareishvili";
  $doc->IQ = 200;
  $doc->hobbies = array("skiing", "swimming");
  $doc->pets = array ("whitey" => "labrador", "mikey" => "pug");
 
  $doc = $this->db->save($doc);
  $this->assertTrue(!empty($doc->_id) && !empty($doc->_rev), "Document creation success [object-based]");
 
  $_rev = $doc->_rev;
  $doc = $this->db->get($doc->_id);
  $this->assertEquals($_rev, $doc->_rev, "Document retrieval success [object-based] test");
 
  $doc->firstName = "Ika";
  $db_doc = $this->db->save($doc);
  $this->assertEquals($doc->firstName, $db_doc->firstName, "Document update success [object-based]");
 
  $this->db->delete($doc);
 
 
  try {
  $doc = $this->db->get($doc->_id);
  } catch (SetteeRestClientException $e) {
  // we expect exception to fire, so this is good.
  return;
  }
 
  $this->fail('Document still available for retrieval after being deleted. [object-based]');
  }
 
  // Should work with json string as well:
  //
 
 
  public function test_document_lifecycle_jsonbased() {
  $doc = '{"firstName":"Irakli","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}';
 
  $doc = $this->db->save($doc);
  $this->assertTrue(!empty($doc->_id) && !empty($doc->_rev), "Document creation success [json-based]");
 
  $_rev = $doc->_rev;
 
  $db_doc = $this->db->get($doc->_id);
  $this->assertEquals($_rev, $db_doc->_rev, "Document retrieval success [json-based] test");
 
  $doc = '{';
  $doc .= '"_id":"' . $db_doc->_id . '",';
  $doc .= '"_rev":"' . $db_doc->_rev . '",';
  $doc .= '"firstName":"Ika","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}';
 
  $orig_doc = json_decode($doc);
  $db_doc = $this->db->save($doc);
  $this->assertEquals($orig_doc->firstName, $db_doc->firstName, "Document update success [json-based]");
 
  $doc = '{';
  $doc .= '"_id":"' . $db_doc->_id . '",';
  $doc .= '"_rev":"' . $db_doc->_rev . '",';
  $doc .= '"firstName":"Ika","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}';
 
  $this->db->delete($doc);
 
  try {
  $doc = $this->db->get($db_doc->_id);
  } catch (SetteeRestClientException $e) {
  // we expect exception to fire, so this is good.
  return;
  }
 
  $this->fail('Document still available for retrieval after being deleted. [json-based]');
  }
 
  public function test_invalid_document() {
  $doc = 12345;
  try {
  $doc = $this->db->save($doc);
  } catch (SetteeRestClientException $e) {
  // we expect exception to fire, so this is good.
  return;
  }
 
  $this->fail('Document saved with invalid format');
  }
 
  public function test_get_rev() {
  $doc = new stdClass();
  $doc->_id = "some_fixed_id";
  $doc = $this->db->save($doc);
 
  $_rev = $doc->_rev;
 
  $db_rev = $this->db->get_rev($doc->_id);
  $this->assertEquals($_rev, $db_rev, "Document Revision retrieval success");
 
  // _rev is now attached to this object due to last ->save() call
  $doc->_id = "some_fixed_id";
  $doc->title = "Some Fixed ID";
  $doc = $this->db->save($doc);
 
  $_rev = $doc->_rev;
 
  $db_rev = $this->db->get_rev($doc->_id);
  $this->assertEquals($_rev, $db_rev, "Document Revision retrieval success after re-save");
 
  }
 
  public function test_save_auto_revision_detection() {
  $doc = new stdClass();
  $doc->_id = "some_fixed_id";
  $this->db->save($doc);
 
  $doc = new stdClass();
  $doc->_id = "some_fixed_id";
  $doc->extra_field = "some other value";
 
  $new_doc = $this->db->save($doc, true);
  $this->assertEquals ($new_doc->extra_field, "some other value", "Testing auto-rev detection by save method");
  }
 
  public function test_inline_attachment_json() {
  $doc = '{
  "_id":"attachment_doc",
  "_attachments":
  {
  "foo.txt":
  {
  "content_type":"text\/plain",
  "data": "VGhpcyBpcyBhIGJhc2U2NCBlbmNvZGVkIHRleHQ="
  }
  }
  }';
  $db_doc = $this->db->save($doc);
  $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment save successful [json-based]");
  }
 
  public function test_inline_attachment_obj_content() {
  $doc = new stdClass();
  $doc->_id = "attachment_doc";
  $this->db->add_attachment($doc, "foo.txt", "This is some text to be encoded", "text/plain");
  $db_doc = $this->db->save($doc);
  $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment save successful [object-based]");
 
  $doc = new stdClass();
  $doc->_id = "attachment_doc_autodetect";
  $this->db->add_attachment($doc, "foo.txt", "This is some other text to be encoded");
  $db_doc = $this->db->save($doc);
  $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment save successful [object-based, mime auto-detection]");
  }
 
  public function test_inline_attachment_obj_file() {
  $doc = new stdClass();
  $doc->_id = "attachment_doc";
  $file_path = dirname(__FILE__) . "/resources/couch-logo.pdf";
  $this->db->add_attachment_file($doc, "foo.pdf", $file_path, "application/pdf");
  $db_doc = $this->db->save($doc);
  $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment of file successful");
 
  $doc = new stdClass();
  $doc->_id = "attachment_doc_autodetect";
  $file_path = dirname(__FILE__) . "/resources/couch-logo.pdf";
  $this->db->add_attachment_file($doc, "foo.pdf", $file_path);
  $db_doc = $this->db->save($doc);
  $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment of file successful w/ mime type auto-detection");
  }
 
  public function test_view_lifecycle() {
  $this->_create_some_sample_docs();
 
  $map_src = <<<VIEW
  function(doc) {
  if(doc.date && doc.title) {
  emit(doc.date, doc.title);
  }
  }
  VIEW;
 
  $view = $this->db->save_view("foo_views", "bar_view", $map_src);
  $this->assertEquals("_design/foo_views", $view->_id, "View Creation Success");
 
  $view = $this->db->get_view("foo_views", "bar_view");
  $this->assertEquals(3, $view->total_rows, "Running a View Success");
 
  $map_src = <<<VIEW
  function(doc) {
  if(doc.date) {
  emit(doc.date, doc);
  }
  }
  VIEW;
 
  $view = $this->db->save_view("foo_views", "bar_view", $map_src);
  $this->assertEquals("_design/foo_views", $view->_id, "View Update Success");
 
  $view = $this->db->get_view("foo_views", "bar_view");
  $this->assertEquals("Well hello and welcome to my new blog...", $view->rows[0]->value->body, "Running a View Success (after update)");
 
  $view = $this->db->get_view("foo_views", "bar_view", "2009/02/17 21:13:39");
  $this->assertEquals("Bought a Cat", $view->rows[0]->value->title, "Running a Parametrized View");
 
  $view = $this->db->get_view("foo_views", "bar_view", array("2009/01/30 18:04:11", "2009/02/17 21:13:39"));
  $this->assertEquals("Biking", $view->rows[0]->value->title, "Running a Parametrized View with range");
 
  $view = $this->db->get_view("foo_views", "bar_view", array("2009/02/17 21:13:39", "2009/01/30 18:04:11"), true);
  $this->assertEquals("Bought a Cat", $view->rows[0]->value->title, "Running a Parametrized View with range, descending");
  $this->assertEquals(2, count($view->rows), "Running a Parametrized View with range, descending [count]");
 
  }
 
  function test_two_views_in_a_design_doc() {
 
  $map_src = <<<VIEW
  function(doc) {
  if(doc.date && doc.title) {
  emit(doc.date, doc.title);
  }
  }
  VIEW;
 
  $view = $this->db->save_view("a_settee_design_doc", "foo_view", $map_src);
  $this->assertTrue(isset($view->views->foo_view), "View1 Creation Success");
 
  $view = $this->db->save_view("a_settee_design_doc", "bar_view", $map_src);
  $this->assertTrue(isset($view->views->bar_view), "View2 Creation Success");
  }
 
  /**
  * Create some sample docs for running tests on them.
  *
  * <p>This sample was taken from a wonderful book:
  * CouchDB: The Definitive Guide (Animal Guide) by J. Chris Anderson, Jan Lehnardt and Noah Slater
  * http://www.amazon.com/CouchDB-Definitive-Guide-Relax-Animal/dp/0596155891/ref=sr_1_1?ie=UTF8&qid=1311533443&sr=8-1
  *
  * @return void
  */
  private function _create_some_sample_docs() {
  $doc = new stdClass();
  $doc->_id = "biking";
  $doc->title = "Biking";
  $doc->body = "My biggest hobby is mountainbiking";
  $doc->date = "2009/01/30 18:04:11";
  $this->db->save($doc);
 
  $doc = new stdClass();
  $doc->_id = "bought-a-cat";
  $doc->title = "Bought a Cat";
  $doc->body = "I went to the the pet store earlier and brought home a little kitty...";
  $doc->date = "2009/02/17 21:13:39";
  $this->db->save($doc);
 
  $doc = new stdClass();
  $doc->_id = "hello-world";
  $doc->title = "Hello World";
  $doc->body = "Well hello and welcome to my new blog...";
  $doc->date = "2009/01/15 15:52:20";
  $this->db->save($doc);
  }
 
  public function tearDown() {
  $ret = $this->server->drop_db($this->db);
  }
 
  }
 
 
  <?php
 
  require_once (realpath(dirname(__FILE__) . '/../src/settee.php'));
  require_once (dirname(__FILE__) . '/SetteeTestCase.class.php');
 
  class SetteeRestClientTest extends SetteeTestCase {
 
  private $rest_client;
 
  public function setUp() {
  parent::setUp();
  $this->rest_client = SetteeRestClient::get_instance($this->db_url);
  }
 
  public function test_get_full_url() {
 
  //-- Can't run this test in PHP versions earlier than 5.3.2, which do not support ReflectionMethod class.
  if (!class_exists('ReflectionMethod')) {
  return;
  }
 
  //-- Prepare for testing the private full_url_method method.
  $get_full_url_method = new ReflectionMethod('SetteeRestClient', 'get_full_url');
  $get_full_url_method->setAccessible(TRUE);
 
  $uri = 'irakli/26cede9ab9cd8fcd67895eb05200d1ea';
  //-- Equivalent to: $calc = $this->rest_client->get_full_url($uri); but for a private method.
  $calc = $get_full_url_method->invokeArgs($this->rest_client, array($uri));
  //--
  $expected = $this->db_url . '/irakli/26cede9ab9cd8fcd67895eb05200d1ea';
  $this->assertEquals($expected, $calc, "Full URL Generation with DB and ID");
 
  $uri = 'irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267';
  $calc = $get_full_url_method->invokeArgs($this->rest_client, array($uri));
  $expected = $this->db_url . '/irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267';
  $this->assertEquals($expected, $calc, "Full URL Generation with DB, ID and Single Query Parameter");
 
  $uri = 'irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267&second=foo';
  $calc = $get_full_url_method->invokeArgs($this->rest_client, array($uri));
  $expected = $this->db_url . '/irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267&second=foo';
  $this->assertEquals($expected, $calc, "Full URL Generation with DB, ID and Two Query Parameters");
 
  }
 
  public function test_file_mime_type() {
 
  $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-logo.jpg");
  $this->assertEquals("image/jpeg", $type, "Jpeg Mime Type Detection");
 
  $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-logo.pdf");
  $this->assertEquals("application/pdf", $type, "PDF Mime Type Detection");
 
 
  $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-logo.png");
  $this->assertEquals("image/png", $type, "PNG Mime Type Detection");
 
  $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-tag.ini");
  $this->assertEquals("text/plain", $type, "Text Mime Type Detection");
 
  $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-tag.xml");
  $this->assertEquals("application/xml", $type, "XML Mime Type Detection");
  }
 
  public function test_content_mime_type() {
  $content = file_get_contents(dirname(__FILE__) . "/resources/couch-logo.jpg");
  $type = $this->rest_client->content_mime_type($content);
  $this->assertEquals("image/jpeg", $type, "Jpeg Mime Type Detection");
 
  $content = file_get_contents(dirname(__FILE__) . "/resources/couch-logo.pdf");
  $type = $this->rest_client->content_mime_type($content);
  $this->assertEquals("application/pdf", $type, "PDF Mime Type Detection");
 
  $content = file_get_contents(dirname(__FILE__) . "/resources/couch-logo.png");
  $type = $this->rest_client->content_mime_type($content);
  $this->assertEquals("image/png", $type, "PNG Mime Type Detection");
 
  $content = file_get_contents(dirname(__FILE__) . "/resources/couch-tag.ini");
  $type = $this->rest_client->content_mime_type($content);
  $this->assertEquals("text/plain", $type, "Text Mime Type Detection");
 
  $content = file_get_contents(dirname(__FILE__) . "/resources/couch-tag.xml");
  $type = $this->rest_client->content_mime_type($content);
  $this->assertEquals("application/xml", $type, "XML Mime Type Detection");
  }
 
 
 
  }
 
 
  <?php
 
  require_once (realpath(dirname(__FILE__) . '/../src/settee.php'));
  require_once (dirname(__FILE__) . '/SetteeTestCase.class.php');
 
  class SetteeServerTest extends SetteeTestCase {
 
  private $dbname;
 
  public function setUp() {
  parent::setUp();
  $this->dbname = "settee_tests_" . md5(microtime(true));
  }
 
  public function test_database_lifecycle_namebased() {
  $db = $this->server->get_db($this->dbname);
  $ret = $this->server->create_db($this->dbname);
  $this->assertTrue($ret->ok, "Database Creation Success Response [name-based]");
 
  $database_list = $this->server->list_dbs();
  $this->assertTrue(is_array($database_list) && in_array($this->dbname, $database_list),
  "Verifying Database in the List on the Server [name-based]");
 
  $ret = $this->server->drop_db($this->dbname);
  $this->assertTrue($ret->ok, "Database Deletion Success Response [name-based]");
  }
 
  public function test_database_lifecycle_objectbased() {
  $db = $this->server->get_db($this->dbname);
  $ret = $this->server->create_db($db);
  $this->assertTrue($ret->ok, "Database Creation Success Response [object-based]");
 
  $database_list = $this->server->list_dbs();
  $this->assertTrue(is_array($database_list) && in_array($this->dbname, $database_list),
  "Verifying Database in the List on the Server [object-based]");
 
  $ret = $this->server->drop_db($db);
  $this->assertTrue($ret->ok, "Database Deletion Success Response [object-based]");
  }
 
  }
 
 
  <?php
 
  /**
  * Abstract parent for Settee test classes.
  */
  abstract class SetteeTestCase extends PHPUnit_Framework_TestCase {
 
  protected $server;
  protected $db_url;
  protected $db_user;
  protected $db_pass;
 
  public function setUp() {
  $this->db_url = isset($GLOBALS['db_url']) ? $GLOBALS['db_url'] : 'http://127.0.0.1:5984';
  $this->db_user = isset($GLOBALS['db_user']) ? $GLOBALS['db_user'] : 'admin';
  $this->db_pass = isset($GLOBALS['db_pass']) ? $GLOBALS['db_pass'] : 'admin';
  $this->server = new SetteeServer($this->db_url);
  }
 
  }
  <phpunit>
  <php>
  <var name="db_url" value="http://127.0.0.1:5984"/>
  <var name="db_user" value="admin"/>
  <var name="db_pass" value="passwd"/>
  </php>
  </phpunit>
 
 Binary files /dev/null and b/couchdb/settee/tests/resources/couch-logo.jpg differ
 Binary files /dev/null and b/couchdb/settee/tests/resources/couch-logo.pdf differ
 Binary files /dev/null and b/couchdb/settee/tests/resources/couch-logo.png differ
  Couchdb=relax
 
  <?xml version="1.0" encoding="UTF-8" ?>
  <tagline>
  <main>CouchDB - Relax</main>
  </tagline>
 
  <?php
 
  include('template.inc.php');
  include_header_documents("About");
  include_once('../include/common.inc.php');
  ?>
  <h1>About</h1>
  <?php
  include_footer_documents();
  ?>
 
  <?php
  include('template.inc.php');
  include_once('../include/common.inc.php');
  $agenciesdb = $server->get_db('disclosr-agencies');
 
  $idtoname = Array();
  foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
  $idtoname[$row->id] = trim($row->value->name);
  }
  $foidocsdb = $server->get_db('disclosr-foidocuments');
 
  include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency'));
  $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
  ?>
  <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
  <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
  <?php
  try {
  if ($_REQUEST['id']) {
  $rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows;
  foreach ($rows as $row) {
  //print_r($rows);
  echo displayLogEntry($row, $idtoname);
  if (!isset($startkey))
  $startkey = $row->key;
  $endkey = $row->key;
  }
  } else {
  $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;
  if ($rows) {
  foreach ($rows as $row) {
  echo '<a href="agency.php?id=' . $row->key . '">' . $idtoname[$row->key] . " (" . $row->value . " records)</a> <br>\n";
  }
  }
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
  echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>";
  include_footer_documents();
  ?>
<?php
include('template.inc.php');
include_header_documents("Charts");
include_once('../include/common.inc.php');
$agenciesdb = $server->get_db('disclosr-agencies');

$idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
$idtoname[$row->id] = trim($row->value->name);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');

?>
<div class="foundation-header">
<h1><a href="about.php">Charts</a></h1>
<h4 class="subheader">Lorem ipsum.</h4>
</div>
<div id="bydate" style="width:1000px;height:300px;"></div>
<div id="byagency" style="width:1200px;height:300px;"></div>
<script id="source">
window.onload = function() {
$(document).ready(function() {
var
d1 = [],
options1,
o1;

<?php
try {
$rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true", null, false, false, true)->rows;


$dataValues = Array();
foreach ($rows as $row) {
$dataValues[$row->key] = $row->value;
}
$i = 0;
ksort($dataValues);
foreach ($dataValues as $key => $value) {
$date = date_create_from_format('Y-m-d', $key);
if (date_format($date, 'U') != "") {
echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
// echo " emplabels.push('$key');" . PHP_EOL;
$i++;
}
}
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
}
?>


options1 = {
xaxis : {
mode : 'time',
labelsAngle : 45
},
selection : {
mode : 'x'
},
HtmlText : false,
title : 'Time'
};
// Draw graph with default options, overwriting with passed options
function drawGraph (opts) {

// Clone the options, so the 'options1' variable always stays intact.
o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});

// Return a new graph.
return Flotr.draw(
document.getElementById("bydate"),
[ d1 ],
o1
);
}

var graph = drawGraph();
Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function(area){
// Draw selected area
graph = drawGraph({
xaxis : { min : area.x1, max : area.x2, mode : 'time', labelsAngle : 45 },
yaxis : { min : area.y1, max : area.y2 }
});
});
// When graph is clicked, draw the graph with default area.
Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () { graph = drawGraph(); });

});
};

var d2 = [];
var agencylabels = [];
function agencytrackformatter(obj) {

return agencylabels[Math.floor(obj.x)] + " = " + obj.y;

}
function agencytickformatter(val, axis) {
if (agencylabels[Math.floor(val)]) {
return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">' + (agencylabels[Math.floor(val)]) + "</p>";

} else {
return "";
}
}
<?php
try {
$rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;


$dataValues = Array();
$i = 0;
foreach ($rows as $row) {
echo " d2.push([" . $i . ", $row->value]);" . PHP_EOL;
echo " agencylabels.push(['" . str_replace("'", "", $idtoname[$row->key]) . "']);" . PHP_EOL;

$i++;
}
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
}
?>
// Draw the graph
Flotr.draw(
document.getElementById("byagency"),
[d2],
{
bars : {
show : true,
horizontal : false,
shadowSize : 0,
barWidth : 0.5
},
mouse : {
track : true,
relative : true,
trackFormatter: agencytrackformatter
},
yaxis : {
min : 0,
autoscaleMargin : 1
},
xaxis: {
minorTickFreq: 1,
noTicks: agencylabels.length,
showMinorLabels: true,
tickFormatter: agencytickformatter
},
legend: {
show: false
}
}
);
</script>

<?php
include_footer_documents();
?>
   
   
  <?php
 
  include('template.inc.php');
  include_header_documents("Entries by Date");
  include_once('../include/common.inc.php');
  $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
  ?>
  <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
  <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
  <?php
  /*$agenciesdb = $server->get_db('disclosr-agencies');
 
  $idtoname = Array();
  foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
  $idtoname[$row->id] = trim($row->value->name);
  }
  $foidocsdb = $server->get_db('disclosr-foidocuments');
  try {
  $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20)->rows;
  if ($rows) {
  foreach ($rows as $key => $row) {
  echo displayLogEntry($row, $idtoname);
  if (!isset($startkey)) $startkey = $row->key;
  $endkey = $row->key;
  }
  }
  } catch (SetteeRestClientException $e) {
  setteErrorHandler($e);
  }
  echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>";
  */
  include_footer_documents();
  ?>
 
<?php

include('template.inc.php');
include_header_documents("List of Disclosure Logs");
include_once('../include/common.inc.php');

echo "<table>
    <tr><th>Agency Name</th><th>Disclosure Log URL recorded?</th><th>Do we monitor this URL?</th></tr>";
$agenciesdb = $server->get_db('disclosr-agencies');
$docsdb = $server->get_db('disclosr-documents');
$agencies = 0;
$disclogs = 0;
$red = 0;
$green = 0;
$yellow = 0;
$orange = 0;
try {
    $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;

    if ($rows) {
        foreach ($rows as $row) {
            if ((!isset($row->value->status) || $row->value->status != "suspended") && isset($row->value->foiEmail)) {
                echo "<tr><td>";
                if (isset($row->value->website)) echo "<a href='" . $row->value->website . "'>";
                echo "<b>" . $row->value->name . "</b>";
                if (isset($row->value->website)) echo "</a>";
                if ($ENV == "DEV")
                    echo "<br>(" . $row->id . ")";
                echo "</td>\n";
                $agencies++;

                echo "<td>";
                if (isset($row->value->FOIDocumentsURL)) {
                    $disclogs++;
                    echo '<a href="' . $row->value->FOIDocumentsURL . '">'
                        . $row->value->FOIDocumentsURL . '</a>';
                    if ($ENV == "DEV")
                        echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
                            . 'view local copy</a>)</small>';
                } else {
                    echo "<font color='red'><abbr title='No'>✘</abbr></font>";
                }
                echo "</td>\n<td>";
                if (isset($row->value->FOIDocumentsURL)) {
                    if (file_exists("./scrapers/" . $row->id . '.py')) {
                        echo "<font color='green'><abbr title='Yes'>✔</abbr></font>";
                        $green++;
                    } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
                        if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
                            echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
                            $yellow++;
                        } else {
                            echo file_get_contents("./scrapers/" . $row->id . '.txt');
                            echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
                            $orange++;
                        }
                    } else {
                        echo "<font color='red'><abbr title='No'>✘</abbr></font>";
                        $red++;
                    }
                }
                echo "</td></tr>\n";
            }
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo "</table>";
echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; "
    . round(($green / $disclogs) * 100) . "% of logs with scrapers, " . round(($red / $disclogs) * 100) . "% without scrapers, " . round(($orange / $disclogs) * 100) . "% with work-in-progress scrapers.";

include_footer_documents();
?>
   
{
    "venv": "",
    "project-type": "Import from sources",
    "name": "disclosr-documents",
    "license": "GNU General Public License v3",
    "description": ""
}
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs
import zipfile

import difflib

from StringIO import StringIO

from docx import getdocumenttext  # assumption: the legacy docx.py module
from lxml import etree

from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams


class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def remove_control_chars(self, input):
        return "".join([i for i in input if ord(i) in range(32, 127)])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL """
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return


class GenericHTMLDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        content = rcontent
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
            if last_attach != None:
                html_diff = difflib.HtmlDiff()
                description = description + "\nChanges: "
                description = description + html_diff.make_table(last_attach.read().split('\n'),
                    content.split('\n'))
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericPDFDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        fp = StringIO()
        fp.write(content)

        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericDOCXDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
            , self.getURL(), "foidocuments", self.getAgencyID())
        # open the fetched .docx (a zip archive) from memory
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        ## Fetch all the text out of the document we just created
        paratextlist = getdocumenttext(document)
        # Make explicit unicode version
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        ## Join the document's text with two newlines under each paragraph
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)

        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericRSSDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                    'url': entry.link, 'docID': entry.id,
                    "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get description from rss entry """
        doc.update({'description': content.summary})
        return


class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5

    def getDescription(self, content, entry, doc):
        """ get description from table row """
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def getDate(self, content, entry, doc):
        date = ''.join(content.stripped_strings).strip()
        (a, b, c) = date.partition("(")
        date = self.remove_control_chars(a.replace("Octber", "October"))
        print date
        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return

    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_key('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links != []:
            doc.update({'links': links})
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = row.find_all('td')
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                            description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        if id.string is None:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(date.stripped_strings))))
                        else:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(id.stripped_strings))))
                        doc = foidocsdb.get(dochash)

                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                'agencyID': self.getAgencyID(),
                                'url': self.getURL(),
                                'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                'FOI request(in summary form)',
                                'Summary of FOI request received by the ASC',
                                'Summary of FOI request received by agency/minister',
                                'Description of Documents Requested', 'FOI request',
                                'Description of FOI Request', 'Summary of request',
                                'Description', 'Summary',
                                'Summary of FOIrequest received by agency/minister',
                                'Summary of FOI request received',
                                'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles \
                                    and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash

                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"

                    else:
                        print "ERROR number of columns incorrect"
                        print row
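# Illustrative sketch, not part of the module: a minimal concrete scraper built
# on GenericOAICDisclogScraper. The class name here is hypothetical; the real
# per-agency scrapers further below follow exactly this pattern, implementing
# getColumns() (and overriding getColumnCount() when the disclosure log table
# is not five columns wide).
class ExampleDisclogScraper(GenericOAICDisclogScraper):
    def getColumns(self, columns):
        # map the table's five cells to (id, date, title, description, notes)
        (id, date, title, description, notes) = columns
        return (id, date, title, description, notes)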
   
 Binary files /dev/null and b/documents/img/feed-icon-14x14.png differ
<?php

include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
?>
<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
<?php

$agenciesdb = $server->get_db('disclosr-agencies');

$idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
    $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20)->rows;
    if ($rows) {
        foreach ($rows as $key => $row) {
            echo displayLogEntry($row, $idtoname);
            if (!isset($startkey)) $startkey = $row->key;
            $endkey = $row->key;
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo "<a class='btn btn-large btn-primary' href='?end_key=$endkey' style='float:right;'>next page <i class='icon-circle-arrow-right icon-white'></i></a>";
include_footer_documents();
?>
   
<?php

// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimal example of using the Universal Feed Generator class
include("../lib/FeedWriter/FeedTypes.php");
include_once('../include/common.inc.php');
// Creating an instance of the FeedWriter class
$TestFeed = new RSS2FeedWriter();
// Retrieving information from the database
$idtoname = Array();
$agenciesdb = $server->get_db('disclosr-agencies');
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
if (isset($_REQUEST['id'])) {
    $rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows;
    $title = $idtoname[$_REQUEST['id']];
} else {
    $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00', 50), true)->rows;
    $title = 'All Agencies';
}
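// Usage sketch (assumption: this script is exposed as rss.xml.php at the site
// root): rss.xml.php returns the newest entries across all agencies, while
// rss.xml.php?id=<agencyID> uses the byAgencyID view above to filter the feed
// to a single agency.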
// Use wrapper functions for common channel elements
$TestFeed->setTitle('disclosurelo.gs Newest Entries - ' . $title);
$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php' . (isset($_REQUEST['id']) ? '?id=' . $_REQUEST['id'] : ''));
$TestFeed->setDescription('disclosurelo.gs Newest Entries - ' . $title);
$TestFeed->setChannelElement('language', 'en-us');
$TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));


//print_r($rows);
foreach ($rows as $row) {
    // Create an empty FeedItem
    $newItem = $TestFeed->createNewItem();
    // Add elements to the feed item
    $newItem->setTitle($row->value->title);
    $newItem->setLink("http://disclosurelo.gs/view.php?id=" . $row->value->_id);
    $newItem->setDate(strtotime($row->value->date));
    $newItem->setDescription(displayLogEntry($row, $idtoname));
    $newItem->setAuthor($idtoname[$row->value->agencyID]);
    $newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=" . $row->value->_id, array('isPermaLink' => 'true'));
    // Now add the feed item
    $TestFeed->addItem($newItem);
}
// OK. Everything is done. Now generate the feed.
$TestFeed->generateFeed();
?>
   
for f in scrapers/*.py;
do
    echo "Processing $f file...";
    python "$f";
    if [ "$?" -ne "0" ]; then
        echo "error";
        sleep 2;
    fi
done
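# Usage sketch (assumption: this script sits alongside the scrapers/ directory,
# e.g. as runScrapers.sh): run it from that directory so the scrapers/*.py glob
# resolves; a failed scraper prints "error" and the loop continues after 2 seconds.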
 
 
#http://packages.python.org/CouchDB/client.html
import couchdb
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import hashlib
from urlparse import urljoin
import time
import os
import mimetypes
import urllib
import urlparse

def mkhash(input):
    return hashlib.md5(input).hexdigest().encode("utf-8")

def canonurl(url):
    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
    if the URL looks invalid.
    >>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
    'http://xn--hgi.ws/'
    """
    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
    url = url.strip()
    if not url:
        return ''
    if not urlparse.urlsplit(url).scheme:
        url = 'http://' + url

    # turn it into Unicode
    #try:
    #    url = unicode(url, 'utf-8')
    #except UnicodeDecodeError:
    #    return ''  # bad UTF-8 chars in URL

    # parse the URL into its components
    parsed = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = parsed

    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
        return ''
    scheme = str(scheme)

    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
    if not match:
        return ''
    domain, port = match.groups()
    netloc = domain + (port if port else '')
    netloc = netloc.encode('idna')

    # ensure path is valid and convert Unicode chars to %-encoded
    if not path:
        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')

    # ensure query is valid
    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')

    # ensure fragment is valid
    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))

    # piece it all back together, truncating it to a maximum of 4KB
    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
    return url[:4096]
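# Illustrative examples for canonurl() (assumption: run under Python 2 with
# this module importable); the helper below is a sketch and is never called.
def _canonurl_examples():
    print canonurl('example.com')        # 'http://example.com/' - scheme and slash added
    print canonurl('example.com/a b')    # 'http://example.com/a%20b' - space escaped
    print canonurl('not a url')          # '' - no dotted domain, rejected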
   
def fullurl(url, href):
    href = href.replace(" ", "%20")
    href = re.sub('#.*$', '', href)
    return urljoin(url, href)

#http://diveintopython.org/http_web_services/etags.html
class NotModifiedHandler(urllib2.BaseHandler):
    def http_error_304(self, req, fp, code, message, headers):
        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
        addinfourl.code = code
        return addinfourl
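# Minimal sketch of the conditional fetch this handler enables (assumption:
# url and etag come from a previously stored document, as in fetchURL below):
# on 304 Not Modified, urllib2 hands back an addinfourl with .code == 304
# instead of raising, so the caller can reuse its cached copy.
def _conditional_fetch(url, etag):
    req = urllib2.Request(url)
    req.add_header("If-None-Match", etag)
    url_handle = urllib2.build_opener(NotModifiedHandler()).open(req)
    if hasattr(url_handle, 'code') and url_handle.code == 304:
        return None  # unchanged since last fetch; reuse the stored attachment
    return url_handle.read()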
   
def getLastAttachment(docsdb, url):
    hash = mkhash(url)
    doc = docsdb.get(hash)
    if doc != None:
        last_attachment_fname = doc["_attachments"].keys()[-1]
        last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
        return last_attachment
    else:
        return None
   
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
    url = canonurl(url)
    hash = mkhash(url)
    req = urllib2.Request(url)
    print "Fetching %s (%s)" % (url, hash)
    if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
        print "Not a valid HTTP url"
        return (None, None, None)
    doc = docsdb.get(hash)
    if doc == None:
        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName}
    else:
        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
            print "Uh oh, trying to scrape URL again too soon!" + hash
            last_attachment_fname = doc["_attachments"].keys()[-1]
            last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
            content = last_attachment
            return (doc['url'], doc['mime_type'], content.read())
        if scrape_again == False:
            print "Not scraping this URL again as requested"
            # reuse the stored copy rather than re-fetching
            last_attachment_fname = doc["_attachments"].keys()[-1]
            content = docsdb.get_attachment(doc, last_attachment_fname)
            return (doc['url'], doc['mime_type'], content.read())

    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
    #if there is a previous version stored in couchdb, load caching helper tags
    if doc.has_key('etag'):
        req.add_header("If-None-Match", doc['etag'])
    if doc.has_key('last_modified'):
        req.add_header("If-Modified-Since", doc['last_modified'])

    opener = urllib2.build_opener(NotModifiedHandler())
    try:
        url_handle = opener.open(req)
        doc['url'] = url_handle.geturl()  # may have followed a redirect to a new url
        headers = url_handle.info()  # the addinfourls have the .info() too
        doc['etag'] = headers.getheader("ETag")
        doc['last_modified'] = headers.getheader("Last-Modified")
        doc['date'] = headers.getheader("Date")
        doc['page_scraped'] = time.time()
        doc['web_server'] = headers.getheader("Server")
        doc['via'] = headers.getheader("Via")
        doc['powered_by'] = headers.getheader("X-Powered-By")
        doc['file_size'] = headers.getheader("Content-Length")
        content_type = headers.getheader("Content-Type")
        if content_type != None:
            doc['mime_type'] = content_type.split(";")[0]
        else:
            (type, encoding) = mimetypes.guess_type(url)
            doc['mime_type'] = type
        if hasattr(url_handle, 'code'):
            if url_handle.code == 304:
                print "the web page has not been modified" + hash
                last_attachment_fname = doc["_attachments"].keys()[-1]
                last_attachment = docsdb.get_attachment(doc, last_attachment_fname)
                content = last_attachment
                return (doc['url'], doc['mime_type'], content.read())
            else:
                print "new webpage loaded"
                content = url_handle.read()
                docsdb.save(doc)
                doc = docsdb.get(hash)  # need to get a _rev
                # store as attachment epoch-filename
                docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type'])
                return (doc['url'], doc['mime_type'], content)

    except urllib2.URLError as e:
        print "error!"
        error = ""
        if hasattr(e, 'reason'):
            error = "error %s in downloading %s" % (str(e.reason), url)
        elif hasattr(e, 'code'):
            error = "error %s in downloading %s" % (e.code, url)
        print error
        doc['error'] = error
        docsdb.save(doc)
        return (None, None, None)
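# Usage sketch for fetchURL() (assumption: the couch databases defined below
# are reachable; the URL and agency id are placeholders); returns
# (None, None, None) on failure, otherwise the final URL, mime type and body.
def _fetch_example():
    (url, mime_type, content) = fetchURL(docsdb,
        "http://www.example.gov.au/foi/disclosure-log", "FOIDocumentsURL",
        "example-agency-id")
    if content != None:
        print "fetched %d bytes of %s from %s" % (len(content), mime_type, url)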
   
   
   
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
    (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID)
    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
    if content != None and depth > 0 and url not in badURLs:
        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(content)
            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
            for nav in navIDs:
                print "Removing element", nav['id']
                nav.extract()
            navClasses = soup.findAll(attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
            for nav in navClasses:
                print "Removing element", nav['class']
                nav.extract()
            links = soup.findAll('a')  # soup.findAll('a', id=re.compile("^p-"))
            linkurls = set([])
            for link in links:
                if link.has_key("href"):
                    if link['href'].startswith("http"):
                        # lets not do external links for now
                        # linkurls.add(link['href'])
                        None
                    elif link['href'].startswith("mailto"):
                        # not http
                        None
                    elif link['href'].startswith("javascript"):
                        # not http
                        None
                    else:
                        # remove anchors and spaces in urls
                        linkurls.add(fullurl(url, link['href']))
            for linkurl in linkurls:
                #print linkurl
                scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
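# Usage sketch (assumption: placeholder URL and agency id): fetch one page and
# follow its same-site links one level deep.
#   scrapeAndStore(docsdb, "http://www.example.gov.au/foi/", 1,
#                  "FOIDocumentsURL", "example-agency-id")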
   
#couch = couchdb.Server('http://192.168.1.148:5984/')
couch = couchdb.Server('http://127.0.0.1:5984/')
# select database
agencydb = couch['disclosr-agencies']
docsdb = couch['disclosr-documents']

if __name__ == "__main__":
    for row in agencydb.view('app/getScrapeRequired'):  # not recently scraped agencies view?
        agency = agencydb.get(row.id)
        print agency['name']
        for key in agency.keys():
            if key == "FOIDocumentsURL" and "status" not in agency.keys():
                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
            if key == 'website' and False:
                scrapeAndStore(docsdb, agency[key], 0, key, agency['_id'])
                agency['metadata']['lastScraped'] = time.time()
            if key.endswith('URL') and False:
                print key
                depth = 1
                if 'scrapeDepth' in agency.keys():
                    depth = agency['scrapeDepth']
                scrapeAndStore(docsdb, agency[key], depth, key, agency['_id'])
        agencydb.save(agency)
   
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers


class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation,
        genericScrapers.GenericPDFDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(),
        genericScrapers.GenericPDFDisclogScraper)
    ScraperImplementation().doScrape()

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *


class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()

    def getDate(self, content, entry, doc):
        date = ''.join(entry.find('th').stripped_strings).strip()
        (a, b, c) = date.partition("(")
        date = self.remove_control_chars(a.replace("Octber", "October"))
        print date
        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return

    def getColumnCount(self):
        return 4

    def getTable(self, soup):
        return soup.find(summary="List of Defence documents released under Freedom of Information requets")

    def getColumns(self, columns):
        (id, description, access, notes) = columns
        return (id, None, description, description, notes)


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)

    nsi = ScraperImplementation()
    nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201213.cfm"
    nsi.doScrape()

    nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201112.cfm"
    nsi.doScrape()

    nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201011.cfm"
    nsi.doScrape()
 
 
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers


class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation,
        genericScrapers.GenericHTMLDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(),
        genericScrapers.GenericHTMLDisclogScraper)
    ScraperImplementation().doScrape()

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup


class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_key('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="content_div_148050")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 4

    def getColumns(self, columns):
        (id, date, datepub, title) = columns
        return (id, date, title, title, None)


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)

    nsi = ScraperImplementation()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=1"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=2"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=3"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=4"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=5"
    nsi.doScrape()
 
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_key('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
                        soup = BeautifulSoup(htcontent)
                        for row in soup.find(class_="ms-rteTable-GreyAlternating").find_all('tr'):
                            if row != None:
                                rowtitle = row.find('th').string
                                if rowtitle != None:
                                    description = description + "\n" + rowtitle + ": "
                                for text in row.find('td').stripped_strings:
                                    description = description + text
                                for atag in row.find_all("a"):
                                    if atag.has_key('href'):
                                        links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 2

    def getTable(self, soup):
        return soup.find(class_="ms-rteTable-GreyAlternating")

    def getColumns(self, columns):
        (date, title) = columns
        return (title, date, title, title, None)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
   
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericPDFDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericPDFDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericOAICDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericOAICDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
  import scrape
  from datetime import date
  from pyquery import PyQuery as pq
  from lxml import etree
  import urllib
  import dateutil
  from dateutil.parser import *
 
  class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
 
  def doScrape(self):
  foidocsdb = scrape.couch['disclosr-foidocuments']
  (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
  self.getURL(), "foidocuments", self.getAgencyID())
 
  d = pq(content)
d.make_links_absolute(base_url=self.getURL())
for table in d('table').items():
    title = table('thead').text()
    print title
    # each row of the table body holds one field in its second cell:
    # date, two description parts, link, deletion date, notes
    (idate, descA, descB, link, deldate, notes) = table('tbody tr').map(
        lambda i, e: pq(e).children().eq(1).text())
    links = table('a').map(lambda i, e: pq(e).attr('href'))
    description = descA + " " + descB
    edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
    print edate
    dochash = scrape.mkhash(self.remove_control_chars(title))
    doc = foidocsdb.get(dochash)
    if doc is None:
        print "saving " + dochash
        doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
               'url': self.getURL(), 'docID': dochash,
               "links": links,
               "date": edate, "notes": notes, "title": title,
               "description": description}
        foidocsdb.save(doc)
    else:
        print "already saved"
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ACMADisclogScraper,
  genericScrapers.GenericDisclogScraper)
  print 'Instance:', isinstance(ACMADisclogScraper(),
  genericScrapers.GenericDisclogScraper)
  ACMADisclogScraper().doScrape()
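
# A minimal illustration of the fuzzy date parse used above, runnable on
# its own (the input string is made up, not taken from the live page):
from dateutil.parser import parse
idate = "15 June 2012, documents released"
# dayfirst=True reads ambiguous dates day-first, fuzzy=True tolerates the
# trailing text, and idate[:12] keeps just "15 June 2012"
print parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
# prints 2012-06-15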
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
genericScrapers.GenericPDFDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
genericScrapers.GenericPDFDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
  import dateutil
  from dateutil.parser import *
  from datetime import *
 
 
  class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
  def getColumnCount(self):
  return 6
 
  def getColumns(self, columns):
  (id, date, title, description, datepub, notes) = columns
  return (id, date, title, description, notes)
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
 
  nsi = ScraperImplementation()
  nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/pmo/2011-12.cfm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2011-12.cfm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2012-13.cfm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omsi/2011-12.cfm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omps/2012-13.cfm"
  nsi.doScrape()
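
# An equivalent loop form of the five assign-then-scrape pairs above,
# shown as a sketch (it would replace, not follow, those calls); it
# assumes, as the code above does, that doScrape() re-reads
# self.disclogURL on each call:
for url in ["http://www.dpmc.gov.au/foi/ips/disclosure_logs/pmo/2011-12.cfm",
            "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2011-12.cfm",
            "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2012-13.cfm",
            "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omsi/2011-12.cfm",
            "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omps/2012-13.cfm"]:
    nsi.disclogURL = url
    nsi.doScrape()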
 
multiple pages  
 
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
import codecs
# http://www.doughellmann.com/PyMOTW/abc/


class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_key('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb,
                    link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
                        soup = BeautifulSoup(htcontent)
                        for text in soup.find(id="divFullWidthColumn").stripped_strings:
                            description = description + text.encode('ascii', 'ignore')
                        for atag in soup.find(id="divFullWidthColumn").find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 2

    def getTable(self, soup):
        return soup.find(id="TwoColumnSorting")

    def getColumns(self, columns):
        (title, date) = columns
        return (title, date, title, title, None)


class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_key('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb,
                    link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
                        soup = BeautifulSoup(htcontent)
                        for text in soup.find(id="content-item").stripped_strings:
                            description = description + text + " \n"
                        for atag in soup.find(id="content-item").find_all("a"):
                            if atag.has_key('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 2

    def getTable(self, soup):
        return soup.find(class_="doc-list")

    def getColumns(self, columns):
        (date, title) = columns
        return (title, date, title, title, None)


if __name__ == '__main__':
    print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    NewScraperImplementation().doScrape()
    print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    osi = OldScraperImplementation()
    # old site too
    osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
    osi.doScrape()
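
# scrape.fullurl() is used above to resolve hrefs found on each linked
# page. Assuming it behaves like the standard library's urljoin (its
# actual implementation lives in scrape.py, not shown here):
import urlparse
base = "http://archive.treasury.gov.au/content/foi_publications.asp"
print urlparse.urljoin(base, "downloads/foi_1234.pdf")
# -> http://archive.treasury.gov.au/content/downloads/foi_1234.pdf
# (the pdf filename is illustrative, not a real document)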
   
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys,os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
  import dateutil
  from dateutil.parser import *
  from datetime import *
 
 
  class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
  def getColumnCount(self):
  return 2
 
  def getColumns(self, columns):
  (date, title) = columns
  return (title, date, title, title, None)
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
 
  nsi = ScraperImplementation()
  nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2012.htm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2011.htm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2010.htm"
  nsi.doScrape()
  nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2009.htm"
  nsi.doScrape()
 
multipage immi  
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
genericScrapers.GenericPDFDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
genericScrapers.GenericPDFDisclogScraper)
  ScraperImplementation().doScrape()
 
# pdf  
http://www.awm.gov.au/about/AWM_Disclosure_Log.pdf  
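
# As with the multi-page scrapers above, the PDF log could be scraped by
# pointing an instance at the URL directly; a sketch, assuming
# GenericPDFDisclogScraper honours disclogURL the same way the DPMC and
# immi scrapers above do:
si = ScraperImplementation()
si.disclogURL = "http://www.awm.gov.au/about/AWM_Disclosure_Log.pdf"
si.doScrape()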
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
  import scrape
  from datetime import date
  from pyquery import PyQuery as pq
  from lxml import etree
  import urllib
  import dateutil
  from dateutil.parser import *
 
  class ACMADisclogScraper(genericScrapers.GenericDisclogScraper):
 
  def doScrape(self):
  foidocsdb = scrape.couch['disclosr-foidocuments']
  (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
  self.getURL(), "foidocuments", self.getAgencyID())
 
  d = pq(content)
d.make_links_absolute(base_url=self.getURL())
for item in d('.item-list').items():
    title = item('h3').text()
    print title
    links = item('a').map(lambda i, e: pq(e).attr('href'))
    description = item('ul').text()
    # this markup has no date column, so record the scrape date
    edate = date.today().strftime("%Y-%m-%d")
    print edate
    dochash = scrape.mkhash(self.remove_control_chars(title))
    doc = foidocsdb.get(dochash)
    if doc is None:
        print "saving " + dochash
        doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
               'url': self.getURL(), 'docID': dochash,
               "links": links,
               "date": edate, "title": title, "description": description}
        foidocsdb.save(doc)
    else:
        print "already saved"
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ACMADisclogScraper,
  genericScrapers.GenericDisclogScraper)
  print 'Instance:', isinstance(ACMADisclogScraper(),
  genericScrapers.GenericDisclogScraper)
  ACMADisclogScraper().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
  import sys
  import os
  sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
  import genericScrapers
 
 
  class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
 
  def __init__(self):
  super(ScraperImplementation, self).__init__()
 
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation,
  genericScrapers.GenericHTMLDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(),
  genericScrapers.GenericHTMLDisclogScraper)
  ScraperImplementation().doScrape()
 
<?php

include ('../include/common.inc.php');
$last_updated = date('Y-m-d', @filemtime('cbrfeed.zip'));
header("Content-Type: text/xml");
echo "<?xml version='1.0' encoding='UTF-8'?>";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
echo " <url><loc>" . local_url() . "index.php</loc><priority>1.0</priority></url>\n";
foreach (scandir("./") as $file) {
    if (strpos($file, ".php") !== false && $file != "index.php" && $file != "sitemap.xml.php")
        echo " <url><loc>" . local_url() . "$file</loc><priority>0.6</priority></url>\n";
}
$agenciesdb = $server->get_db('disclosr-agencies');
try {
    $rows = $agenciesdb->get_view("app", "byCanonicalName")->rows;
    foreach ($rows as $row) {
        echo '<url><loc>' . local_url() . 'agency.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
    $rows = $foidocsdb->get_view("app", "all")->rows;
    foreach ($rows as $row) {
        echo '<url><loc>' . local_url() . 'view.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo '</urlset>';
?>
   
<?php

function include_header_documents($title) {
    header('X-UA-Compatible: IE=edge,chrome=1');
?>
<!doctype html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!-- Consider adding a manifest.appcache: h5bp.com/d/Offline -->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
    <head>
        <meta charset="utf-8">

        <title>Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?></title>
        <meta name="description" content="">

        <!-- Mobile viewport optimized: h5bp.com/viewport -->
        <meta name="viewport" content="width=device-width">
        <link rel="alternate" type="application/rss+xml" title="Latest Disclosure Log Entries" href="rss.xml.php" />
        <!-- Place favicon.ico and apple-touch-icon.png in the root directory: mathiasbynens.be/notes/touch-icons -->
        <meta name="google-site-verification" content="jkknX5g2FCpQvrW030b1Nq2hyoa6mb3EDiA7kCoHNj8" />

        <!-- Le styles -->
        <link href="css/bootstrap.min.css" rel="stylesheet">
        <style type="text/css">
            body {
                padding-top: 60px;
                padding-bottom: 40px;
            }
            .sidebar-nav {
                padding: 9px 0;
            }
        </style>
        <link href="css/bootstrap-responsive.min.css" rel="stylesheet">

        <!-- HTML5 shim, for IE6-8 support of HTML5 elements -->
        <!--[if lt IE 9]>
        <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
        <!-- More ideas for your <head> here: h5bp.com/d/head-Tips -->

        <!-- All JavaScript at the bottom, except this Modernizr build.
             Modernizr enables HTML5 elements & feature detects for optimal performance.
             Create your own custom Modernizr build: www.modernizr.com/download/
        <script src="js/libs/modernizr-2.5.3.min.js"></script>-->
        <script src="js/jquery.js"></script>
        <script type="text/javascript" src="js/flotr2.min.js"></script>

    </head>
    <body>
        <div class="navbar navbar-inverse navbar-fixed-top">
            <div class="navbar-inner">
                <div class="container-fluid">
                    <a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
                        <span class="icon-bar"></span>
                        <span class="icon-bar"></span>
                        <span class="icon-bar"></span>
                    </a>