rm settee
Former-commit-id: 2fe3d86753e524fca7ee4e095a794727c0556d79
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,6 @@
[submodule "couchdb/couchdb-lucene"]
path = couchdb/couchdb-lucene
url = https://github.com/rnewson/couchdb-lucene.git
-[submodule "couchdb/settee"]
- path = couchdb/settee
- url = https://github.com/inadarei/settee.git
[submodule "lib/php-diff"]
path = lib/php-diff
url = https://github.com/chrisboulton/php-diff.git
--- a/admin/exportEmployees.csv.php
+++ b/admin/exportEmployees.csv.php
@@ -4,7 +4,8 @@
$format = "csv";
//$format = "json";
-if (isset($_REQUEST['format'])) $format = $_REQUEST['format'];
+if (isset($_REQUEST['format']))
+ $format = $_REQUEST['format'];
setlocale(LC_CTYPE, 'C');
if ($format == "csv") {
$headers = Array("name");
@@ -21,7 +22,6 @@
if (isset($row->value->statistics->employees)) {
$headers = array_unique(array_merge($headers, array_keys(object_to_array($row->value->statistics->employees))));
-
}
}
} catch (SetteeRestClientException $e) {
@@ -40,15 +40,14 @@
fputcsv($fp, $headers);
} else if ($format == "json") {
echo '{
- "labels" : ["' . implode('","', $headers) . '"],'.PHP_EOL;
+ "labels" : ["' . implode('","', $headers) . '"],' . PHP_EOL;
}
try {
$agencies = $db->get_view("app", "all", null, true)->rows;
//print_r($agencies);
$first = true;
if ($format == "json") {
- echo '"data" : ['.PHP_EOL;
-
+ echo '"data" : [' . PHP_EOL;
}
foreach ($agencies as $agency) {
@@ -56,25 +55,35 @@
$row = Array();
$agencyEmployeesArray = object_to_array($agency->value->statistics->employees);
foreach ($headers as $i => $fieldName) {
+ if ($format == "csv") {
+ if (isset($agencyEmployeesArray[$fieldName])) {
+ $row[] = $agencyEmployeesArray[$fieldName]["value"] ;
+ } else if ($i == 0) {
+ $row[] = $agency->value->name;
+ } else {
+ $row[] = 0;
+ }
+ } else if ($format == "json") {
if (isset($agencyEmployeesArray[$fieldName])) {
- $row[] = '['.$i.','.$agencyEmployeesArray[$fieldName]["value"].']';
+ $row[] = '[' . $i . ',' . $agencyEmployeesArray[$fieldName]["value"] . ']';
} else {
- $row[] = '['.$i.',0]';
+ $row[] = '[' . $i . ',0]';
}
+ }
}
if ($format == "csv") {
fputcsv($fp, array_values($row));
} else if ($format == "json") {
- if (!$first) echo ",";
- echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "'.$agency->value->name.'", "lines" : { "show" : true }, "points" : { "show" : true }}'.PHP_EOL;
+ if (!$first)
+ echo ",";
+ echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "' . $agency->value->name . '", "lines" : { "show" : true }, "points" : { "show" : true }}' . PHP_EOL;
$first = false;
}
}
}
if ($format == "json") {
- echo ']
- }'.PHP_EOL;
-
+ echo ']
+ }' . PHP_EOL;
}
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
--- a/admin/importAPSCEmployees.php
+++ b/admin/importAPSCEmployees.php
@@ -47,13 +47,17 @@
$changed = false;
if (!isset($doc->statistics)) {
$changed = true;
- $doc->statistics = Array();
+ $doc->statistics = new stdClass();
+ }
+ if (!isset($doc->statistics->employees)) {
+ $changed = true;
+ $doc->statistics->employees = new stdClass();
}
foreach ($sum as $timePeriod => $value) {
if (!isset($doc->statistics->employees->$timePeriod->value)
|| $doc->statistics->employees->$timePeriod->value != $value) {
$changed = true;
- $doc->statistics["employees"][$timePeriod] = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/");
+ $doc->statistics->employees->$timePeriod = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/");
}
}
if ($changed) {
--- /dev/null
+++ b/admin/importAPSCEmployees2012.php
@@ -1,1 +1,86 @@
+<?php
+require_once '../include/common.inc.php';
+require($basePath . 'lib/phpquery/phpQuery/phpQuery.php');
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$sums = Array();
+$functions = Array();
+foreach ($rows as $row) {
+ $nametoid[trim($row->key)] = $row->value;
+}
+
+
+$request = Requests::get("http://www.apsc.gov.au/publications-and-media/parliamentary/state-of-the-service/new-sosr/appendix-2-aps-agencies");
+$doc = phpQuery::newDocumentHTML($request->body);
+phpQuery::selectDocument($doc);
+foreach (pq('tr')->elements as $tr) {
+ //echo $tr->nodeValue.PHP_EOL;
+ $agency = "";
+ $employees = "";
+ $function = "";
+ $i = 0;
+ foreach ($tr->childNodes as $td) {
+ //echo $td->nodeValue." $i <br>";
+ if ($i == 0)
+ $agency = $td->nodeValue;
+ if ($i == 2) {
+ $employees = trim(str_replace(",", "", $td->nodeValue));
+ }
+ if ($i == 4) {
+ $function = $td->nodeValue;
+ }
+ $i++;
+ }
+ if ($agency != "" && $employees != "" && $function != "") {
+ $name = trim(str_replace('2','',$agency));
+ //echo "$name<br><bR>" . PHP_EOL;
+ if (isset($nametoid[$name])) {
+ $id = $nametoid[$name];
+ //echo $id . "<br>" . PHP_EOL;
+ @$sums[$id]["2011-2012"] += $employees;
+ $functions[$id] = $function;
+ } else if ($agency != "Agency"){
+ echo "<br>ERROR NAME '$agency' MISSING FROM ID LIST<br><bR>" . PHP_EOL;
+
+ die();
+ }
+ } else {
+ echo "skipped $agency";
+ }
+}
+//print_r($sums);
+foreach ($sums as $id => $sum) {
+ echo $id . "<br>" . PHP_EOL;
+ $doc = $db->get($id);
+ echo $doc->name . "<br>" . PHP_EOL;
+ // print_r($doc);
+ $changed = false;
+ if (!isset($doc->statistics)) {
+ $changed = true;
+ $doc->statistics = new stdClass();
+ }
+ if (!isset($doc->statistics->employees)) {
+ $changed = true;
+ $doc->statistics->employees = new stdClass();
+ }
+ foreach ($sum as $timePeriod => $value) {
+ if (!isset($doc->statistics->employees->$timePeriod->value)
+ || $doc->statistics->employees->$timePeriod->value != $value) {
+ $changed = true;
+ $doc->statistics->employees->$timePeriod = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/");
+ $doc->employees = $value;
+ $doc->functionClassification = $functions[$id];
+ }
+ }
+
+ if ($changed) {
+ $db->save($doc);
+ } else {
+ echo "not changed" . "<br>" . PHP_EOL;
+ }
+}
+// employees: timeperiod, source = apsc state of service, value
+?>
+
--- a/admin/refreshDesignDoc.php
+++ b/admin/refreshDesignDoc.php
@@ -4,74 +4,62 @@
//function createFOIDocumentsDesignDoc() {
$foidb = $server->get_db('disclosr-foidocuments');
- $obj = new stdClass();
- $obj->_id = "_design/" . urlencode("app");
- $obj->language = "javascript";
- $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
- $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
- $obj->views->byDate->reduce = "_count";
- $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
- $obj->views->byAgencyID->reduce = "_count";
+$obj = new stdClass();
+$obj->_id = "_design/" . urlencode("app");
+$obj->language = "javascript";
+$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
+$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
+$obj->views->byDate->reduce = "_count";
+$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
+$obj->views->byDateMonthYear->reduce = "_count";
+$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
+$obj->views->byAgencyID->reduce = "_count";
- // allow safe updates (even if slightly slower due to extra: rev-detection check).
- $foidb->save($obj, true);
+// allow safe updates (even if slightly slower due to extra: rev-detection check).
+$foidb->save($obj, true);
-function createDocumentsDesignDoc() {
- /*
- global $db;
- $obj = new stdClass();
- $obj->_id = "_design/" . urlencode("app");
- $obj->language = "javascript";
- $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
- $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
- "views": {
- "web_server": {
- "map": "function(doc) {\n emit(doc.web_server, 1);\n}",
- "reduce": "function (key, values, rereduce) {\n return sum(values);\n}"
- },
- "byAgency": {
- "map": "function(doc) {\n emit(doc.agencyID, 1);\n}",
- "reduce": "function (key, values, rereduce) {\n return sum(values);\n}"
- },
- "byURL": {
- "map": "function(doc) {\n emit(doc.url, doc);\n}"
- },
- "agency": {
- "map": "function(doc) {\n emit(doc.agencyID, doc);\n}"
- },
- "byWebServer": {
- "map": "function(doc) {\n emit(doc.web_server, doc);\n}"
- },
- "getValidationRequired": {
- "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"
- }
- } */
-}
+//function createDocumentsDesignDoc() {
+$docdb = $server->get_db('disclosr-documents');
+// Design doc ("app") for the disclosr-documents database; each view is an object with a map (and optional reduce) source.
+$obj = new stdClass();
+$obj->_id = "_design/" . urlencode("app");
+$obj->language = "javascript";
+$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
+$obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}";
+$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
+$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}";
+$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
+$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
+$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
+$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
+
+// allow safe updates (even if slightly slower due to extra: rev-detection check).
+$docdb->save($obj, true);
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
- $obj = new stdClass();
- $obj->_id = "_design/" . urlencode("app");
- $obj->language = "javascript";
- $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
- $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
- $obj->views->byCanonicalName->map = "function(doc) {
+$obj = new stdClass();
+$obj->_id = "_design/" . urlencode("app");
+$obj->language = "javascript";
+$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
+$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
+$obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc);
}
};";
- $obj->views->byDeptStateName->map = "function(doc) {
+$obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id);
}
};";
- $obj->views->parentOrgs->map = "function(doc) {
+$obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) {
emit(doc._id, doc.parentOrg);
}
};";
- $obj->views->byName->map = 'function(doc) {
+$obj->views->byName->map = 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
@@ -95,14 +83,14 @@
}
};';
- $obj->views->foiEmails->map = "function(doc) {
+$obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail);
};";
- $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
- $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
- $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
- $obj->views->getScrapeRequired->map = "function(doc) {
+$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
+$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
+$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
+$obj->views->getScrapeRequired->map = "function(doc) {
var lastScrape = Date.parse(doc.metadata.lastScraped);
@@ -113,14 +101,14 @@
}
};";
- $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
- $obj->views->getConflicts->map = "function(doc) {
+$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
+$obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts));
}
}";
- // http://stackoverflow.com/questions/646628/javascript-startswith
- $obj->views->score->map = 'if(!String.prototype.startsWith){
+// http://stackoverflow.com/questions/646628/javascript-startswith
+$obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
@@ -144,7 +132,7 @@
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
}
}';
- $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
+$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
@@ -164,22 +152,20 @@
emit("total", 1);
}
}';
- $obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
+$obj->views->scoreHas->reduce = 'function (key, values, rereduce) {
return sum(values);
}';
- $obj->views->fieldNames->map = '
+$obj->views->fieldNames->map = '
function(doc) {
for(var propName in doc) {
emit(propName, doc._id);
}
}';
- $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
+$obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
return values.length;
}';
- // allow safe updates (even if slightly slower due to extra: rev-detection check).
- $db->save($obj, true);
-
-
+// allow safe updates (even if slightly slower due to extra: rev-detection check).
+$db->save($obj, true);
?>
--- a/couchdb/SetteeDatabase.class.php
+++ /dev/null
@@ -1,306 +1,1 @@
-<?php
-/**
-* Databaase class.
-*/
-class SetteeDatabase {
-
- /**
- * Base URL of the CouchDB REST API
- */
- private $conn_url;
-
- /**
- * HTTP REST Client instance
- */
- protected $rest_client;
-
- /**
- * Name of the database
- */
- private $dbname;
-
- /**
- * Default constructor
- */
- function __construct($conn_url, $dbname) {
- $this->conn_url = $conn_url;
- $this->dbname = $dbname;
- $this->rest_client = SetteeRestClient::get_instance($this->conn_url);
- }
-
-
- /**
- * Get UUID from CouchDB
- *
- * @return
- * CouchDB-generated UUID string
- *
- */
- function gen_uuid() {
- $ret = $this->rest_client->http_get('_uuids');
- return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking
- }
-
- /**
- * Create or update a document database
- *
- * @param $document
- * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically.
- *
- * <p>If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation).
- * If "_id" is missing, CouchDB will be used to generate a UUID.
- *
- * <p>If $document has a "_rev" property (revision), document will be updated, rather than creating a new document.
- * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be
- * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but
- * not provide "_id" since that is an invalid input.
- *
- * @param $allowRevAutoDetection
- * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision
- * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and
- * therefore can make save() operation slightly slower if such auto-detection is not required.
- *
- * @return
- * document object with the database id (uuid) and revision attached;
- *
- * @throws SetteeCreateDatabaseException
- */
- function save($document, $allowRevAutoDetection = false) {
- if (is_string($document)) {
- $document = json_decode($document);
- }
-
- // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter)
- if(is_array($document)) {
- $document = (object) $document;
- }
-
- if (empty($document->_id) && empty($document->_rev)) {
- $id = $this->gen_uuid();
- }
- elseif (empty($document->_id) && !empty($document->_rev)) {
- throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id");
- }
- else {
- $id = $document->_id;
-
- if ($allowRevAutoDetection) {
- try {
- $rev = $this->get_rev($id);
- } catch (SetteeRestClientException $e) {
- // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error
- }
- if (!empty($rev)) {
- $document->_rev = $rev;
- }
- }
- }
-
- $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
- $document_json = json_encode($document, JSON_NUMERIC_CHECK);
-
- $ret = $this->rest_client->http_put($full_uri, $document_json);
-
- $document->_id = $ret['decoded']->id;
- $document->_rev = $ret['decoded']->rev;
-
- return $document;
- }
-
- /**
- * @param $doc
- * @param $name
- * @param $content
- * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for
- * you, so you don't have to do it.
- * @param $mime_type
- * Optional. Will be auto-detected if not provided
- * @return void
- */
- public function add_attachment($doc, $name, $content, $mime_type = null) {
- if (empty($doc->_attachments) || !is_object($doc->_attachments)) {
- $doc->_attachments = new stdClass();
- }
-
- if (empty($mime_type)) {
- $mime_type = $this->rest_client->content_mime_type($content);
- }
-
- $doc->_attachments->$name = new stdClass();
- $doc->_attachments->$name->content_type = $mime_type;
- $doc->_attachments->$name->data = base64_encode($content);
- }
-
- /**
- * @param $doc
- * @param $name
- * @param $file
- * Full path to a file (e.g. as returned by PHP's realpath function).
- * @param $mime_type
- * Optional. Will be auto-detected if not provided
- * @return void
- */
- public function add_attachment_file($doc, $name, $file, $mime_type = null) {
- $content = file_get_contents($file);
- $this->add_attachment($doc, $name, $content, $mime_type);
- }
-
- /**
- *
- * Retrieve a document from CouchDB
- *
- * @throws SetteeWrongInputException
- *
- * @param $id
- * Unique ID (usually: UUID) of the document to be retrieved.
- * @return
- * database document in PHP object format.
- */
- function get($id) {
- if (empty($id)) {
- throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
- }
-
- $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
-$full_uri = str_replace("%3Frev%3D","?rev=",$full_uri);
- $ret = $this->rest_client->http_get($full_uri);
- return $ret['decoded'];
- }
-
- /**
- *
- * Get the latest revision of a document with document id: $id in CouchDB.
- *
- * @throws SetteeWrongInputException
- *
- * @param $id
- * Unique ID (usually: UUID) of the document to be retrieved.
- * @return
- * database document in PHP object format.
- */
- function get_rev($id) {
- if (empty($id)) {
- throw new SetteeWrongInputException("Error: Can't query a document without a uuid.");
- }
-
- $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
- $headers = $this->rest_client->http_head($full_uri);
- if (empty($headers['Etag'])) {
- throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag");
- }
- $etag = str_replace('"', '', $headers['Etag']);
- return $etag;
- }
-
- /**
- * Delete a document
- *
- * @param $document
- * a PHP object or JSON representation of the document that has _id and _rev fields.
- *
- * @return void
- */
- function delete($document) {
- if (!is_object($document)) {
- $document = json_decode($document);
- }
-
- $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev;
- $this->rest_client->http_delete($full_uri);
- }
-
-
- /*----------------- View-related functions --------------*/
-
- /**
- * Create a new view or update an existing one.
- *
- * @param $design_doc
- * @param $view_name
- * @param $map_src
- * Source code of the map function in Javascript
- * @param $reduce_src
- * Source code of the reduce function in Javascript (optional)
- * @return void
- */
- function save_view($design_doc, $view_name, $map_src, $reduce_src = null) {
- $obj = new stdClass();
- $obj->_id = "_design/" . urlencode($design_doc);
- $view_name = urlencode($view_name);
- $obj->views->$view_name->map = $map_src;
- if (!empty($reduce_src)) {
- $obj->views->$view_name->reduce = $reduce_src;
- }
-
- // allow safe updates (even if slightly slower due to extra: rev-detection check).
- return $this->save($obj, true);
- }
-
- /**
- * Create a new view or update an existing one.
- *
- * @param $design_doc
- * @param $view_name
- * @param $key
- * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes
- * that first element is startkey, second: endkey.
- * @param $descending
- * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change
- * order you also need to swap startkey and endkey values!
- *
- * @return void
- */
- function get_view($design_doc, $view_name, $key = null, $descending = false) {
- $id = "_design/" . urlencode($design_doc);
- $view_name = urlencode($view_name);
- $id .= "/_view/$view_name";
-
- $data = array();
- if (!empty($key)) {
- if (is_string($key)) {
- $data = "key=" . '"' . $key . '"';
- }
- elseif (is_array($key)) {
- list($startkey, $endkey) = $key;
- $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"';
- }
-
- if ($descending) {
- $data .= "&descending=true";
- }
- }
-
-
-
- if (empty($id)) {
- throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid.");
- }
-
- $full_uri = $this->dbname . "/" . $this->safe_urlencode($id);
-$full_uri = str_replace("%253Fgroup%253Dtrue","?group=true",$full_uri);
- $ret = $this->rest_client->http_get($full_uri, $data);
- return $ret['decoded'];
-
- }
-
- /**
- * @param $id
- * @return
- * return a properly url-encoded id.
- */
- private function safe_urlencode($id) {
- //-- System views like _design can have "/" in their URLs.
- $id = rawurlencode($id);
- if (substr($id, 0, 1) == '_') {
- $id = str_replace('%2F', '/', $id);
- }
- return $id;
- }
-
- /** Getter for a database name */
- function get_name() {
- return $this->dbname;
- }
-
-}
--- a/couchdb/settee
+++ /dev/null
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -12,6 +12,7 @@
$disclogs = 0;
$red = 0;
$green = 0;
+$yellow = 0;
$orange = 0;
try {
$rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
@@ -19,46 +20,56 @@
if ($rows) {
foreach ($rows as $row) {
+ if ((!isset($row->value->status) || $row->value->status != "suspended") && isset($row->value->foiEmail)) {
+ echo "<tr><td>";
+ if (isset($row->value->website)) echo "<a href='" . $row->value->website . "'>";
+ echo "<b>" . $row->value->name . "</b>";
+ if (isset($row->value->website)) echo "</a>";
+ if ($ENV == "DEV")
+ echo "<br>(" . $row->id . ")";
+ echo "</td>\n";
+ $agencies++;
- echo "<tr><td><b>" . $row->value->name . "</b>";
- if ($ENV == "DEV")
- echo "<br>(" . $row->id . ")";
- echo "</td>\n";
-$agencies++;
-
- echo "<td>";
- if (isset($row->value->FOIDocumentsURL)) {
- $disclogs++;
- echo '<a href="' . $row->value->FOIDocumentsURL . '">'
- . $row->value->FOIDocumentsURL . '</a>';
- if ($ENV == "DEV")
- echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
- . 'view local copy</a>)</small>';
- } else {
- echo "<font color='red'><abbr title='No'>✘</abbr></font>";
- }
- echo "</td>\n<td>";
- if (isset($row->value->FOIDocumentsURL)) {
- if (file_exists("./scrapers/" . $row->id . '.py')) {
- echo "<font color='green'><abbr title='Yes'>✔</abbr></font>";
- $green++;
- } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
- echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
- $orange++;
+ echo "<td>";
+ if (isset($row->value->FOIDocumentsURL)) {
+ $disclogs++;
+ echo '<a href="' . $row->value->FOIDocumentsURL . '">'
+ . $row->value->FOIDocumentsURL . '</a>';
+ if ($ENV == "DEV")
+ echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
+ . 'view local copy</a>)</small>';
} else {
echo "<font color='red'><abbr title='No'>✘</abbr></font>";
- $red++;
}
+ echo "</td>\n<td>";
+ if (isset($row->value->FOIDocumentsURL)) {
+ if (file_exists("./scrapers/" . $row->id . '.py')) {
+ echo "<font color='green'><abbr title='Yes'>✔</abbr></font>";
+ $green++;
+ } else if (file_exists("./scrapers/" . $row->id . '.txt')) {
+ if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
+ echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
+ $yellow++;
+ } else {
+ echo file_get_contents("./scrapers/" . $row->id . '.txt');
+ echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
+ $orange++;
+ }
+ } else {
+ echo "<font color='red'><abbr title='No'>✘</abbr></font>";
+ $red++;
+ }
+ }
+ echo "</td></tr>\n";
}
- echo "</td></tr>\n";
}
}
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
}
echo "</table>";
-echo $agencies." agencies, ".round(($disclogs/$agencies)*100)."% with disclosure logs; "
-.round(($green/$disclogs)*100)."% logs with scrapers ".round(($red/$disclogs)*100)."% logs without scrapers ".round(($orange/$disclogs)*100)."% logs Work-In-Progress scrapers ";
+echo $agencies . " agencies, " . round(($disclogs / max($agencies, 1)) * 100) . "% with disclosure logs; " // max(..., 1) avoids dividing by zero when nothing was listed
+ . round(($green / max($disclogs, 1)) * 100) . "% logs with scrapers " . round(($red / max($disclogs, 1)) * 100) . "% logs without scrapers " . round(($orange / max($disclogs, 1)) * 100) . "% logs Work-In-Progress scrapers ";
include_footer_documents();
?>
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -9,6 +9,7 @@
import dateutil
from dateutil.parser import *
from datetime import *
+import codecs
class GenericDisclogScraper(object):
__metaclass__ = abc.ABCMeta
@@ -92,7 +93,8 @@
return table.find_all('tr')
def getDate(self, content, entry, doc):
date = ''.join(content.stripped_strings).strip()
- date = date.replace("Octber","October")
+ (a,b,c) = date.partition("(")
+ date = self.remove_control_chars(a.replace("Octber","October"))
print date
edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
@@ -119,7 +121,7 @@
columns = row.find_all('td')
if len(columns) == self.getColumnCount():
(id, date, title, description, notes) = self.getColumns(columns)
- print ''.join(id.stripped_strings)
+ print self.remove_control_chars(''.join(id.stripped_strings))
if id.string == None:
hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
else:
--- a/documents/index.php
+++ b/documents/index.php
@@ -1,11 +1,11 @@
<?php
+
include('template.inc.php');
include_header_documents("");
include_once('../include/common.inc.php');
+$startkey = (isset($_REQUEST['start_key']) ? $_REQUEST['start_key'] : '9999-99-99');
?>
<?php
-
-
$agenciesdb = $server->get_db('disclosr-agencies');
@@ -15,17 +15,17 @@
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
try {
- $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00'), true)->rows;
-
-
+ $rows = $foidocsdb->get_view("app", "byDate", Array($startkey, '0000-00-00'), true, 20)->rows;
if ($rows) {
- foreach ($rows as $row) {
-echo displayLogEntry($row,$idtoname);
+ foreach ($rows as $key => $row) {
+ echo displayLogEntry($row, $idtoname);
+ $endkey = $row->key;
}
}
} catch (SetteeRestClientException $e) {
setteErrorHandler($e);
}
+if (isset($endkey)) echo "<a href='?start_key=" . urlencode($endkey) . "'>next page</a>"; // $endkey is only set once at least one row was rendered
include_footer_documents();
?>
--- /dev/null
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.txt
@@ -1,1 +1,2 @@
+pdf
--- /dev/null
+++ b/documents/scrapers/0372b19123076338d483f624c433727b.txt
@@ -1,1 +1,2 @@
+docx
--- /dev/null
+++ b/documents/scrapers/0ae822d1a748e60d90f0b79b97d5a3e5.txt
@@ -1,1 +1,2 @@
+ACMA style
--- /dev/null
+++ b/documents/scrapers/0ced9dd2de36100c3cabdb7fd8e843a9.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1d404c4934f74feacd00dcb434e7c10a.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "cphMain_C001_Col01").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/2cac2cd1f42687db2d04fa20b5b6a538.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (id, title, date) = columns
+ return (id, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/3e2f110af49d62833a835bd257771ffb.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/4c57389dda9bd454bcb08bc1e5ed87bf.txt
@@ -1,1 +1,2 @@
+parent
--- /dev/null
+++ b/documents/scrapers/4d2af2dcc72f1703bbf04b13b03720a8.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/525c3953187da08cd702359b2fc2997f.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/54cbb3439276062b7a9f007f9f69d1f6.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ #def getTable(self,soup):
+ # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table
+ def getColumnCount(self):
+ return 4
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, title, description, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- a/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
+++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt
@@ -1,2 +1,1 @@
-see parent dhs
-
+no disclog
--- /dev/null
+++ b/documents/scrapers/655d4d67333536bda18d68265dfe7e80.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows read from the element with id "node-30609"."""
+
+    def getTable(self, soup):
+        """Return the element of ``soup`` whose id is "node-30609"."""
+        table = soup.find(id="node-30609")
+        return table
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/656f7bb1884f4b9d31ebe2a5f5f58064.txt
@@ -1,1 +1,2 @@
+list style
--- /dev/null
+++ b/documents/scrapers/65ec17101b00519e6d88c5a9f33c2c46.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for three-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare three cells per row, matching the unpacking in getColumns."""
+        return 3
+
+    def getColumns(self, columns):
+        """Unpack (id, date, description); the description fills slots three and four, slot five is None."""
+        (entry_id, entry_date, description) = columns
+        return (entry_id, entry_date, description, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/69d59284ef0ccd2677394d82d3292abc.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows in the table under id "centercontent"."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with id "centercontent"."""
+        container = soup.find(id = "centercontent")
+        return container.table
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/6ac74a939f420c6194ae29224809734a.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/768bbbfb34115873af361af8519b38a9.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/794ea270edc9aa4f70f2a84bbc5ecc7a.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows in the table under id "cphMain_C001_Col01"."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with id "cphMain_C001_Col01"."""
+        container = soup.find(id = "cphMain_C001_Col01")
+        return container.table
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/795e7a8afb39a420360aa207b0cb1306.txt
@@ -1,1 +1,2 @@
+no disclog
--- /dev/null
+++ b/documents/scrapers/7b39ce7f362a0af9a711eaf223943eea.txt
@@ -1,1 +1,2 @@
+no disclog
--- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
@@ -1,1 +1,1 @@
-
+acma style
--- /dev/null
+++ b/documents/scrapers/7ec28d7d97fcf493b1350acd03e3642e.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for three-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare three cells per row, matching the unpacking in getColumns."""
+        return 3
+
+    def getColumns(self, columns):
+        """Unpack (date, title, description); the date fills slots one and two, slot five is None."""
+        (entry_date, title, description) = columns
+        return (entry_date, entry_date, title, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/7f55a3c42ad7460254906aa043a6e324.py
@@ -1,1 +1,24 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for three-cell rows with a custom title; getTable is not overridden here."""
+
+    def getTitle(self, content, entry, doc):
+        """Store the first stripped string of ``content`` in ``doc`` under 'title'."""
+        first_string = next(content.stripped_strings)
+        doc.update({'title': first_string})
+
+    def getColumnCount(self):
+        """Declare three cells per row, matching the unpacking in getColumns."""
+        return 3
+
+    def getColumns(self, columns):
+        """Unpack (date, id, description); the description fills slots three and four, slot five is None."""
+        (entry_date, entry_id, description) = columns
+        return (entry_id, entry_date, description, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.txt
@@ -1,1 +1,2 @@
+pdf
--- /dev/null
+++ b/documents/scrapers/8aae1c28db7f3ce10f232a0137be6bb2.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
@@ -1,1 +1,86 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import codecs
+#http://www.doughellmann.com/PyMOTW/abc/
+class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for the two-cell table with id "TwoColumnSorting".
+
+    Descriptions and links are gathered by following the links in each entry.
+    """
+
+    def getDescription(self, content, entry, doc):
+        """Fill doc['description'] and doc['links'] from the pages linked by ``entry``.
+
+        Every <a href> in ``entry`` is resolved against self.getURL() and
+        fetched through scrape.fetchURL.  For HTML/XML responses the text and
+        the links inside the element with id "divFullWidthColumn" are
+        collected.  ``doc`` is only updated with values that are non-empty.
+        """
+        links = []
+        description = ""
+        markup_types = ("text/html", "application/xhtml+xml", "application/xml")
+        for atag in entry.find_all('a'):
+            # has_attr is the supported spelling of the deprecated Tag.has_key.
+            if not atag.has_attr('href'):
+                continue
+            link = scrape.fullurl(self.getURL(), atag['href'])
+            (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+            if htcontent is None or mime_type not in markup_types:
+                continue
+            # http://www.crummy.com/software/BeautifulSoup/documentation.html
+            soup = BeautifulSoup(htcontent)
+            detail = soup.find(id="divFullWidthColumn")
+            if detail is None:
+                # find() returns None when the container is missing; skip this
+                # page instead of failing the whole scrape on an AttributeError.
+                continue
+            # NOTE(review): the pieces are joined without any separator, unlike
+            # OldScraperImplementation which appends " \n" -- confirm intended.
+            for text in detail.stripped_strings:
+                description = description + text.encode('ascii', 'ignore')
+            for detail_link in detail.find_all("a"):
+                if detail_link.has_attr('href'):
+                    links.append(scrape.fullurl(link, detail_link['href']))
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        """Declare two cells per row, matching the unpacking in getColumns."""
+        return 2
+
+    def getTable(self, soup):
+        """Return the element of ``soup`` whose id is "TwoColumnSorting"."""
+        return soup.find(id="TwoColumnSorting")
+
+    def getColumns(self, columns):
+        """Unpack (title, date); the title fills slots one, three and four, slot five is None."""
+        (title, date) = columns
+        return (title, date, title, title, None)
+class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for the two-cell listing with class "doc-list".
+
+    Descriptions and links are gathered by following the links in each entry.
+    """
+
+    def getDescription(self, content, entry, doc):
+        """Fill doc['description'] and doc['links'] from the pages linked by ``entry``.
+
+        Every <a href> in ``entry`` is resolved against self.getURL() and
+        fetched through scrape.fetchURL.  For HTML/XML responses the text and
+        the links inside the element with id "content-item" are collected.
+        ``doc`` is only updated with values that are non-empty.
+        """
+        links = []
+        description = ""
+        markup_types = ("text/html", "application/xhtml+xml", "application/xml")
+        for atag in entry.find_all('a'):
+            # has_attr is the supported spelling of the deprecated Tag.has_key.
+            if not atag.has_attr('href'):
+                continue
+            link = scrape.fullurl(self.getURL(), atag['href'])
+            (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+            if htcontent is None or mime_type not in markup_types:
+                continue
+            # http://www.crummy.com/software/BeautifulSoup/documentation.html
+            soup = BeautifulSoup(htcontent)
+            detail = soup.find(id="content-item")
+            if detail is None:
+                # find() returns None when the container is missing; skip this
+                # page instead of failing the whole scrape on an AttributeError.
+                continue
+            for text in detail.stripped_strings:
+                description = description + text + " \n"
+            for detail_link in detail.find_all("a"):
+                if detail_link.has_attr('href'):
+                    links.append(scrape.fullurl(link, detail_link['href']))
+        # Written once, after all links are processed; the original repeated
+        # this pair of updates twice with identical effect.
+        if links != []:
+            doc.update({'links': links})
+        if description != "":
+            doc.update({'description': description})
+
+    def getColumnCount(self):
+        """Declare two cells per row, matching the unpacking in getColumns."""
+        return 2
+
+    def getTable(self, soup):
+        """Return the element of ``soup`` whose class is "doc-list"."""
+        return soup.find(class_="doc-list")
+
+    def getColumns(self, columns):
+        """Unpack (date, title); the title fills slots one, three and four, slot five is None."""
+        (date, title) = columns
+        return (title, date, title, title, None)
+
+if __name__ == '__main__':
+    # Report the class relationship for both implementations, new site first.
+    base = genericScrapers.GenericOAICDisclogScraper
+    for impl in (NewScraperImplementation, OldScraperImplementation):
+        print 'Subclass:', issubclass(impl, base)
+        print 'Instance:', isinstance(impl(), base)
+    # NOTE(review): the new-site scrape is disabled; only the archive is scraped.
+    #NewScraperImplementation().doScrape()
+    archive_scraper = OldScraperImplementation()
+    archive_scraper.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
+    archive_scraper.doScrape()
+# old site too
+
--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt
+++ /dev/null
@@ -1,49 +1,1 @@
-import sys,os
-sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
-import genericScrapers
-import scrape
-from bs4 import BeautifulSoup
-#http://www.doughellmann.com/PyMOTW/abc/
-class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
- def getDescription(self,content, entry,doc):
- link = None
- links = []
- description = ""
- for atag in entry.find_all('a'):
- if atag.has_key('href'):
- link = scrape.fullurl(self.getURL(),atag['href'])
- (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
- if htcontent != None:
- if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
- # http://www.crummy.com/software/BeautifulSoup/documentation.html
- soup = BeautifulSoup(htcontent)
- for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'):
- if row != None:
- rowtitle = row.find('th').string
- description = description + "\n" + rowtitle + ": "
- for text in row.find('td').stripped_strings:
- description = description + text
- for atag in row.find_all("a"):
- if atag.has_key('href'):
- links.append(scrape.fullurl(link,atag['href']))
-
- if links != []:
- doc.update({'links': links})
- if description != "":
- doc.update({ 'description': description})
-
- def getColumnCount(self):
- return 2
- def getTable(self,soup):
- return soup.find(class_ = "ms-rteTable-GreyAlternating")
- def getColumns(self,columns):
- (date, title) = columns
- return (title, date, title, title, None)
-
-if __name__ == '__main__':
- print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
- print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
- ScraperImplementation().doScrape()
-# old site too http://archive.treasury.gov.au/content/foi_publications.asp
-
--- /dev/null
+++ b/documents/scrapers/9282306e244040c9e4ae5705f06f9548.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for four-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare four cells per row, matching the unpacking in getColumns."""
+        return 4
+
+    def getColumns(self, columns):
+        """Unpack (id, date, title, description) and append None as the fifth slot."""
+        (entry_id, entry_date, title, description) = columns
+        return (entry_id, entry_date, title, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/93ce83e46f5c2c4ca1b7f199b59b4bd2.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for four-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare four cells per row, matching the unpacking in getColumns."""
+        return 4
+
+    def getColumns(self, columns):
+        """Unpack (id, date, logdate, description); logdate is dropped.
+
+        The description fills slots three and four, and slot five is None.
+        """
+        (entry_id, entry_date, log_date, description) = columns
+        return (entry_id, entry_date, description, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/99328d76c8efb56ff3f1da79b9d1b17f.txt
@@ -1,1 +1,2 @@
+acma style
--- /dev/null
+++ b/documents/scrapers/9961dc45e046288ad1431941653af20c.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/9f4815bfdcb918a036e4bb43a30f8d77.txt
@@ -1,1 +1,1 @@
-
+no disclog
--- /dev/null
+++ b/documents/scrapers/a1ab9c80ab473958676c62c1a25dd502.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/a43467fe82b840a353b380c4d7462a4c.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for three-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare three cells per row, matching the unpacking in getColumns."""
+        return 3
+
+    def getColumns(self, columns):
+        """Unpack (date, title, description); the date fills slots one and two, slot five is None."""
+        (entry_date, title, description) = columns
+        return (entry_date, entry_date, title, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for four-cell rows whose first cell is ignored."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with id "ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2"."""
+        container = soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2")
+        return container.table
+
+    def getColumnCount(self):
+        """Declare four cells per row, matching the unpacking in getColumns."""
+        return 4
+
+    def getColumns(self, columns):
+        """Unpack (blank, id, title, date); the first cell is dropped.
+
+        The title fills slots three and four, and slot five is None.
+        """
+        (unused, entry_id, title, entry_date) = columns
+        return (entry_id, entry_date, title, title, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/b91f866928eb61959dbbab56313214fc.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- a/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
+++ b/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt
@@ -1,2 +1,1 @@
-# does not have any disclog entries or table
-
+no disclog
--- /dev/null
+++ b/documents/scrapers/bc91b878e2317fa231cc2c512e2027f0.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for four-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare four cells per row, matching the unpacking in getColumns."""
+        return 4
+
+    def getColumns(self, columns):
+        """Unpack (id, date, title, description) and append None as the fifth slot."""
+        (entry_id, entry_date, title, description) = columns
+        return (entry_id, entry_date, title, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/bf16d4ba0d306ee03e5a1d32aaba3da1.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows in a table located by its summary attribute."""
+
+    def getTable(self, soup):
+        """Return the element whose summary is "This table shows every FOI request to date."."""
+        table = soup.find(summary="This table shows every FOI request to date.")
+        return table
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- a/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
+++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt
@@ -1,1 +1,1 @@
-no disclog yet
+no disclog
--- a/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
+++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt
@@ -1,2 +1,1 @@
-no log
-
+no disclog
--- /dev/null
+++ b/documents/scrapers/cca17a34bd490474a316fe0a1ca03c25.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows in the table under a RichHtmlField wrapper."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with id "ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField"."""
+        container = soup.find(id = "ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField")
+        return container.table
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/cde8eb4a2e40abb18d8b28d3b85bc9b0.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for seven-cell rows in a table located by its summary attribute."""
+
+    def getTable(self, soup):
+        """Return the element whose summary is "This table lists the schedule of upcoming courses."."""
+        # NOTE(review): the summary text describes a course schedule rather
+        # than FOI requests -- confirm this is the intended table.
+        table = soup.find(summary="This table lists the schedule of upcoming courses.")
+        return table
+
+    def getColumnCount(self):
+        """Declare seven cells per row, matching the unpacking in getColumns."""
+        return 7
+
+    def getColumns(self, columns):
+        """Unpack seven cells and return (id, date, title, description, notes).
+
+        The link and deldate cells are dropped.
+        """
+        (entry_id, entry_date, title, description, link, deleted_on, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/ce34d1e9b55911e4272d2d388821f311.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/d1296c366287f7a9faedf235c7e6df01.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for seven-cell rows in the table under id "main"."""
+
+    def getTable(self, soup):
+        """Return the first table inside the element with id "main"."""
+        container = soup.find(id="main")
+        return container.table
+
+    def getColumnCount(self):
+        """Declare seven cells per row, matching the unpacking in getColumns."""
+        return 7
+
+    def getColumns(self, columns):
+        """Unpack seven cells and return (id, date, title, description, notes).
+
+        The link and deldate cells are dropped.
+        """
+        (entry_id, entry_date, title, description, link, deleted_on, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- a/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt
@@ -1,1 +1,1 @@
-
+acma style
--- /dev/null
+++ b/documents/scrapers/e0614dc3a9e25d375370ffd82f7165ac.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for six-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare six cells per row, matching the unpacking in getColumns."""
+        return 6
+
+    def getColumns(self, columns):
+        """Unpack six cells and return (id, date, title, description, notes).
+
+        The deldate cell is dropped.
+        """
+        (entry_id, entry_date, title, description, deleted_on, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/e64c71f4986f78675a252104c5a5f359.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for five-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare five cells per row, matching the unpacking in getColumns."""
+        return 5
+
+    def getColumns(self, columns):
+        """Pass the five cells through as (id, date, title, description, notes)."""
+        (entry_id, entry_date, title, description, notes) = columns
+        return (entry_id, entry_date, title, description, notes)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt
@@ -1,1 +1,1 @@
-
+no disclog
--- /dev/null
+++ b/documents/scrapers/ee30aad97f0bb32e74c4587404b67ce4.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+    """Disclog scraper for four-cell rows; getTable is not overridden here."""
+
+    def getColumnCount(self):
+        """Declare four cells per row, matching the unpacking in getColumns."""
+        return 4
+
+    def getColumns(self, columns):
+        """Unpack (id, title, date, description); date and title swap places, slot five is None."""
+        (entry_id, title, entry_date, description) = columns
+        return (entry_id, entry_date, title, description, None)
+
+if __name__ == '__main__':
+    # Report the class relationship, then run the scrape.
+    base = genericScrapers.GenericOAICDisclogScraper
+    print 'Subclass:', issubclass(ScraperImplementation, base)
+    print 'Instance:', isinstance(ScraperImplementation(), base)
+    ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/f189459fc43f941e0d4ecfba52c666f3.txt
@@ -1,1 +1,2 @@
+no disclog
--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -145,7 +145,7 @@
$result .= "</ul>";
}
- $result .= "<small><A href='".$row->value->url."'>View original source...</a> ID: ".$row->value->docID."</small>";
+ $result .= "<small><A href='".$row->value->url."'>View original source...</a> ID: ".strip_tags($row->value->docID)."</small>";
$result .= "</div>";
return $result;
}