FOI stats importer fixed


Former-commit-id: 81a6a149848e27565b7a7052d2a7ff4e5aaa9310

<?php
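// Import the quarterly FOI request/review statistics CSV from data.gov.au
// into the "statistics" field of each matching agency document in the
// disclosr-agencies CouchDB database.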
   
require_once '../include/common.inc.php';
   
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$stats = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}
$row = 0;
$headers = Array();
// source: http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/
if (($handle = fopen("FOI-quarterly-requests-and-reviews-2011-12.csv", "r")) !== FALSE) {
    while (($data = fgetcsv($handle, 10000, ",")) !== FALSE) {
        if ($row >= 1) {
            // print_r($data);
            $name = trim($data[2]);
            // echo "$name <br>";
            if ($data[0] != "TOTALS" && $data[0] != "") {
                if (isset($nametoid[$name])) {
                    $id = $nametoid[$name];
                    $timePeriod = $data[0] . "-Q" . $data[1];
   
echo "$timePeriod <br>"; // echo "$timePeriod <br>";
unset($data[0]); unset($data[0]);
unset($data[1]); unset($data[1]);
unset($data[2]); unset($data[2]);
unset($data[3]); unset($data[3]);
unset($data[4]); unset($data[4]);
unset($data[5]); unset($data[5]);
unset($data[6]); unset($data[6]);
unset($data[7]); unset($data[7]);
unset($data[8]); unset($data[8]);
   
//echo $id . "<br>" . PHP_EOL; //echo $id . "<br>" . PHP_EOL;
$result = Array("source" => "http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/"); $result = Array("source" => "http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/");
foreach ($data as $key => $datum) { foreach ($data as $key => $datum) {
if ($datum != 0) { if ($datum != 0) {
                            // TODO: prefix header with "FOI"
                            if (isset($stats[$id][$timePeriod][$key])) $datum += $stats[$id][$timePeriod][$key];
                            $result[trim($headers[$key])] = $datum;
                        }
                    }
                    $stats[$id][$timePeriod] = $result;
                    // TODO: merge if already exists
                    //print_r($stats);
                } else {
                    echo "<br>ERROR: NAME MISSING FROM ID LIST<br><br> $row" . PHP_EOL;
                    print_r($data);
                    die();
                }
            }
        } else {
            $headers = $data;
            //print_r($headers);
        }
        $row++;
    }
    fclose($handle);
}
  echo "all stats loaded successfuly";
foreach ($stats as $id => $stat) {
    echo $id . "<br>" . PHP_EOL;
    $doc = $db->get($id);
    echo $doc->name . "<br>" . PHP_EOL;
    // print_r($stat);
    // print_r($doc);
    $changed = false;
    if (!isset($doc->statistics)) {
        $changed = true;
        $doc->statistics = Array();
    } else {
        $doc->statistics = object_to_array($doc->statistics);
    }
    foreach ($stat as $timePeriod => $value) {
        if (!isset($doc->statistics["foiRequests"][$timePeriod])
            || $doc->statistics["foiRequests"][$timePeriod] != $value
        ) {
            $changed = true;
            $doc->statistics["foiRequests"][$timePeriod] = $value;
        }
    }
    if ($changed) {
        $db->save($doc);
    } else {
        echo "not changed" . "<br>" . PHP_EOL;
    }
    //print_r($doc);die();
}
?>
   
<?php
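// (Re)create the CouchDB design documents (views) for the disclosr databases.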
   
require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() {
   
$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { if (doc.title != \"Disclosure Log Updated\") { emit(doc.date, doc); } };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = 'function(doc) { for(var propName in doc) { emit(propName, doc._id); }}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { return values.length; }';
// allow safe updates (even if slightly slower due to the extra rev-detection check).
$foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');

$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";

$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->datasetGroups->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n doc.metadata[\"data.gov.au Category\"] && doc.metadata[\"data.gov.au Category\"].forEach(function(tag) {\n emit(tag, doc.url); \n });\n}\n}";
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
    if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc);
    }
};";
$obj->views->byDeptStateName->map = "function(doc) {
    if (doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc._id);
    }
};";
$obj->views->parentOrgs->map = "function(doc) {
    if (doc.parentOrg) {
        emit(doc._id, doc.parentOrg);
    }
};";
$obj->views->byName->map = 'function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        emit(doc.name, doc._id);
        if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
            emit(doc.shortName, doc._id);
        }
        for (name in doc.otherNames) {
            if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                emit(doc.otherNames[name], doc._id);
            }
        }
        for (name in doc.foiBodies) {
            if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
                emit(doc.foiBodies[name], doc._id);
            }
        }
        for (name in doc.positions) {
            if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
                emit(doc.positions[name], doc._id);
            }
        }
    }
};';

$obj->views->foiEmails->map = "function(doc) {
    emit(doc._id, doc.foiEmail);
};";

$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) {
    // Date.parse returns a timestamp in milliseconds (or NaN), not a Date object
    var lastScrape = Date.parse(doc.metadata.lastScraped);
    var today = new Date();
    if (!lastScrape || lastScrape + 1000 != today.getTime()) {
        emit(doc._id, doc);
    }
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
    if (doc._conflicts) {
        emit(null, [doc._rev].concat(doc._conflicts));
    }
}";
$obj->views->getStatistics->map = "
function (doc) {
    if (doc.statistics) {
        for (var statisticSet in doc.statistics) {
            for (var statisticPeriod in doc.statistics[statisticSet]) {
                if (doc.statistics[statisticSet][statisticPeriod]['value']) {
                    emit([statisticSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
                } else {
                    for (var statisticSubSet in doc.statistics[statisticSet][statisticPeriod]) {
                        if (statisticSubSet != 'source' && statisticSubSet != 'value') {
                            emit([statisticSubSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod][statisticSubSet]);
                        }
                    }
                }
            }
        }
    }
}";
$obj->views->getStatistics->reduce = '_sum';
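// Illustrative example (assumed CouchDB host/port, matching the curl call in
// the scraper shell script): once saved, the view can be queried grouped by
// statistic set and period, e.g.
//   curl "http://localhost:5984/disclosr-agencies/_design/app/_view/getStatistics?group=true"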
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}

function(doc) {
    count = 0;
    if (doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                count++;
            }
        }
        portfolio = doc.parentOrg;
        if (doc.orgType == "FMA-DepartmentOfState") {
            portfolio = doc._id;
        }
        if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
            portfolio = doc.orgType;
        }
        emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
    }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}
function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}';
$obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = '
function(doc) {
    for(var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to the extra rev-detection check).
$db->save($obj, true);
?>
   
<?php
include('template.inc.php');
include_header_documents("Charts");
include_once('../include/common.inc.php');
$agenciesdb = $server->get_db('disclosr-agencies');

$idtoname = Array();
$idtofoirequestssuccessful = Array();
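// Tally, per agency, the FOI requests granted in full or in part, using the
// statistics imported from data.gov.au.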
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
    $foirequestssuccessful = 0;
    if (isset($row->value->statistics->foiRequests)) {
        foreach ($row->value->statistics->foiRequests as $statperiod) {
            $statperiod = object_to_array($statperiod);
            if (isset($statperiod["Requests for other information granted in full"])) $foirequestssuccessful += $statperiod["Requests for other information granted in full"];
            if (isset($statperiod["Requests for other information granted in part"])) $foirequestssuccessful += $statperiod["Requests for other information granted in part"];
        }
    }
    $idtofoirequestssuccessful[$row->id] = $foirequestssuccessful;
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
   
?>
<div class="foundation-header">
    <h1><a href="about.php">Charts</a></h1>
    <h4 class="subheader"></h4>
</div>
<div id="bydate" style="width:1000px;height:300px;"></div>
<div id="byagency" style="width:1000px;height:1400px;"></div>
<script id="source"> <script id="source">
window.onload = function () { window.onload = function () {
$(document).ready(function () { $(document).ready(function () {
var var
d1 = [], d1 = [],
options1, options1,
o1; o1;
   
<?php
try {
    $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true", null, false, false, true)->rows;

    $dataValues = Array();
    foreach ($rows as $row) {
        $dataValues[$row->key] = $row->value;
    }
    $i = 0;
    ksort($dataValues);
    foreach ($dataValues as $key => $value) {
        $date = date_create_from_format('Y-m-d', $key);
        if (date_format($date, 'U') != "") {
            echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
            // echo " emplabels.push('$key');" . PHP_EOL;
            $i++;
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
?>
   
   
            options1 = {
                xaxis: {
                    mode: 'time',
                    labelsAngle: 45
                },
                selection: {
                    mode: 'x'
                },
                HtmlText: false,
                title: 'Disclosure Log entries added by Date'
            };
   
            // Draw graph with default options, overwriting with passed options
            function drawGraph(opts) {

                // Clone the options, so the 'options' variable always stays intact.
                o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});

                // Return a new graph.
                return Flotr.draw(
                    document.getElementById("bydate"),
                    [ d1 ],
                    o1
                );
            }

            graph = drawGraph();

            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
                // Draw selected area
                graph = drawGraph({
                    xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
                    yaxis: { min: area.y1, max: area.y2 }
                });
            });

            // When graph is clicked, draw the graph with default area.
            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
                graph = drawGraph();
            });

        });
    };
   
    var d2 = [];
    var d3 = [];
    var agencylabels = [];
    function agencytrackformatter(obj) {
        return agencylabels[Math.floor(obj.y)] + " = " + obj.x;
    }
    function agencytickformatter(val, axis) {
        if (agencylabels[Math.floor(val)]) {
            return (agencylabels[Math.floor(val)]);
        } else {
            return "";
        }
    }
<?php
try {
    $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;
    function cmp($a, $b)
    {
        return $a->value > $b->value;
    }
    usort($rows, "cmp");

    $dataValues = Array();
    $i = 0;
    foreach ($rows as $row) {
        echo " d2.push([ $row->value,$i]);" . PHP_EOL;
  echo " d3.push([ ".$idtofoirequestssuccessful[$row->key].",$i]);" . PHP_EOL;
echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL; echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
   
$i++; $i++;
} }
} catch (SetteeRestClientException $e) { } catch (SetteeRestClientException $e) {
setteErrorHandler($e); setteErrorHandler($e);
} }
?>
    // Draw the graph
    Flotr.draw(
        document.getElementById("byagency"),
        [d2],
        {
            title: "Disclosure Log entries by Agency",
            bars: {
                show: true,
                horizontal: true,
                shadowSize: 0,
                barWidth: 0.5
            },
            mouse: {
                track: true,
                relative: true,
                trackFormatter: agencytrackformatter
            },
            yaxis: {
                minorTickFreq: 1,
                noTicks: agencylabels.length,
                showMinorLabels: true,
                tickFormatter: agencytickformatter
            },
            xaxis: {
                min: 0,
                autoscaleMargin: 1
            },
            legend: {
                show: true
            }
        }
    );
</script>
   
<?php
include_footer_documents();
?>
   
   
import sys
import os
   
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs
import zipfile
from lxml import etree  # assumed XML parser for the DOCX scraper below
from docx import getdocumenttext  # assumed legacy python-docx helper used by the DOCX scraper
   
import difflib
   
from StringIO import StringIO
   
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
   
   
class GenericDisclogScraper(object):
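    """Abstract base class for disclosure log scrapers: resolves the agency ID
    and disclosure log URL, and defines the doScrape() contract."""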
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None
   
    def remove_control_chars(self, input):
        return "".join([i for i in input if ord(i) in range(32, 127)])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL"""
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return
   
   
class GenericHTMLDisclogScraper(GenericDisclogScraper):
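    """Fallback scraper for HTML disclosure logs that are not structured as
    tables: records a diff against the previously fetched copy of the page."""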
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        content = rcontent
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
            diff = ""  # default if there is no previous copy to compare against
            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
            if last_attach != None:
                html_diff = difflib.HtmlDiff()
                diff = html_diff.make_table(last_attach.read().split('\n'),
                    content.split('\n'))
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated",
                "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericPDFDisclogScraper(GenericDisclogScraper):
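    """Scraper for PDF disclosure logs: extracts the text with pdfminer and
    saves it as a single document keyed by the hash of that text."""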
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        fp = StringIO()
        fp.write(content)

        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericDOCXDisclogScraper(GenericDisclogScraper):
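    """Scraper for Word (.docx) disclosure logs: extracts the paragraph text
    from word/document.xml and saves it keyed by the hash of that text."""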
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
            , self.getURL(), "foidocuments", self.getAgencyID())
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        ## Fetch all the text out of the document we just created
        paratextlist = getdocumenttext(document)
        # Make explicit unicode version
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        ## Join the document's text with two newlines under each paragraph
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)

        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"
   
   
class GenericRSSDisclogScraper(GenericDisclogScraper):
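    """Scraper for RSS disclosure logs: saves one document per feed entry."""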
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                    'url': entry.link, 'docID': entry.id,
                    "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        doc.update({'description': content.summary})

        return
   
   
class GenericOAICDisclogScraper(GenericDisclogScraper):
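    """Scraper for table-based (OAIC-style) disclosure logs: subclasses map
    table columns to (id, date, title, description, notes) via getColumns()."""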
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5
   
    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def findColumns(self, row):
        return row.find_all('td')
   
    def getDocHash(self, id, date, url):
        if id.string is None:
            print "no id, using date as hash"
            return scrape.mkhash(
                self.remove_control_chars(
                    url + (''.join(date.stripped_strings))))
        else:
            return scrape.mkhash(
                self.remove_control_chars(
                    url + (''.join(id.stripped_strings))))
   
    def getDate(self, content, entry, doc):
        strdate = ''.join(content.stripped_strings).strip()
        (a, b, c) = strdate.partition("(")
        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("Janrurary", "January").replace("1012", "2012"))
        print strdate
        try:
            edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        except ValueError:
            print >> sys.stderr, "ERROR date invalid %s " % strdate
            print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
            edate = date.today().strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return
   
    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links != []:
            doc.update({'links': links})
        return
   
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = self.findColumns(row)
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                            description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        dochash = self.getDocHash(id, date, url)
                        doc = foidocsdb.get(dochash)

                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                'agencyID': self.getAgencyID(),
                                'url': self.getURL(),
                                'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                'FOI request(in summary form)',
                                'Summary of FOI request received by the ASC',
                                'Summary of FOI request received by agency/minister',
                                'Description of Documents Requested', 'FOI request',
                                'Description of FOI Request', 'Summary of request', 'Description', 'Summary',
                                'Summary of FOIrequest received by agency/minister',
                                'Summary of FOI request received', 'Description of FOI Request',
                                "FOI request", 'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash

                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"

                    else:
                        print >> sys.stderr, "ERROR number of columns incorrect"
                        print row
   
#!/bin/bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
echo $DIR
cd $DIR
echo "" > /tmp/disclosr-error
for f in $DIR/scrapers/*.py; do
    echo "Processing $f file..";
    md5=`md5sum /tmp/disclosr-error`
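    # 3>&1 1>&2 2>&3 swaps stdout and stderr, so only the scraper's stderr is piped to tee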
    python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error;
    status=${PIPESTATUS[0]}
    md52=`md5sum /tmp/disclosr-error`
    if [ "$md5" != "$md52" ]; then
        echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error;
    fi
    if [ "$status" -ne "0" ]; then
        echo "error";
        sleep 1;
    fi
done
  curl "localhost:5984/disclosr-foidocuments/_design/app/_view/byDate?startkey=\"9999-99-99\"&endkey=\"0000-00-00\"&descending=true&limit=20"
if [ -s /tmp/disclosr-error ] ; then
    echo "emailing logs..";
    mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ;
fi
   
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
   
    def __init__(self):
        super(ScraperImplementation, self).__init__()

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="content_div_148050")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 4

    def getColumns(self, columns):
        (id, date, datepub, title) = columns
        return (id, date, title, title, None)
   
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
   
    nsi = ScraperImplementation()
    nsi.doScrape()
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
                        soup = BeautifulSoup(htcontent)
                        rowtitle = soup.find(class_="wc-title").find("h1").string
                        if rowtitle != None:
                            description = rowtitle + ": "
                        for row in soup.find(class_="wc-content").find_all('td'):
                            if row != None:
                                for text in row.stripped_strings:
                                    description = description + text + "\n"
                                for atag in row.find_all("a"):
                                    if atag.has_attr('href'):
                                        links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})
    def getRows(self, table):
        return table.find_all(class_="dl-row")
    def findColumns(self, table):
        return table.find_all('div')
    def getColumnCount(self):
        return 2
    def getTable(self, soup):
        return soup.find(class_="foi-dl-list")
    def getColumns(self, columns):
        (title, date) = columns
        return (title, date, title, title, None)
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getTable(self, soup):
        return soup
    def getColumnCount(self):
        return 5
    def getColumns(self, columns):
        (id, date, title, description, notes) = columns
        return (id, date, title, description, notes)
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
   
import sys,os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()
    def getTable(self, soup):
        return soup.find(id='zone-content')
   
    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent != None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="foidetails")
                        if row == None:
                            row = soup.find(id="content").table
                        if row == None:
                            row = soup.find(id="content")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(link, atag['href']))

        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 3

    def getColumns(self, columns):
        (id, title, date) = columns
        return (id, date, title, title, None)
   
   
if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()