<?php

require_once '../include/common.inc.php';
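
// Import FOI quarterly request/review statistics from a data.gov.au CSV into
// the disclosr-agencies CouchDB database. The byName view maps agency names
// (including short names and other aliases) to document ids so CSV rows can
// be matched to agency documents.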
$db = $server->get_db('disclosr-agencies');
$rows = $db->get_view("app", "byName")->rows;
$nametoid = Array();
$stats = Array();
foreach ($rows as $row) {
    $nametoid[trim($row->key)] = $row->value;
}

$row = 0;
$headers = Array();
// source: http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/
if (($handle = fopen("FOI-quarterly-requests-and-reviews-2011-12.csv", "r")) !== FALSE) {
    while (($data = fgetcsv($handle, 10000, ",")) !== FALSE) {
        if ($row >= 1) {
            $name = trim($data[2]);
            if ($data[0] != "TOTALS" && $data[0] != "") {
                if (isset($nametoid[$name])) {
                    $id = $nametoid[$name];
                    $timePeriod = $data[0] . "-Q" . $data[1];
                    // drop the year, quarter, agency name and other leading columns
                    for ($i = 0; $i <= 8; $i++) {
                        unset($data[$i]);
                    }
                    $result = Array("source" => "http://data.gov.au/dataset/freedom-of-information-quarterly-request-and-review-statistical-data-2011-12/");
                    foreach ($data as $key => $datum) {
                        if ($datum != 0) {
                            // TODO: prefix each header with "FOI"
                            // merge with any value already recorded for this agency/quarter
                            if (isset($stats[$id][$timePeriod][$key])) $datum += $stats[$id][$timePeriod][$key];
                            $result[trim($headers[$key])] = $datum;
                        }
                    }
                    $stats[$id][$timePeriod] = $result;
                } else {
                    echo "<br>ERROR NAME MISSING FROM ID LIST<br><br> $row" . PHP_EOL;
                    print_r($data);
                    die();
                }
            }
        } else {
            $headers = $data;
        }
        $row++;
    }
    fclose($handle);
}
echo "all stats loaded successfully";
foreach ($stats as $id => $stat) {
    echo $id . "<br>" . PHP_EOL;
    $doc = $db->get($id);
    echo $doc->name . "<br>" . PHP_EOL;
    $changed = false;
    if (!isset($doc->statistics)) {
        $changed = true;
        $doc->statistics = Array();
    } else {
        $doc->statistics = object_to_array($doc->statistics);
    }
    foreach ($stat as $timePeriod => $value) {
        if (!isset($doc->statistics["foiRequests"][$timePeriod])
            || $doc->statistics["foiRequests"][$timePeriod] != $value
        ) {
            $changed = true;
            $doc->statistics["foiRequests"][$timePeriod] = $value;
        }
    }
    if ($changed) {
        $db->save($doc);
    } else {
        echo "not changed" . "<br>" . PHP_EOL;
    }
}
?>
<?php

require_once '../include/common.inc.php';

//function createFOIDocumentsDesignDoc() {
$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { if (doc.title != \"Disclosure Log Updated\") { emit(doc.date, doc); } };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = 'function(doc) { for(var propName in doc) { emit(propName, doc._id); }}';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { return values.length; }';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true);

//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->datasetGroups->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n doc.metadata[\"data.gov.au Category\"] && doc.metadata[\"data.gov.au Category\"].forEach(function(tag) {\n emit(tag, doc.url);\n });\n}\n}";
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\"\n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
$docdb->save($obj, true);

//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
    if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc);
    }
};";
$obj->views->byDeptStateName->map = "function(doc) {
    if (doc.orgType == 'FMA-DepartmentOfState') {
        emit(doc.name, doc._id);
    }
};";
$obj->views->parentOrgs->map = "function(doc) {
    if (doc.parentOrg) {
        emit(doc._id, doc.parentOrg);
    }
};";
$obj->views->byName->map = 'function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        emit(doc.name, doc._id);
        if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
            emit(doc.shortName, doc._id);
        }
        for (name in doc.otherNames) {
            if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
                emit(doc.otherNames[name], doc._id);
            }
        }
        for (name in doc.foiBodies) {
            if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
                emit(doc.foiBodies[name], doc._id);
            }
        }
        for (name in doc.positions) {
            if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
                emit(doc.positions[name], doc._id);
            }
        }
    }
};';
$obj->views->foiEmails->map = "function(doc) {
    emit(doc._id, doc.foiEmail);
};";
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) {
    // Date.parse returns milliseconds since the epoch (or NaN), not a Date,
    // so compare timestamps directly; re-scrape anything more than a day old
    var lastScrape = Date.parse(doc.metadata.lastScraped);
    var today = new Date();
    if (!lastScrape || today.getTime() - lastScrape > 24 * 60 * 60 * 1000) {
        emit(doc._id, doc);
    }
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
    if (doc._conflicts) {
        emit(null, [doc._rev].concat(doc._conflicts));
    }
}";
$obj->views->getStatistics->map = "function(doc) {
    if (doc.statistics) {
        for (var statisticSet in doc.statistics) {
            for (var statisticPeriod in doc.statistics[statisticSet]) {
                if (doc.statistics[statisticSet][statisticPeriod]['value']) {
                    emit([statisticSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
                } else {
                    for (var statisticSubSet in doc.statistics[statisticSet][statisticPeriod]) {
                        if (statisticSubSet != 'source' && statisticSubSet != 'value') {
                            emit([statisticSubSet, statisticPeriod], doc.statistics[statisticSet][statisticPeriod][statisticSubSet]);
                        }
                    }
                }
            }
        }
    }
}";
$obj->views->getStatistics->reduce = '_sum';
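// With the _sum reduce, a group=true query against getStatistics totals each
// [statisticSet/statisticSubSet, timePeriod] key across all agency documents.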
// http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}

function(doc) {
    count = 0;
    if (doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
                count++;
            }
        }
        portfolio = doc.parentOrg;
        if (doc.orgType == "FMA-DepartmentOfState") {
            portfolio = doc._id;
        }
        if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
            portfolio = doc.orgType;
        }
        emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
    }
}';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
    String.prototype.startsWith = function (str) {
        return !this.indexOf(str);
    }
}
if(!String.prototype.endsWith){
    String.prototype.endsWith = function(suffix) {
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}

function(doc) {
    if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
        for(var propName in doc) {
            if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
                emit(propName, 1);
            }
        }
        emit("total", 1);
    }
}';
$obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = '
function(doc) {
    for(var propName in doc) {
        emit(propName, doc._id);
    }
}';
$obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true);
?>
<?php

include('template.inc.php');
include_header_documents("Charts");
include_once('../include/common.inc.php');
$agenciesdb = $server->get_db('disclosr-agencies');

$idtoname = Array();
foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
    $idtoname[$row->id] = trim($row->value->name);
}
$foidocsdb = $server->get_db('disclosr-foidocuments');
?>
<div class="foundation-header"> | <div class="foundation-header"> |
<h1><a href="about.php">Charts</a></h1> | <h1><a href="about.php">Charts</a></h1> |
<h4 class="subheader"></h4> | <h4 class="subheader"></h4> |
</div> | </div> |
<div id="bydate" style="width:1000px;height:300px;"></div> | <div id="bydate" style="width:1000px;height:300px;"></div> |
<div id="byagency" style="width:1000px;height:1400px;"></div> | <div id="byagency" style="width:1000px;height:1400px;"></div> |
<script id="source"> | <script id="source"> |
window.onload = function () { | window.onload = function () { |
$(document).ready(function () { | $(document).ready(function () { |
var | var |
d1 = [], | d1 = [], |
options1, | options1, |
o1; | o1; |
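            // d1 is filled in server-side below as [epoch-milliseconds, count]
            // pairs from the byDateMonthYear view (queried with group=true).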
<?php
try {
    $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true", null, false, false, true)->rows;
    $dataValues = Array();
    foreach ($rows as $row) {
        $dataValues[$row->key] = $row->value;
    }
    $i = 0;
    ksort($dataValues);
    foreach ($dataValues as $key => $value) {
        $date = date_create_from_format('Y-m-d', $key);
        if ($date && date_format($date, 'U') != "") {
            echo "            d1.push([" . date_format($date, 'U') . "000, $value]);" . PHP_EOL;
            $i++;
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
?>
            options1 = {
                xaxis: {
                    mode: 'time',
                    labelsAngle: 45
                },
                selection: {
                    mode: 'x'
                },
                HtmlText: false,
                title: 'Disclosure Log entries added by Date'
            };

            // Draw graph with default options, overwriting with passed options
            function drawGraph(opts) {
                // Clone the options, so the 'options' variable always keeps intact.
                o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
                // Return a new graph.
                return Flotr.draw(
                    document.getElementById("bydate"),
                    [ d1 ],
                    o1
                );
            }

            graph = drawGraph();
            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
                // Draw selected area
                graph = drawGraph({
                    xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
                    yaxis: { min: area.y1, max: area.y2 }
                });
            });

            // When graph is clicked, draw the graph with default area.
            Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
                graph = drawGraph();
            });
        });
    };
    var d2 = [];
    var agencylabels = [];

    function agencytrackformatter(obj) {
        return agencylabels[Math.floor(obj.y)] + " = " + obj.x;
    }

    function agencytickformatter(val, axis) {
        if (agencylabels[Math.floor(val)]) {
            return (agencylabels[Math.floor(val)]);
        } else {
            return "";
        }
    }
<?php
try {
    $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows;

    function cmp($a, $b)
    {
        // usort expects an integer comparison result, not a boolean
        return $a->value - $b->value;
    }

    usort($rows, "cmp");
    $i = 0;
    foreach ($rows as $row) {
        echo "    d2.push([ $row->value,$i]);" . PHP_EOL;
        echo "    agencylabels.push(['" . str_replace("'", "", $idtoname[$row->key]) . "']);" . PHP_EOL;
        $i++;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
?>
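    // Horizontal bar chart: x is the entry count and y is the agency's index
    // into agencylabels, which agencytickformatter maps back to a name.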
    // Draw the graph
    Flotr.draw(
        document.getElementById("byagency"),
        [d2],
        {
            title: "Disclosure Log entries by Agency",
            bars: {
                show: true,
                horizontal: true,
                shadowSize: 0,
                barWidth: 0.5
            },
            mouse: {
                track: true,
                relative: true,
                trackFormatter: agencytrackformatter
            },
            yaxis: {
                minorTickFreq: 1,
                noTicks: agencylabels.length,
                showMinorLabels: true,
                tickFormatter: agencytickformatter
            },
            xaxis: {
                min: 0,
                autoscaleMargin: 1
            },
            legend: {
                show: true
            }
        }
    );
</script>
<?php
include_footer_documents();
?>
import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs
import difflib
import zipfile
from StringIO import StringIO
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from lxml import etree
# assumes the legacy (pre-0.3) python-docx module, which exposes getdocumenttext
from docx import getdocumenttext
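
# Scraper class hierarchy: GenericDisclogScraper is the abstract base; the
# HTML, PDF and DOCX variants hash the whole rendered document to detect
# updates, while the RSS and OAIC-style (table-based) variants extract one
# disclosure log entry per feed item or table row.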

class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def remove_control_chars(self, input):
        return "".join([i for i in input if ord(i) in range(32, 127)])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL """
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return

class GenericHTMLDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        content = rcontent
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
            diff = ""
            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
            if last_attach is not None:
                html_diff = difflib.HtmlDiff()
                diff = html_diff.make_table(last_attach.read().split('\n'),
                    content.split('\n'))
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated",
                "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)}
            foidocsdb.save(doc)
        else:
            print "already saved"

class GenericPDFDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        # wrap the fetched bytes directly so the buffer reads from the start
        fp = StringIO(content)
        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
            foidocsdb.save(doc)
        else:
            print "already saved"

class GenericDOCXDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
            , self.getURL(), "foidocuments", self.getAgencyID())
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        ## Fetch all the text out of the document we just created
        paratextlist = getdocumenttext(document)
        # Make explicit unicode version
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        ## Join the document's text with two newlines under each paragraph
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
                , 'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated", "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"

class GenericRSSDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                    'url': entry.link, 'docID': entry.id,
                    "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        doc.update({'description': content.summary})
        return

class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5

    def getDescription(self, content, entry, doc):
        """ get description from table cell """
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def findColumns(self, row):
        return row.find_all('td')

    def getDocHash(self, id, date, url):
        if id.string is None:
            print "no id, using date as hash"
            return scrape.mkhash(
                self.remove_control_chars(
                    url + (''.join(date.stripped_strings))))
        else:
            return scrape.mkhash(
                self.remove_control_chars(
                    url + (''.join(id.stripped_strings))))
    def getDate(self, content, entry, doc):
        strdate = ''.join(content.stripped_strings).strip()
        (a, b, c) = strdate.partition("(")
        # clean up common typos in the source data before parsing
        strdate = self.remove_control_chars(
            a.replace("Octber", "October").replace("Janrurary", "January").replace("1012", "2012"))
        print strdate
        try:
            edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        except ValueError:
            print >> sys.stderr, "ERROR date invalid %s " % strdate
            print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
            edate = date.today().strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return

    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links != []:
            doc.update({'links': links})
        return
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = self.findColumns(row)
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                         description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        dochash = self.getDocHash(id, date, url)
                        doc = foidocsdb.get(dochash)
                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                'agencyID': self.getAgencyID(),
                                'url': self.getURL(),
                                'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                'FOI request(in summary form)',
                                'Summary of FOI request received by the ASC',
                                'Summary of FOI request received by agency/minister',
                                'Description of Documents Requested', 'FOI request',
                                'Description of FOI Request', 'Summary of request',
                                'Description', 'Summary',
                                'Summary of FOIrequest received by agency/minister',
                                'Summary of FOI request received',
                                'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash
                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"
                    else:
                        print >> sys.stderr, "ERROR number of columns incorrect"
                        print row
#!/bin/bash
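# Run every scraper in ./scrapers, collecting anything written to stderr in
# /tmp/disclosr-error; if that file grew while a scraper ran, tag the output
# with the scraper's name. A non-empty error file is mailed out at the end.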
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" |
echo $DIR | echo $DIR |
cd $DIR | cd $DIR |
echo "" > /tmp/disclosr-error | echo "" > /tmp/disclosr-error |
for f in $DIR/scrapers/*.py; do | for f in $DIR/scrapers/*.py; do |
echo "Processing $f file.."; | echo "Processing $f file.."; |
md5=`md5sum /tmp/disclosr-error` | md5=`md5sum /tmp/disclosr-error` |
python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; | python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; |
md52=`md5sum /tmp/disclosr-error` | md52=`md5sum /tmp/disclosr-error` |
if [ "$md5" != "$md52" ]; then | if [ "$md5" != "$md52" ]; then |
echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; | echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; |
fi | fi |
if [ "$?" -ne "0" ]; then | if [ "$?" -ne "0" ]; then |
echo "error"; | echo "error"; |
sleep 1; | sleep 1; |
fi | fi |
done | done |
curl "localhost:5984/disclosr-foidocuments/_design/app/_view/byDate?startkey=\"9999-99-99\"&endkey=\"0000-00-00\"&descending=true&limit=20" | |
if [ -s /tmp/disclosr-error ] ; then | if [ -s /tmp/disclosr-error ] ; then |
echo "emailling logs.."; | echo "emailling logs.."; |
mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; | mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; |
fi | fi |
import sys, os

sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup
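
# Entries in this disclosure log link off to detail pages; getDescription
# fetches each linked page and pulls the text and attachment links out of its
# content_div_148050 element. The __main__ block walks the log's pagination.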

class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent is not None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="content_div_148050")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(link, atag['href']))
        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 4

    def getColumns(self, columns):
        (id, date, datepub, title) = columns
        return (id, date, title, title, None)

if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    nsi = ScraperImplementation()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=1"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=2"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=3"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=4"
    nsi.doScrape()
    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=5"
    nsi.doScrape()
import sys, os

sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
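# This disclosure log is rendered as a div-based list (class "foi-dl-list")
# rather than an HTML table, so getTable, getRows and findColumns are all
# overridden to walk that structure.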
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent is not None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        # http://www.crummy.com/software/BeautifulSoup/documentation.html
                        soup = BeautifulSoup(htcontent)
                        rowtitle = soup.find(class_="wc-title").find("h1").string
                        if rowtitle is not None:
                            description = rowtitle + ": "
                        for row in soup.find(class_="wc-content").find_all('td'):
                            if row is not None:
                                for text in row.stripped_strings:
                                    description = description + text + "\n"
                                for atag in row.find_all("a"):
                                    if atag.has_attr('href'):
                                        links.append(scrape.fullurl(link, atag['href']))
        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getTable(self, soup):
        return soup.find(class_="foi-dl-list")

    def getRows(self, table):
        return table.find_all(class_="dl-row")

    def findColumns(self, row):
        return row.find_all('div')

    def getColumnCount(self):
        return 2

    def getColumns(self, columns):
        (title, date) = columns
        return (title, date, title, title, None)


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
import sys, os

sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup

#http://www.doughellmann.com/PyMOTW/abc/
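# A standard five-column disclosure log; getTable hands back the whole soup
# (presumably the page holds a single table of entries) and the default
# row handling in the base class does the rest.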
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def getTable(self, soup):
        return soup

    def getColumnCount(self):
        return 5

    def getColumns(self, columns):
        (id, date, title, description, notes) = columns
        return (id, date, title, description, notes)


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()
import sys, os

sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import dateutil
from dateutil.parser import *
from datetime import *
import scrape
from bs4 import BeautifulSoup


class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):

    def __init__(self):
        super(ScraperImplementation, self).__init__()
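
    # getDescription follows each row's link and tries progressively more
    # general containers on the detail page: #foidetails, then the first
    # table under #content, then #content itself.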
    def getTable(self, soup):
        return soup.find(id='zone-content')

    def getDescription(self, content, entry, doc):
        link = None
        links = []
        description = ""
        for atag in entry.find_all('a'):
            if atag.has_attr('href'):
                link = scrape.fullurl(self.getURL(), atag['href'])
                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
                if htcontent is not None:
                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
                        soup = BeautifulSoup(htcontent)
                        row = soup.find(id="foidetails")
                        if row is None:
                            row = soup.find(id="content").table
                        if row is None:
                            row = soup.find(id="content")
                        description = ''.join(row.stripped_strings)
                        for atag in row.find_all("a"):
                            if atag.has_attr('href'):
                                links.append(scrape.fullurl(link, atag['href']))
        if links != []:
            doc.update({'links': links})
        if description != "":
            doc.update({'description': description})

    def getColumnCount(self):
        return 3

    def getColumns(self, columns):
        (id, title, date) = columns
        return (id, date, title, title, None)


if __name__ == '__main__':
    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
    ScraperImplementation().doScrape()