<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 15.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
<!--
  100x100 px drawing made of two groups:
    "docs"  : eight filled paths, all in #C2A385
    "trunk" : two circles and three paths, all in #C00000
  The viewBox starts at y = -25.635, so negative y coordinates in the path
  data are still inside the visible area.
-->
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" width="100px"
     height="100px" viewBox="0 -25.635 100 100" enable-background="new 0 -25.635 100 100" xml:space="preserve">
  <g id="docs">
    <path fill="#C2A385" d="M86.108-9.909l5.229,9.952c0,0-1.832,0.083-5.297,1.95c-2.312,1.249-6.468,6.246-6.468,6.246L71.827-2.909
      c0,0,4.201-3.996,6.513-5.242C81.805-10.022,86.108-9.909,86.108-9.909"/>
    <path fill="#C2A385" d="M65.604,20.731l-3.152-9.868c0,0-5.441,3.56-8.017,4.074c-1.008,0.202-1.93,0.335-2.749,0.425
      L65.604,20.731z"/>
    <path fill="#C2A385" d="M72.326,23.321c0.268-0.226,0.537-0.44,0.804-0.616c3.104-2.054,6.139-3.685,6.268-3.755l1.882-1.005
      l1.369,1.634l2.864,3.417l3.198-4.334L76.68,9.783l-8.74,11.847L72.326,23.321z"/>
    <path fill="#C2A385" d="M39.918,10.823l4.825,1.86l3.33,0.212c0.04,0.001,0.269,0.015,0.652,0.015c0.91,0,2.798-0.072,5.196-0.551
      c1.427-0.284,5.007-2.332,7.092-3.695l2.889-1.888l1.05,3.285l2.496,7.812l5.889-7.985l-4.625,0.163l1.348-6.225L55.133,0.593
      l-2.095,9.667c-0.531-2.599-1.841-5.727-1.841-5.727L37.709,6.055c0,0,0.885,2.206,1.586,4.529L39.918,10.823z"/>
    <path fill="#C2A385" d="M91.233,45.562c-1.102-0.691-2.323-1.142-3.415-1.433l-3.779,9.804c1.932,1.246,5.197,5.738,5.197,5.738
      l7.336-9.206C96.572,50.466,93.162,46.771,91.233,45.562z"/>
    <path fill="#C2A385" d="M93.192,32.166l-3.656,1.224c-0.019,0.007-1.779,0.613-4.117,2.069l2.817,4.868l0.626,1.08
      c3.306-0.562,7.727-1.922,7.727-1.922l-2.332-15.261c0,0-2.934,1.277-5.852,2.221l2.318,2.765L93.192,32.166z"/>
    <path fill="#C2A385" d="M79.272,25.999l0.864,0.334l0.46,0.801l3.503,6.05c2.646-1.636,4.611-2.287,4.611-2.287l-8.075-9.632
      c0,0-2.584,1.391-5.376,3.188L79.272,25.999z"/>
  </g>
  <g id="trunk">
    <circle fill="#C00000" cx="66.019" cy="52.945" r="7.877"/>
    <circle fill="#C00000" cx="22.693" cy="52.945" r="7.877"/>
    <path fill="#C00000" d="M22.693,42.441c1.915,0,3.705,0.522,5.251,1.421V28.001H12.191L3,37.192v14.439h9.281
      C12.931,46.459,17.347,42.441,22.693,42.441z"/>
    <path fill="#C00000" d="M55.925,50.075l-9.583-3.695H30.88c1.186,1.476,1.978,3.28,2.225,5.252h22.502
      C55.674,51.1,55.78,50.58,55.925,50.075z"/>
    <path fill="#C00000" d="M79.312,28.328L39.961,13.149l-9.384,24.335l26.381,10.174c1.824-3.115,5.198-5.218,9.062-5.218
      c5.791,0,10.503,4.712,10.503,10.502c0,0.744-0.081,1.471-0.229,2.173l4.713,1.817L86.95,41.52L79.312,28.328z"/>
  </g>
</svg>
# Delete every document that the byAgencyID view of the local
# disclosr-foidocuments CouchDB database returns for one hard-coded agency id
# (at most 600 rows per run).
#
# The view response is passed through `xargs -L1`, which strips the JSON
# double quotes, so each row contains bare `_id:<id>,` and `_rev:<rev>,`
# fragments that the grep/perl pipeline below picks out.
#
# Differences from the previous for-loop over a back-quoted command:
#   * rows are read line by line, so the shell no longer word-splits or
#     glob-expands the response text;
#   * only the first `_id:` / `rev:` match of a row is used;
#   * the DELETE URL is quoted, so its `?` is never treated as a glob.
db_url="http://localhost:5984/disclosr-foidocuments"
view_url="${db_url}/_design/app/_view/byAgencyID?reduce=false&keys=%5B\"5716ce0aacfe98f7d638b7a66b7f1040\"%5D&limit=600"

curl "$view_url" | xargs -L1 | while IFS= read -r line; do
    # echo "$line"
    # [^\\] = "any character but a backslash"; the lazy match therefore stops
    # at the first comma that is not escaped.
    id=$(printf '%s\n' "$line" | grep -Po '_id:.*?[^\\],' | head -n 1 | perl -pe 's/_id://; s/,$//')
    rev=$(printf '%s\n' "$line" | grep -Po 'rev:.*?[^\\],' | head -n 1 | perl -pe 's/rev://; s/,$//')
    # Rows without an _id (e.g. the response header/footer lines) are skipped.
    if [ -n "$id" ]; then
        echo "curl -X DELETE ${db_url}/${id}?rev=${rev}"
        curl -X DELETE "${db_url}/${id}?rev=${rev}"
    fi
done
<?php
// Builds the "_design/app" design document for the disclosr-foidocuments
// database and saves it.
// NOTE(review): $server is not defined in this script; presumably it is
// provided by common.inc.php - confirm.
require_once '../include/common.inc.php';

//function createFOIDocumentsDesignDoc() {
$foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";

// Create the container objects explicitly. Assigning through properties that
// do not exist yet ($obj->views->all->map = ...) only raises a warning on
// PHP 5/7 but throws an Error on PHP 8.
$obj->views = new stdClass();
foreach (array('all', 'byDate', 'byDateMonthYear', 'byAgencyID', 'fieldNames') as $viewName) {
    $obj->views->$viewName = new stdClass();
}

$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
// The inner quotes must be escaped; unescaped they end the PHP string early
// and the script does not parse.
$obj->views->byDate->map = "function(doc) { if (doc.title != \"Disclosure Log Updated\") { emit(doc.date, doc); } };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = 'function(doc) { for(var propName in doc) { emit(propName, doc._id); }}';
// Built-in _count instead of a hand-written "return values.length": on a
// rereduce pass `values` holds partial results, so its length is not the
// number of rows.
$obj->views->fieldNames->reduce = '_count';

// allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true);
// Builds the "_design/app" design document for the disclosr-documents
// database and saves it.
//function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";

// Create the container objects explicitly. Assigning through properties that
// do not exist yet ($obj->views->web_server->map = ...) only raises a warning
// on PHP 5/7 but throws an Error on PHP 8.
$obj->views = new stdClass();
foreach (array('web_server', 'byAgency', 'byURL', 'agency', 'byWebServer',
               'datasets', 'datasetGroups', 'getValidationRequired') as $viewName) {
    $obj->views->$viewName = new stdClass();
}

// Counting views: each emits 1 per document and sums the rows per key.
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum";

// Lookup views: each emits the whole document under a different key.
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";

// Dataset views: only documents whose fieldName is "data".
$obj->views->datasets->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n emit(doc._id, doc);\n}\n}";
$obj->views->datasetGroups->map = "function(doc) {\nif (doc.fieldName == \"data\") {\n doc.metadata[\"data.gov.au Category\"] && doc.metadata[\"data.gov.au Category\"].forEach(function(tag) {\n emit(tag, doc.url); \n });\n}\n}";

// text/html documents that have no "validation" field yet.
$obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";

$docdb->save($obj, true);
// Builds the "_design/app" design document for the disclosr-agencies
// database and saves it. The JavaScript view sources below are stored
// verbatim, so their text is left exactly as it was.
//function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies');
$obj = new stdClass();
$obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript";

// Create the container objects explicitly. Assigning through properties that
// do not exist yet ($obj->views->all->map = ...) only raises a warning on
// PHP 5/7 but throws an Error on PHP 8.
$obj->views = new stdClass();
foreach (array('all', 'byABN', 'byCanonicalName', 'byDeptStateName', 'parentOrgs',
               'byName', 'foiEmails', 'byLastModified', 'getActive', 'getSuspended',
               'getScrapeRequired', 'showNamesABNs', 'getConflicts', 'getStatistics',
               'score', 'scoreHas', 'fieldNames') as $viewName) {
    $obj->views->$viewName = new stdClass();
}

$obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc);
}
};";
$obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id);
}
};";
$obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) {
emit(doc._id, doc.parentOrg);
}
};";
// Every name a non-suspended agency is known by (name, shortName, otherNames,
// foiBodies, positions) is emitted as a key pointing at the agency id.
$obj->views->byName->map = 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
emit(doc.shortName, doc._id);
}
for (name in doc.otherNames) {
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id);
}
}
for (name in doc.foiBodies) {
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
emit(doc.foiBodies[name], doc._id);
}
}
for (name in doc.positions) {
if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
emit(doc.positions[name], doc._id);
}
}
}
};';
$obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail);
};";
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
// NOTE(review): Date.parse() returns a number (or NaN), not a Date, so
// lastScrape.getTime() cannot succeed for a parseable lastScraped value, and
// the comparison against "now + 1000 ms" looks unintended. Left unchanged
// because the intended rule is not visible here - confirm and fix.
$obj->views->getScrapeRequired->map = "function(doc) {
var lastScrape = Date.parse(doc.metadata.lastScraped);
var today = new Date();
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc);
}
};";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts));
}
}";
// One row per [statisticSet, statisticPeriod] pair, summed by the reduce.
$obj->views->getStatistics->map =
"function(doc) {
if (doc.statistics) {
for (var statisticSet in doc.statistics) {
for (var statisticPeriod in doc.statistics[statisticSet]) {
emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
}
}
}
}";
$obj->views->getStatistics->reduce = '_sum';
// http://stackoverflow.com/questions/646628/javascript-startswith
// "score" counts the non-empty properties of each non-suspended agency.
$obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}
function(doc) {
count = 0;
if (doc["status"] != "suspended") {
for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
count++;
}
}
portfolio = doc.parentOrg;
if (doc.orgType == "FMA-DepartmentOfState") {
portfolio = doc._id;
}
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
portfolio = doc.orgType;
}
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
}
}';
// "scoreHas" counts, per property name starting with "has" or ending with
// "URL", how many non-suspended agencies define it, plus a "total" row.
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) {
return !this.indexOf(str);
}
}
if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) {
return this.indexOf(suffix, this.length - suffix.length) !== -1;
};
}
function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1);
}
}
emit("total", 1);
}
}';
$obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = '
function(doc) {
for(var propName in doc) {
emit(propName, doc._id);
}
}';
$obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true);
?>
<?php
// "About" page: header from the documents template, a static body, then the
// documents footer.
include 'template.inc.php';
include_header_documents("About");
include_once '../include/common.inc.php';
?>
<h1>About</h1>
Written and managed by Alex Sadleir (maxious [at] lambdacomplex.org)
<?php
include_footer_documents();
?>
import sys | import sys |
import os | import os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
from time import mktime | from time import mktime |
import feedparser | import feedparser |
import abc | import abc |
import unicodedata | import unicodedata |
import re | import re |
import dateutil | import dateutil |
from dateutil.parser import * | from dateutil.parser import * |
from datetime import * | from datetime import * |
import codecs | import codecs |
import difflib | import difflib |
from StringIO import StringIO | from StringIO import StringIO |
from pdfminer.pdfparser import PDFDocument, PDFParser | from pdfminer.pdfparser import PDFDocument, PDFParser |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf |
from pdfminer.pdfdevice import PDFDevice, TagExtractor | from pdfminer.pdfdevice import PDFDevice, TagExtractor |
from pdfminer.converter import TextConverter | from pdfminer.converter import TextConverter |
from pdfminer.cmapdb import CMapDB | from pdfminer.cmapdb import CMapDB |
from pdfminer.layout import LAParams | from pdfminer.layout import LAParams |
class GenericDisclogScraper(object):
    """Abstract base class for disclosure-log scrapers.

    Concrete scrapers implement doScrape(). The agency id and the disclosure
    log URL are looked up on first use and then cached on the instance.
    """
    __metaclass__ = abc.ABCMeta
    # Both values are filled in lazily by getAgencyID() / getURL().
    agencyID = None
    disclogURL = None

    def remove_control_chars(self, input):
        """Return ``input`` keeping only characters with code 32..126."""
        allowed_codes = range(32, 127)
        return "".join(ch for ch in input if ord(ch) in allowed_codes)

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is not None:
            return self.agencyID
        # Default: the name of the running script with ".py" removed.
        script_name = os.path.basename(sys.argv[0])
        self.agencyID = script_name.replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL"""
        if self.disclogURL is not None:
            return self.disclogURL
        # Default: the FOIDocumentsURL field of this agency's record.
        agency_record = scrape.agencydb.get(self.getAgencyID())
        self.disclogURL = agency_record['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return
class GenericHTMLDisclogScraper(GenericDisclogScraper):
    """Scraper for disclosure logs published as free-form HTML.

    Entries are not extracted. When the fetched page hashes to a value not
    yet in the database, a single "Disclosure Log Updated" document is saved,
    carrying an HTML diff against the last stored attachment when there is one.
    """

    def doScrape(self):
        """Fetch the disclosure log and record an update if its content is new."""
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is not None:
            print("already saved")
            return

        print("saving " + dochash)
        description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
        # Start with an empty diff so a log with no previous attachment can
        # still be saved; before, `diff` was unbound in that case and building
        # the document raised NameError.
        diff = ""
        last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
        if last_attach is not None:
            html_diff = difflib.HtmlDiff()
            diff = html_diff.make_table(last_attach.read().split('\n'),
                content.split('\n'))
        edate = date.today().strftime("%Y-%m-%d")
        doc = {'_id': dochash, 'agencyID': self.getAgencyID()
            , 'url': self.getURL(), 'docID': dochash,
            "date": edate, "title": "Disclosure Log Updated",
            "description": self.remove_control_chars(description),
            "diff": self.remove_control_chars(diff)}
        foidocsdb.save(doc)
class GenericPDFDisclogScraper(GenericDisclogScraper):
    """Scraper for disclosure logs published as a single PDF file.

    The text extracted from the PDF is hashed; an unseen hash is saved as a
    "Disclosure Log Updated" document whose description is that text.
    """

    def doScrape(self):
        """Fetch the PDF, extract its text and save it if it is new."""
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(
            scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())

        # pdfminer plumbing: the TextConverter writes the page text to text_out.
        layout_params = LAParams()
        resource_manager = PDFResourceManager(caching=True)
        text_out = StringIO()
        converter = TextConverter(resource_manager, text_out, codec='utf-8',
                                  laparams=layout_params)
        pdf_in = StringIO()
        pdf_in.write(content)

        process_pdf(resource_manager, converter, pdf_in, set(), caching=True,
                    check_extractable=True)
        description = text_out.getvalue()

        pdf_in.close()
        converter.close()
        text_out.close()

        dochash = scrape.mkhash(description)
        if foidocsdb.get(dochash) is not None:
            print("already saved")
            return

        print("saving " + dochash)
        edate = date.today().strftime("%Y-%m-%d")
        doc = {
            '_id': dochash,
            'agencyID': self.getAgencyID(),
            'url': self.getURL(),
            'docID': dochash,
            "date": edate,
            "title": "Disclosure Log Updated",
            "description": self.remove_control_chars(description),
        }
        foidocsdb.save(doc)
class GenericDOCXDisclogScraper(GenericDisclogScraper):
    """Scraper for disclosure logs published as a single .docx file.

    The paragraph text of word/document.xml is hashed; an unseen hash is
    saved as a "Disclosure Log Updated" document whose description is that
    text.
    """

    # Namespace of the WordprocessingML elements in word/document.xml.
    _W_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    def _paragraph_texts(self, document):
        """Return the text of every non-empty w:p paragraph, in document order.

        Text runs (w:t) are concatenated and w:tab elements become a tab.
        """
        paragraphs = []
        for para in document.iter(self._W_NS + 'p'):
            paratext = u''
            for element in para.iter():
                if element.tag == self._W_NS + 't':
                    if element.text:
                        paratext = paratext + element.text
                elif element.tag == self._W_NS + 'tab':
                    paratext = paratext + u'\t'
            if paratext:
                paragraphs.append(paratext)
        return paragraphs

    def doScrape(self):
        """Fetch the .docx, extract its text and save it if it is new."""
        # Imported here because the module-level imports do not provide them;
        # the names used before (zipfile, etree, getdocumenttext) were never
        # imported in this module.
        import zipfile
        import xml.etree.ElementTree as ElementTree

        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
            , self.getURL(), "foidocuments", self.getAgencyID())

        # Open the fetched bytes as a zip archive (previously the builtin
        # `file` type was passed instead of the downloaded content).
        mydoc = zipfile.ZipFile(StringIO(content))
        try:
            xmlcontent = mydoc.read('word/document.xml')
        finally:
            mydoc.close()
        document = ElementTree.fromstring(xmlcontent)

        ## Fetch all the text out of the document we just created
        paratextlist = self._paragraph_texts(document)
        # Make explicit UTF-8 encoded version
        newparatextlist = [paratext.encode("utf-8") for paratext in paratextlist]
        ## Join the document's text with two newlines under each paragraph
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')

        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is not None:
            print("already saved")
            return

        print("saving " + dochash)
        # date.today(), as in the other scrapers; time() built a midnight
        # time object, which cannot give today's date.
        edate = date.today().strftime("%Y-%m-%d")
        doc = {'_id': dochash, 'agencyID': self.getAgencyID()
            , 'url': self.getURL(), 'docID': dochash,
            "date": edate, "title": "Disclosure Log Updated", "description": description}
        foidocsdb.save(doc)
class GenericRSSDisclogScraper(GenericDisclogScraper):
    """Scraper for disclosure logs published as a feed.

    Each feed entry whose id hashes to a value not yet in the database is
    saved as its own document.
    """

    def doScrape(self):
        """Fetch the feed and save every entry that is not stored yet."""
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(
            scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            print(entry.id)
            dochash = scrape.mkhash(entry.id)
            if foidocsdb.get(dochash) is not None:
                print("already saved")
                continue

            print("saving " + dochash)
            published = datetime.fromtimestamp(mktime(entry.published_parsed))
            edate = published.strftime("%Y-%m-%d")
            doc = {
                '_id': dochash,
                'agencyID': self.getAgencyID(),
                'url': entry.link,
                'docID': entry.id,
                "date": edate,
                "title": entry.title,
            }
            # The entry serves as both the content and the entry argument.
            self.getDescription(entry, entry, doc)
            foidocsdb.save(doc)

    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        doc.update({'description': content.summary})
        return
class GenericOAICDisclogScraper(GenericDisclogScraper): | class GenericOAICDisclogScraper(GenericDisclogScraper): |
__metaclass__ = abc.ABCMeta | __metaclass__ = abc.ABCMeta |
@abc.abstractmethod
def getColumns(self, columns):
    """Rearrange ``columns`` if required.

    Abstract: this base implementation does nothing and returns None.
    """
    return None
def getColumnCount(self):
    """Return the column count used by this scraper (5 by default)."""
    column_count = 5
    return column_count
def getDescription(self, content, entry, doc):
    """Store the text of ``content`` in ``doc['description']``.

    Every stripped string of ``content`` is prefixed with " \\n" and the
    pieces are concatenated in order. ``entry`` is not used here.
    """
    pieces = [" \n" + text for text in content.stripped_strings]
    doc.update({'description': "".join(pieces)})
def getTitle(self, content, entry, doc):
    """Store the concatenated stripped strings of ``content`` in ``doc['title']``."""
    title = ''.join(content.stripped_strings)
    doc.update({'title': title})
def getTable(self, soup): | def getTable(self, soup): |
return soup.table | return soup.table |
def getRows(self, table): | def getRows(self, table): |
return table.find_all('tr') | return table.find_all('tr') |
def getDocHash(self, id,date, url): | |
if id.string is None: | |
print "no id, using date as hash" | |
return scrape.mkhash( | |
self.remove_control_chars( | |
url + (''.join(date.stripped_strings)))) | |
else: | |
return scrape.mkhash( | |
self.remove_control_chars( | |
url + (''.join(id.stripped_strings)))) | |
def getDate(self, content, entry, doc): | def getDate(self, content, entry, doc): |
strdate = ''.join(content.stripped_strings).strip() | strdate = ''.join(content.stripped_strings).strip() |
(a, b, c) = strdate.partition("(") | (a, b, c) = strdate.partition("(") |
strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January").replace("1012","2012")) | strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January").replace("1012","2012")) |
print strdate | print strdate |
try: | try: |
edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") | edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") |
except ValueError: | except ValueError: |
print >> sys.stderr, "ERROR date invalid %s " % strdate | print >> sys.stderr, "ERROR date invalid %s " % strdate |
print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip() | print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip() |
edate = date.today().strftime("%Y-%m-%d") | edate = date.today().strftime("%Y-%m-%d") |
print edate | print edate |
doc.update({'date': edate}) | doc.update({'date': edate}) |
return | return |
def getLinks(self, content, entry, doc): | def getLinks(self, content, entry, doc): |
links = [] | links = [] |
for atag in entry.find_all("a"): | for atag in entry.find_all("a"): |
if atag.has_key('href'): | if atag.has_key('href'): |
links.append(scrape.fullurl(content, atag['href'])) | links.append(scrape.fullurl(content, atag['href'])) |
if links != []: | if links != []: |
doc.update({'links': links}) | doc.update({'links': links}) |
return | return |
def doScrape(self): | def doScrape(self): |
foidocsdb = scrape.couch['disclosr-foidocuments'] | foidocsdb = scrape.couch['disclosr-foidocuments'] |
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb, | (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, |
self.getURL(), "foidocuments", self.getAgencyID()) | self.getURL(), "foidocuments", self.getAgencyID()) |
if content is not None: | if content is not None: |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": | if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | # http://www.crummy.com/software/BeautifulSoup/documentation.html |
print "parsing" | print "parsing" |
soup = BeautifulSoup(content) | soup = BeautifulSoup(content) |
table = self.getTable(soup) | table = self.getTable(soup) |
for row in self.getRows(table): | for row in self.getRows(table): |
columns = row.find_all('td') | columns = row.find_all('td') |
if len(columns) is self.getColumnCount(): | if len(columns) is self.getColumnCount(): |
(id, date, title, | (id, date, title, |
description, notes) = self.getColumns(columns) | description, notes) = self.getColumns(columns) |
print self.remove_control_chars( | print self.remove_control_chars( |
''.join(id.stripped_strings)) | ''.join(id.stripped_strings)) |
if id.string is None: | dochash = self.getDocHash(id,date,url) |
print "no id, using date as hash" | |
dochash = scrape.mkhash( | |
self.remove_control_chars( | |
url + (''.join(date.stripped_strings)))) | |
else: | |
dochash = scrape.mkhash( | |
self.remove_control_chars( | |
url + (''.join(id.stripped_strings)))) | |
doc = foidocsdb.get(dochash) | doc = foidocsdb.get(dochash) |
if doc is None: | if doc is None: |
print "saving " + dochash | print "saving " + dochash |
doc = {'_id': dochash, | doc = {'_id': dochash, |
'agencyID': self.getAgencyID(), | 'agencyID': self.getAgencyID(), |
'url': self.getURL(), | 'url': self.getURL(), |
'docID': (''.join(id.stripped_strings))} | 'docID': (''.join(id.stripped_strings))} |
self.getLinks(self.getURL(), row, doc) | self.getLinks(self.getURL(), row, doc) |
self.getTitle(title, row, doc) | self.getTitle(title, row, doc) |
self.getDate(date, row, doc) | self.getDate(date, row, doc) |
self.getDescription(description, row, doc) | self.getDescription(description, row, doc) |
if notes is not None: | if notes is not None: |
doc.update({'notes': ( | doc.update({'notes': ( |
''.join(notes.stripped_strings))}) | ''.join(notes.stripped_strings))}) |
badtitles = ['-', 'Summary of FOI Request' | badtitles = ['-', 'Summary of FOI Request' |
, 'FOI request(in summary form)' | , 'FOI request(in summary form)' |
, 'Summary of FOI request received by the ASC', | , 'Summary of FOI request received by the ASC', |
'Summary of FOI request received by agency/minister', | 'Summary of FOI request received by agency/minister', |
'Description of Documents Requested', 'FOI request', | 'Description of Documents Requested', 'FOI request', |
'Description of FOI Request', 'Summary of request', 'Description', 'Summary', | 'Description of FOI Request', 'Summary of request', 'Description', 'Summary', |
'Summary of FOIrequest received by agency/minister', | 'Summary of FOIrequest received by agency/minister', |
'Summary of FOI request received', 'Description of FOI Request', | 'Summary of FOI request received', 'Description of FOI Request', |
"FOI request", 'Results 1 to 67 of 67'] | "FOI request", 'Results 1 to 67 of 67'] |
if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '': | if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '': |
print "saving" | print "saving" |
foidocsdb.save(doc) | foidocsdb.save(doc) |
else: | else: |
print "already saved " + dochash | print "already saved " + dochash |
elif len(row.find_all('th')) is self.getColumnCount(): | elif len(row.find_all('th')) is self.getColumnCount(): |
print "header row" | print "header row" |
else: | else: |
print >> sys.stderr, "ERROR number of columns incorrect" | print >> sys.stderr, "ERROR number of columns incorrect" |
print row | print row |
# www.robotstxt.org/ | # www.robotstxt.org/ |
# http://code.google.com/web/controlcrawlindex/ | # http://code.google.com/web/controlcrawlindex/ |
User-agent: * | User-agent: * |
Disallow: /admin/ | Disallow: /admin/ |
Disallow: /viewDocument.php | |
Sitemap: http://disclosurelo.gs/sitemap.xml.php | Sitemap: http://disclosurelo.gs/sitemap.xml.php |
#!/bin/bash | |
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" |
echo $DIR | echo $DIR |
cd $DIR | cd $DIR |
echo "" > /tmp/disclosr-error | echo "" > /tmp/disclosr-error |
for f in $DIR/scrapers/*.py; do | for f in $DIR/scrapers/*.py; do |
echo "Processing $f file.."; | echo "Processing $f file.."; |
md5=`md5sum /tmp/disclosr-error` | md5=`md5sum /tmp/disclosr-error` |
python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; | python $f 3>&1 1>&2 2>&3 | tee --append /tmp/disclosr-error; |
md52=`md5sum /tmp/disclosr-error` | md52=`md5sum /tmp/disclosr-error` |
if [ "$md5" != "$md52" ]; then | if [ "$md5" != "$md52" ]; then |
echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; | echo "^^^^^^^^^^^^^^ $f" >> /tmp/disclosr-error; |
fi | fi |
if [ "$?" -ne "0" ]; then | if [ "$?" -ne "0" ]; then |
echo "error"; | echo "error"; |
sleep 1; | sleep 1; |
fi | fi |
done | done |
if [ -s /tmp/disclosr-error ] ; then | if [ -s /tmp/disclosr-error ] ; then |
echo "emailling logs.."; | echo "emailling logs.."; |
mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; | mail -E -s "Disclosr errors" maxious@lambdacomplex.org < /tmp/disclosr-error ; |
fi | fi |
#http://packages.python.org/CouchDB/client.html | #http://packages.python.org/CouchDB/client.html |
import couchdb | import couchdb |
import urllib2 | import urllib2 |
from BeautifulSoup import BeautifulSoup | from BeautifulSoup import BeautifulSoup |
import re | import re |
import hashlib | import hashlib |
from urlparse import urljoin | from urlparse import urljoin |
import time | import time |
import os | import os |
import sys | import sys |
import mimetypes | import mimetypes |
import urllib | import urllib |
import urlparse | import urlparse |
import socket | import socket |
#couch = couchdb.Server('http://192.168.1.148:5984/') | #couch = couchdb.Server('http://192.168.1.148:5984/') |
#couch = couchdb.Server('http://192.168.1.113:5984/') | #couch = couchdb.Server('http://192.168.1.113:5984/') |
couch = couchdb.Server('http://127.0.0.1:5984/') | couch = couchdb.Server('http://127.0.0.1:5984/') |
def mkhash(input): | def mkhash(input): |
return hashlib.md5(input).hexdigest().encode("utf-8") | return hashlib.md5(input).hexdigest().encode("utf-8") |
def canonurl(url): | def canonurl(url): |
r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' | r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' |
if the URL looks invalid. | if the URL looks invalid. |
>>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws | >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws |
'http://xn--hgi.ws/' | 'http://xn--hgi.ws/' |
""" | """ |
# strip spaces at the ends and ensure it's prefixed with 'scheme://' | # strip spaces at the ends and ensure it's prefixed with 'scheme://' |
url = url.strip() | url = url.strip() |
if not url: | if not url: |
return '' | return '' |
if not urlparse.urlsplit(url).scheme: | if not urlparse.urlsplit(url).scheme: |
url = 'http://' + url | url = 'http://' + url |
# turn it into Unicode | # turn it into Unicode |
#try: | #try: |
# url = unicode(url, 'utf-8') | # url = unicode(url, 'utf-8') |
#except UnicodeDecodeError: | #except UnicodeDecodeError: |
# return '' # bad UTF-8 chars in URL | # return '' # bad UTF-8 chars in URL |
# parse the URL into its components | # parse the URL into its components |
parsed = urlparse.urlsplit(url) | parsed = urlparse.urlsplit(url) |
scheme, netloc, path, query, fragment = parsed | scheme, netloc, path, query, fragment = parsed |
# ensure scheme is a letter followed by letters, digits, and '+-.' chars | # ensure scheme is a letter followed by letters, digits, and '+-.' chars |
if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I): | if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I): |
return '' | return '' |
scheme = str(scheme) | scheme = str(scheme) |
# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port] | # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port] |
match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I) | match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I) |
if not match: | if not match: |
return '' | return '' |
domain, port = match.groups() | domain, port = match.groups() |
netloc = domain + (port if port else '') | netloc = domain + (port if port else '') |
netloc = netloc.encode('idna') | netloc = netloc.encode('idna') |
# ensure path is valid and convert Unicode chars to %-encoded | # ensure path is valid and convert Unicode chars to %-encoded |
if not path: | if not path: |
path = '/' # eg: 'http://google.com' -> 'http://google.com/' | path = '/' # eg: 'http://google.com' -> 'http://google.com/' |
path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;') | path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;') |
# ensure query is valid | # ensure query is valid |
query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/') | query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/') |
# ensure fragment is valid | # ensure fragment is valid |
fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8'))) | fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8'))) |
# piece it all back together, truncating it to a maximum of 4KB | # piece it all back together, truncating it to a maximum of 4KB |
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) | url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) |
return url[:4096] | return url[:4096] |
def fullurl(url, href): | def fullurl(url, href): |
href = href.replace(" ", "%20") | href = href.replace(" ", "%20") |
href = re.sub('#.*$', '', href) | href = re.sub('#.*$', '', href) |
return urljoin(url, href) | return urljoin(url, href) |
#http://diveintopython.org/http_web_services/etags.html | #http://diveintopython.org/http_web_services/etags.html |
class NotModifiedHandler(urllib2.BaseHandler): | class NotModifiedHandler(urllib2.BaseHandler): |
def http_error_304(self, req, fp, code, message, headers): | def http_error_304(self, req, fp, code, message, headers): |
addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) | addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) |
addinfourl.code = code | addinfourl.code = code |
return addinfourl | return addinfourl |
def getLastAttachment(docsdb, url): | def getLastAttachment(docsdb, url): |
hash = mkhash(url) | hash = mkhash(url) |
doc = docsdb.get(hash) | doc = docsdb.get(hash) |
if doc != None and "_attachments" in doc.keys(): | if doc != None and "_attachments" in doc.keys(): |
last_attachment_fname = doc["_attachments"].keys()[-1] | last_attachment_fname = doc["_attachments"].keys()[-1] |
last_attachment = docsdb.get_attachment(doc, last_attachment_fname) | last_attachment = docsdb.get_attachment(doc, last_attachment_fname) |
return last_attachment | return last_attachment |
else: | else: |
return None | return None |
def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True): | def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True): |
url = canonurl(url) | url = canonurl(url) |
hash = mkhash(url) | hash = mkhash(url) |
req = urllib2.Request(url) | req = urllib2.Request(url) |
print "Fetching %s (%s)" % (url, hash) | print "Fetching %s (%s)" % (url, hash) |
if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": | if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": |
print >> sys.stderr, "Not a valid HTTP url" | print >> sys.stderr, "Not a valid HTTP url" |
return (None, None, None) | return (None, None, None) |
doc = docsdb.get(hash) | doc = docsdb.get(hash) |
if doc == None: | if doc == None: |
doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'} | doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'} |
else: | else: |
if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): | if (('page_scraped' in doc) and ((time.time() - doc['page_scraped']) < 60 * 24 * 14) or (scrape_again == False)): |
print "Uh oh, trying to scrape URL again too soon!" + hash | print "Uh oh, trying to scrape URL again too soon!" + hash |
if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys(): | if (not doc.has_key('file_size') or doc["file_size"] != "0") and "_attachments" in doc.keys(): |
last_attachment_fname = doc["_attachments"].keys()[-1] | last_attachment_fname = doc["_attachments"].keys()[-1] |
last_attachment = docsdb.get_attachment(doc, last_attachment_fname) | last_attachment = docsdb.get_attachment(doc, last_attachment_fname) |
content = last_attachment.read() | content = last_attachment.read() |
mime_type = doc['mime_type'] | mime_type = doc['mime_type'] |
else: | else: |
content = None | content = None |
mime_type = None | mime_type = None |
return (doc['url'], mime_type, content) | return (doc['url'], mime_type, content) |
req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") | req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") |
#if there is a previous version stored in couchdb, load caching helper tags | #if there is a previous version stored in couchdb, load caching helper tags |
if doc.has_key('etag'): | if doc.has_key('etag'): |
req.add_header("If-None-Match", doc['etag']) | req.add_header("If-None-Match", doc['etag']) |
if doc.has_key('last_modified'): | if doc.has_key('last_modified'): |
req.add_header("If-Modified-Since", doc['last_modified']) | req.add_header("If-Modified-Since", doc['last_modified']) |
opener = urllib2.build_opener(NotModifiedHandler()) | opener = urllib2.build_opener(NotModifiedHandler()) |
try: | try: |
url_handle = opener.open(req, None, 20) | url_handle = opener.open(req, None, 20) |
doc['url'] = url_handle.geturl() # may have followed a redirect to a new url | doc['url'] = url_handle.geturl() # may have followed a redirect to a new url |
headers = url_handle.info() # the addinfourls have the .info() too | headers = url_handle.info() # the addinfourls have the .info() too |
doc['etag'] = headers.getheader("ETag") | doc['etag'] = headers.getheader("ETag") |
doc['last_modified'] = headers.getheader("Last-Modified") | doc['last_modified'] = headers.getheader("Last-Modified") |
doc['date'] = headers.getheader("Date") | doc['date'] = headers.getheader("Date") |
doc['page_scraped'] = time.time() | doc['page_scraped'] = time.time() |
doc['web_server'] = headers.getheader("Server") | doc['web_server'] = headers.getheader("Server") |
doc['via'] = headers.getheader("Via") | doc['via'] = headers.getheader("Via") |
doc['powered_by'] = headers.getheader("X-Powered-By") | doc['powered_by'] = headers.getheader("X-Powered-By") |
doc['file_size'] = headers.getheader("Content-Length") | doc['file_size'] = headers.getheader("Content-Length") |
content_type = headers.getheader("Content-Type") | content_type = headers.getheader("Content-Type") |
if content_type != None: | if content_type != None: |
doc['mime_type'] = content_type.split(";")[0] | doc['mime_type'] = content_type.split(";")[0] |
else: | else: |
(type, encoding) = mimetypes.guess_type(url) | (type, encoding) = mimetypes.guess_type(url) |
doc['mime_type'] = type | doc['mime_type'] = type |
if hasattr(url_handle, 'code'): | if hasattr(url_handle, 'code'): |
if url_handle.code == 304: | if url_handle.code == 304: |
print "the web page has not been modified" + hash | print "the web page has not been modified" + hash |
last_attachment_fname = doc["_attachments"].keys()[-1] | last_attachment_fname = doc["_attachments"].keys()[-1] |
last_attachment = docsdb.get_attachment(doc, last_attachment_fname) | last_attachment = docsdb.get_attachment(doc, last_attachment_fname) |
content = last_attachment | content = last_attachment |
return (doc['url'], doc['mime_type'], content.read()) | return (doc['url'], doc['mime_type'], content.read()) |
else: | else: |
print "new webpage loaded" | print "new webpage loaded" |
content = url_handle.read() | content = url_handle.read() |
docsdb.save(doc) | docsdb.save(doc) |
doc = docsdb.get(hash) # need to get a _rev | doc = docsdb.get(hash) # need to get a _rev |
docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type']) | docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type']) |
return (doc['url'], doc['mime_type'], content) | return (doc['url'], doc['mime_type'], content) |
#store as attachment epoch-filename | #store as attachment epoch-filename |
except (urllib2.URLError, socket.timeout) as e: | except (urllib2.URLError, socket.timeout) as e: |
print >> sys.stderr,"error!" | print >> sys.stderr,"error!" |
error = "" | error = "" |
if hasattr(e, 'reason'): | if hasattr(e, 'reason'): |
error = "error %s in downloading %s" % (str(e.reason), url) | error = "error %s in downloading %s" % (str(e.reason), url) |
elif hasattr(e, 'code'): | elif hasattr(e, 'code'): |
error = "error %s in downloading %s" % (e.code, url) | error = "error %s in downloading %s" % (e.code, url) |
print >> sys.stderr, error | print >> sys.stderr, error |
doc['error'] = error | doc['error'] = error |
docsdb.save(doc) | docsdb.save(doc) |
return (None, None, None) | return (None, None, None) |
def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): | def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): |
(url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID) | (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID) |
badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] | badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] |
if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": | if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": | if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": |
# http://www.crummy.com/software/BeautifulSoup/documentation.html | # http://www.crummy.com/software/BeautifulSoup/documentation.html |
soup = BeautifulSoup(content) | soup = BeautifulSoup(content) |
navIDs = soup.findAll( | navIDs = soup.findAll( |
id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) | id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) |
for nav in navIDs: | for nav in navIDs: |
print "Removing element", nav['id'] | print "Removing element", nav['id'] |
nav.extract() | nav.extract() |
navClasses = soup.findAll( | navClasses = soup.findAll( |
attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) | attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) |
for nav in navClasses: | for nav in navClasses: |
print "Removing element", nav['class'] | print "Removing element", nav['class'] |
nav.extract() | nav.extract() |
links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) | links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) |
linkurls = set([]) | linkurls = set([]) |
for link in links: | for link in links: |
if link.has_key("href"): | if link.has_attr("href"): |
if link['href'].startswith("http"): | if link['href'].startswith("http"): |
# lets not do external links for now | # lets not do external links for now |
# linkurls.add(link['href']) | # linkurls.add(link['href']) |
None | None |
if link['href'].startswith("mailto"): | if link['href'].startswith("mailto"): |
# not http | # not http |
None | None |
if link['href'].startswith("javascript"): | if link['href'].startswith("javascript"): |
# not http | # not http |
None | None |
else: | else: |
# remove anchors and spaces in urls | # remove anchors and spaces in urls |
linkurls.add(fullurl(url, link['href'])) | linkurls.add(fullurl(url, link['href'])) |
for linkurl in linkurls: | for linkurl in linkurls: |
#print linkurl | #print linkurl |
scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID) | scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID) |
# select database | # select database |
agencydb = couch['disclosr-agencies'] | agencydb = couch['disclosr-agencies'] |
docsdb = couch['disclosr-documents'] | docsdb = couch['disclosr-documents'] |
if __name__ == "__main__": | if __name__ == "__main__": |
for row in agencydb.view('app/all'): #not recently scraped agencies view? | for row in agencydb.view('app/all'): #not recently scraped agencies view? |
agency = agencydb.get(row.id) | agency = agencydb.get(row.id) |
print agency['name'] | print agency['name'] |
for key in agency.keys(): | for key in agency.keys(): |
if key == "FOIDocumentsURL" and "status" not in agency.keys() and False: | if key == "FOIDocumentsURL" and "status" not in agency.keys() and False: |
scrapeAndStore(docsdb, agency[key], 0, key, agency['_id']) | scrapeAndStore(docsdb, agency[key], 0, key, agency['_id']) |
if key == 'website' and True: | if key == 'website' and True: |
scrapeAndStore(docsdb, agency[key], 0, key, agency['_id']) | scrapeAndStore(docsdb, agency[key], 0, key, agency['_id']) |
if "metadata" not in agency.keys(): | if "metadata" not in agency.keys(): |
agency['metadata'] = {} | agency['metadata'] = {} |
agency['metadata']['lastScraped'] = time.time() | agency['metadata']['lastScraped'] = time.time() |
if key.endswith('URL') and False: | if key.endswith('URL') and False: |
print key | print key |
depth = 1 | depth = 1 |
if 'scrapeDepth' in agency.keys(): | if 'scrapeDepth' in agency.keys(): |
depth = agency['scrapeDepth'] | depth = agency['scrapeDepth'] |
scrapeAndStore(docsdb, agency[key], depth, key, agency['_id']) | scrapeAndStore(docsdb, agency[key], depth, key, agency['_id']) |
agencydb.save(agency) | agencydb.save(agency) |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): | class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): |
def getTable(self,soup): | def getTable(self,soup): |
return soup.find(id = "maincontentcontainer").table | return soup.find(class_ = "contentcontainer").table |
def getColumnCount(self): | def getColumnCount(self): |
return 5 | return 5 |
def getColumns(self,columns): | def getColumns(self,columns): |
(date, disclogdate, title, description, notes) = columns | (date, disclogdate, title, description, notes) = columns |
return (date, date, title, description, notes) | return (date, date, title, description, notes) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import dateutil | import dateutil |
from dateutil.parser import * | from dateutil.parser import * |
from datetime import * | from datetime import * |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): | class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): |
def __init__(self): | def __init__(self): |
super(ScraperImplementation, self).__init__() | super(ScraperImplementation, self).__init__() |
def getDescription(self,content, entry,doc): | def getDescription(self,content, entry,doc): |
link = None | link = None |
links = [] | links = [] |
description = "" | description = "" |
for atag in entry.find_all('a'): | for atag in entry.find_all('a'): |
if atag.has_key('href'): | if atag.has_attr('href'): |
link = scrape.fullurl(self.getURL(), atag['href']) | link = scrape.fullurl(self.getURL(), atag['href']) |
(url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) | (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) |
if htcontent != None: | if htcontent != None: |
if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": | if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": |
soup = BeautifulSoup(htcontent) | soup = BeautifulSoup(htcontent) |
row = soup.find(id="content_div_148050") | row = soup.find(id="content_div_148050") |
description = ''.join(row.stripped_strings) | description = ''.join(row.stripped_strings) |
for atag in row.find_all("a"): | for atag in row.find_all("a"): |
if atag.has_key('href'): | if atag.has_attr('href'): |
links.append(scrape.fullurl(link, atag['href'])) | links.append(scrape.fullurl(link, atag['href'])) |
if links != []: | if links != []: |
doc.update({'links': links}) | doc.update({'links': links}) |
if description != "": | if description != "": |
doc.update({ 'description': description}) | doc.update({ 'description': description}) |
def getColumnCount(self): | def getColumnCount(self): |
return 4 | return 4 |
def getColumns(self, columns): | def getColumns(self, columns): |
(id, date, datepub, title) = columns | (id, date, datepub, title) = columns |
return (id, date, title, title, None) | return (id, date, title, title, None) |
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
nsi = ScraperImplementation() | nsi = ScraperImplementation() |
nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=1" | nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=1" |
nsi.doScrape() | nsi.doScrape() |
nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=2" | nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=2" |
nsi.doScrape() | nsi.doScrape() |
nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=3" | nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=3" |
nsi.doScrape() | nsi.doScrape() |
nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=4" | nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=4" |
nsi.doScrape() | nsi.doScrape() |
nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=5" | nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=5" |
nsi.doScrape() | nsi.doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for a two-column (date, title) disclosure log table.

    Each entry links to a detail page; the description and document links
    are collected from that page rather than from the listing itself.
    """

    def getDescription(self, content, entry, doc):
        """Follow every link in ``entry`` and fill ``doc`` from the detail pages.

        Sets ``doc['links']`` and ``doc['description']`` when anything was found.
        """
        link = None
        links = []
        description = ""
        html_types = ("text/html", "application/xhtml+xml", "application/xml")
        for anchor in entry.find_all('a'):
            if not anchor.has_attr('href'):
                continue
            link = scrape.fullurl(self.getURL(), anchor['href'])
            (url, mime_type, htcontent) = scrape.fetchURL(
                scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
            # Only HTML-like responses are parsed.
            if htcontent is None or mime_type not in html_types:
                continue
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(htcontent)
            rowtitle = soup.find(class_="wc-title").find("h1").string
            if rowtitle is not None:
                # The page heading (re)starts the description.
                description = rowtitle + ": "
            for cell in soup.find(class_="wc-content").find_all('td'):
                if cell is None:
                    continue
                for text in cell.stripped_strings:
                    description = description + text + "\n"
                for cell_anchor in cell.find_all("a"):
                    if cell_anchor.has_attr('href'):
                        links.append(scrape.fullurl(link, cell_anchor['href']))
        if links:
            doc.update({'links': links})
        if description:
            doc.update({'description': description})

    def getColumnCount(self):
        """The listing table has two columns."""
        return 2

    def getTable(self, soup):
        """Return the element carrying the ``ms-rteTable-default`` class."""
        return soup.find(class_="ms-rteTable-default")

    def getColumns(self, columns):
        """Map (date, title) cells to (id, date, title, description, notes).

        The title cell is reused as id and description; there are no notes.
        """
        (released, heading) = columns
        return (heading, released, heading, heading, None)
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for a five-column disclosure log whose entries link to article pages."""

    def getTable(self, soup):
        """Return the table inside the element with class ``ms-rtestate-field``."""
        rich_text_field = soup.find(class_="ms-rtestate-field")
        return rich_text_field.table

    def getColumns(self, columns):
        """Cells already arrive in (id, date, title, description, notes) order."""
        (entry_id, date, title, description, notes) = columns
        return (entry_id, date, title, description, notes)

    def getLinks(self, content, entry, doc):
        """Collect document links from the article pages linked in ``entry``.

        Sets ``doc['links']`` when any were found and always sets
        ``doc['url']`` to the last entry link seen (None when there was none).
        """
        link = None
        links = []
        html_types = ("text/html", "application/xhtml+xml", "application/xml")
        for anchor in entry.find_all('a'):
            if not anchor.has_attr('href'):
                continue
            link = scrape.fullurl(self.getURL(), anchor['href'])
            (url, mime_type, htcontent) = scrape.fetchURL(
                scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
            if htcontent is None or mime_type not in html_types:
                continue
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(htcontent)
            article = soup.find(class_="article-content")
            for article_anchor in article.find_all('a'):
                if article_anchor.has_attr('href'):
                    links.append(scrape.fullurl(link, article_anchor['href']))
        if links:
            doc.update({'links': links})
        doc.update({'url': link})
        return
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for a four-column (date, id, title, description) disclosure log."""

    def getDocHash(self, id, date, url):
        ''' The url changes on every request, so only the id cell feeds the hash. '''
        id_text = ''.join(id.stripped_strings)
        cleaned = self.remove_control_chars(id_text)
        return scrape.mkhash(cleaned)

    def getColumnCount(self):
        """The listing table has four columns."""
        return 4

    def getColumns(self, columns):
        """Reorder (date, id, title, description) to put the id first; no notes."""
        (released, entry_id, heading, summary) = columns
        return (entry_id, released, heading, summary, None)
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for a five-column disclosure log table under ``#centercontent``."""

    def getTable(self, soup):
        """Return the table inside the element with id ``centercontent``."""
        container = soup.find(id="centercontent")
        return container.table

    def getColumnCount(self):
        """The listing table has five columns."""
        return 5

    def getColumns(self, columns):
        """Cells already arrive in (id, date, title, description, notes) order."""
        (entry_id, released, heading, summary, remarks) = columns
        return (entry_id, released, heading, summary, remarks)
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
import codecs | import codecs |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for the two-column (title, date) listing in ``#TwoColumnSorting``."""

    def getDescription(self, content, entry, doc):
        """Follow every link in ``entry`` and fill ``doc`` from the detail pages.

        The description is the ASCII-only text of the ``mainContent`` element;
        links come from the ``SortingTable`` element of the same page.
        """
        link = None
        links = []
        description = ""
        html_types = ("text/html", "application/xhtml+xml", "application/xml")
        for anchor in entry.find_all('a'):
            if not anchor.has_attr('href'):
                continue
            link = scrape.fullurl(self.getURL(), anchor['href'])
            (url, mime_type, htcontent) = scrape.fetchURL(
                scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
            if htcontent is None or mime_type not in html_types:
                continue
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(htcontent)
            main_content = soup.find(class_="mainContent")
            for text in main_content.stripped_strings:
                # Non-ASCII characters are dropped, not replaced.
                description = description + text.encode('ascii', 'ignore')
            sorting_table = soup.find(id="SortingTable")
            for table_anchor in sorting_table.find_all("a"):
                if table_anchor.has_attr('href'):
                    links.append(scrape.fullurl(link, table_anchor['href']))
        if links:
            doc.update({'links': links})
        if description:
            doc.update({'description': description})

    def getColumnCount(self):
        """The listing table has two columns."""
        return 2

    def getTable(self, soup):
        """Return the element with id ``TwoColumnSorting``."""
        return soup.find(id="TwoColumnSorting")

    def getColumns(self, columns):
        """Map (title, date) cells to (id, date, title, description, notes).

        The title cell is reused as id and description; there are no notes.
        """
        (heading, released) = columns
        return (heading, released, heading, heading, None)
class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for the two-column (date, title) listing with class ``doc-list``."""

    def getDescription(self, content, entry, doc):
        """Follow every link in ``entry`` and fill ``doc`` from the detail pages.

        Both the description text and the document links are taken from the
        ``content-item`` element of each detail page.
        """
        link = None
        links = []
        description = ""
        html_types = ("text/html", "application/xhtml+xml", "application/xml")
        for anchor in entry.find_all('a'):
            if not anchor.has_attr('href'):
                continue
            link = scrape.fullurl(self.getURL(), anchor['href'])
            (url, mime_type, htcontent) = scrape.fetchURL(
                scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
            if htcontent is None or mime_type not in html_types:
                continue
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
            soup = BeautifulSoup(htcontent)
            for text in soup.find(id="content-item").stripped_strings:
                description = description + text + " \n"
            for item_anchor in soup.find(id="content-item").find_all("a"):
                if item_anchor.has_attr('href'):
                    links.append(scrape.fullurl(link, item_anchor['href']))
        # Stored once, after every linked page has been processed; the values
        # only accumulate, so this leaves the same final state in doc.
        if links:
            doc.update({'links': links})
        if description:
            doc.update({'description': description})

    def getColumnCount(self):
        """The listing table has two columns."""
        return 2

    def getTable(self, soup):
        """Return the element carrying the ``doc-list`` class."""
        return soup.find(class_="doc-list")

    def getColumns(self, columns):
        """Map (date, title) cells to (id, date, title, description, notes).

        The title cell is reused as id and description; there are no notes.
        """
        (released, heading) = columns
        return (heading, released, heading, heading, None)
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
NewScraperImplementation().doScrape() | NewScraperImplementation().doScrape() |
print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
osi = OldScraperImplementation() | osi = OldScraperImplementation() |
osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI" | osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI" |
osi.doScrape() | osi.doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
#RSS feed not detailed | #RSS feed not detailed |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper):
    """RSS-based scraper that enriches each feed entry from its linked page."""

    def getDescription(self, content, entry, doc):
        """Fetch ``entry.link`` and fill ``doc`` from the page it points to.

        Download links are read from the ``div.download`` element; the
        description is built from the ``div.incompleteNotification`` text
        followed by one "<heading>: <text>" line per table row.
        Sets ``doc['links']`` / ``doc['description']`` when non-empty.
        """
        (url, mime_type, htcontent) = scrape.fetchURL(
            scrape.docsdb, entry.link, "foidocuments", self.getAgencyID(), False)
        if htcontent is None:
            return
        if mime_type not in ("text/html", "application/xhtml+xml", "application/xml"):
            return
        # http://www.crummy.com/software/BeautifulSoup/documentation.html
        # Parse the page that was just fetched. Previously ``content`` was
        # parsed here, leaving the fetched and MIME-checked ``htcontent`` unused.
        # NOTE(review): confirm no caller relies on ``content`` being HTML.
        soup = BeautifulSoup(htcontent)
        links = []
        description = ""
        dldivs = soup.find('div', class_="download")
        if dldivs is not None:
            for atag in dldivs.find_all("a"):
                if atag.has_attr('href'):
                    links.append(scrape.fullurl(url, atag['href']))
        nodldivs = soup.find('div', class_="incompleteNotification")
        if nodldivs is not None:
            for text in nodldivs.stripped_strings:
                description = description + text
        # A page without a table simply contributes no row text.
        table = soup.table
        rows = table.find_all('tr') if table is not None else []
        for row in rows:
            heading = row.find('th')
            body = row.find('div')
            if heading is None or body is None:
                # Skip rows that lack the heading/div pair instead of crashing.
                continue
            label = heading.string
            if label is None:
                # .string is None when the heading holds nested markup.
                label = heading.get_text()
            description = description + "\n" + label + ": "
            for text in body.stripped_strings:
                description = description + text
        if links:
            doc.update({'links': links})
        if description:
            doc.update({'description': description})
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericRSSDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
import sys,os | import sys,os |
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) | sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
import genericScrapers | import genericScrapers |
import scrape | import scrape |
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup |
#http://www.doughellmann.com/PyMOTW/abc/ | #http://www.doughellmann.com/PyMOTW/abc/ |
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Scraper for a seven-column disclosure log table under ``#main``."""

    def getTable(self, soup):
        """Return the table inside the element with id ``main``."""
        main_section = soup.find(id="main")
        return main_section.table

    def getColumnCount(self):
        """The listing table has seven columns."""
        return 7

    def getColumns(self, columns):
        """Keep (id, date, title, description, notes); drop the link and deldate cells."""
        (entry_id, released, heading, summary, _link, _deldate, remarks) = columns
        return (entry_id, released, heading, summary, remarks)
if __name__ == '__main__': | if __name__ == '__main__': |
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) | print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) |
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) | print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) |
ScraperImplementation().doScrape() | ScraperImplementation().doScrape() |
<?php
// Emits an XML sitemap: the index page, every other PHP page in this
// directory, one URL per agency and one URL per FOI document.
include ('../include/common.inc.php');
header("Content-Type: text/xml");
echo "<?xml version='1.0' encoding='UTF-8'?>";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
echo " <url><loc>" . local_url() . "index.php</loc><priority>1.0</priority></url>\n";

// Static pages: any file name containing ".php" except the ones listed here.
$skippedPages = Array("index.php", "sitemap.xml.php", "viewDocument.php");
foreach (scandir("./") as $file) {
    if (strpos($file, ".php") === false || in_array($file, $skippedPages, true)) {
        continue;
    }
    echo " <url><loc>" . local_url() . "$file</loc><priority>0.6</priority></url>\n";
}

$agenciesdb = $server->get_db('disclosr-agencies');
$foidocsdb = $server->get_db('disclosr-foidocuments');

// Agency pages.
try {
    $rows = $agenciesdb->get_view("app", "byCanonicalName")->rows;
    foreach ($rows as $row) {
        echo '<url><loc>' . local_url() . 'agency.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
    }
    unset($rows);
    $rows = null;
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}

// Document pages, fetched one key range at a time: (0,1) ... (8,9), then
// ('9','fffffffff'). Each range has its own try/catch so that a failure in
// one range does not stop the following ranges.
$keyRanges = Array();
foreach (range(0, 8) as $number) {
    $keyRanges[] = Array($number, $number + 1);
}
$keyRanges[] = Array('9', 'fffffffff');
foreach ($keyRanges as $keyRange) {
    try {
        $rows = $foidocsdb->get_view("app", "all", $keyRange)->rows;
        foreach ($rows as $row) {
            echo '<url><loc>' . local_url() . 'view.php?id=' . $row->value->_id . "</loc><priority>0.3</priority></url>\n";
        }
        unset($rows);
        $rows = null;
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
}
echo '</urlset>';
?>
<?php
// Exports agencies that have an FOI email address as a CSV download.
// use https://github.com/okfn/publicbodies/blob/master/data/nz.csv format
include_once("include/common.inc.php");
setlocale(LC_CTYPE, 'C');
// Column names, in output order. Every row is built from this list so the
// cells can never drift out of step with the header line.
$headers = Array("title","abbr","key","category","parent","parent_key","description","url","jurisdiction","jurisdiction_code","source","source_url","address","contact","email","tags","created_at","updated_at");
$db = $server->get_db('disclosr-agencies');
// NOTE(review): this lookup table is filled but not read again in this
// script; the query is kept because a failure here aborts the export.
$foiEmail = Array();
try {
    $rows = $db->get_view("app", "foiEmails", null, true)->rows;
    //print_r($rows);
    foreach ($rows as $row) {
        $foiEmail[$row->key] = $row->value;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
    die();
}
$fp = fopen('php://output', 'w');
if ($fp && $db) {
    header('Content-Type: text/csv; charset=utf-8');
    header('Content-Disposition: attachment; filename="export.' . date("c") . '.csv"');
    header('Pragma: no-cache');
    header('Expires: 0');
    fputcsv($fp, $headers);
    try {
        $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows;
        //print_r($rows);
        foreach ($agencies as $agency) {
            // print_r($agency);
            $details = $agency->value;
            // Only agencies with a real FOI email and no status field are exported.
            if (!isset($details->foiEmail) || $details->foiEmail == "null" || isset($details->status)) {
                continue;
            }
            // Start from one empty cell per header (in header order); columns
            // without data stay empty. This replaces the hand-written list of
            // empty assignments, which used the key "parentkey" where the
            // header is "parent_key".
            $row = array_fill_keys($headers, "");
            $row["title"] = trim($details->name);
            if (isset($details->shortName)) {
                $row["abbr"] = $details->shortName;
                $row["key"] = "au/" . strtolower($details->shortName);
            }
            if (isset($details->description)) {
                $row["description"] = $details->description;
            }
            if (isset($details->website)) {
                $row["url"] = $details->website;
            }
            $row["jurisdiction"] = "Australia";
            $row["jurisdiction_code"] = "au";
            $row["email"] = $details->foiEmail;

            // Other bodies and offices this agency answers FOI requests for.
            $otherBodies = Array();
            if (isset($details->foiBodies)) {
                $otherBodies = array_merge($otherBodies, $details->foiBodies);
            }
            if (isset($details->positions)) {
                $positions = Array();
                foreach ($details->positions as $position) {
                    $positions[] = "Office of the " . $position;
                }
                $otherBodies = array_merge($otherBodies, $positions);
            }
            sort($otherBodies);
            if (count($otherBodies) > 0) {
                $row["description"] .= "<br/> This department also responds to requests for information held by " . implode(", ", $otherBodies);
            }
            fputcsv($fp, array_values($row));
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
    die;
}
?>
<?php | <?php |
include_once('include/common.inc.php'); | include_once('include/common.inc.php'); |
/**
 * Echo the HTML for one agency field.
 *
 * In "view" mode a table row is printed (label cell plus value cell); keys
 * starting with "_" and keys in the ignore list print nothing. In "edit"
 * mode form controls are printed instead. Any other mode prints nothing.
 *
 * Values placed inside HTML attributes are escaped so that quotes in the
 * data cannot break the markup. Values printed as element text are left
 * as-is. NOTE(review): they may intentionally contain markup - confirm
 * before escaping those too.
 *
 * @param string $key   field name
 * @param mixed  $value scalar field value, or an array of sub-values
 * @param string $mode  "view" or "edit"
 */
function displayValue($key, $value, $mode)
{
    global $db, $schemas;
    $ignoreKeys = Array("metadata", "metaTags", "statistics", "rtkURLs", "rtkDescriptions");
    // Schema entry for this field, or null when the schema does not know it.
    $property = isset($schemas['agency']["properties"][$key]) ? $schemas['agency']["properties"][$key] : null;
    $safeKey = htmlspecialchars((string) $key, ENT_QUOTES);
    $isLinkKey = (strpos($key, "URL") > 0 || $key == 'website');

    if ($mode == "view") {
        if (strpos($key, "_") === 0 || in_array($key, $ignoreKeys))
            return;
        echo "<tr>";
        echo "<td class='$safeKey'>";
        if ($property !== null) {
            echo $property['x-title'] . "<br><small>" . $property['description'] . "</small>";
        }
        echo "</td><td>";
        if (is_array($value)) {
            echo "<ol>";
            foreach ($value as $subkey => $subvalue) {
                echo "<li ";
                if (isset($property['x-property'])) {
                    echo ' property="' . $property['x-property'] . '" ';
                }
                if (isset($property['x-itemprop'])) {
                    echo ' itemprop="' . $property['x-itemprop'] . '" ';
                }
                echo " >";
                echo "$subvalue</li>";
            }
            // Only the list is closed here; the cell and row are closed once
            // below (previously they were closed twice for array values).
            echo "</ol>";
        } else {
            if (isset($property['x-property'])) {
                echo '<span property="' . $property['x-property'] . '">';
            } else {
                echo "<span>";
            }
            if ($isLinkKey && $value != "") {
                $safeValue = htmlspecialchars((string) $value, ENT_QUOTES);
                echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$safeValue'>$safeValue</a>";
            } else if ($key == 'abn') {
                $safeValue = htmlspecialchars((string) $value, ENT_QUOTES);
                echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=" . urlencode((string) $value) . "'>$safeValue</a>";
            } else {
                echo "$value";
            }
            echo "</span>";
        }
        echo "</td></tr>";
    }
    if ($mode == "edit") {
        if (is_array($value)) {
            echo '<div class="row">
<div class="seven columns">
<fieldset>
<h5>' . $key . '</h5>';
            foreach ($value as $subkey => $subvalue) {
                $safeSubkey = htmlspecialchars((string) $subkey, ENT_QUOTES);
                $safeSubvalue = htmlspecialchars((string) $subvalue, ENT_QUOTES);
                // No table row is open in edit mode, so no </tr> is emitted.
                echo "<label>$subkey</label><input class='input-text' type='text' id='$safeKey$safeSubkey' name='$safeKey" . '[' . $safeSubkey . "]' value='$safeSubvalue'/>";
            }
            echo "</fieldset>
</div>
</div>";
        } else {
            if (strpos($key, "_") === 0) {
                $safeValue = htmlspecialchars((string) $value, ENT_QUOTES);
                echo "<input type='hidden' id='$safeKey' name='$safeKey' value='$safeValue'/>";
            } else if ($key == "parentOrg") {
                echo "<label for='$safeKey'>$key</label><select id='$safeKey' name='$safeKey'><option value=''> Select... </option>";
                $rows = $db->get_view("app", "byDeptStateName")->rows;
                //print_r($rows);
                foreach ($rows as $row) {
                    $optionValue = htmlspecialchars((string) $row->value, ENT_QUOTES);
                    // Leading space keeps SELECTED separate from the value attribute.
                    echo "<option value='$optionValue'" . (($row->value == $value) ? " SELECTED" : "") . " >" . str_replace("Department of ", "", $row->key) . "</option>";
                }
                echo " </select>";
            } else {
                $safeValue = htmlspecialchars((string) $value, ENT_QUOTES);
                echo "<label>$key</label><input class='input-text' type='text' id='$safeKey' name='$safeKey' value='$safeValue'/>";
                if ($isLinkKey && $value != "") {
                    echo "<a " . ($key == 'website' ? 'itemprop="url"' : '') . " href='$safeValue'>view</a>";
                }
                if ($key == 'abn') {
                    echo "<a href='http://www.abr.business.gov.au/SearchByAbn.aspx?SearchText=" . urlencode((string) $value) . "'>view abn</a>";
                }
            }
        }
    }
    //
}
/**
 * Pads an agency record with the fields declared in the agency schema so
 * that an edit form has an input for every known property.
 *
 * - A missing "string" field becomes "".
 * - A missing "array" field becomes a one-element array holding "".
 * - An existing "array" field that already is an array gets three blank
 *   entries appended; a scalar stored there is wrapped into an array and
 *   followed by two blank entries.
 *
 * @param array $row Agency record keyed by field name.
 * @return array The record with the default/blank entries added.
 */
function addDefaultFields($row)
{
    global $schemas;
    foreach ($schemas['agency']['properties'] as $fieldName => $fieldSchema) {
        $wantsArray = ($fieldSchema['type'] == "array");
        if (isset($row[$fieldName])) {
            if (!$wantsArray) {
                // Existing non-array fields are left exactly as they are.
                continue;
            }
            if (is_array($row[$fieldName])) {
                array_push($row[$fieldName], "", "", "");
            } else {
                // Promote the lone scalar to a list, then add blank slots.
                $row[$fieldName] = array($row[$fieldName]);
                array_push($row[$fieldName], "", "");
            }
        } elseif ($wantsArray) {
            $row[$fieldName] = array("");
        } elseif ($fieldSchema['type'] == "string") {
            $row[$fieldName] = "";
        }
    }
    return $row;
}
$db = $server->get_db('disclosr-agencies');
if (isset($_REQUEST['id'])) {
    // Single-agency page: load the record named by ?id=, apply a posted
    // edit if there is one, then render the record according to $mode.
    $obj = $db->get($_REQUEST['id']);
    include_header(isset($obj->name) ? $obj->name : "");
    if (!empty($_POST)) {
        // Remove blank submissions so empty form inputs are not stored.
        // The loops iterate by key/value (no references) so that no variable
        // stays bound to an element of $_POST once they finish.
        foreach ($_POST as $postkey => $postvalue) {
            if (is_array($postvalue)) {
                if (count($postvalue) == 1 && isset($postvalue[0]) && $postvalue[0] == "") {
                    unset($_POST[$postkey]);
                } else {
                    foreach ($postvalue as $subkey => $subvalue) {
                        if ($subvalue == "") {
                            unset($_POST[$postkey][$subkey]);
                        }
                    }
                }
            } else if ($postvalue == "") {
                unset($_POST[$postkey]);
            }
        }
        // Save only when the submitted revision is still the stored one.
        // _rev is checked with isset because the blank-stripping above may
        // have removed it.
        if (isset($_POST['_id'], $_POST['_rev']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) {
            echo "Edited version was latest version, continue saving";
            $newdoc = $_POST;
            $newdoc['metadata']['lastModified'] = time();
            $obj = $db->save($newdoc);
        } else {
            echo "ALERT doc revised by someone else while editing. Document not saved.";
        }
    }
    // NOTE(review): $mode is fixed to "view", so every "edit" branch below
    // is currently unreachable — confirm whether that is intended.
    $mode = "view";
    $rowArray = object_to_array($obj);
    ksort($rowArray);
    $row = ($mode == "edit") ? addDefaultFields($rowArray) : $rowArray;

    if ($mode == "view") {
        // Record values are escaped before being placed in the markup.
        $agencyId = htmlspecialchars(isset($row['_id']) ? (string) $row['_id'] : "", ENT_QUOTES, 'UTF-8');
        $agencyName = htmlspecialchars(isset($row['name']) ? (string) $row['name'] : "", ENT_QUOTES, 'UTF-8');

        echo ' <div class="container-fluid">
<div class="row-fluid">
<div class="span3">
<div class="well sidebar-nav">
<ul class="nav nav-list">
<li class="nav-header">Statistics</li>';
        // Show the last entry of each statistic; !empty() also skips an
        // empty list, which has no last entry to show.
        if (!empty($row['statistics']['employees'])) {
            $keys = array_keys($row['statistics']['employees']);
            $lastkey = end($keys);
            echo '<div><i class="icon-user" style="float:left"></i><p style="margin-left:16px;">';
            echo htmlspecialchars((string) $row['statistics']['employees'][$lastkey]['value'], ENT_QUOTES, 'UTF-8')
                . ' employees <small>(' . htmlspecialchars((string) $lastkey, ENT_QUOTES, 'UTF-8') . ')</small>';
            echo '</p></div>';
        }
        if (!empty($row['statistics']['budget'])) {
            $keys = array_keys($row['statistics']['budget']);
            $lastkey = end($keys);
            echo '<div><i class="icon-shopping-cart" style="float:left"></i><p style="margin-left:16px;">';
            echo '$' . number_format(floatval($row['statistics']['budget'][$lastkey]['value']))
                . ' <small>(' . htmlspecialchars((string) $lastkey, ENT_QUOTES, 'UTF-8') . ' budget)</small>';
            echo '</p></div>';
        }
        echo ' </ul>
</div><!--/.well -->
</div><!--/span-->
<div class="span9">';
        echo '<div itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization org:Organization" about="#' . $agencyId . '">';
        echo '<div class="hero-unit">
<h1 itemprop="name">' . $agencyName . '</h1>';
        if (isset($row['description'])) {
            // NOTE(review): description is emitted as-is; escape it too if it
            // is meant to be plain text rather than stored markup.
            echo '<p>' . $row['description'] . '</p>';
        }
        echo '</div><table width="100%">';
        echo "<tr><th>Field Name</th><th>Field Value</th></tr>";
    }
    if ($mode == "edit") {
        ?>
<input id="addfield" type="button" value="Add Field"/>
<script>
window.onload = function () {
$(document).ready(function () {
// put all your jQuery goodness in here.
// http://charlie.griefer.com/blog/2009/09/17/jquery-dynamically-adding-form-elements/
$('#addfield').click(function () {
var field_name = window.prompt("fieldname?", "");
// prompt() returns null on Cancel; a truthiness test rejects both null and "".
if (field_name) {
// Build the nodes through the DOM API so the typed name is never parsed as HTML.
$('#submitbutton').before($('<span></span>')
.append($('<label></label>').text(field_name))
.append($("<input class='input-text' type='text'/>").attr({id: field_name, name: field_name}))
);
}
});
});
};
</script>
<form id="editform" class="nice" method="post">
        <?php
    }
    foreach ($row as $key => $value) {
        echo displayValue($key, $value, $mode);
    }
    if ($mode == "view") {
        echo "</table></div>";
        echo ' </div><!--/span-->
</div><!--/row-->
</div><!--/span-->
</div><!--/row-->';
    }
    if ($mode == "edit") {
        echo '<input id="submitbutton" type="submit"/></form>';
    }
} else {
    // No id given: list every agency, spread over three columns.
    include_header('Agencies');
    echo ' <div class="container-fluid">
<div class="row-fluid">
<div class="span3">
<div class="well sidebar-nav">
<ul class="nav nav-list">
<li class="nav-header">Sidebar</li>';
    echo ' </ul>
</div><!--/.well -->
</div><!--/span-->
<div class="span9">
<div class="hero-unit">
<h1>Australian Government Agencies</h1>
<p>Explore collected information about Australian Government Agencies below.</p>
</div>
<div class="row-fluid">
<div class="span4">';
    try {
        $rows = $db->get_view("app", "byCanonicalName")->rows;
        $rowCount = count($rows);
        // Rows per column, rounded up so there are never more than three
        // columns, and at least 1 so the modulo below cannot divide by zero
        // when fewer than three agencies exist.
        $perColumn = max(1, (int) ceil($rowCount / 3));
        foreach ($rows as $i => $row) {
            if ($i != 0 && $i % $perColumn == 0) {
                echo '</div><div class="span4">';
            }
            $agencyUrl = htmlspecialchars('getAgency.php?id=' . urlencode($row->value->_id), ENT_QUOTES, 'UTF-8');
            $agencyName = isset($row->value->name)
                ? htmlspecialchars($row->value->name, ENT_QUOTES, 'UTF-8')
                : "ERROR NAME MISSING";
            echo '<span itemscope itemtype="http://schema.org/GovernmentOrganization" typeof="schema:GovernmentOrganization foaf:Organization" about="' . $agencyUrl . '">
<a href="' . $agencyUrl . '" rel="schema:url foaf:page" property="schema:name foaf:name" itemprop="url"><span itemprop="name">'
                . $agencyName
                . '</span></a></span><br><br>';
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
    echo ' </div><!--/span-->
</div><!--/row-->
</div><!--/span-->
</div><!--/row-->';
}
include_footer();
?>
<?php | <?php |
include_once('include/common.inc.php'); | include_once('include/common.inc.php'); |
//include_header(); | //include_header(); |
// Output format for the graph: taken from the "format" request parameter,
// falling back to "html" when the parameter is absent.
$format = isset($_REQUEST['format']) ? $_REQUEST['format'] : "html";
/**
 * Emits one graph node in the format selected by the global $format.
 *
 * - "dot":  prints a quoted node statement; nodes with an empty label are
 *           skipped.
 * - "gexf": prints a <node> element with a random viz:color; nodes with a
 *           parent get pid and size 1, others size 2.
 * - "html": prints nothing (the page loads the GEXF output instead).
 *
 * @param string $id     Node identifier.
 * @param string $label  Human-readable node label.
 * @param string $parent Optional parent node id (used by "gexf" only).
 */
function add_node($id, $label, $parent="") {
    global $format;
    if ($format == "dot" && $label != "") {
        // Inside a double-quoted DOT string only '"' and '\' need escaping.
        echo "\"" . addcslashes($id, "\"\\") . "\" [label=\"" . addcslashes($label, "\"\\") . "\", shape=plaintext];" . PHP_EOL;
    }
    if ($format == "gexf") {
        // XML-safe escaping: ENT_XML1 keeps the output to the entities XML
        // predefines; ENT_QUOTES covers the single-quoted attributes.
        $xmlId = htmlspecialchars($id, ENT_QUOTES | ENT_XML1, 'UTF-8');
        $xmlLabel = htmlspecialchars($label, ENT_QUOTES | ENT_XML1, 'UTF-8');
        if ($parent != "") {
            $xmlParent = htmlspecialchars($parent, ENT_QUOTES | ENT_XML1, 'UTF-8');
            $opening = "pid='$xmlParent'><viz:size value='1'/>";
        } else {
            $opening = "><viz:size value='2'/>";
        }
        echo "<node id='$xmlId' label=\"" . $xmlLabel . "\" " . $opening
            . "<viz:color b='" . rand(0, 255) . "' g='" . rand(0, 255) . "' r='" . rand(0, 255) . "'/>"
            . "</node>" . PHP_EOL;
    }
}
/**
 * Emits one directed edge in the format selected by the global $format.
 *
 * - "dot":  prints `"from" -> "to"`, with a color attribute when $color is
 *           non-empty.
 * - "gexf": prints an <edge> element whose id is $from followed by $to;
 *           $color is not used.
 * - "html": prints nothing.
 *
 * @param string $from  Source node id.
 * @param string $to    Target node id.
 * @param string $color Edge colour name for DOT output ("" for none).
 */
function add_edge($from, $to, $color) {
    global $format;
    if ($format == "dot") {
        // Ids are written as quoted DOT strings, so escape '"' and '\'.
        echo "\"" . addcslashes($from, "\"\\") . "\" -> \"" . addcslashes($to, "\"\\") . "\" "
            . ($color != "" ? "[color=$color]" : "") . ";" . PHP_EOL;
    }
    if ($format == "gexf") {
        // Escape for use inside single-quoted XML attributes.
        $xmlFrom = htmlspecialchars($from, ENT_QUOTES | ENT_XML1, 'UTF-8');
        $xmlTo = htmlspecialchars($to, ENT_QUOTES | ENT_XML1, 'UTF-8');
        echo "<edge id='$xmlFrom$xmlTo' source='$xmlFrom' target='$xmlTo' />" . PHP_EOL;
    }
}
// Document prologue for the two serialised formats.
switch ($format) {
    case "gexf":
        //header('Content-Type: text/xml');
        header('Content-Type: application/gexf+xml');
        echo '<?xml version="1.0" encoding="UTF-8"?>
<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.2draft/viz" version="1.2">
<meta lastmodifieddate="2009-03-20">
<creator>Gexf.net</creator>
<description>A hello world! file</description>
</meta>
<graph mode="static" defaultedgetype="directed">
<nodes>'. PHP_EOL;
        break;
    case "dot":
        echo 'digraph g {'. PHP_EOL;
        break;
}

$db = $server->get_db('disclosr-agencies');

// Nodes: a fixed root node, then one node per row of the byCanonicalName view.
add_node("fedg","Federal Government - Commonwealth of Australia");
try {
    foreach ($db->get_view("app", "byCanonicalName", null, true)->rows as $row) {
        add_node($row->id, $row->value->name);
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}

if ($format == "gexf") {
    echo '</nodes>
<edges>'. PHP_EOL;
}

// Edges: the root to every byDeptStateName row, then every parentOrgs
// key -> value pair. Each view is fetched in its own try block so a failure
// in one does not stop the other.
foreach (array("byDeptStateName", "parentOrgs") as $viewName) {
    try {
        foreach ($db->get_view("app", $viewName, null, true)->rows as $row) {
            if ($viewName == "byDeptStateName") {
                add_edge("fedg", $row->value, 'yellow');
            } else {
                add_edge($row->key, $row->value, 'blue');
            }
        }
    } catch (SetteeRestClientException $e) {
        setteErrorHandler($e);
    }
}

// Epilogue. The HTML view renders no graph data itself; its script loads
// graph.php?format=gexf and draws it with sigma.js.
if ($format == "html") {
?>
<div id="sigma-example" width="960" style="min-height:800px;background-color: #333;"></div>
<script src="js/sigma.min.js"></script>
<script src="js/sigma/plugins/sigma.parseGexf.js"></script>
<script src="js/sigma/plugins/sigma.forceatlas2.js"></script>
<script type="text/javascript">function init() {
// Instanciate sigma.js and customize rendering :
var sigInst = sigma.init(document.getElementById('sigma-example')).drawingProperties({
defaultLabelColor: '#fff',
defaultLabelSize: 14,
defaultLabelBGColor: '#fff',
defaultLabelHoverColor: '#000',
labelThreshold: 6,
defaultEdgeType: 'curve'
}).graphProperties({
minNodeSize: 0.5,
maxNodeSize: 5,
minEdgeSize: 5,
maxEdgeSize: 5
}).mouseProperties({
maxRatio: 32
});
// Parse a GEXF encoded file to fill the graph
// (requires "sigma.parseGexf.js" to be included)
sigInst.parseGexf('graph.php?format=gexf');
sigInst.bind('downnodes',function(event){
var nodes = event.content;
});
// Start the ForceAtlas2 algorithm
// (requires "sigma.forceatlas2.js" to be included)
sigInst.startForceAtlas2();
// Draw the graph :
sigInst.draw();
}
if (document.addEventListener) {
document.addEventListener("DOMContentLoaded", init, false);
} else {
window.onload = init;
}
</script>
<?php
} elseif ($format == "dot") {
    echo "}";
} elseif ($format == "gexf") {
    echo ' </edges>
</graph>
</gexf>'. PHP_EOL;
}
//include_footer();
?>
<?php | <?php |
/**
 * Prints the top of every page: doctype, <head> with the stylesheets, the
 * fixed navigation bar, and the opening tag of the main container div.
 *
 * @param string $title Plain-text page title; it is HTML-escaped here before
 *                      being placed in the <title> element.
 */
function include_header($title) {
    global $basePath;
    ?>
<!DOCTYPE html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8" />
<!-- Set the viewport width to device width for mobile -->
<meta name="viewport" content="width=device-width" />
<title><?php echo htmlspecialchars($title, ENT_QUOTES, 'UTF-8'); ?> - Disclosr</title>
<!-- Included CSS Files -->
<link href="<?php echo $basePath ?>css/bootstrap.min.css" rel="stylesheet">
<style type="text/css">
body {
padding-top: 60px;
padding-bottom: 40px;
}
.sidebar-nav {
padding: 9px 0;
}
.flotr-dummy-div {
margin-left: -999px;
}
</style>
<link href="<?php echo $basePath ?>css/bootstrap-responsive.min.css" rel="stylesheet">
<!--[if lt IE 9]>
<link rel="stylesheet" href="<?php echo $basePath ?>stylesheets/ie.css">
<![endif]-->
<!-- IE Fix for HTML5 Tags -->
<!--[if lt IE 9]>
<script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
</head>
<body xmlns:schema="http://schema.org/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:org="http://www.w3.org/ns/org#" xmlns:skos="http://www.w3.org/2004/02/skos/core#">
<div class="navbar navbar-inverse navbar-fixed-top">
<div class="navbar-inner">
<div class="container-fluid">
<a class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</a>
<a class="brand" href="#">Disclosr</a>
<div class="nav-collapse collapse">
<ul class="nav">
<li><a href="getAgency.php">Agencies</a></li>
<li><a href="ranking.php">Open Gov Ranking</a></li>
<li><a href="headcount.php">Employee Headcount Graph</a></li>
<li><a href="budget.php">Budget Graph</a></li>
<li><a href="about.php">About/FAQ</a></li>
</ul>
</div><!--/.nav-collapse -->
</div>
</div>
</div>
<div class="container-fluid">
<?php }
/**
 * Prints the bottom of every page: closes the main container, prints the
 * footer and the shared scripts, and closes <body> and <html>.
 *
 * The Google Analytics snippet is added only when the server name contains
 * ".gs"; the closing </body></html> tags are printed on every host.
 */
function include_footer() {
    global $basePath;
    ?>
</div> <!-- /container -->
<hr>
<footer>
<p>Not affiliated with or endorsed by any government agency.</p>
</footer>
<!-- Included JS Files -->
<script src="http://code.jquery.com/jquery-1.7.1.min.js"></script>
<script type="text/javascript" src="<?php echo $basePath ?>js/flotr2/flotr2.js"></script>
    <?php
    // strpos() returns 0 for a match at the start, so compare against false
    // instead of relying on truthiness; isset() covers runs with no server name.
    if (isset($_SERVER['SERVER_NAME']) && strpos($_SERVER['SERVER_NAME'], ".gs") !== false) {
        ?>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-12341040-2']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
        <?php
    }
    ?>
</body>
</html>
    <?php
}
<?php | <?php |
include_once('include/common.inc.php'); | include_once('include/common.inc.php'); |
include_header('Open Gov Rankings');
$db = $server->get_db('disclosr-agencies');
?>
<div class="foundation-header">
<h1><a href="about.php">Open Government Rankings</a></h1>
<h4 class="subheader"></h4>
</div>
<table>
<?php
// Scoring pass: each agency scores one point per field whose name contains
// "has" or "URL" (rtkURLs excluded). $columnKeys collects every such field
// name seen, and $scores counts how many agencies reached each score.
$agenciesdb = $server->get_db('disclosr-agencies');
//$docsdb = $server->get_db('disclosr-documents');
$scoredagencies = Array();
$scores = Array();
$columnKeys = Array();
try {
    $rows = $agenciesdb->get_view("app", "all", null, true)->rows;
    if ($rows) {
        foreach ($rows as $row) {
            $columns = Array();
            foreach ($row->value as $key => $value) {
                if ($key == "rtkURLs") {
                    continue;
                }
                if (strstr($key, "has") || strstr($key, "URL")) {
                    $columns[$key] = $value;
                }
            }
            $columnKeys = array_unique(array_merge($columnKeys, array_keys($columns)));
            $score = count($columns);
            // Tally of agencies per score; start at 1 on first sight.
            $scores[$score] = isset($scores[$score]) ? $scores[$score] + 1 : 1;
            $website = isset($row->value->website) ? $row->value->website : "";
            $scoredagencies[] = Array(
                "id" => $row->key,
                "website" => $website,
                "name" => $row->value->name,
                "columns" => $columns,
                "score" => $score
            );
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
/**
 * usort() callback for scored agencies: higher score first, and agencies
 * with equal scores ordered by name via strcmp().
 *
 * @param array $a Entry with 'score' and 'name' keys.
 * @param array $b Entry with 'score' and 'name' keys.
 * @return int Negative, zero or positive, as usort() expects.
 */
function cmp($a, $b)
{
    if ($a['score'] != $b['score']) {
        return ($a['score'] > $b['score']) ? -1 : 1;
    }
    return strcmp($a['name'], $b['name']);
}
// Render the ranking table: a header row with one column per criterion,
// then one row per agency, best score first.
usort($scoredagencies, "cmp");
echo "<tr>";
echo "<th>Agency Name</th>";
echo "<th>Score</th>";
foreach ($columnKeys as $columnID) {
    // Prefer the schema's display title; otherwise show the raw field name.
    if (isset($schemas['agency']["properties"][$columnID]['x-title'])) {
        $heading = $schemas['agency']["properties"][$columnID]['x-title'];
    } else {
        $heading = "<i>" . htmlspecialchars($columnID, ENT_QUOTES, 'UTF-8') . "</i>";
    }
    echo "<th>" . $heading . "</th>";
}
echo "</tr>";
foreach ($scoredagencies as $scoredagency) {
    // Database values are escaped before being placed in the markup.
    $agencyLink = htmlspecialchars("getAgency.php?id=" . urlencode($scoredagency['id']), ENT_QUOTES, 'UTF-8');
    $agencyName = htmlspecialchars((string) $scoredagency['name'], ENT_QUOTES, 'UTF-8');
    echo "<tr>";
    echo "<td><b><a href='" . $agencyLink . "'>" . $agencyName . "</a></b></td>";
    echo "<td><b>" . $scoredagency['score'] . "</b></td>";
    foreach ($columnKeys as $key) {
        echo "<td style='text-align: center;'>";
        if (isset($scoredagency['columns'][$key])) {
            $value = $scoredagency['columns'][$key];
            // A list value links to its first entry; an empty list has none.
            if (is_array($value)) {
                $href = isset($value[0]) ? $value[0] : "";
            } else {
                $href = $value;
            }
            // Only scalar values can be link targets; anything else is
            // shown as a plain tick.
            $href = is_scalar($href) ? (string) $href : "";
            if (isset($href[0]) && $href[0] == "@") {
                // A leading "@" marks a Twitter handle.
                $href = "https://twitter.com/" . substr($href, 1);
            }
            //$href= urlencode($href);
            echo "<font color='lightgreen'>";
            if (strpos($href, "http") !== false) {
                echo "<a title='Yes' href='" . htmlspecialchars($href, ENT_QUOTES, 'UTF-8') . "' style='color:lightgreen;'>✓</a>";
            } else {
                echo "✓";
            }
            echo "</font>";
        } else {
            echo "<font color='orange'><abbr title='No'>✘</abbr></font>";
        }
        echo "</td>";
    }
    echo "</tr>\n";
}
?>
</table><br> | </table><br> |
<div id="criteria" style="width:500px;height:900px;"></div> | <div id="criteria" style="width:500px;height:900px;"></div> |
<div id="scores" style="width:900px;height:500px;"></div> | <div id="scores" style="width:900px;height:500px;"></div> |
<script id="source"> | <script id="source"> |
window.onload = function () { | window.onload = function () { |
$(document).ready(function () { | $(document).ready(function () { |
// d1 holds [count, rowIndex] points for the horizontal bar chart;
// scorelabels[rowIndex] is the criterion name shown for that bar.
var d1 = [];
var scorelabels = [];
<?php
try {
    $rows = $db->get_view("app", "scoreHas?group=true", null, true)->rows;

    // Index by criterion (the view key), not by count: keying by count made
    // criteria that share the same count overwrite one another.
    $criteriaCounts = array();
    foreach ($rows as $row) {
        $criteriaCounts[(string) $row->key] = $row->value;
    }

    // Ascending by count, matching the previous ordering of the bars.
    asort($criteriaCounts);

    $i = 0;
    foreach ($criteriaCounts as $label => $count) {
        // json_encode produces safe JavaScript literals, so quotes or
        // backslashes in a label cannot break out of the string.
        echo "        d1.push([" . json_encode($count) . ", $i]);" . PHP_EOL;
        echo "        scorelabels.push(" . json_encode((string) $label) . ");" . PHP_EOL;
        $i++;
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
?>
/**
 * Mouse-tracking text for a hovered bar.
 * Looks up the label at floor(obj.y) in scorelabels and returns
 * "<label>=<obj.x>", or an empty string when there is no (truthy) label.
 */
function scoretrackformatter(obj) {
    var label = scorelabels[Math.floor(obj.y)];
    return label ? label + "=" + obj.x : "";
}
/**
 * Axis tick text: the label stored at floor(val) in scorelabels,
 * or an empty string when that slot has no (truthy) label.
 * The axis argument is accepted but not used.
 */
function scoretickformatter(val, axis) {
    var label = scorelabels[Math.floor(val)];
    if (!label) {
        return "";
    }
    return label;
}
// Horizontal bar chart drawn into #criteria from the d1 series; the y axis
// gets one tick per entry in scorelabels, labelled via scoretickformatter.
var criteriaChartOptions = {
    title: 'Total count of agencies with criteria',
    HtmlText: true,
    bars: {show: true, horizontal: true},
    mouse: {track: true, relative: true, trackFormatter: scoretrackformatter},
    yaxis: {
        autoscaling: true,
        minorTickFreq: 0.6,
        noTicks: scorelabels.length,
        tickFormatter: scoretickformatter
    },
    xaxis: {autoscaling: true}
};
Flotr.draw(document.getElementById("criteria"), [{data: d1}], criteriaChartOptions);
// d2 holds one [key, value] point per entry of the PHP $scores array,
// in ascending key order (presumably score => number of agencies, per the
// chart title used when d2 is drawn -- confirm against where $scores is built).
var d2 = [];
<?php
ksort($scores);
foreach ($scores as $key => $value) {
    // json_encode guarantees each side is a valid JavaScript literal.
    // The old loop also incremented an unused counter and sat inside a
    // try/catch for an exception nothing here can throw; both were dropped.
    echo "        d2.push([" . json_encode($key) . "," . json_encode($value) . "]);" . PHP_EOL;
}
?>
// Vertical bar chart drawn into #scores from the d2 series.
var scoresChartOptions = {
    title: 'Frequency distribution of Scores',
    HtmlText: true,
    bars: {show: true},
    mouse: {track: true, relative: true},
    yaxis: {autoscaling: true},
    xaxis: {autoscaling: true}
};
Flotr.draw(document.getElementById("scores"), [{data: d2}], scoresChartOptions);
}); | }); |
}; | }; |
</script> | </script> |
<?php | <?php |
include_footer(); | include_footer(); |
?> | ?> |
# www.robotstxt.org/ | # www.robotstxt.org/ |
# www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449 | # www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449 |
User-agent: * | User-agent: * |
Disallow: /admin/ | Disallow: /admin/ |
Disallow: /viewDocument.php | |
Sitemap: http://orgs.disclosurelo.gs/sitemap.xml.php | Sitemap: http://orgs.disclosurelo.gs/sitemap.xml.php |
<?php
// Schema describing an agency record. Each entry under "properties" gives the
// field's type, whether it is required, a display title ("x-title"), a
// description, and optionally microdata/RDF hints ("x-itemprop", "x-property").
// Array-typed fields normally declare the type of their items.
// NOTE(review): hasRSS, hasBlog and hasMobileApp are array-typed but declare
// no "items" entry, unlike every other array field -- confirm this is intended.
$schemas['agency'] = array(
    "description" => "Representation of government agency and online transparency measures",
    "type" => "object",
    "properties" => array(
        // Identity and naming
        "name" => array("type" => "string", "required" => true, "x-itemprop" => "name", "x-property" => "schema:name foaf:name skos:prefLabel ", "x-title" => "Name", "description" => "Name, most recent and broadest"),
        "shortName" => array("type" => "string", "required" => false, "x-title" => "Short Name", "description" => "Name shortened, usually to an acronym"),
        "description" => array("type" => "string", "required" => false, "x-title" => "Description", "description" => "Description of roles and responsiblities of organisation"),
        "foiEmail" => array("type" => "string", "required" => false, "x-title" => "FOI Contact Email", "x-itemprop" => "email", "description" => "FOI contact email if not foi@"),
        "sameAs" => array("type" => "array", "required" => false, "x-property" => "owl:sameAs", "x-title" => "Same As", "description" => "Same as other URLs/URIs for this entity",
            "items" => array("type" => "string")),
        "otherNames" => array("type" => "array", "required" => true, "x-title" => "Past/Other Names", "description" => "Other names for organisation",
            "items" => array("type" => "string")),

        // Organisational structure
        "positions" => array("type" => "array", "required" => true, "x-title" => "Political Positions", "description" => "Ministers and Parliamentary Secretaries",
            "items" => array("type" => "string")),
        "foiBodies" => array("type" => "array", "required" => true, "x-title" => "FOI Bodies", "x-property" => "schema:members foaf:knows org:hasSubOrganization", "description" => "Organisational units within this agency that are subject to FOI Act but are not autonomous",
            "items" => array("type" => "string")),
        "legislation" => array("type" => "array", "required" => true, "x-title" => "Legislation", "description" => "Legislation administered by or created for the establishment of this organisation",
            "items" => array("type" => "string")),
        "orgType" => array("type" => "string", "required" => true, "x-title" => "Organisation Type", "x-property" => "org:classification", "description" => "Org type based on legal formation via FMA/CAC legislation etc."),
        "parentOrg" => array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "x-property" => "org:subOrganizationOf", "description" => "Parent organisation, usually a department of state"),
        "website" => array("type" => "string", "required" => true, "x-title" => "Website", "x-itemprop" => "url", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"),
        "abn" => array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "x-property" => "org:identifier", "description" => "ABN from business register"),
        "established" => array("type" => "string", "required" => true, "x-title" => "Date established", "x-property" => "schema:foundingDate", "description" => "Date established"),
        "employees" => array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"),

        // Transparency publications
        "contractListURL" => array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
        "budgetURL" => array("type" => "string", "required" => true, "x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"),
        "grantsReportingURL" => array("type" => "string", "required" => true, "x-title" => "Grants Awarded",
            "description" => "Departmental and agency grants <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> and <a href='http://www.finance.gov.au/publications/fmg-series/23-commonwealth-grant-guidelines.html'>Commonwealth grants guidelines</a> "),
        "annualReportURL" => array("type" => "string", "required" => true, "x-title" => "Annual Report(s)", "description" => ""),
        "consultanciesURL" => array("type" => "string", "required" => true, "x-title" => "Consultants Hired", "description" => ""),
        "legalExpenditureURL" => array("type" => "string", "required" => true, "x-title" => "Legal Services Expenditure", "description" => "Legal Services Expenditure mandated by Legal Services Directions 2005"),
        "recordsListURL" => array("type" => "string", "required" => true, "x-title" => "Files/Records Held", "description" => "Indexed lists of departmental and agency files, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
        "FOIDocumentsURL" => array("type" => "string", "required" => true, "x-title" => "FOI Documents Released", "description" => "FOI Disclosure Log URL"),
        "FOIDocumentsRSSURL" => array("type" => "string", "required" => false, "x-title" => "RSS Feed of FOI Documents Released", "description" => "FOI Disclosure Log in RSS format"),
        "hasFOIPDF" => array("type" => "array", "required" => false, "x-title" => "Has FOI Documents Released in PDF", "description" => "FOI Disclosure Log contains any PDFs",
            "items" => array("type" => "string")),
        "infoPublicationSchemeURL" => array("type" => "string", "required" => true, "x-title" => "Information Publication Scheme", "description" => ""),
        "appointmentsURL" => array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies , <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a>"),
        "advertisingURL" => array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => " Agency advertising and public information projects, <a href='http://www.aph.gov.au/senate/pubs/standing_orders/d05.htm'>mandated by the Senate</a> "),

        // Online presence
        "hasRSS" => array("type" => "array", "required" => true, "x-title" => "Has RSS", "description" => ""),
        "hasBlog" => array("type" => "array", "required" => true, "x-title" => "Has Blog", "description" => ""),
        "hasMobileApp" => array("type" => "array", "required" => true, "x-title" => "Has Mobile App", "description" => ""),
        "hasMailingList" => array("type" => "array", "required" => true, "x-title" => "Has Mailing List", "description" => "",
            "items" => array("type" => "string")),
        "hasTwitter" => array("type" => "array", "required" => true, "x-title" => "Has Twitter", "description" => "",
            "items" => array("type" => "string")),
        "hasFacebook" => array("type" => "array", "required" => true, "x-title" => "Has Facebook", "description" => "",
            "items" => array("type" => "string")),
        "hasYouTube" => array("type" => "array", "required" => true, "x-title" => "Has YouTube", "description" => "",
            "items" => array("type" => "string")),
        "hasFlickr" => array("type" => "array", "required" => true, "x-title" => "Has Flickr", "description" => "",
            "items" => array("type" => "string")),

        // Content licensing
        "hasCCBY" => array("type" => "array", "required" => true, "x-title" => "Has CC-BY", "description" => "Has any page licenced Creative Commons - Attribution",
            "items" => array("type" => "string")),
        "hasRestrictiveLicence" => array("type" => "array", "required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright",
            "items" => array("type" => "string")),
        "hasPermissiveLicence" => array("type" => "array", "required" => true, "x-title" => "Has Permissive Licence", "description" => "Has any page licenced under terms more permissive than Crown Copyright but not clear CCBY",
            "items" => array("type" => "string")),
        "hasCrownCopyright" => array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration",
            "items" => array("type" => "string")),
    ),
);
?>
<?php
// Sitemap generator: emits a sitemaps.org <urlset> listing the index page,
// every other public PHP page in this directory, and one getAgency.php URL
// per row of the "byCanonicalName" view.
include ('include/common.inc.php');

header("Content-Type: text/xml");
echo "<?xml version='1.0' encoding='UTF-8'?>";
echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";

// All <loc> values are entity-escaped so characters such as "&" cannot
// produce malformed XML.
echo " <url><loc>" . htmlspecialchars(local_url() . "index.php", ENT_QUOTES, 'UTF-8') . "</loc><priority>1.0</priority></url>\n";

// Pages listed elsewhere (index), the sitemap itself, and the document viewer
// are left out of the generic page listing.
$excludedPages = array("index.php", "sitemap.xml.php", "viewDocument.php");
foreach (scandir("./") as $file) {
    // Match on the real extension so names like "page.php.bak" are skipped.
    if (pathinfo($file, PATHINFO_EXTENSION) === "php" && !in_array($file, $excludedPages, true)) {
        echo " <url><loc>" . htmlspecialchars(local_url() . $file, ENT_QUOTES, 'UTF-8') . "</loc><priority>0.3</priority></url>\n";
    }
}

$db = $server->get_db('disclosr-agencies');
try {
    $rows = $db->get_view("app", "byCanonicalName")->rows;
    foreach ($rows as $row) {
        $agencyUrl = local_url() . 'getAgency.php?id=' . urlencode($row->value->_id);
        echo '<url><loc>' . htmlspecialchars($agencyUrl, ENT_QUOTES, 'UTF-8') . "</loc><priority>0.6</priority></url>\n";
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}
echo '</urlset>';
?>