beginning datagov scraper
beginning datagov scraper


Former-commit-id: a8775a64a3cdda480e4433742ed7ea6ca6a437ef

<?php <?php
   
require_once '../include/common.inc.php'; require_once '../include/common.inc.php';
//function createFOIDocumentsDesignDoc() { //function createFOIDocumentsDesignDoc() {
   
$foidb = $server->get_db('disclosr-foidocuments'); $foidb = $server->get_db('disclosr-foidocuments');
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; $obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };";
$obj->views->byDateMonthYear->reduce = "_count"; $obj->views->byDateMonthYear->reduce = "_count";
$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };";
$obj->views->byAgencyID->reduce = "_count"; $obj->views->byAgencyID->reduce = "_count";
$obj->views->fieldNames->map = ' $obj->views->fieldNames->map = '
function(doc) { function(doc) {
for(var propName in doc) { for(var propName in doc) {
emit(propName, doc._id); emit(propName, doc._id);
} }
   
}'; }';
$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { $obj->views->fieldNames->reduce = 'function (key, values, rereduce) {
return values.length; return values.length;
}'; }';
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
$foidb->save($obj, true); $foidb->save($obj, true);
   
   
//function createDocumentsDesignDoc() { //function createDocumentsDesignDoc() {
$docdb = $server->get_db('disclosr-documents'); $docdb = $server->get_db('disclosr-documents');
   
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}"; $obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}";
$obj->views->web_server->reduce = "_sum"; $obj->views->web_server->reduce = "_sum";
$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}"; $obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}";
$obj->views->byAgency->reduce = "_sum"; $obj->views->byAgency->reduce = "_sum";
$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; $obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}";
$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; $obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}";
$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; $obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}";
$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; $obj->views->getValidationRequired->map = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}";
  $docdb->save($obj, true);
   
   
   
   
//function createAgencyDesignDoc() { //function createAgencyDesignDoc() {
$db = $server->get_db('disclosr-agencies'); $db = $server->get_db('disclosr-agencies');
$obj = new stdClass(); $obj = new stdClass();
$obj->_id = "_design/" . urlencode("app"); $obj->_id = "_design/" . urlencode("app");
$obj->language = "javascript"; $obj->language = "javascript";
$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; $obj->views->all->map = "function(doc) { emit(doc._id, doc); };";
$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };";
$obj->views->byCanonicalName->map = "function(doc) { $obj->views->byCanonicalName->map = "function(doc) {
if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc); emit(doc.name, doc);
} }
};"; };";
$obj->views->byDeptStateName->map = "function(doc) { $obj->views->byDeptStateName->map = "function(doc) {
if (doc.orgType == 'FMA-DepartmentOfState') { if (doc.orgType == 'FMA-DepartmentOfState') {
emit(doc.name, doc._id); emit(doc.name, doc._id);
} }
};"; };";
$obj->views->parentOrgs->map = "function(doc) { $obj->views->parentOrgs->map = "function(doc) {
if (doc.parentOrg) { if (doc.parentOrg) {
emit(doc._id, doc.parentOrg); emit(doc._id, doc.parentOrg);
} }
};"; };";
$obj->views->byName->map = 'function(doc) { $obj->views->byName->map = 'function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
emit(doc.name, doc._id); emit(doc.name, doc._id);
if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) {
emit(doc.shortName, doc._id); emit(doc.shortName, doc._id);
} }
for (name in doc.otherNames) { for (name in doc.otherNames) {
if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) { if (doc.otherNames[name] != "" && doc.otherNames[name] != doc.name) {
emit(doc.otherNames[name], doc._id); emit(doc.otherNames[name], doc._id);
} }
} }
for (name in doc.foiBodies) { for (name in doc.foiBodies) {
if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) { if (doc.foiBodies[name] != "" && doc.foiBodies[name] != doc.name) {
emit(doc.foiBodies[name], doc._id); emit(doc.foiBodies[name], doc._id);
} }
} }
for (name in doc.positions) { for (name in doc.positions) {
if (doc.positions[name] != "" && doc.positions[name] != doc.name) { if (doc.positions[name] != "" && doc.positions[name] != doc.name) {
emit(doc.positions[name], doc._id); emit(doc.positions[name], doc._id);
} }
} }
} }
};'; };';
   
$obj->views->foiEmails->map = "function(doc) { $obj->views->foiEmails->map = "function(doc) {
emit(doc._id, doc.foiEmail); emit(doc._id, doc.foiEmail);
};"; };";
   
$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }";
$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };';
$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };';
$obj->views->getScrapeRequired->map = "function(doc) { $obj->views->getScrapeRequired->map = "function(doc) {
   
var lastScrape = Date.parse(doc.metadata.lastScraped); var lastScrape = Date.parse(doc.metadata.lastScraped);
   
var today = new Date(); var today = new Date();
   
if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) { if (!lastScrape || lastScrape.getTime() + 1000 != today.getTime()) {
emit(doc._id, doc); emit(doc._id, doc);
} }
   
};"; };";
$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };";
$obj->views->getConflicts->map = "function(doc) { $obj->views->getConflicts->map = "function(doc) {
if (doc._conflicts) { if (doc._conflicts) {
emit(null, [doc._rev].concat(doc._conflicts)); emit(null, [doc._rev].concat(doc._conflicts));
} }
}"; }";
$obj->views->getStatistics->map = $obj->views->getStatistics->map =
"function(doc) { "function(doc) {
if (doc.statistics) { if (doc.statistics) {
for (var statisticSet in doc.statistics) { for (var statisticSet in doc.statistics) {
for (var statisticPeriod in doc.statistics[statisticSet]) { for (var statisticPeriod in doc.statistics[statisticSet]) {
emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']); emit([statisticSet,statisticPeriod], doc.statistics[statisticSet][statisticPeriod]['value']);
} }
} }
} }
}"; }";
$obj->views->getStatistics->reduce = '_sum'; $obj->views->getStatistics->reduce = '_sum';
// http://stackoverflow.com/questions/646628/javascript-startswith // http://stackoverflow.com/questions/646628/javascript-startswith
$obj->views->score->map = 'if(!String.prototype.startsWith){ $obj->views->score->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
   
function(doc) { function(doc) {
count = 0; count = 0;
if (doc["status"] != "suspended") { if (doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && doc[propName] != "") { if(typeof(doc[propName]) != "undefined" && doc[propName] != "") {
count++; count++;
} }
} }
portfolio = doc.parentOrg; portfolio = doc.parentOrg;
if (doc.orgType == "FMA-DepartmentOfState") { if (doc.orgType == "FMA-DepartmentOfState") {
portfolio = doc._id; portfolio = doc._id;
} }
if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") { if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") {
portfolio = doc.orgType; portfolio = doc.orgType;
} }
emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio});
} }
}'; }';
$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){
String.prototype.startsWith = function (str) { String.prototype.startsWith = function (str) {
return !this.indexOf(str); return !this.indexOf(str);
} }
} }
if(!String.prototype.endsWith){ if(!String.prototype.endsWith){
String.prototype.endsWith = function(suffix) { String.prototype.endsWith = function(suffix) {
    return this.indexOf(suffix, this.length - suffix.length) !== -1;     return this.indexOf(suffix, this.length - suffix.length) !== -1;
}; };
} }
function(doc) { function(doc) {
if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") {
for(var propName in doc) { for(var propName in doc) {
if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) {
emit(propName, 1); emit(propName, 1);
} }
} }
emit("total", 1); emit("total", 1);
} }
}'; }';
$obj->views->scoreHas->reduce = '_sum'; $obj->views->scoreHas->reduce = '_sum';
$obj->views->fieldNames->map = ' $obj->views->fieldNames->map = '
function(doc) { function(doc) {
for(var propName in doc) { for(var propName in doc) {
emit(propName, doc._id); emit(propName, doc._id);
} }
}'; }';
$obj->views->fieldNames->reduce = '_count'; $obj->views->fieldNames->reduce = '_count';
// allow safe updates (even if slightly slower due to extra: rev-detection check). // allow safe updates (even if slightly slower due to extra: rev-detection check).
$db->save($obj, true); $db->save($obj, true);
?> ?>
   
#http://packages.python.org/CouchDB/client.html #http://packages.python.org/CouchDB/client.html
import couchdb import couchdb
import json import json
import pprint import pprint
import re import re
from tidylib import tidy_document from tidylib import tidy_document
   
couch = couchdb.Server('http://127.0.0.1:5984/') couch = couchdb.Server('http://192.168.1.113:5984/')
   
# select database # select database
docsdb = couch['disclosr-documents'] docsdb = couch['disclosr-documents']
   
def f(x): def f(x):
invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized") invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized|noscript|audio")
valid = re.compile(r"line") valid = re.compile(r"line")
return (not invalid.search(x)) and valid.search(x) and x != '' return (not invalid.search(x)) and valid.search(x) and x != ''
   
for row in docsdb.view('app/getValidationRequired'): for row in docsdb.view('app/getValidationRequired'):
print row.id print row.id
html = docsdb.get_attachment(row.id,row.value.iterkeys().next()).read() html = docsdb.get_attachment(row.id,row.value.iterkeys().next()).read()
#print html #print html
document, errors = tidy_document(html,options={'accessibility-check':1,'show-warnings':0,'markup':0},keep_doc=True) document, errors = tidy_document(html,options={'accessibility-check':1,'show-warnings':0,'markup':0},keep_doc=True)
#http://www.aprompt.ca/Tidy/accessibilitychecks.html #http://www.aprompt.ca/Tidy/accessibilitychecks.html
#print document #print document
errors = '\n'.join(filter(f,errors.split('\n'))) errors = '\n'.join(filter(f,errors.split('\n')))
#print errors #print errors
doc = docsdb.get(row.id) doc = docsdb.get(row.id)
doc['validation'] = errors doc['validation'] = errors
docsdb.save(doc) docsdb.save(doc)
   
<!doctype html> <!doctype html>
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<title>Page Not Found :(</title> <title>Page Not Found :(</title>
<style> <style>
::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; } ::-moz-selection {
::selection { background: #fe57a1; color: #fff; text-shadow: none; } background: #fe57a1;
html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; } color: #fff;
html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; } text-shadow: none;
body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; } }
h1 { margin: 0 10px; font-size: 50px; text-align: center; }  
h1 span { color: #bbb; } ::selection {
h3 { margin: 1.5em 0 0.5em; } background: #fe57a1;
p { margin: 1em 0; } color: #fff;
ul { padding: 0 0 0 40px; margin: 1em 0; } text-shadow: none;
.container { max-width: 380px; _width: 380px; margin: 0 auto; } }
/* google search */  
#goog-fixurl ul { list-style: none; padding: 0; margin: 0; } html {
#goog-fixurl form { margin: 0; } padding: 30px 10px;
#goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; } font-size: 20px;
#goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; } line-height: 1.4;
#goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; } color: #737373;
#goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; } background: #f0f0f0;
#goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; } -webkit-text-size-adjust: 100%;
input::-moz-focus-inner { padding: 0; border: 0; } -ms-text-size-adjust: 100%;
</style> }
   
  html, input {
  font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
  }
   
  body {
  max-width: 500px;
  _width: 500px;
  padding: 30px 20px 50px;
  border: 1px solid #b3b3b3;
  border-radius: 4px;
  margin: 0 auto;
  box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff;
  background: #fcfcfc;
  }
   
  h1 {
  margin: 0 10px;
  font-size: 50px;
  text-align: center;
  }
   
  h1 span {
  color: #bbb;
  }
   
  h3 {
  margin: 1.5em 0 0.5em;
  }
   
  p {
  margin: 1em 0;
  }
   
  ul {
  padding: 0 0 0 40px;
  margin: 1em 0;
  }
   
  .container {
  max-width: 380px;
  _width: 380px;
  margin: 0 auto;
  }
   
  /* google search */
  #goog-fixurl ul {
  list-style: none;
  padding: 0;
  margin: 0;
  }
   
  #goog-fixurl form {
  margin: 0;
  }
   
  #goog-wm-qt, #goog-wm-sb {
  border: 1px solid #bbb;
  font-size: 16px;
  line-height: normal;
  vertical-align: top;
  color: #444;
  border-radius: 2px;
  }
   
  #goog-wm-qt {
  width: 220px;
  height: 20px;
  padding: 5px;