From: maxious Date: Sat, 24 Mar 2012 12:01:50 +0000 Subject: Add validation script X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=ec5d4d67db0b90be9c18bd541b4434798d559f1d --- Add validation script Former-commit-id: 8f0da9f23f1e65c57b3e158ef358630a33f10141 --- --- a/admin/importGov2RegisterRSSFacebookTwitter.php +++ b/admin/importGov2RegisterRSSFacebookTwitter.php @@ -1,7 +1,7 @@ get_db('disclosr-agencies'); $rows = $db->get_view("app", "byName")->rows; @@ -10,41 +10,40 @@ foreach ($rows as $row) { $nametoid[trim($row->key)] = $row->value; } + function extractHTMLAccounts($url, $accountType) { global $accounts, $nametoid; $request = Requests::get($url); $doc = phpQuery::newDocumentHTML($request->body); phpQuery::selectDocument($doc); foreach (pq('tr')->elements as $tr) { - //echo $tr->nodeValue.PHP_EOL; - $agency = ""; - $url = ""; - foreach ($tr->childNodes as $td) { - $class = $td->getAttribute("class"); - //echo "cccc $class ".$td->nodeValue.PHP_EOL; - if ($class == "s11" || $class == "s10" || $class == "s7") { - $agency = $td->nodeValue; - } else if ($class == "s6" || $class == "s9"){ - $url = $td->nodeValue; - foreach($td->childNodes as $a) { - $href = $a->getAttribute("href"); - if ($href != "") { - $url = $href; - } - } - } - } - if ($agency != "" && $url != "") { - if (!in_array(trim($agency), array_keys($nametoid))) { - echo trim($agency)." missing" . PHP_EOL; - } else { - // echo $agency." = ".$url.PHP_EOL; - $accounts[$nametoid[trim($agency)]][$accountType][] = $url; + //echo $tr->nodeValue.PHP_EOL; + $agency = ""; + $url = ""; + foreach ($tr->childNodes as $td) { + $class = $td->getAttribute("class"); + //echo "cccc $class ".$td->nodeValue.PHP_EOL; + if ($class == "s11" || $class == "s10" || $class == "s7") { + $agency = $td->nodeValue; + } else if ($class == "s6" || $class == "s9") { + $url = $td->nodeValue; + foreach ($td->childNodes as $a) { + $href = $a->getAttribute("href"); + if ($href != "") { + $url = $href; } - - } + } + } + } + if ($agency != "" && $url != "") { + if (!in_array(trim($agency), array_keys($nametoid))) { + echo trim($agency) . " missing" . PHP_EOL; + } else { + // echo $agency." = ".$url.PHP_EOL; + $accounts[$nametoid[trim($agency)]][$accountType][] = $url; + } + } } - } function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) { @@ -53,7 +52,7 @@ $Data = str_getcsv($request->body, "\n"); //parse the rows $headers = Array(); foreach ($Data as $num => $line) { - $Row = str_getcsv($line, ",",'"'); + $Row = str_getcsv($line, ",", '"'); if ($num == 0) { } else if ($num == 1) { @@ -64,7 +63,7 @@ $agencyName = $Row[array_search($nameField, $headers)]; if (!$filter || $Row[array_search("State", $headers)] == "NAT") { if (!in_array(trim($agencyName), array_keys($nametoid))) { - echo trim($agencyName)." missing" . PHP_EOL; + echo trim($agencyName) . " missing" . PHP_EOL; } else { // echo $Row[array_search($nameField, $headers)] . PHP_EOL; $accounts[$nametoid[trim($agencyName)]][$accountType][] = $Row[array_search($accountField, $headers)]; @@ -84,6 +83,18 @@ extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS"); // facebook extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook"); +foreach ($accounts as $id => $accountTypes) { + echo $id . "
" . PHP_EOL; + $doc = object_to_array($db->get($id)); + // print_r($doc); + foreach ($accountTypes as $accountType => $accounts) { + if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) { + $doc["has" . $accountType] = Array(); + } + $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts)); + } + $db->save($doc); +} ?> --- /dev/null +++ b/admin/validation.py @@ -1,1 +1,30 @@ +#http://packages.python.org/CouchDB/client.html +import couchdb +import json +import pprint +import re +from tidylib import tidy_document +couch = couchdb.Server('http://127.0.0.1:5984/') + +# select database +docsdb = couch['disclosr-documents'] + +def f(x): + invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized") + valid = re.compile(r"line") + return (not invalid.search(x)) and valid.search(x) and x != '' + +for row in docsdb.view('app/getValidationRequired'): + print row.id + html = docsdb.get_attachment(row.id,row.value.iterkeys().next()).read() + #print html + document, errors = tidy_document(html,options={'accessibility-check':1,'show-warnings':0,'markup':0},keep_doc=True) + #http://www.aprompt.ca/Tidy/accessibilitychecks.html + #print document + errors = '\n'.join(filter(f,errors.split('\n'))) + #print errors + doc = docsdb.get(row.id) + doc['validation'] = errors + docsdb.save(doc) + --- a/couchdb/settee +++ b/couchdb/settee --- a/include/couchdb.inc.php +++ b/include/couchdb.inc.php @@ -22,7 +22,10 @@ }, "byWebServer": { "map": "function(doc) {\n emit(doc.web_server, doc);\n}" - } + }, + "getValidationRequired": { + "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}" + } }*/ } @@ -167,3 +170,4 @@ function setteErrorHandler($e) { echo $e->getMessage() . "
" . PHP_EOL; } +