From: maxious Date: Sat, 24 Mar 2012 12:01:50 +0000 Subject: Add validation script X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=ec5d4d67db0b90be9c18bd541b4434798d559f1d --- Add validation script Former-commit-id: 8f0da9f23f1e65c57b3e158ef358630a33f10141 --- --- a/.gitmodules +++ b/.gitmodules @@ -16,4 +16,7 @@ [submodule "javascripts/flotr2"] path = javascripts/flotr2 url = https://github.com/HumbleSoftware/Flotr2.git +[submodule "lib/phpquery"] + path = lib/phpquery + url = https://github.com/TobiaszCudnik/phpquery.git --- a/admin/exportEmployees.csv.php +++ b/admin/exportEmployees.csv.php @@ -22,6 +22,7 @@ if (isset($row->value->statistics->employees)) { $headers = array_unique(array_merge($headers, array_keys(object_to_array($row->value->statistics->employees)))); + } } } catch (SetteeRestClientException $e) { --- /dev/null +++ b/admin/exportScore.csv.php @@ -1,1 +1,73 @@ +get_db('disclosr-agencies'); +$format = "csv"; +//$format = "json"; +if (isset($_REQUEST['format'])) $format = $_REQUEST['format']; + +setlocale(LC_CTYPE, 'C'); + + $headers = Array(); + +$fp = fopen('php://output', 'w'); +if ($fp && $db) { + if ($format == "csv") { + header('Content-Type: text/csv; charset=utf-8'); + header('Content-Disposition: attachment; filename="export.score.' . date("c") . '.csv"'); + } + header('Pragma: no-cache'); + header('Expires: 0'); + + try { + $agencies = $db->get_view("score", "score", null, true)->rows; + //print_r($agencies); + $first = true; + if ($format == "json") { + echo '"data" : ['.PHP_EOL; + + } + foreach ($agencies as $agency) { + $agencyArray = object_to_array($agency->value); + if ($first) { + $headers = array_keys($agencyArray); +if ($format == "csv") { + fputcsv($fp, $headers); + } else if ($format == "json") { + echo '{ + "labels" : ["' . implode('","', $headers) . '"],'.PHP_EOL; + } + } + $row = Array(); + + foreach ($headers as $i => $fieldName) { + if (isset($agencyArray[$fieldName])) { + $row[] = $agencyArray[$fieldName]; + } else { + $row[] = ''; + } + } + if ($format == "csv") { + fputcsv($fp, array_values($row)); + } else if ($format == "json") { + if (!$first) echo ","; + echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "'.$agency->value->name.'", "lines" : { "show" : true }, "points" : { "show" : true }}'.PHP_EOL; + + } + $first = false; + } + + if ($format == "json") { + echo '] + }'.PHP_EOL; + + } + } catch (SetteeRestClientException $e) { + setteErrorHandler($e); + } + + die; +} +?> + --- /dev/null +++ b/admin/importAustraliaGovAuGov2.php @@ -1,1 +1,61 @@ +get_db('disclosr-agencies'); +$rows = $db->get_view("app", "byName")->rows; +$nametoid = Array(); +$accounts = Array(); +foreach ($rows as $row) { + $nametoid[trim($row->key)] = $row->value; +} + +function extractCSVAccounts($url, $nameField, $accountField, $filter) { + global $accounts, $nametoid; + $request = Requests::get($url); + $Data = str_getcsv($request->body, "\n"); //parse the rows + $headers = Array(); + foreach ($Data as $num => $line) { + $Row = str_getcsv($line, ","); + if ($num == 0) { + + } else if ($num == 1) { + $headers = $Row; + //print_r($headers); + } else { + if (isset($Row[array_search($nameField, $headers)])) { + $agencyName = $Row[array_search($nameField, $headers)]; + if (!$filter || $Row[array_search("State", $headers)] == "NAT") { + if (!in_array(trim($agencyName), array_keys($nametoid))) { + echo "$agencyName missing" . PHP_EOL; + } else { + // echo $Row[array_search($nameField, $headers)] . PHP_EOL; + } + } + } else { + //echo "error finding agency" . $line . PHP_EOL; + } + } + } +} + +// http://agimo.govspace.gov.au/page/gov2register/ +// twitter +//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true); +// RSS +// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv +// facebook +extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv","Agency","Name"); + +/* + * http://australia.gov.au/news-and-media/media-release-rss-feeds + * http://australia.gov.au/news-and-media/social-media/blogs + * http://australia.gov.au/news-and-media/social-media/twitter + * http://australia.gov.au/news-and-media/social-media/facebook + * http://australia.gov.au/news-and-media/social-media/youtube + * http://australia.gov.au/news-and-media/social-media/flickr + * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx + * http://australia.gov.au/news-and-media/social-media/podcasts + */ +?> + --- a/admin/importGov2RegisterRSSFacebookTwitter.php +++ b/admin/importGov2RegisterRSSFacebookTwitter.php @@ -1,6 +1,7 @@ get_db('disclosr-agencies'); $rows = $db->get_view("app", "byName")->rows; @@ -10,13 +11,48 @@ $nametoid[trim($row->key)] = $row->value; } -function extractCSVAccounts($url, $nameField, $accountField, $filter) { +function extractHTMLAccounts($url, $accountType) { + global $accounts, $nametoid; + $request = Requests::get($url); + $doc = phpQuery::newDocumentHTML($request->body); + phpQuery::selectDocument($doc); + foreach (pq('tr')->elements as $tr) { + //echo $tr->nodeValue.PHP_EOL; + $agency = ""; + $url = ""; + foreach ($tr->childNodes as $td) { + $class = $td->getAttribute("class"); + //echo "cccc $class ".$td->nodeValue.PHP_EOL; + if ($class == "s11" || $class == "s10" || $class == "s7") { + $agency = $td->nodeValue; + } else if ($class == "s6" || $class == "s9") { + $url = $td->nodeValue; + foreach ($td->childNodes as $a) { + $href = $a->getAttribute("href"); + if ($href != "") { + $url = $href; + } + } + } + } + if ($agency != "" && $url != "") { + if (!in_array(trim($agency), array_keys($nametoid))) { + echo trim($agency) . " missing" . PHP_EOL; + } else { + // echo $agency." = ".$url.PHP_EOL; + $accounts[$nametoid[trim($agency)]][$accountType][] = $url; + } + } + } +} + +function extractCSVAccounts($url, $accountType, $nameField, $accountField, $filter) { global $accounts, $nametoid; $request = Requests::get($url); $Data = str_getcsv($request->body, "\n"); //parse the rows $headers = Array(); foreach ($Data as $num => $line) { - $Row = str_getcsv($line, ","); + $Row = str_getcsv($line, ",", '"'); if ($num == 0) { } else if ($num == 1) { @@ -27,9 +63,10 @@ $agencyName = $Row[array_search($nameField, $headers)]; if (!$filter || $Row[array_search("State", $headers)] == "NAT") { if (!in_array(trim($agencyName), array_keys($nametoid))) { - echo "$agencyName missing" . PHP_EOL; + echo trim($agencyName) . " missing" . PHP_EOL; } else { // echo $Row[array_search($nameField, $headers)] . PHP_EOL; + $accounts[$nametoid[trim($agencyName)]][$accountType][] = $Row[array_search($accountField, $headers)]; } } } else { @@ -41,21 +78,23 @@ // http://agimo.govspace.gov.au/page/gov2register/ // twitter -extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true); +extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Twitter", "Agency/Body/Event", "", true); // RSS -// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv +extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=html", "RSS"); // facebook -//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv","",""); +extractHTMLAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=html", "Facebook"); +foreach ($accounts as $id => $accountTypes) { + echo $id . "
" . PHP_EOL; + $doc = object_to_array($db->get($id)); + // print_r($doc); -/* - * http://australia.gov.au/news-and-media/media-release-rss-feeds - * http://australia.gov.au/news-and-media/social-media/blogs - * http://australia.gov.au/news-and-media/social-media/twitter - * http://australia.gov.au/news-and-media/social-media/facebook - * http://australia.gov.au/news-and-media/social-media/youtube - * http://australia.gov.au/news-and-media/social-media/flickr - * http://australia.gov.au/news-and-media/social-media/apps http://www.harmony.gov.au/get-involved/app-downloads.htm http://www.em.gov.au/Resources/Pages/Before-the-Storm-phone-game.aspx - * http://australia.gov.au/news-and-media/social-media/podcasts - */ + foreach ($accountTypes as $accountType => $accounts) { + if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) { + $doc["has" . $accountType] = Array(); + } + $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts)); + } + $db->save($doc); +} ?> --- /dev/null +++ b/admin/validation.py @@ -1,1 +1,30 @@ +#http://packages.python.org/CouchDB/client.html +import couchdb +import json +import pprint +import re +from tidylib import tidy_document +couch = couchdb.Server('http://127.0.0.1:5984/') + +# select database +docsdb = couch['disclosr-documents'] + +def f(x): + invalid = re.compile(r"ensure|testing|flicker|updating|longdesc|Accessibility Checks|not recognized") + valid = re.compile(r"line") + return (not invalid.search(x)) and valid.search(x) and x != '' + +for row in docsdb.view('app/getValidationRequired'): + print row.id + html = docsdb.get_attachment(row.id,row.value.iterkeys().next()).read() + #print html + document, errors = tidy_document(html,options={'accessibility-check':1,'show-warnings':0,'markup':0},keep_doc=True) + #http://www.aprompt.ca/Tidy/accessibilitychecks.html + #print document + errors = '\n'.join(filter(f,errors.split('\n'))) + #print errors + doc = docsdb.get(row.id) + doc['validation'] = errors + docsdb.save(doc) + --- a/couchdb/settee +++ b/couchdb/settee --- a/getAgency.php +++ b/getAgency.php @@ -86,9 +86,14 @@ } else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { if (is_array($row[$defaultField])) { $row[$defaultField][] = ""; + $row[$defaultField][] = ""; + $row[$defaultField][] = ""; } else { $value = $row[$defaultField]; $row[$defaultField] = Array($value); + $row[$defaultField][] = ""; + $row[$defaultField][] = ""; + } } } @@ -102,7 +107,7 @@ // by name = startkey="Ham"&endkey="Ham\ufff0" // edit? - $row = $db->get($_REQUEST['id']); + $obj = $db->get($_REQUEST['id']); //print_r($row); if (sizeof($_POST) > 0) { //print_r($_POST); @@ -126,17 +131,19 @@ echo "Edited version was latest version, continue saving"; $newdoc = $_POST; $newdoc['metadata']['lastModified'] = time(); - $row = $db->save($newdoc); + $obj = $db->save($newdoc); } else { echo "ALERT doc revised by someone else while editing. Document not saved."; } } $mode = "edit"; + $rowArray = object_to_array($obj); +ksort($rowArray); if ($mode == "edit") { - $row = addDefaultFields(object_to_array($row)); + $row = addDefaultFields($rowArray); } else { - $row = object_to_array($row); + $row = $rowArray; } if ($mode == "view") { @@ -188,14 +195,14 @@ (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) . ''; } */ - $rows = $db->get_view("app", "byName")->rows; + $rows = $db->get_view("app", "byCanonicalName")->rows; //print_r($rows); echo '"; --- a/include/couchdb.inc.php +++ b/include/couchdb.inc.php @@ -22,7 +22,10 @@ }, "byWebServer": { "map": "function(doc) {\n emit(doc.web_server, doc);\n}" - } + }, + "getValidationRequired": { + "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}" + } }*/ } @@ -92,25 +95,29 @@ } }"; // http://stackoverflow.com/questions/646628/javascript-startswith - $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ +$obj->views->score->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { return !this.indexOf(str); } } -if(!String.prototype.endsWith){ - String.prototype.endsWith = function(suffix) { -     return this.indexOf(suffix, this.length - suffix.length) !== -1; - }; -} + function(doc) { -if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { -for(var propName in doc) { - if(typeof(doc[propName]) != "undefined" && (propName.startsWith("has") || propName.endsWith("URL"))) { - emit(propName, 1); - } -} - emit("total", 1); - } + count = 0; + if (doc["status"] != "suspended") { + for(var propName in doc) { + if(typeof(doc[propName]) != "undefined" && doc[propName] != "") { + count++; + } + } + portfolio = doc.parentOrg; + if (doc.orgType == "FMA-DepartmentOfState") { + portfolio = doc._id; + } + if (doc.orgType == "Court-Commonwealth" || doc.orgType == "FMA-DepartmentOfParliament") { + portfolio = doc.orgType; + } + emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); + } }'; $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { @@ -163,3 +170,4 @@ function setteErrorHandler($e) { echo $e->getMessage() . "
" . PHP_EOL; } + --- /dev/null +++ b/lib/phpquery --- a/schemas/agency.json.php +++ b/schemas/agency.json.php @@ -17,6 +17,7 @@ "parentOrg" => Array("type" => "string", "required" => true, "x-title" => "Parent Organisation", "description" => "Parent organisation, usually a department of state"), "website" => Array("type" => "string", "required" => true, "x-title" => "Website", "x-property" => "schema:url foaf:homepage", "description" => "Website URL"), "abn" => Array("type" => "string", "required" => true, "x-title" => "Australian Business Number", "description" => "ABN from business register"), + "employees" => Array("type" => "string", "required" => true, "x-title" => "2010-2011 employees", "description" => "2010-2011 employees"), "contractListURL" => Array("type" => "string", "required" => true, "x-title" => "Contract Listing", "description" => "Departmental and agency contracts, mandated by the Senate" ), "budgetURL" => Array("type" => "string", "required" => true,"x-title" => "Budget", "description" => "Portfolio Budget Statements and Portfolio Additional Estimates Statements"), "grantsReportingURL" => Array("type" => "string", "required" => true, "x-title" => "Grants Awarded", @@ -49,6 +50,8 @@ "items" => Array("type" => "string")), "hasRestrictiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Restrictive Licence", "description" => "Has any page licenced under terms more restrictive than Crown Copyright", "items" => Array("type" => "string")), + "hasPermissiveLicence" => Array("type" => "array","required" => true, "x-title" => "Has Permissive Licence", "description" => "Has any page licenced under terms more permissive than Crown Copyright but not clear CCBY", + "items" => Array("type" => "string")), "hasCrownCopyright" => Array("type" => "array", "required" => true, "x-title" => "Has Standard Crown Copyright licence", "description" => "Has any page still licenced under the former Commonwealth Copyright Administration", "items" => Array("type" => "string")), ), --- a/scrape.py +++ b/scrape.py @@ -77,7 +77,7 @@ print "Fetching %s" % url if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": print "Not a valid HTTP url" - return (None,None) + return (None,None,None) doc = docsdb.get(hash) if doc == None: doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} @@ -86,13 +86,14 @@ print "Uh oh, trying to scrape URL again too soon!" last_attachment_fname = doc["_attachments"].keys()[-1] last_attachment = docsdb.get_attachment(doc,last_attachment_fname) - return (doc['mime_type'],last_attachment) + return (doc['url'],doc['mime_type'],last_attachment) if scrape_again == False: print "Not scraping this URL again as requested" - return (None,None) + return (None,None,None) time.sleep(3) # wait 3 seconds to give webserver time to recover + req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags if doc.has_key('etag'): req.add_header("If-None-Match", doc['etag']) @@ -102,12 +103,14 @@ opener = urllib2.build_opener(NotModifiedHandler()) try: url_handle = opener.open(req) + doc['url'] = url_handle.geturl() # may have followed a redirect to a new url headers = url_handle.info() # the addinfourls have the .info() too doc['etag'] = headers.getheader("ETag") doc['last_modified'] = headers.getheader("Last-Modified") doc['date'] = headers.getheader("Date") doc['page_scraped'] = time.time() doc['web_server'] = headers.getheader("Server") + doc['via'] = headers.getheader("Via") doc['powered_by'] = headers.getheader("X-Powered-By") doc['file_size'] = headers.getheader("Content-Length") content_type = headers.getheader("Content-Type") @@ -119,13 +122,13 @@ if hasattr(url_handle, 'code'): if url_handle.code == 304: print "the web page has not been modified" - return (None,None) + return (None,None,None) else: content = url_handle.read() docsdb.save(doc) doc = docsdb.get(hash) # need to get a _rev docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) - return (doc['mime_type'], content) + return (doc['url'], doc['mime_type'], content) #store as attachment epoch-filename except urllib2.URLError as e: error = "" @@ -136,21 +139,22 @@ print error doc['error'] = error docsdb.save(doc) - return (None,None) + return (None,None,None) def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): - (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) - if content != None and depth > 0: + (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) + badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] + if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": # http://www.crummy.com/software/BeautifulSoup/documentation.html soup = BeautifulSoup(content) - navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar')) + navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) for nav in navIDs: print "Removing element", nav['id'] nav.extract() - navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')}) + navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) for nav in navClasses: print "Removing element", nav['class'] nav.extract() @@ -169,7 +173,10 @@ # not http None else: - linkurls.add(urljoin(url,link['href'].replace(" ","%20"))) + # remove anchors and spaces in urls + link['href'] = link['href'].replace(" ","%20") + link['href'] = re.sub('#.*$','',link['href']) + linkurls.add(urljoin(url,link['href'])) for linkurl in linkurls: #print linkurl scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)