Merge branch 'master' of /git/contractdashboard
[contractdashboard.git] / heuristics / metadataHeuristics.php
blob:a/heuristics/metadataHeuristics.php -> blob:b/heuristics/metadataHeuristics.php
<?php <?php
/* Reference query: every description that occurs more than once, with its
   occurrence count, least duplicated first.
SELECT description, count(*) as count
FROM contractnotice
group by description having count > 1 order by count
*/
/* Heuristic ideas:
   - duplicated description
   - most duplicated overall, most duplicated per agency/category/supplier etc. */
// Register the duplicated-description heuristic under the same key as the
// function that implements it. The human-readable description is still empty.
$heuristics['METADATA_DUPLICATED_DESCRIPTION'] = array('description' => '');
/**
 * Scores a contract notice by how often its description text is repeated in
 * the contractnotice table.
 *
 * The score is the absolute distance between this description's occurrence
 * count and the mean occurrence count of duplicated descriptions, expressed in
 * standard deviations. A description that occurs exactly once scores 0.
 *
 * @param mixed $cn Contract notice record. NOTE(review): assumed to expose the
 *                  description as $cn['description'] (array) or
 *                  $cn->description (object) - confirm against the callers.
 * @return array Keys: heuristic_value, raw_value, mean, stddev.
 */
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
    // The database handle lives in the global scope; without this declaration
    // $conn is an undefined local and the query call fails.
    global $conn;

    $averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
    $stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();

    // The original compared against an undefined $agencyName; the value that
    // belongs here is the description of the notice being scored.
    if (is_array($cn) && isset($cn['description'])) {
        $description = $cn['description'];
    } elseif (is_object($cn) && isset($cn->description)) {
        $description = $cn->description;
    } else {
        $description = '';
    }

    // Bound parameter instead of string concatenation: descriptions are free
    // text and may contain quotes.
    $statement = $conn->prepare('select count(*) from contractnotice where description = ?');
    $statement->execute(array($description));
    $r = $statement->fetch(PDO::FETCH_BOTH);
    $dupeDesc = $r[0];

    if ((int) $dupeDesc === 1 || (float) $stddevDuplicatedDescriptions == 0.0) {
        // Unique description, or no spread to measure against (a zero or
        // missing stddev would otherwise be a division by zero).
        $value = 0;
    } else {
        $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
    }

    return array(
        "heuristic_value" => $value,
        "raw_value" => $dupeDesc,
        "mean" => $averageDuplicatedDescriptions,
        "stddev" => $stddevDuplicatedDescriptions
    );
}
// Cached mean occurrence count of duplicated descriptions; null until it has
// been computed. (A bare "$var;" statement does not initialise anything.)
$averageDuplicatedDescriptions = null;

/**
 * Returns the mean occurrence count of duplicated descriptions, asking
 * getStatsDuplicatedDescriptions() to compute it on first use.
 *
 * @return mixed The cached mean, or null if it could not be computed.
 */
function getAverageDuplicatedDescriptions()
{
    global $averageDuplicatedDescriptions;
    // Strict null check: a cached value of 0 is a valid result and must not
    // trigger another database round trip.
    if ($averageDuplicatedDescriptions === null) {
        getStatsDuplicatedDescriptions();
    }
    return $averageDuplicatedDescriptions;
}
// Cached standard deviation of the occurrence counts of duplicated
// descriptions; null until it has been computed. (A bare "$var;" statement
// does not initialise anything.)
$stddevDuplicatedDescriptions = null;

/**
 * Returns the standard deviation of the occurrence counts of duplicated
 * descriptions, asking getStatsDuplicatedDescriptions() to compute it on
 * first use.
 *
 * @return mixed The cached standard deviation, or null if it could not be
 *               computed.
 */
function getstddevDuplicatedDescriptions()
{
    global $stddevDuplicatedDescriptions;
    // Strict null check: a standard deviation of 0 is a valid cached result
    // and must not trigger another database round trip.
    if ($stddevDuplicatedDescriptions === null) {
        getStatsDuplicatedDescriptions();
    }
    return $stddevDuplicatedDescriptions;
}
/**
 * Computes the mean and standard deviation of the occurrence counts of all
 * descriptions that appear more than once in contractnotice, and stores them
 * in the global caches $averageDuplicatedDescriptions and
 * $stddevDuplicatedDescriptions.
 *
 * If the query or the fetch fails, the caches are left untouched.
 *
 * @return void
 */
function getStatsDuplicatedDescriptions()
{
    // Without this declaration $conn is an undefined local and the results
    // below are written to locals that vanish on return, so the caches read
    // by the getter functions would never be filled.
    global $conn, $averageDuplicatedDescriptions, $stddevDuplicatedDescriptions;

    $query = "select avg(count),STDDEV(count) from (
SELECT description, count(*) as count
FROM contractnotice
group by description having count > 1
) as a;";
    $result = $conn->query($query);
    if ($result === false) {
        // query() returns false on failure when the handle is not in
        // exception mode.
        return;
    }
    $r = $result->fetch(PDO::FETCH_BOTH);
    if ($r === false) {
        return;
    }
    $averageDuplicatedDescriptions = $r[0];
    $stddevDuplicatedDescriptions = $r[1];
}