<?php |
<?php |
/*
 * Reference query: lists every description that is shared by more than one
 * contract notice, together with how many notices use it.
 *
 * SELECT description, count(*) as count
 * FROM `contractnotice`
 * group by description having count > 1 order by count
 */
/*
 * Heuristic ideas:
 * - duplicated description
 * - most duplicated overall, most duplicated per agency/category/supplier etc.
 */
// Register this heuristic's metadata under its function name.
// The human-readable description has not been written yet, so it stays empty.
$heuristics["METADATA_DUPLICATED_DESCRIPTION"] = array(
    "description" => ""
);
/**
 * Heuristic: how unusual is the number of contract notices that share this
 * notice's description?
 *
 * Counts the rows in `contractnotice` whose description equals the notice's
 * description, then expresses that count as an absolute deviation from the
 * mean returned by getAverageDuplicatedDescriptions(), measured in units of
 * the standard deviation returned by getstddevDuplicatedDescriptions().
 *
 * @param mixed $cn The contract notice being scored.
 *                  NOTE(review): assumed to be a contractnotice row exposing
 *                  its description as $cn['description'] (array/ArrayAccess)
 *                  or $cn->description (object) - confirm against callers.
 *
 * @return array Keys: "heuristic_value" (deviation score, 0 when the
 *               description is not duplicated or no spread is available),
 *               "raw_value" (number of notices with this description),
 *               "mean" and "stddev" (the statistics used for the score).
 */
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
    // NOTE(review): the database handle is assumed to be the project's
    // global PDO connection $conn, created outside this file - confirm.
    global $conn;

    $averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
    $stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();

    // Pull the description out of the notice; the previous code queried an
    // undefined variable and therefore never looked at the notice at all.
    $description = null;
    if (is_array($cn) || $cn instanceof ArrayAccess) {
        $description = isset($cn['description']) ? $cn['description'] : null;
    } elseif (is_object($cn) && isset($cn->description)) {
        $description = $cn->description;
    }

    // Bound parameter instead of string concatenation: descriptions are
    // free text and may contain quotes.
    $statement = $conn->prepare('select count(*) from contractnotice where description = ?');
    $statement->execute(array($description));
    $r = $statement->fetch(PDO::FETCH_NUM);
    $dupeDesc = ($r === false) ? 0 : (int) $r[0];

    if ($dupeDesc <= 1) {
        // Unique (or unmatched) description: nothing is duplicated.
        $value = 0;
    } elseif (!$stddevDuplicatedDescriptions) {
        // Zero or unavailable spread: avoid dividing by zero.
        $value = 0;
    } else {
        $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
    }

    return array(
        "heuristic_value" => $value,
        "raw_value" => $dupeDesc,
        "mean" => $averageDuplicatedDescriptions,
        "stddev" => $stddevDuplicatedDescriptions
    );
}
// Cached mean of the per-description duplicate counts; null until loaded.
$averageDuplicatedDescriptions = null;

/**
 * Returns the cached average, asking getStatsDuplicatedDescriptions() to
 * load the statistics the first time it is needed.
 *
 * @return mixed The value stored in the global $averageDuplicatedDescriptions
 *               (null if the statistics could not be loaded).
 */
function getAverageDuplicatedDescriptions()
{
    global $averageDuplicatedDescriptions;

    // Compare against null rather than truthiness so that any loaded value,
    // including a numeric zero, counts as "already cached".
    if ($averageDuplicatedDescriptions === null) {
        getStatsDuplicatedDescriptions();
    }

    return $averageDuplicatedDescriptions;
}
// Cached standard deviation of the per-description duplicate counts;
// null until loaded.
$stddevDuplicatedDescriptions = null;

/**
 * Returns the cached standard deviation, asking
 * getStatsDuplicatedDescriptions() to load the statistics the first time it
 * is needed.
 *
 * @return mixed The value stored in the global $stddevDuplicatedDescriptions
 *               (null if the statistics could not be loaded).
 */
function getstddevDuplicatedDescriptions()
{
    global $stddevDuplicatedDescriptions;

    // A standard deviation of 0 is a legitimate result; only a missing
    // value (null) should trigger the aggregate query again.
    if ($stddevDuplicatedDescriptions === null) {
        getStatsDuplicatedDescriptions();
    }

    return $stddevDuplicatedDescriptions;
}
/**
 * Loads the mean and standard deviation of the duplicate counts and stores
 * them in the globals $averageDuplicatedDescriptions and
 * $stddevDuplicatedDescriptions.
 *
 * The statistics are taken over descriptions that occur on more than one
 * contract notice (see the "having count > 1" clause). If the query or the
 * fetch fails, the globals are left unchanged.
 *
 * @return void
 */
function getStatsDuplicatedDescriptions()
{
    // The results must land in the globals read by the getter functions;
    // assigning to locals would discard them when this function returns.
    // NOTE(review): $conn is assumed to be the project's global PDO
    // connection, created outside this file - confirm.
    global $conn, $averageDuplicatedDescriptions, $stddevDuplicatedDescriptions;

    $query = "select avg(count),STDDEV(count) from (
        SELECT description, count(*) as count
        FROM `contractnotice`
        group by description having count > 1
        ) as a;";

    $result = $conn->query($query);
    if ($result === false) {
        return;
    }

    $r = $result->fetch(PDO::FETCH_BOTH);
    if ($r === false) {
        return;
    }

    $averageDuplicatedDescriptions = $r[0];
    $stddevDuplicatedDescriptions = $r[1];
}
|
|