- duplicated description |
<?php |
// - most duplicated overall, most duplicated per agency/category/supplier etc.
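/* Reference query: every description that occurs more than once, together
   with its occurrence count, ordered by ascending count.

SELECT description, count(*) as count
FROM `contractnotice`
group by description having count > 1 order by count

*/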
|
/*
 * Heuristic: duplicated description.
 * Ideas: most duplicated overall; most duplicated per agency, category,
 * supplier, etc.
 */
|
// Add this heuristic's entry to $heuristics, keyed by the name of the
// function that implements it. The description text is currently empty.
$heuristics['METADATA_DUPLICATED_DESCRIPTION'] = array('description' => '');
|
/**
 * Heuristic: how unusual is the number of contract notices that share this
 * notice's description?
 *
 * Counts the rows in `contractnotice` whose description equals the notice's
 * description, then expresses that count as an absolute distance from the
 * mean duplicate count, measured in standard deviations.
 *
 * @param array|object $cn Contract notice record. Assumed to expose its text
 *                         under the key/property "description" -- TODO confirm
 *                         against the caller.
 * @return array Keys: "heuristic_value" (0 when the description is not
 *               duplicated or no usable stddev is available), "raw_value"
 *               (number of notices with this description), "mean" and
 *               "stddev" (as returned by the stats getters).
 */
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
    $averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
    $stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();

    // The query previously compared against an undefined $agencyName, so it
    // never looked at the notice being scored. Use the notice's description.
    $description = '';
    if (is_array($cn) && isset($cn['description'])) {
        $description = $cn['description'];
    } elseif (is_object($cn) && isset($cn->description)) {
        $description = $cn->description;
    }

    // Escape the value: descriptions are free text and may contain quotes.
    $query = 'select count(*) from contractnotice where description = "'
        . mysql_real_escape_string($description) . '"';

    $dupeDesc = 0;
    $result = mysql_query($query);
    if ($result) {
        $r = mysql_fetch_array($result, MYSQL_BOTH);
        if ($r) {
            $dupeDesc = (int) $r[0];
        }
    }

    if ($dupeDesc <= 1 || !$stddevDuplicatedDescriptions) {
        // Not duplicated, or no spread to measure against (also avoids a
        // division by zero when the stddev is 0 or unavailable).
        $value = 0;
    } else {
        $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
    }

    return array(
        "heuristic_value" => $value,
        "raw_value" => $dupeDesc,
        "mean" => $averageDuplicatedDescriptions,
        "stddev" => $stddevDuplicatedDescriptions
    );
}
|
// Global slot read by getAverageDuplicatedDescriptions() through `global`.
// This bare statement assigns nothing.
$averageDuplicatedDescriptions;

/**
 * Returns the global $averageDuplicatedDescriptions.
 *
 * When the global is falsy, getStatsDuplicatedDescriptions() is called once
 * before the global is read again and returned.
 *
 * @return mixed Current value of the global $averageDuplicatedDescriptions.
 */
function getAverageDuplicatedDescriptions()
{
    global $averageDuplicatedDescriptions;

    if ($averageDuplicatedDescriptions) {
        return $averageDuplicatedDescriptions;
    }

    getStatsDuplicatedDescriptions();

    return $averageDuplicatedDescriptions;
}
|
// Global slot read by getstddevDuplicatedDescriptions() through `global`.
// This bare statement assigns nothing.
$stddevDuplicatedDescriptions;

/**
 * Returns the global $stddevDuplicatedDescriptions.
 *
 * When the global is falsy, getStatsDuplicatedDescriptions() is called once
 * before the global is read again and returned.
 *
 * @return mixed Current value of the global $stddevDuplicatedDescriptions.
 */
function getstddevDuplicatedDescriptions()
{
    global $stddevDuplicatedDescriptions;

    if ($stddevDuplicatedDescriptions) {
        return $stddevDuplicatedDescriptions;
    }

    getStatsDuplicatedDescriptions();

    return $stddevDuplicatedDescriptions;
}
|
/**
 * Queries the mean and standard deviation of the per-description occurrence
 * counts, considering only descriptions that occur more than once, and stores
 * them in the globals $averageDuplicatedDescriptions and
 * $stddevDuplicatedDescriptions.
 *
 * If the query fails or yields no row, both globals are left untouched.
 *
 * @return void
 */
function getStatsDuplicatedDescriptions()
{
    // Without this declaration the assignments below would only set local
    // variables, and the globals read by the getter functions would never be
    // populated.
    global $averageDuplicatedDescriptions, $stddevDuplicatedDescriptions;

    $query = "select avg(count),STDDEV(count) from (
        SELECT description, count(*) as count
        FROM `contractnotice`
        group by description having count > 1
    ) as a;";

    $result = mysql_query($query);
    if (!$result) {
        return;
    }

    $r = mysql_fetch_array($result, MYSQL_BOTH);
    if (!$r) {
        return;
    }

    $averageDuplicatedDescriptions = $r[0];
    $stddevDuplicatedDescriptions = $r[1];
}
|
|