<?php
/*
 * Heuristic: duplicated contract notice descriptions.
 *
 * Reference query listing every duplicated description:
 *
 *   SELECT description, count(*) as count FROM `contractnotice`
 *   group by description having count > 1 order by count
 */
/*- duplicated description - most duplicated overall, most duplicated per agency/category/supplier etc. */

// Register the heuristic under the same name as its scoring function.
$heuristics["METADATA_DUPLICATED_DESCRIPTION"] = array(
    "description" => ""
);

/**
 * Scores how unusual the number of notices sharing this notice's description is.
 *
 * The score is the absolute distance between this description's row count and
 * the mean row count of all duplicated descriptions, expressed in standard
 * deviations. A description that occurs at most once scores 0, as does any
 * notice when the standard deviation is zero or unavailable (the score would
 * otherwise be a division by zero).
 *
 * @param array $cn Contract notice row. NOTE(review): assumed to be an
 *                  associative array carrying a 'description' key, matching
 *                  the `contractnotice.description` column - confirm against
 *                  the caller. Anything else is treated as an empty description.
 *
 * @return array Keys: "heuristic_value" (score), "raw_value" (number of rows
 *               with this description), "mean" and "stddev" (table-wide stats).
 */
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
    $averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
    $stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();

    $description = '';
    if (is_array($cn) && isset($cn['description'])) {
        $description = (string) $cn['description'];
    }

    // Escape the value: descriptions are free text and may contain quotes.
    $query = 'select count(*) from contractnotice where description = "'
        . mysql_real_escape_string($description) . '"';
    $result = mysql_query($query);
    $r = ($result === false) ? false : mysql_fetch_array($result, MYSQL_BOTH);
    // A failed query or an empty result is treated as "no matching rows".
    $dupeDesc = is_array($r) ? (int) $r[0] : 0;

    if ($dupeDesc <= 1 || !$stddevDuplicatedDescriptions) {
        $value = 0;
    } else {
        $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
    }

    return array(
        "heuristic_value" => $value,
        "raw_value" => $dupeDesc,
        "mean" => $averageDuplicatedDescriptions,
        "stddev" => $stddevDuplicatedDescriptions
    );
}

// Request-level cache of the mean; null means "not loaded yet".
$averageDuplicatedDescriptions = null;

/**
 * Returns the mean number of rows per duplicated description, loading and
 * caching the statistics on first use.
 *
 * @return float
 */
function getAverageDuplicatedDescriptions()
{
    global $averageDuplicatedDescriptions;

    // Compare against null so a legitimately cached 0 is not re-queried.
    if ($averageDuplicatedDescriptions === null) {
        getStatsDuplicatedDescriptions();
    }

    return $averageDuplicatedDescriptions;
}

// Request-level cache of the standard deviation; null means "not loaded yet".
$stddevDuplicatedDescriptions = null;

/**
 * Returns the standard deviation of rows per duplicated description, loading
 * and caching the statistics on first use.
 *
 * @return float
 */
function getstddevDuplicatedDescriptions()
{
    global $stddevDuplicatedDescriptions;

    if ($stddevDuplicatedDescriptions === null) {
        getStatsDuplicatedDescriptions();
    }

    return $stddevDuplicatedDescriptions;
}

/**
 * Loads the mean and standard deviation of the per-description row counts
 * (only descriptions occurring more than once) into the two global caches.
 *
 * Both caches are set to 0.0 when the query fails or returns NULLs, which is
 * what the aggregates yield when no description is duplicated.
 *
 * @return void
 */
function getStatsDuplicatedDescriptions()
{
    // Without this declaration the assignments below would only create locals
    // and the caches read by the getters would never be filled.
    global $averageDuplicatedDescriptions, $stddevDuplicatedDescriptions;

    $query = "select avg(count),STDDEV(count) from ("
        . " SELECT description, count(*) as count FROM `contractnotice`"
        . " group by description having count > 1"
        . " ) as a;";
    $result = mysql_query($query);
    $r = ($result === false) ? false : mysql_fetch_array($result, MYSQL_BOTH);

    $averageDuplicatedDescriptions = is_array($r) ? (float) $r[0] : 0.0;
    $stddevDuplicatedDescriptions = is_array($r) ? (float) $r[1] : 0.0;
}