More value heuristics
[contractdashboard.git] / heuristics / metadataHeuristics.php
blob:a/heuristics/metadataHeuristics.php -> blob:b/heuristics/metadataHeuristics.php
--- a/heuristics/metadataHeuristics.php
+++ b/heuristics/metadataHeuristics.php
@@ -1,12 +1,59 @@
  <?php
- /*- duplicated description
-        - most duplicated overall, most duplicated per agency/category/supplier etc. */
-   $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
-	"description" => "unusual value for time of year");
+/* All duplicated descriptions with their occurrence counts:
+   SELECT description, count(*) as count
+FROM `contractnotice` 
+group by description having count > 1 order by count
+*/
+/*- duplicated description
+ - most duplicated overall, most duplicated per agency/category/supplier etc. */
+// Register the heuristic's metadata; its description text is currently an empty string.
+$heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array("description" => "");
 function METADATA_DUPLICATED_DESCRIPTION($cn)
 {
-    	$averageContractPeriod = getAverageContractPeriod();
-	$diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
-	$days = intval($diff / 24);
-	return ($days > 45 ? 1 : 0);
+	$averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
+	$stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();
+	$query = 'select count(*) from contractnotice where description = "' . $agencyName . '"';
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$dupeDesc = $r[0];
+        if ($dupeDesc == 1) $value = 0;
+	else $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
+	return Array(
+		"heuristic_value" => $value,
+		"raw_value" => $dupeDesc,
+		"mean" => $averageDuplicatedDescriptions,
+		"stddev" => $stddevDuplicatedDescriptions
+	);
 }
+// File-scope variable read by getAverageDuplicatedDescriptions() via `global`.
+$averageDuplicatedDescriptions;
+/**
+ * Returns the global $averageDuplicatedDescriptions.
+ *
+ * When the global is falsy, getStatsDuplicatedDescriptions() is called first
+ * and the global is then returned in whatever state that call left it.
+ *
+ * @return mixed Current value of the global $averageDuplicatedDescriptions.
+ */
+function getAverageDuplicatedDescriptions()
+{
+	global $averageDuplicatedDescriptions;
+	if ($averageDuplicatedDescriptions) {
+		// Already holds a truthy value: no need to run the statistics query.
+		return $averageDuplicatedDescriptions;
+	}
+	getStatsDuplicatedDescriptions();
+	return $averageDuplicatedDescriptions;
+}
+// File-scope variable read by getstddevDuplicatedDescriptions() via `global`.
+$stddevDuplicatedDescriptions;
+/**
+ * Returns the global $stddevDuplicatedDescriptions.
+ *
+ * When the global is falsy, getStatsDuplicatedDescriptions() is called first
+ * and the global is then returned in whatever state that call left it.
+ *
+ * @return mixed Current value of the global $stddevDuplicatedDescriptions.
+ */
+function getstddevDuplicatedDescriptions()
+{
+	global $stddevDuplicatedDescriptions;
+	if ($stddevDuplicatedDescriptions) {
+		// Already holds a truthy value: no need to run the statistics query.
+		return $stddevDuplicatedDescriptions;
+	}
+	getStatsDuplicatedDescriptions();
+	return $stddevDuplicatedDescriptions;
+}
+/**
+ * Computes the mean and standard deviation of the per-description notice
+ * counts, considering only descriptions that occur more than once, and stores
+ * them in the globals $averageDuplicatedDescriptions and
+ * $stddevDuplicatedDescriptions.
+ *
+ * If the query fails, both globals are left untouched.
+ *
+ * @return void
+ */
+function getStatsDuplicatedDescriptions()
+{
+	// Without this declaration the assignments below only create function-local
+	// variables, so the values would never reach the globals read elsewhere.
+	global $averageDuplicatedDescriptions, $stddevDuplicatedDescriptions;
+	$query = "select avg(count),STDDEV(count) from (
+        SELECT description, count(*) as count
+FROM `contractnotice` 
+group by description having count > 1 
+        ) as a;";
+	$result = mysql_query($query);
+	if (!$result) {
+		// mysql_query() returns false on error; there is nothing to fetch.
+		return;
+	}
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$averageDuplicatedDescriptions = $r[0];
+	$stddevDuplicatedDescriptions = $r[1];
+}
+