More value heuristics
More value heuristics

--- a/heuristics/dateHeuristics.php
+++ b/heuristics/dateHeuristics.php
@@ -20,27 +20,32 @@
 $averageContractPeriod;
 function getAverageContractPeriod()
 {
-	global $averageContractPeriod;
+	global $averageContractPeriod, $stddevContractPeriod;
 	if (!$averageContractPeriod) {
-		$query = "select AVG(dateDiff(contractEnd,contractStart)) from contractnotice";
-		$result = mysql_query($query);
-		$r = mysql_fetch_array($result, MYSQL_BOTH);
-		$averageContractPeriod = $r[0];
+		getStddevAverageContractPeriod();
 	}
 	return $averageContractPeriod;
 }
 $stddevContractPeriod;
 function getstddevContractPeriod()
 {
-	global $stddevContractPeriod;
+	global $averageContractPeriod, $stddevContractPeriod;
 	if (!$stddevContractPeriod) {
-		$query = "select STDDEV(dateDiff(contractEnd,contractStart)) from contractnotice";
-		$result = mysql_query($query);
-		$r = mysql_fetch_array($result, MYSQL_BOTH);
-		$stddevContractPeriod = $r[0];
+		getStddevAverageContractPeriod();
 	}
 	return $stddevContractPeriod;
 }
+function getStddevAverageContractPeriod()
+{
+	global $averageContractPeriod, $stddevContractPeriod;
+	$query = "select AVG(dateDiff(contractEnd,contractStart)),stddev(dateDiff(contractEnd,contractStart)) from contractnotice";
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$averageContractPeriod = $r[0];
+	$stddevContractPeriod = $r[1];
+}
+
+
 //Reported late, 45 days? A late contract is a dodgy contract except maybe for variations?
 $heuristics["DATE_REPORTED_LATE"] = Array(
 	"description" => "Reported late, 45 days?"
@@ -70,10 +75,7 @@
 {
 	global $averageDaysLate;
 	if (!$averageDaysLate) {
-		$query = "select AVG(dateDiff(publishDate,contractStart)) from contractnotice";
-		$result = mysql_query($query);
-		$r = mysql_fetch_array($result, MYSQL_BOTH);
-		$averageDaysLate = $r[0];
+            getDaysLate();
 	}
 	return $averageDaysLate;
 }
@@ -82,11 +84,19 @@
 {
 	global $stddevDaysLate;
 	if (!$stddevDaysLate) {
-		$query = "select STDDEV(dateDiff(publishDate,contractStart)) from contractnotice";
-		$result = mysql_query($query);
-		$r = mysql_fetch_array($result, MYSQL_BOTH);
-		$stddevDaysLate = $r[0];
+	getDaysLate();
 	}
 	return $stddevDaysLate;
 }
+function getDaysLate() {
+    
+	global $averageDaysLate,$stddevDaysLate;
+    
+    		$query = "select AVG(dateDiff(publishDate,contractStart)), STDDEV(dateDiff(publishDate,contractStart)) from contractnotice";
+		$result = mysql_query($query);
+		$r = mysql_fetch_array($result, MYSQL_BOTH);
+		$averageDaysLate = $r[0];
+                	$stddevDaysLate = $r[1];
+
+}
 ?>

--- a/heuristics/heuristics.inc.php
+++ b/heuristics/heuristics.inc.php
@@ -1,20 +1,25 @@
 <?php
-  include_once("../lib/common.inc.php");
+include_once ("../lib/common.inc.php");
 $heuristics = Array();
 //each heuristic adds self to description array
 include ("dateHeuristics.php");
-//include("historyHeuristics.php");
-//include("metadataHeuristics.php");
-//include("valueHeuristics.php");
-// method signature heuristic($contractNoticeAsArray);
+//include ("historyHeuristics.php");
+//include ("metadataHeuristics.php");
+//include ("valueHeuristics.php");
 function runHeuristic($heuristicName, $cn)
 {
-	$hresults = call_user_func($heuristicName, $cn);
-	if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"]) || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
-		print_r($hresults);
-		die("Missing field in heurtistic $heuristicName result");
-	}
-	$query = "insert into heuristic_results values('$heuristicName',
+	// check  if already ran
+	$query = "select count(*) from heuristic_results where heuristic_name = '$heuristicName' and CNID = '{$CN['CNID']}";
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result);
+	if ($r[0] == 0) {
+		// if not, run now
+		$hresults = call_user_func($heuristicName, $cn);
+		if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"]) || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
+			print_r($hresults);
+			die("Missing field in heurtistic $heuristicName result");
+		}
+		$query = "insert into heuristic_results values('$heuristicName',
     '{$hresults["heuristic_value"]}',
      '{$hresults["raw_value"]}',
       '{$hresults["mean"]}',
@@ -25,9 +30,10 @@
            '{$cn["agencyABN"]}',
            '{$cn["supplierID"]}'
     )";
-	// save value and cn data via sql
-	$result = mysql_query($query);
-	if ($result) echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
-	elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.<br>" . mysql_error() . " <br>  $query <br><br>\n";
+		// save value and cn data via sql
+		$result = mysql_query($query);
+		if ($result) echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
+		elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.<br>" . mysql_error() . " <br>  $query <br><br>\n";
+	}
 }
 ?>

--- a/heuristics/historyHeuristics.php
+++ b/heuristics/historyHeuristics.php
@@ -1,25 +1,126 @@
-<?php     
-    // "unusual for agency/supplier due to previous low number of transactions "
-$heuristics["HISTORY_LOW_TRANSACTIONS"] = Array(
-	"description" => "unusual for agency/supplier due to previous low number of transactions "
+<?php
+$heuristics["HISTORY_LOW_TRANSACTIONS_AGENCY"] = Array(
+	"description" => "unusual for agency due to previous low number of transactions "
 );
-function HISTORY_LOW_TRANSACTIONS($cn)
+function HISTORY_LOW_TRANSACTIONS_AGENCY($cn)
 {
-    	$averageContractPeriod = getAverageContractPeriod();
-	$diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
-	$days = intval($diff / 24);
-	return ($days > 45 ? 1 : 0);
+	$thisAgencyTransactions = getAgencyTransactions($cn['agencyName']);
+	$averageAgencyTransactions = getAverageAgencyTransactions();
+	$stddevAgencyTransactions = getstddevAgencyTransactions();
+	$diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']);
+	$days = intval($diff / (60 * 60 * 24));
+	$value = abs($days - $averageAgencyTransactions) / $stddevAgencyTransactions;
+	return Array(
+		"heuristic_value" => $value,
+		"raw_value" => $days,
+		"mean" => $averageAgencyTransactions,
+		"stddev" => $stddevAgencyTransactions
+	);
 }
- /*   - unusual value for time of year
-        - compare to all other records in last 2 weeks
-        - ie. many large contracts in june so takes more to standout*/
- 
- $heuristics["HISTORY_HIGH_VALUE_FOR_MONTH"] = Array(
-	"description" => "unusual value for time of year");
-function HISTORY_HIGH_VALUE_FOR_MONTH($cn)
+$agencyTransactions = Array();
+function getAgencyTransactions($agencyName)
 {
-    	$averageContractPeriod = getAverageContractPeriod();
-	$diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
-	$days = intval($diff / 24);
-	return ($days > 45 ? 1 : 0);
+	global $agencyTransactions;
+	if (!$agencyTransactions[$agencyName]) {
+		$query = 'select count(*) from contractnotice where agencyName = "' . $agencyName . '"';
+		$result = mysql_query($query);
+		$r = mysql_fetch_array($result, MYSQL_BOTH);
+		$agencyTransactions[$agencyName] = $r[0];
+	}
+	return $agencyTransactions[$agencyName];
 }
+$averageAgencyTransactions;
+function getAverageAgencyTransactions()
+{
+	global $averageAgencyTransactions;
+	if (!$averageAgencyTransactions) {
+		getStatsAgencyTransactions();
+	}
+	return $averageAgencyTransactions;
+}
+$stddevAgencyTransactions;
+function getstddevAgencyTransactions()
+{
+	global $stddevAgencyTransactions;
+	if (!$stddevAgencyTransactions) {
+		getStatsAgencyTransactions();
+	}
+	return $stddevAgencyTransactions;
+}
+function getStatsAgencyTransactions()
+{
+	global $averageAgencyTransactions, $stddevAgencyTransactions;
+	$query = "select avg(count), STDDEV(count) from (select count(*) as count
+                from contractnotice group by agencyName) as a;";
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$averageAgencyTransactions = $r[0];
+	$stddevAgencyTransactions = $r[1];
+}
+$heuristics["HISTORY_LOW_TRANSACTIONS_SUPPLIER"] = Array(
+	"description" => "unusual for supplier due to previous low number of transactions "
+);
+function HISTORY_LOW_TRANSACTIONS_SUPPLIER($cn)
+{
+	$thisSupplierTransactions = getSupplierTransactions($cn['supplierName'], $cn['supplierABN']);
+	$averageSupplierTransactions = getAverageSupplierTransactions();
+	$stddevSupplierTransactions = getstddevSupplierTransactions();
+	$diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']);
+	$days = intval($diff / (60 * 60 * 24));
+	$value = abs($days - $averageSupplierTransactions) / $stddevSupplierTransactions;
+	return Array(
+		"heuristic_value" => $value,
+		"raw_value" => $days,
+		"mean" => $averageSupplierTransactions,
+		"stddev" => $stddevSupplierTransactions
+	);
+}
+$supplierTransactions = Array();
+function getSupplierTransactions($supplierName, $supplierABN)
+{
+	global $supplierTransactions;
+	if ($supplierABN != 0 && $supplierABN != "") {
+		if (!$supplierTransactions[$supplierABN]) {
+			$query = 'select count(*) from contractnotice where supplierABN = "' . $supplierABN . '"';
+			$result = mysql_query($query);
+			$r = mysql_fetch_array($result, MYSQL_BOTH);
+			$supplierTransactions[$supplierABN] = $r[0];
+		}
+		return $supplierTransactions[$supplierABN];
+	}
+	if (!$supplierTransactions[$supplierName]) {
+		$query = 'select count(*) from contractnotice where supplierName = "' . $supplierName . '"';
+		$result = mysql_query($query);
+		$r = mysql_fetch_array($result, MYSQL_BOTH);
+		$supplierTransactions[$supplierName] = $r[0];
+	}
+	return $supplierTransactions[$supplierName];
+}
+$averageSupplierTransactions;
+function getAverageSupplierTransactions()
+{
+	global $averageSupplierTransactions;
+	if (!$averageSupplierTransactions) {
+		getStatsSupplierTransactions();
+	}
+	return $averageSupplierTransactions;
+}
+$stddevSupplierTransactions;
+function getstddevSupplierTransactions()
+{
+	global $stddevSupplierTransactions;
+	if (!$stddevSupplierTransactions) {
+		getStatsSupplierTransactions();
+	}
+	return $stddevSupplierTransactions;
+}
+function getStatsSupplierTransactions()
+{
+	global $averageSupplierTransactions, $stddevSupplierTransactions;
+	$query = 'select avg(count), stddev(count) from (select IF(supplierABN != "",supplierABN,supplierName) as supplierID, count(*) as count from contractnotice group by supplierID) as a;';
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$averageSupplierTransactions = $r[0];
+	$stddevSupplierTransactions = $r[1];
+}
+

--- a/heuristics/metadataHeuristics.php
+++ b/heuristics/metadataHeuristics.php
@@ -1,12 +1,59 @@
  <?php
- /*- duplicated description
-        - most duplicated overall, most duplicated per agency/category/supplier etc. */
-   $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
-	"description" => "unusual value for time of year");
+/* all
+   SELECT description, count(*) as count
+FROM `contractnotice` 
+group by description having count > 1 order by count
+*/
+/*- duplicated description
+ - most duplicated overall, most duplicated per agency/category/supplier etc. */
+$heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+	"description" => ""
+);
 function METADATA_DUPLICATED_DESCRIPTION($cn)
 {
-    	$averageContractPeriod = getAverageContractPeriod();
-	$diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
-	$days = intval($diff / 24);
-	return ($days > 45 ? 1 : 0);
+	$averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
+	$stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();
+	$query = 'select count(*) from contractnotice where description = "' . $agencyName . '"';
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$dupeDesc = $r[0];
+        if ($dupeDesc == 1) $value = 0;
+	else $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
+	return Array(
+		"heuristic_value" => $value,
+		"raw_value" => $dupeDesc,
+		"mean" => $averageDuplicatedDescriptions,
+		"stddev" => $stddevDuplicatedDescriptions
+	);
 }
+$averageDuplicatedDescriptions;
+function getAverageDuplicatedDescriptions()
+{
+	global $averageDuplicatedDescriptions;
+	if (!$averageDuplicatedDescriptions) {
+		getStatsDuplicatedDescriptions();
+	}
+	return $averageDuplicatedDescriptions;
+}
+$stddevDuplicatedDescriptions;
+function getstddevDuplicatedDescriptions()
+{
+	global $stddevDuplicatedDescriptions;
+	if (!$stddevDuplicatedDescriptions) {
+		getStatsDuplicatedDescriptions();
+	}
+	return $stddevDuplicatedDescriptions;
+}
+function getStatsDuplicatedDescriptions()
+{
+	$query = "select avg(count),STDDEV(count) from (
+        SELECT description, count(*) as count
+FROM `contractnotice` 
+group by description having count > 1 
+        ) as a;";
+	$result = mysql_query($query);
+	$r = mysql_fetch_array($result, MYSQL_BOTH);
+	$averageDuplicatedDescriptions = $r[0];
+	$stddevDuplicatedDescriptions = $r[1];
+}
+

--- a/heuristics/runHeuristics.php
+++ b/heuristics/runHeuristics.php
@@ -7,7 +7,7 @@
 FROM contractnotice JOIN agency ON contractnotice.agencyName=agency.agencyName
 WHERE  DATE(importDate) = (select * from (SELECT DATE(importDate) 
 FROM contractnotice ORDER BY importDate DESC limit 1) alias)";
-$result = mysql_query($lastimportquery);
+$result = mysql_query($query);
 if (!$result) echo mysql_error().$query;
 while ($cn = mysql_fetch_array($result, MYSQL_BOTH)) {
 	//get each new CN from latest update

--- a/heuristics/valueHeuristics.php
+++ b/heuristics/valueHeuristics.php
@@ -1,8 +1,8 @@
-
-        - large contract value
-          - chi-square test for outliers / standard dev from mean/median
-          - percent of total contracts for supplier/agency
-   $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+<?php 
+      /*  - large contract value
+          - standard dev from mean/median
+          - percent of total contracts for supplier/agency*/
+   $heuristics["VALUE_LARGE_CONTRACT_OVERALL"] = Array(
 	"description" => "unusual value for time of year");
 function METADATA_DUPLICATED_DESCRIPTION($cn)
 {
@@ -12,12 +12,15 @@
 	return ($days > 45 ? 1 : 0);
 }
 
-        - peculiar value
+      /*  - peculiar value
         - Just under 80k, amplified if other contracts with same supplier are just under
-    - unusual variation amount
-        - absolute value; large reductions as well as large increases
-        
-           $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+      */
+         $heuristics["VALUE_NEAR_THRESHOLD"] = Array(
+	"description" => "unusual value for time of year");
+      /*
+    - unusual variation amount - absolute value; large reductions as well as large increases
+      */
+           $heuristics["VALUE_LARGE_VARIATION"] = Array(
 	"description" => "unusual value for time of year");
 function METADATA_DUPLICATED_DESCRIPTION($cn)
 {
@@ -26,3 +29,30 @@
 	$days = intval($diff / 24);
 	return ($days > 45 ? 1 : 0);
 }
+
+/*   - unusual value for time of year
+        - compare to all other records in last 2 weeks
+        - ie. many large contracts in june so takes more to standout*/
+$heuristics["VALUE_HIGH_FOR_MONTH"] = Array(
+	"description" => "unusual value for time of year"
+);
+function VALUE_HIGH_FOR_MONTH($cn, $monthAsInt)
+{
+	$averageContractPeriod = getAverageContractPeriod();
+	$diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
+	$days = intval($diff / 24);
+	return ($days > 45 ? 1 : 0);
+}
+$monthlyValueAverage = Array();
+function getAgencyTransactions($agencyName)
+{
+	global $agencyTransactions;
+	if (!$agencyTransactions[$agencyName]) {
+		$query = 'select count(*) from contractnotice where agencyName = "' . $agencyName . '"';
+		$result = mysql_query($query);
+		$r = mysql_fetch_array($result, MYSQL_BOTH);
+		$agencyTransactions[$agencyName] = $r[0];
+	}
+	return $agencyTransactions[$agencyName];
+}
+?>

--- a/heuristics/viewHeuristicsColormap.php
+++ b/heuristics/viewHeuristicsColormap.php
@@ -1,5 +1,11 @@
 <?php    
   include_once("../lib/common.inc.php");
+  echo '<style>
+  div {
+  padding: 5px;
+  display: inline-block;
+  }
+  </style>';
 // http://www.herethere.net/~samson/php/color_gradient/color_gradient_generator.php.txt
 // return the interpolated value between pBegin and pEnd
 function interpolate($pBegin, $pEnd, $pStep, $pMax)
@@ -42,13 +48,13 @@
 $maxVal = $r[0];
   
 $query = "SELECT sum(heuristic_value) as sum, CNID
-FROM `heuristic_results` group by CNID order by sum DESC LIMIT 30";
+FROM `heuristic_results` group by CNID order by sum DESC LIMIT 300";
 $result = mysql_query($query);
 if (!$result) echo mysql_error().$query;
 while ($r = mysql_fetch_array($result, MYSQL_BOTH)) {
-    echo '<span style="background: #'.$Gradients[floor(($r['sum']/$maxVal) * 10)].'; padding: 5px;">';
+    echo '<div style="background: #'.$Gradients[floor(($r['sum']/$maxVal) * 10)].';">';
     echo '<a title="'.$r['sum'].'" href="../displayContract.php?CNID='.$r['CNID'].'">X</a>';
-    echo "</span>";
+    echo "</div>";
 }
 
 ?>

--- a/heuristics/viewHeuristicsDistribution.php
+++ b/heuristics/viewHeuristicsDistribution.php
@@ -14,22 +14,32 @@
  include("../lib/pChart2.1.0/class/pData.class.php"); 
  include("../lib/pChart2.1.0/class/pDraw.class.php"); 
  include("../lib/pChart2.1.0/class/pImage.class.php"); 
-$labels = Array();
-$values = Array();
+
+$series = Array();
 
   include_once("../lib/common.inc.php");
-$query = "select floor(sum) as val,count(*) from (SELECT sum(heuristic_value) 
-as sum FROM heuristic_results group by CNID) as a group by val";
+$query = "select heuristic_name, floor(heuristic_value) as val,count(*) from heuristic_results group by heuristic_name, val";
 $result = mysql_query($query);
 if (!$result) echo mysql_error().$query;
 while ($r = mysql_fetch_array($result, MYSQL_BOTH)) {
-    $labels[] = $r[0];
-    $values[] = $r[1];
+    $series[$r["heuristic_name"]][$r["val"]] = $r[2];
 }
 
  /* Create and populate the pData object */ 
- $MyData = new pData();   
- $MyData->addPoints($values,"Records"); 
+ $MyData = new pData();
+ $labels = Array();
+ foreach ($series as $value) {
+    $labels = $labels+array_keys($value);
+ }
+ $labels =  Array(0,1,2,3,4,5);
+foreach ($series as $seriesName => $seriesEntry) {
+    $data;
+ foreach ($labels as $label) {
+    $data[$label] = ($seriesEntry[$label] ? $seriesEntry[$label] : 0);
+ }
+  
+$MyData->addPoints($data,$seriesName); 
+ }
  $MyData->setAxisName(0,"# of records"); 
  $MyData->addPoints($labels,"Labels"); 
  $MyData->setSerieDescription("Labels","Bins"); 
@@ -65,7 +75,7 @@
  $myPicture->drawSplineChart(); 
 
  /* Write the chart legend */ 
- $myPicture->drawLegend(540,20,array("Style"=>LEGEND_NOBORDER,"Mode"=>LEGEND_HORIZONTAL)); 
+ $myPicture->drawLegend(540,20,array("Style"=>LEGEND_NOBORDER,"Mode"=>LEGEND_VERTICAL)); 
 
  /* Render the picture (choose the best way) */ 
  $myPicture->autoOutput("pictures/example.drawSplineChart.simple.png");