Add initial date based heuristics
--- a/heuristics/dateHeuristics.php
+++ b/heuristics/dateHeuristics.php
@@ -1,4 +1,92 @@
- - long contract period (number of weeks/days?)
- - Reported late
- - 45 days? A late contract is a dodgy contract except maybe for variations?
-
+<?php
+// Heuristic: contract period (in days) that is unusual compared to all contract notices.
+$heuristics["DATE_LONG_CONTRACT_PERIOD"] = Array(
+    "description" => "long contract period (number of weeks/days?)"
+);
+/**
+ * Scores a contract notice by how far its duration lies from the average duration.
+ *
+ * The score is |days - mean| / stddev, so periods much shorter than the mean
+ * score as highly as periods much longer than it.
+ *
+ * @param array $cn contract notice row; reads 'contractStart' and 'contractEnd'
+ * @return array with keys heuristic_value, raw_value (duration in whole days), mean, stddev
+ */
+function DATE_LONG_CONTRACT_PERIOD($cn)
+{
+    $averageContractPeriod = getAverageContractPeriod();
+    $stddevContractPeriod = getstddevContractPeriod();
+    // strtotime() yields seconds; convert the difference to whole days.
+    $diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']);
+    $days = intval($diff / (60 * 60 * 24));
+    // A zero or missing standard deviation (no rows, or all rows identical) would
+    // otherwise cause a division by zero; treat it as "no deviation measurable".
+    if (floatval($stddevContractPeriod) > 0) {
+        $value = abs($days - $averageContractPeriod) / $stddevContractPeriod;
+    } else {
+        $value = 0;
+    }
+    return Array(
+        "heuristic_value" => $value,
+        "raw_value" => $days,
+        "mean" => $averageContractPeriod,
+        "stddev" => $stddevContractPeriod
+    );
+}
+// Cache for the mean contract period so the aggregate query runs at most once per request.
+$averageContractPeriod = null;
+/**
+ * Returns AVG(dateDiff(contractEnd, contractStart)) over the contractnotice table.
+ *
+ * The value is fetched on first use and then served from the global cache.
+ *
+ * @return mixed the average as returned by MySQL, or null if the query failed
+ */
+function getAverageContractPeriod()
+{
+    global $averageContractPeriod;
+    // Compare against null rather than testing truthiness so that a legitimate
+    // average of 0 is cached instead of being re-queried on every call.
+    if ($averageContractPeriod === null) {
+        $query = "select AVG(dateDiff(contractEnd,contractStart)) from contractnotice";
+        $result = mysql_query($query);
+        if (!$result) {
+            // Report the failure instead of handing false to mysql_fetch_array().
+            echo mysql_error() . $query;
+            return null;
+        }
+        $r = mysql_fetch_array($result, MYSQL_BOTH);
+        $averageContractPeriod = $r[0];
+    }
+    return $averageContractPeriod;
+}
+// Cache for the contract period standard deviation so the aggregate query runs at most once per request.
+$stddevContractPeriod = null;
+/**
+ * Returns STDDEV(dateDiff(contractEnd, contractStart)) over the contractnotice table.
+ *
+ * The value is fetched on first use and then served from the global cache.
+ *
+ * @return mixed the standard deviation as returned by MySQL, or null if the query failed
+ */
+function getstddevContractPeriod()
+{
+    global $stddevContractPeriod;
+    // Compare against null rather than testing truthiness so that a legitimate
+    // standard deviation of 0 is cached instead of being re-queried on every call.
+    if ($stddevContractPeriod === null) {
+        $query = "select STDDEV(dateDiff(contractEnd,contractStart)) from contractnotice";
+        $result = mysql_query($query);
+        if (!$result) {
+            // Report the failure instead of handing false to mysql_fetch_array().
+            echo mysql_error() . $query;
+            return null;
+        }
+        $r = mysql_fetch_array($result, MYSQL_BOTH);
+        $stddevContractPeriod = $r[0];
+    }
+    return $stddevContractPeriod;
+}
+//Reported late, 45 days? A late contract is a dodgy contract except maybe for variations?
+$heuristics["DATE_REPORTED_LATE"] = Array(
+    "description" => "Reported late, 45 days?"
+);
+/**
+ * Scores a contract notice by how long after the contract start it was published.
+ *
+ * Notices published on or before the start day score 0. Otherwise the score is
+ * |days - mean| / stddev, plus one demerit point when the delay exceeds 45 days.
+ *
+ * @param array $cn contract notice row; reads 'publishDate' and 'contractStart'
+ * @return array with keys heuristic_value, raw_value (delay in whole days), mean, stddev
+ */
+function DATE_REPORTED_LATE($cn)
+{
+    $averageDaysLate = getAverageDaysLate();
+    $stddevDaysLate = getStddevDaysLate();
+    // strtotime() yields seconds; convert the difference to whole days.
+    $diff = strtotime($cn['publishDate']) - strtotime($cn['contractStart']);
+    $days = intval($diff / (60 * 60 * 24));
+    if ($days <= 0) {
+        $value = 0;
+    }
+    else {
+        // A zero or missing standard deviation would otherwise cause a division
+        // by zero; in that case only the fixed demerit can contribute.
+        $deviation = 0;
+        if (floatval($stddevDaysLate) > 0) {
+            $deviation = abs($days - $averageDaysLate) / $stddevDaysLate;
+        }
+        // +1 demerit for exceeding 45 day requirement (exactly 45 days does not exceed it)
+        $value = $deviation + ($days > 45 ? 1 : 0);
+    }
+    return Array(
+        "heuristic_value" => $value,
+        "raw_value" => $days,
+        "mean" => $averageDaysLate,
+        "stddev" => $stddevDaysLate
+    );
+}
+// Cache for the mean publication delay so the aggregate query runs at most once per request.
+$averageDaysLate = null;
+/**
+ * Returns AVG(dateDiff(publishDate, contractStart)) over the contractnotice table.
+ *
+ * The value is fetched on first use and then served from the global cache.
+ *
+ * @return mixed the average as returned by MySQL, or null if the query failed
+ */
+function getAverageDaysLate()
+{
+    global $averageDaysLate;
+    // Compare against null rather than testing truthiness so that a legitimate
+    // average of 0 is cached instead of being re-queried on every call.
+    if ($averageDaysLate === null) {
+        $query = "select AVG(dateDiff(publishDate,contractStart)) from contractnotice";
+        $result = mysql_query($query);
+        if (!$result) {
+            // Report the failure instead of handing false to mysql_fetch_array().
+            echo mysql_error() . $query;
+            return null;
+        }
+        $r = mysql_fetch_array($result, MYSQL_BOTH);
+        $averageDaysLate = $r[0];
+    }
+    return $averageDaysLate;
+}
+// Cache for the publication delay standard deviation so the aggregate query runs at most once per request.
+$stddevDaysLate = null;
+/**
+ * Returns STDDEV(dateDiff(publishDate, contractStart)) over the contractnotice table.
+ *
+ * The value is fetched on first use and then served from the global cache.
+ *
+ * @return mixed the standard deviation as returned by MySQL, or null if the query failed
+ */
+function getStddevDaysLate()
+{
+    global $stddevDaysLate;
+    // Compare against null rather than testing truthiness so that a legitimate
+    // standard deviation of 0 is cached instead of being re-queried on every call.
+    if ($stddevDaysLate === null) {
+        $query = "select STDDEV(dateDiff(publishDate,contractStart)) from contractnotice";
+        $result = mysql_query($query);
+        if (!$result) {
+            // Report the failure instead of handing false to mysql_fetch_array().
+            echo mysql_error() . $query;
+            return null;
+        }
+        $r = mysql_fetch_array($result, MYSQL_BOTH);
+        $stddevDaysLate = $r[0];
+    }
+    return $stddevDaysLate;
+}
+?>
--- /dev/null
+++ b/heuristics/heuristics.inc.php
@@ -1,1 +1,33 @@
-
+<?php
+ include_once("../lib/common.inc.php");
+$heuristics = Array();
+//each heuristic adds self to description array
+include ("dateHeuristics.php");
+//include("historyHeuristics.php");
+//include("metadataHeuristics.php");
+//include("valueHeuristics.php");
+// method signature heuristic($contractNoticeAsArray);
+/**
+ * Runs one named heuristic against a contract notice and stores the outcome
+ * as a row in the heuristic_results table.
+ *
+ * The heuristic must return an array with non-null heuristic_value, raw_value,
+ * mean and stddev entries; otherwise the script prints the result and dies.
+ * Inserts rejected with a "Duplicate entry" error are silently skipped.
+ *
+ * @param string $heuristicName name of the heuristic function to call
+ * @param array  $cn            contract notice row; reads CNID, publishDate, agencyABN, supplierID
+ */
+function runHeuristic($heuristicName, $cn)
+{
+    $hresults = call_user_func($heuristicName, $cn);
+    if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"]) || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
+        print_r($hresults);
+        die("Missing field in heurtistic $heuristicName result");
+    }
+    // Escape every value before it goes into the statement: fields such as
+    // supplierID can hold free text (e.g. a name containing an apostrophe),
+    // which would otherwise break or alter the query.
+    $fields = Array(
+        "name" => $heuristicName,
+        "heuristic_value" => $hresults["heuristic_value"],
+        "raw_value" => $hresults["raw_value"],
+        "mean" => $hresults["mean"],
+        "stddev" => $hresults["stddev"],
+        "CNID" => $cn["CNID"],
+        "publishDate" => $cn["publishDate"],
+        "agencyABN" => $cn["agencyABN"],
+        "supplierID" => $cn["supplierID"]
+    );
+    $escaped = Array();
+    foreach ($fields as $field => $raw) {
+        $escaped[$field] = mysql_real_escape_string($raw);
+    }
+    $query = "insert into heuristic_results values('{$escaped["name"]}',
+        '{$escaped["heuristic_value"]}',
+        '{$escaped["raw_value"]}',
+        '{$escaped["mean"]}',
+        '{$escaped["stddev"]}',
+        '{$escaped["CNID"]}',
+        NOW(),
+        '{$escaped["publishDate"]}',
+        '{$escaped["agencyABN"]}',
+        '{$escaped["supplierID"]}'
+        )";
+    // save value and cn data via sql
+    $result = mysql_query($query);
+    if ($result) {
+        echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
+    } elseif (strpos(mysql_error() , "Duplicate entry") === false) {
+        // Name the heuristic that failed; echoing the result array would only print "Array".
+        echo $heuristicName . " failed insert.<br>" . mysql_error() . " <br> $query <br><br>\n";
+    }
+}
+?>
--- a/heuristics/historyHeuristics.php
+++ b/heuristics/historyHeuristics.php
@@ -1,6 +1,25 @@
- - unusual for agency/supplier
- - previous low number of transactions
- - zero ie. new agency/supplier is huge score
- - unusual value for time of year
+<?php
+ // "unusual for agency/supplier due to previous low number of transactions "
+$heuristics["HISTORY_LOW_TRANSACTIONS"] = Array(
+    "description" => "unusual for agency/supplier due to previous low number of transactions "
+);
+/**
+ * Placeholder implementation: flags notices whose contract start is more than
+ * 45 days after the publish date.
+ *
+ * NOTE(review): this does not look at agency/supplier transaction history yet,
+ * and it returns a bare 0/1 whereas runHeuristic() requires an array with
+ * heuristic_value, raw_value, mean and stddev - to be completed before this
+ * file is included.
+ *
+ * @param array $cn contract notice row; reads 'contractStart' and 'publishDate'
+ * @return int 1 when the gap exceeds 45 days, otherwise 0
+ */
+function HISTORY_LOW_TRANSACTIONS($cn)
+{
+    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
+    // strtotime() differences are in seconds, so a day is 60 * 60 * 24 of them.
+    $days = intval($diff / (60 * 60 * 24));
+    return ($days > 45 ? 1 : 0);
+}
+ /* - unusual value for time of year
- compare to all other records in last 2 weeks
- - ie. many large contracts in june so takes more to standout
+ - ie. many large contracts in june so takes more to standout*/
+
+ $heuristics["HISTORY_HIGH_VALUE_FOR_MONTH"] = Array(
+     "description" => "unusual value for time of year");
+/**
+ * Placeholder implementation: flags notices whose contract start is more than
+ * 45 days after the publish date.
+ *
+ * NOTE(review): this does not compare contract values by time of year yet,
+ * and it returns a bare 0/1 whereas runHeuristic() requires an array with
+ * heuristic_value, raw_value, mean and stddev - to be completed before this
+ * file is included.
+ *
+ * @param array $cn contract notice row; reads 'contractStart' and 'publishDate'
+ * @return int 1 when the gap exceeds 45 days, otherwise 0
+ */
+function HISTORY_HIGH_VALUE_FOR_MONTH($cn)
+{
+    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
+    // strtotime() differences are in seconds, so a day is 60 * 60 * 24 of them.
+    $days = intval($diff / (60 * 60 * 24));
+    return ($days > 45 ? 1 : 0);
+}
--- a/heuristics/metadataHeuristics.php
+++ b/heuristics/metadataHeuristics.php
@@ -1,3 +1,12 @@
- - duplicated description
- - most duplicated overall, most duplicated per agency/category/supplier etc.
-
+ <?php
+ /*- duplicated description
+ - most duplicated overall, most duplicated per agency/category/supplier etc. */
+ $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+     "description" => "duplicated description");
+/**
+ * Placeholder implementation: flags notices whose contract start is more than
+ * 45 days after the publish date.
+ *
+ * NOTE(review): this does not inspect the notice description for duplicates
+ * yet, and it returns a bare 0/1 whereas runHeuristic() requires an array with
+ * heuristic_value, raw_value, mean and stddev - to be completed before this
+ * file is included.
+ *
+ * @param array $cn contract notice row; reads 'contractStart' and 'publishDate'
+ * @return int 1 when the gap exceeds 45 days, otherwise 0
+ */
+function METADATA_DUPLICATED_DESCRIPTION($cn)
+{
+    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
+    // strtotime() differences are in seconds, so a day is 60 * 60 * 24 of them.
+    $days = intval($diff / (60 * 60 * 24));
+    return ($days > 45 ? 1 : 0);
+}
--- a/heuristics/runHeuristics.php
+++ b/heuristics/runHeuristics.php
@@ -1,6 +1,29 @@
-<?
-if agency
-if supplier
-if CN
+<?php
+include_once("heuristics.inc.php");
+// Select every contract notice imported on the most recent import date,
+// together with the agency ABN and a supplier identifier (ABN when present,
+// otherwise the supplier name).
+$query = "SELECT *, agency.abn as agencyABN, IF(supplierABN != '',supplierABN,supplierName) as supplierID
+FROM contractnotice JOIN agency ON contractnotice.agencyName=agency.agencyName
+WHERE DATE(importDate) = (select * from (SELECT DATE(importDate)
+FROM contractnotice ORDER BY importDate DESC limit 1) alias)";
+$result = mysql_query($query);
+if (!$result) {
+    // Without a result set there is nothing to iterate over; report the
+    // error instead of passing false to mysql_fetch_array().
+    echo mysql_error().$query;
+} else {
+    while ($cn = mysql_fetch_array($result, MYSQL_BOTH)) {
+        //get each new CN from latest update
+        foreach ($heuristics as $heuristic => $description) {
+            // run all heuristics
+            runHeuristic($heuristic, $cn);
+        }
+        // push progress output to the client after each notice
+        flush();
+    }
+}
+/*foreach agency
+
+aggregate agency metrics
+
+foreach supplier
+
+aggregate supplier metrics
+
+foreach CN
+
+aggregate CN metrics */
?>
--- a/heuristics/valueHeuristics.php
+++ b/heuristics/valueHeuristics.php
@@ -2,9 +2,27 @@
- large contract value
- chi-square test for outliers / standard dev from mean/median
- percent of total contracts for supplier/agency
-
+ $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+     "description" => "unusual value for time of year");
+/**
+ * Placeholder implementation: flags notices whose contract start is more than
+ * 45 days after the publish date.
+ *
+ * NOTE(review): the name and description look copied from another heuristic
+ * and the same function name is declared again further down this file, which
+ * PHP rejects as a redeclaration; a value-specific name is needed before this
+ * file is included. It also returns a bare 0/1 whereas runHeuristic() requires
+ * an array with heuristic_value, raw_value, mean and stddev.
+ *
+ * @param array $cn contract notice row; reads 'contractStart' and 'publishDate'
+ * @return int 1 when the gap exceeds 45 days, otherwise 0
+ */
+function METADATA_DUPLICATED_DESCRIPTION($cn)
+{
+    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
+    // strtotime() differences are in seconds, so a day is 60 * 60 * 24 of them.
+    $days = intval($diff / (60 * 60 * 24));
+    return ($days > 45 ? 1 : 0);
+}
- peculiar value
- Just under 80k, amplified if other contracts with same supplier are just under
- unusual variation amount
- absolute value; large reductions as well as large increases
+
+ $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+     "description" => "unusual value for time of year");
+/**
+ * Placeholder implementation: flags notices whose contract start is more than
+ * 45 days after the publish date.
+ *
+ * NOTE(review): the name and description look copied from another heuristic
+ * and the same function name is already declared earlier in this file, which
+ * PHP rejects as a redeclaration; a value-specific name is needed before this
+ * file is included. It also returns a bare 0/1 whereas runHeuristic() requires
+ * an array with heuristic_value, raw_value, mean and stddev.
+ *
+ * @param array $cn contract notice row; reads 'contractStart' and 'publishDate'
+ * @return int 1 when the gap exceeds 45 days, otherwise 0
+ */
+function METADATA_DUPLICATED_DESCRIPTION($cn)
+{
+    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
+    // strtotime() differences are in seconds, so a day is 60 * 60 * 24 of them.
+    $days = intval($diff / (60 * 60 * 24));
+    return ($days > 45 ? 1 : 0);
+}