More value heuristics
--- a/heuristics/dateHeuristics.php
+++ b/heuristics/dateHeuristics.php
@@ -20,27 +20,32 @@
$averageContractPeriod;
function getAverageContractPeriod()
{
- global $averageContractPeriod;
+ global $averageContractPeriod, $stddevContractPeriod; // cache variables that getStddevAverageContractPeriod() fills
if (!$averageContractPeriod) {
- $query = "select AVG(dateDiff(contractEnd,contractStart)) from contractnotice";
- $result = mysql_query($query);
- $r = mysql_fetch_array($result, MYSQL_BOTH);
- $averageContractPeriod = $r[0];
+ getStddevAverageContractPeriod(); // cache empty or falsy: one query now loads the average and the stddev together
}
return $averageContractPeriod;
}
$stddevContractPeriod;
function getstddevContractPeriod()
{
- global $stddevContractPeriod;
+ global $averageContractPeriod, $stddevContractPeriod; // cache variables that getStddevAverageContractPeriod() fills
if (!$stddevContractPeriod) {
- $query = "select STDDEV(dateDiff(contractEnd,contractStart)) from contractnotice";
- $result = mysql_query($query);
- $r = mysql_fetch_array($result, MYSQL_BOTH);
- $stddevContractPeriod = $r[0];
+ getStddevAverageContractPeriod(); // cache empty or falsy: one query now loads the average and the stddev together
}
return $stddevContractPeriod;
}
+function getStddevAverageContractPeriod() // loads both contract-period statistics with a single query
+{
+ global $averageContractPeriod, $stddevContractPeriod; // results are stored in these globals; nothing is returned
+ $query = "select AVG(dateDiff(contractEnd,contractStart)),stddev(dateDiff(contractEnd,contractStart)) from contractnotice";
+ $result = mysql_query($query); // NOTE(review): a failed query is not checked here - confirm that is acceptable
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $averageContractPeriod = $r[0]; // column 0: AVG of dateDiff(contractEnd,contractStart)
+ $stddevContractPeriod = $r[1]; // column 1: STDDEV of the same difference
+}
+
+
//Reported late, 45 days? A late contract is a dodgy contract except maybe for variations?
$heuristics["DATE_REPORTED_LATE"] = Array(
"description" => "Reported late, 45 days?"
@@ -70,10 +75,7 @@
{
global $averageDaysLate;
if (!$averageDaysLate) {
- $query = "select AVG(dateDiff(publishDate,contractStart)) from contractnotice";
- $result = mysql_query($query);
- $r = mysql_fetch_array($result, MYSQL_BOTH);
- $averageDaysLate = $r[0];
+ getDaysLate();
}
return $averageDaysLate;
}
@@ -82,11 +84,19 @@
{
global $stddevDaysLate;
if (!$stddevDaysLate) {
- $query = "select STDDEV(dateDiff(publishDate,contractStart)) from contractnotice";
- $result = mysql_query($query);
- $r = mysql_fetch_array($result, MYSQL_BOTH);
- $stddevDaysLate = $r[0];
+ getDaysLate();
}
return $stddevDaysLate;
}
+function getDaysLate() { // loads average and stddev of dateDiff(publishDate,contractStart) with a single query
+
+ global $averageDaysLate,$stddevDaysLate; // results are stored in these globals; nothing is returned
+
+ $query = "select AVG(dateDiff(publishDate,contractStart)), STDDEV(dateDiff(publishDate,contractStart)) from contractnotice";
+ $result = mysql_query($query); // NOTE(review): a failed query is not checked here - confirm that is acceptable
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $averageDaysLate = $r[0]; // column 0: AVG
+ $stddevDaysLate = $r[1]; // column 1: STDDEV
+
+}
?>
--- a/heuristics/heuristics.inc.php
+++ b/heuristics/heuristics.inc.php
@@ -1,20 +1,25 @@
<?php
- include_once("../lib/common.inc.php");
+include_once ("../lib/common.inc.php");
$heuristics = Array();
//each heuristic adds self to description array
include ("dateHeuristics.php");
-//include("historyHeuristics.php");
-//include("metadataHeuristics.php");
-//include("valueHeuristics.php");
-// method signature heuristic($contractNoticeAsArray);
+//include ("historyHeuristics.php");
+//include ("metadataHeuristics.php");
+//include ("valueHeuristics.php");
function runHeuristic($heuristicName, $cn)
{
- $hresults = call_user_func($heuristicName, $cn);
- if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"]) || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
- print_r($hresults);
- die("Missing field in heurtistic $heuristicName result");
- }
- $query = "insert into heuristic_results values('$heuristicName',
+ // check if already ran: look for a heuristic_results row for this heuristic name and this notice's CNID
+ $query = "select count(*) from heuristic_results where heuristic_name = '$heuristicName' and CNID = '{$cn['CNID']}'";
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result);
+ if ($r[0] == 0) {
+ // if not, run now
+ $hresults = call_user_func($heuristicName, $cn); // the heuristic is a global function named after its key
+ if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"]) || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
+ print_r($hresults);
+ die("Missing field in heurtistic $heuristicName result");
+ }
+ $query = "insert into heuristic_results values('$heuristicName',
'{$hresults["heuristic_value"]}',
'{$hresults["raw_value"]}',
'{$hresults["mean"]}',
@@ -25,9 +30,10 @@
'{$cn["agencyABN"]}',
'{$cn["supplierID"]}'
)";
- // save value and cn data via sql
- $result = mysql_query($query);
- if ($result) echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
- elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.<br>" . mysql_error() . " <br> $query <br><br>\n";
+ // save value and cn data via sql
+ $result = mysql_query($query);
+ if ($result) echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
+ elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.<br>" . mysql_error() . " <br> $query <br><br>\n";
+ }
}
?>
--- a/heuristics/historyHeuristics.php
+++ b/heuristics/historyHeuristics.php
@@ -1,25 +1,126 @@
-<?php
- // "unusual for agency/supplier due to previous low number of transactions "
-$heuristics["HISTORY_LOW_TRANSACTIONS"] = Array(
- "description" => "unusual for agency/supplier due to previous low number of transactions "
+<?php
+$heuristics["HISTORY_LOW_TRANSACTIONS_AGENCY"] = Array(
+ "description" => "unusual for agency due to previous low number of transactions "
);
-function HISTORY_LOW_TRANSACTIONS($cn)
+function HISTORY_LOW_TRANSACTIONS_AGENCY($cn)
{
- $averageContractPeriod = getAverageContractPeriod();
- $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
- $days = intval($diff / 24);
- return ($days > 45 ? 1 : 0);
+ $thisAgencyTransactions = getAgencyTransactions($cn['agencyName']);
+ $averageAgencyTransactions = getAverageAgencyTransactions();
+ $stddevAgencyTransactions = getstddevAgencyTransactions();
+ // Score this agency's notice count (not the contract duration in days) against the
+ // per-agency mean/stddev; a missing or zero stddev scores 0 instead of dividing by zero.
+ $value = ($stddevAgencyTransactions > 0) ? abs($thisAgencyTransactions - $averageAgencyTransactions) / $stddevAgencyTransactions : 0;
+ return Array(
+ "heuristic_value" => $value,
+ "raw_value" => $thisAgencyTransactions,
+ "mean" => $averageAgencyTransactions,
+ "stddev" => $stddevAgencyTransactions
+ );
}
- /* - unusual value for time of year
- - compare to all other records in last 2 weeks
- - ie. many large contracts in june so takes more to standout*/
-
- $heuristics["HISTORY_HIGH_VALUE_FOR_MONTH"] = Array(
- "description" => "unusual value for time of year");
-function HISTORY_HIGH_VALUE_FOR_MONTH($cn)
+$agencyTransactions = Array();
+function getAgencyTransactions($agencyName)
{
- $averageContractPeriod = getAverageContractPeriod();
- $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
- $days = intval($diff / 24);
- return ($days > 45 ? 1 : 0);
+ global $agencyTransactions; // per-agency cache of notice counts, keyed by agency name
+ if (!isset($agencyTransactions[$agencyName])) { // isset() avoids an undefined-index notice on the first lookup
+ $query = 'select count(*) from contractnotice where agencyName = "' . mysql_real_escape_string($agencyName) . '"'; // escaped so names containing quotes cannot break the query
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $agencyTransactions[$agencyName] = $r[0];
+ }
+ return $agencyTransactions[$agencyName];
}
+$averageAgencyTransactions;
+function getAverageAgencyTransactions() // returns the mean number of notices per agency, loading it on first use
+{
+ global $averageAgencyTransactions;
+ if (!$averageAgencyTransactions) { // empty or falsy cache triggers a (re)load
+ getStatsAgencyTransactions(); // fills both the average and the stddev globals
+ }
+ return $averageAgencyTransactions;
+}
+$stddevAgencyTransactions;
+function getstddevAgencyTransactions() // returns the stddev of notices per agency, loading it on first use
+{
+ global $stddevAgencyTransactions;
+ if (!$stddevAgencyTransactions) { // empty or falsy cache triggers a (re)load
+ getStatsAgencyTransactions(); // fills both the average and the stddev globals
+ }
+ return $stddevAgencyTransactions;
+}
+function getStatsAgencyTransactions() // loads mean and stddev of the per-agency notice counts with a single query
+{
+ global $averageAgencyTransactions, $stddevAgencyTransactions; // results are stored in these globals; nothing is returned
+ $query = "select avg(count), STDDEV(count) from (select count(*) as count
+ from contractnotice group by agencyName) as a;";
+ $result = mysql_query($query); // NOTE(review): a failed query is not checked here - confirm that is acceptable
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $averageAgencyTransactions = $r[0]; // column 0: avg(count)
+ $stddevAgencyTransactions = $r[1]; // column 1: STDDEV(count)
+}
+$heuristics["HISTORY_LOW_TRANSACTIONS_SUPPLIER"] = Array(
+ "description" => "unusual for supplier due to previous low number of transactions "
+);
+function HISTORY_LOW_TRANSACTIONS_SUPPLIER($cn)
+{
+ $thisSupplierTransactions = getSupplierTransactions($cn['supplierName'], $cn['supplierABN']);
+ $averageSupplierTransactions = getAverageSupplierTransactions();
+ $stddevSupplierTransactions = getstddevSupplierTransactions();
+ // Score this supplier's notice count (not the contract duration in days) against the
+ // per-supplier mean/stddev; a missing or zero stddev scores 0 instead of dividing by zero.
+ $value = ($stddevSupplierTransactions > 0) ? abs($thisSupplierTransactions - $averageSupplierTransactions) / $stddevSupplierTransactions : 0;
+ return Array(
+ "heuristic_value" => $value,
+ "raw_value" => $thisSupplierTransactions,
+ "mean" => $averageSupplierTransactions,
+ "stddev" => $stddevSupplierTransactions
+ );
+}
+$supplierTransactions = Array();
+function getSupplierTransactions($supplierName, $supplierABN)
+{
+ global $supplierTransactions; // cache of notice counts, keyed by ABN when one is given, otherwise by supplier name
+ if ($supplierABN != 0 && $supplierABN != "") {
+ if (!isset($supplierTransactions[$supplierABN])) { // isset() avoids an undefined-index notice on the first lookup
+ $query = 'select count(*) from contractnotice where supplierABN = "' . mysql_real_escape_string($supplierABN) . '"';
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $supplierTransactions[$supplierABN] = $r[0];
+ }
+ return $supplierTransactions[$supplierABN];
+ }
+ if (!isset($supplierTransactions[$supplierName])) { // no usable ABN: fall back to matching on the supplier name
+ $query = 'select count(*) from contractnotice where supplierName = "' . mysql_real_escape_string($supplierName) . '"'; // escaped so names containing quotes cannot break the query
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $supplierTransactions[$supplierName] = $r[0];
+ }
+ return $supplierTransactions[$supplierName];
+}
+$averageSupplierTransactions;
+function getAverageSupplierTransactions() // returns the mean number of notices per supplier, loading it on first use
+{
+ global $averageSupplierTransactions;
+ if (!$averageSupplierTransactions) { // empty or falsy cache triggers a (re)load
+ getStatsSupplierTransactions(); // fills both the average and the stddev globals
+ }
+ return $averageSupplierTransactions;
+}
+$stddevSupplierTransactions;
+function getstddevSupplierTransactions() // returns the stddev of notices per supplier, loading it on first use
+{
+ global $stddevSupplierTransactions;
+ if (!$stddevSupplierTransactions) { // empty or falsy cache triggers a (re)load
+ getStatsSupplierTransactions(); // fills both the average and the stddev globals
+ }
+ return $stddevSupplierTransactions;
+}
+function getStatsSupplierTransactions() // loads mean and stddev of the per-supplier notice counts with a single query
+{
+ global $averageSupplierTransactions, $stddevSupplierTransactions; // results are stored in these globals; nothing is returned
+ $query = 'select avg(count), stddev(count) from (select IF(supplierABN != "",supplierABN,supplierName) as supplierID, count(*) as count from contractnotice group by supplierID) as a;';
+ $result = mysql_query($query); // suppliers are grouped by ABN when it is non-empty, otherwise by name (see the IF() in the query)
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $averageSupplierTransactions = $r[0]; // column 0: avg(count)
+ $stddevSupplierTransactions = $r[1]; // column 1: stddev(count)
+}
+
--- a/heuristics/metadataHeuristics.php
+++ b/heuristics/metadataHeuristics.php
@@ -1,12 +1,59 @@
<?php
- /*- duplicated description
- - most duplicated overall, most duplicated per agency/category/supplier etc. */
- $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
- "description" => "unusual value for time of year");
+/* all
+ SELECT description, count(*) as count
+FROM `contractnotice`
+group by description having count > 1 order by count
+*/
+/*- duplicated description
+ - most duplicated overall, most duplicated per agency/category/supplier etc. */
+$heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+ "description" => ""
+);
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
- $averageContractPeriod = getAverageContractPeriod();
- $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
- $days = intval($diff / 24);
- return ($days > 45 ? 1 : 0);
+ $averageDuplicatedDescriptions = getAverageDuplicatedDescriptions();
+ $stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions();
+ $query = 'select count(*) from contractnotice where description = "' . mysql_real_escape_string($cn['description']) . '"'; // match this notice's own description (assumes $cn carries the contractnotice description column - confirm); escaped because free text may contain quotes
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $dupeDesc = $r[0]; // number of notices sharing this description
+ if ($dupeDesc == 1) $value = 0; // a description used once is not a duplicate, so it scores 0
+ else $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions;
+ return Array(
+ "heuristic_value" => $value,
+ "raw_value" => $dupeDesc,
+ "mean" => $averageDuplicatedDescriptions,
+ "stddev" => $stddevDuplicatedDescriptions
+ );
}
+$averageDuplicatedDescriptions;
+function getAverageDuplicatedDescriptions() // returns the cached average duplicate count, requesting a load when the cache is empty
+{
+ global $averageDuplicatedDescriptions;
+ if (!$averageDuplicatedDescriptions) { // empty or falsy cache triggers a (re)load
+ getStatsDuplicatedDescriptions(); // runs the avg/stddev query over descriptions used more than once
+ }
+ return $averageDuplicatedDescriptions;
+}
+$stddevDuplicatedDescriptions;
+function getstddevDuplicatedDescriptions() // returns the cached stddev of duplicate counts, requesting a load when the cache is empty
+{
+ global $stddevDuplicatedDescriptions;
+ if (!$stddevDuplicatedDescriptions) { // empty or falsy cache triggers a (re)load
+ getStatsDuplicatedDescriptions(); // runs the avg/stddev query over descriptions used more than once
+ }
+ return $stddevDuplicatedDescriptions;
+}
+function getStatsDuplicatedDescriptions() // loads mean and stddev of the duplicate counts (descriptions used more than once)
+{
+ global $averageDuplicatedDescriptions, $stddevDuplicatedDescriptions; // required: without it the assignments below only set locals and the getters' caches stay empty
+ $query = "select avg(count),STDDEV(count) from (
+ SELECT description, count(*) as count
+FROM `contractnotice`
+group by description having count > 1
+ ) as a;";
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $averageDuplicatedDescriptions = $r[0]; // column 0: avg(count)
+ $stddevDuplicatedDescriptions = $r[1]; // column 1: STDDEV(count)
+}
--- a/heuristics/runHeuristics.php
+++ b/heuristics/runHeuristics.php
@@ -7,7 +7,7 @@
FROM contractnotice JOIN agency ON contractnotice.agencyName=agency.agencyName
WHERE DATE(importDate) = (select * from (SELECT DATE(importDate)
FROM contractnotice ORDER BY importDate DESC limit 1) alias)";
-$result = mysql_query($lastimportquery);
+$result = mysql_query($query);
if (!$result) echo mysql_error().$query;
while ($cn = mysql_fetch_array($result, MYSQL_BOTH)) {
//get each new CN from latest update
--- a/heuristics/valueHeuristics.php
+++ b/heuristics/valueHeuristics.php
@@ -1,8 +1,8 @@
-
- - large contract value
- - chi-square test for outliers / standard dev from mean/median
- - percent of total contracts for supplier/agency
- $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+<?php
+ /* - large contract value
+ - standard dev from mean/median
+ - percent of total contracts for supplier/agency*/
+ $heuristics["VALUE_LARGE_CONTRACT_OVERALL"] = Array(
"description" => "unusual value for time of year");
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
@@ -12,12 +12,15 @@
return ($days > 45 ? 1 : 0);
}
- - peculiar value
+ /* - peculiar value
- Just under 80k, amplified if other contracts with same supplier are just under
- - unusual variation amount
- - absolute value; large reductions as well as large increases
-
- $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
+ */
+ $heuristics["VALUE_NEAR_THRESHOLD"] = Array(
+ "description" => "unusual value for time of year");
+ /*
+ - unusual variation amount - absolute value; large reductions as well as large increases
+ */
+ $heuristics["VALUE_LARGE_VARIATION"] = Array(
"description" => "unusual value for time of year");
function METADATA_DUPLICATED_DESCRIPTION($cn)
{
@@ -26,3 +29,30 @@
$days = intval($diff / 24);
return ($days > 45 ? 1 : 0);
}
+
+/* - unusual value for time of year
+ - compare to all other records in last 2 weeks
+ - ie. many large contracts in june so takes more to standout*/
+$heuristics["VALUE_HIGH_FOR_MONTH"] = Array(
+ "description" => "unusual value for time of year"
+);
+function VALUE_HIGH_FOR_MONTH($cn, $monthAsInt) // NOTE(review): looks like a placeholder - $monthAsInt is never used in the body
+{
+ $averageContractPeriod = getAverageContractPeriod(); // fetched but not used below
+ $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); // difference between the two timestamps, in seconds
+ $days = intval($diff / 24); // NOTE(review): divides seconds by 24, whereas the history heuristics divide by 60 * 60 * 24 - confirm the intended unit
+ return ($days > 45 ? 1 : 0); // returns a bare 0/1, not the heuristic_value/raw_value/mean/stddev array that runHeuristic() checks for
+}
+$monthlyValueAverage = Array();
+function getAgencyTransactions($agencyName) // NOTE(review): historyHeuristics.php in this same patch declares a function with this name; PHP cannot redeclare it if both files are included
+{
+ global $agencyTransactions; // per-agency cache of notice counts, keyed by agency name
+ if (!$agencyTransactions[$agencyName]) { // not cached yet (or cached as a falsy value): query the count
+ $query = 'select count(*) from contractnotice where agencyName = "' . $agencyName . '"';
+ $result = mysql_query($query);
+ $r = mysql_fetch_array($result, MYSQL_BOTH);
+ $agencyTransactions[$agencyName] = $r[0]; // column 0: count(*) for this agency
+ }
+ return $agencyTransactions[$agencyName];
+}
+?>
--- a/heuristics/viewHeuristicsColormap.php
+++ b/heuristics/viewHeuristicsColormap.php
@@ -1,5 +1,11 @@
<?php
include_once("../lib/common.inc.php");
+ echo '<style>
+ div {
+ padding: 5px;
+ display: inline-block;
+ }
+ </style>';
// http://www.herethere.net/~samson/php/color_gradient/color_gradient_generator.php.txt
// return the interpolated value between pBegin and pEnd
function interpolate($pBegin, $pEnd, $pStep, $pMax)
@@ -42,13 +48,13 @@
$maxVal = $r[0];
$query = "SELECT sum(heuristic_value) as sum, CNID
-FROM `heuristic_results` group by CNID order by sum DESC LIMIT 30";
+FROM `heuristic_results` group by CNID order by sum DESC LIMIT 300";
$result = mysql_query($query);
if (!$result) echo mysql_error().$query;
while ($r = mysql_fetch_array($result, MYSQL_BOTH)) {
- echo '<span style="background: #'.$Gradients[floor(($r['sum']/$maxVal) * 10)].'; padding: 5px;">';
+ echo '<div style="background: #'.$Gradients[floor(($r['sum']/$maxVal) * 10)].';">';
echo '<a title="'.$r['sum'].'" href="../displayContract.php?CNID='.$r['CNID'].'">X</a>';
- echo "</span>";
+ echo "</div>";
}
?>
--- a/heuristics/viewHeuristicsDistribution.php
+++ b/heuristics/viewHeuristicsDistribution.php
@@ -14,22 +14,32 @@
include("../lib/pChart2.1.0/class/pData.class.php");
include("../lib/pChart2.1.0/class/pDraw.class.php");
include("../lib/pChart2.1.0/class/pImage.class.php");
-$labels = Array();
-$values = Array();
+
+$series = Array();
include_once("../lib/common.inc.php");
-$query = "select floor(sum) as val,count(*) from (SELECT sum(heuristic_value)
-as sum FROM heuristic_results group by CNID) as a group by val";
+$query = "select heuristic_name, floor(heuristic_value) as val,count(*) from heuristic_results group by heuristic_name, val";
$result = mysql_query($query);
if (!$result) echo mysql_error().$query;
while ($r = mysql_fetch_array($result, MYSQL_BOTH)) {
- $labels[] = $r[0];
- $values[] = $r[1];
+ $series[$r["heuristic_name"]][$r["val"]] = $r[2];
}
/* Create and populate the pData object */
- $MyData = new pData();
- $MyData->addPoints($values,"Records");
+ $MyData = new pData();
+ // Bins are fixed at floor(heuristic_value) 0..5 so that every series gets the
+ // same number of points; values of 6 and above are not charted. The earlier
+ // union of array_keys() over each series was overwritten by this list before
+ // it was ever read, so that dead loop has been dropped.
+ $labels = Array(0,1,2,3,4,5);
+foreach ($series as $seriesName => $seriesEntry) {
+ $data = Array(); // start each series from an empty array instead of relying on an undefined variable
+ foreach ($labels as $label) {
+ $data[$label] = (isset($seriesEntry[$label]) ? $seriesEntry[$label] : 0); // a bin with no rows counts as 0, without an undefined-index notice
+ }
+
+$MyData->addPoints($data,$seriesName);
+ }
$MyData->setAxisName(0,"# of records");
$MyData->addPoints($labels,"Labels");
$MyData->setSerieDescription("Labels","Bins");
@@ -65,7 +75,7 @@
$myPicture->drawSplineChart();
/* Write the chart legend */
- $myPicture->drawLegend(540,20,array("Style"=>LEGEND_NOBORDER,"Mode"=>LEGEND_HORIZONTAL));
+ $myPicture->drawLegend(540,20,array("Style"=>LEGEND_NOBORDER,"Mode"=>LEGEND_VERTICAL));
/* Render the picture (choose the best way) */
$myPicture->autoOutput("pictures/example.drawSplineChart.simple.png");