fix heuristics
fix heuristics

<?php <?php
   
include_once ("../lib/common.inc.php"); include_once ("../lib/common.inc.php");
$heuristics = Array(); $heuristics = Array();
//each heuristic adds self to description array //each heuristic adds self to description array
include ("dateHeuristics.php"); include ("dateHeuristics.php");
include ("historyHeuristics.php"); include ("historyHeuristics.php");
   
//include ("metadataHeuristics.php"); //include ("metadataHeuristics.php");
//include ("valueHeuristics.php"); //include ("valueHeuristics.php");
/**
 * Runs one heuristic against a contract notice and stores its result.
 *
 * Nothing happens when heuristic_results already holds a row for this
 * heuristic name and CNID. Otherwise the function named $heuristicName is
 * called with $cn and its result is inserted into heuristic_results.
 * The script dies if the heuristic result lacks any of the keys
 * heuristic_value, raw_value, mean or stddev.
 *
 * @param string $heuristicName Name of a callable heuristic function.
 * @param array  $cn            Contract notice row; CNID, publishDate and
 *                              supplierID are read, agencyABN when present.
 * @return void
 */
function runHeuristic($heuristicName, $cn)
{
    global $conn;

    // Check if this heuristic already ran for this contract notice.
    // Values are bound rather than interpolated so quotes inside them
    // cannot alter the statement.
    $check = $conn->prepare(
        'select count(*) from heuristic_results where heuristic_name = ? and "CNID" = ?'
    );
    if ($check === false) {
        databaseError($conn->errorInfo());
        return;
    }
    $check->execute(array($heuristicName, $cn['CNID']));
    databaseError($check->errorInfo());
    $r = $check->fetch(PDO::FETCH_BOTH);
    if ($r[0] != 0) {
        return;
    }

    // Not stored yet: run the heuristic now.
    $hresults = call_user_func($heuristicName, $cn);
    if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"])
        || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
        print_r($hresults);
        die("Missing field in heurtistic $heuristicName result");
    }

    // Save value and contract notice data. The agency ABN slot is filled
    // with 0 when the row carries no agencyABN column.
    $query = 'insert into heuristic_results values(?, ?, ?, ?, ?, ?, NOW(), ?, ?, ?)';
    $params = array(
        $heuristicName,
        $hresults["heuristic_value"],
        $hresults["raw_value"],
        $hresults["mean"],
        $hresults["stddev"],
        $cn["CNID"],
        $cn["publishDate"],
        isset($cn["agencyABN"]) ? $cn["agencyABN"] : 0,
        $cn["supplierID"],
    );
    $insert = $conn->prepare($query);
    if ($insert === false) {
        $errors = $conn->errorInfo();
    } else {
        $insert->execute($params);
        // Statement-level errors are reported on the statement, not the connection.
        $errors = $insert->errorInfo();
    }

    if ($errors[2] == "") {
        echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
        return;
    }

    // Duplicate rows are ignored silently: SQLSTATE 23505 is PostgreSQL's
    // unique_violation, "Duplicate entry" is the MySQL message text.
    $isDuplicate = ($errors[0] === '23505')
        || strpos((string) $errors[2], "Duplicate entry") !== false;
    if (!$isDuplicate) {
        echo $heuristicName . " failed insert.<br>" . print_r($errors, true) . " <br> $query <br><br>\n";
    }
}
   
?> ?>
   
<?php <?php
$heuristics["HISTORY_LOW_TRANSACTIONS_AGENCY"] = Array(
    "description" => "unusual for agency due to previous low number of transactions "
);

/**
 * Scores how far this agency's transaction count lies from the average
 * agency's count, measured in standard deviations.
 *
 * The previous version fetched the agency's transaction count but then
 * scored the contract's duration in days against the transaction-count
 * statistics; the count itself is now the raw value.
 *
 * @param array $cn Contract notice row; only agencyName is read.
 * @return array Keys heuristic_value, raw_value, mean, stddev.
 */
function HISTORY_LOW_TRANSACTIONS_AGENCY($cn)
{
    $thisAgencyTransactions = (int) getAgencyTransactions($cn['agencyName']);
    // Cast so a NULL aggregate still yields set keys in the result array.
    $averageAgencyTransactions = (float) getAverageAgencyTransactions();
    $stddevAgencyTransactions = (float) getstddevAgencyTransactions();

    // A zero or missing standard deviation would divide by zero; score 0 instead.
    $value = 0.0;
    if ($stddevAgencyTransactions > 0) {
        $value = abs($thisAgencyTransactions - $averageAgencyTransactions) / $stddevAgencyTransactions;
    }

    return Array(
        "heuristic_value" => $value,
        "raw_value" => $thisAgencyTransactions,
        "mean" => $averageAgencyTransactions,
        "stddev" => $stddevAgencyTransactions
    );
}
// Per-request cache: agency name => number of contract notices.
$agencyTransactions = Array();

/**
 * Returns the number of contractnotice rows for an agency, caching the
 * count in the global $agencyTransactions array.
 *
 * @param string $agencyName Agency name to count rows for.
 * @return mixed The count as returned by the database driver.
 */
function getAgencyTransactions($agencyName)
{
    global $agencyTransactions, $conn;

    // isset() avoids an undefined-index notice on the first lookup.
    if (!isset($agencyTransactions[$agencyName])) {
        // The name is bound, not concatenated, so quotes in agency names are safe.
        $statement = $conn->prepare('select count(*) from contractnotice where "agencyName" = ?');
        $statement->execute(array($agencyName));
        $r = $statement->fetch(PDO::FETCH_BOTH);
        $agencyTransactions[$agencyName] = $r[0];
    }

    return $agencyTransactions[$agencyName];
}
// Cached average number of transactions per agency; null until computed.
$averageAgencyTransactions = null;

/**
 * Returns the cached average agency transaction count, calling
 * getStatsAgencyTransactions() first when nothing is cached yet.
 *
 * @return mixed Value stored in the global $averageAgencyTransactions.
 */
function getAverageAgencyTransactions()
{
    global $averageAgencyTransactions;

    // Strict null test so a cached value of 0 is not recomputed.
    if ($averageAgencyTransactions === null) {
        getStatsAgencyTransactions();
    }

    return $averageAgencyTransactions;
}
// Cached standard deviation of transactions per agency; null until computed.
$stddevAgencyTransactions = null;

/**
 * Returns the cached standard deviation of agency transaction counts,
 * calling getStatsAgencyTransactions() first when nothing is cached yet.
 *
 * @return mixed Value stored in the global $stddevAgencyTransactions.
 */
function getstddevAgencyTransactions()
{
    global $stddevAgencyTransactions;

    // Strict null test so a cached value of 0 is not recomputed.
    if ($stddevAgencyTransactions === null) {
        getStatsAgencyTransactions();
    }

    return $stddevAgencyTransactions;
}
/**
 * Computes the average and standard deviation of per-agency row counts in
 * contractnotice and stores them in the globals
 * $averageAgencyTransactions and $stddevAgencyTransactions.
 *
 * The globals are left untouched when the query fails or returns no row.
 *
 * @return void
 */
function getStatsAgencyTransactions()
{
    global $averageAgencyTransactions, $stddevAgencyTransactions, $conn;

    $query = 'select avg(count), STDDEV(count) from (select count(*) as count
        from contractnotice group by "agencyName") as a;';
    $result = $conn->query($query);
    // query() can return false on failure; do not dereference it.
    if ($result === false) {
        return;
    }

    $r = $result->fetch(PDO::FETCH_BOTH);
    if ($r === false) {
        return;
    }

    $averageAgencyTransactions = $r[0];
    $stddevAgencyTransactions = $r[1];
}
$heuristics["HISTORY_LOW_TRANSACTIONS_SUPPLIER"] = Array(
    "description" => "unusual for supplier due to previous low number of transactions "
);

/**
 * Scores how far this supplier's transaction count lies from the average
 * supplier's count, measured in standard deviations.
 *
 * The previous version fetched the supplier's transaction count but then
 * scored the contract's duration in days against the transaction-count
 * statistics; the count itself is now the raw value.
 *
 * @param array $cn Contract notice row; supplierName and supplierABN are read.
 * @return array Keys heuristic_value, raw_value, mean, stddev.
 */
function HISTORY_LOW_TRANSACTIONS_SUPPLIER($cn)
{
    $thisSupplierTransactions = (int) getSupplierTransactions($cn['supplierName'], $cn['supplierABN']);
    // Cast so a NULL aggregate still yields set keys in the result array.
    $averageSupplierTransactions = (float) getAverageSupplierTransactions();
    $stddevSupplierTransactions = (float) getstddevSupplierTransactions();

    // A zero or missing standard deviation would divide by zero; score 0 instead.
    $value = 0.0;
    if ($stddevSupplierTransactions > 0) {
        $value = abs($thisSupplierTransactions - $averageSupplierTransactions) / $stddevSupplierTransactions;
    }

    return Array(
        "heuristic_value" => $value,
        "raw_value" => $thisSupplierTransactions,
        "mean" => $averageSupplierTransactions,
        "stddev" => $stddevSupplierTransactions
    );
}
// Per-request cache: supplier ABN or supplier name => number of contract notices.
$supplierTransactions = Array();

/**
 * Returns the number of contractnotice rows for a supplier, caching the
 * count in the global $supplierTransactions array.
 *
 * The supplier is matched on supplierABN when it is non-zero and non-empty,
 * otherwise on supplierName.
 *
 * @param string $supplierName Supplier name, used when no ABN is given.
 * @param mixed  $supplierABN  Supplier ABN; 0 or "" means "not available".
 * @return mixed The count as returned by the database driver.
 */
function getSupplierTransactions($supplierName, $supplierABN)
{
    global $supplierTransactions, $conn;

    // Column names come from this fixed pair only; the value is always bound.
    if ($supplierABN != 0 && $supplierABN != "") {
        $column = 'supplierABN';
        $key = $supplierABN;
    } else {
        $column = 'supplierName';
        $key = $supplierName;
    }

    // isset() avoids an undefined-index notice on the first lookup.
    if (!isset($supplierTransactions[$key])) {
        $statement = $conn->prepare('select count(*) from contractnotice where "' . $column . '" = ?');
        $statement->execute(array($key));
        $r = $statement->fetch(PDO::FETCH_BOTH);
        $supplierTransactions[$key] = $r[0];
    }

    return $supplierTransactions[$key];
}
// Cached average number of transactions per supplier; null until computed.
$averageSupplierTransactions = null;

/**
 * Returns the cached average supplier transaction count, calling
 * getStatsSupplierTransactions() first when nothing is cached yet.
 *
 * @return mixed Value stored in the global $averageSupplierTransactions.
 */
function getAverageSupplierTransactions()
{
    global $averageSupplierTransactions;

    // Strict null test so a cached value of 0 is not recomputed.
    if ($averageSupplierTransactions === null) {
        getStatsSupplierTransactions();
    }

    return $averageSupplierTransactions;
}
// Cached standard deviation of transactions per supplier; null until computed.
$stddevSupplierTransactions = null;

/**
 * Returns the cached standard deviation of supplier transaction counts,
 * calling getStatsSupplierTransactions() first when nothing is cached yet.
 *
 * @return mixed Value stored in the global $stddevSupplierTransactions.
 */
function getstddevSupplierTransactions()
{
    global $stddevSupplierTransactions;

    // Strict null test so a cached value of 0 is not recomputed.
    if ($stddevSupplierTransactions === null) {
        getStatsSupplierTransactions();
    }

    return $stddevSupplierTransactions;
}
/**
 * Computes the average and standard deviation of per-supplier row counts in
 * contractnotice and stores them in the globals
 * $averageSupplierTransactions and $stddevSupplierTransactions.
 *
 * Suppliers are grouped by ABN when it is non-zero, otherwise by name.
 * The globals are left untouched when the query fails or returns no row.
 *
 * @return void
 */
function getStatsSupplierTransactions()
{
    global $averageSupplierTransactions, $stddevSupplierTransactions, $conn;

    $query = 'select avg(count), stddev(count) from (
        select (case when "supplierABN" != 0 THEN "supplierABN"::text ELSE "supplierName" END) as "supplierID",
        count(*) as count from contractnotice group by "supplierID") as a;';
    $result = $conn->query($query);
    // query() can return false on failure; do not dereference it.
    if ($result === false) {
        return;
    }

    $r = $result->fetch(PDO::FETCH_BOTH);
    if ($r === false) {
        return;
    }

    $averageSupplierTransactions = $r[0];
    $stddevSupplierTransactions = $r[1];
}
   
<?php <?php
// NOTE(review): the whole run sits inside the non-CLI branch, so nothing is
// executed from the command line — confirm that this is intended.
if (php_sapi_name() != "cli") {
    include_once ("../lib/common.inc.php");
    auth();
    include_once("heuristics.inc.php");

    // Up to 100 contract notices from the most recent import date, with a
    // supplierID that is the ABN when non-zero and the supplier name otherwise.
    // The agency table is not joined, so rows carry no agencyABN column.
    $sql = 'SELECT *, case when "supplierABN" != 0 then "supplierABN"::text else "supplierName" end as "supplierID"
        FROM contractnotice
        WHERE DATE("importDate") = (select * from (SELECT DATE("importDate")
        FROM contractnotice ORDER BY "importDate" DESC limit 1) alias) limit 100';
    $statement = $conn->prepare($sql);
    $statement->execute();
    // Errors raised by execute() are reported on the statement, not on the
    // connection handle.
    databaseError($statement->errorInfo());

    foreach ($statement->fetchAll() as $cn) {
        // get each new CN from latest update
        foreach ($heuristics as $heuristic => $description) {
            // run all heuristics
            runHeuristic($heuristic, $cn);
        }
        flush();
    }
    /*foreach agency

    aggregate agency metrics

    foreach supplier

    aggreate supplier metrics

    foreach CN

    aggregate CN metrics */
}
?> ?>
   
<?php <?php
/* - large contract value
- standard dev from mean/median
- percent of total contracts for supplier/agency */
$heuristics["VALUE_LARGE_CONTRACT_OVERALL"] = Array(
    "description" => "unusual value for time of year");

/**
 * Returns 1 when the contract starts more than 45 days after its publish
 * date, otherwise 0.
 *
 * NOTE(review): this body does not look at the contract value and returns a
 * bare integer rather than an array with heuristic_value, raw_value, mean
 * and stddev keys — presumably a placeholder; confirm before enabling.
 *
 * @param array $cn Contract notice row; contractStart and publishDate are read.
 * @return int 1 or 0.
 */
function VALUE_LARGE_CONTRACT_OVERALL($cn)
{
    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
    // strtotime() differences are in seconds; convert to whole days.
    $days = intval($diff / (60 * 60 * 24));
    return ($days > 45 ? 1 : 0);
}
   
/* - peculiar value
- Just under 80k, amplified if other contracts with same supplier are just under
*/
// NOTE(review): no VALUE_NEAR_THRESHOLD function is defined in this section,
// so this registration has nothing to call yet.
$heuristics["VALUE_NEAR_THRESHOLD"] = Array(
    "description" => "unusual value for time of year");
/*
- unusual variation amount - absolute value; large reductions as well as large increases
*/
$heuristics["VALUE_LARGE_VARIATION"] = Array(
    "description" => "unusual value for time of year");

/**
 * Returns 1 when the contract starts more than 45 days after its publish
 * date, otherwise 0.
 *
 * NOTE(review): this body does not look at variation amounts and returns a
 * bare integer rather than an array with heuristic_value, raw_value, mean
 * and stddev keys — presumably a placeholder; confirm before enabling.
 *
 * @param array $cn Contract notice row; contractStart and publishDate are read.
 * @return int 1 or 0.
 */
function VALUE_LARGE_VARIATION($cn)
{
    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
    // strtotime() differences are in seconds; convert to whole days.
    $days = intval($diff / (60 * 60 * 24));
    return ($days > 45 ? 1 : 0);
}
   
/* - unusual value for time of year
- compare to all other records in last 2 weeks
- ie. many large contracts in june so takes more to standout */
$heuristics["VALUE_HIGH_FOR_MONTH"] = Array(
    "description" => "unusual value for time of year"
);

/**
 * Returns 1 when the contract starts more than 45 days after its publish
 * date, otherwise 0.
 *
 * NOTE(review): the month argument is not used and the result is a bare
 * integer rather than an array with heuristic_value, raw_value, mean and
 * stddev keys — presumably a placeholder; confirm before enabling.
 *
 * @param array    $cn         Contract notice row; contractStart and publishDate are read.
 * @param int|null $monthAsInt Currently unused. Optional so the function can
 *                             be called with the contract notice alone.
 * @return int 1 or 0.
 */
function VALUE_HIGH_FOR_MONTH($cn, $monthAsInt = null)
{
    $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
    // strtotime() differences are in seconds; convert to whole days.
    $days = intval($diff / (60 * 60 * 24));
    return ($days > 45 ? 1 : 0);
}
   
$monthlyValueAverage = Array();

// The history heuristics declare a function with this same name; declaring
// it twice is a fatal error, so it is only defined here when still missing.
// NOTE(review): presumably a copied stub meant to become a monthly-value
// lookup using $monthlyValueAverage — confirm the intended helper.
if (!function_exists('getAgencyTransactions')) {
    /**
     * Returns the number of contractnotice rows for an agency, caching the
     * count in the global $agencyTransactions array.
     *
     * @param string $agencyName Agency name to count rows for.
     * @return mixed The count as returned by the database driver.
     */
    function getAgencyTransactions($agencyName)
    {
        global $agencyTransactions, $conn;

        // isset() avoids an undefined-index notice on the first lookup.
        if (!isset($agencyTransactions[$agencyName])) {
            // The name is bound, not concatenated, so quotes in it are safe.
            $statement = $conn->prepare('select count(*) from contractnotice where "agencyName" = ?');
            $statement->execute(array($agencyName));
            $r = $statement->fetch(PDO::FETCH_BOTH);
            $agencyTransactions[$agencyName] = $r[0];
        }

        return $agencyTransactions[$agencyName];
    }
}
   
?> ?>
<?php <?php
   
/*// most interesting /*// most interesting
SELECT sum(heuristic_value) as sum, CNID SELECT sum(heuristic_value) as sum, CNID
FROM heuristic_results group by CNID order by sum DESC limit 30 FROM heuristic_results group by CNID order by sum DESC limit 30
   
// spread of values // spread of values
select floor(sum) as val,count(*) from (SELECT sum(heuristic_value) select floor(sum) as val,count(*) from (SELECT sum(heuristic_value)
as sum FROM heuristic_results group by CNID) as a group by val*/ as sum FROM heuristic_results group by "CNID") as a group by val*/
   
// heuristic name => (floored heuristic value => number of results)
$series = Array();

include_once("../lib/common.inc.php");
$query = "select heuristic_name, floor(heuristic_value) as val,count(*) from heuristic_results group by heuristic_name, val";
$result = $conn->query($query);
foreach ($result->fetchAll() as $r) {
    // $r[2] is the count(*) column of the default both-indexed fetch.
    $series[$r["heuristic_name"]][$r["val"]] = $r[2];
}

// Fixed buckets shown for every series.
$labels = Array(0, 1, 2, 3, 4, 5);

// One row of bucket counts per heuristic; buckets with no results are 0.
// Keyed by series name so earlier series are not overwritten by later ones.
$data = Array();
foreach ($series as $seriesName => $seriesEntry) {
    foreach ($labels as $label) {
        $data[$seriesName][$label] = isset($seriesEntry[$label]) ? $seriesEntry[$label] : 0;
    }
}
print_r($labels);
print_r($data);
?> ?>
<?php <?php
date_default_timezone_set("Australia/ACT"); date_default_timezone_set("Australia/ACT");
error_reporting(E_AL