Add initial date based heuristics
Add initial date based heuristics

- long contract period (number of weeks/days?) <?php
- Reported late //long contract period (number of weeks/days?)
- 45 days? A late contract is a dodgy contract except maybe for variations? $heuristics["DATE_LONG_CONTRACT_PERIOD"] = Array(
  "description" => "long contract period (number of weeks/days?)"
  );
  function DATE_LONG_CONTRACT_PERIOD($cn)
  {
  $averageContractPeriod = getAverageContractPeriod();
  $stddevContractPeriod = getstddevContractPeriod();
  $diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']);
  $days = intval($diff / (60 * 60 * 24));
  $value = abs($days - $averageContractPeriod) / $stddevContractPeriod;
  return Array(
  "heuristic_value" => $value,
  "raw_value" => $days,
  "mean" => $averageContractPeriod,
  "stddev" => $stddevContractPeriod
  );
  }
  $averageContractPeriod;
  function getAverageContractPeriod()
  {
  global $averageContractPeriod;
  if (!$averageContractPeriod) {
  $query = "select AVG(dateDiff(contractEnd,contractStart)) from contractnotice";
  $result = mysql_query($query);
  $r = mysql_fetch_array($result, MYSQL_BOTH);
  $averageContractPeriod = $r[0];
  }
  return $averageContractPeriod;
  }
  $stddevContractPeriod;
  function getstddevContractPeriod()
  {
  global $stddevContractPeriod;
  if (!$stddevContractPeriod) {
  $query = "select STDDEV(dateDiff(contractEnd,contractStart)) from contractnotice";
  $result = mysql_query($query);
  $r = mysql_fetch_array($result, MYSQL_BOTH);
  $stddevContractPeriod = $r[0];
  }
  return $stddevContractPeriod;
  }
  //Reported late, 45 days? A late contract is a dodgy contract except maybe for variations?
  $heuristics["DATE_REPORTED_LATE"] = Array(
  "description" => "Reported late, 45 days?"
  );
  function DATE_REPORTED_LATE($cn)
  {
  $averageDaysLate = getAverageDaysLate();
  $stddevDaysLate = getStddevDaysLate();
  $diff = strtotime($cn['publishDate']) - strtotime($cn['contractStart']);
  $days = intval($diff / (60 * 60 * 24));
  if ($days <= 0) {
  $value = 0;
  }
  else {
  // +1 demerit for exceeding 45 day requirement
  $value = (abs($days - $averageDaysLate) / $stddevDaysLate) + ($days < 45 ? 0 : 1);
  }
  return Array(
  "heuristic_value" => $value,
  "raw_value" => $days,
  "mean" => $averageDaysLate,
  "stddev" => $stddevDaysLate
  );
  }
  $averageDaysLate;
  function getAverageDaysLate()
  {
  global $averageDaysLate;
  if (!$averageDaysLate) {
  $query = "select AVG(dateDiff(publishDate,contractStart)) from contractnotice";
  $result = mysql_query($query);
  $r = mysql_fetch_array($result, MYSQL_BOTH);
  $averageDaysLate = $r[0];
  }
  return $averageDaysLate;
  }
  $stddevDaysLate;
  function getStddevDaysLate()
  {
  global $stddevDaysLate;
  if (!$stddevDaysLate) {
  $query = "select STDDEV(dateDiff(publishDate,contractStart)) from contractnotice";
  $result = mysql_query($query);
  $r = mysql_fetch_array($result, MYSQL_BOTH);
  $stddevDaysLate = $r[0];
  }
  return $stddevDaysLate;
  }
  ?>
  <?php
  // Shared project code; presumably this opens the database connection that the
  // mysql_* calls below rely on -- TODO confirm against common.inc.php.
  include_once("../lib/common.inc.php");
  // Registry of heuristics, keyed by the name of the function that implements
  // each one. It must exist before the heuristic files are included because
  // they append to it at include time.
  $heuristics = Array();
  //each heuristic adds self to description array
  include ("dateHeuristics.php");
  // The remaining heuristic files are not enabled yet.
  //include("historyHeuristics.php");
  //include("metadataHeuristics.php");
  //include("valueHeuristics.php");
  // method signature heuristic($contractNoticeAsArray);
  // runHeuristic() below additionally requires each heuristic to return an array
  // with the keys heuristic_value, raw_value, mean and stddev.
  function runHeuristic($heuristicName, $cn)
  {
  $hresults = call_user_func($heuristicName, $cn);
  if (!isset($hresults["heuristic_value"]) || !isset($hresults["raw_value"]) || !isset($hresults["mean"]) || !isset($hresults["stddev"])) {
  print_r($hresults);
  die("Missing field in heurtistic $heuristicName result");
  }
  $query = "insert into heuristic_results values('$heuristicName',
  '{$hresults["heuristic_value"]}',
  '{$hresults["raw_value"]}',
  '{$hresults["mean"]}',
  '{$hresults["stddev"]}',
  '{$cn["CNID"]}',
  NOW(),
  '{$cn["publishDate"]}',
  '{$cn["agencyABN"]}',
  '{$cn["supplierID"]}'
  )";
  // save value and cn data via sql
  $result = mysql_query($query);
  if ($result) echo "Saved $heuristicName for {$cn["CNID"]} <br>\n";
  elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.<br>" . mysql_error() . " <br> $query <br><br>\n";
  }
  ?>
- unusual for agency/supplier <?php
- previous low number of transactions // "unusual for agency/supplier due to previous low number of transactions "
- zero ie. new agency/supplier is huge score $heuristics["HISTORY_LOW_TRANSACTIONS"] = Array(
- unusual value for time of year "description" => "unusual for agency/supplier due to previous low number of transactions "
  );
  function HISTORY_LOW_TRANSACTIONS($cn)
  {
  $averageContractPeriod = getAverageContractPeriod();
  $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
  $days = intval($diff / 24);
  return ($days > 45 ? 1 : 0);
  }
  /* - unusual value for time of year
- compare to all other records in last 2 weeks
- i.e. many large contracts in June, so it takes more to stand out */
   
  $heuristics["HISTORY_HIGH_VALUE_FOR_MONTH"] = Array(
  "description" => "unusual value for time of year");
  function HISTORY_HIGH_VALUE_FOR_MONTH($cn)
  {
  $averageContractPeriod = getAverageContractPeriod();
  $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
  $days = intval($diff / 24);
  return ($days > 45 ? 1 : 0);
  }
- duplicated description <?php
- most duplicated overall, most duplicated per agency/category/supplier etc. /*- duplicated description
  - most duplicated overall, most duplicated per agency/category/supplier etc. */
  $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
  "description" => "unusual value for time of year");
  function METADATA_DUPLICATED_DESCRIPTION($cn)
  {
  $averageContractPeriod = getAverageContractPeriod();
  $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
  $days = intval($diff / 24);
  return ($days > 45 ? 1 : 0);
  }
<? <?php
if agency include_once("heuristics.inc.php");
if supplier $query = "SELECT *, agency.abn as agencyABN, IF(supplierABN != '',supplierABN,supplierName) as supplierID
if CN FROM contractnotice JOIN agency ON contractnotice.agencyName=agency.agencyName
  WHERE DATE(importDate) = (select * from (SELECT DATE(importDate)
  FROM contractnotice ORDER BY importDate DESC limit 1) alias)";
  $result = mysql_query($query);
  if (!$result) echo mysql_error().$query;
  while ($cn = mysql_fetch_array($result, MYSQL_BOTH)) {
  //get each new CN from latest update
  foreach ($heuristics as $heuristic => $description) {
  // run all heuristics
  runHeuristic($heuristic, $cn);
  }
  flush();
  }
  /*foreach agency
   
  aggregate agency metrics
   
  foreach supplier
   
  aggregate supplier metrics
   
  foreach CN
   
  aggregate CN metrics */
?> ?>
   
   
- large contract value
- chi-square test for outliers / standard dev from mean/median
- percent of total contracts for supplier/agency
  $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
  "description" => "unusual value for time of year");
  function METADATA_DUPLICATED_DESCRIPTION($cn)
  {
  $averageContractPeriod = getAverageContractPeriod();
  $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
  $days = intval($diff / 24);
  return ($days > 45 ? 1 : 0);
  }
   
- peculiar value
- Just under 80k, amplified if other contracts with same supplier are just under
- unusual variation amount
- absolute value; large reductions as well as large increases
   
  $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array(
  "description" => "unusual value for time of year");
  function METADATA_DUPLICATED_DESCRIPTION($cn)
  {
  $averageContractPeriod = getAverageContractPeriod();
  $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']);
  $days = intval($diff / 24);
  return ($days > 45 ? 1 : 0);
  }