From: maxious Date: Tue, 12 Jul 2011 14:37:50 +0000 Subject: Add initial date based heuristics X-Git-Url: http://maxious.lambdacomplex.org/git/?p=contractdashboard.git&a=commitdiff&h=b1850c8bc2901b9e380df5dc569fe44691cda16b --- Add initial date based heuristics --- --- a/heuristics/dateHeuristics.php +++ b/heuristics/dateHeuristics.php @@ -1,4 +1,92 @@ - - long contract period (number of weeks/days?) - - Reported late - - 45 days? A late contract is a dodgy contract except maybe for variations? - + "long contract period (number of weeks/days?)" +); +function DATE_LONG_CONTRACT_PERIOD($cn) +{ + $averageContractPeriod = getAverageContractPeriod(); + $stddevContractPeriod = getstddevContractPeriod(); + $diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']); + $days = intval($diff / (60 * 60 * 24)); + $value = abs($days - $averageContractPeriod) / $stddevContractPeriod; + return Array( + "heuristic_value" => $value, + "raw_value" => $days, + "mean" => $averageContractPeriod, + "stddev" => $stddevContractPeriod + ); +} +$averageContractPeriod; +function getAverageContractPeriod() +{ + global $averageContractPeriod; + if (!$averageContractPeriod) { + $query = "select AVG(dateDiff(contractEnd,contractStart)) from contractnotice"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageContractPeriod = $r[0]; + } + return $averageContractPeriod; +} +$stddevContractPeriod; +function getstddevContractPeriod() +{ + global $stddevContractPeriod; + if (!$stddevContractPeriod) { + $query = "select STDDEV(dateDiff(contractEnd,contractStart)) from contractnotice"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $stddevContractPeriod = $r[0]; + } + return $stddevContractPeriod; +} +//Reported late, 45 days? A late contract is a dodgy contract except maybe for variations? +$heuristics["DATE_REPORTED_LATE"] = Array( + "description" => "Reported late, 45 days?" +); +function DATE_REPORTED_LATE($cn) +{ + $averageDaysLate = getAverageDaysLate(); + $stddevDaysLate = getStddevDaysLate(); + $diff = strtotime($cn['publishDate']) - strtotime($cn['contractStart']); + $days = intval($diff / (60 * 60 * 24)); + if ($days <= 0) { + $value = 0; + } + else { + // +1 demerit for exceeding 45 day requirement + $value = (abs($days - $averageDaysLate) / $stddevDaysLate) + ($days < 45 ? 0 : 1); + } + return Array( + "heuristic_value" => $value, + "raw_value" => $days, + "mean" => $averageDaysLate, + "stddev" => $stddevDaysLate + ); +} +$averageDaysLate; +function getAverageDaysLate() +{ + global $averageDaysLate; + if (!$averageDaysLate) { + $query = "select AVG(dateDiff(publishDate,contractStart)) from contractnotice"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageDaysLate = $r[0]; + } + return $averageDaysLate; +} +$stddevDaysLate; +function getStddevDaysLate() +{ + global $stddevDaysLate; + if (!$stddevDaysLate) { + $query = "select STDDEV(dateDiff(publishDate,contractStart)) from contractnotice"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $stddevDaysLate = $r[0]; + } + return $stddevDaysLate; +} +?> --- /dev/null +++ b/heuristics/heuristics.inc.php @@ -1,1 +1,33 @@ - +\n"; + elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.
" . mysql_error() . "
$query

\n"; +} +?> --- a/heuristics/historyHeuristics.php +++ b/heuristics/historyHeuristics.php @@ -1,6 +1,25 @@ - - unusual for agency/supplier - - previous low number of transactions - - zero ie. new agency/supplier is huge score - - unusual value for time of year + "unusual for agency/supplier due to previous low number of transactions " +); +function HISTORY_LOW_TRANSACTIONS($cn) +{ + $averageContractPeriod = getAverageContractPeriod(); + $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); + $days = intval($diff / 24); + return ($days > 45 ? 1 : 0); +} + /* - unusual value for time of year - compare to all other records in last 2 weeks - - ie. many large contracts in june so takes more to standout + - ie. many large contracts in june so takes more to standout*/ + + $heuristics["HISTORY_HIGH_VALUE_FOR_MONTH"] = Array( + "description" => "unusual value for time of year"); +function HISTORY_HIGH_VALUE_FOR_MONTH($cn) +{ + $averageContractPeriod = getAverageContractPeriod(); + $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); + $days = intval($diff / 24); + return ($days > 45 ? 1 : 0); +} --- a/heuristics/metadataHeuristics.php +++ b/heuristics/metadataHeuristics.php @@ -1,3 +1,12 @@ - - duplicated description - - most duplicated overall, most duplicated per agency/category/supplier etc. - + "unusual value for time of year"); +function METADATA_DUPLICATED_DESCRIPTION($cn) +{ + $averageContractPeriod = getAverageContractPeriod(); + $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); + $days = intval($diff / 24); + return ($days > 45 ? 1 : 0); +} --- a/heuristics/runHeuristics.php +++ b/heuristics/runHeuristics.php @@ -1,6 +1,29 @@ - $description) { + // run all heuristics + runHeuristic($heuristic, $cn); + } + flush(); +} +/*foreach agency + +aggregate agency metrics + +foreach supplier + +aggreate supplier metrics + +foreach CN + +aggregate CN metrics */ ?> --- a/heuristics/valueHeuristics.php +++ b/heuristics/valueHeuristics.php @@ -2,9 +2,27 @@ - large contract value - chi-square test for outliers / standard dev from mean/median - percent of total contracts for supplier/agency - + $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array( + "description" => "unusual value for time of year"); +function METADATA_DUPLICATED_DESCRIPTION($cn) +{ + $averageContractPeriod = getAverageContractPeriod(); + $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); + $days = intval($diff / 24); + return ($days > 45 ? 1 : 0); +} - peculiar value - Just under 80k, amplified if other contracts with same supplier are just under - unusual variation amount - absolute value; large reductions as well as large increases + + $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array( + "description" => "unusual value for time of year"); +function METADATA_DUPLICATED_DESCRIPTION($cn) +{ + $averageContractPeriod = getAverageContractPeriod(); + $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); + $days = intval($diff / 24); + return ($days > 45 ? 1 : 0); +}