From: maxious Date: Tue, 19 Jul 2011 05:38:53 +0000 Subject: More value heuristics X-Git-Url: http://maxious.lambdacomplex.org/git/?p=contractdashboard.git&a=commitdiff&h=0f553d1f4620fa84ec0cb3da84e468c80534e685 --- More value heuristics --- --- a/heuristics/dateHeuristics.php +++ b/heuristics/dateHeuristics.php @@ -20,27 +20,32 @@ $averageContractPeriod; function getAverageContractPeriod() { - global $averageContractPeriod; + global $averageContractPeriod, $stddevContractPeriod; if (!$averageContractPeriod) { - $query = "select AVG(dateDiff(contractEnd,contractStart)) from contractnotice"; - $result = mysql_query($query); - $r = mysql_fetch_array($result, MYSQL_BOTH); - $averageContractPeriod = $r[0]; + getStddevAverageContractPeriod(); } return $averageContractPeriod; } $stddevContractPeriod; function getstddevContractPeriod() { - global $stddevContractPeriod; + global $averageContractPeriod, $stddevContractPeriod; if (!$stddevContractPeriod) { - $query = "select STDDEV(dateDiff(contractEnd,contractStart)) from contractnotice"; - $result = mysql_query($query); - $r = mysql_fetch_array($result, MYSQL_BOTH); - $stddevContractPeriod = $r[0]; + getStddevAverageContractPeriod(); } return $stddevContractPeriod; } +function getStddevAverageContractPeriod() +{ + global $averageContractPeriod, $stddevContractPeriod; + $query = "select AVG(dateDiff(contractEnd,contractStart)),stddev(dateDiff(contractEnd,contractStart)) from contractnotice"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageContractPeriod = $r[0]; + $stddevContractPeriod = $r[1]; +} + + //Reported late, 45 days? A late contract is a dodgy contract except maybe for variations? $heuristics["DATE_REPORTED_LATE"] = Array( "description" => "Reported late, 45 days?" @@ -70,10 +75,7 @@ { global $averageDaysLate; if (!$averageDaysLate) { - $query = "select AVG(dateDiff(publishDate,contractStart)) from contractnotice"; - $result = mysql_query($query); - $r = mysql_fetch_array($result, MYSQL_BOTH); - $averageDaysLate = $r[0]; + getDaysLate(); } return $averageDaysLate; } @@ -82,11 +84,19 @@ { global $stddevDaysLate; if (!$stddevDaysLate) { - $query = "select STDDEV(dateDiff(publishDate,contractStart)) from contractnotice"; - $result = mysql_query($query); - $r = mysql_fetch_array($result, MYSQL_BOTH); - $stddevDaysLate = $r[0]; + getDaysLate(); } return $stddevDaysLate; } +function getDaysLate() { + + global $averageDaysLate,$stddevDaysLate; + + $query = "select AVG(dateDiff(publishDate,contractStart)), STDDEV(dateDiff(publishDate,contractStart)) from contractnotice"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageDaysLate = $r[0]; + $stddevDaysLate = $r[1]; + +} ?> --- a/heuristics/heuristics.inc.php +++ b/heuristics/heuristics.inc.php @@ -1,20 +1,25 @@ \n"; - elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.
" . mysql_error() . "
$query

\n"; + // save value and cn data via sql + $result = mysql_query($query); + if ($result) echo "Saved $heuristicName for {$cn["CNID"]}
\n"; + elseif (strpos(mysql_error() , "Duplicate entry") === false) echo $hresults . " failed insert.
" . mysql_error() . "
$query

\n"; + } } ?> --- a/heuristics/historyHeuristics.php +++ b/heuristics/historyHeuristics.php @@ -1,25 +1,126 @@ - "unusual for agency/supplier due to previous low number of transactions " + "unusual for agency due to previous low number of transactions " ); -function HISTORY_LOW_TRANSACTIONS($cn) +function HISTORY_LOW_TRANSACTIONS_AGENCY($cn) { - $averageContractPeriod = getAverageContractPeriod(); - $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); - $days = intval($diff / 24); - return ($days > 45 ? 1 : 0); + $thisAgencyTransactions = getAgencyTransactions($cn['agencyName']); + $averageAgencyTransactions = getAverageAgencyTransactions(); + $stddevAgencyTransactions = getstddevAgencyTransactions(); + $diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']); + $days = intval($diff / (60 * 60 * 24)); + $value = abs($days - $averageAgencyTransactions) / $stddevAgencyTransactions; + return Array( + "heuristic_value" => $value, + "raw_value" => $days, + "mean" => $averageAgencyTransactions, + "stddev" => $stddevAgencyTransactions + ); } - /* - unusual value for time of year - - compare to all other records in last 2 weeks - - ie. many large contracts in june so takes more to standout*/ - - $heuristics["HISTORY_HIGH_VALUE_FOR_MONTH"] = Array( - "description" => "unusual value for time of year"); -function HISTORY_HIGH_VALUE_FOR_MONTH($cn) +$agencyTransactions = Array(); +function getAgencyTransactions($agencyName) { - $averageContractPeriod = getAverageContractPeriod(); - $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); - $days = intval($diff / 24); - return ($days > 45 ? 1 : 0); + global $agencyTransactions; + if (!$agencyTransactions[$agencyName]) { + $query = 'select count(*) from contractnotice where agencyName = "' . $agencyName . '"'; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $agencyTransactions[$agencyName] = $r[0]; + } + return $agencyTransactions[$agencyName]; } +$averageAgencyTransactions; +function getAverageAgencyTransactions() +{ + global $averageAgencyTransactions; + if (!$averageAgencyTransactions) { + getStatsAgencyTransactions(); + } + return $averageAgencyTransactions; +} +$stddevAgencyTransactions; +function getstddevAgencyTransactions() +{ + global $stddevAgencyTransactions; + if (!$stddevAgencyTransactions) { + getStatsAgencyTransactions(); + } + return $stddevAgencyTransactions; +} +function getStatsAgencyTransactions() +{ + global $averageAgencyTransactions, $stddevAgencyTransactions; + $query = "select avg(count), STDDEV(count) from (select count(*) as count + from contractnotice group by agencyName) as a;"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageAgencyTransactions = $r[0]; + $stddevAgencyTransactions = $r[1]; +} +$heuristics["HISTORY_LOW_TRANSACTIONS_SUPPLIER"] = Array( + "description" => "unusual for supplier due to previous low number of transactions " +); +function HISTORY_LOW_TRANSACTIONS_SUPPLIER($cn) +{ + $thisSupplierTransactions = getSupplierTransactions($cn['supplierName'], $cn['supplierABN']); + $averageSupplierTransactions = getAverageSupplierTransactions(); + $stddevSupplierTransactions = getstddevSupplierTransactions(); + $diff = strtotime($cn['contractEnd']) - strtotime($cn['contractStart']); + $days = intval($diff / (60 * 60 * 24)); + $value = abs($days - $averageSupplierTransactions) / $stddevSupplierTransactions; + return Array( + "heuristic_value" => $value, + "raw_value" => $days, + "mean" => $averageSupplierTransactions, + "stddev" => $stddevSupplierTransactions + ); +} +$supplierTransactions = Array(); +function getSupplierTransactions($supplierName, $supplierABN) +{ + global $supplierTransactions; + if ($supplierABN != 0 && $supplierABN != "") { + if (!$supplierTransactions[$supplierABN]) { + $query = 'select count(*) from contractnotice where supplierABN = "' . $supplierABN . '"'; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $supplierTransactions[$supplierABN] = $r[0]; + } + return $supplierTransactions[$supplierABN]; + } + if (!$supplierTransactions[$supplierName]) { + $query = 'select count(*) from contractnotice where supplierName = "' . $supplierName . '"'; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $supplierTransactions[$supplierName] = $r[0]; + } + return $supplierTransactions[$supplierName]; +} +$averageSupplierTransactions; +function getAverageSupplierTransactions() +{ + global $averageSupplierTransactions; + if (!$averageSupplierTransactions) { + getStatsSupplierTransactions(); + } + return $averageSupplierTransactions; +} +$stddevSupplierTransactions; +function getstddevSupplierTransactions() +{ + global $stddevSupplierTransactions; + if (!$stddevSupplierTransactions) { + getStatsSupplierTransactions(); + } + return $stddevSupplierTransactions; +} +function getStatsSupplierTransactions() +{ + global $averageSupplierTransactions, $stddevSupplierTransactions; + $query = 'select avg(count), stddev(count) from (select IF(supplierABN != "",supplierABN,supplierName) as supplierID, count(*) as count from contractnotice group by supplierID) as a;'; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageSupplierTransactions = $r[0]; + $stddevSupplierTransactions = $r[1]; +} + --- a/heuristics/metadataHeuristics.php +++ b/heuristics/metadataHeuristics.php @@ -1,12 +1,59 @@ "unusual value for time of year"); +/* all + SELECT description, count(*) as count +FROM `contractnotice` +group by description having count > 1 order by count +*/ +/*- duplicated description + - most duplicated overall, most duplicated per agency/category/supplier etc. */ +$heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array( + "description" => "" +); function METADATA_DUPLICATED_DESCRIPTION($cn) { - $averageContractPeriod = getAverageContractPeriod(); - $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); - $days = intval($diff / 24); - return ($days > 45 ? 1 : 0); + $averageDuplicatedDescriptions = getAverageDuplicatedDescriptions(); + $stddevDuplicatedDescriptions = getstddevDuplicatedDescriptions(); + $query = 'select count(*) from contractnotice where description = "' . $agencyName . '"'; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $dupeDesc = $r[0]; + if ($dupeDesc == 1) $value = 0; + else $value = abs($dupeDesc - $averageDuplicatedDescriptions) / $stddevDuplicatedDescriptions; + return Array( + "heuristic_value" => $value, + "raw_value" => $dupeDesc, + "mean" => $averageDuplicatedDescriptions, + "stddev" => $stddevDuplicatedDescriptions + ); } +$averageDuplicatedDescriptions; +function getAverageDuplicatedDescriptions() +{ + global $averageDuplicatedDescriptions; + if (!$averageDuplicatedDescriptions) { + getStatsDuplicatedDescriptions(); + } + return $averageDuplicatedDescriptions; +} +$stddevDuplicatedDescriptions; +function getstddevDuplicatedDescriptions() +{ + global $stddevDuplicatedDescriptions; + if (!$stddevDuplicatedDescriptions) { + getStatsDuplicatedDescriptions(); + } + return $stddevDuplicatedDescriptions; +} +function getStatsDuplicatedDescriptions() +{ + $query = "select avg(count),STDDEV(count) from ( + SELECT description, count(*) as count +FROM `contractnotice` +group by description having count > 1 + ) as a;"; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $averageDuplicatedDescriptions = $r[0]; + $stddevDuplicatedDescriptions = $r[1]; +} + --- a/heuristics/runHeuristics.php +++ b/heuristics/runHeuristics.php @@ -7,7 +7,7 @@ FROM contractnotice JOIN agency ON contractnotice.agencyName=agency.agencyName WHERE DATE(importDate) = (select * from (SELECT DATE(importDate) FROM contractnotice ORDER BY importDate DESC limit 1) alias)"; -$result = mysql_query($lastimportquery); +$result = mysql_query($query); if (!$result) echo mysql_error().$query; while ($cn = mysql_fetch_array($result, MYSQL_BOTH)) { //get each new CN from latest update --- a/heuristics/valueHeuristics.php +++ b/heuristics/valueHeuristics.php @@ -1,8 +1,8 @@ - - - large contract value - - chi-square test for outliers / standard dev from mean/median - - percent of total contracts for supplier/agency - $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array( + "unusual value for time of year"); function METADATA_DUPLICATED_DESCRIPTION($cn) { @@ -12,12 +12,15 @@ return ($days > 45 ? 1 : 0); } - - peculiar value + /* - peculiar value - Just under 80k, amplified if other contracts with same supplier are just under - - unusual variation amount - - absolute value; large reductions as well as large increases - - $heuristics["METADATA_DUPLICATED_DESCRIPTION"] = Array( + */ + $heuristics["VALUE_NEAR_THRESHOLD"] = Array( + "description" => "unusual value for time of year"); + /* + - unusual variation amount - absolute value; large reductions as well as large increases + */ + $heuristics["VALUE_LARGE_VARIATION"] = Array( "description" => "unusual value for time of year"); function METADATA_DUPLICATED_DESCRIPTION($cn) { @@ -26,3 +29,30 @@ $days = intval($diff / 24); return ($days > 45 ? 1 : 0); } + +/* - unusual value for time of year + - compare to all other records in last 2 weeks + - ie. many large contracts in june so takes more to standout*/ +$heuristics["VALUE_HIGH_FOR_MONTH"] = Array( + "description" => "unusual value for time of year" +); +function VALUE_HIGH_FOR_MONTH($cn, $monthAsInt) +{ + $averageContractPeriod = getAverageContractPeriod(); + $diff = strtotime($cn['contractStart']) - strtotime($cn['publishDate']); + $days = intval($diff / 24); + return ($days > 45 ? 1 : 0); +} +$monthlyValueAverage = Array(); +function getAgencyTransactions($agencyName) +{ + global $agencyTransactions; + if (!$agencyTransactions[$agencyName]) { + $query = 'select count(*) from contractnotice where agencyName = "' . $agencyName . '"'; + $result = mysql_query($query); + $r = mysql_fetch_array($result, MYSQL_BOTH); + $agencyTransactions[$agencyName] = $r[0]; + } + return $agencyTransactions[$agencyName]; +} +?> --- a/heuristics/viewHeuristicsColormap.php +++ b/heuristics/viewHeuristicsColormap.php @@ -1,5 +1,11 @@ + div { + padding: 5px; + display: inline-block; + } + '; // http://www.herethere.net/~samson/php/color_gradient/color_gradient_generator.php.txt // return the interpolated value between pBegin and pEnd function interpolate($pBegin, $pEnd, $pStep, $pMax) @@ -42,13 +48,13 @@ $maxVal = $r[0]; $query = "SELECT sum(heuristic_value) as sum, CNID -FROM `heuristic_results` group by CNID order by sum DESC LIMIT 30"; +FROM `heuristic_results` group by CNID order by sum DESC LIMIT 300"; $result = mysql_query($query); if (!$result) echo mysql_error().$query; while ($r = mysql_fetch_array($result, MYSQL_BOTH)) { - echo ''; + echo '
'; echo 'X'; - echo ""; + echo "
"; } ?> --- a/heuristics/viewHeuristicsDistribution.php +++ b/heuristics/viewHeuristicsDistribution.php @@ -14,22 +14,32 @@ include("../lib/pChart2.1.0/class/pData.class.php"); include("../lib/pChart2.1.0/class/pDraw.class.php"); include("../lib/pChart2.1.0/class/pImage.class.php"); -$labels = Array(); -$values = Array(); + +$series = Array(); include_once("../lib/common.inc.php"); -$query = "select floor(sum) as val,count(*) from (SELECT sum(heuristic_value) -as sum FROM heuristic_results group by CNID) as a group by val"; +$query = "select heuristic_name, floor(heuristic_value) as val,count(*) from heuristic_results group by heuristic_name, val"; $result = mysql_query($query); if (!$result) echo mysql_error().$query; while ($r = mysql_fetch_array($result, MYSQL_BOTH)) { - $labels[] = $r[0]; - $values[] = $r[1]; + $series[$r["heuristic_name"]][$r["val"]] = $r[2]; } /* Create and populate the pData object */ - $MyData = new pData(); - $MyData->addPoints($values,"Records"); + $MyData = new pData(); + $labels = Array(); + foreach ($series as $value) { + $labels = $labels+array_keys($value); + } + $labels = Array(0,1,2,3,4,5); +foreach ($series as $seriesName => $seriesEntry) { + $data; + foreach ($labels as $label) { + $data[$label] = ($seriesEntry[$label] ? $seriesEntry[$label] : 0); + } + +$MyData->addPoints($data,$seriesName); + } $MyData->setAxisName(0,"# of records"); $MyData->addPoints($labels,"Labels"); $MyData->setSerieDescription("Labels","Bins"); @@ -65,7 +75,7 @@ $myPicture->drawSplineChart(); /* Write the chart legend */ - $myPicture->drawLegend(540,20,array("Style"=>LEGEND_NOBORDER,"Mode"=>LEGEND_HORIZONTAL)); + $myPicture->drawLegend(540,20,array("Style"=>LEGEND_NOBORDER,"Mode"=>LEGEND_VERTICAL)); /* Render the picture (choose the best way) */ $myPicture->autoOutput("pictures/example.drawSplineChart.simple.png");