Merge branch 'master' of /git/contractdashboard
[contractdashboard.git] / admin / partialdata / scraper.txt
blob:a/admin/partialdata/scraper.txt -> blob:b/admin/partialdata/scraper.txt
<?php <?php
  if (php_sapi_name() == "cli") {
// All date() / strtotime() calls below are evaluated in Melbourne local time.
date_default_timezone_set('Australia/Melbourne');

// Global flag read by getFile() and set by doSplit(): once true, getFile()
// neither continues to the next date window nor starts another split.
$split = false;
/**
 * Format a byte count as a human-readable string using 1024-based units.
 *
 * The value is divided by 1024 until it drops below 1024 or the largest
 * unit (TB) is reached, then rounded to two decimal places.
 *
 * @param int|float $size Size in bytes.
 * @return string The scaled value followed by ' B', ' KB', ' MB', ' GB' or ' TB'.
 */
function format_bytes($size) {
    $units = array(' B', ' KB', ' MB', ' GB', ' TB');
    $last = count($units) - 1;

    // $i ends up as the index of the unit matching the scaled value.
    for ($i = 0; $size >= 1024 && $i < $last; $i++) {
        $size /= 1024;
    }

    return round($size, 2).$units[$i];
}
   
// Length of each download window in days; overridable with ?days=N.
$days = 4;
if (isset($_REQUEST['days'])) {
    // Request data is untrusted and is later echoed into HTML and used in
    // date arithmetic, so force an integer. A window shorter than one day
    // would never advance, so clamp to at least 1.
    $days = max(1, (int) $_REQUEST['days']);
}

// Start of the first window as a Unix timestamp; overridable with
// ?startDate=<timestamp> (the value the "Load next" links emit).
$startDate = strtotime("05-Jun-2008");
if (isset($_REQUEST['startDate'])) {
    // date() needs an integer timestamp; cast the untrusted value.
    $startDate = (int) $_REQUEST['startDate'];
}
   
/**
 * Download one search export from www.tenders.gov.au and save it as an .xls
 * file in the current working directory.
 *
 * The export covers publish dates from $startDate to $startDate + $days days,
 * optionally restricted to a contract value range. Progress and follow-up
 * links are echoed as HTML.
 *
 * On success, unless a split is in progress (global $split) or was requested
 * (?split), the function calls itself for the next window. When the site
 * reports more than 1000 results, nothing is saved and, under the same
 * condition, doSplit() is invoked to retry the window in value bands.
 *
 * @param int        $startDate Unix timestamp of the first day of the window.
 * @param int        $days      Window length in days (values below 1 are treated as 1).
 * @param int|string $minVal    Lower contract value bound, or "" for none.
 * @param int|string $maxVal    Upper contract value bound, or "" for none.
 * @return bool True when the export was saved (or there was nothing left to
 *              fetch); false when the download failed, the file could not be
 *              written, or the result set was too large.
 */
function getFile($startDate, $days, $minVal, $maxVal) {
    global $split;

    // Both values are interpolated into HTML and URLs, and date() requires an
    // integer timestamp. A zero-length window would recurse without advancing.
    $startDate = (int) $startDate;
    $days = max(1, (int) $days);

    // Without this stop the success path would keep requesting future windows
    // indefinitely.
    if ($startDate > time()) {
        echo "Start date is in the future, nothing more to fetch.<br>";
        return true;
    }

    $endDate = strtotime(date("Y-m-d", $startDate)." +".$days." days");
    $file = date("dMY", $startDate).'to'.date("dMY", $endDate).'val'.$minVal.'to'.$maxVal.'.xls';
    echo "Fetching $file ($days days) ($minVal < value < $maxVal )... ";

    $url = "https://www.tenders.gov.au/?event=public.advancedsearch.CNSONRedirect&type=cnEvent&atmType=archived%2Cclosed%2Cpublished%2Cproposed&agencyUUID=&agencyStatus=-1&portfolioUUID=&keyword=&KeywordTypeSearch=AllWord&CNID=&dateType=Publish+Date&dateStart=".date("d-M-Y", $startDate)."&dateEnd=".date("d-M-Y", $endDate)."&supplierName=&supplierABN=&valueFrom=".$minVal."&valueTo=".$maxVal."&ATMID=&AgencyRefId=&consultancy=&download=Download+results";
    echo "<!-- $url -->";

    $current = file_get_contents($url);
    if ($current === false) {
        // Do not save an empty file and report it as a successful download.
        echo "<font color=red>Download failed!</font><br>";
        return false;
    }

    // Compare against false: a match at offset 0 is still a match.
    if (strpos($current, "There are no results that match your selection.") !== false) {
        echo "<font color=red>Empty file!</font><br>";
    }

    $splitActive = isset($_REQUEST['split']) || $split;

    if (strpos($current, "Your search returned more than 1000 results.") !== false) {
        // Result set was capped by the site: offer a shorter window or a split.
        // The same clamped value is used for the link and its label.
        $halfDays = max(1, (int) floor($days / 2));
        echo "<font color=red>Too many records!</font><br>";
        echo '<a href="?startDate='.$startDate.'&days='.$halfDays.'">Load '.$halfDays.' days instead?</a><br>';
        echo '<a href="?startDate='.$startDate.'&days='.$days.'&split=yes">Split instead?</a><br>';
        flush();
        if (!$splitActive) {
            echo "Failure so splitting ... <br>";
            doSplit($startDate, $days);
        }
        return false;
    }

    $written = file_put_contents($file, $current);
    if ($written === false) {
        echo "<font color=red>Could not write $file!</font><br>";
        return false;
    }

    echo "$file saved<br>";
    echo format_bytes($written)."<br>";
    echo '<a href="?startDate='.$endDate.'&days='.$days.'">Load next '.($days).' days </a><br>';
    echo '<a href="?startDate='.$endDate.'&days='.($days * 2).'">Load next '.($days * 2).' days </a><br>';
    echo '<a href="?startDate='.$endDate.'&days='.$days.'&split=yes">Load next '.($days).' days with split</a><br>';
    flush();
    if (!$splitActive) {
        echo "Success so fetching next $days... <br>";
        getFile($endDate, $days, "", "");
    }
    return true;
}
/**
 * Fetch one date window as a series of contract-value bands.
 *
 * Sets the global $split flag first, so the getFile() calls made here do not
 * continue to the next window or start another split themselves.
 *
 * @param int $startDate Unix timestamp of the first day of the window.
 * @param int $days      Window length in days.
 * @return void
 */
function doSplit($startDate, $days) {
    global $split;
    $split = true;

    // Reset the script execution time limit to 20 seconds from this point.
    set_time_limit(20);

    // array(minVal, maxVal) pairs, fetched in ascending order. The 40000-80000
    // and 80000-300000 ranges were formerly single bands and are now
    // subdivided.
    $bands = array(
        array(0, 12000),
        array(12000, 16000),
        array(16000, 20000),
        array(20000, 30000),
        array(30000, 40000),
        array(40000, 60000),
        array(60000, 80000),
        array(80000, 150000),
        array(150000, 300000),
        array(300000, 999999999),
    );

    foreach ($bands as $band) {
        getFile($startDate, $days, $band[0], $band[1]);
    }
}
// Entry point: when the request carries a "split" parameter, fetch the window
// in value bands; otherwise fetch it without any value bounds.
if (isset($_REQUEST['split'])) {
    doSplit($startDate, $days);
} else {
    getFile($startDate, $days, "", "");
}
  }
?> ?>