|
<?php |
|
|
|
/************************************************* |
|
|
|
Snoopy - the PHP net client |
|
Author: Monte Ohrt <monte@ispi.net> |
|
Copyright (c): 1999-2008 New Digital Group, all rights reserved |
|
Version: 1.2.4 |
|
|
|
* This library is free software; you can redistribute it and/or |
|
* modify it under the terms of the GNU Lesser General Public |
|
* License as published by the Free Software Foundation; either |
|
* version 2.1 of the License, or (at your option) any later version. |
|
* |
|
* This library is distributed in the hope that it will be useful, |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
* Lesser General Public License for more details. |
|
* |
|
* You should have received a copy of the GNU Lesser General Public |
|
* License along with this library; if not, write to the Free Software |
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
|
|
You may contact the author of Snoopy by e-mail at: |
|
monte@ohrt.com |
|
|
|
The latest version of Snoopy can be obtained from: |
|
http://snoopy.sourceforge.net/ |
|
|
|
*************************************************/ |
|
|
|
class Snoopy |
|
{ |
|
/**** Public variables ****/ |
|
|
|
/* user definable vars */ |
|
|
|
var $host = "www.php.net";      // host name we are connecting to
var $port = 80;                 // port we are connecting to
var $proxy_host = "";           // proxy host to use
var $proxy_port = "";           // proxy port to use
var $proxy_user = "";           // proxy user to use
var $proxy_pass = "";           // proxy password to use

var $agent = "Snoopy v1.2.4";   // agent we masquerade as
var $referer = "";              // referer info to pass
var $cookies = array();         // array of cookies to pass, e.g.
                                // $cookies["username"]="joe";
var $rawheaders = array();      // array of raw headers to send, e.g.
                                // $rawheaders["Content-type"]="text/html";

var $maxredirs = 5;             // http redirection depth maximum. 0 = disallow
var $lastredirectaddr = "";     // address of the last redirect that was followed
var $offsiteok = true;          // allows redirection off-site
var $maxframes = 0;             // frame content depth maximum. 0 = disallow
var $expandlinks = true;        // expand links to fully qualified URLs.
                                // this only applies to fetchlinks(),
                                // submitlinks(), and submittext()
var $passcookies = true;        // pass set cookies back through redirects
                                // NOTE: this currently does not respect
                                // dates, domains or paths.

// Credentials for http authentication. fetch() and submit() overwrite
// them when the requested URI itself carries a user and/or password.
var $user = "";                 // user for http authentication
var $pass = "";                 // password for http authentication

// http accept types
var $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";

var $results = "";              // where the content is put

var $error = "";                // error messages sent here
var $response_code = "";        // response code returned from server
var $headers = array();         // headers returned from server sent here
var $maxlength = 500000;        // max return data length (body)
var $read_timeout = 0;          // timeout on read operations, in seconds
                                // supported only since PHP 4 Beta 4
                                // set to 0 to disallow timeouts
var $timed_out = false;         // if a read operation timed out
var $status = 0;                // http request status

var $temp_dir = "/tmp";         // temporary directory that the webserver
                                // has permission to write to.
                                // under Windows, this should be C:\temp

var $curl_path = "/usr/local/bin/curl";
                                // Snoopy will use cURL for fetching
                                // SSL content if a full system path to
                                // the cURL binary is supplied here.
                                // set to false if you do not have
                                // cURL installed. See http://curl.haxx.se
                                // for details on installing cURL.
                                // Snoopy does *not* use the cURL
                                // library functions built into php,
                                // as these functions are not stable
                                // as of this Snoopy release.
|
|
|
/**** Private variables ****/ |
|
|
|
var $_maxlinelen = 4096;        // max line length (headers)

var $_httpmethod = "GET";       // default http request method
var $_httpversion = "HTTP/1.0"; // default http request version
var $_submit_method = "POST";   // default submit method
var $_submit_type = "application/x-www-form-urlencoded"; // default submit type
var $_mime_boundary = "";       // MIME boundary for multipart/form-data submit type
var $_redirectaddr = false;     // will be set if page fetched is a redirect
var $_redirectdepth = 0;        // increments on an http redirect
var $_frameurls = array();      // frame src urls
var $_framedepth = 0;           // increments on frame depth

var $_isproxy = false;          // set if using a proxy server
var $_fp_timeout = 30;          // timeout for socket connection
|
|
|
/*======================================================================*\ |
|
Function: fetch |
|
Purpose: fetch the contents of a web page |
|
(and possibly other protocols in the |
|
future like ftp, nntp, gopher, etc.) |
|
Input: $URI the location of the page to fetch |
|
Output: $this->results the output text from the fetch |
|
\*======================================================================*/ |
|
|
|
/*
 * Fetch the contents of a web page over http or https.
 *
 * Input:   $URI    absolute URI of the page to fetch. A user and/or
 *                  password embedded in the URI replaces $this->user
 *                  and $this->pass.
 * Output:  the request itself is delegated to _httprequest() (http) or
 *          _httpsrequest() (https, via the external cURL binary).
 * Returns: false when the URI has no usable scheme or host, when the
 *          scheme is neither http nor https, when https is requested but
 *          $this->curl_path is unset or not executable, or when the http
 *          connection cannot be opened; true otherwise. The result of a
 *          followed redirect or frame fetch does not alter the return
 *          value.
 *
 * Redirects are followed while $this->_redirectdepth is below
 * $this->maxredirs; frame URLs are fetched while $this->_framedepth is
 * below $this->maxframes.
 */
function fetch($URI)
{
    // parse_url() returns false on seriously malformed URLs; normalise to
    // an empty array so the lookups below never hit a non-array value.
    $URI_PARTS = parse_url($URI);
    if (!is_array($URI_PARTS))
        $URI_PARTS = array();

    if (!empty($URI_PARTS["user"]))
        $this->user = $URI_PARTS["user"];
    if (!empty($URI_PARTS["pass"]))
        $this->pass = $URI_PARTS["pass"];

    $scheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
    $path   = isset($URI_PARTS["path"])   ? $URI_PARTS["path"]   : "";
    $query  = isset($URI_PARTS["query"])  ? $URI_PARTS["query"]  : "";

    $protocol = strtolower($scheme);
    if ($protocol !== "http" && $protocol !== "https")
    {
        // not a valid protocol
        $this->error = 'Invalid protocol "'.$scheme.'"'."\n";
        return false;
    }

    if ($protocol === "https")
    {
        // https is only available through the external cURL binary
        if (!$this->curl_path)
            return false;
        if (function_exists("is_executable") && !is_executable($this->curl_path))
            return false;
    }

    if (!isset($URI_PARTS["host"]) || $URI_PARTS["host"] === "")
    {
        $this->error = 'Invalid URI "'.$URI.'": no host given'."\n";
        return false;
    }

    $this->host = $URI_PARTS["host"];
    if (!empty($URI_PARTS["port"]))
        $this->port = $URI_PARTS["port"];

    // A proxy needs the entire URI; a direct connection only gets the
    // path plus query string. Compare against "" so a query of "0" is kept.
    if ($this->_isproxy)
        $target = $URI;
    else
        $target = $path.($query !== "" ? "?".$query : "");

    if ($protocol === "http")
    {
        if (!$this->_connect($fp))
            return false;

        $this->_httprequest($target, $fp, $URI, $this->_httpmethod);
        $this->_disconnect($fp);
    }
    else
    {
        $this->_httpsrequest($target, $URI, $this->_httpmethod);
    }

    /* url was redirected, check if we've hit the max depth */
    if ($this->_redirectaddr && $this->maxredirs > $this->_redirectdepth)
    {
        // "On this site" means http or https on exactly this host: the
        // character after the host must end the authority, so that
        // http://example.com.evil.org is not mistaken for example.com.
        $samesite = preg_match(
            "~^https?://".preg_quote($this->host, "~")."(?:[:/?#]|$)~i",
            $this->_redirectaddr
        );

        // only follow redirect if it's on this site, or offsiteok is true
        if ($samesite || $this->offsiteok)
        {
            /* follow the redirect */
            $this->_redirectdepth++;
            $this->lastredirectaddr = $this->_redirectaddr;
            $this->fetch($this->_redirectaddr);
        }
    }

    if ($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
    {
        // Work on a copy and clear the member first, because the
        // recursive fetch() calls below may collect new frame urls.
        $frameurls = $this->_frameurls;
        $this->_frameurls = array();

        // foreach instead of each(): each() no longer exists in PHP 8
        foreach ($frameurls as $frameurl)
        {
            if ($this->_framedepth >= $this->maxframes)
                break;

            $this->fetch($frameurl);
            $this->_framedepth++;
        }
    }

    return true;
}
|
|
|
/*======================================================================*\ |
|
Function: submit |
|
Purpose: submit an http form |
|
Input: $URI the location to post the data |
|
$formvars the formvars to use. |
|
format: $formvars["var"] = "val"; |
|
$formfiles an array of files to submit |
|
format: $formfiles["var"] = "/dir/filename.ext"; |
|
Output: $this->results the text output from the post |
|
\*======================================================================*/ |
|
|
|
function submit($URI, $formvars="", $formfiles="") |
|
{ |
|
unset($postdata); |
|
|
|
$postdata = $this->_prepare_post_body($formvars, $formfiles); |
|
|
|
$URI_PARTS = parse_url($URI); |
|
if (!empty($URI_PARTS["user"])) |
|
$this->user = $URI_PARTS["user"]; |
|
if (!empty($URI_PARTS["pass"])) |
|
$this->pass = $URI_PARTS["pass"]; |
|
if (empty($URI_PARTS["query"])) |
|
$URI_PARTS["query"] = ''; |
|
if (empty($URI_PARTS["path"])) |
|
$URI_PARTS["path"] = ''; |
|
|
|
switch(strtolower($URI_PARTS["scheme"])) |
|
{ |
|
case "http": |
|
$this->host = $URI_PARTS["host"]; |
|
if(!empty($URI_PARTS["port"])) |
|
$this->port = $URI_PARTS["port"]; |
|
if($this->_connect($fp)) |
|
{ |
|
if($this->_isproxy) |
|
{ |
|
// using proxy, send entire URI |
|
$this->_httprequest($URI,$fp,$URI,$this->_submit_method,$this->_submit_type,$postdata); |
|
} |
|
else |
|
{ |
|
$path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : ""); |
|
// no proxy, send only the path |
|
$this->_httprequest($path, $fp, $URI, $this->_submit_method, $this->_submit_type, $postdata); |
|
} |
|
|
|
$this->_disconnect($fp); |
|
|
|
if($this->_redirectaddr) |
|
{ |
|
/* url was redirected, check if we've hit the max depth */ |
|
if($this->maxredirs > $this->_redirectdepth) |
|
{ |
|
if(!preg_match("|^".$URI_PARTS["scheme"]."://|", $this->_redirectaddr)) |
|
$this->_redirectaddr = $this->_expandlinks($this->_redirectaddr,$URI_PARTS["scheme"]."://".$URI_PARTS["host"]); |
|
|
|
// only follow redirect if it's on this site, or offsiteok is true |
|
if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok) |
|
{ |
|
/* follow the redirect */ |
|
$this->_redirectdepth++; |
|
$this->lastredirectaddr=$this->_redirectaddr; |
|
if( strpos( $this->_redirectaddr, "?" ) > 0 ) |
|
$this->fetch($this->_redirectaddr); // the redirect has changed the request method from post to get |
|
else |
|
$this->submit($this->_redirectaddr,$formvars, $formfiles); |
|
} |
|
} |
|
} |
|
|
|
if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0) |
|
{ |
|
$frameurls = $this->_frameurls; |
|
$this->_frameurls = array(); |
|
|
|
while(list(,$frameurl) = each($frameurls)) |
|
{ |
|
if($this->_framedepth < $this->maxframes) |
|
{ |
|
$this->fetch($frameurl); |
|
$this->_framedepth++; |
|
} |
|
else |
|
break; |
|
} |
|
} |
|
|
|
} |
|
else |
|
{ |
|
return false; |
|
} |
|
return true; |
|
break; |
|
case "https": |
|
if(!$this->curl_path) |
|
return false; |
|
if(function_exists("is_executable")) |
|
if (!is_executable($this->curl_path)) |
|
return false; |
|
$this->host = $URI_PARTS["host"]; |
|
if(!empty($URI_PARTS["port"])) |
|
$this->port = $URI_PARTS["port"]; |
|
if($this->_isproxy) |
|
{ |
|
// using proxy, send entire URI |
|
$this->_httpsrequest($URI, $URI, $this->_submit_method, $this->_submit_type, $postdata); |
|
} |
|