/**
 * HTTP Request
 *
 * Fetches remote documents on behalf of OWA and extracts pieces of the
 * fetched HTML: the anchor that points at a given URL, a text snippet
 * surrounding that anchor, and the document title.
 *
 * NOTE(review): the opening of this file header and two regex literals
 * (flagged inline below) were damaged in the copy under review and have been
 * reconstructed -- confirm against the upstream file.
 *
 * @copyright   Copyright © 2006 Peter Adams
 * @license     http://www.gnu.org/copyleft/gpl.html GPL v2.0
 * @category    owa
 * @package     owa
 * @version     $Revision$
 * @since       owa 1.0.0
 */

class owa_http {

    /**
     * Configuration
     *
     * @var array
     */
    public $config;

    /**
     * Error handler
     *
     * @var object
     */
    public $e;

    /**
     * The length of text contained in each side of the snippet
     *
     * @var int
     */
    public $snip_len = 100;

    /**
     * The string that is added to the beginning and
     * end of snippet text.
     *
     * @var string
     */
    public $snip_str = '...';

    /**
     * Anchor information for a particular link, populated by extract_anchor().
     * Holds the keys 'anchor_tag' and 'anchor_text' when a match was found.
     *
     * @var array
     */
    public $anchor_info;

    /**
     * Snoopy instance created by the constructor and used by fetch().
     *
     * @var object
     */
    public $crawler;

    /**
     * Not referenced by any method of this class.
     *
     * @var mixed
     */
    public $testcrawler;

    /**
     * http_class instance, created on demand by SetupHTTP().
     *
     * @var object
     */
    public $http;

    /**
     * Body returned by the last getRequest() call. The extract_*() methods
     * search this value.
     *
     * @var string
     */
    public $response;

    /**
     * Headers read by the last successful getRequest() call.
     *
     * @var array
     */
    public $response_headers;

    /**
     * HTTP status reported by the last successful getRequest() call.
     *
     * @var mixed
     */
    public $response_code;

    /**
     * Not referenced by any method of this class.
     *
     * @var mixed
     */
    public $request_headers;

    /**
     * Message describing the last failure reported by the http client,
     * or an empty string when the last step succeeded.
     *
     * @var string
     */
    public $error = '';

    /**
     * Loads the 'base' configuration and the error handler, and prepares the
     * Snoopy crawler with the configured user agent.
     */
    function __construct() {

        $c = owa_coreAPI::configSingleton();
        $this->config = $c->fetch('base');
        $this->e = owa_coreAPI::errorSingleton();

        $this->crawler = new Snoopy;
        // Cap on redirects handed to Snoopy. The previous comment claimed
        // redirects were disabled, which contradicted the value set here.
        $this->crawler->maxredirs = 5;
        $this->crawler->agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
    }

    /**
     * Fetches a URI through the Snoopy crawler.
     *
     * Note that this does not populate $this->response; only getRequest()
     * does.
     *
     * @param string $uri
     * @return mixed whatever Snoopy::fetch() returns
     */
    function fetch($uri) {

        return $this->crawler->fetch($uri);
    }

    /**
     * Diagnostic fetch: requests $url with a throw-away http_class instance
     * (debug output enabled) and writes the first chunk of the reply body,
     * or the error that prevented reading it, to the OWA debug log.
     *
     * @param string $url
     * @return void
     */
    function testFetch($url) {

        $http = new http_class;
        owa_coreAPI::debug('hello owa_http testfetch method');

        /* Connection timeout */
        $http->timeout = 0;
        /* Data transfer timeout */
        $http->data_timeout = 0;
        /* Output debugging information about the progress of the connection */
        $http->debug = 1;
        $http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
        $http->follow_redirect = 1;
        $http->redirection_limit = 5;
        $http->exclude_address = "";
        $http->prefer_curl = 0;

        $arguments = array();
        $headers = array();
        $body = '';

        $error = $http->GetRequestArguments($url, $arguments);
        if ($error != '') {
            owa_coreAPI::debug('owa_http testfetch error: '.$error);
            return;
        }

        $error = $http->Open($arguments);
        if ($error != '') {
            owa_coreAPI::debug('owa_http testfetch error: '.$error);
            return;
        }

        // Same call sequence as OpenRequest(): the request has to be sent and
        // the headers read before the body is available.
        $error = $http->SendRequest($arguments);
        if ($error == '') {
            $error = $http->ReadReplyHeaders($headers);
        }
        if ($error == '') {
            $error = $http->ReadReplyBody($body, 50000);
        }

        // The connection was opened above, so release it on every path.
        $http->Close();

        if ($error != '') {
            owa_coreAPI::debug('owa_http testfetch error: '.$error);
            return;
        }

        owa_coreAPI::debug(htmlspecialchars((string) $body));
    }

    /**
     * Searches the fetched html document for the anchor of a specific url.
     *
     * On success $this->anchor_info holds 'anchor_tag' (the matched markup)
     * and 'anchor_text' (the same markup passed through owa_lib::inputFilter).
     * When nothing matches, $this->anchor_info is left as an empty array so
     * that a result from an earlier call is never mistaken for this one.
     *
     * @param string $link
     */
    function extract_anchor($link) {

        $this->anchor_info = array();

        // NOTE(review): the leading '<a[^>' was missing from the copy under
        // review (the pattern began with ']*href='); reconstructed so that the
        // match covers the whole anchor element -- confirm against upstream.
        $regex = '/<a[^>]*href=\"%s\"[^>]*>(.*?)<\/a>/i';

        $document = (string) $this->response;
        $matches = array();

        $pattern = sprintf($regex, preg_quote($link, '/'));
        $search = preg_match($pattern, $document, $matches);

        // Retry without the trailing slash: pages often link to the same url
        // with or without it.
        if (empty($matches) && substr($link, -1) === '/') {
            $link = substr($link, 0, -1);
            $pattern = sprintf($regex, preg_quote($link, '/'));
            $search = preg_match($pattern, $document, $matches);
        }

        $this->e->debug('ref search: '.$search);

        if (isset($matches[0])) {
            $this->anchor_info = array(
                'anchor_tag'  => $matches[0],
                'anchor_text' => owa_lib::inputFilter($matches[0])
            );
            $this->e->debug('Anchor info: '.print_r($this->anchor_info, true));
        }
    }

    /**
     * Creates a text snippet of the portion of page where the
     * specific link is found.
     *
     * Takes fully qualified URL for the link to search for. The snippet is
     * built from up to $snip_len bytes of filtered text before the anchor,
     * the filtered anchor itself and up to $snip_len bytes after it, wrapped
     * in $snip_str.
     *
     * @param string $link
     * @return string the snippet, or '' when the anchor cannot be located
     */
    function extract_anchor_snippet($link) {

        // Search the page for a specific anchor
        $this->extract_anchor($link);

        if (empty($this->anchor_info['anchor_tag'])) {
            return '';
        }

        $anchor_tag = $this->anchor_info['anchor_tag'];

        // drop certain HTML elements and their content
        $nohtml = $this->strip_selected_tags(
            (string) $this->response,
            array('title',
                  'head',
                  'script',
                  'object',
                  'style',
                  'meta',
                  'link',
                  'rdf:'),
            true);

        // find position within the reduced document of the anchor markup
        $start = strpos($nohtml, $anchor_tag);

        if ($start === false) {
            // The anchor sat inside one of the elements removed above, so
            // there is no surrounding text to quote.
            return '';
        }

        $replace_items = array("\r\n", "\n\n", "\t", "\r", "\n");

        // First segment: everything before the anchor, cleaned, then cut down
        // to its last $snip_len bytes. This also covers an anchor that sits
        // less than $snip_len bytes into the document.
        $before = (string) substr($nohtml, 0, $start);
        $before = str_replace($replace_items, '', $before);
        $before = strip_tags(owa_lib::inputFilter($before));
        $part1 = (string) substr($before, -$this->snip_len, $this->snip_len);

        // Second segment: read extra raw bytes because markup is stripped
        // before the final cut to $snip_len.
        $after = trim((string) substr($nohtml, $start + strlen($anchor_tag), $this->snip_len + 300));
        $after = str_replace($replace_items, '', $after);
        $part2 = (string) substr(strip_tags(owa_lib::inputFilter($after)), 0, $this->snip_len);

        // Put humpty dumpty back together again and create actual snippet
        return $this->snip_str.$part1.' '.owa_lib::inputFilter($anchor_tag).' '.$part2.$this->snip_str;
    }

    /**
     * Extracts the contents of the title element from the fetched document.
     *
     * @return string the trimmed title, or '' when no title is found before
     *                the end of the head or the start of the body
     */
    function extract_title() {

        $m = array();

        // NOTE(review): the tag names inside this pattern were missing from
        // the copy under review (it read '~(||(\s*(.*?)\s*))~i');
        // reconstructed so that group 3 is the title text and the search stops
        // at </head> or <body> -- confirm against upstream.
        preg_match('~(</head>|<body>|(<title>\s*(.*?)\s*</title>))~i', (string) $this->response, $m);

        $this->e->debug("referer title extract: ". print_r($m, true));

        return isset($m[3]) ? $m[3] : '';
    }

    /**
     * Removes the given tags from a string.
     *
     * @param string $str           markup to clean
     * @param array  $tags          tag names (or name prefixes such as 'rdf:')
     * @param bool   $stripContent  true removes each element together with its
     *                              content; false removes only the opening and
     *                              closing tags and keeps the content
     * @return string
     */
    function strip_selected_tags($str, $tags = array(), $stripContent = false) {

        // With content stripping the whole element goes; otherwise it is
        // replaced by its inner content (capture group 2).
        $replacement = $stripContent ? '' : '$2';

        foreach ($tags as $tag) {

            $quoted = preg_quote($tag, '#');
            $pattern = sprintf('#(<%s.*?>)(.*?)(<\/%s.*?>)#is', $quoted, $quoted);

            $result = preg_replace($pattern, $replacement, $str);

            // preg_replace() yields null on a PCRE error; keep what we have
            // rather than discarding the document.
            if ($result !== null) {
                $str = $result;
            }
        }

        return $str;
    }

    /**
     * Creates and configures the http_class client on first use.
     */
    function SetupHTTP()
    {
        if (isset($this->http)) {
            return;
        }

        $this->http = new http_class;
        $this->http->follow_redirect = 1;
        $this->http->debug = 0;
        $this->http->debug_response_body = 0;
        $this->http->html_debug = 1;
        $this->http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
        $this->http->timeout = 3;
        $this->http->data_timeout = 3;
    }

    /**
     * Opens the connection, sends the request and reads the reply headers.
     *
     * On any failure, including a status other than 200, the reason is left
     * in $this->error and the connection is closed.
     *
     * @param array $arguments  request arguments for the http client
     * @param array $headers    receives the reply headers
     * @return int 1 on success, 0 on failure
     */
    function OpenRequest($arguments, &$headers)
    {
        if (strlen($this->error = $this->http->Open($arguments))) {
            return(0);
        }

        if (strlen($this->error = $this->http->SendRequest($arguments))
        || strlen($this->error = $this->http->ReadReplyHeaders($headers))) {
            $this->http->Close();
            return(0);
        }

        if ($this->http->response_status != 200) {
            $this->error = 'the HTTP request returned the status '.$this->http->response_status;
            $this->http->Close();
            return(0);
        }

        return(1);
    }

    /**
     * Reads the whole reply body of the open request and closes the
     * connection.
     *
     * @param string $response  receives the body read so far
     * @return mixed the body, or 0 when reading failed ($this->error holds
     *               the reason)
     */
    function GetRequestResponse(&$response)
    {
        $response = '';

        while (true) {

            if (strlen($this->error = $this->http->ReadReplyBody($body, 500000))) {
                $this->http->Close();
                return(0);
            }

            // An empty chunk marks the end of the body.
            if (strlen($body) == 0) {
                break;
            }

            $response .= $body;
        }

        $this->http->Close();
        owa_coreAPI::debug('http response code: '.$this->http->response_status);

        return($response);
    }

    /**
     * Performs a GET request and stores the body in $this->response.
     *
     * @param string $url
     * @param mixed  $arguments  passed to the client's GetRequestArguments()
     * @param string $response   unused initial value, kept for compatibility
     * @return mixed the response body, or 0 on failure ($this->error holds
     *               the reason)
     */
    function getRequest($url, $arguments = '', $response = '') {

        $this->SetupHTTP();

        if (strlen($this->error = $this->http->GetRequestArguments($url, $arguments))) {
            return(0);
        }

        $arguments['RequestMethod'] = 'GET';

        $headers = array();

        if (!$this->OpenRequest($arguments, $headers)) {
            return(0);
        }

        // Keep the reply metadata next to the body for later inspection.
        $this->response_headers = $headers;
        $this->response_code = $this->http->response_status;

        $this->response = $this->GetRequestResponse($response);

        return($this->response);
    }

}