Add analytics
[bus.git] / busui / owa / owa_httpRequest.php
blob:a/busui/owa/owa_httpRequest.php -> blob:b/busui/owa/owa_httpRequest.php
  <?php
   
  //
  // Open Web Analytics - An Open Source Web Analytics Framework
  //
  // Copyright 2006 Peter Adams. All rights reserved.
  //
  // Licensed under GPL v2.0 http://www.gnu.org/copyleft/gpl.html
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  //
  // $Id$
  //
   
  if(!class_exists('Snoopy')) {
  require_once(OWA_INCLUDE_DIR.'/Snoopy.class.php');
  }
   
  require_once(OWA_HTTPCLIENT_DIR.'http.php');
   
  /**
  * Wrapper for the Snoopy and http_class HTTP request classes
  *
  * @author Peter Adams <peter@openwebanalytics.com>
  * @copyright Copyright &copy; 2006 Peter Adams <peter@openwebanalytics.com>
  * @license http://www.gnu.org/copyleft/gpl.html GPL v2.0
  * @category owa
  * @package owa
  * @version $Revision$
  * @since owa 1.0.0
  */
   
  class owa_http {
   
  /**
  * Configuration
  *
  * @var array
  */
  var $config;
   
  /**
  * Error handler
  *
  * @var object
  */
  var $e;
   
  /**
  * The length of text contained in the snippet
  *
  * @var string
  */
  var $snip_len = 100;
   
  /**
  * The string that is added to the beginning and
  * end of snippet text.
  *
  * @var string
  */
  var $snip_str = '...';
   
  /**
  * Anchor information for a particular link
  *
  * @var array
  */
  var $anchor_info;
   
  var $crawler;
   
  var $testcrawler;
   
  var $http;
   
  var $response;
  var $response_headers;
  var $response_code;
   
  var $request_headers;
   
  function __construct() {
   
  $c = &owa_coreAPI::configSingleton();
  $this->config = $c->fetch('base');
  $this->e = &owa_coreAPI::errorSingleton();
  $this->crawler = new Snoopy;
  // do not allow snoopy to follow links
  $this->crawler->maxredirs = 5;
  $this->crawler->agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
  //$this->crawler->agent = "Firefox";
  //owa_coreAPI::debug('hello from owa_http constructor');
  return;
   
  }
   
  function fetch($uri) {
  //owa_coreAPI::debug('hello from owa_http fetch');
  return $this->crawler->fetch($uri);
  }
   
  function testFetch($url) {
   
  $http= new http_class;
  owa_coreAPI::debug('hello owa_http testfetch method');
  /* Connection timeout */
  $http->timeout=0;
  /* Data transfer timeout */
  $http->data_timeout=0;
  /* Output debugging information about the progress of the connection */
  $http->debug=1;
  $http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
  $http->follow_redirect=1;
  $http->redirection_limit=5;
  $http->exclude_address="";
  $http->prefer_curl=0;
  $arguments = array();
  $error=$http->GetRequestArguments($url,$arguments);
  $error=$http->Open($arguments);
   
  //for(;;)
  // {
  $error=$http->ReadReplyBody($body,50000);
  if($error!="" || strlen($body)==0)
  owa_coreAPI::debug(HtmlSpecialChars($body));
  // }
   
  }
   
  /**
  * Searches a fetched html document for the anchor of a specific url
  *
  * @param string $link
  */
  function extract_anchor($link) {
   
  $matches = '';
  $regex = '/<a[^>]*href=\"%s\"[^>]*>(.*?)<\/a>/i';
   
  //$escaped_link = str_replace(array("/", "?"), array("\/", "\?"), $link);
   
  $pattern = trim(sprintf($regex, preg_quote($link, '/')));
  $search = preg_match($pattern, $this->response, $matches);
  //$this->e->debug('pattern: '.$pattern);
  //$this->e->debug('link: '.$link);
   
   
  if (empty($matches)) {
  if (substr($link, -1) === '/') {
  $link = substr($link, 0, -1);
  $pattern = trim(sprintf($regex, preg_quote($link, '/')));
  $search = preg_match($pattern, $this->response, $matches);
  //$this->e->debug('pattern: '.$pattern);
  //$this->e->debug('link: '.$link);
  }
  }
   
  $this->e->debug('ref search: '.$search);
  //$this->e->debug('ref matches: '.print_r($this->results, true));
  //$this->e->debug('ref matches: '.print_r($matches, true));
  if (isset($matches[0])) {
  $this->anchor_info = array('anchor_tag' => $matches[0], 'anchor_text' => owa_lib::inputFilter($matches[0]));
  $this->e->debug('Anchor info: '.print_r($this->anchor_info, true));
  }
  }
   
  /**
  * Creates a text snippet of the portion of page where the
  * specific link is found.
  *
  * Takes fully qualified URL for the link to search for.
  *
  * @param string $link
  * @return string
  */
  function extract_anchor_snippet($link){
   
  // Search the page for a specific anchor
  $this->extract_anchor($link);
   
  if(!empty($this->anchor_info['anchor_tag'])) {
   
  // drop certain HTML entitities and their content
  $nohtml = $this->strip_selected_tags(
  $this->response,
  array('title',
  'head',
  'script',
  'object',
  'style',
  'meta',
  'link',
  'rdf:'),
  true);
   
  //$this->e->debug('Refering page content after certain html entities were dropped: '.$this->results);
   
  // calc len of the anchor text
  $atext_len = strlen($this->anchor_info['anchor_tag']);
   
  // find position within document of the anchor text
  $start = strpos($nohtml, $this->anchor_info['anchor_tag']);
   
  if ($start < $this->snip_len) {
  $part1_start_pos = 0;
  $part1_snip_len = $start;
  } else {
  $part1_start_pos = $start;
  $part1_snip_len = $this->snip_len;
  }
   
  $replace_items = array("\r\n", "\n\n", "\t", "\r", "\n");
  // Create first segment of snippet
  $first_part = substr($nohtml, 0, $part1_start_pos);
  $first_part = str_replace($replace_items, '', $first_part);
  $first_part = strip_tags(owa_lib::inputFilter($first_part));
  //$part1 = trim(substr($nohtml, $part1_start_pos, $part1_snip_len));
  $part1 = substr($first_part,-$part1_snip_len, $part1_snip_len);
   
  //$part1 = str_replace(array('\r\n', '\n\n', '\t', '\r', '\n'), '', $part1);
  //$part1 = owa_lib::inputFilter($part1);
  // Create second segment of snippet
  $part2 = trim(substr($nohtml, $start + $atext_len, $this->snip_len+300));
  $part2 = str_replace($replace_items, '', $part2);
  $part2 = substr(strip_tags(owa_lib::inputFilter($part2)),0, $this->snip_len);
   
  // Put humpty dumpy back together again and create actual snippet
  $snippet = $this->snip_str.$part1.' <span class="snippet_anchor">'.owa_lib::inputFilter($this->anchor_info['anchor_tag']).'</span> '.$part2.$this->snip_str;
   
  } else {
   
  $snippet = '';
   
  }
   
  return $snippet;
   
  }
   
  function extract_title() {
   
  preg_match('~(</head>|<body>|(<title>\s*(.*?)\s*</title>))~i', $this->response, $m);
   
  $this->e->debug("referer title extract: ". print_r($m, true));
   
  return $m[3];
  }
   
  function strip_selected_tags($str, $tags = array(), $stripContent = false) {
   
  foreach ($tags as $k => $tag){
   
  if ($stripContent == true) {
  $pattern = sprintf('#(<%s.*?>)(.*?)(<\/%s.*?>)#is', preg_quote($tag), preg_quote($tag));
  $str = preg_replace($pattern,"",$str);
  }
  $str = preg_replace($pattern, ${2},$str);
  }
   
  return $str;
  }
   
  function SetupHTTP()
  {
  if(!IsSet($this->http))
  {
  $this->http = new http_class;
  $this->http->follow_redirect = 1;
  $this->http->debug = 0;
  $this->http->debug_response_body = 0;
  $this->http->html_debug = 1;
  $this->http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
  $this->http->timeout = 3;
  $this->http->data_timeout = 3;
  }
  }
   
  function OpenRequest($arguments, &$headers)
  {
  if(strlen($this->error=$this->http->Open($arguments)))
  return(0);
  if(strlen($this->error=$this->http->SendRequest($arguments))
  || strlen($this->error=$this->http->ReadReplyHeaders($headers)))
  {
  $this->http->Close();
  return(0);
  }
  if($this->http->response_status!=200)
  {
  $this->error = 'the HTTP request returned the status '.$this->http->response_status;
  $this->http->Close();
  return(0);
  }
  return(1);
  }
   
  function GetRequestResponse(&$response)
  {
  for($response = ''; ; )
  {
  if(strlen($this->error=$this->http->ReadReplyBody($body, 500000)))
  {
  $this->http->Close();
  return(0);
  }
  if(strlen($body)==0)
  break;
  $response .= $body;
   
  }
  $this->http->Close();
  owa_coreAPI::debug('http response code: '.$this->http->response_status);
  return($response);
  }
   
  function getRequest($url, $arguments = '', $response = '') {
   
  $this->SetupHTTP();
   
  $this->http->GetRequestArguments($url, $arguments);
  $arguments['RequestMethod']='GET';
  if(!$this->OpenRequest($arguments, $headers)) {
  return(0);
  }
  $this->response = $this->GetRequestResponse($response);
  return($this->response);
  }
   
  }
   
   
  ?>