Rearrange php libs/includes and produce sitemap.xml
[busui.git] / lib / simple_html_dom.php
blob:a/lib/simple_html_dom.php -> blob:b/lib/simple_html_dom.php
  <?php
  /*******************************************************************************
  Version: 1.11 ($Rev: 175 $)
  Website: http://sourceforge.net/projects/simplehtmldom/
  Author: S.C. Chen <me578022@gmail.com>
  Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
  Contributions by:
  Yousuke Kumakura (Attribute filters)
  Vadim Voituk (Negative indexes supports of "find" method)
  Antcs (Constructor with automatically load contents either text or file/url)
  Licensed under The MIT License
  Redistributions of files must retain the above copyright notice.
  *******************************************************************************/
   
  // Node kinds, stored in simple_html_dom_node::$nodetype.
  define('HDOM_TYPE_ELEMENT', 1);
  define('HDOM_TYPE_COMMENT', 2);
  define('HDOM_TYPE_TEXT', 3);
  define('HDOM_TYPE_ENDTAG', 4);
  define('HDOM_TYPE_ROOT', 5);
  define('HDOM_TYPE_UNKNOWN', 6);
  // Attribute quoting styles consulted by makeup(): DOUBLE renders ", SINGLE renders ',
  // any other value renders the attribute value unquoted.
  define('HDOM_QUOTE_DOUBLE', 0);
  define('HDOM_QUOTE_SINGLE', 1);
  define('HDOM_QUOTE_NO', 3);
  // Keys into the per-node bookkeeping array simple_html_dom_node::$_.
  define('HDOM_INFO_BEGIN', 0);
  define('HDOM_INFO_END', 1);
  define('HDOM_INFO_QUOTE', 2);
  define('HDOM_INFO_SPACE', 3);
  define('HDOM_INFO_TEXT', 4);
  define('HDOM_INFO_INNER', 5);
  define('HDOM_INFO_OUTER', 6);
  define('HDOM_INFO_ENDSPACE',7);
   
  // helper functions
  // -----------------------------------------------------------------------------
  // get html dom form file
  function file_get_html() {
  $dom = new simple_html_dom;
  $args = func_get_args();
  $dom->load(call_user_func_array('file_get_contents', $args), true);
  return $dom;
  }
   
  // get html dom form string
  function str_get_html($str, $lowercase=true) {
  $dom = new simple_html_dom;
  $dom->load($str, $lowercase);
  return $dom;
  }
   
  // dump html dom tree
  function dump_html_tree($node, $show_attr=true, $deep=0) {
  $lead = str_repeat(' ', $deep);
  echo $lead.$node->tag;
  if ($show_attr && count($node->attr)>0) {
  echo '(';
  foreach($node->attr as $k=>$v)
  echo "[$k]=>\"".$node->$k.'", ';
  echo ')';
  }
  echo "\n";
   
  foreach($node->nodes as $c)
  dump_html_tree($c, $show_attr, $deep+1);
  }
   
  // get dom form file (deprecated)
  function file_get_dom() {
  $dom = new simple_html_dom;
  $args = func_get_args();
  $dom->load(call_user_func_array('file_get_contents', $args), true);
  return $dom;
  }
   
  // get dom form string (deprecated)
  function str_get_dom($str, $lowercase=true) {
  $dom = new simple_html_dom;
  $dom->load($str, $lowercase);
  return $dom;
  }
   
  // simple html dom node
  // -----------------------------------------------------------------------------
  class simple_html_dom_node {
  // One of the HDOM_TYPE_* constants; a fresh node starts out as text.
  public $nodetype = HDOM_TYPE_TEXT;
  // Tag name; outertext() treats the value 'root' specially.
  public $tag = 'text';
  // Attribute name => value. makeup() skips null/false values and renders
  // true as a bare attribute name.
  public $attr = array();
  // Nodes used for child/sibling navigation (children(), first_child(), ...).
  // NOTE(review): presumably element children only, while $nodes holds every
  // child node -- confirm against the parser.
  public $children = array();
  // Nodes rendered by innertext()/outertext()/text() and walked by dump_html_tree().
  public $nodes = array();
  // Parent node, or null when there is none.
  public $parent = null;
  // Bookkeeping array indexed by the HDOM_INFO_* constants.
  public $_ = array();
  // Owning document object handed to the constructor.
  private $dom = null;
   
  function __construct($dom) {
  $this->dom = $dom;
  $dom->nodes[] = $this;
  }
   
  // On destruction, drop this node's references by delegating to clear().
  function __destruct() {
  $this->clear();
  }
   
  // String conversion yields the same markup as outertext().
  function __toString() {
  return $this->outertext();
  }
   
  // clean up memory due to php5 circular references memory leak...
  function clear() {
  $this->dom = null;
  $this->nodes = null;
  $this->parent = null;
  $this->children = null;
  }
   
  // Print this node's subtree via dump_html_tree(); $show_attr is passed
  // through and controls whether attributes are listed.
  function dump($show_attr=true) {
  dump_html_tree($this, $show_attr);
  }
   
  // Returns the parent node (the $parent property), or null if none is set.
  function parent() {
  return $this->parent;
  }
   
  // returns children of node
  function children($idx=-1) {
  if ($idx===-1) return $this->children;
  if (isset($this->children[$idx])) return $this->children[$idx];
  return null;
  }
   
  // returns the first child of node
  function first_child() {
  if (count($this->children)>0) return $this->children[0];
  return null;
  }
   
  // returns the last child of node
  function last_child() {
  if (($count=count($this->children))>0) return $this->children[$count-1];
  return null;
  }
   
  // returns the next sibling of node
  function next_sibling() {
  if ($this->parent===null) return null;
  $idx = 0;
  $count = count($this->parent->children);
  while ($idx<$count && $this!==$this->parent->children[$idx])
  ++$idx;
  if (++$idx>=$count) return null;
  return $this->parent->children[$idx];
  }
   
  // returns the previous sibling of node
  function prev_sibling() {
  if ($this->parent===null) return null;
  $idx = 0;
  $count = count($this->parent->children);
  while ($idx<$count && $this!==$this->parent->children[$idx])
  ++$idx;
  if (--$idx<0) return null;
  return $this->parent->children[$idx];
  }
   
  // get dom node's inner html
  function innertext() {
  if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
  if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
   
  $ret = '';
  foreach($this->nodes as $n)
  $ret .= $n->outertext();
  return $ret;
  }
   
  // get dom node's outer text (with tag)
  function outertext() {
  if ($this->tag==='root') return $this->innertext();
   
  // trigger callback
  if ($this->dom->callback!==null)
  call_user_func_array($this->dom->callback, array($this));
   
  if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
  if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
   
  // render begin tag
  $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
   
  // render inner text
  if (isset($this->_[HDOM_INFO_INNER]))
  $ret .= $this->_[HDOM_INFO_INNER];
  else {
  foreach($this->nodes as $n)
  $ret .= $n->outertext();
  }
   
  // render end tag
  if(isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
  $ret .= '</'.$this->tag.'>';
  return $ret;
  }
   
  // get dom node's plain text
  function text() {
  if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
  switch ($this->nodetype) {
  case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
  case HDOM_TYPE_COMMENT: return '';
  case HDOM_TYPE_UNKNOWN: return '';
  }
  if (strcasecmp($this->tag, 'script')===0) return '';
  if (strcasecmp($this->tag, 'style')===0) return '';
   
  $ret = '';
  foreach($this->nodes as $n)
  $ret .= $n->text();
  return $ret;
  }
   
  function xmltext() {
  $ret = $this->innertext();
  $ret = str_ireplace('<![CDATA[', '', $ret);
  $ret = str_replace(']]>', '', $ret);
  return $ret;
  }
   
  // build node's text with tag
  function makeup() {
  // text, comment, unknown
  if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
   
  $ret = '<'.$this->tag;
  $i = -1;
   
  foreach($this->attr as $key=>$val) {
  ++$i;
   
  // skip removed attribute
  if ($val===null || $val===false)
  continue;
   
  $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
  //no value attr: nowrap, checked selected...
  if ($val===true)
  $ret .= $key;
  else {
  switch($this->_[HDOM_INFO_QUOTE][$i]) {
  case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
  case HDOM_QUOTE_SINGLE: $quote = '\''; break;
  default: $quote = '';
  }
  $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
  }
  }
  $ret = $this->dom->restore_noise($ret);
  return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
  }
   
  // find elements by css selector
  function find($selector, $idx=null) {
  $selectors = $this->parse_selector($selector);
  if (($count=count($selectors))===0) return array();
  $found_keys = array();
   
  // find each selector
  for ($c=0; $c<$count; ++$c) {
  if (($levle=count($selectors[0]))===0) return array();
  if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
   
  $head = array($this->_[HDOM_INFO_BEGIN]=>1);
   
  // handle descendant selectors, no recursive!
  for ($l=0; $l<$levle; ++$l) {
  $ret = array();
  foreach($head as $k=>$v) {
  $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
  $n->seek($selectors[$c][$l], $ret);
  }
  $head = $ret;
  }
   
  foreach($head as $k=>$v) {
  if (!isset($found_keys[$k]))
  $found_keys[$k] = 1;
  }
  }
   
  // sort keys
  ksort($found_keys);
   
  $found = array();
  foreach($found_keys as $k=>$v)
  $found[] = $this->dom->nodes[$k];
   
  // return nth-element or array
  if (is_null($idx)) return $found;
  else if ($idx<0) $idx = count($found) + $idx;
  return (isset($found[$idx])) ? $found[$idx] : null;
  }
   
  // seek for given conditions
  protected function seek($selector, &$ret) {
  list($tag, $key, $val, $exp, $no_key) = $selector;
   
  // xpath index
  if ($tag && $key && is_numeric($key)) {
  $count = 0;
  foreach ($this->children as $c) {
  if ($tag==='*' || $tag===$c->tag) {
  if (++$count==$key) {
  $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
  return;
  }
  }
  }
  return;
  }
   
  $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
  if ($end==0) {
  $parent = $this->parent;
  while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
  $end -= 1;
  $parent = $parent->parent;
  }
  $end += $parent->_[HDOM_INFO_END];
  }
   
  for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
  $node = $this->dom->nodes[$i];
  $pass = true;
   
  if ($tag==='*' && !$key) {
  if (in_array($node, $this->children, true))
  $ret[$i] = 1;
  continue;
  }
   
  // compare tag
  if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
  // compare key
  if ($pass && $key) {
  if ($no_key) {
  if (isset($node->attr[$key])) $pass=false;
  }
  else if (!isset($node->attr[$key])) $pass=false;
  }
  // compare value
  if ($pass && $key && $val && $val!=='*') {
  $check = $this->match($exp, $val, $node->attr[$key]);
  // handle multiple class
  if (!$check && strcasecmp($key, 'class')===0) {
  foreach(explode(' ',$node->attr[$key]) as $k) {
  $check = $this->match($exp, $val, $k);
  if ($check) break;
  }
  }
  if (!$check) $pass = false;
  }
  if ($pass) $ret[$i] = 1;
  unset($node);
  }
  }
   
  protected function match($exp, $pattern, $value) {
  switch ($exp) {
  case '=':
  return ($value===$pattern);
  case '!=':
  return ($value!==$pattern);
  case '^=':
  return preg_match("/^".preg_quote($pattern,'/')."/", $value);
  case '$=':
  return preg_match("/".preg_quote($pattern,'/')."$/", $value);
  case '*=':
  if ($pattern[0]=='/')