From 78f0e6db066f778b4da8586dc1b0ecdac7c7a0be Mon Sep 17 00:00:00 2001 From: Deon George Date: Fri, 22 Aug 2014 15:37:59 +1000 Subject: [PATCH] Updated Simple_HTML_Dom and fixed module XML to work with KH 3.3 --- .../simplehtmldom/classes/simple_html_dom.php | 2435 +++++++++++------ modules/xml/classes/{xml.php => XML.php} | 0 .../classes/{xml/core.php => XML/Core.php} | 0 .../driver/atom.php => XML/Driver/Atom.php} | 0 .../driver/rss2.php => XML/Driver/Rss2.php} | 0 .../driver/xrds.php => XML/Driver/Xrds.php} | 0 .../classes/{xml/meta.php => XML/Meta.php} | 0 .../{xml/meta/core.php => XML/Meta/Core.php} | 0 8 files changed, 1601 insertions(+), 834 deletions(-) rename modules/xml/classes/{xml.php => XML.php} (100%) rename modules/xml/classes/{xml/core.php => XML/Core.php} (100%) rename modules/xml/classes/{xml/driver/atom.php => XML/Driver/Atom.php} (100%) rename modules/xml/classes/{xml/driver/rss2.php => XML/Driver/Rss2.php} (100%) rename modules/xml/classes/{xml/driver/xrds.php => XML/Driver/Xrds.php} (100%) rename modules/xml/classes/{xml/meta.php => XML/Meta.php} (100%) rename modules/xml/classes/{xml/meta/core.php => XML/Meta/Core.php} (100%) diff --git a/modules/simplehtmldom/classes/simple_html_dom.php b/modules/simplehtmldom/classes/simple_html_dom.php index fbe43df..67c7c9d 100644 --- a/modules/simplehtmldom/classes/simple_html_dom.php +++ b/modules/simplehtmldom/classes/simple_html_dom.php @@ -1,975 +1,1742 @@ -Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) -Contributions by: - Yousuke Kumakura (Attribute filters) - Vadim Voituk (Negative indexes supports of "find" method) - Antcs (Constructor with automatically load contents either text or file/url) -Licensed under The MIT License -Redistributions of files must retain the above copyright notice. -*******************************************************************************/ +/** + * Website: http://sourceforge.net/projects/simplehtmldom/ + * Additional projects that may be used: http://sourceforge.net/projects/debugobject/ + * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) + * Contributions by: + * Yousuke Kumakura (Attribute filters) + * Vadim Voituk (Negative indexes supports of "find" method) + * Antcs (Constructor with automatically load contents either text or file/url) + * + * all affected sections have comments starting with "PaperG" + * + * Paperg - Added case insensitive testing of the value of the selector. + * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately. + * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source, + * it will almost always be smaller by some amount. + * We use this to determine how far into the file the tag in question is. This "percentage will never be accurate as the $dom->size is the "real" number of bytes the dom was created from. + * but for most purposes, it's a really good estimation. + * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. + * Allow the user to tell us how much they trust the html. + * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. + * This allows for us to find tags based on the text they contain. + * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. + * Paperg: added parse_charset so that we know about the character set of the source document. + * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the + * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. + * + * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. + * PaperG (John Schlick) Added get_display_size for "IMG" tags. + * + * Licensed under The MIT License + * Redistributions of files must retain the above copyright notice. + * + * @author S.C. Chen + * @author John Schlick + * @author Rus Carroll + * @version 1.5 ($Rev: 210 $) + * @package PlaceLocalInclude + * @subpackage simple_html_dom + */ +/** + * All of the Defines for the classes below. + * @author S.C. Chen + */ define('HDOM_TYPE_ELEMENT', 1); define('HDOM_TYPE_COMMENT', 2); -define('HDOM_TYPE_TEXT', 3); +define('HDOM_TYPE_TEXT', 3); define('HDOM_TYPE_ENDTAG', 4); -define('HDOM_TYPE_ROOT', 5); +define('HDOM_TYPE_ROOT', 5); define('HDOM_TYPE_UNKNOWN', 6); define('HDOM_QUOTE_DOUBLE', 0); define('HDOM_QUOTE_SINGLE', 1); -define('HDOM_QUOTE_NO', 3); +define('HDOM_QUOTE_NO', 3); define('HDOM_INFO_BEGIN', 0); -define('HDOM_INFO_END', 1); +define('HDOM_INFO_END', 1); define('HDOM_INFO_QUOTE', 2); define('HDOM_INFO_SPACE', 3); -define('HDOM_INFO_TEXT', 4); +define('HDOM_INFO_TEXT', 4); define('HDOM_INFO_INNER', 5); define('HDOM_INFO_OUTER', 6); define('HDOM_INFO_ENDSPACE',7); - +define('DEFAULT_TARGET_CHARSET', 'UTF-8'); +define('DEFAULT_BR_TEXT', "\r\n"); +define('DEFAULT_SPAN_TEXT', " "); +define('MAX_FILE_SIZE', 600000); // helper functions // ----------------------------------------------------------------------------- -// get html dom form file -function file_get_html() { - $dom = new simple_html_dom; - $args = func_get_args(); - $dom->load(call_user_func_array('file_get_contents', $args), true); - return $dom; +// get html dom from file +// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. +function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) +{ + // We DO force the tags to be terminated. + $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); + // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done. + $contents = file_get_contents($url, $use_include_path, $context, $offset); + // Paperg - use our own mechanism for getting the contents as we want to control the timeout. + //$contents = retrieve_url_contents($url); + if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) + { + return false; + } + // The second parameter can force the selectors to all be lowercase. + $dom->load($contents, $lowercase, $stripRN); + return $dom; } -// get html dom form string -function str_get_html($str, $lowercase=true) { - $dom = new simple_html_dom; - $dom->load($str, $lowercase); - return $dom; +// get html dom from string +function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) +{ + $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); + if (empty($str) || strlen($str) > MAX_FILE_SIZE) + { + $dom->clear(); + return false; + } + $dom->load($str, $lowercase, $stripRN); + return $dom; } // dump html dom tree -function dump_html_tree($node, $show_attr=true, $deep=0) { - $lead = str_repeat(' ', $deep); - echo $lead.$node->tag; - if ($show_attr && count($node->attr)>0) { - echo '('; - foreach($node->attr as $k=>$v) - echo "[$k]=>\"".$node->$k.'", '; - echo ')'; - } - echo "\n"; - - foreach($node->nodes as $c) - dump_html_tree($c, $show_attr, $deep+1); +function dump_html_tree($node, $show_attr=true, $deep=0) +{ + $node->dump($node); } -// get dom form file (deprecated) -function file_get_dom() { - $dom = new simple_html_dom; - $args = func_get_args(); - $dom->load(call_user_func_array('file_get_contents', $args), true); - return $dom; -} -// get dom form string (deprecated) -function str_get_dom($str, $lowercase=true) { - $dom = new simple_html_dom; - $dom->load($str, $lowercase); - return $dom; -} +/** + * simple html dom node + * PaperG - added ability for "find" routine to lowercase the value of the selector. + * PaperG - added $tag_start to track the start position of the tag in the total byte index + * + * @package PlaceLocalInclude + */ +class simple_html_dom_node +{ + public $nodetype = HDOM_TYPE_TEXT; + public $tag = 'text'; + public $attr = array(); + public $children = array(); + public $nodes = array(); + public $parent = null; + // The "info" array - see HDOM_INFO_... for what each element contains. + public $_ = array(); + public $tag_start = 0; + private $dom = null; -// simple html dom node -// ----------------------------------------------------------------------------- -class simple_html_dom_node { - public $nodetype = HDOM_TYPE_TEXT; - public $tag = 'text'; - public $attr = array(); - public $children = array(); - public $nodes = array(); - public $parent = null; - public $_ = array(); - private $dom = null; + function __construct($dom) + { + $this->dom = $dom; + $dom->nodes[] = $this; + } - function __construct($dom) { - $this->dom = $dom; - $dom->nodes[] = $this; - } + function __destruct() + { + $this->clear(); + } - function __destruct() { - $this->clear(); - } + function __toString() + { + return $this->outertext(); + } - function __toString() { - return $this->outertext(); - } + // clean up memory due to php5 circular references memory leak... + function clear() + { + $this->dom = null; + $this->nodes = null; + $this->parent = null; + $this->children = null; + } - // clean up memory due to php5 circular references memory leak... - function clear() { - $this->dom = null; - $this->nodes = null; - $this->parent = null; - $this->children = null; - } - - // dump node's tree - function dump($show_attr=true) { - dump_html_tree($this, $show_attr); - } + // dump node's tree + function dump($show_attr=true, $deep=0) + { + $lead = str_repeat(' ', $deep); - // returns the parent of node - function parent() { - return $this->parent; - } + echo $lead.$this->tag; + if ($show_attr && count($this->attr)>0) + { + echo '('; + foreach ($this->attr as $k=>$v) + echo "[$k]=>\"".$this->$k.'", '; + echo ')'; + } + echo "\n"; - // returns children of node - function children($idx=-1) { - if ($idx===-1) return $this->children; - if (isset($this->children[$idx])) return $this->children[$idx]; - return null; - } + if ($this->nodes) + { + foreach ($this->nodes as $c) + { + $c->dump($show_attr, $deep+1); + } + } + } - // returns the first child of node - function first_child() { - if (count($this->children)>0) return $this->children[0]; - return null; - } - // returns the last child of node - function last_child() { - if (($count=count($this->children))>0) return $this->children[$count-1]; - return null; - } + // Debugging function to dump a single dom node with a bunch of information about it. + function dump_node($echo=true) + { - // returns the next sibling of node - function next_sibling() { - if ($this->parent===null) return null; - $idx = 0; - $count = count($this->parent->children); - while ($idx<$count && $this!==$this->parent->children[$idx]) - ++$idx; - if (++$idx>=$count) return null; - return $this->parent->children[$idx]; - } + $string = $this->tag; + if (count($this->attr)>0) + { + $string .= '('; + foreach ($this->attr as $k=>$v) + { + $string .= "[$k]=>\"".$this->$k.'", '; + } + $string .= ')'; + } + if (count($this->_)>0) + { + $string .= ' $_ ('; + foreach ($this->_ as $k=>$v) + { + if (is_array($v)) + { + $string .= "[$k]=>("; + foreach ($v as $k2=>$v2) + { + $string .= "[$k2]=>\"".$v2.'", '; + } + $string .= ")"; + } else { + $string .= "[$k]=>\"".$v.'", '; + } + } + $string .= ")"; + } - // returns the previous sibling of node - function prev_sibling() { - if ($this->parent===null) return null; - $idx = 0; - $count = count($this->parent->children); - while ($idx<$count && $this!==$this->parent->children[$idx]) - ++$idx; - if (--$idx<0) return null; - return $this->parent->children[$idx]; - } + if (isset($this->text)) + { + $string .= " text: (" . $this->text . ")"; + } - // get dom node's inner html - function innertext() { - if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; - if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + $string .= " HDOM_INNER_INFO: '"; + if (isset($node->_[HDOM_INFO_INNER])) + { + $string .= $node->_[HDOM_INFO_INNER] . "'"; + } + else + { + $string .= ' NULL '; + } - $ret = ''; - foreach($this->nodes as $n) - $ret .= $n->outertext(); - return $ret; - } + $string .= " children: " . count($this->children); + $string .= " nodes: " . count($this->nodes); + $string .= " tag_start: " . $this->tag_start; + $string .= "\n"; - // get dom node's outer text (with tag) - function outertext() { - if ($this->tag==='root') return $this->innertext(); + if ($echo) + { + echo $string; + return; + } + else + { + return $string; + } + } - // trigger callback - if ($this->dom->callback!==null) - call_user_func_array($this->dom->callback, array($this)); + // returns the parent of node + // If a node is passed in, it will reset the parent of the current node to that one. + function parent($parent=null) + { + // I am SURE that this doesn't work properly. + // It fails to unset the current node from it's current parents nodes or children list first. + if ($parent !== null) + { + $this->parent = $parent; + $this->parent->nodes[] = $this; + $this->parent->children[] = $this; + } - if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; - if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + return $this->parent; + } - // render begin tag - $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); + // verify that node has children + function has_child() + { + return !empty($this->children); + } - // render inner text - if (isset($this->_[HDOM_INFO_INNER])) - $ret .= $this->_[HDOM_INFO_INNER]; - else { - foreach($this->nodes as $n) - $ret .= $n->outertext(); - } + // returns children of node + function children($idx=-1) + { + if ($idx===-1) + { + return $this->children; + } + if (isset($this->children[$idx])) + { + return $this->children[$idx]; + } + return null; + } - // render end tag - if(isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) - $ret .= 'tag.'>'; - return $ret; - } + // returns the first child of node + function first_child() + { + if (count($this->children)>0) + { + return $this->children[0]; + } + return null; + } - // get dom node's plain text - function text() { - if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; - switch ($this->nodetype) { - case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - case HDOM_TYPE_COMMENT: return ''; - case HDOM_TYPE_UNKNOWN: return ''; - } - if (strcasecmp($this->tag, 'script')===0) return ''; - if (strcasecmp($this->tag, 'style')===0) return ''; + // returns the last child of node + function last_child() + { + if (($count=count($this->children))>0) + { + return $this->children[$count-1]; + } + return null; + } - $ret = ''; - foreach($this->nodes as $n) - $ret .= $n->text(); - return $ret; - } - - function xmltext() { - $ret = $this->innertext(); - $ret = str_ireplace('', '', $ret); - return $ret; - } + // returns the next sibling of node + function next_sibling() + { + if ($this->parent===null) + { + return null; + } - // build node's text with tag - function makeup() { - // text, comment, unknown - if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + $idx = 0; + $count = count($this->parent->children); + while ($idx<$count && $this!==$this->parent->children[$idx]) + { + ++$idx; + } + if (++$idx>=$count) + { + return null; + } + return $this->parent->children[$idx]; + } - $ret = '<'.$this->tag; - $i = -1; + // returns the previous sibling of node + function prev_sibling() + { + if ($this->parent===null) return null; + $idx = 0; + $count = count($this->parent->children); + while ($idx<$count && $this!==$this->parent->children[$idx]) + ++$idx; + if (--$idx<0) return null; + return $this->parent->children[$idx]; + } - foreach($this->attr as $key=>$val) { - ++$i; + // function to locate a specific ancestor tag in the path to the root. + function find_ancestor_tag($tag) + { + global $debug_object; + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } - // skip removed attribute - if ($val===null || $val===false) - continue; + // Start by including ourselves in the comparison. + $returnDom = $this; - $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; - //no value attr: nowrap, checked selected... - if ($val===true) - $ret .= $key; - else { - switch($this->_[HDOM_INFO_QUOTE][$i]) { - case HDOM_QUOTE_DOUBLE: $quote = '"'; break; - case HDOM_QUOTE_SINGLE: $quote = '\''; break; - default: $quote = ''; - } - $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; - } - } - $ret = $this->dom->restore_noise($ret); - return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; - } + while (!is_null($returnDom)) + { + if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); } - // find elements by css selector - function find($selector, $idx=null) { - $selectors = $this->parse_selector($selector); - if (($count=count($selectors))===0) return array(); - $found_keys = array(); + if ($returnDom->tag == $tag) + { + break; + } + $returnDom = $returnDom->parent; + } + return $returnDom; + } - // find each selector - for ($c=0; $c<$count; ++$c) { - if (($levle=count($selectors[0]))===0) return array(); - if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); + // get dom node's inner html + function innertext() + { + if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; + if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - $head = array($this->_[HDOM_INFO_BEGIN]=>1); + $ret = ''; + foreach ($this->nodes as $n) + $ret .= $n->outertext(); + return $ret; + } - // handle descendant selectors, no recursive! - for ($l=0; $l<$levle; ++$l) { - $ret = array(); - foreach($head as $k=>$v) { - $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; - $n->seek($selectors[$c][$l], $ret); - } - $head = $ret; - } + // get dom node's outer text (with tag) + function outertext() + { + global $debug_object; + if (is_object($debug_object)) + { + $text = ''; + if ($this->tag == 'text') + { + if (!empty($this->text)) + { + $text = " with text: " . $this->text; + } + } + $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); + } - foreach($head as $k=>$v) { - if (!isset($found_keys[$k])) - $found_keys[$k] = 1; - } - } + if ($this->tag==='root') return $this->innertext(); - // sort keys - ksort($found_keys); + // trigger callback + if ($this->dom && $this->dom->callback!==null) + { + call_user_func_array($this->dom->callback, array($this)); + } - $found = array(); - foreach($found_keys as $k=>$v) - $found[] = $this->dom->nodes[$k]; + if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; + if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - // return nth-element or array - if (is_null($idx)) return $found; + // render begin tag + if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) + { + $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); + } else { + $ret = ""; + } + + // render inner text + if (isset($this->_[HDOM_INFO_INNER])) + { + // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added. + if ($this->tag != "br") + { + $ret .= $this->_[HDOM_INFO_INNER]; + } + } else { + if ($this->nodes) + { + foreach ($this->nodes as $n) + { + $ret .= $this->convert_text($n->outertext()); + } + } + } + + // render end tag + if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) + $ret .= 'tag.'>'; + return $ret; + } + + // get dom node's plain text + function text() + { + if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; + switch ($this->nodetype) + { + case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + case HDOM_TYPE_COMMENT: return ''; + case HDOM_TYPE_UNKNOWN: return ''; + } + if (strcasecmp($this->tag, 'script')===0) return ''; + if (strcasecmp($this->tag, 'style')===0) return ''; + + $ret = ''; + // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL. + // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening. + // WHY is this happening? + if (!is_null($this->nodes)) + { + foreach ($this->nodes as $n) + { + $ret .= $this->convert_text($n->text()); + } + + // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. + if ($this->tag == "span") + { + $ret .= $this->dom->default_span_text; + } + + + } + return $ret; + } + + function xmltext() + { + $ret = $this->innertext(); + $ret = str_ireplace('', '', $ret); + return $ret; + } + + // build node's text with tag + function makeup() + { + // text, comment, unknown + if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + + $ret = '<'.$this->tag; + $i = -1; + + foreach ($this->attr as $key=>$val) + { + ++$i; + + // skip removed attribute + if ($val===null || $val===false) + continue; + + $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; + //no value attr: nowrap, checked selected... + if ($val===true) + $ret .= $key; + else { + switch ($this->_[HDOM_INFO_QUOTE][$i]) + { + case HDOM_QUOTE_DOUBLE: $quote = '"'; break; + case HDOM_QUOTE_SINGLE: $quote = '\''; break; + default: $quote = ''; + } + $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; + } + } + $ret = $this->dom->restore_noise($ret); + return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; + } + + // find elements by css selector + //PaperG - added ability for find to lowercase the value of the selector. + function find($selector, $idx=null, $lowercase=false) + { + $selectors = $this->parse_selector($selector); + if (($count=count($selectors))===0) return array(); + $found_keys = array(); + + // find each selector + for ($c=0; $c<$count; ++$c) + { + // The change on the below line was documented on the sourceforge code tracker id 2788009 + // used to be: if (($levle=count($selectors[0]))===0) return array(); + if (($levle=count($selectors[$c]))===0) return array(); + if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); + + $head = array($this->_[HDOM_INFO_BEGIN]=>1); + + // handle descendant selectors, no recursive! + for ($l=0; $l<$levle; ++$l) + { + $ret = array(); + foreach ($head as $k=>$v) + { + $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; + //PaperG - Pass this optional parameter on to the seek function. + $n->seek($selectors[$c][$l], $ret, $lowercase); + } + $head = $ret; + } + + foreach ($head as $k=>$v) + { + if (!isset($found_keys[$k])) + { + $found_keys[$k] = 1; + } + } + } + + // sort keys + ksort($found_keys); + + $found = array(); + foreach ($found_keys as $k=>$v) + $found[] = $this->dom->nodes[$k]; + + // return nth-element or array + if (is_null($idx)) return $found; else if ($idx<0) $idx = count($found) + $idx; - return (isset($found[$idx])) ? $found[$idx] : null; - } + return (isset($found[$idx])) ? $found[$idx] : null; + } - // seek for given conditions - protected function seek($selector, &$ret) { - list($tag, $key, $val, $exp, $no_key) = $selector; + // seek for given conditions + // PaperG - added parameter to allow for case insensitive testing of the value of a selector. + protected function seek($selector, &$ret, $lowercase=false) + { + global $debug_object; + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } - // xpath index - if ($tag && $key && is_numeric($key)) { - $count = 0; - foreach ($this->children as $c) { - if ($tag==='*' || $tag===$c->tag) { - if (++$count==$key) { - $ret[$c->_[HDOM_INFO_BEGIN]] = 1; - return; - } - } - } - return; - } + list($tag, $key, $val, $exp, $no_key) = $selector; - $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; - if ($end==0) { - $parent = $this->parent; - while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { - $end -= 1; - $parent = $parent->parent; - } - $end += $parent->_[HDOM_INFO_END]; - } + // xpath index + if ($tag && $key && is_numeric($key)) + { + $count = 0; + foreach ($this->children as $c) + { + if ($tag==='*' || $tag===$c->tag) { + if (++$count==$key) { + $ret[$c->_[HDOM_INFO_BEGIN]] = 1; + return; + } + } + } + return; + } - for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { - $node = $this->dom->nodes[$i]; - $pass = true; + $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; + if ($end==0) { + $parent = $this->parent; + while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { + $end -= 1; + $parent = $parent->parent; + } + $end += $parent->_[HDOM_INFO_END]; + } - if ($tag==='*' && !$key) { - if (in_array($node, $this->children, true)) - $ret[$i] = 1; - continue; - } + for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { + $node = $this->dom->nodes[$i]; - // compare tag - if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} - // compare key - if ($pass && $key) { - if ($no_key) { - if (isset($node->attr[$key])) $pass=false; - } - else if (!isset($node->attr[$key])) $pass=false; - } - // compare value - if ($pass && $key && $val && $val!=='*') { - $check = $this->match($exp, $val, $node->attr[$key]); - // handle multiple class - if (!$check && strcasecmp($key, 'class')===0) { - foreach(explode(' ',$node->attr[$key]) as $k) { - $check = $this->match($exp, $val, $k); - if ($check) break; - } - } - if (!$check) $pass = false; - } - if ($pass) $ret[$i] = 1; - unset($node); - } - } + $pass = true; - protected function match($exp, $pattern, $value) { - switch ($exp) { - case '=': - return ($value===$pattern); - case '!=': - return ($value!==$pattern); - case '^=': - return preg_match("/^".preg_quote($pattern,'/')."/", $value); - case '$=': - return preg_match("/".preg_quote($pattern,'/')."$/", $value); - case '*=': - if ($pattern[0]=='/') - return preg_match($pattern, $value); - return preg_match("/".$pattern."/i", $value); - } - return false; - } + if ($tag==='*' && !$key) { + if (in_array($node, $this->children, true)) + $ret[$i] = 1; + continue; + } - protected function parse_selector($selector_string) { - // pattern of CSS selectors, modified from mootools - $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; - preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); - $selectors = array(); - $result = array(); - //print_r($matches); + // compare tag + if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} + // compare key + if ($pass && $key) { + if ($no_key) { + if (isset($node->attr[$key])) $pass=false; + } else { + if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false; + } + } + // compare value + if ($pass && $key && $val && $val!=='*') { + // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right? + if ($key == "plaintext") { + // $node->plaintext actually returns $node->text(); + $nodeKeyValue = $node->text(); + } else { + // this is a normal search, we want the value of that attribute of the tag. + $nodeKeyValue = $node->attr[$key]; + } + if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} - foreach ($matches as $m) { - $m[0] = trim($m[0]); - if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; - // for borwser grnreated xpath - if ($m[1]==='tbody') continue; + //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. + if ($lowercase) { + $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue)); + } else { + $check = $this->match($exp, $val, $nodeKeyValue); + } + if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));} - list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); - if(!empty($m[2])) {$key='id'; $val=$m[2];} - if(!empty($m[3])) {$key='class'; $val=$m[3];} - if(!empty($m[4])) {$key=$m[4];} - if(!empty($m[5])) {$exp=$m[5];} - if(!empty($m[6])) {$val=$m[6];} + // handle multiple class + if (!$check && strcasecmp($key, 'class')===0) { + foreach (explode(' ',$node->attr[$key]) as $k) { + // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form. + if (!empty($k)) { + if ($lowercase) { + $check = $this->match($exp, strtolower($val), strtolower($k)); + } else { + $check = $this->match($exp, $val, $k); + } + if ($check) break; + } + } + } + if (!$check) $pass = false; + } + if ($pass) $ret[$i] = 1; + unset($node); + } + // It's passed by reference so this is actually what this function returns. + if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);} + } - // convert to lowercase - if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} - //elements that do NOT have the specified attribute - if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} + protected function match($exp, $pattern, $value) { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} - $result[] = array($tag, $key, $val, $exp, $no_key); - if (trim($m[7])===',') { - $selectors[] = $result; - $result = array(); - } - } - if (count($result)>0) - $selectors[] = $result; - return $selectors; - } + switch ($exp) { + case '=': + return ($value===$pattern); + case '!=': + return ($value!==$pattern); + case '^=': + return preg_match("/^".preg_quote($pattern,'/')."/", $value); + case '$=': + return preg_match("/".preg_quote($pattern,'/')."$/", $value); + case '*=': + if ($pattern[0]=='/') { + return preg_match($pattern, $value); + } + return preg_match("/".$pattern."/i", $value); + } + return false; + } - function __get($name) { - if (isset($this->attr[$name])) return $this->attr[$name]; - switch($name) { - case 'outertext': return $this->outertext(); - case 'innertext': return $this->innertext(); - case 'plaintext': return $this->text(); - case 'xmltext': return $this->xmltext(); - default: return array_key_exists($name, $this->attr); - } - } + protected function parse_selector($selector_string) { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} - function __set($name, $value) { - switch($name) { - case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; - case 'innertext': - if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; - return $this->_[HDOM_INFO_INNER] = $value; - } - if (!isset($this->attr[$name])) { - $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); - $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; - } - $this->attr[$name] = $value; - } + // pattern of CSS selectors, modified from mootools + // Paperg: Add the colon to the attrbute, so that it properly finds like google does. + // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check. +// Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured. +// This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. +// farther study is required to determine of this should be documented or removed. +// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); + if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} - function __isset($name) { - switch($name) { - case 'outertext': return true; - case 'innertext': return true; - case 'plaintext': return true; - } - //no value attr: nowrap, checked selected... - return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); - } + $selectors = array(); + $result = array(); + //print_r($matches); - function __unset($name) { - if (isset($this->attr[$name])) - unset($this->attr[$name]); - } + foreach ($matches as $m) { + $m[0] = trim($m[0]); + if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; + // for browser generated xpath + if ($m[1]==='tbody') continue; + + list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); + if (!empty($m[2])) {$key='id'; $val=$m[2];} + if (!empty($m[3])) {$key='class'; $val=$m[3];} + if (!empty($m[4])) {$key=$m[4];} + if (!empty($m[5])) {$exp=$m[5];} + if (!empty($m[6])) {$val=$m[6];} + + // convert to lowercase + if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} + //elements that do NOT have the specified attribute + if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} + + $result[] = array($tag, $key, $val, $exp, $no_key); + if (trim($m[7])===',') { + $selectors[] = $result; + $result = array(); + } + } + if (count($result)>0) + $selectors[] = $result; + return $selectors; + } + + function __get($name) + { + if (isset($this->attr[$name])) + { + return $this->convert_text($this->attr[$name]); + } + switch ($name) + { + case 'outertext': return $this->outertext(); + case 'innertext': return $this->innertext(); + case 'plaintext': return $this->text(); + case 'xmltext': return $this->xmltext(); + default: return array_key_exists($name, $this->attr); + } + } + + function __set($name, $value) + { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} + + switch ($name) + { + case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; + case 'innertext': + if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; + return $this->_[HDOM_INFO_INNER] = $value; + } + if (!isset($this->attr[$name])) + { + $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); + $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; + } + $this->attr[$name] = $value; + } + + function __isset($name) + { + switch ($name) + { + case 'outertext': return true; + case 'innertext': return true; + case 'plaintext': return true; + } + //no value attr: nowrap, checked selected... + return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); + } + + function __unset($name) { + if (isset($this->attr[$name])) + unset($this->attr[$name]); + } + + // PaperG - Function to convert the text from one character set to another if the two sets are not the same. + function convert_text($text) + { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} + + $converted_text = $text; + + $sourceCharset = ""; + $targetCharset = ""; + + if ($this->dom) + { + $sourceCharset = strtoupper($this->dom->_charset); + $targetCharset = strtoupper($this->dom->_target_charset); + } + if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} + + if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) + { + // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 + if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) + { + $converted_text = $text; + } + else + { + $converted_text = iconv($sourceCharset, $targetCharset, $text); + } + } + + // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. + if ($targetCharset == 'UTF-8') + { + if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") + { + $converted_text = substr($converted_text, 3); + } + if (substr($converted_text, -3) == "\xef\xbb\xbf") + { + $converted_text = substr($converted_text, 0, -3); + } + } + + return $converted_text; + } + + /** + * Returns true if $string is valid UTF-8 and false otherwise. + * + * @param mixed $str String to be tested + * @return boolean + */ + static function is_utf8($str) + { + $c=0; $b=0; + $bits=0; + $len=strlen($str); + for($i=0; $i<$len; $i++) + { + $c=ord($str[$i]); + if($c > 128) + { + if(($c >= 254)) return false; + elseif($c >= 252) $bits=6; + elseif($c >= 248) $bits=5; + elseif($c >= 240) $bits=4; + elseif($c >= 224) $bits=3; + elseif($c >= 192) $bits=2; + else return false; + if(($i+$bits) > $len) return false; + while($bits > 1) + { + $i++; + $b=ord($str[$i]); + if($b < 128 || $b > 191) return false; + $bits--; + } + } + } + return true; + } + /* + function is_utf8($string) + { + //this is buggy + return (utf8_encode(utf8_decode($string)) == $string); + } + */ + + /** + * Function to try a few tricks to determine the displayed size of an img on the page. + * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types. + * + * @author John Schlick + * @version April 19 2012 + * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out. + */ + function get_display_size() + { + global $debug_object; + + $width = -1; + $height = -1; + + if ($this->tag !== 'img') + { + return false; + } + + // See if there is aheight or width attribute in the tag itself. + if (isset($this->attr['width'])) + { + $width = $this->attr['width']; + } + + if (isset($this->attr['height'])) + { + $height = $this->attr['height']; + } + + // Now look for an inline style. + if (isset($this->attr['style'])) + { + // Thanks to user gnarf from stackoverflow for this regular expression. + $attributes = array(); + preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); + foreach ($matches as $match) { + $attributes[$match[1]] = $match[2]; + } + + // If there is a width in the style attributes: + if (isset($attributes['width']) && $width == -1) + { + // check that the last two characters are px (pixels) + if (strtolower(substr($attributes['width'], -2)) == 'px') + { + $proposed_width = substr($attributes['width'], 0, -2); + // Now make sure that it's an integer and not something stupid. + if (filter_var($proposed_width, FILTER_VALIDATE_INT)) + { + $width = $proposed_width; + } + } + } + + // If there is a width in the style attributes: + if (isset($attributes['height']) && $height == -1) + { + // check that the last two characters are px (pixels) + if (strtolower(substr($attributes['height'], -2)) == 'px') + { + $proposed_height = substr($attributes['height'], 0, -2); + // Now make sure that it's an integer and not something stupid. + if (filter_var($proposed_height, FILTER_VALIDATE_INT)) + { + $height = $proposed_height; + } + } + } + + } + + // Future enhancement: + // Look in the tag to see if there is a class or id specified that has a height or width attribute to it. + + // Far future enhancement + // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width + // Note that in this case, the class or id will have the img subselector for it to apply to the image. + + // ridiculously far future development + // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page. + + $result = array('height' => $height, + 'width' => $width); + return $result; + } + + // camel naming conventions + function getAllAttributes() {return $this->attr;} + function getAttribute($name) {return $this->__get($name);} + function setAttribute($name, $value) {$this->__set($name, $value);} + function hasAttribute($name) {return $this->__isset($name);} + function removeAttribute($name) {$this->__set($name, null);} + function getElementById($id) {return $this->find("#$id", 0);} + function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} + function getElementByTagName($name) {return $this->find($name, 0);} + function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} + function parentNode() {return $this->parent();} + function childNodes($idx=-1) {return $this->children($idx);} + function firstChild() {return $this->first_child();} + function lastChild() {return $this->last_child();} + function nextSibling() {return $this->next_sibling();} + function previousSibling() {return $this->prev_sibling();} + function hasChildNodes() {return $this->has_child();} + function nodeName() {return $this->tag;} + function appendChild($node) {$node->parent($this); return $node;} - // camel naming conventions - function getAllAttributes() {return $this->attr;} - function getAttribute($name) {return $this->__get($name);} - function setAttribute($name, $value) {$this->__set($name, $value);} - function hasAttribute($name) {return $this->__isset($name);} - function removeAttribute($name) {$this->__set($name, null);} - function getElementById($id) {return $this->find("#$id", 0);} - function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} - function getElementByTagName($name) {return $this->find($name, 0);} - function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} - function parentNode() {return $this->parent();} - function childNodes($idx=-1) {return $this->children($idx);} - function firstChild() {return $this->first_child();} - function lastChild() {return $this->last_child();} - function nextSibling() {return $this->next_sibling();} - function previousSibling() {return $this->prev_sibling();} } -// simple html dom parser -// ----------------------------------------------------------------------------- -class simple_html_dom { - public $root = null; - public $nodes = array(); - public $callback = null; - public $lowercase = false; - protected $pos; - protected $doc; - protected $char; - protected $size; - protected $cursor; - protected $parent; - protected $noise = array(); - protected $token_blank = " \t\r\n"; - protected $token_equal = ' =/>'; - protected $token_slash = " />\r\n\t"; - protected $token_attr = ' >'; - // use isset instead of in_array, performance boost about 30%... - protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); - protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); - protected $optional_closing_tags = array( - 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), - 'th'=>array('th'=>1), - 'td'=>array('td'=>1), - 'li'=>array('li'=>1), - 'dt'=>array('dt'=>1, 'dd'=>1), - 'dd'=>array('dd'=>1, 'dt'=>1), - 'dl'=>array('dd'=>1, 'dt'=>1), - 'p'=>array('p'=>1), - 'nobr'=>array('nobr'=>1), - ); +/** + * simple html dom parser + * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. + * Paperg - change $size from protected to public so we can easily access it + * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. + * + * @package PlaceLocalInclude + */ +class simple_html_dom +{ + public $root = null; + public $nodes = array(); + public $callback = null; + public $lowercase = false; + // Used to keep track of how large the text was when we started. + public $original_size; + public $size; + protected $pos; + protected $doc; + protected $char; + protected $cursor; + protected $parent; + protected $noise = array(); + protected $token_blank = " \t\r\n"; + protected $token_equal = ' =/>'; + protected $token_slash = " />\r\n\t"; + protected $token_attr = ' >'; + // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. + public $_charset = ''; + public $_target_charset = ''; + protected $default_br_text = ""; + public $default_span_text = ""; - function __construct($str=null) { - if ($str) { - if (preg_match("/^http:\/\//i",$str) || is_file($str)) - $this->load_file($str); - else - $this->load($str); - } - } + // use isset instead of in_array, performance boost about 30%... + protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); + protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); + // Known sourceforge issue #2977341 + // B tags that are not closed cause us to return everything to the end of the document. + protected $optional_closing_tags = array( + 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), + 'th'=>array('th'=>1), + 'td'=>array('td'=>1), + 'li'=>array('li'=>1), + 'dt'=>array('dt'=>1, 'dd'=>1), + 'dd'=>array('dd'=>1, 'dt'=>1), + 'dl'=>array('dd'=>1, 'dt'=>1), + 'p'=>array('p'=>1), + 'nobr'=>array('nobr'=>1), + 'b'=>array('b'=>1), + 'option'=>array('option'=>1), + ); - function __destruct() { - $this->clear(); - } + function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) + { + if ($str) + { + if (preg_match("/^http:\/\//i",$str) || is_file($str)) + { + $this->load_file($str); + } + else + { + $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); + } + } + // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. + if (!$forceTagsClosed) { + $this->optional_closing_array=array(); + } + $this->_target_charset = $target_charset; + } - // load html from string - function load($str, $lowercase=true) { - // prepare - $this->prepare($str, $lowercase); - // strip out comments - $this->remove_noise("''is"); - // strip out cdata - $this->remove_noise("''is", true); - // strip out