From 8df3db566a3a937b45ebf11adb90d265e6f5e2d4 Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Sun, 17 Nov 2019 20:45:02 +0100 Subject: initial checking of customized version 1.0rc9 --- .../library/HTMLPurifier/Lexer/DOMLex.php | 328 +++++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100644 vendor/ezyang/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php (limited to 'vendor/ezyang/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php') diff --git a/vendor/ezyang/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/vendor/ezyang/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php new file mode 100644 index 0000000..6238a99 --- /dev/null +++ b/vendor/ezyang/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php @@ -0,0 +1,328 @@ +factory = new HTMLPurifier_TokenFactory(); + } + + /** + * @param string $html + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return HTMLPurifier_Token[] + */ + public function tokenizeHTML($html, $config, $context) + { + $html = $this->normalize($html, $config, $context); + + // attempt to armor stray angled brackets that cannot possibly + // form tags and thus are probably being used as emoticons + if ($config->get('Core.AggressivelyFixLt')) { + $char = '[^a-z!\/]'; + $comment = "/|\z)/is"; + $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); + do { + $old = $html; + $html = preg_replace("/<($char)/i", '<\\1', $html); + } while ($html !== $old); + $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments + } + + // preprocess html, essential for UTF-8 + $html = $this->wrapHTML($html, $config, $context); + + $doc = new DOMDocument(); + $doc->encoding = 'UTF-8'; // theoretically, the above has this covered + + set_error_handler(array($this, 'muteErrorHandler')); + $doc->loadHTML($html); + restore_error_handler(); + + $body = $doc->getElementsByTagName('html')->item(0)-> // + getElementsByTagName('body')->item(0); // + + $div = $body->getElementsByTagName('div')->item(0); //
+ $tokens = array(); + $this->tokenizeDOM($div, $tokens, $config); + // If the div has a sibling, that means we tripped across + // a premature
tag. So remove the div we parsed, + // and then tokenize the rest of body. We can't tokenize + // the sibling directly as we'll lose the tags in that case. + if ($div->nextSibling) { + $body->removeChild($div); + $this->tokenizeDOM($body, $tokens, $config); + } + return $tokens; + } + + /** + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch + * @param DOMNode $node DOMNode to be tokenized. + * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. + * @return HTMLPurifier_Token of node appended to previously passed tokens. + */ + protected function tokenizeDOM($node, &$tokens, $config) + { + $level = 0; + $nodes = array($level => new HTMLPurifier_Queue(array($node))); + $closingNodes = array(); + do { + while (!$nodes[$level]->isEmpty()) { + $node = $nodes[$level]->shift(); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = new HTMLPurifier_Queue(); + foreach ($node->childNodes as $childNode) { + $nodes[$level]->push($childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while ($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + + /** + * Portably retrieve the tag name of a node; deals with older versions + * of libxml like 2.7.6 + * @param DOMNode $node + */ + protected function getTagName($node) + { + if (property_exists($node, 'tagName')) { + return $node->tagName; + } else if (property_exists($node, 'nodeName')) { + return $node->nodeName; + } else if (property_exists($node, 'localName')) { + return $node->localName; + } + return null; + } + + /** + * Portably retrieve the data of a node; deals with older versions + * of libxml like 2.7.6 + * @param DOMNode $node + */ + protected function getData($node) + { + if (property_exists($node, 'data')) { + return $node->data; + } else if (property_exists($node, 'nodeValue')) { + return $node->nodeValue; + } else if (property_exists($node, 'textContent')) { + return $node->textContent; + } + return null; + } + + + /** + * @param DOMNode $node DOMNode to be tokenized. + * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. + * @param bool $collect Says whether or start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @return bool if the token needs an endtoken + * @todo data and tagName properties don't seem to exist in DOMNode? + */ + protected function createStartNode($node, &$tokens, $collect, $config) + { + // intercept non element nodes. WE MUST catch all of them, + // but we're not getting the character reference nodes because + // those should have been preprocessed + if ($node->nodeType === XML_TEXT_NODE) { + $data = $this->getData($node); // Handle variable data property + if ($data !== null) { + $tokens[] = $this->factory->createText($data); + } + return false; + } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { + // undo libxml's special treatment of