From 1b6d96ff95dbcc6009d4b5fe0d152e3b80880211 Mon Sep 17 00:00:00 2001 From: Michael M Slusarz Date: Fri, 23 Jul 2010 11:49:23 -0600 Subject: [PATCH] Horde_Mime_Viewer HTML viewer now uses DOM parsing. Additionally, refactor driver so that it (and things like the IMP extended driver) can reuse the DOMDocument object so that the HTML data does not need to be continually reparsed. --- framework/Mime/lib/Horde/Mime/Viewer/Html.php | 259 +++++++++++++++----------- imp/lib/Mime/Viewer/Html.php | 207 ++++++++++---------- imp/lib/tests/mime_viewer_html.phpt | 45 ++++- 3 files changed, 277 insertions(+), 234 deletions(-) diff --git a/framework/Mime/lib/Horde/Mime/Viewer/Html.php b/framework/Mime/lib/Horde/Mime/Viewer/Html.php index 00662fda8..92cd2b200 100644 --- a/framework/Mime/lib/Horde/Mime/Viewer/Html.php +++ b/framework/Mime/lib/Horde/Mime/Viewer/Html.php @@ -42,6 +42,13 @@ class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Driver protected $_phishWarn = false; /** + * Temp array for storing data when parsing the HTML document. + * + * @var array + */ + protected $_tmp = array(); + + /** * Return the full rendered version of the Horde_Mime_Part object. * * @return array See Horde_Mime_Viewer_Driver::render(). @@ -87,15 +94,15 @@ class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Driver * @param string $data The HTML data. * @param array $options Additional options: *
-     * 'charset' => (string) The charset of $data.
-     *              DEFAULT: The base part charset.
-     * 'inline' => (boolean) Are we viewing inline?
-     *             DEFAULT: false
-     * 'noprefetch' => (boolean) Disable DNS prefetching?
-     *                 DEFAULT: false
-     * 'phishing' => (boolean) Do phishing highlighting even if not viewing
-     *               inline.
-     *               DEFAULT: false.
+     * 'charset' - (string) The charset of $data.
+     *             DEFAULT: The base part charset.
+     * 'inline' - (boolean) Are we viewing inline?
+     *            DEFAULT: false
+     * 'noprefetch' - (boolean) Disable DNS prefetching?
+     *                DEFAULT: false
+     * 'phishing' - (boolean) Do phishing highlighting even if not viewing
+     *              inline.
+     *              DEFAULT: false.
      * 
* * @return string The cleaned HTML string. @@ -104,38 +111,10 @@ class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Driver { global $browser; - /* Deal with tags in the HTML, since they will screw up our own - * relative paths. */ - if (!empty($options['inline']) && - preg_match('/ ]*)"? ?\/?>/i', $data, $matches)) { - $base = $matches[1]; - if (substr($base, -1) != '/') { - $base .= '/'; - } - - /* Recursively call _cleanHTML() to prevent clever fiends from - * sneaking nasty things into the page via $base. */ - $base = $this->_cleanHTML($base, $options); - - /* Attempt to fix paths that were relying on a tag. */ - if (!empty($base)) { - $pattern = array('|src=(["\'])([^:"\']+)\1|i', - '|src=([^: >"\']+)|i', - '|href= *(["\'])([^:"\']+)\1|i', - '|href=([^: >"\']+)|i'); - $replace = array('src=\1' . $base . '\2\1', - 'src=' . $base . '\1', - 'href=\1' . $base . '\2\1', - 'href=' . $base . '\1'); - $data = preg_replace($pattern, $replace, $data); - } - } - - $strip_style_attributes = !empty($options['inline']) && - (($browser->isBrowser('mozilla') && - $browser->getMajor() == 4) || - $browser->isBrowser('msie')); - $strip_styles = !empty($options['inline']) || $strip_style_attributes; + $strip_style_attributes = (!empty($options['inline']) && + (($browser->isBrowser('mozilla') && + ($browser->getMajor() == 4)) || + $browser->isBrowser('msie'))); $data = Horde_Text_Filter::filter($data, array('cleanhtml', 'xss'), array( array( @@ -143,108 +122,164 @@ class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Driver ), array( 'noprefetch' => !empty($options['noprefetch']), - 'return_document' => empty($options['inline']), - 'strip_styles' => $strip_styles, + 'return_dom' => true, + 'strip_styles' => (!empty($options['inline']) || $strip_style_attributes), 'strip_style_attributes' => $strip_style_attributes ) )); - /* Check for phishing exploits. */ - if (!empty($options['inline']) || !empty($options['phishing'])) { - $data = $this->_phishingCheck($data); - } + $this->_tmp = array( + 'base' => null, + 'inline' => !empty($options['inline']), + 'phish' => ((!empty($options['inline']) || !empty($options['phishing'])) && $this->getConfigParam('phishing_check')) + ); + $this->_phishWarn = false; - /* Try to derefer all external references. */ - $data = preg_replace_callback('/href\s*=\s*(["\'])?((?(1)[^\1]*?|[^\s>]+))(?(1)\1|)/i', array($this, '_dereferCallback'), $data); + $this->_node($data, $data); - return $data; + return $data->saveHTML(); } /** - * TODO + * Process DOM node. * - * @param string $m TODO + * @param DOMDocument $doc Document node. + * @param DOMNode $node Node. + */ + protected function _node($doc, $node) + { + if ($node->hasChildNodes()) { + foreach ($node->childNodes as $child) { + if ($child instanceof DOMElement) { + switch (strtolower($child->tagName)) { + case 'base': + /* Deal with tags in the HTML, since they will + * screw up our own relative paths. */ + if ($this->_tmp['inline'] && + $child->hasAttribute('href')) { + $base = $child->getAttribute('href'); + if (substr($base, -1) != '/') { + $base .= '/'; + } + + $this->_tmp['base'] = $base; + $child->removeAttribute('href'); + } + break; + } + + foreach ($child->attributes as $val) { + /* Attempt to fix paths that were relying on a + * tag. */ + if (!is_null($this->_tmp['base']) && + in_array($val->name, array('href', 'src'))) { + $child->setAttribute($val->name, $this->_tmp['base'] . ltrim($val->value, '/')); + } + + if ($val->name == 'href') { + if ($this->_tmp['phish'] && + $this->_phishingCheck($val->value, $child->textContent)) { + $this->_phishWarn = true; + $child->setAttribute('style', ($child->hasAttribute('style') ? rtrim($child->getAttribute('style'), '; ') . ';' : '') . $this->_phishCss); + } + + /* Try to derefer all external references. */ + $child->setAttribute('href', Horde::externalUrl($val->value)); + } + } + } + + $this->_nodeCallback($doc, $child); + $this->_node($doc, $child); + } + } + } + + /** + * Process DOM node (callback). * - * @return string TODO + * @param DOMDocument $doc Document node. + * @param DOMNode $node Node. */ - protected function _dereferCallback($m) + protected function _nodeCallback($doc, $node) { - return 'href="' . Horde::externalUrl($m[2]) . '"'; } /** * Check for phishing exploits. * - * @param string $data The html data. - * @param boolean $scanonly Only scan data; don't replace anything. + * @param string $href The HREF value. + * @param string $text The text value of the link. * - * @return string The string, with phishing links highlighted. + * @return boolean True if phishing is detected. */ - protected function _phishingCheck($data, $scanonly = false) + protected function _phishingCheck($href, $text) { - $this->_phishWarn = false; + /* For phishing, we are checking whether the displayable text URL is + * the same as the HREF URL. If we can't parse the text URL, then we + * can't do phishing checks. */ + $text_url = @parse_url($text); + if (!$text_url) { + return false; + } + + $href_url = parse_url($href); - if (!$this->getConfigParam('phishing_check')) { - return $data; + /* Only concern ourselves with HTTP and FTP links. */ + if (!isset($text_url['scheme']) || + !in_array($text_url['scheme'], array('ftp', 'http', 'https'))) { + return false; } - if (preg_match('/href\s*=\s*["\']?\s*(http|https|ftp):\/\/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?:[^>]*>\s*(?:\\1:\/\/)?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[^<]*<\/a)?/i', $data, $m)) { - /* Check 1: Check for IP address links, but ignore if the link - * text has the same IP address. */ - if (!isset($m[3]) || ($m[2] != $m[3])) { - if (isset($m[3]) && !$scanonly) { - $data = preg_replace('/href\s*=\s*["\']?\s*(http|https|ftp):\/\/' . preg_quote($m[2], '/') . '(?:[^>]*>\s*(?:$1:\/\/)?' . preg_quote($m[3], '/') . '[^<]*<\/a)?/i', 'style="' . $this->_phishCss . '" $0', $data); - } - $this->_phishWarn = true; + /* Check for case where text is just the domain name. */ + if (!isset($text_url['host'])) { + if (!isset($text_url['path']) || + !preg_match("/^[^\.\s]+(?:\.[^\.\s]+)+$/", $text_url['path'])) { + return false; } - } elseif (preg_match_all('/href\s*=\s*["\']?\s*(?:http|https|ftp):\/\/([^\s"\'>]+)["\']?[^>]*>\s*(?:(?:http|https|ftp):\/\/)?(.*?)<\/a/is', $data, $m)) { - /* $m[1] = Link; $m[2] = Target - * Check 2: Check for links that point to a different host than - * the target url; if target looks like a domain name, check it - * against the link. */ - for ($i = 0, $links = count($m[0]); $i < $links; ++$i) { - $link = strtolower(urldecode($m[1][$i])); - $target = strtolower(preg_replace('/^(http|https|ftp):\/\//', '', strip_tags($m[2][$i]))); - if (preg_match('/^[-._\da-z]+\.[a-z]{2,}/i', $target) && - (strpos($link, $target) !== 0) && - (strpos($target, $link) !== 0)) { - /* Don't consider the link a phishing link if the domain - * is the same on both links (e.g. adtracking.example.com - * & www.example.com). */ - preg_match('/\.?([^\.\/]+\.[^\.\/]+)[\/?]/', $link, $host1); - preg_match('/\.?([^\.\/]+\.[^\.\/ ]+)([\/ ].*)?$/s', $target, $host2); - if (!(count($host1) && count($host2)) || - (strcasecmp($host1[1], $host2[1]) !== 0)) { - if (!$scanonly) { - $data = preg_replace('/href\s*=\s*["\']?\s*(?:http|https|ftp):\/\/' . preg_quote($m[1][$i], '/') . '["\']?[^>]*>\s*(?:(?:http|https|ftp):\/\/)?' . preg_quote($m[2][$i], '/') . '<\/a/is', 'style="' . $this->_phishCss . '" $0', $data); - } - $this->_phishWarn = true; - } + + $text_url['host'] = $text_url['path']; + } + + /* If port exists on link, and text link has scheme or port defined, + * do extra checks: + * 1. If port exists on text link, and doesn't match, this is + * phishing. + * 2. If port doesn't exist on text link, and port does not match + * defaults, this is phishing. */ + if (isset($href_url['port']) && + (isset($text_url['scheme']) || isset($text_url['port']))) { + if (!isset($text_url['port'])) { + switch ($text_url['scheme']) { + case 'ftp': + $text_url['port'] = 25; + break; + + case 'http': + $text_url['port'] = 80; + break; + + case 'https': + $text_url['port'] = 443; + break; } } - } - return $data; - } + if ($href_url['port'] != $text_url['port']) { + return false; + } + } - /** - * Returns any phishing warnings that should be shown to the user. - * - * @return array The status array. - */ - protected function _phishingStatus() - { - if (!$this->_phishWarn) { - return array(); + if (strcasecmp($href_url['host'], $text_url['host']) === 0) { + return false; } - return array( - 'class' => 'mimestatuswarning', - 'text' => array( - sprintf(_("%s: This message may not be from whom it claims to be. Beware of following any links in it or of providing the sender with any personal information."), _("Warning")), - _("The links that caused this warning have this background color:") . ' ' . _("EXAMPLE") . '.' - ) - ); + /* Don't consider the link a phishing link if the domain is the same + * on both links (e.g. adtracking.example.com & www.example.com). */ + $host1 = explode('.', $href_url['host']); + $host2 = explode('.', $text_url['host']); + + return (strcasecmp(implode('.', array_slice($host1, -2)), implode('.', array_slice($host2, -2))) !== 0); } } diff --git a/imp/lib/Mime/Viewer/Html.php b/imp/lib/Mime/Viewer/Html.php index d528951c1..69fe92ec3 100644 --- a/imp/lib/Mime/Viewer/Html.php +++ b/imp/lib/Mime/Viewer/Html.php @@ -18,26 +18,11 @@ class IMP_Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Html { /** - * Cached block image. - * - * @var string - */ - public $blockimg = null; - - /** - * The window target to use for links. - * Needed for testing purposes. - * - * @var string - */ - public $newwinTarget = null; - - /** * Temp array for storing data when parsing the HTML document. * * @var array */ - protected $_tmp = array(); + protected $_imptmp = array(); /** * This driver's display capabilities. @@ -149,17 +134,45 @@ class IMP_Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Html { $data = $this->_mimepart->getContents(); + /* Don't do IMP DOM processing if in mimp mode or converting to + * text. */ + if (($_SESSION['imp']['view'] == 'mimp') || + (!$inline && Horde_Util::getFormData('convert_text'))) { + $this->_imptmp = null; + } else { + $this->_imptmp = array( + 'blockimg' => null, + 'img' => ($inline && $GLOBALS['prefs']->getValue('html_image_replacement') && !$this->_inAddressBook()), + 'imgblock' => false, + 'inline' => $inline, + 'target' => 'target_' . uniqid(mt_rand()) + ); + + /* Image filtering. */ + if ($this->_imptmp['img']) { + $this->_imptmp['blockimg'] = Horde::url(Horde_Themes::img('spacer_red.png'), true, -1); + } + } + /* Sanitize the HTML. */ $data = $this->_cleanHTML($data, array( 'noprefetch' => ($inline && ($_SESSION['imp']['view'] != 'mimp')), 'phishing' => $inline )); - $status = array($this->_phishingStatus()); + $status = array(); + if ($this->_phishWarn) { + $status[] = array( + 'class' => 'mimestatuswarning', + 'text' => array( + sprintf(_("%s: This message may not be from whom it claims to be. Beware of following any links in it or of providing the sender with any personal information."), _("Warning")), + _("The links that caused this warning have this background color:") . ' ' . _("EXAMPLE") . '.' + ) + ); + } /* We are done processing if in mimp mode, or we are converting to * text. */ - if (($_SESSION['imp']['view'] == 'mimp') || - (!$inline && Horde_Util::getFormData('convert_text'))) { + if (is_null($this->_imptmp)) { $data = $GLOBALS['injector']->getInstance('Horde_Text_Filter')->filter($data, 'Html2text', array('wrap' => false)); // Filter bad language. @@ -170,9 +183,7 @@ class IMP_Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Html ); } - $data = $this->parseHtml($inline, $data)->saveHTML(); - - if ($this->_tmp['imgblock']) { + if ($this->_imptmp['imgblock']) { $status[] = array( 'icon' => Horde::img('mime/image.png'), 'text' => array( @@ -226,34 +237,6 @@ class IMP_Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Html } /** - */ - public function parseHtml($inline, $data) - { - $this->_tmp = array( - 'img' => ($inline && $GLOBALS['prefs']->getValue('html_image_replacement') && !$this->_inAddressBook()), - 'imgblock' => false, - 'inline' => $inline, - 'target' => (is_null($this->newwinTarget) ? 'target_' . uniqid(mt_rand()) : $this->newwinTarget) - ); - - /* Image filtering. */ - if ($this->_tmp['img'] && is_null($this->blockimg)) { - $this->blockimg = Horde::url(Horde_Themes::img('spacer_red.png'), true, -1); - } - - $old_error = libxml_use_internal_errors(true); - $doc = new DOMDocument(); - $doc->loadHTML($data); - if ($old_error) { - libxml_use_internal_errors(false); - } - - $this->_node($doc, $doc); - - return $doc; - } - - /** * Determine whether the sender appears in an available addressbook. * * @return boolean Does the sender appear in an addressbook? @@ -282,75 +265,73 @@ class IMP_Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Html } /** - * Process DOM node. + * Process DOM node (callback). * * @param DOMDocument $doc Document node. - * @param DOMElement $node Element node. + * @param DOMNode $node Node. */ - protected function _node($doc, $node) + protected function _nodeCallback($doc, $node) { - if ($node->hasChildNodes()) { - foreach ($node->childNodes as $child) { - if ($child instanceof DOMElement) { - switch (strtolower($child->tagName)) { - case 'a': - case 'area': - /* Convert links to open in new windows. Ignore - * mailto: links, links that have an "#xyz" anchor, - * and links that already have a target. */ - if (!$child->hasAttribute('target') && - $child->hasAttribute('href')) { - $url = parse_url($child->getAttribute('href')); - if ($url['scheme'] == 'mailto') { - $child->setAttribute('href', IMP::composeLink($child->getAttribute('href'))); - } elseif (empty($url['fragment'])) { - $child->setAttribute('target', $this->_tmp['target']); - } - } - break; - - case 'img': - case 'input': - if ($this->_tmp['img'] && $child->hasAttribute('src')) { - $child->setAttribute('htmlimgblocked', $child->getAttribute('src')); - $child->setAttribute('src', $this->blockimg); - $this->_tmp['imgblock'] = true; - } - break; - - case 'table': - /* If displaying inline (in IFRAME), tables with 100% - * height seems to confuse many browsers re: the - * iframe internal height. */ - if ($this->_tmp['inline'] && - $child->hasAttribute('height') && - ($child->getAttribute('height') == '100%')) { - $child->removeAttribute('height'); - } - - // Fall-through - - case 'body': - case 'td': - if ($this->_tmp['img'] && - $child->hasAttribute('background')) { - $child->setAttribute('htmlimgblocked', $child->getAttribute('background')); - $child->setAttribute('background', $this->blockimg); - $this->_tmp['imgblock'] = true; - } - break; - } + if (is_null($this->_imptmp)) { + return; + } - if ($this->_tmp['img'] && $child->hasAttribute('style')) { - $this->_tmp['child'] = $child; - $style = preg_replace_callback('/(background(?:-image)?:[^;}]*(?:url\(["\']?))(.*?)((?:["\']?\)))/i', array($this, '_styleCallback'), $child->getAttribute('style'), -1, $matches); - if ($matches) { - $child->setAttribute('style', $style); - } + if ($node instanceof DOMElement) { + switch (strtolower($node->tagName)) { + case 'a': + case 'area': + /* Convert links to open in new windows. Ignore + * mailto: links, links that have an "#xyz" anchor, + * and links that already have a target. */ + if (!$node->hasAttribute('target') && + $node->hasAttribute('href')) { + $url = parse_url($node->getAttribute('href')); + if (isset($url['scheme']) && ($url['scheme'] == 'mailto')) { + $node->setAttribute('href', IMP::composeLink($node->getAttribute('href'))); + } elseif (empty($url['fragment'])) { + $node->setAttribute('target', $this->_imptmp['target']); } } + break; + + case 'img': + case 'input': + if ($this->_imptmp['img'] && $node->hasAttribute('src')) { + $node->setAttribute('htmlimgblocked', $node->getAttribute('src')); + $node->setAttribute('src', $this->_imptmp['blockimg']); + $this->_imptmp['imgblock'] = true; + } + break; + + case 'table': + /* If displaying inline (in IFRAME), tables with 100% + * height seems to confuse many browsers re: the + * iframe internal height. */ + if ($this->_imptmp['inline'] && + $node->hasAttribute('height') && + ($node->getAttribute('height') == '100%')) { + $node->removeAttribute('height'); + } - $this->_node($doc, $child); + // Fall-through + + case 'body': + case 'td': + if ($this->_imptmp['img'] && + $node->hasAttribute('background')) { + $node->setAttribute('htmlimgblocked', $node->getAttribute('background')); + $node->setAttribute('background', $this->_imptmp['blockimg']); + $this->_imptmp['imgblock'] = true; + } + break; + } + + if ($this->_imptmp['img'] && $node->hasAttribute('style')) { + $this->_imptmp['node'] = $node; + $style = preg_replace_callback('/(background(?:-image)?:[^;\}]*(?:url\(["\']?))(.*?)((?:["\']?\)))/i', array($this, '_styleCallback'), $node->getAttribute('style'), -1, $matches); + if ($matches) { + $node->setAttribute('style', $style); + } } } } @@ -364,8 +345,8 @@ class IMP_Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Html */ protected function _styleCallback($matches) { - $this->_tmp['child']->setAttribute('htmlimgblocked', $matches[2]); - return $matches[1] . $this->blockimg . $matches[3]; + $this->_imptmp['node']->setAttribute('htmlimgblocked', $matches[2]); + return $matches[1] . $this->_imptmp['blockimg'] . $matches[3]; } } diff --git a/imp/lib/tests/mime_viewer_html.phpt b/imp/lib/tests/mime_viewer_html.phpt index 81976193b..ce660f771 100644 --- a/imp/lib/tests/mime_viewer_html.phpt +++ b/imp/lib/tests/mime_viewer_html.phpt @@ -9,12 +9,41 @@ Horde_Registry::appInit('imp', array( 'cli' => true )); -$mock_part = new Horde_Mime_Part(); -$mock_part->setType('text/html'); +// Suppress DOM parsing errors. +libxml_use_internal_errors(true); -$v = Horde_Mime_Viewer::factory($mock_part); -$v->blockimg = 'imgblock.png'; -$v->newwinTarget = '_blank'; +require_once dirname(__FILE__) . '/../Mime/Viewer/Html.php'; +class IMP_Html_Viewer_Test extends IMP_Horde_Mime_Viewer_Html +{ + public function runTest($html) + { + $this->_imptmp = array( + 'blockimg' => 'imgblock.png', + 'img' => true, + 'imgblock' => false, + 'inline' => true, + 'target' => '_blank' + ); + + $doc = DOMDocument::loadHTML($html); + $this->_node($doc, $doc); + + return $doc->saveXML($doc->getElementsByTagName('body')->item(0)->firstChild) . "\n"; + } + + protected function _node($doc, $node) + { + if ($node->hasChildNodes()) { + foreach ($node->childNodes as $child) { + $this->_nodeCallback($doc, $child); + $this->_node($doc, $child); + } + } + } + +} + +$v = new IMP_Html_Viewer_Test(new Horde_Mime_Part()); // Test regex for converting links to open in a new window. $links = array( @@ -30,8 +59,7 @@ $links = array( ); foreach ($links as $val) { - $doc = $v->parseHtml(true, $val); - echo $doc->saveXML($doc->getElementsByTagName('body')->item(0)->firstChild) . "\n"; + echo $v->runTest($val); } echo "\n"; @@ -49,8 +77,7 @@ $images = array( ); foreach ($images as $val) { - $doc = $v->parseHtml(true, $val); - echo $doc->saveXML($doc->getElementsByTagName('body')->item(0)->firstChild) . "\n"; + echo $v->runTest($val); } ?> -- 2.11.0