Bug #9190: Add Horde_Support_Domhtml.
authorMichael M Slusarz <slusarz@curecanti.org>
Fri, 20 Aug 2010 17:40:35 +0000 (11:40 -0600)
committerMichael M Slusarz <slusarz@curecanti.org>
Fri, 20 Aug 2010 18:12:11 +0000 (12:12 -0600)
Charset handling with libxml is less than ideal.  Abstract the
loadHTML() code out into a single place so that we can experiment and
fix things all at once instead of piecemeal.

Suggestions for where else this could reside?  It does require
Horde_String, so maybe horde/Util would be better.

framework/Mime_Viewer/lib/Horde/Mime/Viewer/Html.php
framework/Support/lib/Horde/Support/Domhtml.php [new file with mode: 0644]
framework/Support/package.xml
framework/Text_Filter/lib/Horde/Text/Filter/Html2text.php
framework/Text_Filter/lib/Horde/Text/Filter/Xss.php
framework/Text_Filter/package.xml
imp/lib/Compose.php
imp/lib/tests/mime_viewer_html.phpt
imp/view.php

index b42051a..6c3276d 100644 (file)
@@ -144,6 +144,7 @@ class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Base
                 'charset' => $charset
             ),
             array(
+                'charset' => $charset,
                 'noprefetch' => !empty($options['noprefetch']),
                 'return_dom' => true,
                 'strip_styles' => (!empty($options['inline']) || $strip_style_attributes),
@@ -158,9 +159,9 @@ class Horde_Mime_Viewer_Html extends Horde_Mime_Viewer_Base
         );
         $this->_phishWarn = false;
 
-        $this->_node($data, $data);
+        $this->_node($data->dom, $data->dom);
 
-        return Horde_String::convertCharset($data->saveHTML(), $data->encoding, $charset);
+        return $data->returnHtml();
     }
 
     /**
diff --git a/framework/Support/lib/Horde/Support/Domhtml.php b/framework/Support/lib/Horde/Support/Domhtml.php
new file mode 100644 (file)
index 0000000..61b9127
--- /dev/null
@@ -0,0 +1,86 @@
+<?php
+/**
+ * @category   Horde
+ * @package    Support
+ * @copyright  2010 The Horde Project (http://www.horde.org/)
+ * @license    http://opensource.org/licenses/bsd-license.php
+ */
+
+/**
+ * Utility class to help in loading DOM data from HTML strings.
+ *
+ * @author     Michael Slusarz <slusarz@horde.org>
+ * @category   Horde
+ * @package    Support
+ * @copyright  2010 The Horde Project (http://www.horde.org/)
+ * @license    http://opensource.org/licenses/bsd-license.php
+ */
+class Horde_Support_Domhtml
+{
+    /**
+     * DOM object.
+     *
+     * @var DOMDocument
+     */
+    public $dom;
+
+    /**
+     * Charset/encoding used in object.
+     *
+     * @var string
+     */
+    public $encoding;
+
+    /**
+     * Original charset of data.
+     *
+     * @var string
+     */
+    protected $_origCharset;
+
+    /**
+     * @param string $text
+     * @param string $charset
+     *
+     * @throws Exception
+     */
+    public function __construct($text, $charset = null)
+    {
+        if (!extension_loaded('dom')) {
+            throw new Exception('DOM extension is not available.');
+        }
+
+        $this->_origCharset = $charset;
+
+        $old_error = libxml_use_internal_errors(true);
+        $doc = new DOMDocument();
+        $doc->loadHTML($text);
+        $this->encoding = $doc->encoding;
+
+        if (!is_null($charset)) {
+            if (!$doc->encoding) {
+                $doc->loadHTML(Horde_String::convertCharset($text, $charset, 'ISO-8859-1'));
+                $this->encoding = null;
+            } elseif ($doc->encoding != $charset) {
+                /* If libxml can't auto-detect encoding, convert to what it
+                 * *thinks* the encoding should be. */
+                $doc->loadHTML(Horde_String::convertCharset($text, $charset, $doc->encoding));
+            }
+        }
+
+        if ($old_error) {
+            libxml_use_internal_errors(false);
+        }
+
+        $this->dom = $doc;
+    }
+
+    /**
+     * @return string
+     */
+    public function returnHtml()
+    {
+        return Horde_String::convertCharset($this->dom->saveHTML(), $this->encoding, $this->_origCharset);
+    }
+
+}
index 2e655ff..2e9fc1e 100644 (file)
@@ -21,7 +21,8 @@
   <api>beta</api>
  </stability>
  <license uri="http://opensource.org/licenses/bsd-license.php">BSD</license>
- <notes> * Add Horde_Support_Randomid::.
+ <notes>* Add Horde_Support_Domhtml::.
+ * Add Horde_Support_Randomid::.
  * Add Portuguese numerizer.
  </notes>
  <contents>
@@ -40,6 +41,7 @@
       <file name="Backtrace.php" role="php" />
       <file name="CombineStream.php" role="php" />
       <file name="ConsistentHash.php" role="php" />
+      <file name="Domhtml.php" role="php" />
       <file name="Guid.php" role="php" />
       <file name="Inflector.php" role="php" />
       <file name="Numerizer.php" role="php" />
     <channel>pear.horde.org</channel>
    </package>
   </required>
+  <optional>
+   <extension>
+    <name>dom</name>
+   </extension>
+  </optional>
  </dependencies>
  <phprelease>
   <filelist>
    <install as="Horde/Support/Backtrace.php" name="lib/Horde/Support/Backtrace.php" />
    <install as="Horde/Support/CombineStream.php" name="lib/Horde/Support/CombineStream.php" />
    <install as="Horde/Support/ConsistentHash.php" name="lib/Horde/Support/ConsistentHash.php" />
+   <install as="Horde/Support/Domhtml.php" name="lib/Horde/Support/Domhtml.php" />
    <install as="Horde/Support/Guid.php" name="lib/Horde/Support/Guid.php" />
    <install as="Horde/Support/Inflector.php" name="lib/Horde/Support/Inflector.php" />
    <install as="Horde/Support/Numerizer.php" name="lib/Horde/Support/Numerizer.php" />
index b89ab29..cbd5362 100644 (file)
@@ -106,21 +106,10 @@ class Horde_Text_Filter_Html2text extends Horde_Text_Filter_Base
      */
     public function postProcess($text)
     {
-        if (extension_loaded('dom')) {
-            $old_error = libxml_use_internal_errors(true);
-            $doc = new DOMDocument();
-            $doc->loadHTML($text);
-            if (!$doc->encoding) {
-                /* If libxml can't auto-detect encoding, convert to ISO-8859-1
-                 * manually. */
-                $doc->loadHTML(Horde_String::convertCharset($text, $this->_params['charset'], 'ISO-8859-1'));
-            }
-            if ($old_error) {
-                libxml_use_internal_errors(false);
-            }
-
-            $text = Horde_String::convertCharset($this->_node($doc, $doc), $doc->encoding, $this->_params['charset']);
-        } else {
+        try {
+            $dom = new Horde_Support_Domhtml($text, $this->_params['charset']);
+            $text = Horde_String::convertCharset($this->_node($dom->dom, $dom->dom), $dom->encoding, $this->_params['charset']);
+        } catch (Exception $e) {
             $text = strip_tags(preg_replace("/\<br\s*\/?\>/i", "\n", $text));
         }
 
index 4569718..d24e42f 100644 (file)
@@ -15,8 +15,8 @@
  *                     the document.
  *                     DEFAULT: false (returns the contents contained inside
  *                              the BODY tag)
- * 'return_dom' - (boolean) If true, return a DOMDocument object instead of
- *                HTML text (overrides return_document).
+ * 'return_dom' - (boolean) If true, return a Horde_Support_Domhtml object
+ *                instead of HTML text (overrides return_document).
  *                DEFAULT: false
  * 'strip_styles' - (boolean) Strip style tags?
  *                  DEFAULT: true
@@ -94,59 +94,52 @@ class Horde_Text_Filter_Xss extends Horde_Text_Filter_Base
      *
      * @param string $text  The text after the filtering.
      *
-     * @return string|DOMDocument  The modified text or a DOMDocument object
-     *                             if the 'return_dom' parameter is set.
+     * @return string|Horde_Support_Domhtml  The modified text or a Domhtml
+     *                                       object if the 'return_dom'
+     *                                       parameter is set.
      */
     public function postProcess($text)
     {
-        if (!extension_loaded('dom')) {
+        try {
+            $dom = new Horde_Support_Domhtml($text, $this->_params['charset']);
+        } catch (Exception $e) {
             return $text;
         }
 
-        $old_error = libxml_use_internal_errors(true);
-        $doc = new DOMDocument();
-        $doc->loadHTML($text);
-        if (!$doc->encoding) {
-            /* If libxml can't auto-detect encoding, convert to ISO-8859-1
-             * manually. */
-            $doc->loadHTML(Horde_String::convertCharset($text, $this->_params['charset'], 'ISO-8859-1'));
-        }
-        if ($old_error) {
-            libxml_use_internal_errors(false);
-        }
-
-        $this->_node($doc, $doc);
+        $this->_node($dom->dom, $dom->dom);
 
         if (!$this->_params['return_document']) {
-            $body = $doc->getElementsByTagName('body')->item(0);
+            $body = $dom->dom->getElementsByTagName('body')->item(0);
         }
 
         if ($this->_params['noprefetch']) {
-            $meta = $doc->createElement('meta');
+            $meta = $dom->dom->createElement('meta');
             $meta->setAttribute('http-equiv', 'x-dns-prefetch-control');
             $meta->setAttribute('value-equiv', 'off');
 
             if ($this->_params['return_document']) {
-                $doc->getElementsByTagName('head')->item(0)->appendChild($meta);
+                $dom->dom->getElementsByTagName('head')->item(0)->appendChild($meta);
             } elseif ($body) {
                 $body->appendChild($meta);
             }
         }
 
         if ($this->_params['return_dom']) {
-            return $doc;
+            return $dom;
         }
 
-        $text = '';
         if ($this->_params['return_document']) {
-            $text = $doc->saveHTML();
-        } elseif ($body && $body->hasChildNodes()) {
+            return $dom->returnHtml();
+        }
+
+        $text = '';
+        if ($body && $body->hasChildNodes()) {
             foreach ($body->childNodes as $child) {
-                $text .= $doc->saveXML($child);
+                $text .= $dom->dom->saveXML($child);
             }
         }
 
-        return Horde_String::convertCharset($text, $doc->encoding, $this->_params['charset']);
+        return Horde_String::convertCharset($text, $dom->encoding, $this->_params['charset']);
     }
 
     /**
index a5d26df..688c546 100644 (file)
     <channel>pear.horde.org</channel>
    </package>
    <package>
+    <name>Support</name>
+    <channel>pear.horde.org</channel>
+   </package>
+   <package>
     <name>Util</name>
     <channel>pear.horde.org</channel>
    </package>
index b83e97b..fe0b2d3 100644 (file)
@@ -2480,6 +2480,7 @@ class IMP_Compose
         $type = $part->getType();
         $part_charset = $part->getCharset();
         $charset = $GLOBALS['registry']->getCharset();
+
         $msg = Horde_String::convertCharset($part->getContents(), $part_charset, $charset);
 
         /* Enforce reply limits. */
@@ -2492,9 +2493,9 @@ class IMP_Compose
         }
 
         if ($mode == 'html') {
-            $msg = $GLOBALS['injector']->getInstance('Horde_Text_Filter')->filter($msg, array('cleanhtml', 'xss'), array(array('body_only' => true), array('charset' => $charset, 'strip_styles' => true, 'strip_style_attributes' => false)));
+            $msg = $GLOBALS['injector']->getInstance('Horde_Text_Filter')->filter($msg, array('Cleanhtml', 'Xss'), array(array('body_only' => true), array('charset' => $charset, 'strip_styles' => true, 'strip_style_attributes' => false)));
         } elseif ($type == 'text/html') {
-            $msg = $GLOBALS['injector']->getInstance('Horde_Text_Filter')->filter($msg, 'html2text');
+            $msg = $GLOBALS['injector']->getInstance('Horde_Text_Filter')->filter($msg, 'Html2text', array('charset' => $charset));
             $type = 'text/plain';
         }
 
index ce660f7..2a7a1f8 100644 (file)
@@ -9,9 +9,6 @@ Horde_Registry::appInit('imp', array(
     'cli' => true
 ));
 
-// Suppress DOM parsing errors.
-libxml_use_internal_errors(true);
-
 require_once dirname(__FILE__) . '/../Mime/Viewer/Html.php';
 class IMP_Html_Viewer_Test extends IMP_Horde_Mime_Viewer_Html
 {
@@ -25,10 +22,10 @@ class IMP_Html_Viewer_Test extends IMP_Horde_Mime_Viewer_Html
             'target' => '_blank'
         );
 
-        $doc = DOMDocument::loadHTML($html);
-        $this->_node($doc, $doc);
+        $dom = new Horde_Support_Domhtml($html);
+        $this->_node($dom->dom, $dom->dom);
 
-        return $doc->saveXML($doc->getElementsByTagName('body')->item(0)->firstChild) . "\n";
+        return $dom->dom->saveXML($dom->dom->getElementsByTagName('body')->item(0)->firstChild) . "\n";
     }
 
     protected function _node($doc, $node)
index 4ef8454..6557a8b 100644 (file)
@@ -291,22 +291,12 @@ case 'print_attach':
                 if ($browser->isBrowser('mozilla')) {
                     $pstring = Horde_Mime::decodeParam('content-type', $render[$key]['type']);
 
-                    $old_error = libxml_use_internal_errors(true);
-                    $doc = new DOMDocument();
-                    $doc->loadHTML($render[$key]['data']);
-                    if (!$doc->encoding) {
-                        /* If libxml can't auto-detect encoding, convert to
-                         * ISO-8859-1 manually. */
-                        $doc->loadHTML(Horde_String::convertCharset($render[$key]['data'], $pstring['params']['charset'], 'ISO-8859-1'));
-                    }
-                    if (!$old_error) {
-                        libxml_use_internal_errors(false);
-                    }
+                    $doc = new Horde_Support_Domhtml($render[$key]['data'], $pstring['params']['charset']);
 
-                    $bodyelt = $doc->getElementsByTagName('body')->item(0);
-                    $bodyelt->insertBefore($doc->importNode($div, true), $bodyelt->firstChild);
+                    $bodyelt = $doc->dom->getElementsByTagName('body')->item(0);
+                    $bodyelt->insertBefore($doc->dom->importNode($div, true), $bodyelt->firstChild);
 
-                    echo Horde_String::convertCharset($doc->saveHTML(), $doc->encoding, $pstring['params']['charset']);
+                    echo $doc->returnHtml();
                 } else {
                     echo $render[$key]['data'];
                 }