Add Horde_Text_Filter_Cleanhtml::.
authorMichael M Slusarz <slusarz@curecanti.org>
Mon, 6 Jul 2009 20:11:01 +0000 (14:11 -0600)
committerMichael M Slusarz <slusarz@curecanti.org>
Mon, 6 Jul 2009 20:11:01 +0000 (14:11 -0600)
framework/Mime/lib/Horde/Mime/Viewer/Html.php
framework/Text_Filter/lib/Horde/Text/Filter/Cleanhtml.php [new file with mode: 0644]
framework/Text_Filter/package.xml

index bbe1e0e..a2b66dc 100644 (file)
@@ -112,10 +112,16 @@ class Horde_Mime_Viewer_html extends Horde_Mime_Viewer_Driver
                                     $browser->getMajor() == 4) ||
                                    $browser->isBrowser('msie'));
         $strip_styles = $inline || $strip_style_attributes;
-        $data = Horde_Text_Filter::filter($data, 'xss', array(
-            'body_only' => $inline,
-            'strip_styles' => $strip_styles,
-            'strip_style_attributes' => $strip_style_attributes
+
+        $data = Horde_Text_Filter::filter($data, array('cleanhtml', 'xss'), array(
+            array(
+                'charset' => $this->_mimepart->getCharset()
+            ),
+            array(
+                'body_only' => $inline,
+                'strip_styles' => $strip_styles,
+                'strip_style_attributes' => $strip_style_attributes
+            )
         ));
 
         /* Check for phishing exploits. */
diff --git a/framework/Text_Filter/lib/Horde/Text/Filter/Cleanhtml.php b/framework/Text_Filter/lib/Horde/Text/Filter/Cleanhtml.php
new file mode 100644 (file)
index 0000000..838d199
--- /dev/null
@@ -0,0 +1,76 @@
+<?php
+/**
+ * This filter attempts to sanitize HTML by cleaning up malformed HTML tags.
+ *
+ * Parameters:
+ * <pre>
+ * body_only - (boolean) Only return the body data?
+ *             DEFAULT: Return the whole HTML document
+ * charset - (string) The charset of the text.
+ *           DEFAULT: US-ASCII
+ * size - (integer) Only filter if the data is no larger than this size (in
+ *        bytes).
+ *        DEFAULT: false (no size limit)
+ * </pre>
+ *
+ * Copyright 2009 The Horde Project (http://www.horde.org/)
+ *
+ * See the enclosed file COPYING for license information (LGPL). If you
+ * did not receive this file, see http://www.fsf.org/copyleft/lgpl.html.
+ *
+ * @author  Michael Slusarz <slusarz@curecanti.org>
+ * @package Horde_Text
+ */
+class Horde_Text_Filter_Cleanhtml extends Horde_Text_Filter
+{
+    /**
+     * Filter parameters.
+     *
+     * - body_only: (boolean) Passed to tidy as its 'show-body-only' option.
+     * - charset: (string) Charset of the text handed to postProcess().
+     *            'us-ascii' (case-insensitive) is given to tidy directly;
+     *            any other value is converted to UTF-8 before parsing and
+     *            converted back afterwards.
+     * - size: (integer|false) Text longer than this many bytes is returned
+     *         unfiltered; false disables the size check.
+     *
+     * @var array
+     */
+    protected $_params = array(
+        'body_only' => false,
+        'charset' => 'us-ascii',
+        'size' => false
+    );
+
+    /**
+     * Runs the text through the tidy extension to repair malformed HTML.
+     *
+     * The text is returned unchanged when the tidy extension is not
+     * available, when the text exceeds the configured 'size' limit, or when
+     * tidy is unable to parse the text.
+     *
+     * @param string $text  The text after the filtering.
+     *
+     * @return string  The modified text.
+     */
+    public function postProcess($text)
+    {
+        if (!Horde_Util::extensionExists('tidy')) {
+            return $text;
+        }
+
+        /* strlen() is intentional: the limit is measured in bytes. */
+        $max_size = $this->_params['size'];
+        if (($max_size !== false) && (strlen($text) > $max_size)) {
+            return $text;
+        }
+
+        $tidy_config = array(
+            'enclose-block-text' => true,
+            'hide-comments' => true,
+            'indent' => true,
+            'indent-spaces' => 4,
+            'numeric-entities' => true,
+            'output-xhtml' => true,
+            'show-body-only' => !empty($this->_params['body_only']),
+            'tab-size' => 4,
+            'wrap' => 0
+        );
+
+        /* ASCII text can be handed to tidy as-is. Everything else is
+         * round-tripped through UTF-8, which tidy handles natively. */
+        $charset = $this->_params['charset'];
+        $is_ascii = (strtolower($charset) === 'us-ascii');
+
+        $input = $is_ascii
+            ? $text
+            : Horde_String::convertCharset($text, $charset, 'UTF-8');
+
+        $tidy = tidy_parse_string($input, $tidy_config, $is_ascii ? 'ascii' : 'utf8');
+        if ($tidy === false) {
+            /* tidy could not parse the data; calling methods on the failed
+             * result would be a fatal error, so fall back to the original
+             * text instead. */
+            return $text;
+        }
+
+        $tidy->cleanRepair();
+        $output = tidy_get_output($tidy);
+
+        return $is_ascii
+            ? $output
+            : Horde_String::convertCharset($output, 'UTF-8', $charset);
+    }
+
+}
index bee5156..57ecc50 100644 (file)
@@ -37,7 +37,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
   <api>beta</api>
  </stability>
  <license uri="http://www.gnu.org/copyleft/lesser.html">LGPL</license>
- <notes>* Initial Horde 4 package.
+ <notes>* Add support for using the tidy extension when filtering HTML data.
+ * Initial Horde 4 package.
  </notes>
  <contents>
   <dir name="/">
@@ -47,6 +48,7 @@ http://pear.php.net/dtd/package-2.0.xsd">
       <dir name="Filter">
        <file name="Bbcode.php" role="php" />
        <file name="Cleanascii.php" role="php" />
+       <file name="Cleanhtml.php" role="php" />
        <file name="Dimsignature.php" role="php" />
        <file name="Emails.php" role="php" />
        <file name="Emoticons.php" role="php" />
@@ -199,6 +201,7 @@ http://pear.php.net/dtd/package-2.0.xsd">
   <filelist>
    <install name="lib/Horde/Text/Filter/Bbcode.php" as="Horde/Text/Filter/Bbcode.php" />
    <install name="lib/Horde/Text/Filter/Cleanascii.php" as="Horde/Text/Filter/Cleanascii.php" />
+   <install name="lib/Horde/Text/Filter/Cleanhtml.php" as="Horde/Text/Filter/Cleanhtml.php" />
    <install name="lib/Horde/Text/Filter/Dimsignature.php" as="Horde/Text/Filter/Dimsignature.php" />
    <install name="lib/Horde/Text/Filter/Emails.php" as="Horde/Text/Filter/Emails.php" />
    <install name="lib/Horde/Text/Filter/Emoticons.php" as="Horde/Text/Filter/Emoticons.php" />