Use XML parser to generate Html2text output.
author Michael M Slusarz <slusarz@curecanti.org>
Tue, 23 Feb 2010 06:46:23 +0000 (23:46 -0700)
committer Michael M Slusarz <slusarz@curecanti.org>
Tue, 23 Feb 2010 08:50:12 +0000 (01:50 -0700)
framework/Text_Filter/lib/Horde/Text/Filter/Html2text.php
framework/Text_Filter/package.xml
framework/Text_Filter/test/Horde/Text/Filter/fixtures/html2text.html
framework/Text_Filter/test/Horde/Text/Filter/html2text.phpt
framework/Text_Filter/test/Horde/Text/Filter/html2text3.phpt

index 698aa1b..460d0a7 100644 (file)
@@ -9,14 +9,11 @@
  * wrap - (boolean) Whether to wrap the text or not.
  * </pre>
  *
- * Copyright 2003-2004 Jon Abernathy <jon@chuggnutt.com>
- * Original source: http://www.chuggnutt.com/html2text.php
  * Copyright 2004-2010 The Horde Project (http://www.horde.org/)
  *
  * See the enclosed file COPYING for license information (LGPL). If you
  * did not receive this file, see http://www.fsf.org/copyleft/lgpl.html.
  *
- * @author  Jon Abernathy <jon@chuggnutt.com>
  * @author  Jan Schneider <jan@horde.org>
  * @author  Michael Slusarz <slusarz@horde.org>
  * @package Horde_Text
@@ -31,30 +28,38 @@ class Horde_Text_Filter_Html2text extends Horde_Text_Filter_Base
     protected $_linkList = array();
 
     /**
+     * Current list indentation level.
+     *
+     * @var integer
+     */
+    protected $_indent = 0;
+
+    /**
+     * Current blockquote level.
+     *
+     * @var integer
+     */
+    protected $_bqlevel = 0;
+
+    /**
+     * Current blockquote data.
+     *
+     * @var array
+     */
+    protected $_bqdata = array();
+
+    /**
      * Filter parameters.
      *
      * @var array
      */
     protected $_params = array(
-        'charset' => 'ISO-8859-1',
+        'charset' => 'UTF-8',
         'width' => 70,
         'wrap' => true
     );
 
     /**
-     * Executes any code necessary before applying the filter patterns.
-     *
-     * @param string $text  The text before the filtering.
-     *
-     * @return string  The modified text.
-     */
-    public function preProcess($text)
-    {
-        $this->_linkList = array();
-        return trim($text);
-    }
-
-    /**
      * Returns a hash with replace patterns.
      *
      * @return array  Patterns hash.
@@ -62,96 +67,32 @@ class Horde_Text_Filter_Html2text extends Horde_Text_Filter_Base
     public function getPatterns()
     {
         $replace = array(
-            // Non-legal carriage return.
-            '/\r/' => ''
-        );
-
-        $regexp = array(
-            // Newlines and tabs.
-            '/[\n\t]+/' => ' ',
-
-            // Normalize <br> (remove leading/trailing whitespace)
-            '/\s*<br[^>]*>\s*/i' => '<br>',
-
-            // <script>s -- which strip_tags() supposedly has problems with.
-            '/<script(?:>|\s[^>]*>).*?<\/script\s*>/i' => '',
-
-            // <style>s -- which strip_tags() supposedly has problems with.
-            '/<style(?:>|\s[^>]*>).*?<\/style\s*>/i' => '',
-
-            // h1 - h3
-            '/<h[123](?:>|\s[^>]*>)(.+?)<\/h[123]\s*>/ie' => '"<br><br>" . strtoupper("\\1") . "<br><br>"',
-
-            // h4 - h6
-            '/<h[456](?:>|\s[^>]*>)(.+?)<\/h[456]\s*> ?/ie' => '"<br><br>" . ucwords("\\1") . "<br><br>"',
-
-            // <p>
-            '/\s*<p(?:>|\s[^>]*>)\s*/i' => '<br><br>',
-
-            // <div>
-            '/\s*<div(?:>|\s[^>]*>)\s*/i' => '<br>',
-
-            // <b>
-            '/<b(?:>|\s[^>]*>)(.+?)<\/b>/ie' => 'strtoupper("\\1")',
-
-            // <strong>
-            '/<strong(?:>|\s[^>]*>)(.+?)<\/strong>/ie' => 'strtoupper("\\1")',
-            '/<span\s+style="font-weight:\s*bold.*">(.+?)<\/span>/ie' => 'strtoupper("\\1")',
-
-            // <i>
-            '/<i(?:>|\s[^>]*>)(.+?)<\/i>/i' => '/\\1/',
-
-            // <em>
-            '/<em(?:>|\s[^>]*>)(.+?)<\/em>/i' => '_\\1_',
-
-            // <u>
-            '/<u(?:>|\s[^>]*>)(.+?)<\/u>/i' => '_\\1_',
-
-            // <ul>/<ol> and </ul>/</ol>
-            '/\s*(<(u|o)l(?:>|\s[^>]*>)| ?<\/(u|o)l\s*>)\s*/i' => '<br><br>',
-
-            // <li>
-            '/\s*<li(?:>|\s[^>]*>)\s*/i' => '<br>  * ',
-
-            // <hr>
-            '/\s*<hr(?:>|\s[^>]*>)\s*/i' => '<br>-------------------------<br>',
-
-            // <table> and </table>
-            '/\s*(<table(?:>|\s[^>]*>)| ?<\/table\s*>)\s*/i' => '<br><br>',
-
-            // <tr>
-            '/\s*<tr(?:>|\s[^>]*>)\s*/i' => '<br>',
-
-            // <td> and </td>
-            '/\s*<td(?:>|\s[^>]*>)(.+?)<\/td>\s*/i' => '\\1<br>',
-
-            // <th> and </th>
-            '/\s*<th(?:>|\s[^>]*>)(.+?)<\/th>\s*/ie' => 'strtoupper("\\1") . "<br>"',
-
-            // Some mailers (e.g. Hotmail) use the following div tag as a way
-            // to define a block of text.
-            '/<div class="?rte"?>(.+?)<\/div> ?/i' => '\\1<br>',
-
-            // Cite blocks.
-            '/\s*<blockquote\s+[^>]*(?:type="?cite"?|class="?gmail_quote"?)[^>]*>\s*/i' => '<hordecite>',
-
-            // <br>
-            '/<br[^>]*>/i' => "\n"
-        );
-
-        $regexp_callback = array(
-            // <a href="">
-            '/<a href="([^"]+)"[^>]*>(.+?)<\/a>/i' => array($this, 'buildLinkList')
+            "\r" => '',
+            "\n" => ' ',
+            "\t" => ' '
         );
 
         return array(
-            'regexp' => $regexp,
-            'regexp_callback' => $regexp_callback,
             'replace' => $replace
         );
     }
 
     /**
+     * Executes any code necessary before applying the filter patterns.
+     *
+     * @param string $text  The text before the filtering.
+     *
+     * @return string  The modified text.
+     */
+    public function preProcess($text)
+    {
+        $this->_bqlevel = $this->_indent = 0;
+        $this->_bqdata = $this->_linkList = array();
+
+        return $text;
+    }
+
+    /**
      * Executes any code necessary after applying the filter patterns.
      *
      * @param string $text  The text after the filtering.
@@ -160,20 +101,23 @@ class Horde_Text_Filter_Html2text extends Horde_Text_Filter_Base
      */
     public function postProcess($text)
     {
-        /* Convert blockquote tags. */
-        return $text;
-     //   if (strpos($text, chr(0)) !== false) {
-        //    $text = $this->_blockQuote($text);
-//        }
+        $text = Horde_String::convertCharset($text, $this->_params['charset'], 'UTF-8');
 
-        /* Strip any other HTML tags. */
-        $text = strip_tags($text);
+        if (extension_loaded('dom')) {
+            $old_error = libxml_use_internal_errors(true);
+            $doc = DOMDocument::loadHTML('<?xml encoding="UTF-8">' . $text);
+            if ($old_error) {
+                libxml_use_internal_errors(false);
+            }
+            $text = $this->_node($doc, $doc);
+        }
 
-        /* Convert HTML entities. */
-        $text = html_entity_decode($text, ENT_QUOTES, $this->_params['charset']);
+        /* Strip HTML tags and convert HTML entities. */
+        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
 
-        /* Bring down number of empty lines to 2 max. */
-        $text = preg_replace(array("/\n[[:space:]]+\n/", "/[\n]{3,}/"), "\n\n", $text);
+        /* Bring down number of empty lines to 2 max, and remove trailing
+         * ws. */
+        $text = preg_replace(array("/\n[[:space:]]+\n/", "/[\n]{3,}/", "/ +\n/"), array("\n\n", "\n\n", "\n"), $text);
 
         /* Wrap the text to a readable format. */
         if ($this->_params['wrap']) {
@@ -189,74 +133,200 @@ class Horde_Text_Filter_Html2text extends Horde_Text_Filter_Base
             }
         }
 
-        return trim(rtrim($text), "\n");
-    }
-
-    /**
-     * Replaces blockquote tags with > quotes.
-     *
-     * @param string $text  The text to quote.
-     *
-     * @return string  The quoted text.
-     */
-    protected function _blockQuote($text)
-    {
-        return preg_replace(
-            '/([^\x00\x01]*)\x00(((?>[^\x00\x01]*)|(?R))*)\x01([^\x00\x01]*)/se',
-            "stripslashes('$1') . \"\n\n\" . \$this->_quote('$2') . \"\n\n\" . stripslashes('$4')",
-            $text);
+        return ltrim(rtrim($text), "\n");
     }
 
     /**
-     * Quotes a chunk of text.
+     * Process DOM node.
      *
-     * @param string $text  The text to quote.
+     * @param DOMDocument $doc  Document node.
+     * @param DOMElement $node  Element node.
      *
-     * @return string  The quoted text.
+     * @return string  The plaintext representation.
      */
-    protected function _quote($text)
+    protected function _node($doc, $node)
     {
-        $text = stripslashes($text);
-        if (strpos($text, chr(0)) !== false) {
-            $text = stripslashes($this->_blockQuote($text));
-        }
-
-        $text = rtrim(strip_tags($text));
-        if ($this->_params['wrap']) {
-            $text = wordwrap($text, $this->_params['width'] - 2);
+        $out = '';
+
+        if ($node->hasChildNodes()) {
+            foreach ($node->childNodes as $child) {
+                if ($child instanceof DOMElement) {
+                    switch (strtolower($child->tagName)) {
+                    case 'h1':
+                    case 'h2':
+                    case 'h3':
+                        $out .= "\n\n" .
+                            strtoupper($this->_node($doc, $child)) .
+                            "\n\n";
+                        break;
+
+                    case 'h4':
+                    case 'h5':
+                    case 'h6':
+                        $out .= "\n\n" .
+                            ucwords($this->_node($doc, $child))
+                            . "\n\n";
+                        break;
+
+                    case 'b':
+                    case 'strong':
+                        $out .= strtoupper($this->_node($doc, $child));
+                        break;
+
+                    case 'u':
+                        $out .= '_' . $this->_node($doc, $child) . '_';
+                        break;
+
+                    case 'em':
+                    case 'i':
+                        $out .= '/' . $this->_node($doc, $child) . '/';
+                        break;
+
+                    case 'hr':
+                        $out .= "\n-------------------------\n";
+                        break;
+
+                    case 'ol':
+                    case 'ul':
+                        ++$this->_indent;
+                        $out .= "\n\n" . $this->_node($doc, $child) . "\n\n";
+                        --$this->_indent;
+                        break;
+
+                    case 'p':
+                        if ($tmp = $this->_node($doc, $child)) {
+                            $out .= "\n" . $tmp . "\n";
+                        }
+                        break;
+
+                    case 'table':
+                        if ($tmp = $this->_node($doc, $child)) {
+                            $out .= "\n\n" . $tmp . "\n\n";
+                        }
+                        break;
+
+                    case 'tr':
+                        $out .= "\n  " . rtrim($this->_node($doc, $child));
+                        break;
+
+                    case 'th':
+                        $out .= strtoupper($this->_node($doc, $child)) . " \t";
+                        break;
+
+                    case 'td':
+                        $out .= $this->_node($doc, $child) . " \t";
+                        break;
+
+                    case 'li':
+                        $out .= "\n" . str_repeat('  ', $this->_indent) . '* ' . $this->_node($doc, $child);
+                        break;
+
+                    case 'a':
+                        $out .= $this->_node($doc, $child) . $this->_buildLinkList($doc, $child);
+                        break;
+
+                    case 'blockquote':
+                        if ($this->_bqlevel) {
+                            $this->_bqdata[] = array(
+                                'level' => $this->_bqlevel,
+                                'text' => $out
+                            );
+                            $out = '';
+                        }
+                        ++$this->_bqlevel;
+                        $this->_bqdata[] = array(
+                            'level' => $this->_bqlevel,
+                            'text' => $this->_node($doc, $child)
+                        );
+                        --$this->_bqlevel;
+
+                        if (!$this->_bqlevel) {
+                            $out .= "\n\n";
+                            foreach ($this->_bqdata as $val) {
+                                if (empty($val['text'])) {
+                                    continue;
+                                }
+
+                                if ($this->_params['wrap']) {
+                                    $tmp = array();
+                                    foreach (explode("\n", $val['text']) as $val2) {
+                                        $tmp = array_merge($tmp, explode("\n", wordwrap($val2, $this->_params['width'] - (2 * $val['level']))));
+                                    }
+                                } else {
+                                    $tmp = $val['text'];
+                                }
+
+                                /* Clean out empty entries. */
+                                for ($i = 0, $cnt = count($tmp); $i < $cnt; ++$i) {
+                                    if (!empty($tmp[$i])) {
+                                        break;
+                                    }
+                                    unset($tmp[$i]);
+                                }
+
+                                $tmp = array_values($tmp);
+
+                                for ($i = count($tmp); $i >= 0; --$i) {
+                                    if (!empty($tmp[$i])) {
+                                        break;
+                                    }
+                                    unset($tmp[$i]);
+                                }
+
+                                foreach ($tmp as $val2) {
+                                    $out .= str_repeat("> ", $val['level']) . rtrim($val2) . "\n";
+                                }
+                            }
+                            $out .= "\n\n";
+                            $this->_bqdata = array();
+                        }
+                        break;
+
+                    case 'div':
+                        $out .= $this->_node($doc, $child) . "\n";
+                        break;
+
+                    case 'br':
+                        $out .= "\n";
+                        break;
+
+                    default:
+                        $out .= $this->_node($doc, $child);
+                        break;
+                    }
+                } elseif (($child instanceof DOMText) &&
+                          !$child->isWhitespaceInElementContent()) {
+                    $tmp = $child->textContent;
+                    if ($child->parentNode->tagName == 'body' ||
+                        !$child->previousSibling) {
+                        $tmp = ltrim($tmp);
+                    }
+                    if (!$child->nextSibling) {
+                        $tmp = rtrim($tmp);
+                    }
+                    $out .= $tmp;
+                }
+            }
         }
 
-        return preg_replace(array('/^/m', '/(\n>\s*$){3,}/m', '/^>\s+$/m'),
-                            array('> ', "\n> ", '>'),
-                            $text);
+        return $out;
     }
 
     /**
-     * Helper function called by preg_replace() on link replacement.
-     *
      * Maintains an internal list of links to be displayed at the end
      * of the text, with numeric indices to the original point in the
      * text they appeared.
      *
-     * @param array $matches  Match information:
-     * <pre>
-     * [1] URL of the link.
-     * [2] Part of the text to associate number with.
-     * </pre>
-     *
-     * @return string  The link replacement.
+     * @param DOMDocument $doc  Document node.
+     * @param DOMElement $node  Element node.
      */
-    public function buildLinkList($matches)
+    protected function _buildLinkList($doc, $node)
     {
-        $link = $matches[1];
-        $display = $matches[2];
-
-        if ($link == strip_tags($display)) {
-            return $display;
-        }
+        $link = $node->getAttribute('href');
+        $display = $node->textContent;
 
         $parsed_link = parse_url($link);
-        $parsed_display = parse_url(strip_tags(preg_replace('/^&lt;|&gt;$/', '', $display)));
+        $parsed_display = parse_url($display);
 
         if (isset($parsed_link['path'])) {
             $parsed_link['path'] = trim($parsed_link['path'], '/');
@@ -282,12 +352,15 @@ class Horde_Text_Filter_Html2text extends Horde_Text_Filter_Base
              (isset($parsed_link['path']) &&
               isset($parsed_display['path']) &&
               $parsed_link['path'] == $parsed_display['path']))) {
-            return $display;
+            return '';
         }
 
-        $this->_linkList[] = $link;
+        if (($pos = array_search($link, $this->_linkList)) === false) {
+            $this->_linkList[] = $link;
+            $pos = count($this->_linkList) - 1;
+        }
 
-        return $display . '[' . count($this->_linkList) . ']';
+        return '[' . ($pos + 1) . ']';
     }
 
 }
index 06d7d7a..e911331 100644 (file)
@@ -37,7 +37,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
   <api>beta</api>
  </stability>
  <license uri="http://www.gnu.org/copyleft/lesser.html">LGPL</license>
- <notes>* Add ability to define filters to use with preg_replace_callback().
+ <notes>* Html2text converter now uses XML parser to generate output.
+ * Add ability to define filters to use with preg_replace_callback().
  * Add 'noprefetch' parameter to XSS filter (Ticket #8836).
  * Add XSS filtering for data URLs in A HREF parameters (Bug #8715).
  * Add support for Google Closure Compiler in javascript minfiy filter.
@@ -212,6 +213,9 @@ http://pear.php.net/dtd/package-2.0.xsd">
     <channel>pear.horde.org</channel>
    </package>
    <extension>
+    <name>dom</name>
+   </extension>
+   <extension>
     <name>gettext</name>
    </extension>
   </optional>
index edd94c6..7aecb38 100644 (file)
@@ -2,8 +2,18 @@
 
     Some text with leading and trailing whitespace  
 
+<br />
+
+<hr />
+
+<h2>Table</h2>
+
 <table class="table">
     <tr>
+        <th>Type</th>
+        <th>Representation</th>
+    </tr>
+    <tr>
         <td class="table-cell">emphasis text</td>
         <td class="table-cell"><em>emphasis text</em></td>
     </tr>
@@ -117,11 +127,13 @@ lines prefixed with the &gt; character.</p>
 
 <h2>Special Characters</h2>
 
+<div>
 &auml;
 &eacute;
 &copy;
 &trade;
 &#x0110;
+</div>
 
 <p>Zitat von John Doe &lt;john.doe@example.com&gt;:</p>
   <blockquote type="cite"> 
index 04bfd95..2a63a9d 100644 (file)
@@ -17,12 +17,17 @@ INLINE FORMATTING
 
 Some text with leading and trailing whitespace
 
-       emphasis text           /emphasis text/
-       strong text             STRONG TEXT
-       italic text             /italic text/
-       bold text               BOLD TEXT
-       emphasis and strong             /EMPHASIS AND STRONG/
-       underline text          _underline text_
+-------------------------
+
+TABLE
+
+  TYPE         REPRESENTATION
+  emphasis text        /emphasis text/
+  strong text  STRONG TEXT
+  italic text  /italic text/
+  bold text    BOLD TEXT
+  emphasis and strong  /EMPHASIS AND STRONG/
+  underline text       _underline text_
 
 -------------------------
 
@@ -30,7 +35,7 @@ LINKS
 
 Horde Homepage[1]
 Test User[2]
-Some inline link[3].
+Some inline link[1].
 http://www.example.com
 
 -------------------------
@@ -57,7 +62,7 @@ asterisks.
 
   * Bullet one
 
-  * Sub-bullet
+    * Sub-bullet
 
 NUMBERED LISTS
 
@@ -67,7 +72,7 @@ one or more hashes.
   * Numero uno
   * Number two
 
-  * Sub-item
+    * Sub-item
 
 MIXING BULLET AND NUMBER LIST ITEMS
 
@@ -75,33 +80,34 @@ You can mix and match bullet and number lists:
 
   * Number one
 
-  * Bullet
-  * Bullet
+    * Bullet
+    * Bullet
 
   * Number two
 
-  * Bullet
-  * Bullet
+    * Bullet
+    * Bullet
 
-  * Sub-bullet
+      * Sub-bullet
 
-  * Sub-sub-number
-  * Sub-sub-number
+        * Sub-sub-number
+        * Sub-sub-number
 
   * Number three
 
-  * Bullet
-  * Bullet
+    * Bullet
+    * Bullet
 
 BLOCK QUOTING
 
-> Horde Homepage[4]
-> Some inline link[5].
+> Horde Homepage[1]
+>  Some inline link[1].
 
 Line inbetween.
 
 > HEADING INSIDE QUOTING
 >
+>
 > This is a paragraph inside a block quoting. The result should be
 > several lines prefixed with the > character.
 
@@ -109,49 +115,41 @@ SPECIAL CHARACTERS
 
 ä é © ™ Đ
 
-Zitat von John Doe <john.doe@example.com>: 
+Zitat von John Doe <john.doe@example.com>:
 
-> Hallo lieber John, 
+> Hallo lieber John,
 >
 > Blah, blah.'
 
--- 
-Some signature
+--
+ Some signature
 http://www.example.com
 
 Zitat von Jane Doe <jane.doe@example.com>:
 
-> Jan Schneider a écrit :
->
-> > Zitat von Jane Doe <jane.doe@example.com>: 
-> >
-> > > Hi, 
+> Jan Schneider a écrit :
+> > Zitat von Jane Doe <jane.doe@example.com>[3]:
+> > > Hi,
 > > >
-> > > I prepare the last "horde-webmail-1.2" for production
-> > > level but I have few questions: 
-> > > - is there a way to disable "external_display_cal" in
+> > >  I prepare the last "horde-webmail-1.2" for production level but
+> > > I have few questions:
+> > >  - is there a way to disable "external_display_cal" in
 > > > kronolith, I don't want seeing birthdays calendars (turba) and
-> > task
-> > > list (nag)
-> >
-> >
-> > They aren't displayed by default, or do you mean you don't want
-> them
-> > to appear in the top right calendar panel?
->
+> > > task list (nag)
+> >  They aren't displayed by default, or do you mean you don't want
+> > them to appear in the top right calendar panel?
 >  Yes I don't want them to appear in the top right calendar panel but
 > I want user can create their external_cal
 
 Jan.
 
--- 
-Do you need professional PHP or Horde consulting?
+ --
+ Do you need professional PHP or Horde consulting?
 http://horde.org/consulting/
 
+
 Links:
 ------
 [1] http://www.horde.org
 [2] mailto:test@example.com
-[3] http://www.horde.org
-[4] http://www.horde.org
-[5] http://www.horde.org
+[3] mailto:jane.doe@example.com
index d909468..713ee97 100644 (file)
@@ -37,34 +37,30 @@ echo Horde_Text_Filter::filter($html, 'html2text');
 
 ?>
 --EXPECT--
-Zitat von Roberto Maurizzi <roberto.maurizzi@gmail.com>: 
+Zitat von Roberto Maurizzi <roberto.maurizzi@gmail.com>:
 
 > > > > 4) In Turba, I can select a VFS driver to use. Currently it is
-> > set
-> > > > to
-> > > > None and turba seems to be working fine. What does Turba use
-> the
-> > > VFS
-> > > > for?
+> > > > set to
+> > > >   None and turba seems to be working fine. What does Turba use
+> > > > the VFS
+> > > >  for?
+> >  You can attach files to contacts with that.
 > >
-> > You can attach files to contacts with that.
-> >
-> > Jan.
->
+> >  Jan.
 > Anything similar for Kronolith, maybe in the new version?
 > I've googled a little and only found a discussion in 2004 about
 > having attachment (or links) from VFS in Kronolith.
-> I'd really like to be able to attach all my taxes forms to the day I
-> have to pay them ;-) and more in general all the extra documentation
-> regarding an appointment.
+>  I'd really like to be able to attach all my taxes forms to the day
+> I have to pay them ;-) and more in general all the extra
+> documentation regarding an appointment.
 >
 > Ciao,
->   Roberto
+>   Roberto
 
-Some unquoted line with single ' quotes. 
+Some unquoted line with single ' quotes.
 
 Jan.
 
--- 
-Do you need professional PHP or Horde consulting?
+ --
+ Do you need professional PHP or Horde consulting?
 http://horde.org/consulting/