* wrap - (boolean) Whether to wrap the text or not.
* </pre>
*
- * Copyright 2003-2004 Jon Abernathy <jon@chuggnutt.com>
- * Original source: http://www.chuggnutt.com/html2text.php
* Copyright 2004-2010 The Horde Project (http://www.horde.org/)
*
* See the enclosed file COPYING for license information (LGPL). If you
* did not receive this file, see http://www.fsf.org/copyleft/lgpl.html.
*
- * @author Jon Abernathy <jon@chuggnutt.com>
* @author Jan Schneider <jan@horde.org>
* @author Michael Slusarz <slusarz@horde.org>
* @package Horde_Text
protected $_linkList = array();
/**
+ * Current list indentation level.
+ *
+ * @var integer
+ */
+ protected $_indent = 0;
+
+ /**
+ * Current blockquote level.
+ *
+ * @var integer
+ */
+ protected $_bqlevel = 0;
+
+ /**
+ * Current blockquote data.
+ *
+ * @var array
+ */
+ protected $_bqdata = array();
+
+ /**
* Filter parameters.
*
* @var array
*/
protected $_params = array(
- 'charset' => 'ISO-8859-1',
+ 'charset' => 'UTF-8',
'width' => 70,
'wrap' => true
);
/**
- * Executes any code necessary before applying the filter patterns.
- *
- * @param string $text The text before the filtering.
- *
- * @return string The modified text.
- */
- public function preProcess($text)
- {
- $this->_linkList = array();
- return trim($text);
- }
-
- /**
* Returns a hash with replace patterns.
*
* @return array Patterns hash.
public function getPatterns()
{
$replace = array(
- // Non-legal carriage return.
- '/\r/' => ''
- );
-
- $regexp = array(
- // Newlines and tabs.
- '/[\n\t]+/' => ' ',
-
- // Normalize <br> (remove leading/trailing whitespace)
- '/\s*<br[^>]*>\s*/i' => '<br>',
-
- // <script>s -- which strip_tags() supposedly has problems with.
- '/<script(?:>|\s[^>]*>).*?<\/script\s*>/i' => '',
-
- // <style>s -- which strip_tags() supposedly has problems with.
- '/<style(?:>|\s[^>]*>).*?<\/style\s*>/i' => '',
-
- // h1 - h3
- '/<h[123](?:>|\s[^>]*>)(.+?)<\/h[123]\s*>/ie' => '"<br><br>" . strtoupper("\\1") . "<br><br>"',
-
- // h4 - h6
- '/<h[456](?:>|\s[^>]*>)(.+?)<\/h[456]\s*> ?/ie' => '"<br><br>" . ucwords("\\1") . "<br><br>"',
-
- // <p>
- '/\s*<p(?:>|\s[^>]*>)\s*/i' => '<br><br>',
-
- // <div>
- '/\s*<div(?:>|\s[^>]*>)\s*/i' => '<br>',
-
- // <b>
- '/<b(?:>|\s[^>]*>)(.+?)<\/b>/ie' => 'strtoupper("\\1")',
-
- // <strong>
- '/<strong(?:>|\s[^>]*>)(.+?)<\/strong>/ie' => 'strtoupper("\\1")',
- '/<span\s+style="font-weight:\s*bold.*">(.+?)<\/span>/ie' => 'strtoupper("\\1")',
-
- // <i>
- '/<i(?:>|\s[^>]*>)(.+?)<\/i>/i' => '/\\1/',
-
- // <em>
- '/<em(?:>|\s[^>]*>)(.+?)<\/em>/i' => '_\\1_',
-
- // <u>
- '/<u(?:>|\s[^>]*>)(.+?)<\/u>/i' => '_\\1_',
-
- // <ul>/<ol> and </ul>/</ol>
- '/\s*(<(u|o)l(?:>|\s[^>]*>)| ?<\/(u|o)l\s*>)\s*/i' => '<br><br>',
-
- // <li>
- '/\s*<li(?:>|\s[^>]*>)\s*/i' => '<br> * ',
-
- // <hr>
- '/\s*<hr(?:>|\s[^>]*>)\s*/i' => '<br>-------------------------<br>',
-
- // <table> and </table>
- '/\s*(<table(?:>|\s[^>]*>)| ?<\/table\s*>)\s*/i' => '<br><br>',
-
- // <tr>
- '/\s*<tr(?:>|\s[^>]*>)\s*/i' => '<br>',
-
- // <td> and </td>
- '/\s*<td(?:>|\s[^>]*>)(.+?)<\/td>\s*/i' => '\\1<br>',
-
- // <th> and </th>
- '/\s*<th(?:>|\s[^>]*>)(.+?)<\/th>\s*/ie' => 'strtoupper("\\1") . "<br>"',
-
- // Some mailers (e.g. Hotmail) use the following div tag as a way
- // to define a block of text.
- '/<div class="?rte"?>(.+?)<\/div> ?/i' => '\\1<br>',
-
- // Cite blocks.
- '/\s*<blockquote\s+[^>]*(?:type="?cite"?|class="?gmail_quote"?)[^>]*>\s*/i' => '<hordecite>',
-
- // <br>
- '/<br[^>]*>/i' => "\n"
- );
-
- $regexp_callback = array(
- // <a href="">
- '/<a href="([^"]+)"[^>]*>(.+?)<\/a>/i' => array($this, 'buildLinkList')
+ "\r" => '',
+ "\n" => ' ',
+ "\t" => ' '
);
return array(
- 'regexp' => $regexp,
- 'regexp_callback' => $regexp_callback,
'replace' => $replace
);
}
/**
+ * Executes any code necessary before applying the filter patterns.
+ *
+ * @param string $text The text before the filtering.
+ *
+ * @return string The modified text.
+ */
+    public function preProcess($text)
+    {
+        /* Start every run from a clean slate: no open lists or
+         * blockquotes, and no links or quote chunks collected yet. */
+        $this->_indent = 0;
+        $this->_bqlevel = 0;
+        $this->_linkList = array();
+        $this->_bqdata = array();
+
+        /* The text itself is passed through untouched. */
+        return $text;
+    }
+
+ /**
* Executes any code necessary after applying the filter patterns.
*
* @param string $text The text after the filtering.
*/
public function postProcess($text)
{
- /* Convert blockquote tags. */
- return $text;
- // if (strpos($text, chr(0)) !== false) {
- // $text = $this->_blockQuote($text);
-// }
+ $text = Horde_String::convertCharset($text, $this->_params['charset'], 'UTF-8');
- /* Strip any other HTML tags. */
- $text = strip_tags($text);
+        if (extension_loaded('dom')) {
+            /* Collect libxml parse errors internally instead of emitting
+             * PHP warnings for malformed HTML. The previous setting is
+             * restored afterwards, whatever it was, so this filter does not
+             * change error handling for the rest of the request. */
+            $old_error = libxml_use_internal_errors(true);
+
+            /* loadHTML() is an instance method; calling it statically is
+             * deprecated and fails on current PHP versions. */
+            $doc = new DOMDocument();
+            $loaded = $doc->loadHTML('<?xml encoding="UTF-8">' . $text);
+
+            /* Discard the collected errors so they do not accumulate. */
+            libxml_clear_errors();
+            libxml_use_internal_errors($old_error);
+
+            /* Only walk the tree if parsing succeeded; otherwise keep the
+             * text as it is rather than failing on a non-document. */
+            if ($loaded) {
+                $text = $this->_node($doc, $doc);
+            }
+        }
- /* Convert HTML entities. */
- $text = html_entity_decode($text, ENT_QUOTES, $this->_params['charset']);
+ /* Convert any HTML entities still present in the text. */
+ $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
- /* Bring down number of empty lines to 2 max. */
- $text = preg_replace(array("/\n[[:space:]]+\n/", "/[\n]{3,}/"), "\n\n", $text);
+ /* Bring down number of empty lines to 2 max, and remove trailing
+ * ws. */
+ $text = preg_replace(array("/\n[[:space:]]+\n/", "/[\n]{3,}/", "/ +\n/"), array("\n\n", "\n\n", "\n"), $text);
/* Wrap the text to a readable format. */
if ($this->_params['wrap']) {
}
}
- return trim(rtrim($text), "\n");
- }
-
- /**
- * Replaces blockquote tags with > quotes.
- *
- * @param string $text The text to quote.
- *
- * @return string The quoted text.
- */
- protected function _blockQuote($text)
- {
- return preg_replace(
- '/([^\x00\x01]*)\x00(((?>[^\x00\x01]*)|(?R))*)\x01([^\x00\x01]*)/se',
- "stripslashes('$1') . \"\n\n\" . \$this->_quote('$2') . \"\n\n\" . stripslashes('$4')",
- $text);
+ return ltrim(rtrim($text), "\n");
}
/**
- * Quotes a chunk of text.
+ * Process DOM node.
*
- * @param string $text The text to quote.
+ * @param DOMDocument $doc Document node.
+ * @param DOMNode $node Node whose child nodes are converted.
*
- * @return string The quoted text.
+ * @return string The plaintext representation.
*/
- protected function _quote($text)
+ protected function _node($doc, $node)
{
- $text = stripslashes($text);
- if (strpos($text, chr(0)) !== false) {
- $text = stripslashes($this->_blockQuote($text));
- }
-
- $text = rtrim(strip_tags($text));
- if ($this->_params['wrap']) {
- $text = wordwrap($text, $this->_params['width'] - 2);
+ $out = '';
+
+ if ($node->hasChildNodes()) {
+ foreach ($node->childNodes as $child) {
+ if ($child instanceof DOMElement) {
+ switch (strtolower($child->tagName)) {
+ case 'h1':
+ case 'h2':
+ case 'h3':
+ $out .= "\n\n" .
+ strtoupper($this->_node($doc, $child)) .
+ "\n\n";
+ break;
+
+ case 'h4':
+ case 'h5':
+ case 'h6':
+ $out .= "\n\n" .
+ ucwords($this->_node($doc, $child))
+ . "\n\n";
+ break;
+
+ case 'b':
+ case 'strong':
+ $out .= strtoupper($this->_node($doc, $child));
+ break;
+
+ case 'u':
+ $out .= '_' . $this->_node($doc, $child) . '_';
+ break;
+
+ case 'em':
+ case 'i':
+ $out .= '/' . $this->_node($doc, $child) . '/';
+ break;
+
+ case 'hr':
+ $out .= "\n-------------------------\n";
+ break;
+
+ case 'ol':
+ case 'ul':
+ ++$this->_indent;
+ $out .= "\n\n" . $this->_node($doc, $child) . "\n\n";
+ --$this->_indent;
+ break;
+
+ case 'p':
+ if ($tmp = $this->_node($doc, $child)) {
+ $out .= "\n" . $tmp . "\n";
+ }
+ break;
+
+ case 'table':
+ if ($tmp = $this->_node($doc, $child)) {
+ $out .= "\n\n" . $tmp . "\n\n";
+ }
+ break;
+
+ case 'tr':
+ $out .= "\n " . rtrim($this->_node($doc, $child));
+ break;
+
+ case 'th':
+ $out .= strtoupper($this->_node($doc, $child)) . " \t";
+ break;
+
+ case 'td':
+ $out .= $this->_node($doc, $child) . " \t";
+ break;
+
+ case 'li':
+ $out .= "\n" . str_repeat(' ', $this->_indent) . '* ' . $this->_node($doc, $child);
+ break;
+
+ case 'a':
+ $out .= $this->_node($doc, $child) . $this->_buildLinkList($doc, $child);
+ break;
+
+                    case 'blockquote':
+                        /* Text gathered so far inside an enclosing
+                         * blockquote belongs to the outer quote level, so
+                         * it is stored before descending one level. */
+                        if ($this->_bqlevel) {
+                            $this->_bqdata[] = array(
+                                'level' => $this->_bqlevel,
+                                'text' => $out
+                            );
+                            $out = '';
+                        }
+                        ++$this->_bqlevel;
+                        $this->_bqdata[] = array(
+                            'level' => $this->_bqlevel,
+                            'text' => $this->_node($doc, $child)
+                        );
+                        --$this->_bqlevel;
+
+                        /* Only the outermost blockquote renders the
+                         * collected chunks, each with its own quote depth. */
+                        if (!$this->_bqlevel) {
+                            $out .= "\n\n";
+                            foreach ($this->_bqdata as $val) {
+                                if ($val['text'] === '') {
+                                    continue;
+                                }
+
+                                /* Always build a list of lines, wrapped or
+                                 * not, so the trimming and prefixing below
+                                 * work the same in both modes. */
+                                $tmp = array();
+                                foreach (explode("\n", $val['text']) as $val2) {
+                                    if ($this->_params['wrap']) {
+                                        $tmp = array_merge($tmp, explode("\n", wordwrap($val2, $this->_params['width'] - (2 * $val['level']))));
+                                    } else {
+                                        $tmp[] = $val2;
+                                    }
+                                }
+
+                                /* Clean out leading and trailing empty
+                                 * lines. A strict comparison is used so that
+                                 * a line containing only "0" is kept. */
+                                while ($tmp && ($tmp[0] === '')) {
+                                    array_shift($tmp);
+                                }
+                                while ($tmp && (end($tmp) === '')) {
+                                    array_pop($tmp);
+                                }
+
+                                foreach ($tmp as $val2) {
+                                    $out .= str_repeat("> ", $val['level']) . rtrim($val2) . "\n";
+                                }
+                            }
+                            $out .= "\n\n";
+                            $this->_bqdata = array();
+                        }
+                        break;
+
+ case 'div':
+ $out .= $this->_node($doc, $child) . "\n";
+ break;
+
+ case 'br':
+ $out .= "\n";
+ break;
+
+ default:
+ $out .= $this->_node($doc, $child);
+ break;
+ }
+ } elseif (($child instanceof DOMText) &&
+ !$child->isWhitespaceInElementContent()) {
+ $tmp = $child->textContent;
+ if ($child->parentNode->tagName == 'body' ||
+ !$child->previousSibling) {
+ $tmp = ltrim($tmp);
+ }
+ if (!$child->nextSibling) {
+ $tmp = rtrim($tmp);
+ }
+ $out .= $tmp;
+ }
+ }
}
- return preg_replace(array('/^/m', '/(\n>\s*$){3,}/m', '/^>\s+$/m'),
- array('> ', "\n> ", '>'),
- $text);
+ return $out;
}
/**
- * Helper function called by preg_replace() on link replacement.
- *
* Maintains an internal list of links to be displayed at the end
* of the text, with numeric indices to the original point in the
* text they appeared.
*
- * @param array $matches Match information:
- * <pre>
- * [1] URL of the link.
- * [2] Part of the text to associate number with.
- * </pre>
- *
- * @return string The link replacement.
+ * @param DOMDocument $doc Document node.
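+ * @param DOMElement $node Element node.
+ *
+ * @return string The link marker (e.g. "[1]"), or an empty string when
+ *                the link is not added to the list.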
*/
- public function buildLinkList($matches)
+ protected function _buildLinkList($doc, $node)
{
- $link = $matches[1];
- $display = $matches[2];
-
- if ($link == strip_tags($display)) {
- return $display;
- }
+ $link = $node->getAttribute('href');
+ $display = $node->textContent;
$parsed_link = parse_url($link);
- $parsed_display = parse_url(strip_tags(preg_replace('/^<|>$/', '', $display)));
+ $parsed_display = parse_url($display);
if (isset($parsed_link['path'])) {
$parsed_link['path'] = trim($parsed_link['path'], '/');
(isset($parsed_link['path']) &&
isset($parsed_display['path']) &&
$parsed_link['path'] == $parsed_display['path']))) {
- return $display;
+ return '';
}
- $this->_linkList[] = $link;
+        /* Reuse the index of a link that was already recorded. The search
+         * is strict so that numeric-looking hrefs (e.g. "7" and "007") are
+         * not treated as the same link. */
+        if (($pos = array_search($link, $this->_linkList, true)) === false) {
+            $this->_linkList[] = $link;
+            $pos = count($this->_linkList) - 1;
+        }
- return $display . '[' . count($this->_linkList) . ']';
+ return '[' . ($pos + 1) . ']';
}
}
Some text with leading and trailing whitespace
- emphasis text /emphasis text/
- strong text STRONG TEXT
- italic text /italic text/
- bold text BOLD TEXT
- emphasis and strong /EMPHASIS AND STRONG/
- underline text _underline text_
+-------------------------
+
+TABLE
+
+ TYPE REPRESENTATION
+ emphasis text /emphasis text/
+ strong text STRONG TEXT
+ italic text /italic text/
+ bold text BOLD TEXT
+ emphasis and strong /EMPHASIS AND STRONG/
+ underline text _underline text_
-------------------------
Horde Homepage[1]
Test User[2]
-Some inline link[3].
+Some inline link[1].
http://www.example.com
-------------------------
* Bullet one
- * Sub-bullet
+ * Sub-bullet
NUMBERED LISTS
* Numero uno
* Number two
- * Sub-item
+ * Sub-item
MIXING BULLET AND NUMBER LIST ITEMS
* Number one
- * Bullet
- * Bullet
+ * Bullet
+ * Bullet
* Number two
- * Bullet
- * Bullet
+ * Bullet
+ * Bullet
- * Sub-bullet
+ * Sub-bullet
- * Sub-sub-number
- * Sub-sub-number
+ * Sub-sub-number
+ * Sub-sub-number
* Number three
- * Bullet
- * Bullet
+ * Bullet
+ * Bullet
BLOCK QUOTING
-> Horde Homepage[4]
-> Some inline link[5].
+> Horde Homepage[1]
+> Some inline link[1].
Line inbetween.
> HEADING INSIDE QUOTING
>
+>
> This is a paragraph inside a block quoting. The result should be
> several lines prefixed with the > character.
ä é © ™ Đ
-Zitat von John Doe <john.doe@example.com>:
+Zitat von John Doe <john.doe@example.com>:
-> Hallo lieber John,
+> Hallo lieber John,
>
> Blah, blah.'
---
-Some signature
+--
+ Some signature
http://www.example.com
Zitat von Jane Doe <jane.doe@example.com>:
-> Jan Schneider a écrit :
->
-> > Zitat von Jane Doe <jane.doe@example.com>:
-> >
-> > > Hi,
+> Jan Schneider a écrit :
+> > Zitat von Jane Doe <jane.doe@example.com>[3]:
+> > > Hi,
> > >
-> > > I prepare the last "horde-webmail-1.2" for production
-> > > level but I have few questions:
-> > > - is there a way to disable "external_display_cal" in
+> > > I prepare the last "horde-webmail-1.2" for production level but
+> > > I have few questions:
+> > > - is there a way to disable "external_display_cal" in
> > > kronolith, I don't want seeing birthdays calendars (turba) and
-> > task
-> > > list (nag)
-> >
-> >
-> > They aren't displayed by default, or do you mean you don't want
-> them
-> > to appear in the top right calendar panel?
->
+> > > task list (nag)
+> > They aren't displayed by default, or do you mean you don't want
+> > them to appear in the top right calendar panel?
> Yes I don't want them to appear in the top right calendar panel but
> I want user can create their external_cal
Jan.
---
-Do you need professional PHP or Horde consulting?
+ --
+ Do you need professional PHP or Horde consulting?
http://horde.org/consulting/
+
Links:
------
[1] http://www.horde.org
[2] mailto:test@example.com
-[3] http://www.horde.org
-[4] http://www.horde.org
-[5] http://www.horde.org
+[3] mailto:jane.doe@example.com