From 7ac94421bf85eb04c00c5ed05390e1ea0c6bcb0b Mon Sep 17 00:00:00 2001 From: Aleksander Machniak Date: Tue, 25 Dec 2012 18:06:17 +0100 Subject: Move washtml class into Roundcube Framework (rcube_washtml), add some improvements --- program/steps/mail/func.inc | 68 +-------------------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) (limited to 'program/steps') diff --git a/program/steps/mail/func.inc b/program/steps/mail/func.inc index 70493766b..90f54cf1b 100644 --- a/program/steps/mail/func.inc +++ b/program/steps/mail/func.inc @@ -628,39 +628,6 @@ function rcmail_wash_html($html, $p, $cid_replaces) $p += array('safe' => false, 'inline_html' => true); - // special replacements (not properly handled by washtml class) - $html_search = array( - '/(<\/nobr>)(\s+)()/i', // space(s) between - '/]*>[^<]*<\/title>/i', // PHP bug #32547 workaround: remove title tag - '/^(\0\0\xFE\xFF|\xFF\xFE\0\0|\xFE\xFF|\xFF\xFE|\xEF\xBB\xBF)/', // byte-order mark (only outlook?) - '/]+>/i', // washtml/DOMDocument cannot handle xml namespaces - ); - $html_replace = array( - '\\1'.'   '.'\\3', - '', - '', - '', - ); - $html = preg_replace($html_search, $html_replace, trim($html)); - - // PCRE errors handling (#1486856), should we use something like for every preg_* use? - if ($html === null && ($preg_error = preg_last_error()) != PREG_NO_ERROR) { - $errstr = "Could not clean up HTML message! 
PCRE Error: $preg_error."; - - if ($preg_error == PREG_BACKTRACK_LIMIT_ERROR) - $errstr .= " Consider raising pcre.backtrack_limit!"; - if ($preg_error == PREG_RECURSION_LIMIT_ERROR) - $errstr .= " Consider raising pcre.recursion_limit!"; - - raise_error(array('code' => 620, 'type' => 'php', - 'line' => __LINE__, 'file' => __FILE__, - 'message' => $errstr), true, false); - return ''; - } - - // fix (unknown/malformed) HTML tags before "wash" - $html = preg_replace_callback('/(<[\/]*)([^\s>]+)/', 'rcmail_html_tag_callback', $html); - // charset was converted to UTF-8 in rcube_storage::get_message_part(), // change/add charset specification in HTML accordingly, // washtml cannot work without that @@ -674,9 +641,6 @@ function rcmail_wash_html($html, $p, $cid_replaces) $html = '' . $meta . '' . $html; } - // turn relative into absolute urls - $html = rcmail_resolve_base($html); - // clean HTML with washhtml by Frederic Motte $wash_opts = array( 'show_washed' => false, @@ -702,7 +666,7 @@ function rcmail_wash_html($html, $p, $cid_replaces) $wash_opts['html_attribs'] = $p['html_attribs']; // initialize HTML washer - $washer = new washtml($wash_opts); + $washer = new rcube_washtml($wash_opts); if (!$p['skip_washer_form_callback']) $washer->add_callback('form', 'rcmail_washtml_callback'); @@ -920,22 +884,6 @@ function rcmail_washtml_callback($tagname, $attrib, $content, $washtml) } -/** - * Callback function for HTML tags fixing - */ -function rcmail_html_tag_callback($matches) -{ - $tagname = $matches[2]; - - $tagname = preg_replace(array( - '/:.*$/', // Microsoft's Smart Tags - '/[^a-z0-9_\[\]\!-]/i', // forbidden characters - ), '', $tagname); - - return $matches[1].$tagname; -} - - /** * return table with message headers */ @@ -1319,20 +1267,6 @@ function rcmail_part_image_type($part) } } -/** - * Convert all relative URLs according to a in HTML - */ -function rcmail_resolve_base($body) -{ - // check for - if (preg_match('!(replace($body); - } - - return $body; -} - 
/** * modify a HTML message that it can be displayed inside a HTML page -- cgit v1.2.3 From 66afd70b756a0637da3537e96f6bf6ce0a2c46e9 Mon Sep 17 00:00:00 2001 From: Aleksander Machniak Date: Wed, 26 Dec 2012 12:14:34 +0100 Subject: Framework'ize html2text class --- program/include/bc.php | 4 + program/lib/Roundcube/rcube_html2text.php | 691 ++++++++++++++++++++++++ program/lib/Roundcube/rcube_message.php | 2 +- program/lib/Roundcube/rcube_spellchecker.php | 2 +- program/lib/html2text.php | 755 --------------------------- program/steps/mail/compose.inc | 4 +- program/steps/mail/func.inc | 2 +- program/steps/mail/sendmail.inc | 2 +- program/steps/utils/html2text.inc | 4 +- tests/Framework/Html2text.php | 59 +++ tests/HtmlToText.php | 59 --- tests/phpunit.xml | 2 +- 12 files changed, 762 insertions(+), 824 deletions(-) create mode 100644 program/lib/Roundcube/rcube_html2text.php delete mode 100644 program/lib/html2text.php create mode 100644 tests/Framework/Html2text.php delete mode 100644 tests/HtmlToText.php (limited to 'program/steps') diff --git a/program/include/bc.php b/program/include/bc.php index 05d15b9e3..3d9d46289 100644 --- a/program/include/bc.php +++ b/program/include/bc.php @@ -412,3 +412,7 @@ class rcube_html_page extends rcmail_html_page class washtml extends rcube_washtml { } + +class html2text extends rcube_html2text +{ +} diff --git a/program/lib/Roundcube/rcube_html2text.php b/program/lib/Roundcube/rcube_html2text.php new file mode 100644 index 000000000..0b172ebfa --- /dev/null +++ b/program/lib/Roundcube/rcube_html2text.php @@ -0,0 +1,691 @@ + | + | | + | Licensed under the GNU General Public License version 3 or | + | any later version with exceptions for skins & plugins. | + | See the README file for a full license statement. 
| + | | + | PURPOSE: | + | Converts HTML to formatted plain text (based on html2text class) | + +-----------------------------------------------------------------------+ + | Author: Thomas Bruederli | + | Author: Aleksander Machniak | + | Author: Jon Abernathy | + +-----------------------------------------------------------------------+ + */ + +/** + * Takes HTML and converts it to formatted, plain text. + * + * Thanks to Alexander Krug (http://www.krugar.de/) to pointing out and + * correcting an error in the regexp search array. Fixed 7/30/03. + * + * Updated set_html() function's file reading mechanism, 9/25/03. + * + * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding + * several more HTML entity codes to the $search and $replace arrays. + * Updated 11/7/03. + * + * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for + * suggesting the addition of $allowed_tags and its supporting function + * (which I slightly modified). Updated 3/12/04. + * + * Thanks to Justin Dearing for pointing out that a replacement for the + * <TH> tag was missing, and suggesting an appropriate fix. + * Updated 8/25/04. + * + * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a + * display/formatting bug in the _build_link_list() function: email + * readers would show the left bracket and number ("[1") as part of the + * rendered email address. + * Updated 12/16/04. + * + * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code + * to handle relative links, which I hadn't considered. I modified his + * code a bit to handle normal HTTP links and MAILTO links. Also for + * suggesting three additional HTML entity codes to search for. + * Updated 03/02/05. + * + * Thanks to Jacob Chandler for pointing out another link condition + * for the _build_link_list() function: "https". + * Updated 04/06/05.
+ * + * Thanks to Marc Bertrand (http://www.dresdensky.com/) for + * suggesting a revision to the word wrapping functionality; if you + * specify a $width of 0 or less, word wrapping will be ignored. + * Updated 11/02/06. + * + * *** Big housecleaning updates below: + * + * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for + * suggesting the fix to handle </li> and blank lines (whitespace). + * Christian Basedau (http://www.movetheweb.de/) also suggested the + * blank lines fix. + * + * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/), + * Christian Basedau, Norbert Laposa (http://ln5.co.uk/), + * Bas van de Weijer, and Marijn van Butselaar + * for pointing out my glaring error in the <th> handling. Marcus also + * supplied a host of fixes. + * + * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing + * out that extra spaces should be compressed--a problem addressed with + * Marcus Bointon's fixes but that I had not yet incorporated. + * + * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for + * suggesting a valuable fix with <a> tag handling. + * + * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions, + * including the <a> tag handling that Daniel Schledermann pointed + * out but that I had not yet incorporated. I haven't (yet) + * incorporated all of Wojciech's changes, though I may at some + * future time. + * + * *** End of the housecleaning updates. Updated 08/08/07. + */ + +/** + * Converts HTML to formatted plain text + * + * @package Framework + * @subpackage Utils + */ +class rcube_html2text +{ + /** + * Contains the HTML content to convert. + * + * @var string $html + */ + protected $html; + + /** + * Contains the converted, formatted text. + * + * @var string $text + */ + protected $text; + + /** + * Maximum width of the formatted text, in columns. + * + * Set this value to 0 (or less) to ignore word wrapping + * and not constrain text to a fixed-width column.
+ * + * @var integer $width + */ + protected $width = 70; + + /** + * Target character encoding for output text + * + * @var string $charset + */ + protected $charset = 'UTF-8'; + + /** + * List of preg* regular expression patterns to search for, + * used in conjunction with $replace. + * + * @var array $search + * @see $replace + */ + protected $search = array( + "/\r/", // Non-legal carriage return + "/[\n\t]+/", // Newlines and tabs + '/]*>.*?<\/head>/i', // + '/]*>.*?<\/script>/i', //