From 8ac6fd094af2ecc93ad8f750a0731c043f7f8a2a Mon Sep 17 00:00:00 2001 From: alecpl Date: Sat, 30 Aug 2008 11:24:39 +0000 Subject: - Improved HTML to TXT conversion by html2text class update to version 1.0.0 --- program/lib/html2text.php | 413 +++++++++++++++++++++++++--------------------- 1 file changed, 225 insertions(+), 188 deletions(-) (limited to 'program') diff --git a/program/lib/html2text.php b/program/lib/html2text.php index b476555ba..ee7b0dc40 100644 --- a/program/lib/html2text.php +++ b/program/lib/html2text.php @@ -1,78 +1,109 @@ * -* All rights reserved. * -* * -* This script is free software; you can redistribute it and/or modify * -* it under the terms of the GNU General Public License as published by * -* the Free Software Foundation; either version 2 of the License, or * -* (at your option) any later version. * -* * -* The GNU General Public License can be found at * -* http://www.gnu.org/copyleft/gpl.html. * -* * -* This script is distributed in the hope that it will be useful, * -* but WITHOUT ANY WARRANTY; without even the implied warranty of * -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -* GNU General Public License for more details. * -* * -* Author(s): Jon Abernathy * -* * -* Last modified: 04/06/05 * -* Modified: 2004/05/19 (tbr) * -* * -*************************************************************************/ - -/* 2008/08/29: Added PRE handling by A.L.E.C */ + * * + * class.html2text.inc * + * * + ************************************************************************* + * * + * Converts HTML to formatted plain text * + * * + * Copyright (c) 2005-2007 Jon Abernathy * + * All rights reserved. * + * * + * This script is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
* + * * + * The GNU General Public License can be found at * + * http://www.gnu.org/copyleft/gpl.html. * + * * + * This script is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * Author(s): Jon Abernathy * + * * + * Last modified: 08/08/07 * + * * + *************************************************************************/ + /** -* Takes HTML and converts it to formatted, plain text. -* -* Thanks to Alexander Krug (http://www.krugar.de/) to pointing out and -* correcting an error in the regexp search array. Fixed 7/30/03. -* -* Updated set_html() function's file reading mechanism, 9/25/03. -* -* Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding -* several more HTML entity codes to the $search and $replace arrays. -* Updated 11/7/03. -* -* Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for -* suggesting the addition of $allowed_tags and its supporting function -* (which I slightly modified). Updated 3/12/04. -* -* Thanks to Justin Dearing for pointing out that a replacement for the -* tag was missing, and suggesting an appropriate fix. -* Updated 8/25/04. -* -* Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a -* display/formatting bug in the _build_link_list() function: email -* readers would show the left bracket and number ("[1") as part of the -* rendered email address. -* Updated 12/16/04. -* -* Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code -* to handle relative links, which I hadn't considered. I modified his -* code a bit to handle normal HTTP links and MAILTO links. Also for -* suggesting three additional HTML entity codes to search for. -* Updated 03/02/05. -* -* Thanks to Jacob Chandler for pointing out another link condition -* for the _build_link_list() function: "https". -* Updated 04/06/05. 
-* -* @author Jon Abernathy -* @version 0.6.1 -* @since PHP 4.0.2 -*/ + * Takes HTML and converts it to formatted, plain text. + * + * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and + * correcting an error in the regexp search array. Fixed 7/30/03. + * + * Updated set_html() function's file reading mechanism, 9/25/03. + * + * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding + * several more HTML entity codes to the $search and $replace arrays. + * Updated 11/7/03. + * + * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for + * suggesting the addition of $allowed_tags and its supporting function + * (which I slightly modified). Updated 3/12/04. + * + * Thanks to Justin Dearing for pointing out that a replacement for the + * <TH> tag was missing, and suggesting an appropriate fix. + * Updated 8/25/04. + * + * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a + * display/formatting bug in the _build_link_list() function: email + * readers would show the left bracket and number ("[1") as part of the + * rendered email address. + * Updated 12/16/04. + * + * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code + * to handle relative links, which I hadn't considered. I modified his + * code a bit to handle normal HTTP links and MAILTO links. Also for + * suggesting three additional HTML entity codes to search for. + * Updated 03/02/05. + * + * Thanks to Jacob Chandler for pointing out another link condition + * for the _build_link_list() function: "https". + * Updated 04/06/05. + * + * Thanks to Marc Bertrand (http://www.dresdensky.com/) for + * suggesting a revision to the word wrapping functionality; if you + * specify a $width of 0 or less, word wrapping will be ignored. + * Updated 11/02/06. + * + * *** Big housecleaning updates below: + * + * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for + * suggesting the fix to handle </li> and blank lines (whitespace). 
+ * Christian Basedau (http://www.movetheweb.de/) also suggested the + * blank lines fix. + * + * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/), + * Christian Basedau, Norbert Laposa (http://ln5.co.uk/), + * Bas van de Weijer, and Marijn van Butselaar + * for pointing out my glaring error in the <th> handling. Marcus also + * supplied a host of fixes. + * + * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing + * out that extra spaces should be compressed--a problem addressed with + * Marcus Bointon's fixes but that I had not yet incorporated. + * + * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for + * suggesting a valuable fix with <a> tag handling. + * + * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions, + * including the <a> tag handling that Daniel Schledermann pointed + * out but that I had not yet incorporated. I haven't (yet) + * incorporated all of Wojciech's changes, though I may at some + * future time. + * + * *** End of the housecleaning updates. Updated 08/08/07. + * + * @author Jon Abernathy + * @version 1.0.0 + * @since PHP 4.0.2 + */ class html2text { @@ -95,6 +126,9 @@ class html2text /** * Maximum width of the formatted text, in columns. * + * Set this value to 0 (or less) to ignore word wrapping + * and not constrain text to a fixed-width column. + * * @var integer $width * @access public */ @@ -111,43 +145,46 @@ class html2text var $search = array( "/\r/", // Non-legal carriage return "/[\n\t]+/", // Newlines and tabs + '/[ ]{2,}/', // Runs of spaces, pre-handling '/]*>.*?<\/script>/i', //