diff options
author | Aleksander Machniak <alec@alec.pl> | 2012-12-26 12:14:34 +0100 |
---|---|---|
committer | Aleksander Machniak <alec@alec.pl> | 2012-12-26 12:14:34 +0100 |
commit | 66afd70b756a0637da3537e96f6bf6ce0a2c46e9 (patch) | |
tree | 97d57798ee3c4cd9ebca62f24f90d05f806cffda /program/lib/html2text.php | |
parent | 7ac94421bf85eb04c00c5ed05390e1ea0c6bcb0b (diff) |
Framework'ize html2text class
Diffstat (limited to 'program/lib/html2text.php')
-rw-r--r-- | program/lib/html2text.php | 755 |
1 files changed, 0 insertions, 755 deletions
diff --git a/program/lib/html2text.php b/program/lib/html2text.php deleted file mode 100644 index 34c719302..000000000 --- a/program/lib/html2text.php +++ /dev/null @@ -1,755 +0,0 @@ -<?php - -/************************************************************************* - * * - * class.html2text.inc * - * * - ************************************************************************* - * * - * Converts HTML to formatted plain text * - * * - * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com> * - * All rights reserved. * - * * - * This script is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * The GNU General Public License can be found at * - * http://www.gnu.org/copyleft/gpl.html. * - * * - * This script is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU General Public License for more details. * - * * - * Author(s): Jon Abernathy <jon@chuggnutt.com> * - * * - * Last modified: 08/08/07 * - * * - *************************************************************************/ - - -/** - * Takes HTML and converts it to formatted, plain text. - * - * Thanks to Alexander Krug (http://www.krugar.de/) to pointing out and - * correcting an error in the regexp search array. Fixed 7/30/03. - * - * Updated set_html() function's file reading mechanism, 9/25/03. - * - * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding - * several more HTML entity codes to the $search and $replace arrays. - * Updated 11/7/03. - * - * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for - * suggesting the addition of $allowed_tags and its supporting function - * (which I slightly modified). Updated 3/12/04. - * - * Thanks to Justin Dearing for pointing out that a replacement for the - * <TH> tag was missing, and suggesting an appropriate fix. - * Updated 8/25/04. - * - * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a - * display/formatting bug in the _build_link_list() function: email - * readers would show the left bracket and number ("[1") as part of the - * rendered email address. - * Updated 12/16/04. - * - * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code - * to handle relative links, which I hadn't considered. I modified his - * code a bit to handle normal HTTP links and MAILTO links. Also for - * suggesting three additional HTML entity codes to search for. - * Updated 03/02/05. - * - * Thanks to Jacob Chandler for pointing out another link condition - * for the _build_link_list() function: "https". - * Updated 04/06/05. - * - * Thanks to Marc Bertrand (http://www.dresdensky.com/) for - * suggesting a revision to the word wrapping functionality; if you - * specify a $width of 0 or less, word wrapping will be ignored. - * Updated 11/02/06. - * - * *** Big housecleaning updates below: - * - * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for - * suggesting the fix to handle </li> and blank lines (whitespace). - * Christian Basedau (http://www.movetheweb.de/) also suggested the - * blank lines fix. - * - * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/), - * Christian Basedau, Norbert Laposa (http://ln5.co.uk/), - * Bas van de Weijer, and Marijn van Butselaar - * for pointing out my glaring error in the <th> handling. Marcus also - * supplied a host of fixes. - * - * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing - * out that extra spaces should be compressed--a problem addressed with - * Marcus Bointon's fixes but that I had not yet incorporated. - * - * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for - * suggesting a valuable fix with <a> tag handling. - * - * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions, - * including the <a> tag handling that Daniel Schledermann pointed - * out but that I had not yet incorporated. I haven't (yet) - * incorporated all of Wojciech's changes, though I may at some - * future time. - * - * *** End of the housecleaning updates. Updated 08/08/07. - * - * @author Jon Abernathy <jon@chuggnutt.com> - * @version 1.0.0 - * @since PHP 4.0.2 - */ -class html2text -{ - - /** - * Contains the HTML content to convert. - * - * @var string $html - * @access public - */ - var $html; - - /** - * Contains the converted, formatted text. - * - * @var string $text - * @access public - */ - var $text; - - /** - * Maximum width of the formatted text, in columns. - * - * Set this value to 0 (or less) to ignore word wrapping - * and not constrain text to a fixed-width column. - * - * @var integer $width - * @access public - */ - var $width = 70; - - /** - * Target character encoding for output text - * - * @var string $charset - * @access public - */ - var $charset = 'UTF-8'; - - /** - * List of preg* regular expression patterns to search for, - * used in conjunction with $replace. - * - * @var array $search - * @access public - * @see $replace - */ - var $search = array( - "/\r/", // Non-legal carriage return - "/[\n\t]+/", // Newlines and tabs - '/<head[^>]*>.*?<\/head>/i', // <head> - '/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with - '/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with - '/<p[^>]*>/i', // <P> - '/<br[^>]*>/i', // <br> - '/<i[^>]*>(.*?)<\/i>/i', // <i> - '/<em[^>]*>(.*?)<\/em>/i', // <em> - '/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul> - '/(<ol[^>]*>|<\/ol>)/i', // <ol> and </ol> - '/<li[^>]*>(.*?)<\/li>/i', // <li> and </li> - '/<li[^>]*>/i', // <li> - '/<hr[^>]*>/i', // <hr> - '/<div[^>]*>/i', // <div> - '/(<table[^>]*>|<\/table>)/i', // <table> and </table> - '/(<tr[^>]*>|<\/tr>)/i', // <tr> and </tr> - '/<td[^>]*>(.*?)<\/td>/i', // <td> and </td> - ); - - /** - * List of pattern replacements corresponding to patterns searched. - * - * @var array $replace - * @access public - * @see $search - */ - var $replace = array( - '', // Non-legal carriage return - ' ', // Newlines and tabs - '', // <head> - '', // <script>s -- which strip_tags supposedly has problems with - '', // <style>s -- which strip_tags supposedly has problems with - "\n\n", // <P> - "\n", // <br> - '_\\1_', // <i> - '_\\1_', // <em> - "\n\n", // <ul> and </ul> - "\n\n", // <ol> and </ol> - "\t* \\1\n", // <li> and </li> - "\n\t* ", // <li> - "\n-------------------------\n", // <hr> - "<div>\n", // <div> - "\n\n", // <table> and </table> - "\n", // <tr> and </tr> - "\t\t\\1\n", // <td> and </td> - ); - - /** - * List of preg* regular expression patterns to search for, - * used in conjunction with $ent_replace. - * - * @var array $ent_search - * @access public - * @see $ent_replace - */ - var $ent_search = array( - '/&(nbsp|#160);/i', // Non-breaking space - '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i', - // Double quotes - '/&(apos|rsquo|lsquo|#8216|#8217);/i', // Single quotes - '/>/i', // Greater-than - '/</i', // Less-than - '/&(copy|#169);/i', // Copyright - '/&(trade|#8482|#153);/i', // Trademark - '/&(reg|#174);/i', // Registered - '/&(mdash|#151|#8212);/i', // mdash - '/&(ndash|minus|#8211|#8722);/i', // ndash - '/&(bull|#149|#8226);/i', // Bullet - '/&(pound|#163);/i', // Pound sign - '/&(euro|#8364);/i', // Euro sign - '/&(amp|#38);/i', // Ampersand: see _converter() - '/[ ]{2,}/', // Runs of spaces, post-handling - ); - - /** - * List of pattern replacements corresponding to patterns searched. - * - * @var array $ent_replace - * @access public - * @see $ent_search - */ - var $ent_replace = array( - ' ', // Non-breaking space - '"', // Double quotes - "'", // Single quotes - '>', - '<', - '(c)', - '(tm)', - '(R)', - '--', - '-', - '*', - '£', - 'EUR', // Euro sign. € ? - '|+|amp|+|', // Ampersand: see _converter() - ' ', // Runs of spaces, post-handling - ); - - /** - * List of preg* regular expression patterns to search for - * and replace using callback function. - * - * @var array $callback_search - * @access public - */ - var $callback_search = array( - '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href=""> - '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6 - '/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b> - '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong> - '/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th> - ); - - /** - * List of preg* regular expression patterns to search for in PRE body, - * used in conjunction with $pre_replace. - * - * @var array $pre_search - * @access public - * @see $pre_replace - */ - var $pre_search = array( - "/\n/", - "/\t/", - '/ /', - '/<pre[^>]*>/', - '/<\/pre>/' - ); - - /** - * List of pattern replacements corresponding to patterns searched for PRE body. - * - * @var array $pre_replace - * @access public - * @see $pre_search - */ - var $pre_replace = array( - '<br>', - ' ', - ' ', - '', - '' - ); - - /** - * Contains a list of HTML tags to allow in the resulting text. - * - * @var string $allowed_tags - * @access public - * @see set_allowed_tags() - */ - var $allowed_tags = ''; - - /** - * Contains the base URL that relative links should resolve to. - * - * @var string $url - * @access public - */ - var $url; - - /** - * Indicates whether content in the $html variable has been converted yet. - * - * @var boolean $_converted - * @access private - * @see $html, $text - */ - var $_converted = false; - - /** - * Contains URL addresses from links to be rendered in plain text. - * - * @var array $_link_list - * @access private - * @see _build_link_list() - */ - var $_link_list = array(); - - /** - * Boolean flag, true if a table of link URLs should be listed after the text. - * - * @var boolean $_do_links - * @access private - * @see html2text() - */ - var $_do_links = true; - - /** - * Constructor. - * - * If the HTML source string (or file) is supplied, the class - * will instantiate with that source propagated, all that has - * to be done it to call get_text(). - * - * @param string $source HTML content - * @param boolean $from_file Indicates $source is a file to pull content from - * @param boolean $do_links Indicate whether a table of link URLs is desired - * @param integer $width Maximum width of the formatted text, 0 for no limit - * @access public - * @return void - */ - function html2text( $source = '', $from_file = false, $do_links = true, $width = 75, $charset = 'UTF-8' ) - { - if ( !empty($source) ) { - $this->set_html($source, $from_file); - } - - $this->set_base_url(); - $this->_do_links = $do_links; - $this->width = $width; - $this->charset = $charset; - } - - /** - * Loads source HTML into memory, either from $source string or a file. - * - * @param string $source HTML content - * @param boolean $from_file Indicates $source is a file to pull content from - * @access public - * @return void - */ - function set_html( $source, $from_file = false ) - { - if ( $from_file && file_exists($source) ) { - $this->html = file_get_contents($source); - } - else - $this->html = $source; - - $this->_converted = false; - } - - /** - * Returns the text, converted from HTML. - * - * @access public - * @return string - */ - function get_text() - { - if ( !$this->_converted ) { - $this->_convert(); - } - - return $this->text; - } - - /** - * Prints the text, converted from HTML. - * - * @access public - * @return void - */ - function print_text() - { - print $this->get_text(); - } - - /** - * Alias to print_text(), operates identically. - * - * @access public - * @return void - * @see print_text() - */ - function p() - { - print $this->get_text(); - } - - /** - * Sets the allowed HTML tags to pass through to the resulting text. - * - * Tags should be in the form "<p>", with no corresponding closing tag. - * - * @access public - * @return void - */ - function set_allowed_tags( $allowed_tags = '' ) - { - if ( !empty($allowed_tags) ) { - $this->allowed_tags = $allowed_tags; - } - } - - /** - * Sets a base URL to handle relative links. - * - * @access public - * @return void - */ - function set_base_url( $url = '' ) - { - if ( empty($url) ) { - if ( !empty($_SERVER['HTTP_HOST']) ) { - $this->url = 'http://' . $_SERVER['HTTP_HOST']; - } else { - $this->url = ''; - } - } else { - // Strip any trailing slashes for consistency (relative - // URLs may already start with a slash like "/file.html") - if ( substr($url, -1) == '/' ) { - $url = substr($url, 0, -1); - } - $this->url = $url; - } - } - - /** - * Workhorse function that does actual conversion (calls _converter() method). - * - * @access private - * @return void - */ - function _convert() - { - // Variables used for building the link list - $this->_link_list = array(); - - $text = trim(stripslashes($this->html)); - - // Convert HTML to TXT - $this->_converter($text); - - // Add link list - if (!empty($this->_link_list)) { - $text .= "\n\nLinks:\n------\n"; - foreach ($this->_link_list as $idx => $url) { - $text .= '[' . ($idx+1) . '] ' . $url . "\n"; - } - } - - $this->text = $text; - - $this->_converted = true; - } - - /** - * Workhorse function that does actual conversion. - * - * First performs custom tag replacement specified by $search and - * $replace arrays. Then strips any remaining HTML tags, reduces whitespace - * and newlines to a readable format, and word wraps the text to - * $width characters. - * - * @param string Reference to HTML content string - * - * @access private - * @return void - */ - function _converter(&$text) - { - // Convert <BLOCKQUOTE> (before PRE!) - $this->_convert_blockquotes($text); - - // Convert <PRE> - $this->_convert_pre($text); - - // Run our defined tags search-and-replace - $text = preg_replace($this->search, $this->replace, $text); - - // Run our defined tags search-and-replace with callback - $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text); - - // Strip any other HTML tags - $text = strip_tags($text, $this->allowed_tags); - - // Run our defined entities/characters search-and-replace - $text = preg_replace($this->ent_search, $this->ent_replace, $text); - - // Replace known html entities - $text = html_entity_decode($text, ENT_QUOTES, $this->charset); - - // Remove unknown/unhandled entities (this cannot be done in search-and-replace block) - $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text); - - // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities - // This properly handles situation of "&quot;" in input string - $text = str_replace('|+|amp|+|', '&', $text); - - // Bring down number of empty lines to 2 max - $text = preg_replace("/\n\s+\n/", "\n\n", $text); - $text = preg_replace("/[\n]{3,}/", "\n\n", $text); - - // remove leading empty lines (can be produced by eg. P tag on the beginning) - $text = ltrim($text, "\n"); - - // Wrap the text to a readable format - // for PHP versions >= 4.0.2. Default width is 75 - // If width is 0 or less, don't wrap the text. - if ( $this->width > 0 ) { - $text = wordwrap($text, $this->width); - } - } - - /** - * Helper function called by preg_replace() on link replacement. - * - * Maintains an internal list of links to be displayed at the end of the - * text, with numeric indices to the original point in the text they - * appeared. Also makes an effort at identifying and handling absolute - * and relative links. - * - * @param string $link URL of the link - * @param string $display Part of the text to associate number with - * @access private - * @return string - */ - function _build_link_list( $link, $display ) - { - if (!$this->_do_links || empty($link)) { - return $display; - } - - // Ignored link types - if (preg_match('!^(javascript:|mailto:|#)!i', $link)) { - return $display; - } - - if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) { - $url = $link; - } - else { - $url = $this->url; - if (substr($link, 0, 1) != '/') { - $url .= '/'; - } - $url .= "$link"; - } - - if (($index = array_search($url, $this->_link_list)) === false) { - $index = count($this->_link_list); - $this->_link_list[] = $url; - } - - return $display . ' [' . ($index+1) . ']'; - } - - /** - * Helper function for PRE body conversion. - * - * @param string HTML content - * @access private - */ - function _convert_pre(&$text) - { - // get the content of PRE element - while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { - $this->pre_content = $matches[1]; - - // Run our defined tags search-and-replace with callback - $this->pre_content = preg_replace_callback($this->callback_search, - array('html2text', '_preg_callback'), $this->pre_content); - - // convert the content - $this->pre_content = sprintf('<div><br>%s<br></div>', - preg_replace($this->pre_search, $this->pre_replace, $this->pre_content)); - - // replace the content (use callback because content can contain $0 variable) - $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', - array('html2text', '_preg_pre_callback'), $text, 1); - - // free memory - $this->pre_content = ''; - } - } - - /** - * Helper function for BLOCKQUOTE body conversion. - * - * @param string HTML content - * @access private - */ - function _convert_blockquotes(&$text) - { - if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) { - $level = 0; - $diff = 0; - foreach ($matches[0] as $m) { - if ($m[0][0] == '<' && $m[0][1] == '/') { - $level--; - if ($level < 0) { - $level = 0; // malformed HTML: go to next blockquote - } - else if ($level > 0) { - // skip inner blockquote - } - else { - $end = $m[1]; - $len = $end - $taglen - $start; - // Get blockquote content - $body = substr($text, $start + $taglen - $diff, $len); - - // Set text width - $p_width = $this->width; - if ($this->width > 0) $this->width -= 2; - // Convert blockquote content - $body = trim($body); - $this->_converter($body); - // Add citation markers and create PRE block - $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body)); - $body = '<pre>' . htmlspecialchars($body) . '</pre>'; - // Re-set text width - $this->width = $p_width; - // Replace content - $text = substr($text, 0, $start - $diff) - . $body . substr($text, $end + strlen($m[0]) - $diff); - - $diff = $len + $taglen + strlen($m[0]) - strlen($body); - unset($body); - } - } - else { - if ($level == 0) { - $start = $m[1]; - $taglen = strlen($m[0]); - } - $level ++; - } - } - } - } - - /** - * Callback function for preg_replace_callback use. - * - * @param array PREG matches - * @return string - */ - private function _preg_callback($matches) - { - switch (strtolower($matches[1])) { - case 'b': - case 'strong': - return $this->_toupper($matches[3]); - case 'th': - return $this->_toupper("\t\t". $matches[3] ."\n"); - case 'h': - return $this->_toupper("\n\n". $matches[3] ."\n\n"); - case 'a': - // Remove spaces in URL (#1487805) - $url = str_replace(' ', '', $matches[3]); - return $this->_build_link_list($url, $matches[4]); - } - } - - /** - * Callback function for preg_replace_callback use in PRE content handler. - * - * @param array PREG matches - * @return string - */ - private function _preg_pre_callback($matches) - { - return $this->pre_content; - } - - /** - * Strtoupper function with HTML tags and entities handling. - * - * @param string $str Text to convert - * @return string Converted text - */ - private function _toupper($str) - { - // string can containg HTML tags - $chunks = preg_split('/(<[^>]*>)/', $str, null, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); - - // convert toupper only the text between HTML tags - foreach ($chunks as $idx => $chunk) { - if ($chunk[0] != '<') { - $chunks[$idx] = $this->_strtoupper($chunk); - } - } - - return implode($chunks); - } - - /** - * Strtoupper multibyte wrapper function with HTML entities handling. - * - * @param string $str Text to convert - * @return string Converted text - */ - private function _strtoupper($str) - { - $str = html_entity_decode($str, ENT_COMPAT, $this->charset); - - if (function_exists('mb_strtoupper')) - $str = mb_strtoupper($str); - else - $str = strtoupper($str); - - $str = htmlspecialchars($str, ENT_COMPAT, $this->charset); - - return $str; - } -} |