diff options
author | alecpl <alec@alec.pl> | 2010-09-29 13:32:41 +0000 |
---|---|---|
committer | alecpl <alec@alec.pl> | 2010-09-29 13:32:41 +0000 |
commit | 11bcac5802dbdd01ee37b97e84f9a91c5777d9e6 (patch) | |
tree | 45e334d3d6204e6ef6133e0b80997f0c9951da24 /program | |
parent | b3660bbdc3a7dcae7873016f10ddc0b7c82e17b7 (diff) |
- Fix: HTML to plain text conversion doesn't handle citation blocks (#1486921)
Diffstat (limited to 'program')
-rw-r--r-- | program/lib/html2text.php | 131 | ||||
-rw-r--r-- | program/steps/utils/html2text.inc | 2 |
2 files changed, 103 insertions, 30 deletions
diff --git a/program/lib/html2text.php b/program/lib/html2text.php index 9c5267f62..aa5df0eab 100644 --- a/program/lib/html2text.php +++ b/program/lib/html2text.php @@ -201,7 +201,7 @@ class html2text "\t* \\1\n", // <li> and </li> "\n\t* ", // <li> "\n-------------------------\n", // <hr> - "<div>\n", // <div> + "<div>\n", // <div> "\n\n", // <table> and </table> "\n", // <tr> and </tr> "\t\t\\1\n", // <td> and </td> @@ -445,12 +445,7 @@ class html2text } /** - * Workhorse function that does actual conversion. - * - * First performs custom tag replacement specified by $search and - * $replace arrays. Then strips any remaining HTML tags, reduces whitespace - * and newlines to a readable format, and word wraps the text to - * $width characters. + * Workhorse function that does actual conversion (calls _converter() method). * * @access private * @return void @@ -463,6 +458,37 @@ class html2text $text = trim(stripslashes($this->html)); + // Convert HTML to TXT + $this->_converter($text); + + // Add link list + if ( !empty($this->_link_list) ) { + $text .= "\n\nLinks:\n------\n" . $this->_link_list; + } + + $this->text = $text; + + $this->_converted = true; + } + + /** + * Workhorse function that does actual conversion. + * + * First performs custom tag replacement specified by $search and + * $replace arrays. Then strips any remaining HTML tags, reduces whitespace + * and newlines to a readable format, and word wraps the text to + * $width characters. + * + * @param string Reference to HTML content string + * + * @access private + * @return void + */ + function _converter(&$text) + { + // Convert <BLOCKQUOTE> (before PRE!) + $this->_convert_blockquotes($text); + // Convert <PRE> $this->_convert_pre($text); @@ -485,21 +511,12 @@ class html2text $text = preg_replace("/\n\s+\n/", "\n\n", $text); $text = preg_replace("/[\n]{3,}/", "\n\n", $text); - // Add link list - if ( !empty($this->_link_list) ) { - $text .= "\n\nLinks:\n------\n" . 
$this->_link_list; - } - // Wrap the text to a readable format // for PHP versions >= 4.0.2. Default width is 75 // If width is 0 or less, don't wrap the text. if ( $this->width > 0 ) { $text = wordwrap($text, $this->width); } - - $this->text = $text; - - $this->_converted = true; } /** @@ -517,20 +534,22 @@ class html2text */ function _build_link_list( $link, $display ) { - if ( !$this->_do_links ) return $display; - - if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' || - substr($link, 0, 7) == 'mailto:' ) { + if ( !$this->_do_links ) + return $display; + + if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' || + substr($link, 0, 7) == 'mailto:' + ) { $this->_link_count++; - $this->_link_list .= "[" . $this->_link_count . "] $link\n"; + $this->_link_list .= '[' . $this->_link_count . "] $link\n"; $additional = ' [' . $this->_link_count . ']'; - } elseif ( substr($link, 0, 11) == 'javascript:' ) { - // Don't count the link; ignore it - $additional = ''; + } elseif ( substr($link, 0, 11) == 'javascript:' ) { + // Don't count the link; ignore it + $additional = ''; // what about href="#anchor" ? } else { $this->_link_count++; - $this->_link_list .= "[" . $this->_link_count . "] " . $this->url; + $this->_link_list .= '[' . $this->_link_count . '] ' . $this->url; if ( substr($link, 0, 1) != '/' ) { $this->_link_list .= '/'; } @@ -540,7 +559,7 @@ class html2text return $display . $additional; } - + /** * Helper function for PRE body conversion. * @@ -549,13 +568,69 @@ class html2text */ function _convert_pre(&$text) { - while(preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { + while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { $result = preg_replace($this->pre_search, $this->pre_replace, $matches[1]); $text = preg_replace('/<pre[^>]*>.*<\/pre>/ismU', '<div><br>' . $result . '<br></div>', $text, 1); } } /** + * Helper function for BLOCKQUOTE body conversion. 
+ * + * @param string HTML content + * @access private + */ + function _convert_blockquotes(&$text) + { + if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) { + $level = 0; + $diff = 0; + foreach ($matches[0] as $m) { + if ($m[0][0] == '<' && $m[0][1] == '/') { + $level--; + if ($level < 0) { + $level = 0; // malformed HTML: go to next blockquote + } + else if ($level > 0) { + // skip inner blockquote + } + else { + $end = $m[1]; + $len = $end - $taglen - $start; + // Get blockquote content + $body = substr($text, $start + $taglen - $diff, $len); + + // Set text width + $p_width = $this->width; + if ($this->width > 0) $this->width -= 2; + // Convert blockquote content + $body = trim($body); + $this->_converter($body); + // Add citation markers and create PRE block + $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body)); + $body = '<pre>' . htmlspecialchars($body) . '</pre>'; + // Re-set text width + $this->width = $p_width; + // Replace content + $text = substr($text, 0, $start - $diff) + . $body . substr($text, $end + strlen($m[0]) - $diff); + + $diff = $len + $taglen + strlen($m[0]) - strlen($body); + unset($body); + } + } + else { + if ($level == 0) { + $start = $m[1]; + $taglen = strlen($m[0]); + } + $level ++; + } + } + } + } + + /** * Callback function for preg_replace_callback use. * * @param array PREG matches @@ -592,5 +667,3 @@ class html2text return strtoupper($str); } } - -?> diff --git a/program/steps/utils/html2text.inc b/program/steps/utils/html2text.inc index f97e1c67f..ef74ec49e 100644 --- a/program/steps/utils/html2text.inc +++ b/program/steps/utils/html2text.inc @@ -22,7 +22,7 @@ $converter = new html2text($HTTP_RAW_POST_DATA); header('Content-Type: text/plain; charset=UTF-8'); -print trim($converter->get_text()); +print rtrim($converter->get_text()); exit; |