From 11bcac5802dbdd01ee37b97e84f9a91c5777d9e6 Mon Sep 17 00:00:00 2001
From: alecpl
Date: Wed, 29 Sep 2010 13:32:41 +0000
Subject: - Fix HTML to plain text conversion doesn't handle citation blocks (#1486921)

---
 program/lib/html2text.php | 131 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 29 deletions(-)

(limited to 'program/lib')

diff --git a/program/lib/html2text.php b/program/lib/html2text.php
index 9c5267f62..aa5df0eab 100644
--- a/program/lib/html2text.php
+++ b/program/lib/html2text.php
    @@ -201,7 +201,7 @@ class html2text
             "\t* \\1\n",                            // <li> and </li>
             "\n\t* ",                               // <li>
             "\n-------------------------\n",        // <hr>
    -        "<div>\n",                              // <div>
    +        "<div>\n",                              // <div>
             "\n\n",                                 // <table> and </table>
             "\n",                                   // <tr> and </tr>
             "\t\t\\1\n",                            // <td> and </td>
    @@ -445,12 +445,7 @@ class html2text
         }
     
         /**
    -     *  Workhorse function that does actual conversion.
    -     *
    -     *  First performs custom tag replacement specified by $search and
    -     *  $replace arrays. Then strips any remaining HTML tags, reduces whitespace
    -     *  and newlines to a readable format, and word wraps the text to
    -     *  $width characters.
    +     *  Workhorse function that does actual conversion (calls _converter() method).
          *
          *  @access private
          *  @return void
    @@ -463,6 +458,37 @@ class html2text
     
             $text = trim(stripslashes($this->html));
     
    +        // Convert HTML to TXT
    +        $this->_converter($text);
    +
    +        // Add link list
    +        if ( !empty($this->_link_list) ) {
    +            $text .= "\n\nLinks:\n------\n" . $this->_link_list;
    +        }
    +
    +        $this->text = $text;
    +
    +        $this->_converted = true;
    +    }
    +
    +    /**
    +     *  Workhorse function that does actual conversion.
    +     *
    +     *  First performs custom tag replacement specified by $search and
    +     *  $replace arrays. Then strips any remaining HTML tags, reduces whitespace
    +     *  and newlines to a readable format, and word wraps the text to
    +     *  $width characters.
    +     *
    +     *  @param string Reference to HTML content string
    +     *
    +     *  @access private
    +     *  @return void
    +     */
    +    function _converter(&$text)
    +    {
    +        // Convert <BLOCKQUOTE> (before PRE!)
    +        $this->_convert_blockquotes($text);
    +
             // Convert <PRE>
             $this->_convert_pre($text);
     
    @@ -485,21 +511,12 @@ class html2text
             $text = preg_replace("/\n\s+\n/", "\n\n", $text);
             $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
     
    -        // Add link list
    -        if ( !empty($this->_link_list) ) {
    -            $text .= "\n\nLinks:\n------\n" . $this->_link_list;
    -        }
    -
             // Wrap the text to a readable format
             // for PHP versions >= 4.0.2. Default width is 75
             // If width is 0 or less, don't wrap the text.
             if ( $this->width > 0 ) {
             	$text = wordwrap($text, $this->width);
             }
    -
    -        $this->text = $text;
    -
    -        $this->_converted = true;
         }
     
         /**
    @@ -517,20 +534,22 @@ class html2text
          */
         function _build_link_list( $link, $display )
         {
    -	if ( !$this->_do_links ) return $display;
    -	
    -	if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' ||
    -             substr($link, 0, 7) == 'mailto:' ) {
    +	    if ( !$this->_do_links )
    +	        return $display;
    +
    +	    if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' ||
    +            substr($link, 0, 7) == 'mailto:'
    +        ) {
                 $this->_link_count++;
    -            $this->_link_list .= "[" . $this->_link_count . "] $link\n";
    +            $this->_link_list .= '[' . $this->_link_count . "] $link\n";
                 $additional = ' [' . $this->_link_count . ']';
    -	} elseif ( substr($link, 0, 11) == 'javascript:' ) {
    -		// Don't count the link; ignore it
    -		$additional = '';
    +	    } elseif ( substr($link, 0, 11) == 'javascript:' ) {
    +		    // Don't count the link; ignore it
    +		    $additional = '';
     		// what about href="#anchor" ?
             } else {
                 $this->_link_count++;
    -            $this->_link_list .= "[" . $this->_link_count . "] " . $this->url;
    +            $this->_link_list .= '[' . $this->_link_count . '] ' . $this->url;
                 if ( substr($link, 0, 1) != '/' ) {
                     $this->_link_list .= '/';
                 }
    @@ -540,7 +559,7 @@ class html2text
     
             return $display . $additional;
         }
    -    
    +
         /**
          *  Helper function for PRE body conversion.
          *
    @@ -549,12 +568,68 @@ class html2text
          */
         function _convert_pre(&$text)
         {
    -        while(preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
    +        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
                 $result = preg_replace($this->pre_search, $this->pre_replace, $matches[1]);
                 $text = preg_replace('/<pre[^>]*>.*<\/pre>/ismU', '<div><br>' . $result . '<br></div>', $text, 1);
             }
         }
     
    +    /**
    +     *  Helper function for BLOCKQUOTE body conversion.
    +     *
    +     *  @param string HTML content
    +     *  @access private
    +     */
    +    function _convert_blockquotes(&$text)
    +    {
    +        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
    +            $level = 0;
    +            $diff = 0;
    +            foreach ($matches[0] as $m) {
    +                if ($m[0][0] == '<' && $m[0][1] == '/') {
    +                    $level--;
    +                    if ($level < 0) {
    +                        $level = 0; // malformed HTML: go to next blockquote
    +                    }
    +                    else if ($level > 0) {
    +                        // skip inner blockquote
    +                    }
    +                    else {
    +                        $end = $m[1];
    +                        $len = $end - $taglen - $start;
    +                        // Get blockquote content
    +                        $body = substr($text, $start + $taglen - $diff, $len);
    +
    +                        // Set text width
    +                        $p_width = $this->width;
    +                        if ($this->width > 0) $this->width -= 2;
    +                        // Convert blockquote content
    +                        $body = trim($body);
    +                        $this->_converter($body);
    +                        // Add citation markers and create PRE block
    +                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
    +                        $body = '<pre>' . htmlspecialchars($body) . '</pre>';
    +                        // Re-set text width
    +                        $this->width = $p_width;
    +                        // Replace content
    +                        $text = substr($text, 0, $start - $diff)
    +                            . $body . substr($text, $end + strlen($m[0]) - $diff);
    +
    +                        $diff = $len + $taglen + strlen($m[0]) - strlen($body);
    +                        unset($body);
    +                    }
    +                }
    +                else {
    +                    if ($level == 0) {
    +                        $start = $m[1];
    +                        $taglen = strlen($m[0]);
    +                    }
    +                    $level ++;
    +                }
    +            }
    +        }
    +    }
    +
         /**
          *  Callback function for preg_replace_callback use.
          *
    @@ -592,5 +667,3 @@ class html2text
             return strtoupper($str);
         }
     }
    -
    -?>
    --
    cgit v1.2.3