From 51809bd66c3dc08fb578950a0394c8603a419b9e Mon Sep 17 00:00:00 2001 From: Aleksander Machniak Date: Fri, 8 Jun 2012 08:53:07 +0200 Subject: Fix handling of links with various URI schemes e.g. "skype:" (#1488106) Fix handling of links inside PRE elements on html to text conversion Fix indexing of links on html to text conversion Conflicts: CHANGELOG --- program/lib/html2text.php | 37 ++++++++++++++++++++++--------------- program/lib/washtml.php | 2 +- 2 files changed, 23 insertions(+), 16 deletions(-) (limited to 'program/lib') diff --git a/program/lib/html2text.php b/program/lib/html2text.php index 84a7374b3..9de2e961e 100644 --- a/program/lib/html2text.php +++ b/program/lib/html2text.php @@ -249,12 +249,11 @@ class html2text * @access public */ var $callback_search = array( - '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', - // - '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i', // H1 - H3 - '/<(b)[^>]*>(.*?)<\/b>/i', // - '/<(strong)[^>]*>(.*?)<\/strong>/i', // - '/<(th)[^>]*>(.*?)<\/th>/i', // and + '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // + '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6 + '/<(b)( [^>]*)?>(.*?)<\/b>/i', // + '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // + '/<(th)( [^>]*)?>(.*?)<\/th>/i', // and ); /** @@ -368,7 +367,7 @@ class html2text function set_html( $source, $from_file = false ) { if ( $from_file && file_exists($source) ) { - $this->html = file_get_contents($source); + $this->html = file_get_contents($source); } else $this->html = $source; @@ -560,11 +559,11 @@ class html2text } // Ignored link types - if (preg_match('!^(javascript|mailto|#):!i', $link)) { + if (preg_match('!^(javascript:|mailto:|#)!i', $link)) { return $display; } - if (preg_match('!^(https?://)!i', $link)) { + if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) { $url = $link; } else { @@ -576,8 +575,8 @@ class html2text } if (($index = array_search($url, $this->_link_list)) === false) { - $this->_link_list[] = $url; $index = 
count($this->_link_list); + $this->_link_list[] = $url; } return $display . ' [' . ($index+1) . ']'; @@ -593,12 +592,20 @@ class html2text { // get the content of PRE element while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { + $this->pre_content = $matches[1]; + + // Run our defined tags search-and-replace with callback + $this->pre_content = preg_replace_callback($this->callback_search, + array('html2text', '_preg_callback'), $this->pre_content); + // convert the content $this->pre_content = sprintf('<div><br>%s<br></div>', - preg_replace($this->pre_search, $this->pre_replace, $matches[1])); + preg_replace($this->pre_search, $this->pre_replace, $this->pre_content)); + // replace the content (use callback because content can contain $0 variable) - $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', + $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', array('html2text', '_preg_pre_callback'), $text, 1); + // free memory $this->pre_content = ''; } @@ -671,11 +678,11 @@ class html2text switch (strtolower($matches[1])) { case 'b': case 'strong': - return $this->_toupper($matches[2]); + return $this->_toupper($matches[3]); case 'th': - return $this->_toupper("\t\t". $matches[2] ."\n"); + return $this->_toupper("\t\t". $matches[3] ."\n"); case 'h': - return $this->_toupper("\n\n". $matches[2] ."\n\n"); + return $this->_toupper("\n\n". $matches[3] ."\n\n"); case 'a': // Remove spaces in URL (#1487805) $url = str_replace(' ', '', $matches[3]); diff --git a/program/lib/washtml.php b/program/lib/washtml.php index 4221abdef..6ea59f03f 100644 --- a/program/lib/washtml.php +++ b/program/lib/washtml.php @@ -202,7 +202,7 @@ class washtml $key = strtolower($key); $value = $node->getAttribute($key); if (isset($this->_html_attribs[$key]) || - ($key == 'href' && preg_match('!^(http:|https:|ftp:|mailto:|//|#).+!i', $value))) + ($key == 'href' && preg_match('!^([a-z][a-z0-9.+-]+:|//|#).+!i', $value))) $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"'; else if ($key == 'style' && ($style = $this->wash_style($value))) { $quot = strpos($style, '"') !== false ? "'" : '"'; -- cgit v1.2.3