diff options
author | Aleksander Machniak <alec@alec.pl> | 2012-06-08 08:53:07 +0200 |
---|---|---|
committer | Aleksander Machniak <alec@alec.pl> | 2012-06-08 08:59:10 +0200 |
commit | 51809bd66c3dc08fb578950a0394c8603a419b9e (patch) | |
tree | c8cef6c331d546a9634fecdade23a01371456c03 /program | |
parent | 49db5234109c4e46e82b0ece8b0f8da58ce5ea69 (diff) |
Fix handling of links with various URI schemes e.g. "skype:" (#1488106)
Fix handling of links inside PRE elements on html to text conversion
Fix indexing of links on html to text conversion
Conflicts:
CHANGELOG
Diffstat (limited to 'program')
-rw-r--r-- | program/lib/html2text.php | 37 | ||||
-rw-r--r-- | program/lib/washtml.php | 2 |
2 files changed, 23 insertions, 16 deletions
diff --git a/program/lib/html2text.php b/program/lib/html2text.php index 84a7374b3..9de2e961e 100644 --- a/program/lib/html2text.php +++ b/program/lib/html2text.php @@ -249,12 +249,11 @@ class html2text * @access public */ var $callback_search = array( - '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', - // <a href=""> - '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i', // H1 - H3 - '/<(b)[^>]*>(.*?)<\/b>/i', // <b> - '/<(strong)[^>]*>(.*?)<\/strong>/i', // <strong> - '/<(th)[^>]*>(.*?)<\/th>/i', // <th> and </th> + '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href=""> + '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6 + '/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b> + '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong> + '/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th> ); /** @@ -368,7 +367,7 @@ class html2text function set_html( $source, $from_file = false ) { if ( $from_file && file_exists($source) ) { - $this->html = file_get_contents($source); + $this->html = file_get_contents($source); } else $this->html = $source; @@ -560,11 +559,11 @@ class html2text } // Ignored link types - if (preg_match('!^(javascript|mailto|#):!i', $link)) { + if (preg_match('!^(javascript:|mailto:|#)!i', $link)) { return $display; } - if (preg_match('!^(https?://)!i', $link)) { + if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) { $url = $link; } else { @@ -576,8 +575,8 @@ class html2text } if (($index = array_search($url, $this->_link_list)) === false) { - $this->_link_list[] = $url; $index = count($this->_link_list); + $this->_link_list[] = $url; } return $display . ' [' . ($index+1) . ']'; @@ -593,12 +592,20 @@ class html2text { // get the content of PRE element while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { + $this->pre_content = $matches[1]; + + // Run our defined tags search-and-replace with callback + $this->pre_content = preg_replace_callback($this->callback_search, + array('html2text', '_preg_callback'), $this->pre_content); + // convert the content $this->pre_content = sprintf('<div><br>%s<br></div>', - preg_replace($this->pre_search, $this->pre_replace, $matches[1])); + preg_replace($this->pre_search, $this->pre_replace, $this->pre_content)); + // replace the content (use callback because content can contain $0 variable) - $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', + $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU', array('html2text', '_preg_pre_callback'), $text, 1); + // free memory $this->pre_content = ''; } @@ -671,11 +678,11 @@ class html2text switch (strtolower($matches[1])) { case 'b': case 'strong': - return $this->_toupper($matches[2]); + return $this->_toupper($matches[3]); case 'th': - return $this->_toupper("\t\t". $matches[2] ."\n"); + return $this->_toupper("\t\t". $matches[3] ."\n"); case 'h': - return $this->_toupper("\n\n". $matches[2] ."\n\n"); + return $this->_toupper("\n\n". $matches[3] ."\n\n"); case 'a': // Remove spaces in URL (#1487805) $url = str_replace(' ', '', $matches[3]); diff --git a/program/lib/washtml.php b/program/lib/washtml.php index 4221abdef..6ea59f03f 100644 --- a/program/lib/washtml.php +++ b/program/lib/washtml.php @@ -202,7 +202,7 @@ class washtml $key = strtolower($key); $value = $node->getAttribute($key); if (isset($this->_html_attribs[$key]) || - ($key == 'href' && preg_match('!^(http:|https:|ftp:|mailto:|//|#).+!i', $value))) + ($key == 'href' && preg_match('!^([a-z][a-z0-9.+-]+:|//|#).+!i', $value))) $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"'; else if ($key == 'style' && ($style = $this->wash_style($value))) { $quot = strpos($style, '"') !== false ? "'" : '"'; |