summary | refs | log | tree | commit | diff
path: root/program
diff options
context:
space:
mode:
author Aleksander Machniak <alec@alec.pl> 2012-06-08 08:53:07 +0200
committer Aleksander Machniak <alec@alec.pl> 2012-06-08 08:53:07 +0200
commit 8c188058cf9281251cbac5cda43ef833843fd51b (patch)
tree f7ccf7a0ac1f04acec6b87da65267b1d393439ea /program
parent f8c96f737c1916377e361e3fbaa8a415c4101ca4 (diff)
Fix handling of links with various URI schemes e.g. "skype:" (#1488106)
Fix handling of links inside PRE elements on html to text conversion
Fix indexing of links on html to text conversion
Diffstat (limited to 'program')
-rw-r--r-- program/lib/html2text.php 37
-rw-r--r-- program/lib/washtml.php 2
2 files changed, 23 insertions, 16 deletions
diff --git a/program/lib/html2text.php b/program/lib/html2text.php
index 84a7374b3..9de2e961e 100644
--- a/program/lib/html2text.php
+++ b/program/lib/html2text.php
@@ -249,12 +249,11 @@ class html2text
* @access public
*/
var $callback_search = array(
- '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
- // <a href="">
- '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i', // H1 - H3
- '/<(b)[^>]*>(.*?)<\/b>/i', // <b>
- '/<(strong)[^>]*>(.*?)<\/strong>/i', // <strong>
- '/<(th)[^>]*>(.*?)<\/th>/i', // <th> and </th>
+ '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href="">
+ '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6
+ '/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b>
+ '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong>
+ '/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th>
);
/**
@@ -368,7 +367,7 @@ class html2text
function set_html( $source, $from_file = false )
{
if ( $from_file && file_exists($source) ) {
- $this->html = file_get_contents($source);
+ $this->html = file_get_contents($source);
}
else
$this->html = $source;
@@ -560,11 +559,11 @@ class html2text
}
// Ignored link types
- if (preg_match('!^(javascript|mailto|#):!i', $link)) {
+ if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
return $display;
}
- if (preg_match('!^(https?://)!i', $link)) {
+ if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
$url = $link;
}
else {
@@ -576,8 +575,8 @@ class html2text
}
if (($index = array_search($url, $this->_link_list)) === false) {
- $this->_link_list[] = $url;
$index = count($this->_link_list);
+ $this->_link_list[] = $url;
}
return $display . ' [' . ($index+1) . ']';
@@ -593,12 +592,20 @@ class html2text
{
// get the content of PRE element
while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
+ $this->pre_content = $matches[1];
+
+ // Run our defined tags search-and-replace with callback
+ $this->pre_content = preg_replace_callback($this->callback_search,
+ array('html2text', '_preg_callback'), $this->pre_content);
+
// convert the content
$this->pre_content = sprintf('<div><br>%s<br></div>',
- preg_replace($this->pre_search, $this->pre_replace, $matches[1]));
+ preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));
+
// replace the content (use callback because content can contain $0 variable)
- $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
+ $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
array('html2text', '_preg_pre_callback'), $text, 1);
+
// free memory
$this->pre_content = '';
}
@@ -671,11 +678,11 @@ class html2text
switch (strtolower($matches[1])) {
case 'b':
case 'strong':
- return $this->_toupper($matches[2]);
+ return $this->_toupper($matches[3]);
case 'th':
- return $this->_toupper("\t\t". $matches[2] ."\n");
+ return $this->_toupper("\t\t". $matches[3] ."\n");
case 'h':
- return $this->_toupper("\n\n". $matches[2] ."\n\n");
+ return $this->_toupper("\n\n". $matches[3] ."\n\n");
case 'a':
// Remove spaces in URL (#1487805)
$url = str_replace(' ', '', $matches[3]);
diff --git a/program/lib/washtml.php b/program/lib/washtml.php
index 4221abdef..6ea59f03f 100644
--- a/program/lib/washtml.php
+++ b/program/lib/washtml.php
@@ -202,7 +202,7 @@ class washtml
$key = strtolower($key);
$value = $node->getAttribute($key);
if (isset($this->_html_attribs[$key]) ||
- ($key == 'href' && preg_match('!^(http:|https:|ftp:|mailto:|//|#).+!i', $value)))
+ ($key == 'href' && preg_match('!^([a-z][a-z0-9.+-]+:|//|#).+!i', $value)))
$t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"';
else if ($key == 'style' && ($style = $this->wash_style($value))) {
$quot = strpos($style, '"') !== false ? "'" : '"';