diff options
Diffstat (limited to 'program/lib/Roundcube/rcube_washtml.php')
-rw-r--r-- | program/lib/Roundcube/rcube_washtml.php | 94 |
1 files changed, 87 insertions, 7 deletions
diff --git a/program/lib/Roundcube/rcube_washtml.php b/program/lib/Roundcube/rcube_washtml.php index e7467545f..51f7930aa 100644 --- a/program/lib/Roundcube/rcube_washtml.php +++ b/program/lib/Roundcube/rcube_washtml.php @@ -184,7 +184,7 @@ class rcube_washtml '|rgb\(\s*[0-9]+\s*,\s*[0-9]+\s*,\s*[0-9]+\s*\)'. '|-?[0-9.]+\s*(em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%)?'. '|#[0-9a-f]{3,6}'. - '|[a-z0-9", -]+'. + '|[a-z0-9"\', -]+'. ')\s*/i', $str, $match) ) { if ($match[2]) { @@ -418,7 +418,7 @@ class rcube_washtml $html = preg_replace($html_search, $html_replace, trim($html)); //-> Replace all of those weird MS Word quotes and other high characters - $badwordchars=array( + $badwordchars = array( "\xe2\x80\x98", // left single quote "\xe2\x80\x99", // right single quote "\xe2\x80\x9c", // left double quote @@ -426,7 +426,7 @@ class rcube_washtml "\xe2\x80\x94", // em dash "\xe2\x80\xa6" // elipses ); - $fixedwordchars=array( + $fixedwordchars = array( "'", "'", '"', @@ -434,7 +434,7 @@ class rcube_washtml '—', '...' ); - $html = str_replace($badwordchars,$fixedwordchars, $html); + $html = str_replace($badwordchars, $fixedwordchars, $html); // PCRE errors handling (#1486856), should we use something like for every preg_* use? if ($html === null && ($preg_error = preg_last_error()) != PREG_NO_ERROR) { @@ -455,13 +455,16 @@ class rcube_washtml } // fix (unknown/malformed) HTML tags before "wash" - $html = preg_replace_callback('/(<(?!\!)[\/]*)([^\s>]+)/', array($this, 'html_tag_callback'), $html); + $html = preg_replace_callback('/(<(?!\!)[\/]*)([^\s>]+)([^>]*)/', array($this, 'html_tag_callback'), $html); // Remove invalid HTML comments (#1487759) // Don't remove valid conditional comments // Don't remove MSOutlook (<!-->) conditional comments (#1489004) $html = preg_replace('/<!--[^->\[\n]+>/', '', $html); + // fix broken nested lists + self::fix_broken_lists($html); + // turn relative into absolute urls $html = self::resolve_base($html); @@ -479,7 +482,12 @@ class rcube_washtml '/[^a-z0-9_\[\]\!-]/i', // forbidden characters ), '', $tagname); - return $matches[1] . $tagname; + // fix invalid closing tags - remove any attributes (#1489446) + if ($matches[1] == '</') { + $matches[3] = ''; + } + + return $matches[1] . $tagname . $matches[3]; } /** @@ -495,5 +503,77 @@ class rcube_washtml return $body; } -} + /** + * Fix broken nested lists, they are not handled properly by DOMDocument (#1488768) + */ + public static function fix_broken_lists(&$html) + { + // do two rounds, one for <ol>, one for <ul> + foreach (array('ol', 'ul') as $tag) { + $pos = 0; + while (($pos = stripos($html, '<' . $tag, $pos)) !== false) { + $pos++; + + // make sure this is an ol/ul tag + if (!in_array($html[$pos+2], array(' ', '>'))) { + continue; + } + + $p = $pos; + $in_li = false; + $li_pos = 0; + + while (($p = strpos($html, '<', $p)) !== false) { + $tt = strtolower(substr($html, $p, 4)); + + // li open tag + if ($tt == '<li>' || $tt == '<li ') { + $in_li = true; + $p += 4; + } + // li close tag + else if ($tt == '</li' && in_array($html[$p+4], array(' ', '>'))) { + $li_pos = $p; + $p += 4; + $in_li = false; + } + // ul/ol closing tag + else if ($tt == '</' . $tag && in_array($html[$p+4], array(' ', '>'))) { + break; + } + // nested ol/ul element out of li + else if (!$in_li && $li_pos && ($tt == '<ol>' || $tt == '<ol ' || $tt == '<ul>' || $tt == '<ul ')) { + // find closing tag of this ul/ol element + $element = substr($tt, 1, 2); + $cpos = $p; + do { + $tpos = stripos($html, '<' . $element, $cpos+1); + $cpos = stripos($html, '</' . $element, $cpos+1); + } + while ($tpos !== false && $cpos !== false && $cpos > $tpos); + + // not found, this is invalid HTML, skip it + if ($cpos === false) { + break; + } + + // get element content + $end = strpos($html, '>', $cpos); + $len = $end - $p + 1; + $element = substr($html, $p, $len); + + // move element to the end of the last li + $html = substr_replace($html, '', $p, $len); + $html = substr_replace($html, $element, $li_pos, 0); + + $p = $end; + } + else { + $p++; + } + } + } + } + } +} |