diff options
Diffstat (limited to 'program/include')
-rw-r--r-- | program/include/rcube_shared.inc | 67 |
1 files changed, 66 insertions, 1 deletions
diff --git a/program/include/rcube_shared.inc b/program/include/rcube_shared.inc index 85a6105b0..d60291012 100644 --- a/program/include/rcube_shared.inc +++ b/program/include/rcube_shared.inc @@ -554,7 +554,6 @@ function rc_mime_content_type($path, $name, $failover = 'application/octet-strea return $mime_type; } - /** * A method to guess encoding of a string. * @@ -585,6 +584,72 @@ function rc_detect_encoding($string, $failover='') return $result ? $result : $failover; } +/** + * Removes non-unicode characters from input + * + * @param mixed $input String or array. + * @return string + */ +function rc_utf8_clean($input) +{ + // handle input of type array + if (is_array($input)) { + foreach ($input as $idx => $val) + $input[$idx] = rc_utf8_clean($val); + return $input; + } + + if (!is_string($input)) + return $input; + + // iconv is 10x faster + if (function_exists('iconv')) + return iconv('UTF8', 'UTF8//IGNORE', $input); + + $regexp = '/^('. +// '[\x00-\x7F]'. // UTF8-1 + '|[\xC2-\xDF][\x80-\xBF]'. // UTF8-2 + '|\xE0[\xA0-\xBF][\x80-\xBF]'. // UTF8-3 + '|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'. // UTF8-3 + '|\xED[\x80-\x9F][\x80-\xBF]'. // UTF8-3 + '|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'. // UTF8-3 + '|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'. // UTF8-4 + '|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4 + '|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'. // UTF8-4 + ')$/'; + + $seq = ''; + $out = ''; + + for ($i = 0, $len = strlen($input)-1; $i < $len; $i++) { + $chr = $input[$i]; + $ord = ord($chr); + // 1-byte character + if ($ord <= 0x7F) { + if ($seq) + $out .= preg_match($regexp, $seq) ? $seq : ''; + $seq = ''; + $out .= $chr; + // first (or second) byte of multibyte sequence + } else if ($ord >= 0xC0) { + if (strlen($seq)>1) { + $out .= preg_match($regexp, $seq) ? $seq : ''; + $seq = ''; + } else if ($seq && ord($seq) < 0xC0) { + $seq = ''; + } + $seq .= $chr; + // next byte of multibyte sequence + } else if ($seq) { + $seq .= $chr; + } + } + + if ($seq) + $out .= preg_match($regexp, $seq) ? $seq : ''; + + return $out; +} /** * Explode quoted string |