summaryrefslogtreecommitdiff
path: root/program/include
diff options
context:
space:
mode:
Diffstat (limited to 'program/include')
-rw-r--r--program/include/rcube_shared.inc67
1 files changed, 66 insertions, 1 deletions
diff --git a/program/include/rcube_shared.inc b/program/include/rcube_shared.inc
index 85a6105b0..d60291012 100644
--- a/program/include/rcube_shared.inc
+++ b/program/include/rcube_shared.inc
@@ -554,7 +554,6 @@ function rc_mime_content_type($path, $name, $failover = 'application/octet-strea
return $mime_type;
}
-
/**
* A method to guess encoding of a string.
*
@@ -585,6 +584,72 @@ function rc_detect_encoding($string, $failover='')
return $result ? $result : $failover;
}
/**
 * Removes invalid UTF-8 byte sequences from input
 *
 * Arrays are processed recursively (values only, keys are left untouched).
 * Values that are neither string nor array are returned unchanged.
 *
 * @param mixed $input String or array.
 * @return mixed Cleaned string, array with cleaned values, or the
 *               unmodified input if it is neither string nor array
 */
function rc_utf8_clean($input)
{
    // handle input of type array
    if (is_array($input)) {
        foreach ($input as $idx => $val)
            $input[$idx] = rc_utf8_clean($val);
        return $input;
    }

    if (!is_string($input) || $input === '')
        return $input;

    // iconv is much faster than the byte loop below, so try it first.
    // Some iconv implementations return false (and raise a notice) when
    // the input contains invalid bytes, even with //IGNORE. The notice is
    // suppressed on purpose and we fall through to the manual cleanup.
    if (function_exists('iconv')) {
        $res = @iconv('UTF-8', 'UTF-8//IGNORE', $input);
        if ($res !== false)
            return $res;
    }

    // well-formed multibyte sequences (single-byte characters are
    // handled directly in the loop)
    $regexp = '/^('.
        '[\xC2-\xDF][\x80-\xBF]'.                       // UTF8-2
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.                  // UTF8-3
        '|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
        '|\xED[\x80-\x9F][\x80-\xBF]'.                  // UTF8-3
        '|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
        '|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
        '|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
        '|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
        ')$/';

    $seq = ''; // pending (still incomplete) multibyte sequence
    $out = '';

    for ($i = 0, $len = strlen($input); $i < $len; $i++) {
        $chr = $input[$i];
        $ord = ord($chr);

        // 1-byte character
        if ($ord <= 0x7F) {
            // complete sequences are flushed as soon as they match,
            // so whatever is still pending here is invalid
            $seq = '';
            $out .= $chr;
        // lead byte: start a new sequence, dropping any unfinished one
        } else if ($ord >= 0xC0) {
            $seq = $chr;
        // continuation byte of a started sequence
        } else if ($seq !== '') {
            $seq .= $chr;
            if (preg_match($regexp, $seq)) {
                $out .= $seq;
                $seq = '';
            } else if (strlen($seq) >= 4) {
                // no valid sequence is longer than 4 bytes
                $seq = '';
            }
        }
        // a continuation byte without a lead byte is silently dropped
    }

    // a sequence still pending at the end of input is incomplete
    return $out;
}
/**
* Explode quoted string