diff options
author | Aleksander Machniak <alec@alec.pl> | 2013-12-12 08:58:54 +0100 |
---|---|---|
committer | Aleksander Machniak <alec@alec.pl> | 2013-12-12 08:58:54 +0100 |
commit | d19c0f9f309cbe63411a8ddcbbda3daf7461a30d (patch) | |
tree | 4b5a72bb04d3cb16296d7ab8c5a3218fddaaf9e6 /program | |
parent | 7eecf873da8d2f28e20dc8fd0e949e6abc5762b4 (diff) |
In normalize_string(), replace 4-byte Unicode characters with the '?' character.
These are not supported by the default utf-8 charset on MySQL,
and the chance we'd need them in searching is very low.
Diffstat (limited to 'program')
-rw-r--r-- | program/lib/Roundcube/rcube_utils.php | 10 |
1 file changed, 10 insertions, 0 deletions
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php index 27a618d83..db41a6e86 100644 --- a/program/lib/Roundcube/rcube_utils.php +++ b/program/lib/Roundcube/rcube_utils.php @@ -912,10 +912,20 @@ class rcube_utils * * @param string Input string (UTF-8) * @param boolean True to return list of words as array + * * @return mixed Normalized string or a list of normalized tokens */ public static function normalize_string($str, $as_array = false) { + // replace 4-byte unicode characters with '?' character, + // these are not supported in default utf-8 charset on mysql, + // the chance we'd need them in searching is very low + $str = preg_replace('/(' + . '\xF0[\x90-\xBF][\x80-\xBF]{2}' + . '|[\xF1-\xF3][\x80-\xBF]{3}' + . '|\xF4[\x80-\x8F][\x80-\xBF]{2}' + . ')/', '?', $str); + // split by words $arr = self::tokenize_string($str); |