In normalize_string(), replace 4-byte Unicode characters with the '?' character.

These are not supported by the default utf8 charset in MySQL, and the chance we'd need them in searching is very low.
author: Aleksander Machniak <alec@alec.pl> 2013-12-12 08:58:54 +0100
committer: Aleksander Machniak <alec@alec.pl> 2013-12-12 08:58:54 +0100
commit: d19c0f9f309cbe63411a8ddcbbda3daf7461a30d (patch)
tree: 4b5a72bb04d3cb16296d7ab8c5a3218fddaaf9e6 /program/lib
parent: 7eecf873da8d2f28e20dc8fd0e949e6abc5762b4 (diff)
1 files changed, 10 insertions, 0 deletions
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index 27a618d83..db41a6e86 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -912,10 +912,20 @@ class rcube_utils
      *
      * @param string  Input string (UTF-8)
      * @param boolean True to return list of words as array
+     *
      * @return mixed  Normalized string or a list of normalized tokens
      */
     public static function normalize_string($str, $as_array = false)
     {
+        // replace 4-byte unicode characters with '?' character,
+        // these are not supported in default utf-8 charset on mysql,
+        // the chance we'd need them in searching is very low
+        $str = preg_replace('/('
+            . '\xF0[\x90-\xBF][\x80-\xBF]{2}'
+            . '|[\xF1-\xF3][\x80-\xBF]{3}'
+            . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'
+            . ')/', '?', $str);
+
         // split by words
         $arr = self::tokenize_string($str);
author	Aleksander Machniak <alec@alec.pl>	2013-12-12 08:58:54 +0100
committer	Aleksander Machniak <alec@alec.pl>	2013-12-12 08:58:54 +0100
commit	d19c0f9f309cbe63411a8ddcbbda3daf7461a30d (patch)
tree	4b5a72bb04d3cb16296d7ab8c5a3218fddaaf9e6 /program/lib
parent	7eecf873da8d2f28e20dc8fd0e949e6abc5762b4 (diff)