Fix rcube_utils::normalize_string() to support unicode characters + add argument for minimum token length

author: Thomas Bruederli <bruederli@kolabsys.com> 2014-12-28 16:22:08 +0100
committer: Thomas Bruederli <bruederli@kolabsys.com> 2014-12-28 16:22:08 +0100
commit: e8b82c2e7b0ae2e5d45ecb600813b8990568feb9 (patch)
tree: 9522b21d1161fdb9ae37e1a8572b3d3670836b96
parent: 09c58d1adde92a60a3e7cd67f4e66c8b1a56be6a (diff)
2 files changed, 42 insertions, 12 deletions
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index 2e4aa323e..f4c0e90ca 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -912,14 +912,21 @@ class rcube_utils
      * Split the given string into word tokens
      *
      * @param string Input to tokenize
+     * @param integer Minimum length of a single token
      * @return array List of tokens
      */
-    public static function tokenize_string($str)
+    public static function tokenize_string($str, $minlen = 2)
     {
-        return explode(" ", preg_replace(
-            array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'),
-            array(' ', '\\1\\2', ' '),
-            $str));
+        $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u');
+        $repl = array(' ', '\\1\\2');
+
+        if ($minlen > 1) {
+            $minlen--;
+            $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u";
+            $repl[] = ' ';
+        }
+
+        return array_filter(explode(" ", preg_replace($expr, $repl, $str)));
     }
 
     /**
@@ -928,10 +935,11 @@ class rcube_utils
      *
      * @param string  Input string (UTF-8)
      * @param boolean True to return list of words as array
+     * @param integer Minimum length of tokens
      *
      * @return mixed  Normalized string or a list of normalized tokens
      */
-    public static function normalize_string($str, $as_array = false)
+    public static function normalize_string($str, $as_array = false, $minlen = 2)
     {
         // replace 4-byte unicode characters with '?' character,
         // these are not supported in default utf-8 charset on mysql,
@@ -943,7 +951,7 @@ class rcube_utils
             . ')/', '?', $str);
 
         // split by words
-        $arr = self::tokenize_string($str);
+        $arr = self::tokenize_string($str, $minlen);
 
         // detect character set
         if (utf8_encode(utf8_decode($str)) == $str) {
diff --git a/tests/Framework/Utils.php b/tests/Framework/Utils.php
index 1cd9e7429..f7389f0d7 100644
--- a/tests/Framework/Utils.php
+++ b/tests/Framework/Utils.php
@@ -344,6 +344,25 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
     }
 
     /**
+     * rcube_utils::tokenize_string()
+     */
+    function test_tokenize_string()
+    {
+        $test = array(
+            ''        => array(),
+            'abc d'   => array('abc'),
+            'abc de'  => array('abc','de'),
+            'äàé;êöü-xyz' => array('äàé','êöü','xyz'),
+            '日期格式' => array('日期格式'),
+        );
+
+        foreach ($test as $input => $output) {
+            $result = rcube_utils::tokenize_string($input);
+            $this->assertSame($output, $result);
+        }
+    }
+
+    /**
      * rcube:utils::normalize_string()
      */
     function test_normalize_string()
@@ -353,15 +372,18 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
             'abc def' => 'abc def',
             'ÇçäâàåæéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy',
             'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżýĄŚŻŹĆ' => 'aaaaccceeeeiilllnnooorrsssttuuuuzzzyaszzc',
-            'ß'  => 'ss',
-            'ae' => 'a',
-            'oe' => 'o',
-            'ue' => 'u',
+            'ß'   => '',
+            'ßs'  => 'sss',
+            'Xae' => 'xa',
+            'Xoe' => 'xo',
+            'Xue' => 'xu',
+            '项目' => '项目',
+            '日'   => '',  // FIXME: this should not be stripped although minlen = 2
         );
 
         foreach ($test as $input => $output) {
             $result = rcube_utils::normalize_string($input);
-            $this->assertSame($output, $result);
+            $this->assertSame($output, $result, "Error normalizing '$input'");
         }
     }
author	Thomas Bruederli <bruederli@kolabsys.com>	2014-12-28 16:22:08 +0100
committer	Thomas Bruederli <bruederli@kolabsys.com>	2014-12-28 16:22:08 +0100
commit	e8b82c2e7b0ae2e5d45ecb600813b8990568feb9 (patch)
tree	9522b21d1161fdb9ae37e1a8572b3d3670836b96
parent	09c58d1adde92a60a3e7cd67f4e66c8b1a56be6a (diff)