diff options
author | Thomas Bruederli <bruederli@kolabsys.com> | 2014-12-28 16:22:08 +0100 |
---|---|---|
committer | Thomas Bruederli <bruederli@kolabsys.com> | 2014-12-28 16:22:08 +0100 |
commit | e8b82c2e7b0ae2e5d45ecb600813b8990568feb9 (patch) | |
tree | 9522b21d1161fdb9ae37e1a8572b3d3670836b96 | |
parent | 09c58d1adde92a60a3e7cd67f4e66c8b1a56be6a (diff) |
Fix rcube_utils::normalize_string() to support unicode characters + add argument for minimum token length
-rw-r--r-- | program/lib/Roundcube/rcube_utils.php | 22 | ||||
-rw-r--r-- | tests/Framework/Utils.php | 32 |
2 files changed, 42 insertions, 12 deletions
```diff
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index 2e4aa323e..f4c0e90ca 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -912,14 +912,21 @@ class rcube_utils
      * Split the given string into word tokens
      *
      * @param string Input to tokenize
+     * @param integer Minimum length of a single token
      * @return array List of tokens
      */
-    public static function tokenize_string($str)
+    public static function tokenize_string($str, $minlen = 2)
     {
-        return explode(" ", preg_replace(
-            array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'),
-            array(' ', '\\1\\2', ' '),
-            $str));
+        $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u');
+        $repl = array(' ', '\\1\\2');
+
+        if ($minlen > 1) {
+            $minlen--;
+            $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u";
+            $repl[] = ' ';
+        }
+
+        return array_filter(explode(" ", preg_replace($expr, $repl, $str)));
     }
 
     /**
@@ -928,10 +935,11 @@ class rcube_utils
      *
      * @param string Input string (UTF-8)
      * @param boolean True to return list of words as array
+     * @param integer Minimum length of tokens
      *
      * @return mixed Normalized string or a list of normalized tokens
      */
-    public static function normalize_string($str, $as_array = false)
+    public static function normalize_string($str, $as_array = false, $minlen = 2)
     {
         // replace 4-byte unicode characters with '?' character,
         // these are not supported in default utf-8 charset on mysql,
@@ -943,7 +951,7 @@ class rcube_utils
             . ')/', '?', $str);
 
         // split by words
-        $arr = self::tokenize_string($str);
+        $arr = self::tokenize_string($str, $minlen);
 
         // detect character set
         if (utf8_encode(utf8_decode($str)) == $str) {
diff --git a/tests/Framework/Utils.php b/tests/Framework/Utils.php
index 1cd9e7429..f7389f0d7 100644
--- a/tests/Framework/Utils.php
+++ b/tests/Framework/Utils.php
@@ -344,6 +344,25 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
     }
 
     /**
+     * rcube:utils::tokenize_string()
+     */
+    function test_tokenize_string()
+    {
+        $test = array(
+            '' => array(),
+            'abc d' => array('abc'),
+            'abc de' => array('abc','de'),
+            'äàé;êöü-xyz' => array('äàé','êöü','xyz'),
+            '日期格式' => array('日期格式'),
+        );
+
+        foreach ($test as $input => $output) {
+            $result = rcube_utils::tokenize_string($input);
+            $this->assertSame($output, $result);
+        }
+    }
+
+    /**
      * rcube:utils::normalize_string()
      */
     function test_normalize_string()
@@ -353,15 +372,18 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
             'abc def' => 'abc def',
             'ÇçäâàåæéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy',
             'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżýĄŚŻŹĆ' => 'aaaaccceeeeiilllnnooorrsssttuuuuzzzyaszzc',
-            'ß' => 'ss',
-            'ae' => 'a',
-            'oe' => 'o',
-            'ue' => 'u',
+            'ß' => '',
+            'ßs' => 'sss',
+            'Xae' => 'xa',
+            'Xoe' => 'xo',
+            'Xue' => 'xu',
+            '项目' => '项目',
+            '日' => '', // FIXME: this should not be stripped although minlen = 2
         );
 
         foreach ($test as $input => $output) {
             $result = rcube_utils::normalize_string($input);
-            $this->assertSame($output, $result);
+            $this->assertSame($output, $result, "Error normalizing '$input'");
         }
     }
 
```