diff options
| -rw-r--r-- | program/lib/Roundcube/rcube_utils.php | 22 | ||||
| -rw-r--r-- | tests/Framework/Utils.php | 32 | 
2 files changed, 42 insertions, 12 deletions
| diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php index 2e4aa323e..f4c0e90ca 100644 --- a/program/lib/Roundcube/rcube_utils.php +++ b/program/lib/Roundcube/rcube_utils.php @@ -912,14 +912,21 @@ class rcube_utils       * Split the given string into word tokens       *       * @param string Input to tokenize +     * @param integer Minimum length of a single token       * @return array List of tokens       */ -    public static function tokenize_string($str) +    public static function tokenize_string($str, $minlen = 2)      { -        return explode(" ", preg_replace( -            array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'), -            array(' ', '\\1\\2', ' '), -            $str)); +        $expr = array('/[\s;\/+-]+/ui', '/(\d)[-.\s]+(\d)/u'); +        $repl = array(' ', '\\1\\2'); + +        if ($minlen > 1) { +            $minlen--; +            $expr[] = "/(^|\s+)\w{1,$minlen}(\s+|$)/u"; +            $repl[] = ' '; +        } + +        return array_filter(explode(" ", preg_replace($expr, $repl, $str)));      }      /** @@ -928,10 +935,11 @@ class rcube_utils       *       * @param string  Input string (UTF-8)       * @param boolean True to return list of words as array +     * @param integer Minimum length of tokens       *       * @return mixed  Normalized string or a list of normalized tokens       */ -    public static function normalize_string($str, $as_array = false) +    public static function normalize_string($str, $as_array = false, $minlen = 2)      {          // replace 4-byte unicode characters with '?' character,          // these are not supported in default utf-8 charset on mysql, @@ -943,7 +951,7 @@ class rcube_utils              . ')/', '?', $str);          // split by words -        $arr = self::tokenize_string($str); +        $arr = self::tokenize_string($str, $minlen);          // detect character set          if (utf8_encode(utf8_decode($str)) == $str) { diff --git a/tests/Framework/Utils.php b/tests/Framework/Utils.php index 1cd9e7429..f7389f0d7 100644 --- a/tests/Framework/Utils.php +++ b/tests/Framework/Utils.php @@ -344,6 +344,25 @@ class Framework_Utils extends PHPUnit_Framework_TestCase      }      /** +     * rcube:utils::tokenize_string() +     */ +    function test_tokenize_string() +    { +        $test = array( +            ''        => array(), +            'abc d'   => array('abc'), +            'abc de'  => array('abc','de'), +            'äàé;êöü-xyz' => array('äàé','êöü','xyz'), +            '日期格式' => array('日期格式'), +        ); + +        foreach ($test as $input => $output) { +            $result = rcube_utils::tokenize_string($input); +            $this->assertSame($output, $result); +        } +    } + +    /**       * rcube:utils::normalize_string()       */      function test_normalize_string() @@ -353,15 +372,18 @@ class Framework_Utils extends PHPUnit_Framework_TestCase              'abc def' => 'abc def',              'ÇçäâàåæéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy',              'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżýĄŚŻŹĆ' => 'aaaaccceeeeiilllnnooorrsssttuuuuzzzyaszzc', -            'ß'  => 'ss', -            'ae' => 'a', -            'oe' => 'o', -            'ue' => 'u', +            'ß'   => '', +            'ßs'  => 'sss', +            'Xae' => 'xa', +            'Xoe' => 'xo', +            'Xue' => 'xu', +            '项目' => '项目', +            '日'   => '',  // FIXME: this should not be stripped although minlen = 2          );          foreach ($test as $input => $output) {              $result = rcube_utils::normalize_string($input); -            $this->assertSame($output, $result); +            $this->assertSame($output, $result, "Error normalizing '$input'");          }      } | 
