diff options
author | Aleksander Machniak <alec@alec.pl> | 2014-05-27 11:39:31 +0200 |
---|---|---|
committer | Aleksander Machniak <alec@alec.pl> | 2014-05-27 14:59:03 +0200 |
commit | dbd5c184714a14b9944d4f4654fca6c94b81d0de (patch) | |
tree | 38fea99b8069eec0356e5723b55e8ae1bb285047 | |
parent | 4f3f85da39b1ec0ad092a1f7e6a27277588cc689 (diff) |
Fix broken normalize_string(), add support for ISO-8859-2
-rw-r--r-- | program/lib/Roundcube/rcube_utils.php | 34 | ||||
-rw-r--r-- | tests/Framework/Utils.php | 3 |
2 files changed, 27 insertions, 10 deletions
```diff
diff --git a/program/lib/Roundcube/rcube_utils.php b/program/lib/Roundcube/rcube_utils.php
index c2009cee0..00999ba50 100644
--- a/program/lib/Roundcube/rcube_utils.php
+++ b/program/lib/Roundcube/rcube_utils.php
@@ -928,7 +928,7 @@ class rcube_utils
 
     /**
      * Normalize the given string for fulltext search.
-     * Currently only optimized for Latin-1 characters; to be extended
+     * Currently only optimized for ISO-8859-1 and ISO-8859-2 characters; to be extended
      *
      * @param string Input string (UTF-8)
      * @param boolean True to return list of words as array
@@ -949,15 +949,32 @@ class rcube_utils
         // split by words
         $arr = self::tokenize_string($str);
 
+        // detect character set
+        if (utf8_encode(utf8_decode($str)) == $str) {
+            // ISO-8859-1 (or ASCII)
+            preg_match_all('/./u', 'äâàåáãæçéêëèïîìíñöôòøõóüûùúýÿ', $keys);
+            preg_match_all('/./', 'aaaaaaaceeeeiiiinoooooouuuuyy', $values);
+
+            $mapping = array_combine($keys[0], $values[0]);
+            $mapping = array_merge($mapping, array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u'));
+        }
+        else if (rcube_charset::convert(rcube_charset::convert($str, 'UTF-8', 'ISO-8859-2'), 'ISO-8859-2', 'UTF-8') == $str) {
+            // ISO-8859-2
+            preg_match_all('/./u', 'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżý', $keys);
+            preg_match_all('/./', 'aaaaccceeeeiilllnnooorrsssttuuuuzzzy', $values);
+
+            $mapping = array_combine($keys[0], $values[0]);
+            $mapping = array_merge($mapping, array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u'));
+        }
+
         foreach ($arr as $i => $part) {
-            if (utf8_encode(utf8_decode($part)) == $part) { // is latin-1 ?
-                $arr[$i] = utf8_encode(strtr(strtolower(strtr(utf8_decode($part),
-                    'ÇçäâàåéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ',
-                    'ccaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy')),
-                    array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u')));
+            $part = mb_strtolower($part);
+
+            if (!empty($mapping)) {
+                $part = strtr($part, $mapping);
             }
-            else
-                $arr[$i] = mb_strtolower($part);
+
+            $arr[$i] = $part;
         }
 
         return $as_array ? $arr : join(" ", $arr);
@@ -1039,7 +1056,6 @@ class rcube_utils
         }
     }
 
-
     /**
      * Find out if the string content means true or false
      *
diff --git a/tests/Framework/Utils.php b/tests/Framework/Utils.php
index 82b8ebb73..560a8bde7 100644
--- a/tests/Framework/Utils.php
+++ b/tests/Framework/Utils.php
@@ -327,7 +327,8 @@ class Framework_Utils extends PHPUnit_Framework_TestCase
         $test = array(
             '' => '',
             'abc def' => 'abc def',
-            'ÇçäâàåéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy',
+            'ÇçäâàåæéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ' => 'ccaaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy',
+            'ąáâäćçčéęëěíîłľĺńňóôöŕřśšşťţůúűüźžżýĄŚŻŹĆ' => 'aaaaccceeeeiilllnnooorrsssttuuuuzzzyaszzc',
             'ß' => 'ss',
             'ae' => 'a',
             'oe' => 'o',
```