summaryrefslogtreecommitdiff
path: root/program/include/rcube_utils.php
diff options
context:
space:
mode:
Diffstat (limited to 'program/include/rcube_utils.php')
-rw-r--r--program/include/rcube_utils.php41
1 files changed, 41 insertions, 0 deletions
diff --git a/program/include/rcube_utils.php b/program/include/rcube_utils.php
index 5b31537fd..c6d4805c8 100644
--- a/program/include/rcube_utils.php
+++ b/program/include/rcube_utils.php
@@ -790,4 +790,45 @@ class rcube_utils
return $at ? $user . '@' . $domain : $domain;
}
+ /**
+ * Split the given string into word tokens
+ *
+ * @param string Input to tokenize
+ * @return array List of tokens
+ */
+ public static function tokenize_string($str)
+ {
+ return explode(" ", preg_replace(
+ array('/[\s;\/+-]+/i', '/(\d)[-.\s]+(\d)/', '/\s\w{1,3}\s/u'),
+ array(' ', '\\1\\2', ' '),
+ $str));
+ }
+
+ /**
+ * Normalize the given string for fulltext search.
+ * Currently only optimized for Latin-1 characters; to be extended
+ *
+ * @param string Input string (UTF-8)
+ * @param boolean True to return list of words as array
+ * @return mixed Normalized string or a list of normalized tokens
+ */
+ public static function normalize_string($str, $as_array = false)
+ {
+ // split by words
+ $arr = self::tokenize_string($str);
+
+ foreach ($arr as $i => $part) {
+ if (utf8_encode(utf8_decode($part)) == $part) { // is latin-1 ?
+ $arr[$i] = utf8_encode(strtr(strtolower(strtr(utf8_decode($part),
+ 'ÇçäâàåéêëèïîìÅÉöôòüûùÿøØáíóúñÑÁÂÀãÃÊËÈÍÎÏÓÔõÕÚÛÙýÝ',
+ 'ccaaaaeeeeiiiaeooouuuyooaiounnaaaaaeeeiiioooouuuyy')),
+ array('ß' => 'ss', 'ae' => 'a', 'oe' => 'o', 'ue' => 'u')));
+ }
+ else
+ $arr[$i] = mb_strtolower($part);
+ }
+
+ return $as_array ? $arr : join(" ", $arr);
+ }
+
}