diff options
author | Aleksander Machniak <alec@alec.pl> | 2012-11-21 19:52:03 +0100 |
---|---|---|
committer | Aleksander Machniak <alec@alec.pl> | 2012-11-21 19:52:03 +0100 |
commit | ba6f21caeb405c7e8512a09941fefbc97286e45f (patch) | |
tree | 4a0e8f6fbab3260d37bf85cbf0bc9f506e627678 /program/lib/Roundcube/rcube_spellchecker.php | |
parent | f707fec0001d7dc7d46be114c42b37e49a052660 (diff) |
Framework files moved to lib/Roundcube
Diffstat (limited to 'program/lib/Roundcube/rcube_spellchecker.php')
-rw-r--r-- | program/lib/Roundcube/rcube_spellchecker.php | 621 |
1 files changed, 621 insertions, 0 deletions
diff --git a/program/lib/Roundcube/rcube_spellchecker.php b/program/lib/Roundcube/rcube_spellchecker.php new file mode 100644 index 000000000..30d15d721 --- /dev/null +++ b/program/lib/Roundcube/rcube_spellchecker.php @@ -0,0 +1,621 @@ +<?php + +/* + +-----------------------------------------------------------------------+ + | program/include/rcube_spellchecker.php | + | | + | This file is part of the Roundcube Webmail client | + | Copyright (C) 2011, Kolab Systems AG | + | Copyright (C) 2008-2011, The Roundcube Dev Team | + | | + | Licensed under the GNU General Public License version 3 or | + | any later version with exceptions for skins & plugins. | + | See the README file for a full license statement. | + | | + | PURPOSE: | + | Spellchecking using different backends | + | | + +-----------------------------------------------------------------------+ + | Author: Aleksander Machniak <machniak@kolabsys.com> | + | Author: Thomas Bruederli <roundcube@gmail.com> | + +-----------------------------------------------------------------------+ +*/ + + +/** + * Helper class for spellchecking with Googielspell and PSpell support. + * + * @package Framework + * @subpackage Utils + */ +class rcube_spellchecker +{ + private $matches = array(); + private $engine; + private $lang; + private $rc; + private $error; + private $separator = '/[\s\r\n\t\(\)\/\[\]{}<>\\"]+|[:;?!,\.]([^\w]|$)/'; + private $options = array(); + private $dict; + private $have_dict; + + + // default settings + const GOOGLE_HOST = 'ssl://www.google.com'; + const GOOGLE_PORT = 443; + const MAX_SUGGESTIONS = 10; + + + /** + * Constructor + * + * @param string $lang Language code + */ + function __construct($lang = 'en') + { + $this->rc = rcube::get_instance(); + $this->engine = $this->rc->config->get('spellcheck_engine', 'googie'); + $this->lang = $lang ? $lang : 'en'; + + $this->options = array( + 'ignore_syms' => $this->rc->config->get('spellcheck_ignore_syms'), + 'ignore_nums' => $this->rc->config->get('spellcheck_ignore_nums'), + 'ignore_caps' => $this->rc->config->get('spellcheck_ignore_caps'), + 'dictionary' => $this->rc->config->get('spellcheck_dictionary'), + ); + } + + + /** + * Set content and check spelling + * + * @param string $text Text content for spellchecking + * @param bool $is_html Enables HTML-to-Text conversion + * + * @return bool True when no mispelling found, otherwise false + */ + function check($text, $is_html = false) + { + // convert to plain text + if ($is_html) { + $this->content = $this->html2text($text); + } + else { + $this->content = $text; + } + + if ($this->engine == 'pspell') { + $this->matches = $this->_pspell_check($this->content); + } + else { + $this->matches = $this->_googie_check($this->content); + } + + return $this->found() == 0; + } + + + /** + * Number of mispellings found (after check) + * + * @return int Number of mispellings + */ + function found() + { + return count($this->matches); + } + + + /** + * Returns suggestions for the specified word + * + * @param string $word The word + * + * @return array Suggestions list + */ + function get_suggestions($word) + { + if ($this->engine == 'pspell') { + return $this->_pspell_suggestions($word); + } + + return $this->_googie_suggestions($word); + } + + + /** + * Returns misspelled words + * + * @param string $text The content for spellchecking. If empty content + * used for check() method will be used. + * + * @return array List of misspelled words + */ + function get_words($text = null, $is_html=false) + { + if ($this->engine == 'pspell') { + return $this->_pspell_words($text, $is_html); + } + + return $this->_googie_words($text, $is_html); + } + + + /** + * Returns checking result in XML (Googiespell) format + * + * @return string XML content + */ + function get_xml() + { + // send output + $out = '<?xml version="1.0" encoding="'.RCMAIL_CHARSET.'"?><spellresult charschecked="'.mb_strlen($this->content).'">'; + + foreach ($this->matches as $item) { + $out .= '<c o="'.$item[1].'" l="'.$item[2].'">'; + $out .= is_array($item[4]) ? implode("\t", $item[4]) : $item[4]; + $out .= '</c>'; + } + + $out .= '</spellresult>'; + + return $out; + } + + + /** + * Returns checking result (misspelled words with suggestions) + * + * @return array Spellchecking result. An array indexed by word. + */ + function get() + { + $result = array(); + + foreach ($this->matches as $item) { + if ($this->engine == 'pspell') { + $word = $item[0]; + } + else { + $word = mb_substr($this->content, $item[1], $item[2], RCMAIL_CHARSET); + } + $result[$word] = is_array($item[4]) ? implode("\t", $item[4]) : $item[4]; + } + + return $result; + } + + + /** + * Returns error message + * + * @return string Error message + */ + function error() + { + return $this->error; + } + + + /** + * Checks the text using pspell + * + * @param string $text Text content for spellchecking + */ + private function _pspell_check($text) + { + // init spellchecker + $this->_pspell_init(); + + if (!$this->plink) { + return array(); + } + + // tokenize + $text = preg_split($this->separator, $text, NULL, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE); + + $diff = 0; + $matches = array(); + + foreach ($text as $w) { + $word = trim($w[0]); + $pos = $w[1] - $diff; + $len = mb_strlen($word); + + // skip exceptions + if ($this->is_exception($word)) { + } + else if (!pspell_check($this->plink, $word)) { + $suggestions = pspell_suggest($this->plink, $word); + + if (sizeof($suggestions) > self::MAX_SUGGESTIONS) { + $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS); + } + + $matches[] = array($word, $pos, $len, null, $suggestions); + } + + $diff += (strlen($word) - $len); + } + + return $matches; + } + + + /** + * Returns the misspelled words + */ + private function _pspell_words($text = null, $is_html=false) + { + $result = array(); + + if ($text) { + // init spellchecker + $this->_pspell_init(); + + if (!$this->plink) { + return array(); + } + + // With PSpell we don't need to get suggestions to return misspelled words + if ($is_html) { + $text = $this->html2text($text); + } + + $text = preg_split($this->separator, $text, NULL, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE); + + foreach ($text as $w) { + $word = trim($w[0]); + + // skip exceptions + if ($this->is_exception($word)) { + continue; + } + + if (!pspell_check($this->plink, $word)) { + $result[] = $word; + } + } + + return $result; + } + + foreach ($this->matches as $m) { + $result[] = $m[0]; + } + + return $result; + } + + + /** + * Returns suggestions for misspelled word + */ + private function _pspell_suggestions($word) + { + // init spellchecker + $this->_pspell_init(); + + if (!$this->plink) { + return array(); + } + + $suggestions = pspell_suggest($this->plink, $word); + + if (sizeof($suggestions) > self::MAX_SUGGESTIONS) + $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS); + + return is_array($suggestions) ? $suggestions : array(); + } + + + /** + * Initializes PSpell dictionary + */ + private function _pspell_init() + { + if (!$this->plink) { + if (!extension_loaded('pspell')) { + $this->error = "Pspell extension not available"; + rcube::raise_error(array( + 'code' => 500, 'type' => 'php', + 'file' => __FILE__, 'line' => __LINE__, + 'message' => $this->error), true, false); + + return; + } + + $this->plink = pspell_new($this->lang, null, null, RCMAIL_CHARSET, PSPELL_FAST); + } + + if (!$this->plink) { + $this->error = "Unable to load Pspell engine for selected language"; + } + } + + + private function _googie_check($text) + { + // spell check uri is configured + $url = $this->rc->config->get('spellcheck_uri'); + + if ($url) { + $a_uri = parse_url($url); + $ssl = ($a_uri['scheme'] == 'https' || $a_uri['scheme'] == 'ssl'); + $port = $a_uri['port'] ? $a_uri['port'] : ($ssl ? 443 : 80); + $host = ($ssl ? 'ssl://' : '') . $a_uri['host']; + $path = $a_uri['path'] . ($a_uri['query'] ? '?'.$a_uri['query'] : '') . $this->lang; + } + else { + $host = self::GOOGLE_HOST; + $port = self::GOOGLE_PORT; + $path = '/tbproxy/spell?lang=' . $this->lang; + } + + // Google has some problem with spaces, use \n instead + $gtext = str_replace(' ', "\n", $text); + + $gtext = '<?xml version="1.0" encoding="utf-8" ?>' + .'<spellrequest textalreadyclipped="0" ignoredups="0" ignoredigits="1" ignoreallcaps="1">' + .'<text>' . $gtext . '</text>' + .'</spellrequest>'; + + $store = ''; + if ($fp = fsockopen($host, $port, $errno, $errstr, 30)) { + $out = "POST $path HTTP/1.0\r\n"; + $out .= "Host: " . str_replace('ssl://', '', $host) . "\r\n"; + $out .= "Content-Length: " . strlen($gtext) . "\r\n"; + $out .= "Content-Type: application/x-www-form-urlencoded\r\n"; + $out .= "Connection: Close\r\n\r\n"; + $out .= $gtext; + fwrite($fp, $out); + + while (!feof($fp)) + $store .= fgets($fp, 128); + fclose($fp); + } + + if (!$store) { + $this->error = "Empty result from spelling engine"; + } + + preg_match_all('/<c o="([^"]*)" l="([^"]*)" s="([^"]*)">([^<]*)<\/c>/', $store, $matches, PREG_SET_ORDER); + + // skip exceptions (if appropriate options are enabled) + if (!empty($this->options['ignore_syms']) || !empty($this->options['ignore_nums']) + || !empty($this->options['ignore_caps']) || !empty($this->options['dictionary']) + ) { + foreach ($matches as $idx => $m) { + $word = mb_substr($text, $m[1], $m[2], RCMAIL_CHARSET); + // skip exceptions + if ($this->is_exception($word)) { + unset($matches[$idx]); + } + } + } + + return $matches; + } + + + private function _googie_words($text = null, $is_html=false) + { + if ($text) { + if ($is_html) { + $text = $this->html2text($text); + } + + $matches = $this->_googie_check($text); + } + else { + $matches = $this->matches; + $text = $this->content; + } + + $result = array(); + + foreach ($matches as $m) { + $result[] = mb_substr($text, $m[1], $m[2], RCMAIL_CHARSET); + } + + return $result; + } + + + private function _googie_suggestions($word) + { + if ($word) { + $matches = $this->_googie_check($word); + } + else { + $matches = $this->matches; + } + + if ($matches[0][4]) { + $suggestions = explode("\t", $matches[0][4]); + if (sizeof($suggestions) > self::MAX_SUGGESTIONS) { + $suggestions = array_slice($suggestions, 0, MAX_SUGGESTIONS); + } + + return $suggestions; + } + + return array(); + } + + + private function html2text($text) + { + $h2t = new html2text($text, false, true, 0); + return $h2t->get_text(); + } + + + /** + * Check if the specified word is an exception accoring to + * spellcheck options. + * + * @param string $word The word + * + * @return bool True if the word is an exception, False otherwise + */ + public function is_exception($word) + { + // Contain only symbols (e.g. "+9,0", "2:2") + if (!$word || preg_match('/^[0-9@#$%^&_+~*=:;?!,.-]+$/', $word)) + return true; + + // Contain symbols (e.g. "g@@gle"), all symbols excluding separators + if (!empty($this->options['ignore_syms']) && preg_match('/[@#$%^&_+~*=-]/', $word)) + return true; + + // Contain numbers (e.g. "g00g13") + if (!empty($this->options['ignore_nums']) && preg_match('/[0-9]/', $word)) + return true; + + // Blocked caps (e.g. "GOOGLE") + if (!empty($this->options['ignore_caps']) && $word == mb_strtoupper($word)) + return true; + + // Use exceptions from dictionary + if (!empty($this->options['dictionary'])) { + $this->load_dict(); + + // @TODO: should dictionary be case-insensitive? + if (!empty($this->dict) && in_array($word, $this->dict)) + return true; + } + + return false; + } + + + /** + * Add a word to dictionary + * + * @param string $word The word to add + */ + public function add_word($word) + { + $this->load_dict(); + + foreach (explode(' ', $word) as $word) { + // sanity check + if (strlen($word) < 512) { + $this->dict[] = $word; + $valid = true; + } + } + + if ($valid) { + $this->dict = array_unique($this->dict); + $this->update_dict(); + } + } + + + /** + * Remove a word from dictionary + * + * @param string $word The word to remove + */ + public function remove_word($word) + { + $this->load_dict(); + + if (($key = array_search($word, $this->dict)) !== false) { + unset($this->dict[$key]); + $this->update_dict(); + } + } + + + /** + * Update dictionary row in DB + */ + private function update_dict() + { + if (strcasecmp($this->options['dictionary'], 'shared') != 0) { + $userid = $this->rc->get_user_id(); + } + + $plugin = $this->rc->plugins->exec_hook('spell_dictionary_save', array( + 'userid' => $userid, 'language' => $this->lang, 'dictionary' => $this->dict)); + + if (!empty($plugin['abort'])) { + return; + } + + if ($this->have_dict) { + if (!empty($this->dict)) { + $this->rc->db->query( + "UPDATE ".$this->rc->db->table_name('dictionary') + ." SET data = ?" + ." WHERE user_id " . ($plugin['userid'] ? "= ".$this->rc->db->quote($plugin['userid']) : "IS NULL") + ." AND " . $this->rc->db->quoteIdentifier('language') . " = ?", + implode(' ', $plugin['dictionary']), $plugin['language']); + } + // don't store empty dict + else { + $this->rc->db->query( + "DELETE FROM " . $this->rc->db->table_name('dictionary') + ." WHERE user_id " . ($plugin['userid'] ? "= ".$this->rc->db->quote($plugin['userid']) : "IS NULL") + ." AND " . $this->rc->db->quoteIdentifier('language') . " = ?", + $plugin['language']); + } + } + else if (!empty($this->dict)) { + $this->rc->db->query( + "INSERT INTO " .$this->rc->db->table_name('dictionary') + ." (user_id, " . $this->rc->db->quoteIdentifier('language') . ", data) VALUES (?, ?, ?)", + $plugin['userid'], $plugin['language'], implode(' ', $plugin['dictionary'])); + } + } + + + /** + * Get dictionary from DB + */ + private function load_dict() + { + if (is_array($this->dict)) { + return $this->dict; + } + + if (strcasecmp($this->options['dictionary'], 'shared') != 0) { + $userid = $this->rc->get_user_id(); + } + + $plugin = $this->rc->plugins->exec_hook('spell_dictionary_get', array( + 'userid' => $userid, 'language' => $this->lang, 'dictionary' => array())); + + if (empty($plugin['abort'])) { + $dict = array(); + $this->rc->db->query( + "SELECT data FROM ".$this->rc->db->table_name('dictionary') + ." WHERE user_id ". ($plugin['userid'] ? "= ".$this->rc->db->quote($plugin['userid']) : "IS NULL") + ." AND " . $this->rc->db->quoteIdentifier('language') . " = ?", + $plugin['language']); + + if ($sql_arr = $this->rc->db->fetch_assoc($sql_result)) { + $this->have_dict = true; + if (!empty($sql_arr['data'])) { + $dict = explode(' ', $sql_arr['data']); + } + } + + $plugin['dictionary'] = array_merge((array)$plugin['dictionary'], $dict); + } + + if (!empty($plugin['dictionary']) && is_array($plugin['dictionary'])) { + $this->dict = $plugin['dictionary']; + } + else { + $this->dict = array(); + } + + return $this->dict; + } + +} |