summaryrefslogtreecommitdiff
path: root/program/lib/Roundcube/rcube_spellchecker.php
diff options
context:
space:
mode:
authorAleksander Machniak <alec@alec.pl>2012-11-21 19:52:03 +0100
committerAleksander Machniak <alec@alec.pl>2012-11-21 19:52:03 +0100
commitba6f21caeb405c7e8512a09941fefbc97286e45f (patch)
tree4a0e8f6fbab3260d37bf85cbf0bc9f506e627678 /program/lib/Roundcube/rcube_spellchecker.php
parentf707fec0001d7dc7d46be114c42b37e49a052660 (diff)
Framework files moved to lib/Roundcube
Diffstat (limited to 'program/lib/Roundcube/rcube_spellchecker.php')
-rw-r--r--program/lib/Roundcube/rcube_spellchecker.php621
1 files changed, 621 insertions, 0 deletions
diff --git a/program/lib/Roundcube/rcube_spellchecker.php b/program/lib/Roundcube/rcube_spellchecker.php
new file mode 100644
index 000000000..30d15d721
--- /dev/null
+++ b/program/lib/Roundcube/rcube_spellchecker.php
@@ -0,0 +1,621 @@
+<?php
+
+/*
+ +-----------------------------------------------------------------------+
+ | program/include/rcube_spellchecker.php |
+ | |
+ | This file is part of the Roundcube Webmail client |
+ | Copyright (C) 2011, Kolab Systems AG |
+ | Copyright (C) 2008-2011, The Roundcube Dev Team |
+ | |
+ | Licensed under the GNU General Public License version 3 or |
+ | any later version with exceptions for skins & plugins. |
+ | See the README file for a full license statement. |
+ | |
+ | PURPOSE: |
+ | Spellchecking using different backends |
+ | |
+ +-----------------------------------------------------------------------+
+ | Author: Aleksander Machniak <machniak@kolabsys.com> |
+ | Author: Thomas Bruederli <roundcube@gmail.com> |
+ +-----------------------------------------------------------------------+
+*/
+
+
+/**
+ * Helper class for spellchecking with Googielspell and PSpell support.
+ *
+ * @package Framework
+ * @subpackage Utils
+ */
+class rcube_spellchecker
+{
+ private $matches = array();
+ private $engine;
+ private $lang;
+ private $rc;
+ private $error;
+ private $separator = '/[\s\r\n\t\(\)\/\[\]{}<>\\"]+|[:;?!,\.]([^\w]|$)/';
+ private $options = array();
+ private $dict;
+ private $have_dict;
+
+
+ // default settings
+ const GOOGLE_HOST = 'ssl://www.google.com';
+ const GOOGLE_PORT = 443;
+ const MAX_SUGGESTIONS = 10;
+
+
+ /**
+ * Constructor
+ *
+ * @param string $lang Language code
+ */
+ function __construct($lang = 'en')
+ {
+ $this->rc = rcube::get_instance();
+ $this->engine = $this->rc->config->get('spellcheck_engine', 'googie');
+ $this->lang = $lang ? $lang : 'en';
+
+ $this->options = array(
+ 'ignore_syms' => $this->rc->config->get('spellcheck_ignore_syms'),
+ 'ignore_nums' => $this->rc->config->get('spellcheck_ignore_nums'),
+ 'ignore_caps' => $this->rc->config->get('spellcheck_ignore_caps'),
+ 'dictionary' => $this->rc->config->get('spellcheck_dictionary'),
+ );
+ }
+
+
+ /**
+ * Set content and check spelling
+ *
+ * @param string $text Text content for spellchecking
+ * @param bool $is_html Enables HTML-to-Text conversion
+ *
+ * @return bool True when no mispelling found, otherwise false
+ */
+ function check($text, $is_html = false)
+ {
+ // convert to plain text
+ if ($is_html) {
+ $this->content = $this->html2text($text);
+ }
+ else {
+ $this->content = $text;
+ }
+
+ if ($this->engine == 'pspell') {
+ $this->matches = $this->_pspell_check($this->content);
+ }
+ else {
+ $this->matches = $this->_googie_check($this->content);
+ }
+
+ return $this->found() == 0;
+ }
+
+
+ /**
+ * Number of mispellings found (after check)
+ *
+ * @return int Number of mispellings
+ */
+ function found()
+ {
+ return count($this->matches);
+ }
+
+
+ /**
+ * Returns suggestions for the specified word
+ *
+ * @param string $word The word
+ *
+ * @return array Suggestions list
+ */
+ function get_suggestions($word)
+ {
+ if ($this->engine == 'pspell') {
+ return $this->_pspell_suggestions($word);
+ }
+
+ return $this->_googie_suggestions($word);
+ }
+
+
+ /**
+ * Returns misspelled words
+ *
+ * @param string $text The content for spellchecking. If empty content
+ * used for check() method will be used.
+ *
+ * @return array List of misspelled words
+ */
+ function get_words($text = null, $is_html=false)
+ {
+ if ($this->engine == 'pspell') {
+ return $this->_pspell_words($text, $is_html);
+ }
+
+ return $this->_googie_words($text, $is_html);
+ }
+
+
+ /**
+ * Returns checking result in XML (Googiespell) format
+ *
+ * @return string XML content
+ */
+ function get_xml()
+ {
+ // send output
+ $out = '<?xml version="1.0" encoding="'.RCMAIL_CHARSET.'"?><spellresult charschecked="'.mb_strlen($this->content).'">';
+
+ foreach ($this->matches as $item) {
+ $out .= '<c o="'.$item[1].'" l="'.$item[2].'">';
+ $out .= is_array($item[4]) ? implode("\t", $item[4]) : $item[4];
+ $out .= '</c>';
+ }
+
+ $out .= '</spellresult>';
+
+ return $out;
+ }
+
+
+ /**
+ * Returns checking result (misspelled words with suggestions)
+ *
+ * @return array Spellchecking result. An array indexed by word.
+ */
+ function get()
+ {
+ $result = array();
+
+ foreach ($this->matches as $item) {
+ if ($this->engine == 'pspell') {
+ $word = $item[0];
+ }
+ else {
+ $word = mb_substr($this->content, $item[1], $item[2], RCMAIL_CHARSET);
+ }
+ $result[$word] = is_array($item[4]) ? implode("\t", $item[4]) : $item[4];
+ }
+
+ return $result;
+ }
+
+
+ /**
+ * Returns error message
+ *
+ * @return string Error message
+ */
+ function error()
+ {
+ return $this->error;
+ }
+
+
+ /**
+ * Checks the text using pspell
+ *
+ * @param string $text Text content for spellchecking
+ */
+ private function _pspell_check($text)
+ {
+ // init spellchecker
+ $this->_pspell_init();
+
+ if (!$this->plink) {
+ return array();
+ }
+
+ // tokenize
+ $text = preg_split($this->separator, $text, NULL, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);
+
+ $diff = 0;
+ $matches = array();
+
+ foreach ($text as $w) {
+ $word = trim($w[0]);
+ $pos = $w[1] - $diff;
+ $len = mb_strlen($word);
+
+ // skip exceptions
+ if ($this->is_exception($word)) {
+ }
+ else if (!pspell_check($this->plink, $word)) {
+ $suggestions = pspell_suggest($this->plink, $word);
+
+ if (sizeof($suggestions) > self::MAX_SUGGESTIONS) {
+ $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
+ }
+
+ $matches[] = array($word, $pos, $len, null, $suggestions);
+ }
+
+ $diff += (strlen($word) - $len);
+ }
+
+ return $matches;
+ }
+
+
+ /**
+ * Returns the misspelled words
+ */
+ private function _pspell_words($text = null, $is_html=false)
+ {
+ $result = array();
+
+ if ($text) {
+ // init spellchecker
+ $this->_pspell_init();
+
+ if (!$this->plink) {
+ return array();
+ }
+
+ // With PSpell we don't need to get suggestions to return misspelled words
+ if ($is_html) {
+ $text = $this->html2text($text);
+ }
+
+ $text = preg_split($this->separator, $text, NULL, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);
+
+ foreach ($text as $w) {
+ $word = trim($w[0]);
+
+ // skip exceptions
+ if ($this->is_exception($word)) {
+ continue;
+ }
+
+ if (!pspell_check($this->plink, $word)) {
+ $result[] = $word;
+ }
+ }
+
+ return $result;
+ }
+
+ foreach ($this->matches as $m) {
+ $result[] = $m[0];
+ }
+
+ return $result;
+ }
+
+
+ /**
+ * Returns suggestions for misspelled word
+ */
+ private function _pspell_suggestions($word)
+ {
+ // init spellchecker
+ $this->_pspell_init();
+
+ if (!$this->plink) {
+ return array();
+ }
+
+ $suggestions = pspell_suggest($this->plink, $word);
+
+ if (sizeof($suggestions) > self::MAX_SUGGESTIONS)
+ $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
+
+ return is_array($suggestions) ? $suggestions : array();
+ }
+
+
+ /**
+ * Initializes PSpell dictionary
+ */
+ private function _pspell_init()
+ {
+ if (!$this->plink) {
+ if (!extension_loaded('pspell')) {
+ $this->error = "Pspell extension not available";
+ rcube::raise_error(array(
+ 'code' => 500, 'type' => 'php',
+ 'file' => __FILE__, 'line' => __LINE__,
+ 'message' => $this->error), true, false);
+
+ return;
+ }
+
+ $this->plink = pspell_new($this->lang, null, null, RCMAIL_CHARSET, PSPELL_FAST);
+ }
+
+ if (!$this->plink) {
+ $this->error = "Unable to load Pspell engine for selected language";
+ }
+ }
+
+
+ private function _googie_check($text)
+ {
+ // spell check uri is configured
+ $url = $this->rc->config->get('spellcheck_uri');
+
+ if ($url) {
+ $a_uri = parse_url($url);
+ $ssl = ($a_uri['scheme'] == 'https' || $a_uri['scheme'] == 'ssl');
+ $port = $a_uri['port'] ? $a_uri['port'] : ($ssl ? 443 : 80);
+ $host = ($ssl ? 'ssl://' : '') . $a_uri['host'];
+ $path = $a_uri['path'] . ($a_uri['query'] ? '?'.$a_uri['query'] : '') . $this->lang;
+ }
+ else {
+ $host = self::GOOGLE_HOST;
+ $port = self::GOOGLE_PORT;
+ $path = '/tbproxy/spell?lang=' . $this->lang;
+ }
+
+ // Google has some problem with spaces, use \n instead
+ $gtext = str_replace(' ', "\n", $text);
+
+ $gtext = '<?xml version="1.0" encoding="utf-8" ?>'
+ .'<spellrequest textalreadyclipped="0" ignoredups="0" ignoredigits="1" ignoreallcaps="1">'
+ .'<text>' . $gtext . '</text>'
+ .'</spellrequest>';
+
+ $store = '';
+ if ($fp = fsockopen($host, $port, $errno, $errstr, 30)) {
+ $out = "POST $path HTTP/1.0\r\n";
+ $out .= "Host: " . str_replace('ssl://', '', $host) . "\r\n";
+ $out .= "Content-Length: " . strlen($gtext) . "\r\n";
+ $out .= "Content-Type: application/x-www-form-urlencoded\r\n";
+ $out .= "Connection: Close\r\n\r\n";
+ $out .= $gtext;
+ fwrite($fp, $out);
+
+ while (!feof($fp))
+ $store .= fgets($fp, 128);
+ fclose($fp);
+ }
+
+ if (!$store) {
+ $this->error = "Empty result from spelling engine";
+ }
+
+ preg_match_all('/<c o="([^"]*)" l="([^"]*)" s="([^"]*)">([^<]*)<\/c>/', $store, $matches, PREG_SET_ORDER);
+
+ // skip exceptions (if appropriate options are enabled)
+ if (!empty($this->options['ignore_syms']) || !empty($this->options['ignore_nums'])
+ || !empty($this->options['ignore_caps']) || !empty($this->options['dictionary'])
+ ) {
+ foreach ($matches as $idx => $m) {
+ $word = mb_substr($text, $m[1], $m[2], RCMAIL_CHARSET);
+ // skip exceptions
+ if ($this->is_exception($word)) {
+ unset($matches[$idx]);
+ }
+ }
+ }
+
+ return $matches;
+ }
+
+
+ private function _googie_words($text = null, $is_html=false)
+ {
+ if ($text) {
+ if ($is_html) {
+ $text = $this->html2text($text);
+ }
+
+ $matches = $this->_googie_check($text);
+ }
+ else {
+ $matches = $this->matches;
+ $text = $this->content;
+ }
+
+ $result = array();
+
+ foreach ($matches as $m) {
+ $result[] = mb_substr($text, $m[1], $m[2], RCMAIL_CHARSET);
+ }
+
+ return $result;
+ }
+
+
+ private function _googie_suggestions($word)
+ {
+ if ($word) {
+ $matches = $this->_googie_check($word);
+ }
+ else {
+ $matches = $this->matches;
+ }
+
+ if ($matches[0][4]) {
+ $suggestions = explode("\t", $matches[0][4]);
+ if (sizeof($suggestions) > self::MAX_SUGGESTIONS) {
+ $suggestions = array_slice($suggestions, 0, MAX_SUGGESTIONS);
+ }
+
+ return $suggestions;
+ }
+
+ return array();
+ }
+
+
+ private function html2text($text)
+ {
+ $h2t = new html2text($text, false, true, 0);
+ return $h2t->get_text();
+ }
+
+
+ /**
+ * Check if the specified word is an exception accoring to
+ * spellcheck options.
+ *
+ * @param string $word The word
+ *
+ * @return bool True if the word is an exception, False otherwise
+ */
+ public function is_exception($word)
+ {
+ // Contain only symbols (e.g. "+9,0", "2:2")
+ if (!$word || preg_match('/^[0-9@#$%^&_+~*=:;?!,.-]+$/', $word))
+ return true;
+
+ // Contain symbols (e.g. "g@@gle"), all symbols excluding separators
+ if (!empty($this->options['ignore_syms']) && preg_match('/[@#$%^&_+~*=-]/', $word))
+ return true;
+
+ // Contain numbers (e.g. "g00g13")
+ if (!empty($this->options['ignore_nums']) && preg_match('/[0-9]/', $word))
+ return true;
+
+ // Blocked caps (e.g. "GOOGLE")
+ if (!empty($this->options['ignore_caps']) && $word == mb_strtoupper($word))
+ return true;
+
+ // Use exceptions from dictionary
+ if (!empty($this->options['dictionary'])) {
+ $this->load_dict();
+
+ // @TODO: should dictionary be case-insensitive?
+ if (!empty($this->dict) && in_array($word, $this->dict))
+ return true;
+ }
+
+ return false;
+ }
+
+
+ /**
+ * Add a word to dictionary
+ *
+ * @param string $word The word to add
+ */
+ public function add_word($word)
+ {
+ $this->load_dict();
+
+ foreach (explode(' ', $word) as $word) {
+ // sanity check
+ if (strlen($word) < 512) {
+ $this->dict[] = $word;
+ $valid = true;
+ }
+ }
+
+ if ($valid) {
+ $this->dict = array_unique($this->dict);
+ $this->update_dict();
+ }
+ }
+
+
+ /**
+ * Remove a word from dictionary
+ *
+ * @param string $word The word to remove
+ */
+ public function remove_word($word)
+ {
+ $this->load_dict();
+
+ if (($key = array_search($word, $this->dict)) !== false) {
+ unset($this->dict[$key]);
+ $this->update_dict();
+ }
+ }
+
+
+ /**
+ * Update dictionary row in DB
+ */
+ private function update_dict()
+ {
+ if (strcasecmp($this->options['dictionary'], 'shared') != 0) {
+ $userid = $this->rc->get_user_id();
+ }
+
+ $plugin = $this->rc->plugins->exec_hook('spell_dictionary_save', array(
+ 'userid' => $userid, 'language' => $this->lang, 'dictionary' => $this->dict));
+
+ if (!empty($plugin['abort'])) {
+ return;
+ }
+
+ if ($this->have_dict) {
+ if (!empty($this->dict)) {
+ $this->rc->db->query(
+ "UPDATE ".$this->rc->db->table_name('dictionary')
+ ." SET data = ?"
+ ." WHERE user_id " . ($plugin['userid'] ? "= ".$this->rc->db->quote($plugin['userid']) : "IS NULL")
+ ." AND " . $this->rc->db->quoteIdentifier('language') . " = ?",
+ implode(' ', $plugin['dictionary']), $plugin['language']);
+ }
+ // don't store empty dict
+ else {
+ $this->rc->db->query(
+ "DELETE FROM " . $this->rc->db->table_name('dictionary')
+ ." WHERE user_id " . ($plugin['userid'] ? "= ".$this->rc->db->quote($plugin['userid']) : "IS NULL")
+ ." AND " . $this->rc->db->quoteIdentifier('language') . " = ?",
+ $plugin['language']);
+ }
+ }
+ else if (!empty($this->dict)) {
+ $this->rc->db->query(
+ "INSERT INTO " .$this->rc->db->table_name('dictionary')
+ ." (user_id, " . $this->rc->db->quoteIdentifier('language') . ", data) VALUES (?, ?, ?)",
+ $plugin['userid'], $plugin['language'], implode(' ', $plugin['dictionary']));
+ }
+ }
+
+
+ /**
+ * Get dictionary from DB
+ */
+ private function load_dict()
+ {
+ if (is_array($this->dict)) {
+ return $this->dict;
+ }
+
+ if (strcasecmp($this->options['dictionary'], 'shared') != 0) {
+ $userid = $this->rc->get_user_id();
+ }
+
+ $plugin = $this->rc->plugins->exec_hook('spell_dictionary_get', array(
+ 'userid' => $userid, 'language' => $this->lang, 'dictionary' => array()));
+
+ if (empty($plugin['abort'])) {
+ $dict = array();
+ $this->rc->db->query(
+ "SELECT data FROM ".$this->rc->db->table_name('dictionary')
+ ." WHERE user_id ". ($plugin['userid'] ? "= ".$this->rc->db->quote($plugin['userid']) : "IS NULL")
+ ." AND " . $this->rc->db->quoteIdentifier('language') . " = ?",
+ $plugin['language']);
+
+ if ($sql_arr = $this->rc->db->fetch_assoc($sql_result)) {
+ $this->have_dict = true;
+ if (!empty($sql_arr['data'])) {
+ $dict = explode(' ', $sql_arr['data']);
+ }
+ }
+
+ $plugin['dictionary'] = array_merge((array)$plugin['dictionary'], $dict);
+ }
+
+ if (!empty($plugin['dictionary']) && is_array($plugin['dictionary'])) {
+ $this->dict = $plugin['dictionary'];
+ }
+ else {
+ $this->dict = array();
+ }
+
+ return $this->dict;
+ }
+
+}