diff options
Diffstat (limited to 'program/include/rcube_spellchecker.php')
-rw-r--r-- | program/include/rcube_spellchecker.php | 393 |
1 files changed, 393 insertions, 0 deletions
<?php

/*
 +-----------------------------------------------------------------------+
 | program/include/rcube_spellchecker.php                                |
 |                                                                       |
 | This file is part of the Roundcube Webmail client                     |
 | Copyright (C) 2011, Kolab Systems AG                                  |
 | Copyright (C) 2008-2011, The Roundcube Dev Team                       |
 | Licensed under the GNU GPL                                            |
 |                                                                       |
 | PURPOSE:                                                              |
 |   Spellchecking using different backends                              |
 |                                                                       |
 +-----------------------------------------------------------------------+
 | Author: Aleksander Machniak <machniak@kolabsys.com>                   |
 | Author: Thomas Bruederli <roundcube@gmail.com>                        |
 +-----------------------------------------------------------------------+

 $Id$

*/


/**
 * Helper class for spellchecking with Googiespell and PSpell support.
 *
 * The backend is selected by the 'spellcheck_engine' config option
 * ('pspell' uses the PHP pspell extension, anything else uses the
 * Googiespell-compatible HTTP service).
 *
 * Internally every misspelling is stored as an array indexed like a
 * Googiespell regex match:
 *   [0] word (pspell) or full matched <c> element (googie)
 *   [1] offset, [2] length, [3] unused/confidence,
 *   [4] suggestions (array for pspell, tab-separated string for googie)
 *
 * @package Core
 */
class rcube_spellchecker
{
    private $matches = array();
    private $engine;
    private $lang;
    private $rc;
    private $error;
    // Text most recently passed to check() (already converted to plain text)
    private $content = '';
    // PSpell dictionary link, created lazily by _pspell_init()
    private $plink;
    private $separator = '/[ !"#$%&()*+\\,\/\n:;<=>?@\[\]^_{|}-]+|\.[^\w]/';


    // default settings
    const GOOGLE_HOST = 'ssl://www.google.com';
    const GOOGLE_PORT = 443;
    const MAX_SUGGESTIONS = 10;


    /**
     * Constructor
     *
     * Raises a fatal error (via raise_error) when the pspell engine is
     * configured but the pspell extension is not loaded.
     *
     * @param string $lang Language code
     */
    public function __construct($lang = 'en')
    {
        $this->rc     = rcmail::get_instance();
        $this->engine = $this->rc->config->get('spellcheck_engine', 'googie');
        $this->lang   = $lang;

        if ($this->engine == 'pspell' && !extension_loaded('pspell')) {
            raise_error(array(
                'code' => 500, 'type' => 'php',
                'file' => __FILE__, 'line' => __LINE__,
                'message' => "Pspell extension not available"), true, true);
        }
    }


    /**
     * Set content and check spelling
     *
     * @param string $text    Text content for spellchecking
     * @param bool   $is_html Enables HTML-to-Text conversion
     *
     * @return bool True when no misspelling found, otherwise false
     */
    public function check($text, $is_html = false)
    {
        // convert to plain text
        $this->content = $is_html ? $this->html2text($text) : $text;

        if ($this->engine == 'pspell') {
            $this->matches = $this->_pspell_check($this->content);
        }
        else {
            $this->matches = $this->_googie_check($this->content);
        }

        return $this->found() == 0;
    }


    /**
     * Number of misspellings found (after check)
     *
     * @return int Number of misspellings
     */
    public function found()
    {
        return count($this->matches);
    }


    /**
     * Returns suggestions for the specified word
     *
     * @param string $word The word
     *
     * @return array Suggestions list (at most MAX_SUGGESTIONS entries)
     */
    public function get_suggestions($word)
    {
        if ($this->engine == 'pspell') {
            return $this->_pspell_suggestions($word);
        }

        return $this->_googie_suggestions($word);
    }


    /**
     * Returns misspelled words
     *
     * @param string $text    The content for spellchecking. If empty, the
     *                        result of the last check() call is used.
     * @param bool   $is_html Enables HTML-to-Text conversion of $text
     *
     * @return array List of misspelled words
     */
    public function get_words($text = null, $is_html = false)
    {
        if ($this->engine == 'pspell') {
            return $this->_pspell_words($text, $is_html);
        }

        return $this->_googie_words($text, $is_html);
    }


    /**
     * Returns checking result in XML (Googiespell) format
     *
     * @return string XML content
     */
    public function get_xml()
    {
        // send output
        $out = '<?xml version="1.0" encoding="'.RCMAIL_CHARSET.'"?><spellresult charschecked="'.mb_strlen($this->content).'">';

        foreach ($this->matches as $item) {
            $out .= '<c o="'.$item[1].'" l="'.$item[2].'">';
            // pspell stores suggestions as an array, googie as a tab-separated string
            $out .= is_array($item[4]) ? implode("\t", $item[4]) : $item[4];
            $out .= '</c>';
        }

        $out .= '</spellresult>';

        return $out;
    }


    /**
     * Returns error message
     *
     * @return string Error message (null when no error was recorded)
     */
    public function error()
    {
        return $this->error;
    }


    /**
     * Checks the text using pspell
     *
     * @param string $text Text content for spellchecking
     *
     * @return array List of matches: array(word, offset, length, null, suggestions)
     */
    private function _pspell_check($text)
    {
        // init spellchecker
        $this->_pspell_init();

        if (!$this->plink) {
            return array();
        }

        $diff    = 0;
        $matches = array();

        foreach ($this->_tokenize($text) as $w) {
            $word = trim($w[0]);
            // preg_split reports byte offsets; $diff accumulates the surplus
            // bytes of multibyte words seen so far to approximate a character offset
            $pos  = $w[1] - $diff;
            $len  = mb_strlen($word);

            if ($this->_pspell_misspelled($word)) {
                $matches[] = array($word, $pos, $len, null, $this->_pspell_suggestions($word));
            }

            $diff += (strlen($word) - $len);
        }

        return $matches;
    }


    /**
     * Returns the misspelled words
     *
     * @param string $text    Text to check; when empty the result of the
     *                        last check() call is used
     * @param bool   $is_html Enables HTML-to-Text conversion of $text
     *
     * @return array List of misspelled words (always an array)
     */
    private function _pspell_words($text = null, $is_html = false)
    {
        $result = array();

        if ($text) {
            // init spellchecker
            $this->_pspell_init();

            if (!$this->plink) {
                return array();
            }

            // With PSpell we don't need to get suggestions to return misspelled words
            if ($is_html) {
                $text = $this->html2text($text);
            }

            foreach ($this->_tokenize($text) as $w) {
                $word = trim($w[0]);
                if ($this->_pspell_misspelled($word)) {
                    $result[] = $word;
                }
            }

            return $result;
        }

        foreach ($this->matches as $m) {
            $result[] = $m[0];
        }

        return $result;
    }


    /**
     * Returns suggestions for misspelled word
     *
     * @param string $word The word
     *
     * @return array Suggestions list (at most MAX_SUGGESTIONS entries)
     */
    private function _pspell_suggestions($word)
    {
        // init spellchecker
        $this->_pspell_init();

        if (!$this->plink) {
            return array();
        }

        $suggestions = pspell_suggest($this->plink, $word);

        // validate the type before counting/slicing
        if (!is_array($suggestions)) {
            return array();
        }

        if (sizeof($suggestions) > self::MAX_SUGGESTIONS) {
            $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
        }

        return $suggestions;
    }


    /**
     * Initializes PSpell dictionary
     *
     * Sets the error message when the dictionary cannot be loaded.
     */
    private function _pspell_init()
    {
        if (!$this->plink) {
            $this->plink = pspell_new($this->lang, null, null, RCMAIL_CHARSET, PSPELL_FAST);
        }

        if (!$this->plink) {
            $this->error = "Unable to load Pspell engine for selected language";
        }
    }


    /**
     * Tells whether pspell considers the word misspelled.
     * Empty tokens and tokens made only of digits/dots are never reported.
     *
     * @param string $word Trimmed token
     *
     * @return bool True if the word is misspelled
     */
    private function _pspell_misspelled($word)
    {
        return $word && preg_match('/[^0-9\.]/', $word) && !pspell_check($this->plink, $word);
    }


    /**
     * Splits text into word tokens using the separator pattern
     *
     * @param string $text Plain text
     *
     * @return array List of array(token, byte offset) pairs
     */
    private function _tokenize($text)
    {
        // -1 = no limit (passing null is deprecated in newer PHP versions)
        $tokens = preg_split($this->separator, $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);

        // preg_split() returns false on PCRE failure
        return is_array($tokens) ? $tokens : array();
    }


    /**
     * Checks the text using a Googiespell-compatible HTTP service
     *
     * Uses the 'spellcheck_uri' config option when set (the language code
     * is appended to it), otherwise the default Google host.
     *
     * @param string $text Text content for spellchecking
     *
     * @return array Regex matches of the <c> result elements:
     *               array(full match, offset, length, s attribute, suggestions)
     */
    private function _googie_check($text)
    {
        // spell check uri is configured
        $url = $this->rc->config->get('spellcheck_uri');

        if ($url) {
            $a_uri  = parse_url($url);
            $scheme = isset($a_uri['scheme']) ? $a_uri['scheme'] : '';
            $ssl    = ($scheme == 'https' || $scheme == 'ssl');
            $port   = !empty($a_uri['port']) ? $a_uri['port'] : ($ssl ? 443 : 80);
            $host   = ($ssl ? 'ssl://' : '') . (isset($a_uri['host']) ? $a_uri['host'] : '');
            $path   = (isset($a_uri['path']) ? $a_uri['path'] : '')
                . (!empty($a_uri['query']) ? '?'.$a_uri['query'] : '')
                . $this->lang;
        }
        else {
            $host = self::GOOGLE_HOST;
            $port = self::GOOGLE_PORT;
            $path = '/tbproxy/spell?lang=' . $this->lang;
        }

        // Google has some problem with spaces, use \n instead
        $text = str_replace(' ', "\n", $text);

        // The text is embedded in an XML document, so markup characters must
        // be escaped. str_replace is used (not htmlspecialchars) to stay
        // byte-safe for input that is not valid in the configured charset.
        $text = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);

        $text = '<?xml version="1.0" encoding="utf-8" ?>'
            .'<spellrequest textalreadyclipped="0" ignoredups="0" ignoredigits="1" ignoreallcaps="1">'
            .'<text>' . $text . '</text>'
            .'</spellrequest>';

        $store = '';
        if ($fp = fsockopen($host, $port, $errno, $errstr, 30)) {
            $out = "POST $path HTTP/1.0\r\n";
            $out .= "Host: " . str_replace('ssl://', '', $host) . "\r\n";
            $out .= "Content-Length: " . strlen($text) . "\r\n";
            $out .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $out .= "Connection: Close\r\n\r\n";
            $out .= $text;
            fwrite($fp, $out);

            while (!feof($fp)) {
                $line = fgets($fp, 128);
                // stop on read failure instead of looping until EOF is flagged
                if ($line === false) {
                    break;
                }
                $store .= $line;
            }
            fclose($fp);
        }

        if (!$store) {
            $this->error = "Empty result from spelling engine";
        }

        preg_match_all('/<c o="([^"]*)" l="([^"]*)" s="([^"]*)">([^<]*)<\/c>/', $store, $matches, PREG_SET_ORDER);

        return $matches;
    }


    /**
     * Returns the misspelled words (Googiespell engine)
     *
     * @param string $text    Text to check; when empty the result and
     *                        content of the last check() call are used
     * @param bool   $is_html Enables HTML-to-Text conversion of $text
     *
     * @return array List of misspelled words
     */
    private function _googie_words($text = null, $is_html = false)
    {
        if ($text) {
            if ($is_html) {
                $text = $this->html2text($text);
            }

            $matches = $this->_googie_check($text);
        }
        else {
            $matches = $this->matches;
            $text    = $this->content;
        }

        $result = array();

        foreach ($matches as $m) {
            // the service reports offset/length; cut the word out of the text
            $result[] = mb_substr($text, (int) $m[1], (int) $m[2], RCMAIL_CHARSET);
        }

        return $result;
    }


    /**
     * Returns suggestions for the word (Googiespell engine)
     *
     * @param string $word The word; when empty the first match of the
     *                     last check() call is used
     *
     * @return array Suggestions list (at most MAX_SUGGESTIONS entries)
     */
    private function _googie_suggestions($word)
    {
        if ($word) {
            $matches = $this->_googie_check($word);
        }
        else {
            $matches = $this->matches;
        }

        if (!empty($matches[0][4])) {
            $suggestions = explode("\t", $matches[0][4]);
            if (sizeof($suggestions) > self::MAX_SUGGESTIONS) {
                $suggestions = array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
            }

            return $suggestions;
        }

        return array();
    }


    /**
     * Converts HTML content to plain text using the html2text class
     *
     * @param string $text HTML content
     *
     * @return string Plain text
     */
    private function html2text($text)
    {
        $h2t = new html2text($text, false, true, 0);
        return $h2t->get_text();
    }
}