/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * Copyright (C) 2002-2022 Németh László * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. * * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. *
* ***** END LICENSE BLOCK ***** */ /* * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada * And Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/
class HunspellImpl
{ public:
HunspellImpl(constchar* affpath, constchar* dpath, constchar* key = NULL);
~HunspellImpl(); int add_dic(constchar* dpath, constchar* key = NULL);
std::vector<std::string> suffix_suggest(const std::string& root_word);
std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl);
std::vector<std::string> generate(const std::string& word, const std::string& pattern);
std::vector<std::string> stem(const std::string& word);
std::vector<std::string> stem(const std::vector<std::string>& morph);
std::vector<std::string> analyze(const std::string& word); int get_langnum() const; bool input_conv(const std::string& word, std::string& dest); bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
std::vector<std::string> suggest(const std::string& word); const std::string& get_wordchars_cpp() const; const std::vector<w_char>& get_wordchars_utf16() const; const std::string& get_dict_encoding() const; int add(const std::string& word); int add_with_affix(const std::string& word, const std::string& example); int remove(const std::string& word); const std::string& get_version_cpp() const; struct cs_info* get_csconv();
int spell(constchar* word, int* info = NULL, char** root = NULL); int suggest(char*** slst, constchar* word); int suffix_suggest(char*** slst, constchar* root_word); void free_list(char*** slst, int n); char* get_dic_encoding(); int analyze(char*** slst, constchar* word); int stem(char*** slst, constchar* word); int stem(char*** slst, char** morph, int n); int generate(char*** slst, constchar* word, constchar* word2); int generate(char*** slst, constchar* word, char** desc, int n); constchar* get_wordchars() const; constchar* get_version() const; int input_conv(constchar* word, char* dest, size_t destsize);
private:
AffixMgr* pAMgr;
std::vector<HashMgr*> m_HMgrs;
SuggestMgr* pSMgr; char* affixpath;
std::string encoding; struct cs_info* csconv; int langnum; int utf8; int complexprefixes;
std::vector<std::string> wordbreak;
/* first set up the hash manager */
m_HMgrs.push_back(new HashMgr(dpath, affpath, key));
/* next set up the affix manager */ /* it needs access to the hash manager lookup methods */
pAMgr = new AffixMgr(affpath, m_HMgrs, key);
/* get the preferred try string and the dictionary */ /* encoding from the Affix Manager for that dictionary */ char* try_string = pAMgr->get_try_string();
encoding = pAMgr->get_encoding();
langnum = pAMgr->get_langnum();
utf8 = pAMgr->get_utf8(); if (!utf8)
csconv = get_current_cs(encoding);
complexprefixes = pAMgr->get_complexprefixes();
wordbreak = pAMgr->get_breaktable();
/* and finally set up the suggestion manager */
pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); if (try_string)
free(try_string);
}
// Destructor: releases everything this object owns.
//
// Deletes the suggestion manager, the affix manager and every hash manager
// stored in m_HMgrs, then frees the saved affix path. The character
// conversion table (csconv) is deleted only in MOZILLA_CLIENT builds; in
// other builds the pointer is merely reset.
// NOTE(review): presumably csconv points at a table this object does not own
// outside MOZILLA_CLIENT -- confirm against get_current_cs().
HunspellImpl::~HunspellImpl() {
  delete pSMgr;
  delete pAMgr;
  for (size_t i = 0; i < m_HMgrs.size(); ++i)
    delete m_HMgrs[i];
  pSMgr = NULL;
  pAMgr = NULL;
  // Preprocessor directives must start their own line.
#ifdef MOZILLA_CLIENT
  delete[] csconv;
#endif
  csconv = NULL;
  // affixpath is released with free(), so it is expected to come from a
  // malloc-family allocation (TODO confirm at the assignment site).
  if (affixpath)
    free(affixpath);
  affixpath = NULL;
}
// load extra dictionaries int HunspellImpl::add_dic(constchar* dpath, constchar* key) { if (!affixpath) return 1;
m_HMgrs.push_back(new HashMgr(dpath, affixpath, key)); return 0;
}
// make a copy of src at dest while removing all characters // specified in IGNORE rule void HunspellImpl::clean_ignore(std::string& dest, const std::string& src) {
dest.clear();
dest.assign(src); constchar* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL; if (ignoredchars != NULL) { if (utf8) { const std::vector<w_char>& ignoredchars_utf16 =
pAMgr->get_ignore_utf16();
remove_ignored_chars_utf(dest, ignoredchars_utf16);
} else {
remove_ignored_chars(dest, ignoredchars);
}
}
}
// make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording // their presence with the abbreviation flag // also since already going through character by character, // set the capitalization type // return the length of the "cleaned" (and UTF-8 encoded) word
// remove IGNORE characters from the string
std::string w2;
clean_ignore(w2, src);
constchar* q = w2.c_str();
// first skip over any leading blanks while (*q == ' ')
++q;
// now strip off any trailing periods (recording their presence)
*pabbrev = 0; int nl = strlen(q); while ((nl > 0) && (*(q + nl - 1) == '.')) {
nl--;
(*pabbrev)++;
}
// if no characters are left it can't be capitalized if (nl <= 0) {
*pcaptype = NOCAP; return 0;
}
// first skip over any leading blanks while (*q == ' ')
++q;
// now strip off any trailing periods (recording their presence)
*pabbrev = 0; int nl = strlen((constchar*)q); while ((nl > 0) && (*(q + nl - 1) == '.')) {
nl--;
(*pabbrev)++;
}
// if no characters are left it can't be capitalized if (nl <= 0) {
*pcaptype = NOCAP; return;
}
// now determine the capitalization type of the first nl letters int ncap = 0; int nneutral = 0; int nc = 0;
if (!utf8) { while (nl > 0) {
nc++; if (csconv[(*q)].ccase)
ncap++; if (csconv[(*q)].cupper == csconv[(*q)].clower)
nneutral++;
dest.push_back(*q++);
nl--;
} // remember to terminate the destination string
firstcap = csconv[static_cast<unsignedchar>(dest[0])].ccase;
} else {
std::vector<w_char> t;
u8_u16(t, src); for (size_t i = 0; i < t.size(); ++i) { unsignedshort idx = (t[i].h << 8) + t[i].l; unsignedshort low = unicodetolower(idx, langnum); if (idx != low)
ncap++; if (unicodetoupper(idx, langnum) == low)
nneutral++;
}
u16_u8(dest, t); if (ncap) { unsignedshort idx = (t[0].h << 8) + t[0].l;
firstcap = (idx != unicodetolower(idx, langnum));
}
}
/* Put `word` in front of every suggestion already held in `slst`. */
void HunspellImpl::insert_sug(std::vector<std::string>& slst,
                              const std::string& word) {
  std::vector<std::string>::iterator front = slst.begin();
  slst.insert(front, word);
}
bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { bool r = spell_internal(word, info, root); if (r && root) { // output conversion
RepList* rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; if (rl) {
std::string wspace; if (rl->conv(*root, wspace)) {
*root = wspace;
}
}
} return r;
}
int info2 = 0; if (!info)
info = &info2; else
*info = 0;
// Hunspell supports XML input of the simplified API (see manual) if (word == SPELL_XML) returntrue; if (utf8) { if (word.size() >= MAXWORDUTF8LEN) returnfalse;
} else { if (word.size() >= MAXWORDLEN) returnfalse;
} int captype = NOCAP;
size_t abbv = 0;
size_t wl = 0;
// other patterns for (size_t j = 0; j < wordbreak.size(); ++j) {
size_t plen = wordbreak[j].size();
size_t found = scw.find(wordbreak[j]); if ((found > 0) && (found < wl - plen)) {
size_t found2 = scw.find(wordbreak[j], found + 1); // try to break at the second occurance // to recognize dictionary words with wordbreak if (found2 > 0 && (found2 < wl - plen))
found = found2; if (!spell(scw.substr(found + plen))) continue;
std::string suffix(scw.substr(found));
scw.resize(found); // examine 2 sides of the break point if (spell(scw)) returntrue;
scw.append(suffix);
// LANG_hu: spec. dash rule if (langnum == LANG_hu && wordbreak[j] == "-") {
suffix = scw.substr(found + 1);
scw.resize(found + 1); if (spell(scw)) returntrue; // check the first part with dash
scw.append(suffix);
} // end of LANG specific region
}
}
// other patterns (break at first break point) for (size_t j = 0; j < wordbreak.size(); ++j) {
size_t plen = wordbreak[j].size();
size_t found = scw.find(wordbreak[j]); if ((found > 0) && (found < wl - plen)) { if (!spell(scw.substr(found + plen))) continue;
std::string suffix(scw.substr(found));
scw.resize(found); // examine 2 sides of the break point if (spell(scw)) returntrue;
scw.append(suffix);
// LANG_hu: spec. dash rule if (langnum == LANG_hu && wordbreak[j] == "-") {
suffix = scw.substr(found + 1);
scw.resize(found + 1); if (spell(scw)) returntrue; // check the first part with dash
scw.append(suffix);
} // end of LANG specific region
}
}
}
int onlycmpdsug = 0; if (!pSMgr || m_HMgrs.empty()) return slst;
// process XML input of the simplified API (see manual) if (word.compare(0, sizeof(SPELL_XML) - 3, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { return spellml(word);
} if (utf8) { if (word.size() >= MAXWORDUTF8LEN) return slst;
} else { if (word.size() >= MAXWORDLEN) return slst;
}
size_t wl = 0;
clock_t timelimit; // initialize in every suggestion call
timelimit = clock();
// check capitalized form for FORCEUCASE if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) { int info = SPELL_ORIGCAP; if (checkword(scw, &info, NULL)) {
std::string form(scw);
mkinitcap(form);
slst.push_back(form); return slst;
}
}
switch (captype) { case NOCAP: {
good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; if (abbv) {
std::string wspace(scw);
wspace.push_back('.');
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst;
} break;
}
case INITCAP: {
capwords = true;
good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst;
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; break;
} case HUHINITCAP:
capwords = true; /* FALLTHROUGH */ case HUHCAP: {
good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; // something.The -> something. The
size_t dot_pos = scw.find('.'); if (dot_pos != std::string::npos) {
std::string postdot = scw.substr(dot_pos + 1); int captype_; if (utf8) {
std::vector<w_char> postdotu;
u8_u16(postdotu, postdot);
captype_ = get_captype_utf8(postdotu, langnum);
} else {
captype_ = get_captype(postdot, csconv);
} if (captype_ == INITCAP) {
std::string str(scw);
str.insert(dot_pos + 1, 1, ' ');
insert_sug(slst, str);
}
}
std::string wspace;
if (captype == HUHINITCAP) { // TheOpenOffice.org -> The OpenOffice.org
wspace = scw;
mkinitsmall2(wspace, sunicw);
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst;
}
wspace = scw;
mkallsmall2(wspace, sunicw); if (spell(wspace.c_str()))
insert_sug(slst, wspace);
size_t prevns = slst.size();
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; if (captype == HUHINITCAP) {
mkinitcap2(wspace, sunicw); if (spell(wspace.c_str()))
insert_sug(slst, wspace);
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst;
} // aNew -> "a New" (instead of "a new") for (size_t j = prevns; j < slst.size(); ++j) { constchar* space = strchr(slst[j].c_str(), ' '); if (space) {
size_t slen = strlen(space + 1); // different case after space (need capitalisation) if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) {
std::string first(slst[j].c_str(), space + 1);
std::string second(space + 1);
std::vector<w_char> w; if (utf8)
u8_u16(w, second);
mkinitcap2(second, w); // set as first suggestion
slst.erase(slst.begin() + j);
slst.insert(slst.begin(), first + second);
}
}
} break;
}
case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
insert_sug(slst, wspace);
mkinitcap2(wspace, sunicw);
good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; for (size_t j = 0; j < slst.size(); ++j) {
mkallcap(slst[j]); if (pAMgr && pAMgr->get_checksharps()) { if (utf8) {
mystrrep(slst[j], "\xC3\x9F", "SS");
} else {
mystrrep(slst[j], "\xDF", "SS");
}
}
} break;
}
}
// LANG_hu section: replace '-' with ' ' in Hungarian if (langnum == LANG_hu) { for (size_t j = 0; j < slst.size(); ++j) {
size_t pos = slst[j].find('-'); if (pos != std::string::npos) { int info;
std::string w(slst[j].substr(0, pos));
w.append(slst[j].substr(pos + 1));
(void)spell(w, &info, NULL); if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
slst[j][pos] = ' ';
} else
slst[j][pos] = '-';
}
}
} // END OF LANG_hu section // try ngram approach since found nothing good suggestion if (!good && pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) { switch (captype) { case NOCAP: {
pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; break;
} /* FALLTHROUGH */ case HUHINITCAP:
capwords = true; /* FALLTHROUGH */ case HUHCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; break;
} case INITCAP: {
capwords = true;
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; break;
} case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
size_t oldns = slst.size();
pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; for (size_t j = oldns; j < slst.size(); ++j) {
mkallcap(slst[j]);
} break;
}
}
}
// try dash suggestion (Afo-American -> Afro-American) // Note: LibreOffice was modified to treat dashes as word // characters to check "scot-free" etc. word forms, but // we need to handle suggestions for "Afo-American", etc., // while "Afro-American" is missing from the dictionary. // TODO avoid possible overgeneration
size_t dash_pos = scw.find('-'); if (dash_pos != std::string::npos) { int nodashsug = 1; for (size_t j = 0; j < slst.size() && nodashsug == 1; ++j) { if (slst[j].find('-') != std::string::npos)
nodashsug = 0;
}
size_t prev_pos = 0; bool last = false;
while (!good && nodashsug && !last) { if (dash_pos == scw.size())
last = 1;
std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos); if (!spell(chunk.c_str())) {
std::vector<std::string> nlst = suggest(chunk.c_str()); if (clock() > timelimit + TIMELIMIT_GLOBAL) return slst; for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
std::string wspace = scw.substr(0, prev_pos);
wspace.append(*j); if (!last) {
wspace.append("-");
wspace.append(scw.substr(dash_pos + 1));
} int info = 0; if (pAMgr && pAMgr->get_forbiddenword())
checkword(wspace, &info, NULL); if (!(info & SPELL_FORBIDDEN))
insert_sug(slst, wspace);
}
nodashsug = 0;
} if (!last) {
prev_pos = dash_pos + 1;
dash_pos = scw.find('-', prev_pos);
} if (dash_pos == std::string::npos)
dash_pos = scw.size();
}
} return slst;
}
if (!result.empty()) { // word reversing wrapper for complex prefixes if (complexprefixes) { if (utf8)
reverseword_utf(result); else
reverseword(result);
} return line_tok(result, MSEP_REC);
}
// compound word with dash (HU) I18n // LANG_hu section: set dash information for suggestions
size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos; if (dash_pos != std::string::npos) { int nresult = 0;
// return the beginning of the element (attr == NULL) or the attribute
std::string::size_type HunspellImpl::get_xml_pos(const std::string& s, std::string::size_type pos, constchar* attr) { if (pos == std::string::npos) return std::string::npos;
std::string::size_type qpos = in_word.find("); if (qpos == std::string::npos) return slst; // bad XML input
std::string::size_type q2pos = in_word.find('>', qpos); if (q2pos == std::string::npos) return slst; // bad XML input
q2pos = in_word.find(", q2pos); if (q2pos == std::string::npos) return slst; // bad XML input
if (check_xml_par(in_word, qpos, "type=", "analyze")) {
std::string cw = get_xml_par(in_word, in_word.find('>', q2pos)); if (!cw.empty())
slst = analyze(cw); if (slst.empty()) return slst; // convert the result to <code><a>ana1</a><a>ana2</a></code> format
std::string r;
r.append(""); for (size_t i = 0; i < slst.size(); ++i) {
r.append("");
int HunspellImpl::stem(char*** slst, char** desc, int n) {
std::vector<std::string> morph;
morph.reserve(n); for (int i = 0; i < n; ++i)
morph.push_back(desc[i]);
int HunspellImpl::generate(char*** slst, constchar* word, constchar* pattern) {
std::vector<std::string> stems = generate(word, pattern); return munge_vector(slst, stems);
}
int HunspellImpl::generate(char*** slst, constchar* word, char** pl, int pln) {
std::vector<std::string> morph;
morph.reserve(pln); for (int i = 0; i < pln; ++i)
morph.push_back(pl[i]);
// C API entry point: treats the opaque handle as a HunspellImpl and forwards
// to HunspellImpl::stem(slst, desc, n), returning its result unchanged.
int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) {
  return reinterpret_cast<HunspellImpl*>(pHunspell)->stem(slst, desc, n);
}
int Hunspell_generate(Hunhandle* pHunspell, char*** slst, constchar* word, constchar* pattern)
{ returnreinterpret_cast<HunspellImpl*>(pHunspell)->generate(slst, word, pattern);
}
int Hunspell_generate2(Hunhandle* pHunspell, char*** slst,
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.16 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.