Added HunSpell library to project
This commit is contained in:
parent
dd7b2256f8
commit
b31a923d9d
21
goldlib/hunspell/README
Normal file
21
goldlib/hunspell/README
Normal file
@ -0,0 +1,21 @@
|
||||
Hunspell spell checker and morphological analyser library
|
||||
|
||||
Documentation, tests, examples: http://hunspell.sourceforge.net
|
||||
|
||||
Author of Hunspell:
|
||||
László Németh (nemethl (at) gyorsposta.hu)
|
||||
|
||||
Hunspell is based on OpenOffice.org's MySpell. MySpell's author:
|
||||
Kevin Hendricks (kevin.hendricks (at) sympatico.ca)
|
||||
|
||||
License: GPL 2.0/LGPL 2.1/MPL 1.1 tri-license
|
||||
|
||||
The contents of this library may be used under the terms of
|
||||
the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
the GNU Lesser General Public License Version 2.1 or later (the "LGPL",
|
||||
see http://gnu.org/copyleft/lesser.html) or the Mozilla Public License
|
||||
Version 1.1 or later (the "MPL", see http://mozilla.org/MPL/MPL-1.1.html).
|
||||
|
||||
Software distributed under these licenses is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the licenses
|
||||
for the specific language governing rights and limitations under the licenses.
|
845
goldlib/hunspell/affentry.cxx
Normal file
845
goldlib/hunspell/affentry.cxx
Normal file
@ -0,0 +1,845 @@
|
||||
#include "license.hun"
|
||||
#include "license.mys"
|
||||
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
|
||||
#include "affentry.hxx"
|
||||
#include "csutil.hxx"
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
|
||||
// Build a prefix entry from a parsed affix record.
//   pmgr - affix manager this entry calls back into (pointer is stored as-is)
//   dp   - parsed record; its scalar fields are copied and its strip, appnd,
//          morphcode and contclass pointers are taken over without duplication
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
{
    // register affix manager
    pmyMgr = pmgr;

    // set up the initial values from the parsed record
    aflag    = dp->aflag;      // flag
    strip    = dp->strip;      // string to strip
    appnd    = dp->appnd;      // string to append
    stripl   = dp->stripl;     // length of strip string
    appndl   = dp->appndl;     // length of append string
    numconds = dp->numconds;   // number of conditions to match
    opts     = dp->opts;       // cross product flag

    // then copy over all of the conditions (SETSIZE elements of conds.base)
    memcpy(&conds.base[0], &dp->conds.base[0], SETSIZE * sizeof(conds.base[0]));

    // a fresh entry is not linked to any other entry yet; flgnxt is cleared
    // together with the other links so getFlgNxt() never returns garbage
    next   = NULL;
    nextne = NULL;
    nexteq = NULL;
    flgnxt = NULL;

    morphcode    = dp->morphcode;
    contclass    = dp->contclass;
    contclasslen = dp->contclasslen;
}
|
||||
|
||||
|
||||
// Release the strings and condition tables held by this entry.
PfxEntry::~PfxEntry()
{
    aflag  = 0;
    pmyMgr = NULL;

    // free(NULL) is a no-op, so the pointers need no individual guards
    free(appnd);
    appnd = NULL;
    free(strip);
    strip = NULL;

    // the per-condition wide character lists are only released in UTF-8 mode
    if (opts & aeUTF8) {
        for (int slot = 0; slot < 8; slot++) {
            free(conds.utf8.wchars[slot]);
        }
    }

    // morphcode / contclass are left alone when the matching alias bit is set
    if (!(opts & aeALIASM)) free(morphcode);
    if (!(opts & aeALIASF)) free(contclass);
}
|
||||
|
||||
// add prefix to this word assuming conditions hold
|
||||
char * PfxEntry::add(const char * word, int len)
|
||||
{
|
||||
char tword[MAXWORDUTF8LEN + 4];
|
||||
|
||||
if ((len > stripl) && (len >= numconds) && test_condition(word) &&
|
||||
(!stripl || (strncmp(word, strip, stripl) == 0)) &&
|
||||
((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
|
||||
/* we have a match so add prefix */
|
||||
char * pp = tword;
|
||||
if (appndl) {
|
||||
strcpy(tword,appnd);
|
||||
pp += appndl;
|
||||
}
|
||||
strcpy(pp, (word + stripl));
|
||||
return mystrdup(tword);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
inline int PfxEntry::test_condition(const char * st)
|
||||
{
|
||||
int cond;
|
||||
unsigned char * cp = (unsigned char *)st;
|
||||
if (!(opts & aeUTF8)) { // 256-character codepage
|
||||
for (cond = 0; cond < numconds; cond++) {
|
||||
if ((conds.base[*cp++] & (1 << cond)) == 0) return 0;
|
||||
}
|
||||
} else { // UTF-8 encoding
|
||||
unsigned short wc;
|
||||
for (cond = 0; cond < numconds; cond++) {
|
||||
// a simple 7-bit ASCII character in UTF-8
|
||||
if ((*cp >> 7) == 0) {
|
||||
// also check limit (end of word)
|
||||
if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0;
|
||||
// UTF-8 multibyte character
|
||||
} else {
|
||||
// not dot wildcard in rule
|
||||
if (!conds.utf8.all[cond]) {
|
||||
if (conds.utf8.neg[cond]) {
|
||||
u8_u16((w_char *) &wc, 1, (char *) cp);
|
||||
if (conds.utf8.wchars[cond] &&
|
||||
flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
|
||||
wc, (short) conds.utf8.wlen[cond])) return 0;
|
||||
} else {
|
||||
if (!conds.utf8.wchars[cond]) return 0;
|
||||
u8_u16((w_char *) &wc, 1, (char *) cp);
|
||||
if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
|
||||
wc, (short)conds.utf8.wlen[cond])) return 0;
|
||||
}
|
||||
}
|
||||
// jump to next UTF-8 character
|
||||
for(cp++; (*cp & 0xc0) == 0x80; cp++);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
// check if this prefix entry matches
|
||||
struct hentry * PfxEntry::check(const char * word, int len, char in_compound, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
struct hentry * he; // hash entry of root word or NULL
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
|
||||
// on entry prefix is 0 length or already matches the beginning of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing prefix and adding
|
||||
// back any characters that would have been stripped
|
||||
|
||||
if (stripl) strcpy (tmpword, strip);
|
||||
strcpy ((tmpword + stripl), (word + appndl));
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being
|
||||
// tested
|
||||
|
||||
// if all conditions are met then check if resulting
|
||||
// root word in the dictionary
|
||||
|
||||
if (test_condition(tmpword)) {
|
||||
tmpl += stripl;
|
||||
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
||||
do {
|
||||
if (TESTAFF(he->astr, aflag, he->alen) &&
|
||||
// forbid single prefixes with pseudoroot flag
|
||||
! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
|
||||
// needflag
|
||||
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
|
||||
(contclass && TESTAFF(contclass, needflag, contclasslen))))
|
||||
return he;
|
||||
} while ((he = he->next_homonym)); // check homonyms
|
||||
}
|
||||
|
||||
// prefix matched but no root word was found
|
||||
// if aeXPRODUCT is allowed, try again but now
|
||||
// ross checked combined with a suffix
|
||||
|
||||
//if ((opts & aeXPRODUCT) && in_compound) {
|
||||
if ((opts & aeXPRODUCT)) {
|
||||
he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
|
||||
0, NULL, FLAG_NULL, needflag, in_compound);
|
||||
if (he) return he;
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// check if this prefix entry matches
|
||||
struct hentry * PfxEntry::check_twosfx(const char * word, int len,
|
||||
char in_compound, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
struct hentry * he; // hash entry of root word or NULL
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
|
||||
// on entry prefix is 0 length or already matches the beginning of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing prefix and adding
|
||||
// back any characters that would have been stripped
|
||||
|
||||
if (stripl) strcpy (tmpword, strip);
|
||||
strcpy ((tmpword + stripl), (word + appndl));
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being
|
||||
// tested
|
||||
|
||||
// if all conditions are met then check if resulting
|
||||
// root word in the dictionary
|
||||
|
||||
if (test_condition(tmpword)) {
|
||||
tmpl += stripl;
|
||||
|
||||
// prefix matched but no root word was found
|
||||
// if aeXPRODUCT is allowed, try again but now
|
||||
// cross checked combined with a suffix
|
||||
|
||||
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
||||
he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
|
||||
if (he) return he;
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// check if this prefix entry matches
|
||||
char * PfxEntry::check_twosfx_morph(const char * word, int len,
|
||||
char in_compound, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
|
||||
// on entry prefix is 0 length or already matches the beginning of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing prefix and adding
|
||||
// back any characters that would have been stripped
|
||||
|
||||
if (stripl) strcpy (tmpword, strip);
|
||||
strcpy ((tmpword + stripl), (word + appndl));
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being
|
||||
// tested
|
||||
|
||||
// if all conditions are met then check if resulting
|
||||
// root word in the dictionary
|
||||
|
||||
if (test_condition(tmpword)) {
|
||||
tmpl += stripl;
|
||||
|
||||
// prefix matched but no root word was found
|
||||
// if aeXPRODUCT is allowed, try again but now
|
||||
// ross checked combined with a suffix
|
||||
|
||||
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
||||
return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
|
||||
aeXPRODUCT, (AffEntry *)this, needflag);
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// check if this prefix entry matches
|
||||
char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
struct hentry * he; // hash entry of root word or NULL
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
char result[MAXLNLEN];
|
||||
char * st;
|
||||
|
||||
*result = '\0';
|
||||
|
||||
// on entry prefix is 0 length or already matches the beginning of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing prefix and adding
|
||||
// back any characters that would have been stripped
|
||||
|
||||
if (stripl) strcpy (tmpword, strip);
|
||||
strcpy ((tmpword + stripl), (word + appndl));
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being
|
||||
// tested
|
||||
|
||||
// if all conditions are met then check if resulting
|
||||
// root word in the dictionary
|
||||
|
||||
if (test_condition(tmpword)) {
|
||||
tmpl += stripl;
|
||||
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
||||
do {
|
||||
if (TESTAFF(he->astr, aflag, he->alen) &&
|
||||
// forbid single prefixes with pseudoroot flag
|
||||
! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
|
||||
// needflag
|
||||
((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
|
||||
(contclass && TESTAFF(contclass, needflag, contclasslen)))) {
|
||||
if (morphcode) strcat(result, morphcode); else strcat(result,getKey());
|
||||
if (he->description) {
|
||||
if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word);
|
||||
strcat(result,he->description);
|
||||
}
|
||||
strcat(result, "\n");
|
||||
}
|
||||
} while ((he = he->next_homonym));
|
||||
}
|
||||
|
||||
// prefix matched but no root word was found
|
||||
// if aeXPRODUCT is allowed, try again but now
|
||||
// ross checked combined with a suffix
|
||||
|
||||
if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
|
||||
st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
|
||||
FLAG_NULL, needflag);
|
||||
if (st) {
|
||||
strcat(result, st);
|
||||
free(st);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*result) return mystrdup(result);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// Build a suffix entry from a parsed affix record.
//   pmgr - affix manager this entry calls back into (pointer is stored as-is)
//   dp   - parsed record; its scalar fields are copied and its strip, appnd,
//          morphcode and contclass pointers are taken over without duplication
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
{
    // register affix manager
    pmyMgr = pmgr;

    // set up the initial values from the parsed record
    aflag    = dp->aflag;      // char flag
    strip    = dp->strip;      // string to strip
    appnd    = dp->appnd;      // string to append
    stripl   = dp->stripl;     // length of strip string
    appndl   = dp->appndl;     // length of append string
    numconds = dp->numconds;   // number of conditions to match
    opts     = dp->opts;       // cross product flag

    // then copy over all of the conditions (SETSIZE elements of conds.base)
    memcpy(&conds.base[0], &dp->conds.base[0], SETSIZE * sizeof(conds.base[0]));

    // copy of the append string produced by myrevstrdup()
    rappnd = myrevstrdup(appnd);

    // a fresh entry is not linked to any other entry yet; clear every link so
    // that none of them is ever read uninitialized
    next     = NULL;
    nexteq   = NULL;
    nextne   = NULL;
    flgnxt   = NULL;
    l_morph  = NULL;
    r_morph  = NULL;
    eq_morph = NULL;

    morphcode    = dp->morphcode;
    contclass    = dp->contclass;
    contclasslen = dp->contclasslen;
}
|
||||
|
||||
|
||||
// Release the strings and condition tables held by this entry.
SfxEntry::~SfxEntry()
{
    aflag  = 0;
    pmyMgr = NULL;

    // free(NULL) is a no-op, so the pointers need no individual guards
    free(appnd);
    appnd = NULL;
    free(rappnd);
    free(strip);
    strip = NULL;

    // the per-condition wide character lists are only released in UTF-8 mode
    if (opts & aeUTF8) {
        for (int slot = 0; slot < 8; slot++) {
            free(conds.utf8.wchars[slot]);
        }
    }

    // morphcode / contclass are left alone when the matching alias bit is set
    if (!(opts & aeALIASM)) free(morphcode);
    if (!(opts & aeALIASF)) free(contclass);
}
|
||||
|
||||
// add suffix to this word assuming conditions hold
|
||||
char * SfxEntry::add(const char * word, int len)
|
||||
{
|
||||
char tword[MAXWORDUTF8LEN + 4];
|
||||
|
||||
/* make sure all conditions match */
|
||||
if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) &&
|
||||
(!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
|
||||
((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
|
||||
/* we have a match so add suffix */
|
||||
strcpy(tword,word);
|
||||
if (appndl) {
|
||||
strcpy(tword + len - stripl, appnd);
|
||||
} else {
|
||||
*(tword + len - stripl) = '\0';
|
||||
}
|
||||
return mystrdup(tword);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
inline int SfxEntry::test_condition(const char * st, const char * beg)
|
||||
{
|
||||
int cond;
|
||||
unsigned char * cp = (unsigned char *) st;
|
||||
if (!(opts & aeUTF8)) { // 256-character codepage
|
||||
// Dömölki affix algorithm
|
||||
for (cond = numconds; --cond >= 0; ) {
|
||||
if ((conds.base[*--cp] & (1 << cond)) == 0) return 0;
|
||||
}
|
||||
} else { // UTF-8 encoding
|
||||
unsigned short wc;
|
||||
for (cond = numconds; --cond >= 0; ) {
|
||||
// go to next character position and check limit
|
||||
if ((char *) --cp < beg) return 0;
|
||||
// a simple 7-bit ASCII character in UTF-8
|
||||
if ((*cp >> 7) == 0) {
|
||||
if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0;
|
||||
// UTF-8 multibyte character
|
||||
} else {
|
||||
// go to first character of UTF-8 multibyte character
|
||||
for (; (*cp & 0xc0) == 0x80; cp--);
|
||||
// not dot wildcard in rule
|
||||
if (!conds.utf8.all[cond]) {
|
||||
if (conds.utf8.neg[cond]) {
|
||||
u8_u16((w_char *) &wc, 1, (char *) cp);
|
||||
if (conds.utf8.wchars[cond] &&
|
||||
flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
|
||||
wc, (short) conds.utf8.wlen[cond])) return 0;
|
||||
} else {
|
||||
if (!conds.utf8.wchars[cond]) return 0;
|
||||
u8_u16((w_char *) &wc, 1, (char *) cp);
|
||||
if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
|
||||
wc, (short)conds.utf8.wlen[cond])) return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// see if this suffix is present in the word
|
||||
struct hentry * SfxEntry::check(const char * word, int len, int optflags,
|
||||
AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
struct hentry * he; // hash entry pointer
|
||||
unsigned char * cp;
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
PfxEntry* ep = (PfxEntry *) ppfx;
|
||||
|
||||
// if this suffix is being cross checked with a prefix
|
||||
// but it does not support cross products skip it
|
||||
|
||||
if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
|
||||
return NULL;
|
||||
|
||||
// upon entry suffix is 0 length or already matches the end of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
// the second condition is not enough for UTF-8 strings
|
||||
// it checked in test_condition()
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing suffix and adding
|
||||
// back any characters that would have been stripped or
|
||||
// or null terminating the shorter string
|
||||
|
||||
strcpy (tmpword, word);
|
||||
cp = (unsigned char *)(tmpword + tmpl);
|
||||
if (stripl) {
|
||||
strcpy ((char *)cp, strip);
|
||||
tmpl += stripl;
|
||||
cp = (unsigned char *)(tmpword + tmpl);
|
||||
} else *cp = '\0';
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being // tested
|
||||
|
||||
// if all conditions are met then check if resulting
|
||||
// root word in the dictionary
|
||||
|
||||
if (test_condition((char *) cp, (char *) tmpword)) {
|
||||
|
||||
#ifdef SZOSZABLYA_POSSIBLE_ROOTS
|
||||
fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
|
||||
#endif
|
||||
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
|
||||
do {
|
||||
// check conditional suffix (enabled by prefix)
|
||||
if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
|
||||
TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
|
||||
(((optflags & aeXPRODUCT) == 0) ||
|
||||
TESTAFF(he->astr, ep->getFlag(), he->alen) ||
|
||||
// enabled by prefix
|
||||
((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
|
||||
) &&
|
||||
// handle cont. class
|
||||
((!cclass) ||
|
||||
((contclass) && TESTAFF(contclass, cclass, contclasslen))
|
||||
) &&
|
||||
// handle required flag
|
||||
((!needflag) ||
|
||||
(TESTAFF(he->astr, needflag, he->alen) ||
|
||||
((contclass) && TESTAFF(contclass, needflag, contclasslen)))
|
||||
)
|
||||
) return he;
|
||||
} while ((he = he->next_homonym)); // check homonyms
|
||||
|
||||
// obsolote stemming code (used only by the
|
||||
// experimental SuffixMgr:suggest_pos_stems)
|
||||
// store resulting root in wlst
|
||||
} else if (wlst && (*ns < maxSug)) {
|
||||
int cwrd = 1;
|
||||
for (int k=0; k < *ns; k++)
|
||||
if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
|
||||
if (cwrd) {
|
||||
wlst[*ns] = mystrdup(tmpword);
|
||||
if (wlst[*ns] == NULL) {
|
||||
for (int j=0; j<*ns; j++) free(wlst[j]);
|
||||
*ns = -1;
|
||||
return NULL;
|
||||
}
|
||||
(*ns)++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// see if two-level suffix is present in the word
|
||||
struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
|
||||
AffEntry* ppfx, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
struct hentry * he; // hash entry pointer
|
||||
unsigned char * cp;
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
PfxEntry* ep = (PfxEntry *) ppfx;
|
||||
|
||||
|
||||
// if this suffix is being cross checked with a prefix
|
||||
// but it does not support cross products skip it
|
||||
|
||||
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
|
||||
return NULL;
|
||||
|
||||
// upon entry suffix is 0 length or already matches the end of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing suffix and adding
|
||||
// back any characters that would have been stripped or
|
||||
// or null terminating the shorter string
|
||||
|
||||
strcpy (tmpword, word);
|
||||
cp = (unsigned char *)(tmpword + tmpl);
|
||||
if (stripl) {
|
||||
strcpy ((char *)cp, strip);
|
||||
tmpl += stripl;
|
||||
cp = (unsigned char *)(tmpword + tmpl);
|
||||
} else *cp = '\0';
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being
|
||||
// tested
|
||||
|
||||
// if all conditions are met then recall suffix_check
|
||||
|
||||
if (test_condition((char *) cp, (char *) tmpword)) {
|
||||
if (ppfx) {
|
||||
// handle conditional suffix
|
||||
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
|
||||
he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
|
||||
else
|
||||
he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
|
||||
} else {
|
||||
he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
|
||||
}
|
||||
if (he) return he;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// see if two-level suffix is present in the word
|
||||
char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
|
||||
AffEntry* ppfx, const FLAG needflag)
|
||||
{
|
||||
int tmpl; // length of tmpword
|
||||
unsigned char * cp;
|
||||
char tmpword[MAXWORDUTF8LEN + 4];
|
||||
PfxEntry* ep = (PfxEntry *) ppfx;
|
||||
char * st;
|
||||
|
||||
char result[MAXLNLEN];
|
||||
|
||||
*result = '\0';
|
||||
|
||||
// if this suffix is being cross checked with a prefix
|
||||
// but it does not support cross products skip it
|
||||
|
||||
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
|
||||
return NULL;
|
||||
|
||||
// upon entry suffix is 0 length or already matches the end of the word.
|
||||
// So if the remaining root word has positive length
|
||||
// and if there are enough chars in root word and added back strip chars
|
||||
// to meet the number of characters conditions, then test it
|
||||
|
||||
tmpl = len - appndl;
|
||||
|
||||
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
|
||||
|
||||
// generate new root word by removing suffix and adding
|
||||
// back any characters that would have been stripped or
|
||||
// or null terminating the shorter string
|
||||
|
||||
strcpy (tmpword, word);
|
||||
cp = (unsigned char *)(tmpword + tmpl);
|
||||
if (stripl) {
|
||||
strcpy ((char *)cp, strip);
|
||||
tmpl += stripl;
|
||||
cp = (unsigned char *)(tmpword + tmpl);
|
||||
} else *cp = '\0';
|
||||
|
||||
// now make sure all of the conditions on characters
|
||||
// are met. Please see the appendix at the end of
|
||||
// this file for more info on exactly what is being
|
||||
// tested
|
||||
|
||||
// if all conditions are met then recall suffix_check
|
||||
|
||||
if (test_condition((char *) cp, (char *) tmpword)) {
|
||||
if (ppfx) {
|
||||
// handle conditional suffix
|
||||
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
|
||||
st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
|
||||
if (st) {
|
||||
if (((PfxEntry *) ppfx)->getMorph()) {
|
||||
strcat(result, ((PfxEntry *) ppfx)->getMorph());
|
||||
}
|
||||
strcat(result,st);
|
||||
free(st);
|
||||
mychomp(result);
|
||||
}
|
||||
} else {
|
||||
st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
|
||||
if (st) {
|
||||
strcat(result, st);
|
||||
free(st);
|
||||
mychomp(result);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
|
||||
if (st) {
|
||||
strcat(result, st);
|
||||
free(st);
|
||||
mychomp(result);
|
||||
}
|
||||
}
|
||||
if (*result) return mystrdup(result);
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// get next homonym with same affix
|
||||
struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
|
||||
const FLAG cclass, const FLAG needflag)
|
||||
{
|
||||
PfxEntry* ep = (PfxEntry *) ppfx;
|
||||
|
||||
while (he->next_homonym) {
|
||||
he = he->next_homonym;
|
||||
if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
|
||||
((optflags & aeXPRODUCT) == 0 ||
|
||||
TESTAFF(he->astr, ep->getFlag(), he->alen) ||
|
||||
// handle conditional suffix
|
||||
((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
|
||||
) &&
|
||||
// handle cont. class
|
||||
((!cclass) ||
|
||||
((contclass) && TESTAFF(contclass, cclass, contclasslen))
|
||||
) &&
|
||||
// handle required flag
|
||||
((!needflag) ||
|
||||
(TESTAFF(he->astr, needflag, he->alen) ||
|
||||
((contclass) && TESTAFF(contclass, needflag, contclasslen)))
|
||||
)
|
||||
) return he;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
|
||||
Appendix: Understanding Affix Code
|
||||
|
||||
|
||||
An affix is either a prefix or a suffix attached to root words to make
|
||||
other words.
|
||||
|
||||
Basically a Prefix or a Suffix is a set of AffEntry objects
|
||||
which store information about the prefix or suffix along
|
||||
with supporting routines to check if a word has a particular
|
||||
prefix or suffix or a combination.
|
||||
|
||||
The structure affentry is defined as follows:
|
||||
|
||||
struct affentry
|
||||
{
|
||||
unsigned short aflag; // ID used to represent the affix
|
||||
char * strip; // string to strip before adding affix
|
||||
char * appnd; // the affix string to add
|
||||
unsigned char stripl; // length of the strip string
|
||||
unsigned char appndl; // length of the affix string
|
||||
char numconds; // the number of conditions that must be met
|
||||
char opts; // flag: aeXPRODUCT- combine both prefix and suffix
|
||||
char conds[SETSIZE]; // array which encodes the conditions to be met
|
||||
};
|
||||
|
||||
|
||||
Here is a suffix borrowed from the en_US.aff file. This file
|
||||
is whitespace delimited.
|
||||
|
||||
SFX D Y 4
|
||||
SFX D 0 d e
|
||||
SFX D y ied [^aeiou]y
|
||||
SFX D 0 ed [^ey]
|
||||
SFX D 0 ed [aeiou]y
|
||||
|
||||
This information can be interpreted as follows:
|
||||
|
||||
The first line has 4 fields
|
||||
|
||||
Field
|
||||
-----
|
||||
1 SFX - indicates this is a suffix
|
||||
2 D - is the name of the character flag which represents this suffix
|
||||
3 Y - indicates it can be combined with prefixes (cross product)
|
||||
4 4 - indicates that sequence of 4 affentry structures are needed to
|
||||
properly store the affix information
|
||||
|
||||
The remaining lines describe the unique information for the 4 SfxEntry
|
||||
objects that make up this affix. Each line can be interpreted
|
||||
as follows: (note fields 1 and 2 are as a check against line 1 info)
|
||||
|
||||
Field
|
||||
-----
|
||||
1 SFX - indicates this is a suffix
|
||||
2 D - is the name of the character flag for this affix
|
||||
3 y - the string of chars to strip off before adding affix
|
||||
(a 0 here indicates the NULL string)
|
||||
4 ied - the string of affix characters to add
|
||||
5 [^aeiou]y - the conditions which must be met before the affix
|
||||
can be applied
|
||||
|
||||
Field 5 is interesting. Since this is a suffix, field 5 tells us that
|
||||
there are 2 conditions that must be met. The first condition is that
|
||||
the next to the last character in the word must *NOT* be any of the
|
||||
following "a", "e", "i", "o" or "u". The second condition is that
|
||||
the last character of the word must end in "y".
|
||||
|
||||
So how can we encode this information concisely and be able to
|
||||
test for both conditions in a fast manner? The answer is found
|
||||
by studying the wonderful ispell code of Geoff Kuenning, et al.
|
||||
(now available under a normal BSD license).
|
||||
|
||||
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
|
||||
using a character (cast to an unsigned char) of a string, we have 8 bits
|
||||
of information we can store about that character. Specifically we
|
||||
could use each bit to say if that character is allowed in any of the
|
||||
last (or first for prefixes) 8 characters of the word.
|
||||
|
||||
Basically, each character at one end of the word (up to the number
|
||||
of conditions) is used to index into the conds array and the resulting
|
||||
value found there says whether that character is valid for a
|
||||
specific character position in the word.
|
||||
|
||||
For prefixes, it does this by setting bit 0 if that char is valid
|
||||
in the first position, bit 1 if valid in the second position, and so on.
|
||||
|
||||
If a bit is not set, then that char is not valid for that position in the
|
||||
word.
|
||||
|
||||
If working with suffixes bit 0 is used for the character closest
|
||||
to the front, bit 1 for the next character towards the end, ...,
|
||||
with bit numconds-1 representing the last char at the end of the string.
|
||||
|
||||
Note: since entries in the conds[] are 8 bits, only 8 conditions
|
||||
(read that only 8 character positions) can be examined at one
|
||||
end of a word (the beginning for prefixes and the end for suffixes).
|
||||
|
||||
So to make this clearer, lets encode the conds array values for the
|
||||
first two affentries for the suffix D described earlier.
|
||||
|
||||
|
||||
For the first affentry:
|
||||
numconds = 1 (only examine the last character)
|
||||
|
||||
conds['e'] = (1 << 0) (the word must end in an E)
|
||||
all others are all 0
|
||||
|
||||
For the second affentry:
|
||||
numconds = 2 (only examine the last two characters)
|
||||
|
||||
conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
|
||||
where X is all characters *but* a, e, i, o, or u
|
||||
|
||||
|
||||
conds['y'] = (1 << 1) (the last char must be a y)
|
||||
all other bits for all other entries in the conds array are zero
|
||||
|
||||
|
||||
#endif
|
||||
|
130
goldlib/hunspell/affentry.hxx
Normal file
130
goldlib/hunspell/affentry.hxx
Normal file
@ -0,0 +1,130 @@
|
||||
#ifndef _AFFIX_HXX_
|
||||
#define _AFFIX_HXX_
|
||||
|
||||
#include "atypes.hxx"
|
||||
#include "baseaffi.hxx"
|
||||
#include "affixmgr.hxx"
|
||||
|
||||
/* A Prefix Entry */
|
||||
|
||||
class PfxEntry : public AffEntry
|
||||
{
|
||||
AffixMgr* pmyMgr;
|
||||
|
||||
PfxEntry * next;
|
||||
PfxEntry * nexteq;
|
||||
PfxEntry * nextne;
|
||||
PfxEntry * flgnxt;
|
||||
|
||||
public:
|
||||
|
||||
PfxEntry(AffixMgr* pmgr, affentry* dp );
|
||||
~PfxEntry();
|
||||
|
||||
inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); }
|
||||
struct hentry * check(const char * word, int len, char in_compound,
|
||||
const FLAG needflag = FLAG_NULL);
|
||||
|
||||
struct hentry * check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = NULL);
|
||||
|
||||
char * check_morph(const char * word, int len, char in_compound,
|
||||
const FLAG needflag = FLAG_NULL);
|
||||
|
||||
char * check_twosfx_morph(const char * word, int len,
|
||||
char in_compound, const FLAG needflag = FLAG_NULL);
|
||||
|
||||
inline FLAG getFlag() { return aflag; }
|
||||
inline const char * getKey() { return appnd; }
|
||||
char * add(const char * word, int len);
|
||||
|
||||
inline short getKeyLen() { return appndl; }
|
||||
|
||||
inline const char * getMorph() { return morphcode; }
|
||||
|
||||
inline const unsigned short * getCont() { return contclass; }
|
||||
inline short getContLen() { return contclasslen; }
|
||||
|
||||
inline PfxEntry * getNext() { return next; }
|
||||
inline PfxEntry * getNextNE() { return nextne; }
|
||||
inline PfxEntry * getNextEQ() { return nexteq; }
|
||||
inline PfxEntry * getFlgNxt() { return flgnxt; }
|
||||
|
||||
inline void setNext(PfxEntry * ptr) { next = ptr; }
|
||||
inline void setNextNE(PfxEntry * ptr) { nextne = ptr; }
|
||||
inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; }
|
||||
inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; }
|
||||
|
||||
inline int test_condition(const char * st);
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/* A Suffix Entry */
|
||||
|
||||
class SfxEntry : public AffEntry
|
||||
{
|
||||
AffixMgr* pmyMgr;
|
||||
char * rappnd;
|
||||
|
||||
SfxEntry * next;
|
||||
SfxEntry * nexteq;
|
||||
SfxEntry * nextne;
|
||||
SfxEntry * flgnxt;
|
||||
|
||||
SfxEntry * l_morph;
|
||||
SfxEntry * r_morph;
|
||||
SfxEntry * eq_morph;
|
||||
|
||||
public:
|
||||
|
||||
SfxEntry(AffixMgr* pmgr, affentry* dp );
|
||||
~SfxEntry();
|
||||
|
||||
inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); }
|
||||
struct hentry * check(const char * word, int len, int optflags,
|
||||
AffEntry* ppfx, char ** wlst, int maxSug, int * ns,
|
||||
const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL);
|
||||
|
||||
struct hentry * check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag = NULL);
|
||||
|
||||
char * check_twosfx_morph(const char * word, int len, int optflags,
|
||||
AffEntry* ppfx, const FLAG needflag = FLAG_NULL);
|
||||
struct hentry * get_next_homonym(struct hentry * he);
|
||||
struct hentry * get_next_homonym(struct hentry * word, int optflags, AffEntry* ppfx,
|
||||
const FLAG cclass, const FLAG needflag);
|
||||
|
||||
|
||||
inline FLAG getFlag() { return aflag; }
|
||||
inline const char * getKey() { return rappnd; }
|
||||
char * add(const char * word, int len);
|
||||
|
||||
|
||||
inline const char * getMorph() { return morphcode; }
|
||||
|
||||
inline const unsigned short * getCont() { return contclass; }
|
||||
inline short getContLen() { return contclasslen; }
|
||||
inline const char * getAffix() { return appnd; }
|
||||
|
||||
inline short getKeyLen() { return appndl; }
|
||||
|
||||
inline SfxEntry * getNext() { return next; }
|
||||
inline SfxEntry * getNextNE() { return nextne; }
|
||||
inline SfxEntry * getNextEQ() { return nexteq; }
|
||||
|
||||
inline SfxEntry * getLM() { return l_morph; }
|
||||
inline SfxEntry * getRM() { return r_morph; }
|
||||
inline SfxEntry * getEQM() { return eq_morph; }
|
||||
inline SfxEntry * getFlgNxt() { return flgnxt; }
|
||||
|
||||
inline void setNext(SfxEntry * ptr) { next = ptr; }
|
||||
inline void setNextNE(SfxEntry * ptr) { nextne = ptr; }
|
||||
inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; }
|
||||
inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; }
|
||||
|
||||
inline int test_condition(const char * st, const char * begin);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
4040
goldlib/hunspell/affixmgr.cxx
Normal file
4040
goldlib/hunspell/affixmgr.cxx
Normal file
File diff suppressed because it is too large
Load Diff
206
goldlib/hunspell/affixmgr.hxx
Normal file
206
goldlib/hunspell/affixmgr.hxx
Normal file
@ -0,0 +1,206 @@
|
||||
#ifndef _AFFIXMGR_HXX_
|
||||
#define _AFFIXMGR_HXX_
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cstdio>
|
||||
|
||||
#include "atypes.hxx"
|
||||
#include "baseaffi.hxx"
|
||||
#include "hashmgr.hxx"
|
||||
|
||||
// check flag duplication
|
||||
#define dupSFX (1 << 0)
|
||||
#define dupPFX (1 << 1)
|
||||
|
||||
class AffixMgr
|
||||
{
|
||||
|
||||
AffEntry * pStart[SETSIZE];
|
||||
AffEntry * sStart[SETSIZE];
|
||||
AffEntry * pFlag[CONTSIZE];
|
||||
AffEntry * sFlag[CONTSIZE];
|
||||
HashMgr * pHMgr;
|
||||
char * trystring;
|
||||
char * encoding;
|
||||
struct cs_info * csconv;
|
||||
int utf8;
|
||||
struct unicode_info2 * utf_tbl;
|
||||
int complexprefixes;
|
||||
FLAG compoundflag;
|
||||
FLAG compoundbegin;
|
||||
FLAG compoundmiddle;
|
||||
FLAG compoundend;
|
||||
FLAG compoundroot;
|
||||
FLAG compoundforbidflag;
|
||||
FLAG compoundpermitflag;
|
||||
int checkcompounddup;
|
||||
int checkcompoundrep;
|
||||
int checkcompoundcase;
|
||||
int checkcompoundtriple;
|
||||
FLAG forbiddenword;
|
||||
FLAG nosuggest;
|
||||
FLAG pseudoroot;
|
||||
int cpdmin;
|
||||
int numrep;
|
||||
replentry * reptable;
|
||||
int nummap;
|
||||
mapentry * maptable;
|
||||
int numbreak;
|
||||
char ** breaktable;
|
||||
int numcheckcpd;
|
||||
replentry * checkcpdtable;
|
||||
int numdefcpd;
|
||||
flagentry * defcpdtable;
|
||||
int maxngramsugs;
|
||||
int nosplitsugs;
|
||||
int sugswithdots;
|
||||
int cpdwordmax;
|
||||
int cpdmaxsyllable;
|
||||
char * cpdvowels;
|
||||
w_char * cpdvowels_utf16;
|
||||
int cpdvowels_utf16_len;
|
||||
char * cpdsyllablenum;
|
||||
const char * pfxappnd; // BUG: not stateless
|
||||
const char * sfxappnd; // BUG: not stateless
|
||||
FLAG sfxflag; // BUG: not stateless
|
||||
char * derived; // BUG: not stateless
|
||||
AffEntry * sfx; // BUG: not stateless
|
||||
AffEntry * pfx; // BUG: not stateless
|
||||
int checknum;
|
||||
char * wordchars;
|
||||
unsigned short * wordchars_utf16;
|
||||
int wordchars_utf16_len;
|
||||
char * version;
|
||||
char * lang;
|
||||
int langnum;
|
||||
FLAG lemma_present;
|
||||
FLAG circumfix;
|
||||
FLAG onlyincompound;
|
||||
FLAG keepcase;
|
||||
int checksharps;
|
||||
|
||||
int havecontclass; // boolean variable
|
||||
char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix)
|
||||
flag flag_mode;
|
||||
|
||||
public:
|
||||
|
||||
AffixMgr(const char * affpath, HashMgr * ptr);
|
||||
~AffixMgr();
|
||||
struct hentry * affix_check(const char * word, int len,
|
||||
const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT);
|
||||
struct hentry * prefix_check(const char * word, int len,
|
||||
char in_compound, const FLAG needflag = FLAG_NULL);
|
||||
inline int isSubset(const char * s1, const char * s2);
|
||||
struct hentry * prefix_check_twosfx(const char * word, int len,
|
||||
char in_compound, const FLAG needflag = FLAG_NULL);
|
||||
inline int isRevSubset(const char * s1, const char * end_of_s2, int len);
|
||||
struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx,
|
||||
char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL,
|
||||
const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT);
|
||||
struct hentry * suffix_check_twosfx(const char * word, int len,
|
||||
int sfxopts, AffEntry* ppfx, const FLAG needflag = FLAG_NULL);
|
||||
|
||||
char * affix_check_morph(const char * word, int len,
|
||||
const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT);
|
||||
char * prefix_check_morph(const char * word, int len,
|
||||
char in_compound, const FLAG needflag = FLAG_NULL);
|
||||
char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx,
|
||||
const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT);
|
||||
|
||||
char * prefix_check_twosfx_morph(const char * word, int len,
|
||||
char in_compound, const FLAG needflag = FLAG_NULL);
|
||||
char * suffix_check_twosfx_morph(const char * word, int len,
|
||||
int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL);
|
||||
|
||||
int expand_rootword(struct guessword * wlst, int maxn, const char * ts,
|
||||
int wl, const unsigned short * ap, unsigned short al, char * bad, int);
|
||||
|
||||
int get_syllable (const char * word, int wlen);
|
||||
int cpdrep_check(const char * word, int len);
|
||||
int cpdpat_check(const char * word, int len);
|
||||
int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all);
|
||||
int cpdcase_check(const char * word, int len);
|
||||
int candidate_check(const char * word, int len);
|
||||
struct hentry * compound_check(const char * word, int len,
|
||||
short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
|
||||
char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug);
|
||||
|
||||
int compound_check_morph(const char * word, int len,
|
||||
short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
|
||||
char hu_mov_rule, char ** result, char * partresult);
|
||||
|
||||
struct hentry * lookup(const char * word);
|
||||
int get_numrep();
|
||||
struct replentry * get_reptable();
|
||||
int get_nummap();
|
||||
struct mapentry * get_maptable();
|
||||
int get_numbreak();
|
||||
char ** get_breaktable();
|
||||
char * get_encoding();
|
||||
int get_langnum();
|
||||
struct unicode_info2 * get_utf_conv();
|
||||
char * get_try_string();
|
||||
const char * get_wordchars();
|
||||
unsigned short * get_wordchars_utf16(int * len);
|
||||
int get_compound();
|
||||
FLAG get_compoundflag();
|
||||
FLAG get_compoundbegin();
|
||||
FLAG get_forbiddenword();
|
||||
FLAG get_nosuggest();
|
||||
FLAG get_pseudoroot();
|
||||
FLAG get_onlyincompound();
|
||||
FLAG get_compoundroot();
|
||||
FLAG get_lemma_present();
|
||||
int get_checknum();
|
||||
char * get_possible_root();
|
||||
const char * get_prefix();
|
||||
const char * get_suffix();
|
||||
const char * get_derived();
|
||||
const char * get_version();
|
||||
const int have_contclass();
|
||||
int get_utf8();
|
||||
int get_complexprefixes();
|
||||
char * get_suffixed(char );
|
||||
int get_maxngramsugs();
|
||||
int get_nosplitsugs();
|
||||
int get_sugswithdots(void);
|
||||
FLAG get_keepcase(void);
|
||||
int get_checksharps(void);
|
||||
|
||||
private:
|
||||
int parse_file(const char * affpath);
|
||||
int parse_try(char * line);
|
||||
int parse_set(char * line);
|
||||
int parse_flag(char * line, unsigned short * out, char * name);
|
||||
int parse_num(char * line, int * out, char * name);
|
||||
int parse_cpdflag(char * line);
|
||||
int parse_cpdforbid(char * line);
|
||||
int parse_forbid(char * line);
|
||||
int parse_cpdsyllable(char * line);
|
||||
int parse_syllablenum(char * line);
|
||||
int parse_reptable(char * line, FILE * af);
|
||||
int parse_maptable(char * line, FILE * af);
|
||||
int parse_breaktable(char * line, FILE * af);
|
||||
int parse_checkcpdtable(char * line, FILE * af);
|
||||
int parse_defcpdtable(char * line, FILE * af);
|
||||
int parse_affix(char * line, const char at, FILE * af, char * dupflags);
|
||||
int parse_wordchars(char * line);
|
||||
int parse_lang(char * line);
|
||||
int parse_version(char * line);
|
||||
|
||||
int encodeit(struct affentry * ptr, char * cs);
|
||||
int build_pfxtree(AffEntry* pfxptr);
|
||||
int build_sfxtree(AffEntry* sfxptr);
|
||||
int process_pfx_order();
|
||||
int process_sfx_order();
|
||||
AffEntry * process_pfx_in_order(AffEntry * ptr, AffEntry * nptr);
|
||||
AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr);
|
||||
int process_pfx_tree_to_list();
|
||||
int process_sfx_tree_to_list();
|
||||
void set_spec_utf8_encoding();
|
||||
int redundant_condition(char, char * strip, int stripl, const char * cond, char *);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
85
goldlib/hunspell/atypes.hxx
Normal file
85
goldlib/hunspell/atypes.hxx
Normal file
@ -0,0 +1,85 @@
|
||||
#ifndef _ATYPES_HXX_
|
||||
#define _ATYPES_HXX_
|
||||
|
||||
// HUNSTEM def.
|
||||
#define HUNSTEM
|
||||
|
||||
#include "csutil.hxx"
|
||||
#include "hashmgr.hxx"
|
||||
|
||||
#define SETSIZE 256
|
||||
#define CONTSIZE 65536
|
||||
#define MAXWORDLEN 100
|
||||
#define MAXWORDUTF8LEN (MAXWORDLEN * 4)
|
||||
|
||||
// affentry options
|
||||
#define aeXPRODUCT (1 << 0)
|
||||
#define aeUTF8 (1 << 1)
|
||||
#define aeALIASF (1 << 2)
|
||||
#define aeALIASM (1 << 3)
|
||||
|
||||
enum {IN_CPD_NOT, IN_CPD_BEGIN, IN_CPD_END, IN_CPD_OTHER};
|
||||
|
||||
// Parenthesized so the macro expands as one value inside larger expressions
// (without the parentheses `x / MAXLNLEN` would evaluate as `(x / 8192) * 4`).
#define MAXLNLEN (8192 * 4)
|
||||
|
||||
#define MAXCOMPOUND 10
|
||||
|
||||
#define MAXACC 1000
|
||||
|
||||
#define FLAG unsigned short
|
||||
#define FLAG_NULL 0x00
|
||||
#define FREE_FLAG(a) a = 0
|
||||
|
||||
#define TESTAFF( a, b , c ) flag_bsearch((unsigned short *) a, (unsigned short) b, c)
|
||||
|
||||
struct affentry
|
||||
{
|
||||
char * strip;
|
||||
char * appnd;
|
||||
unsigned char stripl;
|
||||
unsigned char appndl;
|
||||
char numconds;
|
||||
char opts;
|
||||
unsigned short aflag;
|
||||
union {
|
||||
char base[SETSIZE];
|
||||
struct {
|
||||
char ascii[SETSIZE/2];
|
||||
char neg[8];
|
||||
char all[8];
|
||||
w_char * wchars[8];
|
||||
int wlen[8];
|
||||
} utf8;
|
||||
} conds;
|
||||
char * morphcode;
|
||||
unsigned short * contclass;
|
||||
short contclasslen;
|
||||
};
|
||||
|
||||
struct replentry {
|
||||
char * pattern;
|
||||
char * pattern2;
|
||||
};
|
||||
|
||||
struct mapentry {
|
||||
char * set;
|
||||
w_char * set_utf16;
|
||||
int len;
|
||||
};
|
||||
|
||||
struct flagentry {
|
||||
FLAG * def;
|
||||
int len;
|
||||
};
|
||||
|
||||
struct guessword {
|
||||
char * word;
|
||||
bool allow;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
31
goldlib/hunspell/baseaffi.hxx
Normal file
31
goldlib/hunspell/baseaffi.hxx
Normal file
@ -0,0 +1,31 @@
|
||||
#ifndef _BASEAFF_HXX_
|
||||
#define _BASEAFF_HXX_
|
||||
|
||||
class AffEntry
|
||||
{
|
||||
public:
|
||||
|
||||
protected:
|
||||
char * appnd;
|
||||
char * strip;
|
||||
unsigned char appndl;
|
||||
unsigned char stripl;
|
||||
char numconds;
|
||||
char opts;
|
||||
unsigned short aflag;
|
||||
union {
|
||||
char base[SETSIZE];
|
||||
struct {
|
||||
char ascii[SETSIZE/2];
|
||||
char neg[8];
|
||||
char all[8];
|
||||
w_char * wchars[8];
|
||||
int wlen[8];
|
||||
} utf8;
|
||||
} conds;
|
||||
char * morphcode;
|
||||
unsigned short * contclass;
|
||||
short contclasslen;
|
||||
};
|
||||
|
||||
#endif
|
4970
goldlib/hunspell/csutil.cxx
Normal file
4970
goldlib/hunspell/csutil.cxx
Normal file
File diff suppressed because it is too large
Load Diff
125
goldlib/hunspell/csutil.hxx
Normal file
125
goldlib/hunspell/csutil.hxx
Normal file
@ -0,0 +1,125 @@
|
||||
#ifndef __CSUTILHXX__
|
||||
#define __CSUTILHXX__
|
||||
|
||||
// First some base level utility routines
|
||||
|
||||
typedef struct {
|
||||
unsigned char l;
|
||||
unsigned char h;
|
||||
} w_char;
|
||||
|
||||
// convert UTF-16 characters to UTF-8
|
||||
char * u16_u8(char * dest, int size, const w_char * src, int srclen);
|
||||
|
||||
// convert UTF-8 characters to UTF-16
|
||||
int u8_u16(w_char * dest, int size, const char * src);
|
||||
|
||||
// sort 2-byte vector
|
||||
void flag_qsort(unsigned short flags[], int begin, int end);
|
||||
|
||||
// binary search in 2-byte vector
|
||||
int flag_bsearch(unsigned short flags[], unsigned short flag, int right);
|
||||
|
||||
// remove end of line char(s)
|
||||
void mychomp(char * s);
|
||||
|
||||
// duplicate string
|
||||
char * mystrdup(const char * s);
|
||||
|
||||
// duplicate reverse of string
|
||||
char * myrevstrdup(const char * s);
|
||||
|
||||
// parse into tokens with char delimiter
|
||||
char * mystrsep(char ** sptr, const char delim);
|
||||
// parse into tokens with char delimiter
|
||||
char * mystrsep2(char ** sptr, const char delim);
|
||||
|
||||
// replace pat by rep in word and return word
|
||||
char * mystrrep(char *, const char *, const char *);
|
||||
|
||||
// append s to the end of every line in text
|
||||
void strlinecat(char * lines, const char * s);
|
||||
|
||||
// tokenize into lines with new line
|
||||
int line_tok(const char * text, char *** lines);
|
||||
|
||||
// tokenize into lines with new line and uniq in place
|
||||
char * line_uniq(char * text);
|
||||
|
||||
// change \n to c in place
|
||||
char * line_join(char * text, char c);
|
||||
|
||||
// leave only last {[^}]*} pattern in string
|
||||
char * delete_zeros(char * morphout);
|
||||
|
||||
// reverse word
|
||||
void reverseword(char *);
|
||||
|
||||
// reverse word (UTF-8 encoded)
|
||||
void reverseword_utf(char *);
|
||||
|
||||
// character encoding information
|
||||
struct cs_info {
|
||||
unsigned char ccase;
|
||||
unsigned char clower;
|
||||
unsigned char cupper;
|
||||
};
|
||||
|
||||
// Unicode character encoding information
|
||||
struct unicode_info {
|
||||
unsigned short c;
|
||||
unsigned short cupper;
|
||||
unsigned short clower;
|
||||
};
|
||||
|
||||
struct unicode_info2 {
|
||||
char cletter;
|
||||
unsigned short cupper;
|
||||
unsigned short clower;
|
||||
};
|
||||
|
||||
struct enc_entry {
|
||||
const char * enc_name;
|
||||
struct cs_info * cs_table;
|
||||
};
|
||||
|
||||
// language to encoding default map
|
||||
|
||||
struct lang_map {
|
||||
const char * lang;
|
||||
const char * def_enc;
|
||||
int num;
|
||||
};
|
||||
|
||||
struct cs_info * get_current_cs(const char * es);
|
||||
|
||||
struct unicode_info * get_utf_cs();
|
||||
|
||||
int get_utf_cs_len();
|
||||
|
||||
const char * get_default_enc(const char * lang);
|
||||
|
||||
int get_lang_num(const char * lang);
|
||||
|
||||
// convert null terminated string to all caps using encoding
|
||||
void enmkallcap(char * d, const char * p, const char * encoding);
|
||||
|
||||
// convert null terminated string to all little using encoding
|
||||
void enmkallsmall(char * d, const char * p, const char * encoding);
|
||||
|
||||
// convert null terminated string to have initial capital using encoding
|
||||
void enmkinitcap(char * d, const char * p, const char * encoding);
|
||||
|
||||
// convert null terminated string to all caps
|
||||
void mkallcap(char * p, const struct cs_info * csconv);
|
||||
|
||||
// convert null terminated string to all little
|
||||
void mkallsmall(char * p, const struct cs_info * csconv);
|
||||
|
||||
// convert null terminated string to have initial capital
|
||||
void mkinitcap(char * p, const struct cs_info * csconv);
|
||||
|
||||
// convert first nc characters of UTF-8 string to little
|
||||
void mkallsmall_utf(w_char * u, int nc, struct unicode_info2 * utfconv);
|
||||
|
||||
#endif
|
175
goldlib/hunspell/dictmgr.cxx
Normal file
175
goldlib/hunspell/dictmgr.cxx
Normal file
@ -0,0 +1,175 @@
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cctype>
|
||||
#include <cstdio>
|
||||
|
||||
#include "dictmgr.hxx"
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
DictMgr::DictMgr(const char * dictpath, const char * etype)
{
    // Start with an empty list; parse_file() counts up numdict for every
    // entry of type `etype` that it accepts.
    numdict = 0;
    pdentry = (dictentry *) malloc(MAXDICTIONARIES * sizeof(struct dictentry));

    // without a table there is nothing to fill in
    if (!pdentry) return;

    // A missing dictionary list is not an error: the manager simply
    // reports zero entries.
    if (parse_file(dictpath, etype) != 0) numdict = 0;
}
|
||||
|
||||
|
||||
DictMgr::~DictMgr()
{
    // Release the three strings owned by each stored entry, then the
    // entry table itself.
    if (pdentry) {
        for (int idx = 0; idx < numdict; idx++) {
            dictentry * entry = pdentry + idx;
            free(entry->lang);          // free(NULL) is a no-op
            entry->lang = NULL;
            free(entry->region);
            entry->region = NULL;
            free(entry->filename);
            entry->filename = NULL;
        }
        free(pdentry);
        pdentry = NULL;
    }
    numdict = 0;
}
|
||||
|
||||
|
||||
// read in list of etype entries and build up structure to describe them
|
||||
int DictMgr::parse_file(const char * dictpath, const char * etype)
|
||||
{
|
||||
|
||||
int i;
|
||||
char line[MAXDICTENTRYLEN+1];
|
||||
dictentry * pdict = pdentry;
|
||||
|
||||
// open the dictionary list file
|
||||
FILE * dictlst;
|
||||
dictlst = fopen(dictpath,"r");
|
||||
if (!dictlst) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// step one is to parse the dictionary list building up the
|
||||
// descriptive structures
|
||||
|
||||
// read in each line ignoring any that dont start with etype
|
||||
while (fgets(line,MAXDICTENTRYLEN,dictlst)) {
|
||||
mychomp(line);
|
||||
|
||||
/* parse in a dictionary entry */
|
||||
if (strncmp(line,etype,4) == 0) {
|
||||
if (numdict < MAXDICTIONARIES) {
|
||||
char * tp = line;
|
||||
char * piece;
|
||||
i = 0;
|
||||
while ((piece=mystrsep(&tp,' '))) {
|
||||
if (*piece != '\0') {
|
||||
switch(i) {
|
||||
case 0: break;
|
||||
case 1: pdict->lang = mystrdup(piece); break;
|
||||
case 2: if (strcmp (piece, "ANY") == 0)
|
||||
pdict->region = mystrdup("");
|
||||
else
|
||||
pdict->region = mystrdup(piece);
|
||||
break;
|
||||
case 3: pdict->filename = mystrdup(piece); break;
|
||||
default: break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
free(piece);
|
||||
}
|
||||
if (i == 4) {
|
||||
numdict++;
|
||||
pdict++;
|
||||
} else {
|
||||
fprintf(stderr,"dictionary list corruption in line \"%s\"\n",line);
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(dictlst);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// return the list of dictionary entries and the number of entries in it
|
||||
int DictMgr::get_list(dictentry ** ppentry)
{
    // Hands out the manager's own entry table (not a copy) through
    // *ppentry and returns how many entries it holds.
    const int count = numdict;
    *ppentry = pdentry;
    return count;
}
|
||||
|
||||
|
||||
|
||||
// strip strings into token based on single char delimiter
|
||||
// acts like strsep() but only uses a delim char and not
|
||||
// a delim string
|
||||
|
||||
// Returns a newly malloc'ed copy of the next token of *stringp, i.e. the
// text up to (not including) the first `delim`, and advances *stringp past
// that delimiter (or to the terminating NUL when no delimiter is left).
//
// Returns NULL when *stringp is empty or when memory cannot be allocated;
// in both cases *stringp is left unchanged. The caller frees the token.
char * DictMgr::mystrsep(char ** stringp, const char delim)
{
    char * mp = *stringp;
    size_t n = strlen(mp);
    if (n == 0) return NULL;

    char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n);
    // token length: up to the delimiter, or the whole remaining string
    size_t nc = dp ? (size_t)(dp - mp) : n;

    char * rv = (char *) malloc(nc+1);
    if (!rv) return NULL;       // never copy into a failed allocation

    memcpy(rv,mp,nc);
    *(rv+nc) = '\0';
    *stringp = dp ? dp+1 : mp + n;
    return rv;
}
|
||||
|
||||
|
||||
// replaces strdup with ansi version
|
||||
char * DictMgr::mystrdup(const char * s)
{
    // A NULL input yields NULL; otherwise return a malloc'ed copy of s
    // (NULL when the allocation fails). The caller frees the copy.
    if (!s) return NULL;

    int sl = strlen(s);
    size_t nbytes = (sl+1) * sizeof(char);      // include the terminator
    char * copy = (char *) malloc(nbytes);
    if (copy) memcpy(copy,s,nbytes);
    return copy;
}
|
||||
|
||||
|
||||
// remove cross-platform text line end characters
|
||||
void DictMgr:: mychomp(char * s)
{
    // Cut a trailing "\n", "\r" or "\r\n" off s in place.
    const int len = strlen(s);

    if (len > 0) {
        char * last = s + len - 1;
        if ((*last == '\r') || (*last == '\n')) *last = '\0';
    }
    // The second-to-last character is tested independently of the last
    // one, so the "\r" of a "\r\n" pair is removed as well.
    if (len > 1) {
        char * prev = s + len - 2;
        if (*prev == '\r') *prev = '\0';
    }
}
|
||||
|
34
goldlib/hunspell/dictmgr.hxx
Normal file
34
goldlib/hunspell/dictmgr.hxx
Normal file
@ -0,0 +1,34 @@
|
||||
#ifndef _DICTMGR_HXX_
|
||||
#define _DICTMGR_HXX_
|
||||
|
||||
#define MAXDICTIONARIES 100
|
||||
#define MAXDICTENTRYLEN 1024
|
||||
|
||||
struct dictentry {
|
||||
char * filename;
|
||||
char * lang;
|
||||
char * region;
|
||||
};
|
||||
|
||||
|
||||
class DictMgr
|
||||
{
|
||||
|
||||
int numdict;
|
||||
dictentry * pdentry;
|
||||
|
||||
public:
|
||||
|
||||
DictMgr(const char * dictpath, const char * etype);
|
||||
~DictMgr();
|
||||
int get_list(dictentry** ppentry);
|
||||
|
||||
private:
|
||||
int parse_file(const char * dictpath, const char * etype);
|
||||
char * mystrsep(char ** stringp, const char delim);
|
||||
char * mystrdup(const char * s);
|
||||
void mychomp(char * s);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
679
goldlib/hunspell/hashmgr.cxx
Normal file
679
goldlib/hunspell/hashmgr.cxx
Normal file
@ -0,0 +1,679 @@
|
||||
#include "license.hun"
|
||||
#include "license.mys"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cctype>
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
#include <cstdio>
|
||||
|
||||
#include "hashmgr.hxx"
|
||||
#include "csutil.hxx"
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
#include <unistd.h>
|
||||
using namespace std;
|
||||
#endif
|
||||
|
||||
// build a hash table from a munched word list
|
||||
|
||||
// Build the hash table from the munched word list `tpath`, after reading
// the settings it depends on from the affix file `apath`.
//
// When loading fails the error code is printed on stderr and the object is
// left with no table (tableptr == NULL, tablesize == 0).
HashMgr::HashMgr(const char * tpath, const char * apath)
{
    tablesize = 0;
    tableptr = NULL;
    flag_mode = FLAG_CHAR;
    complexprefixes = 0;
    utf8 = 0;
    numaliasf = 0;
    aliasf = NULL;
    aliasflen = NULL;   // the destructor reads it, so give it a defined value
    numaliasm = 0;
    aliasm = NULL;
    load_config(apath);
    int ec = load_tables(tpath);
    if (ec) {
        /* error condition - what should we do here */
        fprintf(stderr,"Hash Manager Error : %d\n",ec);
        fflush(stderr);
        if (tableptr) {
            free(tableptr);
            // Reset the pointer: the destructor and lookup() both test
            // tableptr, and a dangling value would be freed a second time.
            tableptr = NULL;
        }
        tablesize = 0;
    }
}
|
||||
|
||||
|
||||
HashMgr::~HashMgr()
{
    if (tableptr) {
        // Walk the table column by column and free everything it owns.
        // Flag vectors and descriptions are only owned by the entries when
        // no alias table (aliasf / aliasm) is in use.
        for (int col = 0; col < tablesize; col++) {
            // The column head is part of the table array itself, so only
            // its payload is released here.
            struct hentry * head = &tableptr[col];
            if (!aliasf) free(head->astr);
            free(head->word);
            if (!aliasm) free(head->description);

            // Entries chained behind the head were allocated one by one
            // and are freed together with their payload.
            struct hentry * cur = head->next;
            while (cur) {
                struct hentry * following = cur->next;
                if (!aliasf) free(cur->astr);
                free(cur->word);
                if (!aliasm) free(cur->description);
                free(cur);
                cur = following;
            }
        }
        free(tableptr);
    }
    tablesize = 0;

    // flag vector aliases and their length table
    if (aliasf) {
        for (int j = 0; j < numaliasf; j++) free(aliasf[j]);
        free(aliasf);
        aliasf = NULL;
        if (aliasflen) {
            free(aliasflen);
            aliasflen = NULL;
        }
    }

    // morphological description aliases
    if (aliasm) {
        for (int k = 0; k < numaliasm; k++) free(aliasm[k]);
        free(aliasm);
        aliasm = NULL;
    }
}
|
||||
|
||||
// lookup a root word in the hashtable
|
||||
|
||||
struct hentry * HashMgr::lookup(const char *word) const
|
||||
{
|
||||
struct hentry * dp;
|
||||
if (tableptr) {
|
||||
dp = &tableptr[hash(word)];
|
||||
if (dp->word == NULL) return NULL;
|
||||
for ( ; dp != NULL; dp = dp->next) {
|
||||
if (strcmp(word,dp->word) == 0) return dp;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// add a word to the hash table (private)
|
||||
|
||||
int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc)
|
||||
{
|
||||
char * st = mystrdup(word);
|
||||
if (wl && !st) return 1;
|
||||
if (complexprefixes) {
|
||||
if (utf8) reverseword_utf(st); else reverseword(st);
|
||||
}
|
||||
int i = hash(st);
|
||||
struct hentry * dp = &tableptr[i];
|
||||
if (dp->word == NULL) {
|
||||
dp->wlen = wl;
|
||||
dp->alen = al;
|
||||
dp->word = st;
|
||||
dp->astr = aff;
|
||||
dp->next = NULL;
|
||||
dp->next_homonym = NULL;
|
||||
if (aliasm) {
|
||||
dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
|
||||
} else {
|
||||
dp->description = mystrdup(desc);
|
||||
if (desc && !dp->description) return 1;
|
||||
if (dp->description && complexprefixes) {
|
||||
if (utf8) reverseword_utf(dp->description); else reverseword(dp->description);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry));
|
||||
if (!hp) return 1;
|
||||
hp->wlen = wl;
|
||||
hp->alen = al;
|
||||
hp->word = st;
|
||||
hp->astr = aff;
|
||||
hp->next = NULL;
|
||||
hp->next_homonym = NULL;
|
||||
if (aliasm) {
|
||||
hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
|
||||
} else {
|
||||
hp->description = mystrdup(desc);
|
||||
if (desc && !hp->description) return 1;
|
||||
if (dp->description && complexprefixes) {
|
||||
if (utf8) reverseword_utf(hp->description); else reverseword(hp->description);
|
||||
}
|
||||
}
|
||||
while (dp->next != NULL) {
|
||||
if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
|
||||
dp=dp->next;
|
||||
}
|
||||
if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
|
||||
dp->next = hp;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// add a custom dic. word to the hash table (public)
|
||||
|
||||
int HashMgr::put_word(const char * word, int wl, char * aff)
|
||||
{
|
||||
unsigned short * flags;
|
||||
int al = 0;
|
||||
if (aff) {
|
||||
al = decode_flags(&flags, aff);
|
||||
flag_qsort(flags, 0, al);
|
||||
} else {
|
||||
flags = NULL;
|
||||
}
|
||||
add_word(word, wl, flags, al, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern)
|
||||
{
|
||||
unsigned short * flags;
|
||||
struct hentry * dp = lookup(pattern);
|
||||
if (!dp || !dp->astr) return 1;
|
||||
flags = (unsigned short *) malloc (dp->alen * sizeof(short));
|
||||
memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
|
||||
add_word(word, wl, flags, dp->alen, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// walk the hash table entry by entry - null at end
|
||||
struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
|
||||
{
|
||||
//reset to start
|
||||
if ((col < 0) || (hp == NULL)) {
|
||||
col = -1;
|
||||
hp = NULL;
|
||||
}
|
||||
|
||||
if (hp && hp->next != NULL) {
|
||||
hp = hp->next;
|
||||
} else {
|
||||
col++;
|
||||
hp = (col < tablesize) ? &tableptr[col] : NULL;
|
||||
// search for next non-blank column entry
|
||||
while (hp && (hp->word == NULL)) {
|
||||
col ++;
|
||||
hp = (col < tablesize) ? &tableptr[col] : NULL;
|
||||
}
|
||||
if (col < tablesize) return hp;
|
||||
hp = NULL;
|
||||
col = -1;
|
||||
}
|
||||
return hp;
|
||||
}
|
||||
|
||||
// load a munched word list and build a hash table on the fly
|
||||
int HashMgr::load_tables(const char * tpath)
|
||||
{
|
||||
int wl, al;
|
||||
char * ap;
|
||||
char * dp;
|
||||
unsigned short * flags;
|
||||
|
||||
// raw dictionary - munched file
|
||||
FILE * rawdict = fopen(tpath, "r");
|
||||
if (rawdict == NULL) return 1;
|
||||
|
||||
// first read the first line of file to get hash table size */
|
||||
char ts[MAXDELEN];
|
||||
if (! fgets(ts, MAXDELEN-1,rawdict)) return 2;
|
||||
mychomp(ts);
|
||||
if ((*ts < '1') || (*ts > '9')) fprintf(stderr, "error - missing word count in dictionary file\n");
|
||||
tablesize = atoi(ts);
|
||||
if (!tablesize) return 4;
|
||||
tablesize = tablesize + 5 + USERWORD;
|
||||
if ((tablesize %2) == 0) tablesize++;
|
||||
|
||||
// allocate the hash table
|
||||
tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
|
||||
if (! tableptr) return 3;
|
||||
for (int i=0; i<tablesize; i++) tableptr[i].word = NULL;
|
||||
|
||||
// loop through all words on much list and add to hash
|
||||
// table and create word and affix strings
|
||||
|
||||
while (fgets(ts,MAXDELEN-1,rawdict)) {
|
||||
mychomp(ts);
|
||||
// split each line into word and morphological description
|
||||
dp = strchr(ts,'\t');
|
||||
|
||||
if (dp) {
|
||||
*dp = '\0';
|
||||
dp++;
|
||||
} else {
|
||||
dp = NULL;
|
||||
}
|
||||
|
||||
// split each line into word and affix char strings
|
||||
// "\/" signs slash in words (not affix separator)
|
||||
// "/" at beginning of the line is word character (not affix separator)
|
||||
ap = ts;
|
||||
while (ap = strchr(ap,'/')) {
|
||||
if (ap == ts) {
|
||||
ap++;
|
||||
continue;
|
||||
} else if (*(ap - 1) != '\\') break;
|
||||
// replace "\/" with "/"
|
||||
for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
|
||||
|
||||
}
|
||||
|
||||
if (ap) {
|
||||
*ap = '\0';
|
||||
if (aliasf) {
|
||||
int index = atoi(ap + 1);
|
||||
al = get_aliasf(index, &flags);
|
||||
if (!al) {
|
||||
fprintf(stderr, "error - bad flag vector alias: %s\n", ts);
|
||||
*ap = '\0';
|
||||
}
|
||||
} else {
|
||||
al = decode_flags(&flags, ap + 1);
|
||||
flag_qsort(flags, 0, al);
|
||||
}
|
||||
} else {
|
||||
al = 0;
|
||||
ap = NULL;
|
||||
flags = NULL;
|
||||
}
|
||||
|
||||
wl = strlen(ts);
|
||||
|
||||
// add the word and its index
|
||||
if (add_word(ts,wl,flags,al,dp)) return 5;
|
||||
|
||||
}
|
||||
|
||||
fclose(rawdict);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// the hash function is a simple load and rotate
|
||||
// algorithm borrowed
|
||||
|
||||
int HashMgr::hash(const char * word) const
|
||||
{
|
||||
long hv = 0;
|
||||
for (int i=0; i < 4 && *word != 0; i++)
|
||||
hv = (hv << 8) | (*word++);
|
||||
while (*word != 0) {
|
||||
ROTATE(hv,ROTATE_LEN);
|
||||
hv ^= (*word++);
|
||||
}
|
||||
return (unsigned long) hv % tablesize;
|
||||
}
|
||||
|
||||
int HashMgr::decode_flags(unsigned short ** result, char * flags) {
|
||||
int len;
|
||||
switch (flag_mode) {
|
||||
case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
|
||||
len = strlen(flags);
|
||||
if (len%2 == 1) fprintf(stderr,"error: length of FLAG_LONG flagvector is odd: %s\n", flags);
|
||||
len = len/2;
|
||||
*result = (unsigned short *) malloc(len * sizeof(short));
|
||||
for (int i = 0; i < len; i++) {
|
||||
(*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
|
||||
}
|
||||
break;
|
||||
}
|
||||
case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
|
||||
len = 1;
|
||||
char * src = flags;
|
||||
unsigned short * dest;
|
||||
char * p;
|
||||
for (p = flags; *p; p++) {
|
||||
if (*p == ',') len++;
|
||||
}
|
||||
*result = (unsigned short *) malloc(len * sizeof(short));
|
||||
dest = *result;
|
||||
for (p = flags; *p; p++) {
|
||||
if (*p == ',') {
|
||||
*dest = (unsigned short) atoi(src);
|
||||
if (*dest == 0) fprintf(stderr, "error: 0 is wrong flag id\n");
|
||||
src = p + 1;
|
||||
dest++;
|
||||
}
|
||||
}
|
||||
*dest = (unsigned short) atoi(src);
|
||||
if (*dest == 0) fprintf(stderr, "error: 0 is wrong flag id\n");
|
||||
break;
|
||||
}
|
||||
case FLAG_UNI: { // UTF-8 characters
|
||||
w_char w[MAXDELEN/2];
|
||||
len = u8_u16(w, MAXDELEN/2, flags);
|
||||
*result = (unsigned short *) malloc(len * sizeof(short));
|
||||
memcpy(*result, w, len * sizeof(short));
|
||||
break;
|
||||
}
|
||||
default: { // Ispell's one-character flags (erfg -> e r f g)
|
||||
unsigned short * dest;
|
||||
len = strlen(flags);
|
||||
*result = (unsigned short *) malloc(len * sizeof(short));
|
||||
dest = *result;
|
||||
for (unsigned char * p = (unsigned char *) flags; *p; p++) {
|
||||
*dest = (unsigned short) *p;
|
||||
dest++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
unsigned short HashMgr::decode_flag(const char * f) {
|
||||
unsigned short s = 0;
|
||||
switch (flag_mode) {
|
||||
case FLAG_LONG:
|
||||
s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
|
||||
break;
|
||||
case FLAG_NUM:
|
||||
s = (unsigned short) atoi(f);
|
||||
break;
|
||||
case FLAG_UNI:
|
||||
u8_u16((w_char *) &s, 1, f);
|
||||
break;
|
||||
default:
|
||||
s = (unsigned short) *((unsigned char *)f);
|
||||
}
|
||||
if (!s) fprintf(stderr, "error: 0 is wrong flag id\n");
|
||||
return s;
|
||||
}
|
||||
|
||||
char * HashMgr::encode_flag(unsigned short f) {
|
||||
unsigned char ch[10];
|
||||
if (f==0) return mystrdup("(NULL)");
|
||||
if (flag_mode == FLAG_LONG) {
|
||||
ch[0] = (unsigned char) (f >> 8);
|
||||
ch[1] = (unsigned char) (f - ((f >> 8) << 8));
|
||||
ch[2] = '\0';
|
||||
} else if (flag_mode == FLAG_NUM) {
|
||||
sprintf((char *) ch, "%d", f);
|
||||
} else if (flag_mode == FLAG_UNI) {
|
||||
u16_u8((char *) &ch, 10, (w_char *) &f, 1);
|
||||
} else {
|
||||
ch[0] = (unsigned char) (f);
|
||||
ch[1] = '\0';
|
||||
}
|
||||
return mystrdup((char *) ch);
|
||||
}
|
||||
|
||||
// read in aff file and set flag mode
|
||||
int HashMgr::load_config(const char * affpath)
|
||||
{
|
||||
|
||||
// io buffers
|
||||
char line[MAXDELEN+1];
|
||||
|
||||
// open the affix file
|
||||
FILE * afflst;
|
||||
afflst = fopen(affpath,"r");
|
||||
if (!afflst) {
|
||||
fprintf(stderr,"Error - could not open affix description file %s\n",affpath);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// read in each line ignoring any that do not
|
||||
// start with a known line type indicator
|
||||
|
||||
while (fgets(line,MAXDELEN,afflst)) {
|
||||
mychomp(line);
|
||||
|
||||
/* parse in the try string */
|
||||
if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
|
||||
if (flag_mode != FLAG_CHAR) {
|
||||
fprintf(stderr,"error: duplicate FLAG parameter\n");
|
||||
}
|
||||
if (strstr(line, "long")) flag_mode = FLAG_LONG;
|
||||
if (strstr(line, "num")) flag_mode = FLAG_NUM;
|
||||
if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
|
||||
if (flag_mode == FLAG_CHAR) {
|
||||
fprintf(stderr,"error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line);
|
||||
}
|
||||
}
|
||||
if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1;
|
||||
|
||||
if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
|
||||
if (parse_aliasf(line, afflst)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
|
||||
if (parse_aliasm(line, afflst)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
|
||||
if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
|
||||
}
|
||||
fclose(afflst);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* parse in the ALIAS table */
|
||||
int HashMgr::parse_aliasf(char * line, FILE * af)
|
||||
{
|
||||
if (numaliasf != 0) {
|
||||
fprintf(stderr,"error: duplicate AF (alias for flag vector) tables used\n");
|
||||
return 1;
|
||||
}
|
||||
char * tp = line;
|
||||
char * piece;
|
||||
int i = 0;
|
||||
int np = 0;
|
||||
while ((piece=mystrsep(&tp, 0))) {
|
||||
if (*piece != '\0') {
|
||||
switch(i) {
|
||||
case 0: { np++; break; }
|
||||
case 1: {
|
||||
numaliasf = atoi(piece);
|
||||
if (numaliasf < 1) {
|
||||
numaliasf = 0;
|
||||
aliasf = NULL;
|
||||
aliasflen = NULL;
|
||||
fprintf(stderr,"incorrect number of entries in AF table\n");
|
||||
free(piece);
|
||||
return 1;
|
||||
}
|
||||
aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
|
||||
aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
|
||||
if (!aliasf || !aliasflen) {
|
||||
numaliasf = 0;
|
||||
if (aliasf) free(aliasf);
|
||||
if (aliasflen) free(aliasflen);
|
||||
aliasf = NULL;
|
||||
aliasflen = NULL;
|
||||
return 1;
|
||||
}
|
||||
np++;
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
free(piece);
|
||||
}
|
||||
if (np != 2) {
|
||||
numaliasf = 0;
|
||||
free(aliasf);
|
||||
free(aliasflen);
|
||||
aliasf = NULL;
|
||||
aliasflen = NULL;
|
||||
fprintf(stderr,"error: missing AF table information\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* now parse the numaliasf lines to read in the remainder of the table */
|
||||
char * nl = line;
|
||||
for (int j=0; j < numaliasf; j++) {
|
||||
if (!fgets(nl,MAXDELEN,af)) return 1;
|
||||
mychomp(nl);
|
||||
tp = nl;
|
||||
i = 0;
|
||||
aliasf[j] = NULL;
|
||||
aliasflen[j] = 0;
|
||||
while ((piece=mystrsep(&tp, 0))) {
|
||||
if (*piece != '\0') {
|
||||
switch(i) {
|
||||
case 0: {
|
||||
if (strncmp(piece,"AF",2) != 0) {
|
||||
numaliasf = 0;
|
||||
free(aliasf);
|
||||
free(aliasflen);
|
||||
aliasf = NULL;
|
||||
aliasflen = NULL;
|
||||
fprintf(stderr,"error: AF table is corrupt\n");
|
||||
free(piece);
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
aliasflen[j] = decode_flags(&(aliasf[j]), piece);
|
||||
flag_qsort(aliasf[j], 0, aliasflen[j]);
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
free(piece);
|
||||
}
|
||||
if (!aliasf[j]) {
|
||||
free(aliasf);
|
||||
free(aliasflen);
|
||||
aliasf = NULL;
|
||||
aliasflen = NULL;
|
||||
numaliasf = 0;
|
||||
fprintf(stderr,"error: AF table is corrupt\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* parse morph alias definitions */
|
||||
int HashMgr::parse_aliasm(char * line, FILE * af)
|
||||
{
|
||||
if (numaliasm != 0) {
|
||||
fprintf(stderr,"error: duplicate AM (aliases for morphological descriptions) tables used\n");
|
||||
return 1;
|
||||
}
|
||||
char * tp = line;
|
||||
char * piece;
|
||||
int i = 0;
|
||||
int np = 0;
|
||||
while ((piece=mystrsep(&tp, 0))) {
|
||||
if (*piece != '\0') {
|
||||
switch(i) {
|
||||
case 0: { np++; break; }
|
||||
case 1: {
|
||||
numaliasm = atoi(piece);
|
||||
if (numaliasm < 1) {
|
||||
fprintf(stderr,"incorrect number of entries in AM table\n");
|
||||
free(piece);
|
||||
return 1;
|
||||
}
|
||||
aliasm = (char **) malloc(numaliasm * sizeof(char *));
|
||||
if (!aliasm) {
|
||||
numaliasm = 0;
|
||||
return 1;
|
||||
}
|
||||
np++;
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
free(piece);
|
||||
}
|
||||
if (np != 2) {
|
||||
numaliasm = 0;
|
||||
free(aliasm);
|
||||
aliasm = NULL;
|
||||
fprintf(stderr,"error: missing AM alias information\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* now parse the numaliasm lines to read in the remainder of the table */
|
||||
char * nl = line;
|
||||
for (int j=0; j < numaliasm; j++) {
|
||||
if (!fgets(nl,MAXDELEN,af)) return 1;
|
||||
mychomp(nl);
|
||||
tp = nl;
|
||||
i = 0;
|
||||
aliasm[j] = NULL;
|
||||
while ((piece=mystrsep(&tp, 0))) {
|
||||
if (*piece != '\0') {
|
||||
switch(i) {
|
||||
case 0: {
|
||||
if (strncmp(piece,"AM",2) != 0) {
|
||||
fprintf(stderr,"error: AM table is corrupt\n");
|
||||
free(piece);
|
||||
numaliasm = 0;
|
||||
free(aliasm);
|
||||
aliasm = NULL;
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
if (complexprefixes) {
|
||||
if (utf8) reverseword_utf(piece);
|
||||
else reverseword(piece);
|
||||
}
|
||||
aliasm[j] = mystrdup(piece);
|
||||
break; }
|
||||
default: break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
free(piece);
|
||||
}
|
||||
if (!aliasm[j]) {
|
||||
numaliasm = 0;
|
||||
free(aliasm);
|
||||
aliasm = NULL;
|
||||
fprintf(stderr,"error: map table is corrupt\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int HashMgr::is_aliasf() {
|
||||
return (aliasf != NULL);
|
||||
}
|
||||
|
||||
int HashMgr::is_aliasm() {
|
||||
return (aliasm != NULL);
|
||||
}
|
||||
|
||||
int HashMgr::get_aliasf(int index, unsigned short ** fvec) {
|
||||
if ((index > 0) && (index <= numaliasf)) {
|
||||
*fvec = aliasf[index - 1];
|
||||
return aliasflen[index - 1];
|
||||
}
|
||||
fprintf(stderr,"error: bad flag alias index: %d\n", index);
|
||||
fprintf(stderr,"hiba: %d\n", index);
|
||||
*fvec = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
char * HashMgr::get_aliasm(int index) {
|
||||
if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
|
||||
fprintf(stderr,"error: bad morph. alias index: %d\n", index);
|
||||
return NULL;
|
||||
}
|
51
goldlib/hunspell/hashmgr.hxx
Normal file
51
goldlib/hunspell/hashmgr.hxx
Normal file
@ -0,0 +1,51 @@
|
||||
#ifndef _HASHMGR_HXX_
|
||||
#define _HASHMGR_HXX_
|
||||
|
||||
#include <cstdio>
|
||||
#include "htypes.hxx"
|
||||
|
||||
// encodings of affix flags
enum flag {
    FLAG_CHAR,   // one-character flags
    FLAG_LONG,   // two-character flags
    FLAG_NUM,    // decimal numbers separated by comma
    FLAG_UNI     // UTF-8 characters
};
|
||||
|
||||
class HashMgr
|
||||
{
|
||||
int tablesize;
|
||||
struct hentry * tableptr;
|
||||
int userword;
|
||||
flag flag_mode;
|
||||
int complexprefixes;
|
||||
int utf8;
|
||||
int numaliasf; // flag vector `compression' with aliases
|
||||
unsigned short ** aliasf;
|
||||
unsigned short * aliasflen;
|
||||
int numaliasm; // morphological desciption `compression' with aliases
|
||||
char ** aliasm;
|
||||
|
||||
|
||||
public:
|
||||
HashMgr(const char * tpath, const char * apath);
|
||||
~HashMgr();
|
||||
|
||||
struct hentry * lookup(const char *) const;
|
||||
int hash(const char *) const;
|
||||
struct hentry * walk_hashtable(int & col, struct hentry * hp) const;
|
||||
|
||||
int put_word(const char * word, int wl, char * ap);
|
||||
int put_word_pattern(const char * word, int wl, const char * pattern);
|
||||
int decode_flags(unsigned short ** result, char * flags);
|
||||
unsigned short decode_flag(const char * flag);
|
||||
char * encode_flag(unsigned short flag);
|
||||
int is_aliasf();
|
||||
int is_aliasm();
|
||||
int get_aliasf(int index, unsigned short ** fvec);
|
||||
char * get_aliasm(int index);
|
||||
|
||||
private:
|
||||
int load_tables(const char * tpath);
|
||||
int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc);
|
||||
int load_config(const char * affpath);
|
||||
int parse_aliasf(char * line, FILE * af);
|
||||
int parse_aliasm(char * line, FILE * af);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
25
goldlib/hunspell/htypes.hxx
Normal file
25
goldlib/hunspell/htypes.hxx
Normal file
@ -0,0 +1,25 @@
|
||||
#ifndef _HTYPES_HXX_
|
||||
#define _HTYPES_HXX_
|
||||
|
||||
#define MAXDELEN 8192
|
||||
|
||||
#define ROTATE_LEN 5
|
||||
|
||||
// ROTATE(v,q): shift v left by q bits and OR in the q bits found from bit
// position (32 - q) upwards, storing the result back into v.
// Both arguments are fully parenthesised, so q may be an expression.
// The expansion ends in a semicolon of its own (kept for existing callers).
#define ROTATE(v,q) \
   (v) = ((v) << (q)) | (((v) >> (32 - (q))) & ((1 << (q))-1));
|
||||
|
||||
// approx. number of user defined words
|
||||
#define USERWORD 1000
|
||||
|
||||
// One entry of the dictionary hash table.
// NOTE(review): the meaning of most fields is inferred from their names
// and from how the hash manager fills them; confirm against add_word().
struct hentry
{
  short            wlen;          // presumably the length of word
  short            alen;          // presumably the number of flags in astr
  char *           word;          // the word; NULL marks an unused table slot
  unsigned short * astr;          // presumably the affix flag vector
  struct hentry *  next;          // next entry in the same hash table column
  struct hentry *  next_homonym;  // presumably the next entry with the same word
  char *           description;   // presumably the morphological description
};
|
||||
|
||||
#endif
|
1644
goldlib/hunspell/hunspell.cxx
Normal file
1644
goldlib/hunspell/hunspell.cxx
Normal file
File diff suppressed because it is too large
Load Diff
143
goldlib/hunspell/hunspell.hxx
Normal file
143
goldlib/hunspell/hunspell.hxx
Normal file
@ -0,0 +1,143 @@
|
||||
#include "hashmgr.hxx"
|
||||
#include "affixmgr.hxx"
|
||||
#include "suggmgr.hxx"
|
||||
#include "csutil.hxx"
|
||||
#include "langnum.hxx"
|
||||
|
||||
#define NOCAP 0
|
||||
#define INITCAP 1
|
||||
#define ALLCAP 2
|
||||
#define HUHCAP 3
|
||||
#define HUHINITCAP 4
|
||||
|
||||
#define MAXSUGGESTION 15
|
||||
#define MAXSHARPS 5
|
||||
|
||||
#ifdef W32
|
||||
#define DLLTEST2_API __declspec(dllexport)
|
||||
#endif
|
||||
|
||||
#ifndef _MYSPELLMGR_HXX_
|
||||
#define _MYSPELLMGR_HXX_
|
||||
|
||||
#ifdef W32
|
||||
class DLLTEST2_API Hunspell
|
||||
#else
|
||||
class Hunspell
|
||||
#endif
|
||||
{
|
||||
AffixMgr* pAMgr;
|
||||
HashMgr* pHMgr;
|
||||
SuggestMgr* pSMgr;
|
||||
char * encoding;
|
||||
struct cs_info * csconv;
|
||||
struct unicode_info2 * utfconv;
|
||||
int langnum;
|
||||
int utf8;
|
||||
int complexprefixes;
|
||||
char** wordbreak;
|
||||
|
||||
/* XXX not stateless variables for compound handling */
|
||||
char * prevroot;
|
||||
int prevcompound;
|
||||
|
||||
/* forbidden_compound:
|
||||
* 0 = not forbidden
|
||||
* 1 = forbidden
|
||||
* 2 = forbidden compound (written without dash in Hungarian)
|
||||
*/
|
||||
int forbidden_compound;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
/* Hunspell(aff, dic) - constructor of Hunspell class
|
||||
* input: path of affix file and dictionary file
|
||||
*/
|
||||
|
||||
Hunspell(const char * affpath, const char * dpath);
|
||||
|
||||
~Hunspell();
|
||||
|
||||
/* spell(word) - spellcheck word
|
||||
* output: 0 = bad word, not 0 = good word
|
||||
*/
|
||||
|
||||
int spell(const char *);
|
||||
|
||||
/* suggest(suggestions, word) - search suggestions
|
||||
* input: pointer to an array of strings pointer and the (bad) word
|
||||
* array of strings pointer (here *slst) may not be initialized
|
||||
* output: number of suggestions in string array, and suggestions in
|
||||
* a newly allocated array of strings (*slts will be NULL when number
|
||||
* of suggestion equals 0.)
|
||||
*/
|
||||
|
||||
int suggest(char*** slst, const char * word);
|
||||
|
||||
/* handling custom dictionary */
|
||||
|
||||
int put_word(const char * word);
|
||||
|
||||
/* suffix is an affix flag string, similarly in dictionary files */
|
||||
|
||||
int put_word_suffix(const char * word, const char * suffix);
|
||||
|
||||
/* pattern is a sample dictionary word
|
||||
* put word into custom dictionary with affix flags of pattern word
|
||||
*/
|
||||
|
||||
int put_word_pattern(const char * word, const char * pattern);
|
||||
|
||||
/* other */
|
||||
|
||||
char * get_dic_encoding();
|
||||
const char * get_wordchars();
|
||||
unsigned short * get_wordchars_utf16(int * len);
|
||||
struct cs_info * get_csconv();
|
||||
struct unicode_info2 * get_utf_conv();
|
||||
const char * get_version();
|
||||
|
||||
/* experimental functions */
|
||||
|
||||
/* morphological analysis */
|
||||
|
||||
char * morph(const char * word);
|
||||
int analyze(char*** out, const char *word);
|
||||
|
||||
char * morph_with_correction(const char * word);
|
||||
|
||||
/* stemmer function */
|
||||
|
||||
int stem(char*** slst, const char * word);
|
||||
|
||||
/* spec. suggestions */
|
||||
int suggest_auto(char*** slst, const char * word);
|
||||
int suggest_pos_stems(char*** slst, const char * word);
|
||||
char * get_possible_root();
|
||||
|
||||
/* not threadsafe functions for Hunspell command line API */
|
||||
|
||||
char * get_prevroot();
|
||||
int get_prevcompound();
|
||||
int get_forbidden_compound();
|
||||
|
||||
private:
|
||||
int cleanword(char *, const char *, int * pcaptype, int * pabbrev);
|
||||
int cleanword2(char *, const char *, w_char *, int * w_len, int * pcaptype, int * pabbrev);
|
||||
void mkinitcap(char *);
|
||||
int mkinitcap2(char * p, w_char * u, int nc);
|
||||
int mkinitsmall2(char * p, w_char * u, int nc);
|
||||
void mkallcap(char *);
|
||||
int mkallcap2(char * p, w_char * u, int nc);
|
||||
void mkallsmall(char *);
|
||||
int mkallsmall2(char * p, w_char * u, int nc);
|
||||
struct hentry * check(const char *);
|
||||
char * sharps_u8_l1(char * dest, char * source);
|
||||
hentry * spellsharps(char * base, char *, int, int, char * tmp);
|
||||
int is_keepcase(const hentry * rv);
|
||||
int insert_sug(char ***slst, char * word, int *ns);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
37
goldlib/hunspell/langnum.hxx
Normal file
37
goldlib/hunspell/langnum.hxx
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef _LANGNUM_HXX_
|
||||
#define _LANGNUM_HXX_
|
||||
|
||||
/*
|
||||
language numbers for language specific codes
|
||||
see http://l10n.openoffice.org/languages.html
|
||||
*/
|
||||
|
||||
// language numbers (values written in plain decimal)
enum {
    LANG_az = 100,  // custom number
    LANG_bg = 41,
    LANG_ca = 37,
    LANG_cs = 42,
    LANG_da = 45,
    LANG_de = 49,
    LANG_el = 30,
    LANG_en = 1,
    LANG_es = 34,
    LANG_eu = 10,
    LANG_fr = 2,
    LANG_gl = 38,
    LANG_hr = 78,
    LANG_hu = 36,
    LANG_it = 39,
    LANG_la = 99,   // custom number
    LANG_lv = 101,  // custom number
    LANG_nl = 31,
    LANG_pl = 48,
    LANG_pt = 3,
    LANG_ru = 7,
    LANG_sv = 50,
    LANG_tr = 90,
    LANG_uk = 80,
    LANG_xx = 999
};
|
||||
|
||||
#endif
|
57
goldlib/hunspell/license.hun
Normal file
57
goldlib/hunspell/license.hun
Normal file
@ -0,0 +1,57 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Hunspell, based on MySpell.
|
||||
*
|
||||
* The Initial Developers of the Original Code are
|
||||
* Kevin Hendricks (MySpell) and Németh László (Hunspell).
|
||||
* Portions created by the Initial Developers are Copyright (C) 2002-2005
|
||||
* the Initial Developers. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* David Einstein
|
||||
* Davide Prina
|
||||
* Giuseppe Modugno
|
||||
* Gianluca Turconi
|
||||
* Simon Brouwer
|
||||
* Noll János
|
||||
* Bíró Árpád
|
||||
* Goldman Eleonóra
|
||||
* Sarlós Tamás
|
||||
* Bencsáth Boldizsár
|
||||
* Halácsy Péter
|
||||
* Dvornik László
|
||||
* Gefferth András
|
||||
* Nagy Viktor
|
||||
* Varga Dániel
|
||||
* Chris Halls
|
||||
* Rene Engelhard
|
||||
* Bram Moolenaar
|
||||
* Dafydd Jones
|
||||
* Harri Pitkänen
|
||||
* András Tímár
|
||||
* Tor Lillqvist
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
61
goldlib/hunspell/license.mys
Normal file
61
goldlib/hunspell/license.mys
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
|
||||
* And Contributors. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All modifications to the source code must be clearly marked as
|
||||
* such. Binary redistributions based on modified source code
|
||||
* must be clearly marked as modified versions in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
|
||||
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
*
|
||||
* NOTE: A special thanks and credit goes to Geoff Kuenning
|
||||
* the creator of ispell. MySpell's affix algorithms were
|
||||
* based on those of ispell which should be noted is
|
||||
* copyright Geoff Kuenning et.al. and now available
|
||||
* under a BSD style license. For more information on ispell
|
||||
* and affix compression in general, please see:
|
||||
* http://www.cs.ucla.edu/ficus-members/geoff/ispell.html
|
||||
* (the home page for ispell)
|
||||
*
|
||||
* An almost complete rewrite of MySpell for use by
|
||||
* the Mozilla project has been developed by David Einstein
|
||||
* (Deinst@world.std.com). David and I are now
|
||||
* working on parallel development tracks to help
|
||||
* our respective projects (Mozilla and OpenOffice.org
|
||||
* and we will maintain full affix file and dictionary
|
||||
* file compatibility and work on merging our versions
|
||||
* of MySpell back into a single tree. David has been
|
||||
* a significant help in improving MySpell.
|
||||
*
|
||||
* Special thanks also go to La'szlo' Ne'meth
|
||||
* <nemethl@gyorsposta.hu> who is the author of the
|
||||
* Hungarian dictionary and who developed and contributed
|
||||
* the code to support compound words in MySpell
|
||||
* and fixed numerous problems with the encoding
|
||||
* case conversion tables.
|
||||
*
|
||||
*/
|
1657
goldlib/hunspell/suggmgr.cxx
Normal file
1657
goldlib/hunspell/suggmgr.cxx
Normal file
File diff suppressed because it is too large
Load Diff
87
goldlib/hunspell/suggmgr.hxx
Normal file
87
goldlib/hunspell/suggmgr.hxx
Normal file
@ -0,0 +1,87 @@
|
||||
#ifndef _SUGGESTMGR_HXX_
|
||||
#define _SUGGESTMGR_HXX_
|
||||
|
||||
#define MAXSWL 100
|
||||
#define MAXSWUTF8L (MAXSWL * 4)
|
||||
#define MAX_ROOTS 50
|
||||
#define MAX_WORDS 200
|
||||
#define MAX_GUESS 200
|
||||
#define MAXNGRAMSUGS 5
|
||||
|
||||
#define MINTIMER 500
|
||||
#define MAXPLUSTIMER 500
|
||||
|
||||
#define NGRAM_IGNORE_LENGTH 0
|
||||
#define NGRAM_LONGER_WORSE 1
|
||||
#define NGRAM_ANY_MISMATCH 2
|
||||
|
||||
#include "atypes.hxx"
|
||||
#include "affixmgr.hxx"
|
||||
#include "hashmgr.hxx"
|
||||
#include "langnum.hxx"
|
||||
#include <time.h>
|
||||
|
||||
// direction markers; NOTE(review): presumably used by the lcs() helper
// of SuggestMgr - confirm in suggmgr.cxx
enum {
    LCS_UP,
    LCS_LEFT,
    LCS_UPLEFT
};
|
||||
|
||||
class SuggestMgr
|
||||
{
|
||||
char * ctry;
|
||||
int ctryl;
|
||||
w_char * ctry_utf;
|
||||
|
||||
AffixMgr* pAMgr;
|
||||
int maxSug;
|
||||
struct cs_info * csconv;
|
||||
struct unicode_info2 * utfconv;
|
||||
int utf8;
|
||||
int nosplitsugs;
|
||||
int maxngramsugs;
|
||||
int complexprefixes;
|
||||
|
||||
|
||||
public:
|
||||
SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr);
|
||||
~SuggestMgr();
|
||||
|
||||
int suggest(char*** slst, const char * word, int nsug);
|
||||
int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr);
|
||||
int suggest_auto(char*** slst, const char * word, int nsug);
|
||||
int suggest_stems(char*** slst, const char * word, int nsug);
|
||||
int suggest_pos_stems(char*** slst, const char * word, int nsug);
|
||||
|
||||
char * suggest_morph(const char * word);
|
||||
char * suggest_morph_for_spelling_error(const char * word);
|
||||
|
||||
private:
|
||||
int check(const char *, int, int, int *, time_t *);
|
||||
int check_forbidden(const char *, int);
|
||||
|
||||
int replchars(char**, const char *, int, int);
|
||||
int doubledsyllable(char**, const char *, int, int);
|
||||
int forgotchar(char **, const char *, int, int);
|
||||
int swapchar(char **, const char *, int, int);
|
||||
int extrachar(char **, const char *, int, int);
|
||||
int badchar(char **, const char *, int, int);
|
||||
int twowords(char **, const char *, int, int);
|
||||
int fixstems(char **, const char *, int);
|
||||
|
||||
int forgotchar_utf(char**, const w_char *, int wl, int, int);
|
||||
int extrachar_utf(char**, const w_char *, int wl, int, int);
|
||||
int badchar_utf(char **, const w_char *, int wl, int, int);
|
||||
int swapchar_utf(char **, const w_char *, int wl, int, int);
|
||||
|
||||
int mapchars(char**, const char *, int, int);
|
||||
int map_related(const char *, int, char ** wlst, int, const mapentry*, int, int *, time_t *);
|
||||
int map_related_utf(w_char *, int, int, char ** wlst, int, const mapentry*, int, int *, time_t *);
|
||||
int ngram(int n, char * s1, const char * s2, int uselen);
|
||||
int mystrlen(const char * word);
|
||||
int equalfirstletter(char * s1, const char * s2);
|
||||
int commoncharacterpositions(char * s1, const char * s2, int * is_swap);
|
||||
void bubblesort( char ** rwd, int * rsc, int n);
|
||||
void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result);
|
||||
int lcslen(const char * s, const char* s2);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
8506
goldlib/hunspell/utf_info.cxx
Normal file
8506
goldlib/hunspell/utf_info.cxx
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user