1658 lines
44 KiB
C++
1658 lines
44 KiB
C++
#include "license.hun"
|
|
#include "license.mys"
|
|
|
|
#include <cstdlib>
|
|
#include <cctype>
|
|
#include <cstring>
|
|
#include <cstdio>
|
|
|
|
#include "suggmgr.hxx"
|
|
|
|
#if !defined(_MSC_VER)
|
|
using namespace std;
|
|
#endif
|
|
|
|
|
|
// Builds a suggestion manager.
//   tryme - string of characters to try when building candidate
//           suggestions (may be NULL, in which case none are tried)
//   maxn  - stored as maxSug, the cap on the number of suggestions
//   aptr  - affix manager to register (may be NULL); when present its
//           encoding, UTF-8 flag and suggestion options are copied
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
                       AffixMgr * aptr)
{
  // register affix manager and check in string of chars to
  // try when building candidate suggestions
  pAMgr = aptr;

  ctryl = 0;
  ctry = NULL;
  ctry_utf = NULL;

  maxSug = maxn;
  nosplitsugs = 0;
  maxngramsugs = MAXNGRAMSUGS;

  utf8 = 0;
  utfconv = NULL;
  // give csconv a defined value even when there is no affix manager
  csconv = NULL;
  complexprefixes = 0;

  if (pAMgr) {
    // the encoding name is released here once the table has been looked up
    char * enc = pAMgr->get_encoding();
    csconv = get_current_cs(enc);
    free(enc);
    nosplitsugs = pAMgr->get_nosplitsugs();
    // a negative value keeps the MAXNGRAMSUGS default
    if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
    utf8 = pAMgr->get_utf8();
    utfconv = pAMgr->get_utf_conv();
    complexprefixes = pAMgr->get_complexprefixes();
  }

  if (tryme) {
    if (utf8) {
      w_char t[MAXSWL];
      int n = u8_u16(t, MAXSWL, tryme);
      if (n > 0) {
        ctry_utf = (w_char *) malloc(n * sizeof(w_char));
        // on allocation failure keep ctryl == 0 so no caller indexes ctry_utf
        if (ctry_utf) {
          memcpy(ctry_utf, t, n * sizeof(w_char));
          ctryl = n;
        }
      }
    } else {
      ctry = mystrdup(tryme);
      // mystrdup result is checked before use; ctryl stays 0 on failure
      if (ctry) ctryl = strlen(ctry);
    }
  }
}
|
|
|
|
|
|
// Releases the try-character buffers owned by this object and resets the
// remaining bookkeeping fields. The affix manager pointer is only cleared.
SuggestMgr::~SuggestMgr()
{
  // free() accepts NULL, so no guard is needed
  free(ctry);
  free(ctry_utf);

  ctry = NULL;
  ctry_utf = NULL;
  ctryl = 0;

  pAMgr = NULL;
  maxSug = 0;
}
|
|
|
|
|
|
|
|
// generate suggestions for a misspelled word
|
|
// pass in address of array of char * pointers
|
|
|
|
int SuggestMgr::suggest(char*** slst, const char * w, int nsug)
|
|
{
|
|
int nocompoundtwowords = 0;
|
|
char ** wlst;
|
|
w_char word_utf[MAXSWL];
|
|
int wl=0;
|
|
|
|
char w2[MAXWORDUTF8LEN];
|
|
const char * word = w;
|
|
|
|
// word reversing wrapper for complex prefixes
|
|
if (complexprefixes) {
|
|
strcpy(w2, w);
|
|
if (utf8) reverseword_utf(w2); else reverseword(w2);
|
|
word = w2;
|
|
}
|
|
|
|
if (*slst) {
|
|
wlst = *slst;
|
|
} else {
|
|
wlst = (char **) malloc(maxSug * sizeof(char *));
|
|
if (wlst == NULL) return -1;
|
|
for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
|
|
}
|
|
|
|
if (utf8) {
|
|
wl = u8_u16(word_utf, MAXSWL, word);
|
|
}
|
|
|
|
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
|
|
|
|
// perhaps we made a typical fault of spelling
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = replchars(wlst, word, nsug, cpdsuggest);
|
|
|
|
// perhaps we made chose the wrong char from a related set
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = mapchars(wlst, word, nsug, cpdsuggest);
|
|
|
|
// did we swap the order of chars by mistake
|
|
if ((nsug < maxSug) && (nsug > -1)) {
|
|
nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
|
swapchar(wlst, word, nsug, cpdsuggest);
|
|
}
|
|
|
|
// did we forget to add a char
|
|
if ((nsug < maxSug) && (nsug > -1)) {
|
|
nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
|
forgotchar(wlst, word, nsug, cpdsuggest);
|
|
}
|
|
|
|
// did we add a char that should not be there
|
|
if ((nsug < maxSug) && (nsug > -1)) {
|
|
nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
|
extrachar(wlst, word, nsug, cpdsuggest);
|
|
}
|
|
|
|
// did we just hit the wrong key in place of a good char
|
|
if ((nsug < maxSug) && (nsug > -1)) {
|
|
nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
|
|
badchar(wlst, word, nsug, cpdsuggest);
|
|
}
|
|
|
|
// only suggest compound words when no other suggestion
|
|
if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
|
|
|
|
// perhaps we forgot to hit space and two words ran together
|
|
if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
|
|
nsug = twowords(wlst, word, nsug, cpdsuggest);
|
|
}
|
|
|
|
} // repeating ``for'' statement compounding support
|
|
|
|
if (nsug < 0) {
|
|
// we ran out of memory - we should free up as much as possible
|
|
for (int i = 0; i < maxSug; i++)
|
|
if (wlst[i] != NULL) free(wlst[i]);
|
|
free(wlst);
|
|
wlst = NULL;
|
|
}
|
|
|
|
*slst = wlst;
|
|
return nsug;
|
|
}
|
|
|
|
// generate suggestions for a word with typical mistake
|
|
// pass in address of array of char * pointers
|
|
|
|
int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
|
|
{
|
|
int nocompoundtwowords = 0;
|
|
char ** wlst;
|
|
|
|
char w2[MAXWORDUTF8LEN];
|
|
const char * word = w;
|
|
|
|
// word reversing wrapper for complex prefixes
|
|
if (complexprefixes) {
|
|
strcpy(w2, w);
|
|
if (utf8) reverseword_utf(w2); else reverseword(w2);
|
|
word = w2;
|
|
}
|
|
|
|
if (*slst) {
|
|
wlst = *slst;
|
|
} else {
|
|
wlst = (char **) malloc(maxSug * sizeof(char *));
|
|
if (wlst == NULL) return -1;
|
|
}
|
|
|
|
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
|
|
|
|
// perhaps we made a typical fault of spelling
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = replchars(wlst, word, nsug, cpdsuggest);
|
|
|
|
// perhaps we made chose the wrong char from a related set
|
|
if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))
|
|
nsug = mapchars(wlst, word, nsug, cpdsuggest);
|
|
|
|
if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
|
|
|
|
// perhaps we forgot to hit space and two words ran together
|
|
|
|
if ((nsug < maxSug) && (nsug > -1) && check_forbidden(word, strlen(word))) {
|
|
nsug = twowords(wlst, word, nsug, cpdsuggest);
|
|
}
|
|
|
|
} // repeating ``for'' statement compounding support
|
|
|
|
if (nsug < 0) {
|
|
for (int i=0;i<maxSug; i++)
|
|
if (wlst[i] != NULL) free(wlst[i]);
|
|
free(wlst);
|
|
return -1;
|
|
}
|
|
|
|
*slst = wlst;
|
|
return nsug;
|
|
}
|
|
|
|
|
|
// suggestions for when chose the wrong char out of a related set
|
|
int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
time_t timelimit;
|
|
int timer;
|
|
|
|
int wl = strlen(word);
|
|
if (wl < 2 || ! pAMgr) return ns;
|
|
|
|
int nummap = pAMgr->get_nummap();
|
|
struct mapentry* maptable = pAMgr->get_maptable();
|
|
if (maptable==NULL) return ns;
|
|
|
|
timelimit = time(NULL);
|
|
timer = MINTIMER;
|
|
if (utf8) {
|
|
w_char w[MAXSWL];
|
|
int len = u8_u16(w, MAXSWL, word);
|
|
ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
|
|
} else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
|
|
return ns;
|
|
}
|
|
|
|
int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,
|
|
const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
|
|
{
|
|
char c = *(word + i);
|
|
if (c == 0) {
|
|
int cwrd = 1;
|
|
int wl;
|
|
for (int m=0; m < ns; m++)
|
|
if (strcmp(word,wlst[m]) == 0) cwrd = 0;
|
|
if ((cwrd) && (wl = strlen(word)) && (check(word, wl, 0, timer, timelimit) ||
|
|
check(word, wl, 1, timer, timelimit))) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(word);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
int in_map = 0;
|
|
for (int j = 0; j < nummap; j++) {
|
|
if (strchr(maptable[j].set,c) != 0) {
|
|
in_map = 1;
|
|
char * newword = mystrdup(word);
|
|
for (int k = 0; k < maptable[j].len; k++) {
|
|
*(newword + i) = *(maptable[j].set + k);
|
|
ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);
|
|
if (!(*timelimit)) return ns;
|
|
}
|
|
free(newword);
|
|
}
|
|
}
|
|
if (!in_map) {
|
|
i++;
|
|
ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// Recursive worker of mapchars() for UTF-8 dictionaries; operates on the
// UTF-16 form of the word (`word`, `len` characters) and modifies it in place.
// At every position whose character is found in a map set, each member of the
// set is substituted in turn and the rest of the word is processed
// recursively. When position `len` is reached the word is converted back to
// UTF-8, checked (as plain word, then as compound) and added to `wlst` if it
// is not already there.
//   timer / timelimit - time budget passed through to check(); check() sets
//                       *timelimit to 0 when the budget is exhausted
// Returns the updated suggestion count, or -1 on allocation failure.
int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,
    const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
{
  if (i == len) {
    // end of word: test the finished candidate
    int cwrd = 1;
    int wl;
    char s[MAXSWUTF8L];
    u16_u8(s, MAXSWUTF8L, word, len);
    for (int m = 0; m < ns; m++) {
      if (strcmp(s, wlst[m]) == 0) {
        cwrd = 0;
        break;
      }
    }
    if ((cwrd) && (wl = strlen(s)) && (check(s, wl, 0, timer, timelimit) ||
        check(s, wl, 1, timer, timelimit))) {
      if (ns < maxSug) {
        wlst[ns] = mystrdup(s);
        if (wlst[ns] == NULL) return -1;
        ns++;
      }
    }
    return ns;
  }

  int in_map = 0;
  unsigned short c = *((unsigned short *) word + i);
  for (int j = 0; j < nummap; j++) {
    if (flag_bsearch((unsigned short *) maptable[j].set_utf16, c, maptable[j].len)) {
      in_map = 1;
      for (int k = 0; k < maptable[j].len; k++) {
        *(word + i) = *(maptable[j].set_utf16 + k);
        ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);
        // stop on allocation failure (a negative count must never be used
        // as an index deeper in the recursion) or when time ran out;
        // put the original character back before leaving
        if ((ns < 0) || !(*timelimit)) {
          *((unsigned short *) word + i) = c;
          return ns;
        }
      }
      // restore the original character before trying the next map set
      *((unsigned short *) word + i) = c;
    }
  }

  // character is in no map set: keep it and move on
  if (!in_map) {
    i++;
    ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);
  }
  return ns;
}
|
|
|
|
|
|
|
|
// suggestions for a typical fault of spelling, that
|
|
// differs with more, than 1 letter from the right form.
|
|
int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
const char * r;
|
|
int lenr, lenp;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
if (wl < 2 || ! pAMgr) return ns;
|
|
|
|
int numrep = pAMgr->get_numrep();
|
|
struct replentry* reptable = pAMgr->get_reptable();
|
|
if (reptable==NULL) return ns;
|
|
|
|
for (int i=0; i < numrep; i++ ) {
|
|
r = word;
|
|
lenr = strlen(reptable[i].pattern2);
|
|
lenp = strlen(reptable[i].pattern);
|
|
// search every occurence of the pattern in the word
|
|
while ((r=strstr(r, reptable[i].pattern)) != NULL) {
|
|
strcpy(candidate, word);
|
|
if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
|
|
strcpy(candidate+(r-word),reptable[i].pattern2);
|
|
strcpy(candidate+(r-word)+lenr, r+lenp);
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) {
|
|
for (int j=0; j<ns; j++) free(wlst[j]);
|
|
return -1;
|
|
}
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
r++; // search for the next letter
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// perhaps we made a special pattern mistake
|
|
// for example: vacation -> vacacation (doubled `ac')
|
|
int SuggestMgr::doubledsyllable(char** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
int state=0;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
if (wl < 5 || ! pAMgr) return ns;
|
|
|
|
for (int i=2; i < wl; i++ ) {
|
|
if (word[i]==word[i-2]) {
|
|
state++;
|
|
if (state==3) {
|
|
strcpy(candidate,word);
|
|
strcpy(candidate+i-1,word+i+1);
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) {
|
|
for (int j=0; j<ns; j++) free(wlst[j]);
|
|
return -1;
|
|
}
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
state=0;
|
|
}
|
|
} else {
|
|
state=0;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// error is wrong char in place of correct one
|
|
int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char tmpc;
|
|
char candidate[MAXSWUTF8L];
|
|
time_t timelimit = time(NULL);
|
|
int timer = MINTIMER;
|
|
|
|
int wl = strlen(word);
|
|
int cwrd;
|
|
strcpy(candidate, word);
|
|
|
|
// swap out each char one by one and try all the tryme
|
|
// chars in its place to see if that makes a good word
|
|
for (int i=0; i < wl; i++) {
|
|
tmpc = candidate[i];
|
|
for (int j=0; j < ctryl; j++) {
|
|
if (ctry[j] == tmpc) continue;
|
|
candidate[i] = ctry[j];
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl, cpdsuggest, &timer, &timelimit)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
if (!timelimit) return ns;
|
|
candidate[i] = tmpc;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// error is wrong char in place of correct one
|
|
int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
|
|
{
|
|
w_char tmpc;
|
|
w_char candidate_utf[MAXSWL];
|
|
char candidate[MAXSWUTF8L];
|
|
int cwrd;
|
|
time_t timelimit = time(NULL);
|
|
int timer = MINTIMER;
|
|
|
|
memcpy(candidate_utf, word, wl * sizeof(w_char));
|
|
|
|
// swap out each char one by one and try all the tryme
|
|
// chars in its place to see if that makes a good word
|
|
for (int i=0; i < wl; i++) {
|
|
tmpc = candidate_utf[i];
|
|
for (int j=0; j < ctryl; j++) {
|
|
if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue;
|
|
candidate_utf[i] = ctry_utf[j];
|
|
cwrd = 1;
|
|
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
if (!timelimit) return ns;
|
|
candidate_utf[i] = tmpc;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// error is word has an extra letter it does not need
|
|
int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
w_char candidate_utf[MAXSWL];
|
|
|
|
const w_char * p;
|
|
w_char * r;
|
|
int cwrd;
|
|
|
|
if (wl < 2) return ns;
|
|
|
|
// try omitting one char of word at a time
|
|
memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char));
|
|
for (p = word, r = candidate_utf; p < word + wl; ) {
|
|
cwrd = 1;
|
|
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
*r++ = *p++;
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// error is word has an extra letter it does not need
|
|
int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
const char * p;
|
|
char * r;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
if (wl < 2) return ns;
|
|
|
|
// try omitting one char of word at a time
|
|
strcpy (candidate, word + 1);
|
|
for (p = word, r = candidate; *p != 0; ) {
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl-1, cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
*r++ = *p++;
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// error is missing a letter it needs
|
|
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
const char * p;
|
|
char * q;
|
|
int cwrd;
|
|
time_t timelimit = time(NULL);
|
|
int timer = MINTIMER;
|
|
|
|
int wl = strlen(word);
|
|
|
|
// try inserting a tryme character before every letter
|
|
strcpy(candidate + 1, word);
|
|
for (p = word, q = candidate; *p != 0; ) {
|
|
for (int i = 0; i < ctryl; i++) {
|
|
*q = ctry[i];
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate, wl+1, cpdsuggest, &timer, &timelimit)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
if (!timelimit) return ns;
|
|
}
|
|
*q++ = *p++;
|
|
}
|
|
|
|
// now try adding one to end */
|
|
for (int i = 0; i < ctryl; i++) {
|
|
*q = ctry[i];
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl+1, cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// error is missing a letter it needs
|
|
int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
|
|
{
|
|
w_char candidate_utf[MAXSWL];
|
|
char candidate[MAXSWUTF8L];
|
|
const w_char * p;
|
|
w_char * q;
|
|
int cwrd;
|
|
time_t timelimit = time(NULL);
|
|
int timer = MINTIMER;
|
|
|
|
// try inserting a tryme character before every letter
|
|
memcpy (candidate_utf + 1, word, wl * sizeof(w_char));
|
|
for (p = word, q = candidate_utf; p < (word + wl); ) {
|
|
for (int i = 0; i < ctryl; i++) {
|
|
*q = ctry_utf[i];
|
|
cwrd = 1;
|
|
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
if (!timelimit) return ns;
|
|
}
|
|
*q++ = *p++;
|
|
}
|
|
|
|
// now try adding one to end */
|
|
for (int i = 0; i < ctryl; i++) {
|
|
*q = ctry_utf[i];
|
|
cwrd = 1;
|
|
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
/* error is should have been two words */
|
|
int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
char * p;
|
|
int c1, c2, cwrd;
|
|
int forbidden = 0;
|
|
|
|
int wl=strlen(word);
|
|
if (wl < 3) return ns;
|
|
|
|
if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);
|
|
|
|
strcpy(candidate + 1, word);
|
|
|
|
// split the string into two pieces after every char
|
|
// if both pieces are good words make them a suggestion
|
|
for (p = candidate + 1; p[1] != '\0'; p++) {
|
|
p[-1] = *p;
|
|
// go to end of the UTF-8 character
|
|
while (utf8 && ((p[1] & 0xc0) == 0x80)) {
|
|
p++;
|
|
p[-1] = *p;
|
|
}
|
|
*p = '\0';
|
|
if ((c1=check(candidate,strlen(candidate), cpdsuggest, NULL, NULL))) {
|
|
if ((c2=check((p+1),strlen(p+1), cpdsuggest, NULL, NULL))) {
|
|
*p = ' ';
|
|
|
|
// spec. Hungarian code (need a better compound word support)
|
|
if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&
|
|
// if 3 repeating letter, use - instead of space
|
|
(((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
|
|
// or multiple compounding, with more, than 6 syllables
|
|
((c1 == 3) && (c2 >= 2)))) *p = '-';
|
|
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if (ns < maxSug) {
|
|
if (cwrd) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
} else return ns;
|
|
}
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// error is adjacent letter were swapped
|
|
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
|
|
{
|
|
char candidate[MAXSWUTF8L];
|
|
char * p;
|
|
char tmpc;
|
|
int cwrd;
|
|
|
|
int wl=strlen(word);
|
|
|
|
// try swapping adjacent chars one by one
|
|
strcpy(candidate, word);
|
|
for (p = candidate; p[1] != 0; p++) {
|
|
tmpc = *p;
|
|
*p = p[1];
|
|
p[1] = tmpc;
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl, cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
tmpc = *p;
|
|
*p = p[1];
|
|
p[1] = tmpc;
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// error is adjacent letter were swapped
|
|
int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
|
|
{
|
|
w_char candidate_utf[MAXSWL];
|
|
char candidate[MAXSWUTF8L];
|
|
w_char * p;
|
|
w_char tmpc;
|
|
int cwrd;
|
|
|
|
// try swapping adjacent chars one by one
|
|
memcpy (candidate_utf, word, wl * sizeof(w_char));
|
|
for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
|
|
tmpc = *p;
|
|
*p = p[1];
|
|
p[1] = tmpc;
|
|
cwrd = 1;
|
|
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
tmpc = *p;
|
|
*p = p[1];
|
|
p[1] = tmpc;
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
// generate a set of suggestions for very poorly spelled words
|
|
int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr)
|
|
{
|
|
|
|
int i, j;
|
|
int lval;
|
|
int sc;
|
|
int lp;
|
|
|
|
if (! pHMgr) return 0;
|
|
|
|
// exhaustively search through all root words
|
|
// keeping track of the MAX_ROOTS most similar root words
|
|
struct hentry * roots[MAX_ROOTS];
|
|
int scores[MAX_ROOTS];
|
|
for (i = 0; i < MAX_ROOTS; i++) {
|
|
roots[i] = NULL;
|
|
scores[i] = -100 * i;
|
|
}
|
|
lp = MAX_ROOTS - 1;
|
|
|
|
char w2[MAXWORDUTF8LEN];
|
|
char * word = w;
|
|
|
|
// word reversing wrapper for complex prefixes
|
|
if (complexprefixes) {
|
|
strcpy(w2, w);
|
|
if (utf8) reverseword_utf(w2); else reverseword(w2);
|
|
word = w2;
|
|
}
|
|
|
|
char mw[MAXSWUTF8L];
|
|
w_char u8[MAXSWL];
|
|
int nc = strlen(word);
|
|
int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
|
|
|
|
struct hentry* hp = NULL;
|
|
int col = -1;
|
|
while ((hp = pHMgr->walk_hashtable(col, hp))) {
|
|
// check forbidden words
|
|
if ((hp->astr) && (pAMgr) &&
|
|
(TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||
|
|
TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||
|
|
TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;
|
|
sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
|
|
if (sc > scores[lp]) {
|
|
scores[lp] = sc;
|
|
roots[lp] = hp;
|
|
int lval = sc;
|
|
for (j=0; j < MAX_ROOTS; j++)
|
|
if (scores[j] < lval) {
|
|
lp = j;
|
|
lval = scores[j];
|
|
}
|
|
}
|
|
}
|
|
|
|
// find minimum threshhold for a passable suggestion
|
|
// mangle original word three differnt ways
|
|
// and score them to generate a minimum acceptable score
|
|
int thresh = 0;
|
|
for (int sp = 1; sp < 4; sp++) {
|
|
if (utf8) {
|
|
for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
|
|
u16_u8(mw, MAXSWUTF8L, u8, n);
|
|
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
|
|
} else {
|
|
strcpy(mw, word);
|
|
for (int k=sp; k < n; k+=4) *(mw + k) = '*';
|
|
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
|
|
}
|
|
}
|
|
thresh = thresh / 3;
|
|
thresh--;
|
|
|
|
// now expand affixes on each of these root words and
|
|
// and use length adjusted ngram scores to select
|
|
// possible suggestions
|
|
char * guess[MAX_GUESS];
|
|
int gscore[MAX_GUESS];
|
|
for(i=0;i<MAX_GUESS;i++) {
|
|
guess[i] = NULL;
|
|
gscore[i] = -100 * i;
|
|
}
|
|
|
|
lp = MAX_GUESS - 1;
|
|
|
|
struct guessword * glst;
|
|
glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
|
|
if (! glst) return 0;
|
|
|
|
for (i = 0; i < MAX_ROOTS; i++) {
|
|
|
|
if (roots[i]) {
|
|
struct hentry * rp = roots[i];
|
|
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
|
|
rp->astr, rp->alen, word, nc);
|
|
|
|
for (int k = 0; k < nw ; k++) {
|
|
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
|
|
if ((sc > thresh)) {
|
|
if (sc > gscore[lp]) {
|
|
if (guess[lp]) free (guess[lp]);
|
|
gscore[lp] = sc;
|
|
guess[lp] = glst[k].word;
|
|
lval = sc;
|
|
for (j=0; j < MAX_GUESS; j++)
|
|
if (gscore[j] < lval) {
|
|
lp = j;
|
|
lval = gscore[j];
|
|
}
|
|
} else free (glst[k].word);
|
|
} else free(glst[k].word);
|
|
}
|
|
}
|
|
}
|
|
free(glst);
|
|
|
|
// now we are done generating guesses
|
|
// sort in order of decreasing score
|
|
|
|
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
|
|
|
|
// weight suggestions with a similarity index, based on
|
|
// the longest common subsequent algorithm and resort
|
|
|
|
int is_swap;
|
|
for (i=0; i < MAX_GUESS; i++) {
|
|
if (guess[i]) {
|
|
// lowering guess[i]
|
|
char gl[MAXSWUTF8L];
|
|
int len;
|
|
if (utf8) {
|
|
w_char w[MAXSWL];
|
|
len = u8_u16(w, MAXSWL, guess[i]);
|
|
mkallsmall_utf(w, len, utfconv);
|
|
u16_u8(gl, MAXSWUTF8L, w, len);
|
|
} else {
|
|
strcpy(gl, guess[i]);
|
|
mkallsmall(gl, csconv);
|
|
len = strlen(guess[i]);
|
|
}
|
|
|
|
int lcs = lcslen(word, gl);
|
|
|
|
// same characters with different casing
|
|
if ((n == len) && (n == lcs)) {
|
|
gscore[i] += 2000;
|
|
break;
|
|
}
|
|
|
|
// heuristic weigthing of ngram scores
|
|
gscore[i] +=
|
|
// length of longest common subsequent minus lenght difference
|
|
2 * lcs - abs((int) (n - len)) +
|
|
// weight equal first letter
|
|
equalfirstletter(word, gl) +
|
|
// weight equal character positions
|
|
((lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +
|
|
// swap character (not neighboring)
|
|
((is_swap) ? 1000 : 0);
|
|
}
|
|
}
|
|
|
|
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
|
|
|
|
// copy over
|
|
|
|
int ns = 0;
|
|
int same = 0;
|
|
for (i=0; i < MAX_GUESS; i++) {
|
|
if (guess[i]) {
|
|
if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
|
|
int unique = 1;
|
|
// we have excellent suggestion(s)
|
|
if (gscore[i] > 1000) same = 1;
|
|
for (j=0; j < ns; j++)
|
|
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
|
|
if (strstr(guess[i], wlst[j]) ||
|
|
// check forbidden words
|
|
!check(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
|
|
if (unique) wlst[ns++] = guess[i]; else free(guess[i]);
|
|
} else free(guess[i]);
|
|
}
|
|
}
|
|
|
|
return ns;
|
|
}
|
|
|
|
|
|
// see if a candidate suggestion is spelled correctly
|
|
// needs to check both root words and words with affixes
|
|
|
|
// obsolete MySpell-HU modifications:
|
|
// return value 2 and 3 marks compounding with hyphen (-)
|
|
// `3' marks roots without suffix
|
|
int SuggestMgr::check(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit)
|
|
{
|
|
struct hentry * rv=NULL;
|
|
int nosuffix = 0;
|
|
|
|
// check time limit
|
|
if (timer) {
|
|
(*timer)--;
|
|
if (!(*timer) && timelimit) {
|
|
if (time(NULL) > *timelimit) {
|
|
*timelimit = 0;
|
|
return 0;
|
|
}
|
|
*timer = MAXPLUSTIMER;
|
|
}
|
|
}
|
|
|
|
if (pAMgr) {
|
|
if (cpdsuggest==1) {
|
|
if (pAMgr->get_compound()) {
|
|
rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1);
|
|
if (rv) return 3; // XXX obsolote categorisation
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
rv = pAMgr->lookup(word);
|
|
|
|
if (rv) {
|
|
if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|
|
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
|
|
if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
|
|
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
|
|
} else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
|
|
|
|
if (rv) {
|
|
nosuffix=1;
|
|
} else {
|
|
rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
|
|
}
|
|
|
|
if (!rv && pAMgr->have_contclass()) {
|
|
rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
|
|
if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
|
|
}
|
|
|
|
// check forbidden words
|
|
if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|
|
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
|
|
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
|
|
|
|
if (rv) { // XXX obsolote
|
|
if ((pAMgr->get_compoundflag()) &&
|
|
TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
|
|
return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int SuggestMgr::check_forbidden(const char * word, int len)
|
|
{
|
|
struct hentry * rv = NULL;
|
|
|
|
if (pAMgr) {
|
|
rv = pAMgr->lookup(word);
|
|
if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
|
|
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
|
|
if (!(pAMgr->prefix_check(word,len,1)))
|
|
rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
|
|
// check forbidden words
|
|
if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// suggest stems, XXX experimental code
|
|
int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug)
|
|
{
|
|
char buf[MAXSWUTF8L];
|
|
char ** wlst;
|
|
int prevnsug = nsug;
|
|
|
|
char w2[MAXWORDUTF8LEN];
|
|
const char * word = w;
|
|
|
|
// word reversing wrapper for complex prefixes
|
|
if (complexprefixes) {
|
|
strcpy(w2, w);
|
|
if (utf8) reverseword_utf(w2); else reverseword(w2);
|
|
word = w2;
|
|
}
|
|
|
|
if (*slst) {
|
|
wlst = *slst;
|
|
} else {
|
|
wlst = (char **) calloc(maxSug, sizeof(char *));
|
|
if (wlst == NULL) return -1;
|
|
}
|
|
// perhaps there are a fix stem in the dictionary
|
|
if ((nsug < maxSug) && (nsug > -1)) {
|
|
|
|
nsug = fixstems(wlst, word, nsug);
|
|
if (nsug == prevnsug) {
|
|
char * s = mystrdup(word);
|
|
char * p = s + strlen(s);
|
|
while ((*p != '-') && (p != s)) p--;
|
|
if (*p == '-') {
|
|
*p = '\0';
|
|
nsug = fixstems(wlst, s, nsug);
|
|
if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) {
|
|
char * t;
|
|
buf[0] = '\0';
|
|
for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number?
|
|
if (*t != '\0') strcpy(buf, "# ");
|
|
strcat(buf, s);
|
|
wlst[nsug] = mystrdup(buf);
|
|
if (wlst[nsug] == NULL) return -1;
|
|
nsug++;
|
|
}
|
|
p++;
|
|
nsug = fixstems(wlst, p, nsug);
|
|
}
|
|
|
|
free(s);
|
|
}
|
|
}
|
|
|
|
if (nsug < 0) {
|
|
for (int i=0;i<maxSug; i++)
|
|
if (wlst[i] != NULL) free(wlst[i]);
|
|
free(wlst);
|
|
return -1;
|
|
}
|
|
|
|
*slst = wlst;
|
|
return nsug;
|
|
}
|
|
|
|
|
|
// there are fix stems in dictionary
|
|
int SuggestMgr::fixstems(char ** wlst, const char * word, int ns)
|
|
{
|
|
char fix[MAXSWUTF8L];
|
|
char buf[MAXSWUTF8L];
|
|
char prefix[MAXSWUTF8L] = "";
|
|
|
|
char * p;
|
|
int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound
|
|
int cpdindex = 0;
|
|
struct hentry * rv = NULL;
|
|
struct hentry * rv2 = NULL;
|
|
|
|
int wl = strlen(word);
|
|
int cmpdstemnum;
|
|
int cmpdstem[MAXCOMPOUND];
|
|
|
|
if (pAMgr) {
|
|
rv = pAMgr->lookup(word);
|
|
if (rv) {
|
|
dicstem = 0;
|
|
} else {
|
|
// try stripping off affixes
|
|
rv = pAMgr->affix_check(word, wl);
|
|
|
|
// else try check compound word
|
|
if (!rv && pAMgr->get_compound()) {
|
|
rv = pAMgr->compound_check(word, wl,
|
|
0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1);
|
|
|
|
if (rv) {
|
|
dicstem = 2;
|
|
for (int j = 0; j < cmpdstemnum; j++) {
|
|
cpdindex += cmpdstem[j];
|
|
}
|
|
if(! (pAMgr->lookup(word + cpdindex)))
|
|
pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix
|
|
}
|
|
}
|
|
|
|
|
|
if (pAMgr->get_prefix()) {
|
|
strcpy(prefix, pAMgr->get_prefix());
|
|
}
|
|
|
|
// XXX obsolote, will be a general solution for stemming
|
|
if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU)
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((rv) && (ns < maxSug)) {
|
|
|
|
// check fixstem flag and not_valid_stem flag
|
|
// first word
|
|
if ((ns < maxSug) && (dicstem < 2)) {
|
|
strcpy(buf, prefix);
|
|
if ((dicstem > 0) && pAMgr->get_derived()) {
|
|
// XXX obsolote
|
|
if (strlen(prefix) == 1) {
|
|
strcat(buf, (pAMgr->get_derived()) + 1);
|
|
} else {
|
|
strcat(buf, pAMgr->get_derived());
|
|
}
|
|
} else {
|
|
// special stem in affix description
|
|
const char * wordchars = pAMgr->get_wordchars();
|
|
if (rv->description &&
|
|
(strchr(wordchars, *(rv->description)))) {
|
|
char * desc = (rv->description) + 1;
|
|
while (strchr(wordchars, *desc)) desc++;
|
|
strncat(buf, rv->description, desc - (rv->description));
|
|
} else {
|
|
strcat(buf, rv->word);
|
|
}
|
|
}
|
|
wlst[ns] = mystrdup(buf);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
|
|
if (dicstem == 2) {
|
|
|
|
// compound stem
|
|
|
|
// if (rv->astr && (strchr(rv->astr, '0') == NULL)) {
|
|
if (rv->astr) {
|
|
strcpy(buf, word);
|
|
buf[cpdindex] = '\0';
|
|
if (prefix) strcat(buf, prefix);
|
|
if (pAMgr->get_derived()) {
|
|
strcat(buf, pAMgr->get_derived());
|
|
} else {
|
|
// special stem in affix description
|
|
const char * wordchars = pAMgr->get_wordchars();
|
|
if (rv->description &&
|
|
(strchr(wordchars, *(rv->description)))) {
|
|
char * desc = (rv->description) + 1;
|
|
while (strchr(wordchars, *desc)) desc++;
|
|
strncat(buf, rv->description, desc - (rv->description));
|
|
} else {
|
|
strcat(buf, rv->word);
|
|
}
|
|
}
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(buf);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
while (rv) {
|
|
if (0) { // obsolote
|
|
if ((p[1] > '0') && (p[1] <= '9')) {
|
|
if ((ns < maxSug) && (dicstem != 2)) {
|
|
int split = p[1] - '0';
|
|
if (rv->wlen <= split) break;
|
|
|
|
strcpy(fix, rv->word);
|
|
|
|
// checking verbs ending with `ik'
|
|
|
|
fix[rv->wlen - split] = 'i';
|
|
fix[rv->wlen - split + 1] = 'k';
|
|
fix[rv->wlen - split + 2] = '\0';
|
|
|
|
if (! (rv2 = pAMgr->lookup(fix))) {
|
|
fix[strlen(fix) - 2] = '\0';
|
|
rv2 = pAMgr->lookup(fix);
|
|
if ((!rv2)) {
|
|
*fix = csconv[((unsigned char) *fix)].cupper;
|
|
rv2 = pAMgr->lookup(fix);
|
|
if (! rv2) return ns;
|
|
}
|
|
|
|
}
|
|
|
|
if (0) {
|
|
strcpy(buf, prefix);
|
|
strcat(buf, fix);
|
|
wlst[ns] = mystrdup(buf);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
|
|
rv = rv2;
|
|
|
|
} else return ns;
|
|
} else {
|
|
strcpy(fix, "__");
|
|
strcat(fix, rv->word);
|
|
rv = NULL;
|
|
rv2 = pAMgr->lookup(fix);
|
|
if ((rv2) && (rv2->astr) && (ns < maxSug))
|
|
if ((rv2) && (rv2->astr) && (ns < maxSug))
|
|
if (0) {
|
|
char buf2[MAXSWUTF8L];
|
|
|
|
strcpy(buf2, prefix);
|
|
|
|
if (*(rv2->astr) == '-') {
|
|
strcat(buf2, "");
|
|
} else {
|
|
strcat(buf2, "");
|
|
}
|
|
|
|
if (dicstem != 2) {
|
|
wlst[ns] = mystrdup(buf2);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
|
|
if ((dicstem == 2) && (ns < maxSug)) {
|
|
strcpy(buf, word);
|
|
buf[cpdindex] = '\0';
|
|
strcat(buf + cpdindex, buf2);
|
|
|
|
if (pAMgr->get_compound() &&
|
|
(pAMgr->compound_check(buf, strlen(buf),
|
|
0,0,100,0,NULL,0,NULL,NULL,1))) {
|
|
wlst[ns] = mystrdup(buf);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
}
|
|
// many stems
|
|
} else {
|
|
char * str = mystrdup("");
|
|
char * pos = str;
|
|
char * pos2;
|
|
do {
|
|
int suggest = 1;
|
|
pos2 = strchr(pos, '|');
|
|
if (pos2) *pos2 = '\0';
|
|
// ignore `-xxx' suggestion, when exists prefix
|
|
if (*pos == '-') {
|
|
pos++;
|
|
if (*prefix != '\0') suggest = 0;
|
|
}
|
|
// ignore `xxx-' suggestion, when word is not root
|
|
if ((strlen(pos) > 0) && (pos[strlen(pos)-1] == '-')) {
|
|
pos[strlen(pos)-1] = '\0';
|
|
strcpy(buf, prefix);
|
|
strcat(buf, fix + 2);
|
|
if ((dicstem != 0) && (strcmp(buf, word) != 0)) suggest = 0;
|
|
}
|
|
if ((suggest) && (ns < maxSug) && (strlen(pos) > 0)) {
|
|
strcpy(buf, prefix);
|
|
strcat(buf, pos);
|
|
wlst[ns] = mystrdup(buf);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
if (pos2) pos = pos2 + 1;
|
|
} while (pos2);
|
|
free(str);
|
|
}
|
|
}
|
|
} else return ns;
|
|
|
|
}
|
|
|
|
return ns;
|
|
|
|
}
|
|
|
|
// suggest possible stems
|
|
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
|
|
{
|
|
char ** wlst;
|
|
|
|
struct hentry * rv = NULL;
|
|
|
|
char w2[MAXSWUTF8L];
|
|
const char * word = w;
|
|
|
|
// word reversing wrapper for complex prefixes
|
|
if (complexprefixes) {
|
|
strcpy(w2, w);
|
|
if (utf8) reverseword_utf(w2); else reverseword(w2);
|
|
word = w2;
|
|
}
|
|
|
|
int wl = strlen(word);
|
|
|
|
|
|
if (*slst) {
|
|
wlst = *slst;
|
|
} else {
|
|
wlst = (char **) calloc(maxSug, sizeof(char *));
|
|
if (wlst == NULL) return -1;
|
|
}
|
|
|
|
rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
|
|
|
|
// delete dash from end of word
|
|
if (nsug > 0) {
|
|
for (int j=0; j < nsug; j++) {
|
|
if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0';
|
|
}
|
|
}
|
|
|
|
*slst = wlst;
|
|
return nsug;
|
|
}
|
|
|
|
|
|
char * SuggestMgr::suggest_morph(const char * w)
|
|
{
|
|
char result[MAXLNLEN];
|
|
char * r = (char *) result;
|
|
char * st;
|
|
|
|
struct hentry * rv = NULL;
|
|
|
|
*result = '\0';
|
|
|
|
if (! pAMgr) return NULL;
|
|
|
|
char w2[MAXSWUTF8L];
|
|
const char * word = w;
|
|
|
|
// word reversing wrapper for complex prefixes
|
|
if (complexprefixes) {
|
|
strcpy(w2, w);
|
|
if (utf8) reverseword_utf(w2); else reverseword(w2);
|
|
word = w2;
|
|
}
|
|
|
|
rv = pAMgr->lookup(word);
|
|
|
|
while (rv) {
|
|
if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
|
|
TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) ||
|
|
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
|
|
if (rv->description && ((!rv->astr) ||
|
|
!TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen)))
|
|
strcat(result, word);
|
|
if (rv->description) strcat(result, rv->description);
|
|
strcat(result, "\n");
|
|
}
|
|
rv = rv->next_homonym;
|
|
}
|
|
|
|
st = pAMgr->affix_check_morph(word,strlen(word));
|
|
if (st) {
|
|
strcat(result, st);
|
|
free(st);
|
|
}
|
|
|
|
if (pAMgr->get_compound() && (*result == '\0'))
|
|
pAMgr->compound_check_morph(word, strlen(word),
|
|
0, 0, 100, 0,NULL, 0, &r, NULL);
|
|
|
|
return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL;
|
|
}
|
|
|
|
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
|
|
{
|
|
char * p = NULL;
|
|
char ** wlst = (char **) calloc(maxSug, sizeof(char *));
|
|
// we will use only the first suggestion
|
|
for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
|
|
int ns = suggest(&wlst, word, maxSug - 1);
|
|
if (ns == maxSug) {
|
|
p = suggest_morph(wlst[maxSug - 1]);
|
|
free(wlst[maxSug - 1]);
|
|
}
|
|
if (wlst) free(wlst);
|
|
return p;
|
|
}
|
|
|
|
|
|
// generate an n-gram score comparing s1 and s2
|
|
int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
|
|
{
|
|
int nscore = 0;
|
|
int ns;
|
|
int l1;
|
|
int l2;
|
|
|
|
if (utf8) {
|
|
w_char su1[MAXSWL];
|
|
w_char su2[MAXSWL];
|
|
l1 = u8_u16(su1, MAXSWL, s1);
|
|
l2 = u8_u16(su2, MAXSWL, s2);
|
|
if (!l2) return 0;
|
|
// decapitalize dictionary word
|
|
if (complexprefixes) {
|
|
mkallsmall_utf(su2+l2-1, 1, utfconv);
|
|
} else {
|
|
mkallsmall_utf(su2, 1, utfconv);
|
|
}
|
|
for (int j = 1; j <= n; j++) {
|
|
ns = 0;
|
|
for (int i = 0; i <= (l1-j); i++) {
|
|
for (int l = 0; l <= (l2-j); l++) {
|
|
int k;
|
|
for (k = 0; (k < j); k++) {
|
|
w_char * c1 = su1 + i + k;
|
|
w_char * c2 = su2 + l + k;
|
|
if ((c1->l != c2->l) || (c1->h != c2->h)) break;
|
|
}
|
|
if (k == j) {
|
|
ns++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
nscore = nscore + ns;
|
|
if (ns < 2) break;
|
|
}
|
|
} else {
|
|
char t[MAXSWUTF8L];
|
|
l1 = strlen(s1);
|
|
l2 = strlen(s2);
|
|
if (!l2) return 0;
|
|
strcpy(t, s2);
|
|
if (complexprefixes) {
|
|
*(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
|
|
} else {
|
|
mkallsmall(t, csconv);
|
|
/// *t = csconv[((unsigned char)*t)].clower;
|
|
}
|
|
for (int j = 1; j <= n; j++) {
|
|
ns = 0;
|
|
for (int i = 0; i <= (l1-j); i++) {
|
|
char c = *(s1 + i + j);
|
|
*(s1 + i + j) = '\0';
|
|
if (strstr(t,(s1+i))) ns++;
|
|
*(s1 + i + j ) = c;
|
|
}
|
|
nscore = nscore + ns;
|
|
if (ns < 2) break;
|
|
}
|
|
}
|
|
|
|
ns = 0;
|
|
if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
|
|
if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
|
|
return (nscore - ((ns > 0) ? ns : 0));
|
|
}
|
|
|
|
int SuggestMgr::equalfirstletter(char * s1, const char * s2) {
|
|
if (utf8) {
|
|
w_char su1[MAXSWL];
|
|
w_char su2[MAXSWL];
|
|
// decapitalize dictionary word
|
|
if (complexprefixes) {
|
|
int l1 = u8_u16(su1, MAXSWL, s1);
|
|
int l2 = u8_u16(su2, MAXSWL, s2);
|
|
if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
|
|
} else {
|
|
u8_u16(su1, 1, s1);
|
|
u8_u16(su2, 1, s2);
|
|
if (*((short *)su1) == *((short *)su2)) return 1;
|
|
}
|
|
} else {
|
|
if (complexprefixes) {
|
|
int l1 = strlen(s1);
|
|
int l2 = strlen(s2);
|
|
if (*(s2+l1-1) == *(s2+l2-1)) return 1;
|
|
} else {
|
|
if (*s1 == *s2) return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) {
|
|
int num = 0;
|
|
int diff = 0;
|
|
int diffpos[2];
|
|
*is_swap = 0;
|
|
if (utf8) {
|
|
w_char su1[MAXSWL];
|
|
w_char su2[MAXSWL];
|
|
int l1 = u8_u16(su1, MAXSWL, s1);
|
|
int l2 = u8_u16(su2, MAXSWL, s2);
|
|
for (int i = 0; (i < l1) && (i < l2); i++) {
|
|
if (((short *) su1)[i] == ((short *) su2)[i]) {
|
|
num++;
|
|
} else {
|
|
if (diff < 2) diffpos[diff] = i;
|
|
diff++;
|
|
}
|
|
}
|
|
if ((diff == 2) && (l1 == l2) &&
|
|
(((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) &&
|
|
(((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1;
|
|
} else {
|
|
int i;
|
|
for (i = 0; (*(s1+i) != 0) && (*(s2+i) != 0); i++) {
|
|
if (*(s1+i) == *(s2+i)) {
|
|
num++;
|
|
} else {
|
|
if (diff < 2) diffpos[diff] = i;
|
|
diff++;
|
|
}
|
|
}
|
|
if ((diff == 2) && (*(s1+i) == 0) && (*(s2+i) == 0) &&
|
|
(*(s1+diffpos[0]) == *(s2+diffpos[1])) &&
|
|
(*(s1+diffpos[1]) == *(s2+diffpos[0]))) *is_swap = 1;
|
|
}
|
|
return num;
|
|
}
|
|
|
|
int SuggestMgr::mystrlen(const char * word) {
|
|
if (utf8) {
|
|
w_char w[MAXSWL];
|
|
return u8_u16(w, MAXSWL, word);
|
|
} else return strlen(word);
|
|
}
|
|
|
|
// sort in decreasing order of score
|
|
// Sort the first n entries of rsc into decreasing order, applying the same
// moves to rword so that each word stays paired with its score.  Entries
// with equal scores keep their relative order.
void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
{
  for (int next = 1; next < n; next++) {
    // sink entry `next` towards the front while it beats its predecessor
    for (int pos = next; (pos > 0) && (rsc[pos-1] < rsc[pos]); pos--) {
      int score = rsc[pos];
      char * entry = rword[pos];
      rsc[pos] = rsc[pos-1];
      rword[pos] = rword[pos-1];
      rsc[pos-1] = score;
      rword[pos-1] = entry;
    }
  }
}
|
|
|
|
// longest common subsequence
|
|
void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
|
|
int n, m;
|
|
w_char su[MAXSWL];
|
|
w_char su2[MAXSWL];
|
|
char * b;
|
|
char * c;
|
|
int i;
|
|
int j;
|
|
if (utf8) {
|
|
m = u8_u16(su, MAXSWL, s);
|
|
n = u8_u16(su2, MAXSWL, s2);
|
|
} else {
|
|
m = strlen(s);
|
|
n = strlen(s2);
|
|
}
|
|
c = (char *) malloc((m + 1) * (n + 1));
|
|
b = (char *) malloc((m + 1) * (n + 1));
|
|
for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
|
|
for (j = 0; j <= n; j++) c[j] = 0;
|
|
for (i = 1; i <= m; i++) {
|
|
for (j = 1; j <= n; j++) {
|
|
if ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1))
|
|
|| (!utf8) && ((*(s+i-1)) == (*(s2+j-1)))) {
|
|
c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1;
|
|
b[i*(n+1) + j] = LCS_UPLEFT;
|
|
} else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) {
|
|
c[i*(n+1) + j] = c[(i-1)*(n+1) + j];
|
|
b[i*(n+1) + j] = LCS_UP;
|
|
} else {
|
|
c[i*(n+1) + j] = c[i*(n+1) + j-1];
|
|
b[i*(n+1) + j] = LCS_LEFT;
|
|
}
|
|
}
|
|
}
|
|
*result = b;
|
|
free(c);
|
|
*l1 = m;
|
|
*l2 = n;
|
|
}
|
|
|
|
int SuggestMgr::lcslen(const char * s, const char* s2) {
|
|
int m;
|
|
int n;
|
|
int i;
|
|
int j;
|
|
char * result;
|
|
int len = 0;
|
|
lcs(s, s2, &m, &n, &result);
|
|
i = m;
|
|
j = n;
|
|
while ((i != 0) && (j != 0)) {
|
|
if (result[i*(n+1) + j] == LCS_UPLEFT) {
|
|
len++;
|
|
i--;
|
|
j--;
|
|
} else if (result[i*(n+1) + j] == LCS_UP) {
|
|
i--;
|
|
} else j--;
|
|
}
|
|
if (result) free(result);
|
|
return len;
|
|
}
|