This repository has been archived on 2024-04-08. You can view files and clone it, but cannot push or open issues or pull requests.
deb-goldedplus/goldlib/hunspell/suggmgr.cxx

1658 lines
44 KiB
C++
Raw Normal View History

2006-04-05 17:20:12 +00:00
#include "license.hun"
#include "license.mys"
#include <cstdlib>
#include <cctype>
#include <cstring>
#include <cstdio>
#include "suggmgr.hxx"
#if !defined(_MSC_VER)
using namespace std;
#endif
2006-05-14 18:37:26 +00:00
// Build a suggestion manager.
//   tryme - characters to try when building candidate suggestions
//           (may be NULL, in which case no try-characters are used)
//   maxn  - maximum number of suggestions kept in a suggestion list
//   aptr  - affix manager that supplies the encoding and the suggestion
//           options (may be NULL; defaults are used then)
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
                       AffixMgr * aptr)
{
  // register affix manager and check in string of chars to
  // try when building candidate suggestions
  pAMgr = aptr;

  ctryl = 0;
  ctry = NULL;
  ctry_utf = NULL;

  maxSug = maxn;
  nosplitsugs = 0;
  maxngramsugs = MAXNGRAMSUGS;

  utf8 = 0;
  utfconv = NULL;
  csconv = NULL;  // previously left uninitialized when aptr was NULL
  complexprefixes = 0;

  if (pAMgr) {
    // get_encoding() hands back a heap copy that has to be freed here
    char * enc = pAMgr->get_encoding();
    csconv = get_current_cs(enc);
    free(enc);

    nosplitsugs = pAMgr->get_nosplitsugs();
    if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
    utf8 = pAMgr->get_utf8();
    utfconv = pAMgr->get_utf_conv();
    complexprefixes = pAMgr->get_complexprefixes();
  }

  if (tryme) {
    if (utf8) {
      w_char t[MAXSWL];
      int n = u8_u16(t, MAXSWL, tryme);
      // only keep the table when the conversion produced characters and
      // the allocation succeeded; otherwise ctryl stays 0
      if (n > 0) {
        ctry_utf = (w_char *) malloc(n * sizeof(w_char));
        if (ctry_utf) {
          memcpy(ctry_utf, t, n * sizeof(w_char));
          ctryl = n;
        }
      }
    } else {
      ctry = mystrdup(tryme);
      if (ctry) ctryl = strlen(ctry);
    }
  }
}
// Release the try-character tables owned by this object and reset the
// remaining members.
SuggestMgr::~SuggestMgr()
{
  // free(NULL) is a no-op, so the tables can be released unconditionally
  free(ctry);
  free(ctry_utf);

  ctry = NULL;
  ctry_utf = NULL;
  ctryl = 0;

  pAMgr = NULL;
  maxSug = 0;
}
// generate suggestions for a mispelled word
// pass in address of array of char * pointers
int SuggestMgr::suggest(char*** slst, const char * w, int nsug)
{
int nocompoundtwowords = 0;
2006-05-14 18:37:26 +00:00
char ** wlst;
2006-04-05 17:20:12 +00:00
w_char word_utf[MAXSWL];
2006-05-14 18:37:26 +00:00
int wl=0;
2006-04-05 17:20:12 +00:00
char w2[MAXWORDUTF8LEN];
const char * word = w;
// word reversing wrapper for complex prefixes
if (complexprefixes) {
strcpy(w2, w);
if (utf8) reverseword_utf(w2); else reverseword(w2);
word = w2;
}
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
if (*slst) {
wlst = *slst;
} else {
wlst = (char **) malloc(maxSug * sizeof(char *));
if (wlst == NULL) return -1;
for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
}
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
if (utf8) {
wl = u8_u16(word_utf, MAXSWL, word);
}
for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
// perhaps we made a typical fault of spelling
if ((nsug < maxSug) && (nsug > -1))
nsug = replchars(wlst, word, nsug, cpdsuggest);
// perhaps we made chose the wrong char from a related set
if ((nsug < maxSug) && (nsug > -1))
nsug = mapchars(wlst, word, nsug, cpdsuggest);
// did we swap the order of chars by mistake
if ((nsug < maxSug) && (nsug > -1)) {
nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
swapchar(wlst, word, nsug, cpdsuggest);
}
// did we forget to add a char
if ((nsug < maxSug) && (nsug > -1)) {
nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
forgotchar(wlst, word, nsug, cpdsuggest);
}
// did we add a char that should not be there
if ((nsug < maxSug) && (nsug > -1)) {
nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
extrachar(wlst, word, nsug, cpdsuggest);
}
// did we just hit the wrong key in place of a good char
if ((nsug < maxSug) && (nsug > -1)) {
nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
badchar(wlst, word, nsug, cpdsuggest);
}
// only suggest compound words when no other suggestion
if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
// perhaps we forgot to hit space and two words ran together
if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
nsug = twowords(wlst, word, nsug, cpdsuggest);
}
} // repeating ``for'' statement compounding support
if (nsug < 0) {
// we ran out of memory - we should free up as much as possible
for (int i = 0; i < maxSug; i++)
if (wlst[i] != NULL) free(wlst[i]);
free(wlst);
wlst = NULL;
}
*slst = wlst;
return nsug;
}
// Generate suggestions for a word with a typical mistake, using only the
// replacement table, the related-character map and word splitting.
//   slst - address of the suggestion list (array of char * pointers);
//          when *slst is NULL a list with maxSug slots is allocated here
//   w    - the word to correct
//   nsug - number of suggestions already stored in the list
// Returns the new number of suggestions, or -1 after a failure; in that
// case the list is freed and *slst is set to NULL.
int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
{
  int nocompoundtwowords = 0;
  char ** wlst;

  char w2[MAXWORDUTF8LEN];
  const char * word = w;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    // w2 has a fixed size: leave the list untouched rather than overflow it
    if (strlen(w) >= sizeof(w2)) return nsug;
    strcpy(w2, w);
    if (utf8) reverseword_utf(w2); else reverseword(w2);
    word = w2;
  }

  if (*slst) {
    wlst = *slst;
  } else {
    wlst = (char **) malloc(maxSug * sizeof(char *));
    if (wlst == NULL) return -1;
    // the error path below frees every non-NULL slot, so the unused
    // slots must not hold garbage
    for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
  }

  for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {

    // perhaps we made a typical fault of spelling
    if ((nsug < maxSug) && (nsug > -1))
      nsug = replchars(wlst, word, nsug, cpdsuggest);

    // perhaps we chose the wrong char from a related set
    if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))
      nsug = mapchars(wlst, word, nsug, cpdsuggest);

    // only look for compounds when the first pass found nothing
    if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;

    // perhaps we forgot to hit space and two words ran together
    if ((nsug < maxSug) && (nsug > -1) && check_forbidden(word, strlen(word))) {
      nsug = twowords(wlst, word, nsug, cpdsuggest);
    }

  } // repeating ``for'' statement compounding support

  if (nsug < 0) {
    for (int i = 0; i < maxSug; i++)
      if (wlst[i] != NULL) free(wlst[i]);
    free(wlst);
    // do not leave the caller with a pointer to the freed list
    *slst = NULL;
    return -1;
  }

  *slst = wlst;
  return nsug;
}
// suggestions for when chose the wrong char out of a related set
int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
{
time_t timelimit;
int timer;
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
int wl = strlen(word);
if (wl < 2 || ! pAMgr) return ns;
int nummap = pAMgr->get_nummap();
struct mapentry* maptable = pAMgr->get_maptable();
if (maptable==NULL) return ns;
timelimit = time(NULL);
timer = MINTIMER;
if (utf8) {
w_char w[MAXSWL];
int len = u8_u16(w, MAXSWL, word);
ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
} else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
return ns;
}
int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,
const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
{
2006-05-14 18:37:26 +00:00
char c = *(word + i);
2006-04-05 17:20:12 +00:00
if (c == 0) {
int cwrd = 1;
int wl;
for (int m=0; m < ns; m++)
if (strcmp(word,wlst[m]) == 0) cwrd = 0;
2006-05-14 18:37:26 +00:00
if ((cwrd) && (wl = strlen(word)) && (check(word, wl, 0, timer, timelimit) ||
2006-04-05 17:20:12 +00:00
check(word, wl, 1, timer, timelimit))) {
if (ns < maxSug) {
wlst[ns] = mystrdup(word);
if (wlst[ns] == NULL) return -1;
ns++;
}
}
return ns;
2006-05-14 18:37:26 +00:00
}
2006-04-05 17:20:12 +00:00
int in_map = 0;
for (int j = 0; j < nummap; j++) {
if (strchr(maptable[j].set,c) != 0) {
in_map = 1;
char * newword = mystrdup(word);
for (int k = 0; k < maptable[j].len; k++) {
*(newword + i) = *(maptable[j].set + k);
ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);
if (!(*timelimit)) return ns;
}
free(newword);
}
}
if (!in_map) {
i++;
ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);
}
return ns;
}
int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,
2006-05-14 18:37:26 +00:00
const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
2006-04-05 17:20:12 +00:00
{
if (i == len) {
int cwrd = 1;
int wl;
char s[MAXSWUTF8L];
u16_u8(s, MAXSWUTF8L, word, len);
for (int m=0; m < ns; m++)
if (strcmp(s,wlst[m]) == 0) cwrd = 0;
2006-05-14 18:37:26 +00:00
if ((cwrd) && (wl = strlen(s)) && (check(s, wl, 0, timer, timelimit) ||
2006-04-05 17:20:12 +00:00
check(s, wl, 1, timer, timelimit))) {
if (ns < maxSug) {
wlst[ns] = mystrdup(s);
if (wlst[ns] == NULL) return -1;
ns++;
}
}
return ns;
2006-05-14 18:37:26 +00:00
}
2006-04-05 17:20:12 +00:00
int in_map = 0;
unsigned short c = *((unsigned short *) word + i);
for (int j = 0; j < nummap; j++) {
if (flag_bsearch((unsigned short *) maptable[j].set_utf16, c, maptable[j].len)) {
in_map = 1;
for (int k = 0; k < maptable[j].len; k++) {
*(word + i) = *(maptable[j].set_utf16 + k);
ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);
if (!(*timelimit)) return ns;
}
*((unsigned short *) word + i) = c;
}
}
if (!in_map) {
i++;
ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);
}
return ns;
}
// Suggestions for a typical fault of spelling that differs from the
// right form by more than one letter. Every occurrence of every pattern
// of the affix manager's replacement table is replaced by its
// counterpart and the result is checked.
//   wlst - suggestion list, ns - number of suggestions already in it
// Returns the updated suggestion count. On allocation failure the
// suggestions collected so far are freed (and their slots set to NULL,
// so that the caller's own cleanup cannot free them twice) and -1 is
// returned.
int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
{
  char candidate[MAXSWUTF8L];
  int wl = strlen(word);
  if (wl < 2 || ! pAMgr) return ns;

  int numrep = pAMgr->get_numrep();
  struct replentry* reptable = pAMgr->get_reptable();
  if (reptable==NULL) return ns;

  for (int i=0; i < numrep; i++ ) {
    const char * r = word;
    int lenr = strlen(reptable[i].pattern2);
    int lenp = strlen(reptable[i].pattern);
    // strstr() matches an empty pattern at every position, which would
    // let the search pointer run past the end of the word
    if (lenp == 0) continue;

    // search every occurrence of the pattern in the word
    while ((r=strstr(r, reptable[i].pattern)) != NULL) {
      // make sure the result fits before anything is written
      if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
      size_t before = r - word;
      memcpy(candidate, word, before);
      strcpy(candidate + before, reptable[i].pattern2);
      strcpy(candidate + before + lenr, r+lenp);

      int cwrd = 1;
      for (int k=0; k < ns; k++)
        if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
      if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) {
        if (ns < maxSug) {
          wlst[ns] = mystrdup(candidate);
          if (wlst[ns] == NULL) {
            for (int j=0; j<ns; j++) {
              free(wlst[j]);
              wlst[j] = NULL;
            }
            return -1;
          }
          ns++;
        } else return ns;
      }
      r++; // search for the next letter
    }
  }
  return ns;
}
// Perhaps we made a special pattern mistake,
// for example: vacation -> vacacation (doubled `ac').
// Scans for three consecutive positions i with word[i] == word[i-2] and
// tests the word with the two characters before the last of them removed.
//   wlst - suggestion list, ns - number of suggestions already in it
// Returns the updated suggestion count. On allocation failure the
// suggestions collected so far are freed (and their slots set to NULL,
// so that a caller's cleanup cannot free them twice) and -1 is returned.
int SuggestMgr::doubledsyllable(char** wlst, const char * word, int ns, int cpdsuggest)
{
  char candidate[MAXSWUTF8L];
  int state=0;
  int cwrd;

  int wl = strlen(word);
  if (wl < 5 || ! pAMgr) return ns;
  // the word is copied into a fixed-size buffer below
  if ((size_t) wl >= sizeof(candidate)) return ns;

  for (int i=2; i < wl; i++ ) {
    if (word[i]==word[i-2]) {
      state++;
      if (state==3) {
        // drop word[i-1] and word[i]
        strcpy(candidate,word);
        strcpy(candidate+i-1,word+i+1);
        cwrd = 1;
        for (int k=0; k < ns; k++)
          if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
        if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) {
          if (ns < maxSug) {
            wlst[ns] = mystrdup(candidate);
            if (wlst[ns] == NULL) {
              for (int j=0; j<ns; j++) {
                free(wlst[j]);
                wlst[j] = NULL;
              }
              return -1;
            }
            ns++;
          } else return ns;
        }
        state=0;
      }
    } else {
      state=0;
    }
  }
  return ns;
}
// error is wrong char in place of correct one
int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
char tmpc;
char candidate[MAXSWUTF8L];
time_t timelimit = time(NULL);
int timer = MINTIMER;
int wl = strlen(word);
int cwrd;
strcpy(candidate, word);
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
for (int i=0; i < wl; i++) {
tmpc = candidate[i];
for (int j=0; j < ctryl; j++) {
if (ctry[j] == tmpc) continue;
candidate[i] = ctry[j];
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate,wl, cpdsuggest, &timer, &timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
if (!timelimit) return ns;
candidate[i] = tmpc;
}
}
return ns;
}
// error is wrong char in place of correct one
int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
w_char tmpc;
w_char candidate_utf[MAXSWL];
char candidate[MAXSWUTF8L];
int cwrd;
time_t timelimit = time(NULL);
int timer = MINTIMER;
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
memcpy(candidate_utf, word, wl * sizeof(w_char));
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
for (int i=0; i < wl; i++) {
tmpc = candidate_utf[i];
for (int j=0; j < ctryl; j++) {
if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue;
candidate_utf[i] = ctry_utf[j];
cwrd = 1;
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
if (!timelimit) return ns;
candidate_utf[i] = tmpc;
}
}
return ns;
}
2006-05-14 18:37:26 +00:00
// error is word has an extra letter it does not need
2006-04-05 17:20:12 +00:00
int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
char candidate[MAXSWUTF8L];
w_char candidate_utf[MAXSWL];
const w_char * p;
w_char * r;
int cwrd;
if (wl < 2) return ns;
// try omitting one char of word at a time
memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char));
for (p = word, r = candidate_utf; p < word + wl; ) {
cwrd = 1;
2006-05-14 18:37:26 +00:00
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
2006-04-05 17:20:12 +00:00
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
2006-05-14 18:37:26 +00:00
} else return ns;
2006-04-05 17:20:12 +00:00
}
*r++ = *p++;
}
return ns;
}
2006-05-14 18:37:26 +00:00
// error is word has an extra letter it does not need
2006-04-05 17:20:12 +00:00
int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
{
char candidate[MAXSWUTF8L];
const char * p;
char * r;
int cwrd;
int wl = strlen(word);
if (wl < 2) return ns;
// try omitting one char of word at a time
strcpy (candidate, word + 1);
for (p = word, r = candidate; *p != 0; ) {
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate,wl-1, cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
2006-05-14 18:37:26 +00:00
} else return ns;
2006-04-05 17:20:12 +00:00
}
*r++ = *p++;
}
return ns;
}
// error is missing a letter it needs
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
char candidate[MAXSWUTF8L];
const char * p;
char * q;
int cwrd;
time_t timelimit = time(NULL);
int timer = MINTIMER;
int wl = strlen(word);
// try inserting a tryme character before every letter
strcpy(candidate + 1, word);
for (p = word, q = candidate; *p != 0; ) {
for (int i = 0; i < ctryl; i++) {
*q = ctry[i];
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate, wl+1, cpdsuggest, &timer, &timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
2006-05-14 18:37:26 +00:00
} else return ns;
2006-04-05 17:20:12 +00:00
}
if (!timelimit) return ns;
}
*q++ = *p++;
}
// now try adding one to end */
for (int i = 0; i < ctryl; i++) {
*q = ctry[i];
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate,wl+1, cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
}
return ns;
}
// error is missing a letter it needs
int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
w_char candidate_utf[MAXSWL];
char candidate[MAXSWUTF8L];
const w_char * p;
w_char * q;
int cwrd;
time_t timelimit = time(NULL);
int timer = MINTIMER;
// try inserting a tryme character before every letter
memcpy (candidate_utf + 1, word, wl * sizeof(w_char));
for (p = word, q = candidate_utf; p < (word + wl); ) {
for (int i = 0; i < ctryl; i++) {
*q = ctry_utf[i];
cwrd = 1;
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
2006-05-14 18:37:26 +00:00
} else return ns;
2006-04-05 17:20:12 +00:00
}
if (!timelimit) return ns;
}
*q++ = *p++;
}
// now try adding one to end */
for (int i = 0; i < ctryl; i++) {
*q = ctry_utf[i];
cwrd = 1;
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
}
return ns;
}
/* error is should have been two words */
int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
{
char candidate[MAXSWUTF8L];
char * p;
int c1, c2, cwrd;
int forbidden = 0;
int wl=strlen(word);
if (wl < 3) return ns;
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);
strcpy(candidate + 1, word);
// split the string into two pieces after every char
// if both pieces are good words make them a suggestion
for (p = candidate + 1; p[1] != '\0'; p++) {
p[-1] = *p;
// go to end of the UTF-8 character
while (utf8 && ((p[1] & 0xc0) == 0x80)) {
p++;
p[-1] = *p;
}
*p = '\0';
if ((c1=check(candidate,strlen(candidate), cpdsuggest, NULL, NULL))) {
if ((c2=check((p+1),strlen(p+1), cpdsuggest, NULL, NULL))) {
*p = ' ';
// spec. Hungarian code (need a better compound word support)
if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&
// if 3 repeating letter, use - instead of space
(((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
// or multiple compounding, with more, than 6 syllables
((c1 == 3) && (c2 >= 2)))) *p = '-';
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if (ns < maxSug) {
if (cwrd) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
}
} else return ns;
}
}
}
return ns;
}
// Error is that adjacent letters were swapped (8-bit encodings).
// Each pair of neighbouring characters is exchanged in turn and the
// result is checked.
//   wlst - suggestion list, ns - number of suggestions already in it
// Returns the updated suggestion count, or -1 when memory ran out.
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
  char candidate[MAXSWUTF8L];
  char * p;
  char tmpc;
  int cwrd;

  int wl=strlen(word);
  // nothing to swap; for an empty word the loop test below would also
  // read candidate[1], which strcpy() never wrote
  if (wl < 2) return ns;

  // try swapping adjacent chars one by one
  strcpy(candidate, word);
  for (p = candidate; p[1] != 0; p++) {
    tmpc = *p;
    *p = p[1];
    p[1] = tmpc;
    cwrd = 1;
    for (int k=0; k < ns; k++)
      if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
    if ((cwrd) && check(candidate,wl, cpdsuggest, NULL, NULL)) {
      if (ns < maxSug) {
        wlst[ns] = mystrdup(candidate);
        if (wlst[ns] == NULL) return -1;
        ns++;
      } else return ns;
    }
    // undo the swap before moving on
    tmpc = *p;
    *p = p[1];
    p[1] = tmpc;
  }
  return ns;
}
// error is adjacent letter were swapped
int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
w_char candidate_utf[MAXSWL];
char candidate[MAXSWUTF8L];
w_char * p;
w_char tmpc;
int cwrd;
// try swapping adjacent chars one by one
memcpy (candidate_utf, word, wl * sizeof(w_char));
for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
tmpc = *p;
*p = p[1];
p[1] = tmpc;
cwrd = 1;
u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
tmpc = *p;
*p = p[1];
p[1] = tmpc;
}
return ns;
}
// generate a set of suggestions for very poorly spelled words
int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr)
{
int i, j;
int lval;
int sc;
int lp;
if (! pHMgr) return 0;
// exhaustively search through all root words
// keeping track of the MAX_ROOTS most similar root words
struct hentry * roots[MAX_ROOTS];
int scores[MAX_ROOTS];
for (i = 0; i < MAX_ROOTS; i++) {
roots[i] = NULL;
scores[i] = -100 * i;
}
lp = MAX_ROOTS - 1;
char w2[MAXWORDUTF8LEN];
char * word = w;
// word reversing wrapper for complex prefixes
if (complexprefixes) {
strcpy(w2, w);
if (utf8) reverseword_utf(w2); else reverseword(w2);
word = w2;
}
char mw[MAXSWUTF8L];
w_char u8[MAXSWL];
int nc = strlen(word);
int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
struct hentry* hp = NULL;
int col = -1;
while ((hp = pHMgr->walk_hashtable(col, hp))) {
// check forbidden words
2006-05-14 18:37:26 +00:00
if ((hp->astr) && (pAMgr) &&
2006-04-05 17:20:12 +00:00
(TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||
TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||
TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;
sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
if (sc > scores[lp]) {
2006-05-14 18:37:26 +00:00
scores[lp] = sc;
2006-04-05 17:20:12 +00:00
roots[lp] = hp;
int lval = sc;
for (j=0; j < MAX_ROOTS; j++)
if (scores[j] < lval) {
lp = j;
lval = scores[j];
}
2006-05-14 18:37:26 +00:00
}
2006-04-05 17:20:12 +00:00
}
// find minimum threshhold for a passable suggestion
// mangle original word three differnt ways
// and score them to generate a minimum acceptable score
int thresh = 0;
for (int sp = 1; sp < 4; sp++) {
if (utf8) {
for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
u16_u8(mw, MAXSWUTF8L, u8, n);
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
} else {
strcpy(mw, word);
for (int k=sp; k < n; k+=4) *(mw + k) = '*';
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
}
}
thresh = thresh / 3;
thresh--;
// now expand affixes on each of these root words and
// and use length adjusted ngram scores to select
// possible suggestions
char * guess[MAX_GUESS];
int gscore[MAX_GUESS];
for(i=0;i<MAX_GUESS;i++) {
guess[i] = NULL;
gscore[i] = -100 * i;
}
lp = MAX_GUESS - 1;
struct guessword * glst;
glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
if (! glst) return 0;
for (i = 0; i < MAX_ROOTS; i++) {
if (roots[i]) {
struct hentry * rp = roots[i];
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
rp->astr, rp->alen, word, nc);
for (int k = 0; k < nw ; k++) {
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
if ((sc > thresh)) {
if (sc > gscore[lp]) {
if (guess[lp]) free (guess[lp]);
gscore[lp] = sc;
guess[lp] = glst[k].word;
lval = sc;
for (j=0; j < MAX_GUESS; j++)
if (gscore[j] < lval) {
lp = j;
lval = gscore[j];
}
2006-05-14 18:37:26 +00:00
} else free (glst[k].word);
2006-04-05 17:20:12 +00:00
} else free(glst[k].word);
}
}
}
free(glst);
// now we are done generating guesses
// sort in order of decreasing score
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
// weight suggestions with a similarity index, based on
// the longest common subsequent algorithm and resort
int is_swap;
for (i=0; i < MAX_GUESS; i++) {
if (guess[i]) {
// lowering guess[i]
char gl[MAXSWUTF8L];
int len;
if (utf8) {
w_char w[MAXSWL];
len = u8_u16(w, MAXSWL, guess[i]);
mkallsmall_utf(w, len, utfconv);
u16_u8(gl, MAXSWUTF8L, w, len);
} else {
strcpy(gl, guess[i]);
mkallsmall(gl, csconv);
len = strlen(guess[i]);
}
int lcs = lcslen(word, gl);
// same characters with different casing
if ((n == len) && (n == lcs)) {
gscore[i] += 2000;
break;
}
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
// heuristic weigthing of ngram scores
gscore[i] +=
// length of longest common subsequent minus lenght difference
2 * lcs - abs((int) (n - len)) +
// weight equal first letter
equalfirstletter(word, gl) +
// weight equal character positions
((lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +
// swap character (not neighboring)
((is_swap) ? 1000 : 0);
}
}
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
// copy over
int ns = 0;
int same = 0;
for (i=0; i < MAX_GUESS; i++) {
if (guess[i]) {
if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
int unique = 1;
// we have excellent suggestion(s)
if (gscore[i] > 1000) same = 1;
for (j=0; j < ns; j++)
// don't suggest previous suggestions or a previous suggestion with prefixes or affixes
2006-05-14 18:37:26 +00:00
if (strstr(guess[i], wlst[j]) ||
2006-04-05 17:20:12 +00:00
// check forbidden words
!check(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
if (unique) wlst[ns++] = guess[i]; else free(guess[i]);
} else free(guess[i]);
}
}
return ns;
}
// see if a candidate suggestion is spelled correctly
// needs to check both root words and words with affixes
// obsolote MySpell-HU modifications:
// return value 2 and 3 marks compounding with hyphen (-)
// `3' marks roots without suffix
int SuggestMgr::check(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit)
{
struct hentry * rv=NULL;
int nosuffix = 0;
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
// check time limit
if (timer) {
(*timer)--;
if (!(*timer) && timelimit) {
if (time(NULL) > *timelimit) {
*timelimit = 0;
return 0;
}
*timer = MAXPLUSTIMER;
}
}
2006-05-14 18:37:26 +00:00
if (pAMgr) {
2006-04-05 17:20:12 +00:00
if (cpdsuggest==1) {
if (pAMgr->get_compound()) {
rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1);
if (rv) return 3; // XXX obsolote categorisation
}
return 0;
}
rv = pAMgr->lookup(word);
if (rv) {
if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
} else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
if (rv) {
nosuffix=1;
} else {
rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
}
if (!rv && pAMgr->have_contclass()) {
rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
}
// check forbidden words
if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
|| TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
2006-05-14 18:37:26 +00:00
if (rv) { // XXX obsolote
if ((pAMgr->get_compoundflag()) &&
TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
2006-04-05 17:20:12 +00:00
return 1;
}
}
return 0;
}
int SuggestMgr::check_forbidden(const char * word, int len)
{
struct hentry * rv = NULL;
2006-05-14 18:37:26 +00:00
if (pAMgr) {
2006-04-05 17:20:12 +00:00
rv = pAMgr->lookup(word);
if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
if (!(pAMgr->prefix_check(word,len,1)))
rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
// check forbidden words
if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1;
}
return 0;
}
// Suggest stems, XXX experimental code.
//   slst - address of the suggestion list (array of char * pointers);
//          when *slst is NULL a zeroed list with maxSug slots is
//          allocated here
//   w    - the word to stem
//   nsug - number of suggestions already stored in the list
// First the whole word is handed to fixstems(); when that adds nothing
// and the word contains a '-', the parts before and after the last
// hyphen are tried separately.
// Returns the new number of suggestions, or -1 after a failure; in that
// case the list is freed and *slst is set to NULL.
int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug)
{
  char buf[MAXSWUTF8L];
  char ** wlst;
  int prevnsug = nsug;

  char w2[MAXWORDUTF8LEN];
  const char * word = w;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    // w2 has a fixed size: leave the list untouched rather than overflow it
    if (strlen(w) >= sizeof(w2)) return nsug;
    strcpy(w2, w);
    if (utf8) reverseword_utf(w2); else reverseword(w2);
    word = w2;
  }

  if (*slst) {
    wlst = *slst;
  } else {
    wlst = (char **) calloc(maxSug, sizeof(char *));
    if (wlst == NULL) return -1;
  }

  // perhaps there is a fix stem in the dictionary
  if ((nsug < maxSug) && (nsug > -1)) {

    nsug = fixstems(wlst, word, nsug);
    if (nsug == prevnsug) {
      char * s = mystrdup(word);
      if (s == NULL) {
        nsug = -1;
      } else {
        // find the last hyphen
        char * p = s + strlen(s);
        while ((*p != '-') && (p != s)) p--;
        if (*p == '-') {
          *p = '\0';
          nsug = fixstems(wlst, s, nsug);
          if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) {
            char * t;
            buf[0] = '\0';
            // NOTE(review): with `||' this test is true for every
            // character, so t always stops at the terminator and "# "
            // is never prepended; `&&' looks intended, but the wanted
            // output is unclear, so the behaviour is kept unchanged
            for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number?
            if (*t != '\0') strcpy(buf, "# ");
            strcat(buf, s);
            wlst[nsug] = mystrdup(buf);
            // fall through to the common cleanup instead of returning
            // here, which leaked s and the list
            if (wlst[nsug] == NULL) nsug = -1; else nsug++;
          }
          // a negative count must not be handed on as a list index
          if (nsug >= 0) {
            p++;
            nsug = fixstems(wlst, p, nsug);
          }
        }
        free(s);
      }
    }
  }

  if (nsug < 0) {
    for (int i = 0; i < maxSug; i++)
      if (wlst[i] != NULL) free(wlst[i]);
    free(wlst);
    // do not leave the caller with a pointer to the freed list
    *slst = NULL;
    return -1;
  }

  *slst = wlst;
  return nsug;
}
// there are fix stems in dictionary
int SuggestMgr::fixstems(char ** wlst, const char * word, int ns)
{
char fix[MAXSWUTF8L];
char buf[MAXSWUTF8L];
char prefix[MAXSWUTF8L] = "";
char * p;
int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound
int cpdindex = 0;
struct hentry * rv = NULL;
struct hentry * rv2 = NULL;
int wl = strlen(word);
int cmpdstemnum;
int cmpdstem[MAXCOMPOUND];
2006-05-14 18:37:26 +00:00
if (pAMgr) {
2006-04-05 17:20:12 +00:00
rv = pAMgr->lookup(word);
if (rv) {
dicstem = 0;
} else {
2006-05-14 18:37:26 +00:00
// try stripping off affixes
2006-04-05 17:20:12 +00:00
rv = pAMgr->affix_check(word, wl);
// else try check compound word
if (!rv && pAMgr->get_compound()) {
rv = pAMgr->compound_check(word, wl,
0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1);
if (rv) {
dicstem = 2;
for (int j = 0; j < cmpdstemnum; j++) {
cpdindex += cmpdstem[j];
}
if(! (pAMgr->lookup(word + cpdindex)))
pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix
}
}
if (pAMgr->get_prefix()) {
strcpy(prefix, pAMgr->get_prefix());
}
// XXX obsolote, will be a general solution for stemming
2006-05-14 18:37:26 +00:00
if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU)
2006-04-05 17:20:12 +00:00
}
}
if ((rv) && (ns < maxSug)) {
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
// check fixstem flag and not_valid_stem flag
// first word
2006-05-14 18:37:26 +00:00
if ((ns < maxSug) && (dicstem < 2)) {
2006-04-05 17:20:12 +00:00
strcpy(buf, prefix);
if ((dicstem > 0) && pAMgr->get_derived()) {
// XXX obsolote
if (strlen(prefix) == 1) {
strcat(buf, (pAMgr->get_derived()) + 1);
} else {
strcat(buf, pAMgr->get_derived());
}
} else {
// special stem in affix description
const char * wordchars = pAMgr->get_wordchars();
2006-05-14 18:37:26 +00:00
if (rv->description &&
2006-04-05 17:20:12 +00:00
(strchr(wordchars, *(rv->description)))) {
char * desc = (rv->description) + 1;
while (strchr(wordchars, *desc)) desc++;
strncat(buf, rv->description, desc - (rv->description));
} else {
strcat(buf, rv->word);
}
}
wlst[ns] = mystrdup(buf);
if (wlst[ns] == NULL) return -1;
ns++;
}
if (dicstem == 2) {
// compound stem
// if (rv->astr && (strchr(rv->astr, '0') == NULL)) {
if (rv->astr) {
strcpy(buf, word);
buf[cpdindex] = '\0';
if (prefix) strcat(buf, prefix);
if (pAMgr->get_derived()) {
strcat(buf, pAMgr->get_derived());
} else {
// special stem in affix description
const char * wordchars = pAMgr->get_wordchars();
2006-05-14 18:37:26 +00:00
if (rv->description &&
2006-04-05 17:20:12 +00:00
(strchr(wordchars, *(rv->description)))) {
char * desc = (rv->description) + 1;
while (strchr(wordchars, *desc)) desc++;
strncat(buf, rv->description, desc - (rv->description));
} else {
strcat(buf, rv->word);
}
}
if (ns < maxSug) {
wlst[ns] = mystrdup(buf);
if (wlst[ns] == NULL) return -1;
ns++;
}
}
}
}
while (rv) {
if (0) { // obsolote
if ((p[1] > '0') && (p[1] <= '9')) {
if ((ns < maxSug) && (dicstem != 2)) {
int split = p[1] - '0';
if (rv->wlen <= split) break;
strcpy(fix, rv->word);
// checking verbs ending with `ik'
fix[rv->wlen - split] = 'i';
fix[rv->wlen - split + 1] = 'k';
fix[rv->wlen - split + 2] = '\0';
if (! (rv2 = pAMgr->lookup(fix))) {
fix[strlen(fix) - 2] = '\0';
rv2 = pAMgr->lookup(fix);
if ((!rv2)) {
*fix = csconv[((unsigned char) *fix)].cupper;
rv2 = pAMgr->lookup(fix);
if (! rv2) return ns;
}
}
if (0) {
strcpy(buf, prefix);
strcat(buf, fix);
wlst[ns] = mystrdup(buf);
if (wlst[ns] == NULL) return -1;
ns++;
}
rv = rv2;
2006-05-14 18:37:26 +00:00
} else return ns;
2006-04-05 17:20:12 +00:00
} else {
strcpy(fix, "__");
strcat(fix, rv->word);
rv = NULL;
rv2 = pAMgr->lookup(fix);
2006-05-14 18:37:26 +00:00
if ((rv2) && (rv2->astr) && (ns < maxSug))
if ((rv2) && (rv2->astr) && (ns < maxSug))
2006-04-05 17:20:12 +00:00
if (0) {
char buf2[MAXSWUTF8L];
strcpy(buf2, prefix);
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
if (*(rv2->astr) == '-') {
strcat(buf2, "");
} else {
strcat(buf2, "");
}
if (dicstem != 2) {
wlst[ns] = mystrdup(buf2);
if (wlst[ns] == NULL) return -1;
ns++;
}
if ((dicstem == 2) && (ns < maxSug)) {
strcpy(buf, word);
buf[cpdindex] = '\0';
strcat(buf + cpdindex, buf2);
if (pAMgr->get_compound() &&
(pAMgr->compound_check(buf, strlen(buf),
0,0,100,0,NULL,0,NULL,NULL,1))) {
wlst[ns] = mystrdup(buf);
if (wlst[ns] == NULL) return -1;
ns++;
}
}
// many stems
} else {
char * str = mystrdup("");
char * pos = str;
char * pos2;
do {
int suggest = 1;
pos2 = strchr(pos, '|');
if (pos2) *pos2 = '\0';
// ignore `-xxx' suggestion, when exists prefix
if (*pos == '-') {
pos++;
if (*prefix != '\0') suggest = 0;
}
// ignore `xxx-' suggestion, when word is not root
if ((strlen(pos) > 0) && (pos[strlen(pos)-1] == '-')) {
pos[strlen(pos)-1] = '\0';
strcpy(buf, prefix);
strcat(buf, fix + 2);
if ((dicstem != 0) && (strcmp(buf, word) != 0)) suggest = 0;
}
if ((suggest) && (ns < maxSug) && (strlen(pos) > 0)) {
strcpy(buf, prefix);
strcat(buf, pos);
wlst[ns] = mystrdup(buf);
if (wlst[ns] == NULL) return -1;
ns++;
}
if (pos2) pos = pos2 + 1;
} while (pos2);
free(str);
}
}
} else return ns;
}
return ns;
}
// suggest possible stems
//
// Hands the word to the affix manager's suffix_check() together with the
// suggestion list, then removes one trailing '-' from every list entry.
//
//   slst  in/out: address of the suggestion list. When *slst is NULL a list
//         with maxSug slots is allocated here; the list is always handed
//         back through *slst on a non-negative return.
//   w     word to analyse; it is reversed first when complexprefixes is set
//   nsug  number of entries already in the list
//
// Returns the entry count reported back by suffix_check(), or -1 when the
// list cannot be allocated. When there is no affix manager, or the word does
// not fit the reversal buffer, the list is returned untouched with the
// incoming count.
int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
{
    char ** wlst;
    char w2[MAXSWUTF8L];
    const char * word = w;

    if (*slst) {
        wlst = *slst;
    } else {
        wlst = (char **) calloc(maxSug, sizeof(char *));
        if (wlst == NULL) return -1;
    }

    // nothing can be analysed without an affix manager, and a word that does
    // not fit w2 must not be copied into it
    if ((! pAMgr) || (complexprefixes && (strlen(w) >= sizeof(w2)))) {
        *slst = wlst;
        return nsug;
    }

    // word reversing wrapper for complex prefixes
    if (complexprefixes) {
        strcpy(w2, w);
        if (utf8) reverseword_utf(w2); else reverseword(w2);
        word = w2;
    }

    int wl = strlen(word);

    // the returned entry is not needed here; results arrive via wlst / nsug
    pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);

    // delete dash from end of word
    for (int j = 0; j < nsug; j++) {
        size_t len = strlen(wlst[j]);
        // an empty entry has no last character to inspect
        if ((len > 0) && (wlst[j][len - 1] == '-')) wlst[j][len - 1] = '\0';
    }

    *slst = wlst;
    return nsug;
}
char * SuggestMgr::suggest_morph(const char * w)
{
char result[MAXLNLEN];
char * r = (char *) result;
char * st;
struct hentry * rv = NULL;
*result = '\0';
if (! pAMgr) return NULL;
char w2[MAXSWUTF8L];
const char * word = w;
// word reversing wrapper for complex prefixes
if (complexprefixes) {
strcpy(w2, w);
if (utf8) reverseword_utf(w2); else reverseword(w2);
word = w2;
}
rv = pAMgr->lookup(word);
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
while (rv) {
if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) ||
TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
2006-05-14 18:37:26 +00:00
if (rv->description && ((!rv->astr) ||
2006-04-05 17:20:12 +00:00
!TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen)))
strcat(result, word);
if (rv->description) strcat(result, rv->description);
strcat(result, "\n");
}
rv = rv->next_homonym;
}
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
st = pAMgr->affix_check_morph(word,strlen(word));
if (st) {
strcat(result, st);
free(st);
}
if (pAMgr->get_compound() && (*result == '\0'))
pAMgr->compound_check_morph(word, strlen(word),
0, 0, 100, 0,NULL, 0, &r, NULL);
2006-05-14 18:37:26 +00:00
2006-04-05 17:20:12 +00:00
return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL;
}
char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
{
char * p = NULL;
char ** wlst = (char **) calloc(maxSug, sizeof(char *));
// we will use only the first suggestion
2018-07-14 13:22:48 +00:00
for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
2006-04-05 17:20:12 +00:00
int ns = suggest(&wlst, word, maxSug - 1);
if (ns == maxSug) {
p = suggest_morph(wlst[maxSug - 1]);
free(wlst[maxSug - 1]);
}
if (wlst) free(wlst);
2006-05-14 18:37:26 +00:00
return p;
2006-04-05 17:20:12 +00:00
}
// generate an n-gram score comparing s1 and s2
int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
{
int nscore = 0;
int ns;
int l1;
int l2;
if (utf8) {
w_char su1[MAXSWL];
w_char su2[MAXSWL];
l1 = u8_u16(su1, MAXSWL, s1);
l2 = u8_u16(su2, MAXSWL, s2);
if (!l2) return 0;
// decapitalize dictionary word
if (complexprefixes) {
mkallsmall_utf(su2+l2-1, 1, utfconv);
} else {
mkallsmall_utf(su2, 1, utfconv);
}
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1-j); i++) {
for (int l = 0; l <= (l2-j); l++) {
int k;
for (k = 0; (k < j); k++) {
w_char * c1 = su1 + i + k;
w_char * c2 = su2 + l + k;
if ((c1->l != c2->l) || (c1->h != c2->h)) break;
}
if (k == j) {
ns++;
break;
}
}
}
nscore = nscore + ns;
if (ns < 2) break;
}
2006-05-14 18:37:26 +00:00
} else {
2006-04-05 17:20:12 +00:00
char t[MAXSWUTF8L];
l1 = strlen(s1);
l2 = strlen(s2);
if (!l2) return 0;
strcpy(t, s2);
if (complexprefixes) {
*(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
} else {
mkallsmall(t, csconv);
/// *t = csconv[((unsigned char)*t)].clower;
}
for (int j = 1; j <= n; j++) {
ns = 0;
for (int i = 0; i <= (l1-j); i++) {
char c = *(s1 + i + j);
*(s1 + i + j) = '\0';
if (strstr(t,(s1+i))) ns++;
*(s1 + i + j ) = c;
}
nscore = nscore + ns;
if (ns < 2) break;
}
}
ns = 0;
if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
return (nscore - ((ns > 0) ? ns : 0));
}
// Report whether s1 and s2 begin with the same character. When
// complexprefixes is set the last characters are compared instead.
//
// In UTF-8 mode the comparison is made on the 16-bit units produced by
// u8_u16(), otherwise on single bytes.
//
// Returns 1 when the compared characters are equal, 0 otherwise. In
// complexprefixes mode an empty string never matches.
int SuggestMgr::equalfirstletter(char * s1, const char * s2) {
    if (utf8) {
        w_char su1[MAXSWL];
        w_char su2[MAXSWL];
        if (complexprefixes) {
            int l1 = u8_u16(su1, MAXSWL, s1);
            int l2 = u8_u16(su2, MAXSWL, s2);
            // no last character to look at
            if ((l1 < 1) || (l2 < 1)) return 0;
            if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
        } else {
            // pre-clear the slots so that an empty input compares as 0
            // instead of leaving the slot uninitialised
            su1[0].l = 0;
            su1[0].h = 0;
            su2[0].l = 0;
            su2[0].h = 0;
            u8_u16(su1, 1, s1);
            u8_u16(su2, 1, s2);
            if (*((short *)su1) == *((short *)su2)) return 1;
        }
    } else {
        if (complexprefixes) {
            int l1 = strlen(s1);
            int l2 = strlen(s2);
            // no last character to look at
            if ((l1 < 1) || (l2 < 1)) return 0;
            // last byte of s1 against last byte of s2
            if (*(s1+l1-1) == *(s2+l2-1)) return 1;
        } else {
            if (*s1 == *s2) return 1;
        }
    }
    return 0;
}
int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) {
int num = 0;
int diff = 0;
int diffpos[2];
*is_swap = 0;
if (utf8) {
w_char su1[MAXSWL];
w_char su2[MAXSWL];
int l1 = u8_u16(su1, MAXSWL, s1);
int l2 = u8_u16(su2, MAXSWL, s2);
for (int i = 0; (i < l1) && (i < l2); i++) {
if (((short *) su1)[i] == ((short *) su2)[i]) {
num++;
} else {
if (diff < 2) diffpos[diff] = i;
diff++;
}
}
if ((diff == 2) && (l1 == l2) &&
(((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) &&
(((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1;
} else {
int i;
for (i = 0; (*(s1+i) != 0) && (*(s2+i) != 0); i++) {
if (*(s1+i) == *(s2+i)) {
num++;
} else {
if (diff < 2) diffpos[diff] = i;
diff++;
}
}
if ((diff == 2) && (*(s1+i) == 0) && (*(s2+i) == 0) &&
(*(s1+diffpos[0]) == *(s2+diffpos[1])) &&
(*(s1+diffpos[1]) == *(s2+diffpos[0]))) *is_swap = 1;
}
return num;
}
int SuggestMgr::mystrlen(const char * word) {
if (utf8) {
w_char w[MAXSWL];
return u8_u16(w, MAXSWL, word);
} else return strlen(word);
}
// sort in decreasing order of score
//
// Reorders the parallel arrays rword / rsc (n entries each) in place so that
// the scores run from highest to lowest; each word stays paired with its
// score. An entry only moves ahead of entries with a strictly lower score, so
// entries with equal scores keep their relative order.
void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
{
    for (int top = 1; top < n; top++) {
        // lift the entry out, shift lower-scored entries down, drop it in
        const int score = rsc[top];
        char * const text = rword[top];
        int slot = top;

        for (; (slot > 0) && (rsc[slot-1] < score); slot--) {
            rsc[slot] = rsc[slot-1];
            rword[slot] = rword[slot-1];
        }

        rsc[slot] = score;
        rword[slot] = text;
    }
}
// longest common subsequence
//
// Builds the direction table for the longest common subsequence of s and s2.
// In UTF-8 mode the 16-bit units produced by u8_u16() are compared,
// otherwise single bytes.
//
//   l1, l2  out: the lengths m and n used for the table
//   result  out: table of (m + 1) * (n + 1) bytes; the cell for row i,
//           column j (both starting at 1) is at index i * (n + 1) + j and
//           holds LCS_UPLEFT, LCS_UP or LCS_LEFT. Row 0 and column 0 are not
//           written. The table comes from malloc(); lcslen() releases it
//           with free().
//
// When memory cannot be allocated, *result is set to NULL and both lengths
// are reported as 0, so a caller walking back from (*l1, *l2) touches
// nothing.
void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
    int n, m;
    w_char su[MAXSWL];
    w_char su2[MAXSWL];
    char * b;   // direction table, handed to the caller
    char * c;   // subsequence lengths, local scratch
    int i;
    int j;

    if (utf8) {
        m = u8_u16(su, MAXSWL, s);
        n = u8_u16(su2, MAXSWL, s2);
    } else {
        m = strlen(s);
        n = strlen(s2);
    }

    c = (char *) malloc((m + 1) * (n + 1));
    b = (char *) malloc((m + 1) * (n + 1));
    if ((c == NULL) || (b == NULL)) {
        if (c) free(c);
        if (b) free(b);
        *result = NULL;
        *l1 = 0;
        *l2 = 0;
        return;
    }

    // first column and first row: nothing in common with an empty prefix
    for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
    for (j = 0; j <= n; j++) c[j] = 0;

    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            if (((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)))
                || ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) {
                c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1;
                b[i*(n+1) + j] = LCS_UPLEFT;
            } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) {
                c[i*(n+1) + j] = c[(i-1)*(n+1) + j];
                b[i*(n+1) + j] = LCS_UP;
            } else {
                c[i*(n+1) + j] = c[i*(n+1) + j-1];
                b[i*(n+1) + j] = LCS_LEFT;
            }
        }
    }

    *result = b;
    free(c);
    *l1 = m;
    *l2 = n;
}
// Length of the longest common subsequence of s and s2.
//
// Obtains the direction table from lcs() and walks it back from the last
// cell, counting one character for every LCS_UPLEFT step, until the first
// row or the first column is reached. The table is released with free()
// before returning.
int SuggestMgr::lcslen(const char * s, const char* s2) {
    int rows;
    int cols;
    char * dirs;

    lcs(s, s2, &rows, &cols, &dirs);

    const int stride = cols + 1;   // cells per table row
    int r = rows;
    int k = cols;
    int common = 0;

    while ((r != 0) && (k != 0)) {
        const char step = dirs[r * stride + k];
        if (step == LCS_UPLEFT) {
            common++;
            r--;
            k--;
        } else if (step == LCS_UP) {
            r--;
        } else {
            k--;
        }
    }

    if (dirs) free(dirs);
    return common;
}