540 lines
13 KiB
C++
540 lines
13 KiB
C++
#include "license.readme"
|
|
|
|
#include <cstdlib>
|
|
#include <cctype>
|
|
#include <cstring>
|
|
#include <cstdio>
|
|
|
|
#include "suggestmgr.hxx"
|
|
|
|
#if !defined(_MSC_VER)
|
|
using namespace std;
|
|
#endif
|
|
|
|
extern char * mystrdup(const char *);
|
|
|
|
|
|
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
|
|
AffixMgr * aptr)
|
|
{
|
|
|
|
// register affix manager and check in string of chars to
|
|
// try when building candidate suggestions
|
|
pAMgr = aptr;
|
|
ctry = mystrdup(tryme);
|
|
ctryl = 0;
|
|
if (ctry)
|
|
ctryl = strlen(ctry);
|
|
maxSug = maxn;
|
|
nosplitsugs=(0==1);
|
|
if (pAMgr) pAMgr->get_nosplitsugs();
|
|
}
|
|
|
|
|
|
SuggestMgr::~SuggestMgr()
|
|
{
|
|
pAMgr = NULL;
|
|
if (ctry) free(ctry);
|
|
ctry = NULL;
|
|
ctryl = 0;
|
|
maxSug = 0;
|
|
}
|
|
|
|
|
|
|
|
// generate suggestions for a mispelled word
|
|
// pass in address of array of char * pointers
|
|
|
|
int SuggestMgr::suggest(char** wlst, int ns, const char * word)
|
|
{
|
|
|
|
int nsug = ns;
|
|
|
|
// perhaps we made chose the wrong char from a related set
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = mapchars(wlst, word, nsug);
|
|
|
|
// perhaps we made a typical fault of spelling
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = replchars(wlst, word, nsug);
|
|
|
|
// did we forget to add a char
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = forgotchar(wlst, word, nsug);
|
|
|
|
// did we swap the order of chars by mistake
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = swapchar(wlst, word, nsug);
|
|
|
|
// did we add a char that should not be there
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = extrachar(wlst, word, nsug);
|
|
|
|
// did we just hit the wrong key in place of a good char
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = badchar(wlst, word, nsug);
|
|
|
|
// perhaps we forgot to hit space and two words ran together
|
|
if (!nosplitsugs) {
|
|
if ((nsug < maxSug) && (nsug > -1))
|
|
nsug = twowords(wlst, word, nsug);
|
|
}
|
|
return nsug;
|
|
}
|
|
|
|
|
|
|
|
// suggestions for when chose the wrong char out of a related set
|
|
int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
|
|
{
|
|
int wl = strlen(word);
|
|
if (wl < 2 || ! pAMgr) return ns;
|
|
|
|
int nummap = pAMgr->get_nummap();
|
|
struct mapentry* maptable = pAMgr->get_maptable();
|
|
if (maptable==NULL) return ns;
|
|
ns = map_related(word, 0, wlst, ns, maptable, nummap);
|
|
return ns;
|
|
}
|
|
|
|
|
|
int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap)
|
|
{
|
|
char c = *(word + i);
|
|
if (c == 0) {
|
|
int cwrd = 1;
|
|
for (int m=0; m < ns; m++)
|
|
if (strcmp(word,wlst[m]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(word,strlen(word))) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(word);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
int in_map = 0;
|
|
for (int j = 0; j < nummap; j++) {
|
|
if (strchr(maptable[j].set,c) != 0) {
|
|
in_map = 1;
|
|
char * newword = strdup(word);
|
|
for (int k = 0; k < maptable[j].len; k++) {
|
|
*(newword + i) = *(maptable[j].set + k);
|
|
ns = map_related(newword, (i+1), wlst, ns, maptable, nummap);
|
|
}
|
|
free(newword);
|
|
}
|
|
}
|
|
if (!in_map) {
|
|
i++;
|
|
ns = map_related(word, i, wlst, ns, maptable, nummap);
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
|
|
// suggestions for a typical fault of spelling, that
|
|
// differs with more, than 1 letter from the right form.
|
|
int SuggestMgr::replchars(char** wlst, const char * word, int ns)
|
|
{
|
|
char candidate[MAXSWL];
|
|
const char * r;
|
|
int lenr, lenp;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
if (wl < 2 || ! pAMgr) return ns;
|
|
|
|
int numrep = pAMgr->get_numrep();
|
|
struct replentry* reptable = pAMgr->get_reptable();
|
|
if (reptable==NULL) return ns;
|
|
|
|
for (int i=0; i < numrep; i++ ) {
|
|
r = word;
|
|
lenr = strlen(reptable[i].replacement);
|
|
lenp = strlen(reptable[i].pattern);
|
|
// search every occurence of the pattern in the word
|
|
while ((r=strstr(r, reptable[i].pattern)) != NULL) {
|
|
strcpy(candidate, word);
|
|
if (r-word + lenr + strlen(r+lenp) >= MAXSWL) break;
|
|
strcpy(candidate+(r-word),reptable[i].replacement);
|
|
strcpy(candidate+(r-word)+lenr, r+lenp);
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,strlen(candidate))) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
r++; // search for the next letter
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// error is wrong char in place of correct one
|
|
int SuggestMgr::badchar(char ** wlst, const char * word, int ns)
|
|
{
|
|
char tmpc;
|
|
char candidate[MAXSWL];
|
|
|
|
int wl = strlen(word);
|
|
int cwrd;
|
|
strcpy (candidate, word);
|
|
|
|
// swap out each char one by one and try all the tryme
|
|
// chars in its place to see if that makes a good word
|
|
for (int i=0; i < wl; i++) {
|
|
tmpc = candidate[i];
|
|
for (int j=0; j < ctryl; j++) {
|
|
if (ctry[j] == tmpc) continue;
|
|
candidate[i] = ctry[j];
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
candidate[i] = tmpc;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// error is word has an extra letter it does not need
|
|
int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
|
|
{
|
|
char candidate[MAXSWL];
|
|
const char * p;
|
|
char * r;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
if (wl < 2) return ns;
|
|
|
|
// try omitting one char of word at a time
|
|
strcpy (candidate, word + 1);
|
|
for (p = word, r = candidate; *p != 0; ) {
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl-1)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
*r++ = *p++;
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// error is mising a letter it needs
|
|
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
|
|
{
|
|
char candidate[MAXSWL];
|
|
const char * p;
|
|
char * q;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
|
|
// try inserting a tryme character before every letter
|
|
strcpy(candidate + 1, word);
|
|
for (p = word, q = candidate; *p != 0; ) {
|
|
for (int i = 0; i < ctryl; i++) {
|
|
*q = ctry[i];
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl+1)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
}
|
|
*q++ = *p++;
|
|
}
|
|
|
|
// now try adding one to end */
|
|
for (int i = 0; i < ctryl; i++) {
|
|
*q = ctry[i];
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl+1)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
/* error is should have been two words */
|
|
int SuggestMgr::twowords(char ** wlst, const char * word, int ns)
|
|
{
|
|
char candidate[MAXSWL];
|
|
char * p;
|
|
|
|
int wl=strlen(word);
|
|
if (wl < 3) return ns;
|
|
strcpy(candidate + 1, word);
|
|
|
|
// split the string into two pieces after every char
|
|
// if both pieces are good words make them a suggestion
|
|
for (p = candidate + 1; p[1] != '\0'; p++) {
|
|
p[-1] = *p;
|
|
*p = '\0';
|
|
if (check(candidate,strlen(candidate))) {
|
|
if (check((p+1),strlen(p+1))) {
|
|
*p = ' ';
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// error is adjacent letter were swapped
|
|
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns)
|
|
{
|
|
char candidate[MAXSWL];
|
|
char * p;
|
|
char tmpc;
|
|
int cwrd;
|
|
|
|
int wl = strlen(word);
|
|
|
|
// try swapping adjacent chars one by one
|
|
strcpy(candidate, word);
|
|
for (p = candidate; p[1] != 0; p++) {
|
|
tmpc = *p;
|
|
*p = p[1];
|
|
p[1] = tmpc;
|
|
cwrd = 1;
|
|
for (int k=0; k < ns; k++)
|
|
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
|
|
if ((cwrd) && check(candidate,wl)) {
|
|
if (ns < maxSug) {
|
|
wlst[ns] = mystrdup(candidate);
|
|
if (wlst[ns] == NULL) return -1;
|
|
ns++;
|
|
} else return ns;
|
|
}
|
|
tmpc = *p;
|
|
*p = p[1];
|
|
p[1] = tmpc;
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
// generate a set of suggestions for very poorly spelled words
|
|
int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
|
|
{
|
|
|
|
int i, j;
|
|
int lval;
|
|
int sc;
|
|
int lp;
|
|
|
|
if (! pHMgr) return 0;
|
|
|
|
// exhaustively search through all root words
|
|
// keeping track of the MAX_ROOTS most similar root words
|
|
struct hentry * roots[MAX_ROOTS];
|
|
int scores[MAX_ROOTS];
|
|
for (i = 0; i < MAX_ROOTS; i++) {
|
|
roots[i] = NULL;
|
|
scores[i] = -100 * i;
|
|
}
|
|
lp = MAX_ROOTS - 1;
|
|
|
|
int n = strlen(word);
|
|
|
|
struct hentry* hp = NULL;
|
|
int col = -1;
|
|
while ((hp = pHMgr->walk_hashtable(col, hp))) {
|
|
sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
|
|
if (sc > scores[lp]) {
|
|
scores[lp] = sc;
|
|
roots[lp] = hp;
|
|
int lval = sc;
|
|
for (j=0; j < MAX_ROOTS; j++)
|
|
if (scores[j] < lval) {
|
|
lp = j;
|
|
lval = scores[j];
|
|
}
|
|
}
|
|
}
|
|
|
|
// find minimum threshhold for a passable suggestion
|
|
// mangle original word three differnt ways
|
|
// and score them to generate a minimum acceptable score
|
|
int thresh = 0;
|
|
char * mw = NULL;
|
|
for (int sp = 1; sp < 4; sp++) {
|
|
mw = strdup(word);
|
|
for (int k=sp; k < n; k+=4) *(mw + k) = '*';
|
|
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
|
|
free(mw);
|
|
}
|
|
mw = NULL;
|
|
thresh = thresh / 3;
|
|
thresh--;
|
|
|
|
// now expand affixes on each of these root words and
|
|
// and use length adjusted ngram scores to select
|
|
// possible suggestions
|
|
char * guess[MAX_GUESS];
|
|
int gscore[MAX_GUESS];
|
|
for(i=0;i<MAX_GUESS;i++) {
|
|
guess[i] = NULL;
|
|
gscore[i] = -100 * i;
|
|
}
|
|
|
|
lp = MAX_GUESS - 1;
|
|
|
|
struct guessword * glst;
|
|
glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
|
|
if (! glst) return 0;
|
|
|
|
for (i = 0; i < MAX_ROOTS; i++) {
|
|
|
|
if (roots[i]) {
|
|
struct hentry * rp = roots[i];
|
|
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
|
|
rp->astr, rp->alen);
|
|
for (int k = 0; k < nw; k++) {
|
|
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
|
|
if (sc > thresh) {
|
|
if (sc > gscore[lp]) {
|
|
if (guess[lp]) free (guess[lp]);
|
|
gscore[lp] = sc;
|
|
guess[lp] = glst[k].word;
|
|
lval = sc;
|
|
for (j=0; j < MAX_GUESS; j++)
|
|
if (gscore[j] < lval) {
|
|
lp = j;
|
|
lval = gscore[j];
|
|
}
|
|
} else {
|
|
free (glst[k].word);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (glst) free(glst);
|
|
|
|
// now we are done generating guesses
|
|
// sort in order of decreasing score and copy over
|
|
|
|
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
|
|
int ns = 0;
|
|
for (i=0; i < MAX_GUESS; i++) {
|
|
if (guess[i]) {
|
|
int unique = 1;
|
|
for (j=i+1; j < MAX_GUESS; j++)
|
|
if (guess[j])
|
|
if (!strcmp(guess[i], guess[j])) unique = 0;
|
|
if (unique) {
|
|
wlst[ns++] = guess[i];
|
|
} else {
|
|
free(guess[i]);
|
|
}
|
|
}
|
|
}
|
|
return ns;
|
|
}
|
|
|
|
|
|
|
|
|
|
// see if a candidate suggestion is spelled correctly
|
|
// needs to check both root words and words with affixes
|
|
int SuggestMgr::check(const char * word, int len)
|
|
{
|
|
struct hentry * rv=NULL;
|
|
if (pAMgr) {
|
|
rv = pAMgr->lookup(word);
|
|
if (rv == NULL) rv = pAMgr->affix_check(word,len);
|
|
}
|
|
if (rv) return 1;
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
// generate an n-gram score comparing s1 and s2
|
|
int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
|
|
{
|
|
int nscore = 0;
|
|
int l1 = strlen(s1);
|
|
int l2 = strlen(s2);
|
|
int ns;
|
|
for (int j=1;j<=n;j++) {
|
|
ns = 0;
|
|
for (int i=0;i<=(l1-j);i++) {
|
|
char c = *(s1 + i + j);
|
|
*(s1 + i + j) = '\0';
|
|
if (strstr(s2,(s1+i))) ns++;
|
|
*(s1 + i + j ) = c;
|
|
}
|
|
nscore = nscore + ns;
|
|
if (ns < 2) break;
|
|
}
|
|
ns = 0;
|
|
if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
|
|
if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
|
|
return (nscore - ((ns > 0) ? ns : 0));
|
|
}
|
|
|
|
|
|
// sort in decreasing order of score
|
|
void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
|
|
{
|
|
int m = 1;
|
|
while (m < n) {
|
|
int j = m;
|
|
while (j > 0) {
|
|
if (rsc[j-1] < rsc[j]) {
|
|
int sctmp = rsc[j-1];
|
|
char * wdtmp = rword[j-1];
|
|
rsc[j-1] = rsc[j];
|
|
rword[j-1] = rword[j];
|
|
rsc[j] = sctmp;
|
|
rword[j] = wdtmp;
|
|
j--;
|
|
} else break;
|
|
}
|
|
m++;
|
|
}
|
|
return;
|
|
}
|
|
|