#include "license.hun"
#include "license.mys"

// NOTE(review): the four header names below were missing from this copy of
// the file (the text between `<` and `>` was stripped). They are restored to
// standard headers that the code in this file needs (malloc/free, str*/mem*);
// confirm against the upstream file.
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <cctype>

#include "suggmgr.hxx"

#if !defined(_MSC_VER)
using namespace std;
#endif

// Create a suggestion manager.
//   tryme - characters to try when building candidate suggestions (may be NULL)
//   maxn  - maximum number of suggestions kept in a suggestion list
//   aptr  - affix manager used for all dictionary checks (may be NULL)
SuggestMgr::SuggestMgr(const char * tryme, int maxn, AffixMgr * aptr)
{
  // register affix manager and check in string of chars to
  // try when building candidate suggestions
  pAMgr = aptr;

  ctryl = 0;
  ctry = NULL;
  ctry_utf = NULL;

  maxSug = maxn;
  nosplitsugs = 0;
  maxngramsugs = MAXNGRAMSUGS;

  utf8 = 0;
  utfconv = NULL;
  csconv = NULL;  // was left uninitialised when no affix manager is given
  complexprefixes = 0;

  if (pAMgr) {
    char * enc = pAMgr->get_encoding();
    csconv = get_current_cs(enc);
    free(enc);
    nosplitsugs = pAMgr->get_nosplitsugs();
    if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
    utf8 = pAMgr->get_utf8();
    utfconv = pAMgr->get_utf_conv();
    complexprefixes = pAMgr->get_complexprefixes();
  }

  if (tryme) {
    if (utf8) {
      w_char t[MAXSWL];
      int tl = u8_u16(t, MAXSWL, tryme);
      // only keep the try characters when the conversion produced some and
      // the copy could be allocated; otherwise ctryl stays 0
      if (tl > 0) {
        ctry_utf = (w_char *) malloc(tl * sizeof(w_char));
        if (ctry_utf) {
          memcpy(ctry_utf, t, tl * sizeof(w_char));
          ctryl = tl;
        }
      }
    } else {
      ctry = mystrdup(tryme);
      if (ctry) ctryl = strlen(ctry);
    }
  }
}

// Release the try-character buffers; the affix manager is not owned here.
SuggestMgr::~SuggestMgr()
{
  pAMgr = NULL;
  if (ctry) free(ctry);
  ctry = NULL;
  if (ctry_utf) free(ctry_utf);
  ctry_utf = NULL;
  ctryl = 0;
  maxSug = 0;
}

// generate suggestions for a mispelled word
//    pass in address of array of char * pointers
// If *slst is NULL a list of maxSug entries is allocated. Returns the new
// number of suggestions, or -1 when out of memory (the list is then freed and
// *slst set to NULL).
// NOTE(review): w is copied into a MAXWORDUTF8LEN buffer with strcpy when
// complex prefixes are enabled; callers are assumed to pass shorter words.
int SuggestMgr::suggest(char*** slst, const char * w, int nsug)
{
  int nocompoundtwowords = 0;
  char ** wlst;
  w_char word_utf[MAXSWL];
  int wl = 0;

  char w2[MAXWORDUTF8LEN];
  const char * word = w;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    strcpy(w2, w);
    if (utf8) reverseword_utf(w2); else reverseword(w2);
    word = w2;
  }

  if (*slst) {
    wlst = *slst;
  } else {
    wlst = (char **) malloc(maxSug * sizeof(char *));
    if (wlst == NULL) return -1;
    for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
  }

  if (utf8) {
    wl = u8_u16(word_utf, MAXSWL, word);
  }

  // first pass: simple words; second pass (cpdsuggest == 1): compound words,
  // only when the first pass found nothing
  for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0); cpdsuggest++) {

    // perhaps we made a typical fault of spelling
    if ((nsug < maxSug) && (nsug > -1))
      nsug = replchars(wlst, word, nsug, cpdsuggest);

    // perhaps we made chose the wrong char from a related set
    if ((nsug < maxSug) && (nsug > -1))
      nsug = mapchars(wlst, word, nsug, cpdsuggest);

    // did we swap the order of chars by mistake
    if ((nsug < maxSug) && (nsug > -1)) {
      nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
                      swapchar(wlst, word, nsug, cpdsuggest);
    }

    // did we forget to add a char
    if ((nsug < maxSug) && (nsug > -1)) {
      nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
                      forgotchar(wlst, word, nsug, cpdsuggest);
    }

    // did we add a char that should not be there
    if ((nsug < maxSug) && (nsug > -1)) {
      nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
                      extrachar(wlst, word, nsug, cpdsuggest);
    }

    // did we just hit the wrong key in place of a good char
    if ((nsug < maxSug) && (nsug > -1)) {
      nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
                      badchar(wlst, word, nsug, cpdsuggest);
    }

    // only suggest compound words when no other suggestion
    if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords = 1;

    // perhaps we forgot to hit space and two words ran together
    if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
      nsug = twowords(wlst, word, nsug, cpdsuggest);
    }

  } // repeating ``for'' statement compounding support

  if (nsug < 0) {
    // we ran out of memory - we should free up as much as possible
    for (int i = 0; i < maxSug; i++)
      if (wlst[i] != NULL) free(wlst[i]);
    free(wlst);
    wlst = NULL;
  }

  *slst = wlst;
  return nsug;
}

// generate suggestions for a word with typical mistake
//    pass in address of array of char * pointers
int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
{
  int nocompoundtwowords = 0;
  char ** wlst;

  char w2[MAXWORDUTF8LEN];
  const char * word = w;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    strcpy(w2, w);
    if (utf8) reverseword_utf(w2); else reverseword(w2);
    word = w2;
  }

  if (*slst) {
    wlst = *slst;
  } else {
    wlst = (char **) malloc(maxSug * sizeof(char *));
    if (wlst == NULL) return -1;
    // start from an all-NULL list (as suggest() does) so that error cleanup
    // never frees uninitialised pointers
    for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
  }

  for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0); cpdsuggest++) {

    // perhaps we made a typical fault of spelling
    if ((nsug < maxSug) && (nsug > -1))
      nsug = replchars(wlst, word, nsug, cpdsuggest);

    // perhaps we made chose the wrong char from a related set
    if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))
      nsug = mapchars(wlst, word, nsug, cpdsuggest);

    if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords = 1;

    // perhaps we forgot to hit space and two words ran together
    if ((nsug < maxSug) && (nsug > -1) && check_forbidden(word, strlen(word))) {
      nsug = twowords(wlst, word, nsug, cpdsuggest);
    }

  } // repeating ``for'' statement compounding support

  if (nsug < 0) {
    // NOTE(review): the source text is damaged here. Everything between the
    // `<` of this loop condition and the `>` of `pAMgr->get_nummap()` is
    // missing: the rest of suggest_auto and the head of the following
    // definition (presumably mapchars). The damaged text is kept verbatim;
    // restore it from the upstream file.
    for (int i=0;iget_nummap();
  struct mapentry* maptable = pAMgr->get_maptable();
  if (maptable == NULL) return ns;

  timelimit = time(NULL);
  timer = MINTIMER;
  if (utf8) {
    w_char w[MAXSWL];
    int len = u8_u16(w, MAXSWL, word);
    ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
  } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
  return ns;
}

// Recursively replace every character of word (from position i on) that
// occurs in a map set with each member of that set; every complete variant
// that passes check() and is not yet listed is appended to wlst.
// Returns the new suggestion count, or -1 when out of memory. The search is
// abandoned as soon as check() clears *timelimit.
int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,
    const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
{
  char c = *(word + i);
  if (c == 0) {
    // end of word reached: test the finished variant
    int cwrd = 1;
    int wl;
    for (int m = 0; m < ns; m++)
      if (strcmp(word, wlst[m]) == 0) cwrd = 0;
    if ((cwrd) && (wl = strlen(word)) && (check(word, wl, 0, timer, timelimit) ||
        check(word, wl, 1, timer, timelimit))) {
      if (ns < maxSug) {
        wlst[ns] = mystrdup(word);
        if (wlst[ns] == NULL) return -1;
        ns++;
      }
    }
    return ns;
  }
  int in_map = 0;
  for (int j = 0; j < nummap; j++) {
    if (strchr(maptable[j].set, c) != 0) {
      in_map = 1;
      char * newword = mystrdup(word);
      if (newword == NULL) return -1;
      for (int k = 0; k < maptable[j].len; k++) {
        *(newword + i) = *(maptable[j].set + k);
        ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);
        // stop on out-of-memory or timeout; release the working copy first
        // (it was leaked on the timeout path, and a -1 count was used as an
        // index by the next recursion)
        if ((ns < 0) || !(*timelimit)) {
          free(newword);
          return ns;
        }
      }
      free(newword);
    }
  }
  if (!in_map) {
    i++;
    ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);
  }
  return ns;
}

// UTF-16 variant of map_related(): word (len characters) is modified in
// place while the variants are generated and restored afterwards.
int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,
    const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
{
  if (i == len) {
    int cwrd = 1;
    int wl;
    char s[MAXSWUTF8L];
    u16_u8(s, MAXSWUTF8L, word, len);
    for (int m = 0; m < ns; m++)
      if (strcmp(s, wlst[m]) == 0) cwrd = 0;
    if ((cwrd) && (wl = strlen(s)) && (check(s, wl, 0, timer, timelimit) ||
        check(s, wl, 1, timer, timelimit))) {
      if (ns < maxSug) {
        wlst[ns] = mystrdup(s);
        if (wlst[ns] == NULL) return -1;
        ns++;
      }
    }
    return ns;
  }
  int in_map = 0;
  unsigned short c = *((unsigned short *) word + i);
  for (int j = 0; j < nummap; j++) {
    if (flag_bsearch((unsigned short *) maptable[j].set_utf16, c, maptable[j].len)) {
      in_map = 1;
      for (int k = 0; k < maptable[j].len; k++) {
        *(word + i) = *(maptable[j].set_utf16 + k);
        ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);
        // stop on out-of-memory (a -1 count must not be used as an index by
        // the next recursion) or on timeout
        if ((ns < 0) || !(*timelimit)) return ns;
      }
      *((unsigned short *) word + i) = c;
    }
  }
  if (!in_map) {
    i++;
    ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);
  }
  return ns;
}

// suggestions for a typical fault of spelling, that
// differs with more, than 1 letter from the right form.
// Each entry of the affix manager's replacement table is applied at every
// position where its pattern occurs in word.
int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
{
  char candidate[MAXSWUTF8L];
  const char * r;
  int lenr, lenp;
  int cwrd;

  int wl = strlen(word);
  if (wl < 2 || ! pAMgr) return ns;

  int numrep = pAMgr->get_numrep();
  struct replentry* reptable = pAMgr->get_reptable();
  if (reptable == NULL) return ns;

  for (int i = 0; i < numrep; i++ ) {
    r = word;
    lenr = strlen(reptable[i].pattern2);
    lenp = strlen(reptable[i].pattern);
    // search every occurence of the pattern in the word
    while ((r = strstr(r, reptable[i].pattern)) != NULL) {
      strcpy(candidate, word);
      // skip replacements that would not fit into the candidate buffer
      if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
      strcpy(candidate+(r-word), reptable[i].pattern2);
      strcpy(candidate+(r-word)+lenr, r+lenp);
      cwrd = 1;
      for (int k = 0; k < ns; k++)
        if (strcmp(candidate, wlst[k]) == 0) cwrd = 0;
      if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
        if (ns < maxSug) {
          wlst[ns] = mystrdup(candidate);
          if (wlst[ns] == NULL) {
            // NOTE(review): the source text is damaged here. Everything
            // between the `<` of this loop condition and a later `>` is
            // missing: the rest of replchars and the start of the comment
            // that introduces doubledsyllable. The damaged text is kept
            // verbatim; restore it from the upstream file.
            for (int j=0; j vacacation (doubled `ac')
int SuggestMgr::doubledsyllable(char** wlst, const char * word, int ns, int cpdsuggest)
{
  char candidate[MAXSWUTF8L];
  int state = 0;
  int cwrd;

  int wl = strlen(word);
  if (wl < 5 || ! pAMgr) return ns;

  for (int i = 2; i < wl; i++ ) {
    if (word[i] == word[i-2]) {
      state++;
      if (state == 3) {
        // three consecutive matches at distance two: drop one two-character
        // repetition and test the shortened word
        strcpy(candidate, word);
        strcpy(candidate+i-1, word+i+1);
        cwrd = 1;
        for (int k = 0; k < ns; k++)
          if (strcmp(candidate, wlst[k]) == 0) cwrd = 0;
        if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
          if (ns < maxSug) {
            wlst[ns] = mystrdup(candidate);
            if (wlst[ns] == NULL) {
              // NOTE(review): the source text is damaged here. Everything
              // between the `<` of this loop condition and the `>` of
              // `pAMgr->get_langnum()` is missing: the rest of
              // doubledsyllable, any definitions that followed it, and the
              // head of twowords (whose body continues below). suggest()
              // calls badchar*, extrachar* and forgotchar*, none of which is
              // defined in the visible text. The damaged text is kept
              // verbatim; restore it from the upstream file.
              for (int j=0; jget_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);

  strcpy(candidate + 1, word);

  // split the string into two pieces after every char
  // if both pieces are good words make them a suggestion
  for (p = candidate + 1; p[1] != '\0'; p++) {
    p[-1] = *p;
    // go to end of the UTF-8 character
    while (utf8 && ((p[1] & 0xc0) == 0x80)) {
      p++;
      p[-1] = *p;
    }
    // the word ended inside the skip above: the second piece would be empty,
    // and the loop condition would read past the terminator on the next round
    if (p[1] == '\0') break;
    *p = '\0';
    if ((c1 = check(candidate, strlen(candidate), cpdsuggest, NULL, NULL))) {
      if ((c2 = check((p+1), strlen(p+1), cpdsuggest, NULL, NULL))) {
        *p = ' ';

        // spec. Hungarian code (need a better compound word support)
        if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&
            // if 3 repeating letter, use - instead of space
            (((p[-1] == p[1]) && (((p > candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
            // or multiple compounding, with more, than 6 syllables
            ((c1 == 3) && (c2 >= 2)))) *p = '-';

        cwrd = 1;
        for (int k = 0; k < ns; k++)
          if (strcmp(candidate, wlst[k]) == 0) cwrd = 0;
        if (ns < maxSug) {
          if (cwrd) {
            wlst[ns] = mystrdup(candidate);
            if (wlst[ns] == NULL) return -1;
            ns++;
          }
        } else return ns;
      }
    }
  }
  return ns;
}

// error is adjacent letter were swapped
// Returns the new suggestion count, or -1 when out of memory.
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
  char candidate[MAXSWUTF8L];
  char * p;
  char tmpc;
  int cwrd;

  int wl = strlen(word);

  // try swapping adjacent chars one by one
  strcpy(candidate, word);
  for (p = candidate; p[1] != 0; p++) {
    tmpc = *p;
    *p = p[1];
    p[1] = tmpc;
    cwrd = 1;
    for (int k = 0; k < ns; k++)
      if (strcmp(candidate, wlst[k]) == 0) cwrd = 0;
    if ((cwrd) && check(candidate, wl, cpdsuggest, NULL, NULL)) {
      if (ns < maxSug) {
        wlst[ns] = mystrdup(candidate);
        if (wlst[ns] == NULL) return -1;
        ns++;
      } else return ns;
    }
    // undo the swap before moving on to the next pair
    tmpc = *p;
    *p = p[1];
    p[1] = tmpc;
  }
  return ns;
}

// error is adjacent letter were swapped
// UTF-16 variant: word holds wl characters; candidates are converted back to
// UTF-8 before they are compared and checked.
int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
{
  w_char candidate_utf[MAXSWL];
  char candidate[MAXSWUTF8L];
  w_char * p;
  w_char tmpc;
  int cwrd;

  // try swapping adjacent chars one by one
  memcpy (candidate_utf, word, wl * sizeof(w_char));
  for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
    tmpc = *p;
    *p = p[1];
    p[1] = tmpc;
    cwrd = 1;
    u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
    for (int k = 0; k < ns; k++)
      if (strcmp(candidate, wlst[k]) == 0) cwrd = 0;
    if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
      if (ns < maxSug) {
        wlst[ns] = mystrdup(candidate);
        if (wlst[ns] == NULL) return -1;
        ns++;
      } else return ns;
    }
    // undo the swap before moving on to the next pair
    tmpc = *p;
    *p = p[1];
    p[1] = tmpc;
  }
  return ns;
}

// generate a set of suggestions for very poorly spelled words int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) { int i, j; int lval; int sc; int lp; if (! pHMgr) return 0; // exhaustively search through all root words // keeping track of the MAX_ROOTS most similar root words struct hentry * roots[MAX_ROOTS]; int scores[MAX_ROOTS]; for (i = 0; i < MAX_ROOTS; i++) { roots[i] = NULL; scores[i] = -100 * i; } lp = MAX_ROOTS - 1; char w2[MAXWORDUTF8LEN]; char * word = w; // word reversing wrapper for complex prefixes if (complexprefixes) { strcpy(w2, w); if (utf8) reverseword_utf(w2); else reverseword(w2); word = w2; } char mw[MAXSWUTF8L]; w_char u8[MAXSWL]; int nc = strlen(word); int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc; struct hentry* hp = NULL; int col = -1; while ((hp = pHMgr->walk_hashtable(col, hp))) { // check forbidden words if ((hp->astr) && (pAMgr) && (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) || TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) || TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue; sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE); if (sc > scores[lp]) { scores[lp] = sc; roots[lp] = hp; int lval = sc; for (j=0; j < MAX_ROOTS; j++) if (scores[j] < lval) { lp = j; lval = scores[j]; } } } // find minimum threshhold for a passable suggestion // mangle original word three differnt ways // and score them to generate a minimum acceptable score int thresh = 0; for (int sp = 1; sp < 4; sp++) { if (utf8) { for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; u16_u8(mw, MAXSWUTF8L, u8, n); thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); } else { strcpy(mw, word); for (int k=sp; k < n; k+=4) *(mw + k) = '*'; thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); } } thresh = thresh / 3; thresh--; // now expand affixes on each of these root words and // and use length adjusted ngram scores to select // possible suggestions char * guess[MAX_GUESS]; int gscore[MAX_GUESS]; 
for(i=0;iexpand_rootword(glst, MAX_WORDS, rp->word, rp->wlen, rp->astr, rp->alen, word, nc); for (int k = 0; k < nw ; k++) { sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH); if ((sc > thresh)) { if (sc > gscore[lp]) { if (guess[lp]) free (guess[lp]); gscore[lp] = sc; guess[lp] = glst[k].word; lval = sc; for (j=0; j < MAX_GUESS; j++) if (gscore[j] < lval) { lp = j; lval = gscore[j]; } } else free (glst[k].word); } else free(glst[k].word); } } } free(glst); // now we are done generating guesses // sort in order of decreasing score bubblesort(&guess[0], &gscore[0], MAX_GUESS); // weight suggestions with a similarity index, based on // the longest common subsequent algorithm and resort int is_swap; for (i=0; i < MAX_GUESS; i++) { if (guess[i]) { // lowering guess[i] char gl[MAXSWUTF8L]; int len; if (utf8) { w_char w[MAXSWL]; len = u8_u16(w, MAXSWL, guess[i]); mkallsmall_utf(w, len, utfconv); u16_u8(gl, MAXSWUTF8L, w, len); } else { strcpy(gl, guess[i]); mkallsmall(gl, csconv); len = strlen(guess[i]); } int lcs = lcslen(word, gl); // same characters with different casing if ((n == len) && (n == lcs)) { gscore[i] += 2000; break; } // heuristic weigthing of ngram scores gscore[i] += // length of longest common subsequent minus lenght difference 2 * lcs - abs((int) (n - len)) + // weight equal first letter equalfirstletter(word, gl) + // weight equal character positions ((lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) + // swap character (not neighboring) ((is_swap) ? 
1000 : 0); } } bubblesort(&guess[0], &gscore[0], MAX_GUESS); // copy over int ns = 0; int same = 0; for (i=0; i < MAX_GUESS; i++) { if (guess[i]) { if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { int unique = 1; // we have excellent suggestion(s) if (gscore[i] > 1000) same = 1; for (j=0; j < ns; j++) // don't suggest previous suggestions or a previous suggestion with prefixes or affixes if (strstr(guess[i], wlst[j]) || // check forbidden words !check(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; if (unique) wlst[ns++] = guess[i]; else free(guess[i]); } else free(guess[i]); } } return ns; } // see if a candidate suggestion is spelled correctly // needs to check both root words and words with affixes // obsolote MySpell-HU modifications: // return value 2 and 3 marks compounding with hyphen (-) // `3' marks roots without suffix int SuggestMgr::check(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit) { struct hentry * rv=NULL; int nosuffix = 0; // check time limit if (timer) { (*timer)--; if (!(*timer) && timelimit) { if (time(NULL) > *timelimit) { *timelimit = 0; return 0; } *timer = MAXPLUSTIMER; } } if (pAMgr) { if (cpdsuggest==1) { if (pAMgr->get_compound()) { rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1); if (rv) return 3; // XXX obsolote categorisation } return 0; } rv = pAMgr->lookup(word); if (rv) { if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX if (rv) { nosuffix=1; } else { rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix } if (!rv && pAMgr->have_contclass()) { rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL); if (!rv) rv = 
pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL); } // check forbidden words if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; if (rv) { // XXX obsolote if ((pAMgr->get_compoundflag()) && TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix; return 1; } } return 0; } int SuggestMgr::check_forbidden(const char * word, int len) { struct hentry * rv = NULL; if (pAMgr) { rv = pAMgr->lookup(word); if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; if (!(pAMgr->prefix_check(word,len,1))) rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix // check forbidden words if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1; } return 0; } // suggest stems, XXX experimental code int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug) { char buf[MAXSWUTF8L]; char ** wlst; int prevnsug = nsug; char w2[MAXWORDUTF8LEN]; const char * word = w; // word reversing wrapper for complex prefixes if (complexprefixes) { strcpy(w2, w); if (utf8) reverseword_utf(w2); else reverseword(w2); word = w2; } if (*slst) { wlst = *slst; } else { wlst = (char **) calloc(maxSug, sizeof(char *)); if (wlst == NULL) return -1; } // perhaps there are a fix stem in the dictionary if ((nsug < maxSug) && (nsug > -1)) { nsug = fixstems(wlst, word, nsug); if (nsug == prevnsug) { char * s = mystrdup(word); char * p = s + strlen(s); while ((*p != '-') && (p != s)) p--; if (*p == '-') { *p = '\0'; nsug = fixstems(wlst, s, nsug); if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) { char * t; buf[0] = '\0'; for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number? 
if (*t != '\0') strcpy(buf, "# "); strcat(buf, s); wlst[nsug] = mystrdup(buf); if (wlst[nsug] == NULL) return -1; nsug++; } p++; nsug = fixstems(wlst, p, nsug); } free(s); } } if (nsug < 0) { for (int i=0;ilookup(word); if (rv) { dicstem = 0; } else { // try stripping off affixes rv = pAMgr->affix_check(word, wl); // else try check compound word if (!rv && pAMgr->get_compound()) { rv = pAMgr->compound_check(word, wl, 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1); if (rv) { dicstem = 2; for (int j = 0; j < cmpdstemnum; j++) { cpdindex += cmpdstem[j]; } if(! (pAMgr->lookup(word + cpdindex))) pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix } } if (pAMgr->get_prefix()) { strcpy(prefix, pAMgr->get_prefix()); } // XXX obsolote, will be a general solution for stemming if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU) } } if ((rv) && (ns < maxSug)) { // check fixstem flag and not_valid_stem flag // first word if ((ns < maxSug) && (dicstem < 2)) { strcpy(buf, prefix); if ((dicstem > 0) && pAMgr->get_derived()) { // XXX obsolote if (strlen(prefix) == 1) { strcat(buf, (pAMgr->get_derived()) + 1); } else { strcat(buf, pAMgr->get_derived()); } } else { // special stem in affix description const char * wordchars = pAMgr->get_wordchars(); if (rv->description && (strchr(wordchars, *(rv->description)))) { char * desc = (rv->description) + 1; while (strchr(wordchars, *desc)) desc++; strncat(buf, rv->description, desc - (rv->description)); } else { strcat(buf, rv->word); } } wlst[ns] = mystrdup(buf); if (wlst[ns] == NULL) return -1; ns++; } if (dicstem == 2) { // compound stem // if (rv->astr && (strchr(rv->astr, '0') == NULL)) { if (rv->astr) { strcpy(buf, word); buf[cpdindex] = '\0'; if (prefix) strcat(buf, prefix); if (pAMgr->get_derived()) { strcat(buf, pAMgr->get_derived()); } else { // special stem in affix description const char * wordchars = pAMgr->get_wordchars(); if (rv->description && (strchr(wordchars, *(rv->description)))) { 
char * desc = (rv->description) + 1; while (strchr(wordchars, *desc)) desc++; strncat(buf, rv->description, desc - (rv->description)); } else { strcat(buf, rv->word); } } if (ns < maxSug) { wlst[ns] = mystrdup(buf); if (wlst[ns] == NULL) return -1; ns++; } } } } while (rv) { if (0) { // obsolote if ((p[1] > '0') && (p[1] <= '9')) { if ((ns < maxSug) && (dicstem != 2)) { int split = p[1] - '0'; if (rv->wlen <= split) break; strcpy(fix, rv->word); // checking verbs ending with `ik' fix[rv->wlen - split] = 'i'; fix[rv->wlen - split + 1] = 'k'; fix[rv->wlen - split + 2] = '\0'; if (! (rv2 = pAMgr->lookup(fix))) { fix[strlen(fix) - 2] = '\0'; rv2 = pAMgr->lookup(fix); if ((!rv2)) { *fix = csconv[((unsigned char) *fix)].cupper; rv2 = pAMgr->lookup(fix); if (! rv2) return ns; } } if (0) { strcpy(buf, prefix); strcat(buf, fix); wlst[ns] = mystrdup(buf); if (wlst[ns] == NULL) return -1; ns++; } rv = rv2; } else return ns; } else { strcpy(fix, "__"); strcat(fix, rv->word); rv = NULL; rv2 = pAMgr->lookup(fix); if ((rv2) && (rv2->astr) && (ns < maxSug)) if ((rv2) && (rv2->astr) && (ns < maxSug)) if (0) { char buf2[MAXSWUTF8L]; strcpy(buf2, prefix); if (*(rv2->astr) == '-') { strcat(buf2, ""); } else { strcat(buf2, ""); } if (dicstem != 2) { wlst[ns] = mystrdup(buf2); if (wlst[ns] == NULL) return -1; ns++; } if ((dicstem == 2) && (ns < maxSug)) { strcpy(buf, word); buf[cpdindex] = '\0'; strcat(buf + cpdindex, buf2); if (pAMgr->get_compound() && (pAMgr->compound_check(buf, strlen(buf), 0,0,100,0,NULL,0,NULL,NULL,1))) { wlst[ns] = mystrdup(buf); if (wlst[ns] == NULL) return -1; ns++; } } // many stems } else { char * str = mystrdup(""); char * pos = str; char * pos2; do { int suggest = 1; pos2 = strchr(pos, '|'); if (pos2) *pos2 = '\0'; // ignore `-xxx' suggestion, when exists prefix if (*pos == '-') { pos++; if (*prefix != '\0') suggest = 0; } // ignore `xxx-' suggestion, when word is not root if ((strlen(pos) > 0) && (pos[strlen(pos)-1] == '-')) { pos[strlen(pos)-1] = '\0'; 
strcpy(buf, prefix); strcat(buf, fix + 2); if ((dicstem != 0) && (strcmp(buf, word) != 0)) suggest = 0; } if ((suggest) && (ns < maxSug) && (strlen(pos) > 0)) { strcpy(buf, prefix); strcat(buf, pos); wlst[ns] = mystrdup(buf); if (wlst[ns] == NULL) return -1; ns++; } if (pos2) pos = pos2 + 1; } while (pos2); free(str); } } } else return ns; } return ns; } // suggest possible stems int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) { char ** wlst; struct hentry * rv = NULL; char w2[MAXSWUTF8L]; const char * word = w; // word reversing wrapper for complex prefixes if (complexprefixes) { strcpy(w2, w); if (utf8) reverseword_utf(w2); else reverseword(w2); word = w2; } int wl = strlen(word); if (*slst) { wlst = *slst; } else { wlst = (char **) calloc(maxSug, sizeof(char *)); if (wlst == NULL) return -1; } rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug); // delete dash from end of word if (nsug > 0) { for (int j=0; j < nsug; j++) { if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0'; } } *slst = wlst; return nsug; } char * SuggestMgr::suggest_morph(const char * w) { char result[MAXLNLEN]; char * r = (char *) result; char * st; struct hentry * rv = NULL; *result = '\0'; if (! 
pAMgr) return NULL; char w2[MAXSWUTF8L]; const char * word = w; // word reversing wrapper for complex prefixes if (complexprefixes) { strcpy(w2, w); if (utf8) reverseword_utf(w2); else reverseword(w2); word = w2; } rv = pAMgr->lookup(word); while (rv) { if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { if (rv->description && ((!rv->astr) || !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen))) strcat(result, word); if (rv->description) strcat(result, rv->description); strcat(result, "\n"); } rv = rv->next_homonym; } st = pAMgr->affix_check_morph(word,strlen(word)); if (st) { strcat(result, st); free(st); } if (pAMgr->get_compound() && (*result == '\0')) pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0,NULL, 0, &r, NULL); return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL; } char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) { char * p = NULL; char ** wlst = (char **) calloc(maxSug, sizeof(char *)); // we will use only the first suggestion for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; int ns = suggest(&wlst, word, maxSug - 1); if (ns == maxSug) { p = suggest_morph(wlst[maxSug - 1]); free(wlst[maxSug - 1]); } if (wlst) free(wlst); return p; } // generate an n-gram score comparing s1 and s2 int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) { int nscore = 0; int ns; int l1; int l2; if (utf8) { w_char su1[MAXSWL]; w_char su2[MAXSWL]; l1 = u8_u16(su1, MAXSWL, s1); l2 = u8_u16(su2, MAXSWL, s2); if (!l2) return 0; // decapitalize dictionary word if (complexprefixes) { mkallsmall_utf(su2+l2-1, 1, utfconv); } else { mkallsmall_utf(su2, 1, utfconv); } for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { for (int l = 0; l <= (l2-j); l++) { int k; for (k = 0; (k < j); k++) { w_char * c1 = su1 + i + k; w_char * c2 = su2 + l + k; if 
((c1->l != c2->l) || (c1->h != c2->h)) break; } if (k == j) { ns++; break; } } } nscore = nscore + ns; if (ns < 2) break; } } else { char t[MAXSWUTF8L]; l1 = strlen(s1); l2 = strlen(s2); if (!l2) return 0; strcpy(t, s2); if (complexprefixes) { *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; } else { mkallsmall(t, csconv); /// *t = csconv[((unsigned char)*t)].clower; } for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { char c = *(s1 + i + j); *(s1 + i + j) = '\0'; if (strstr(t,(s1+i))) ns++; *(s1 + i + j ) = c; } nscore = nscore + ns; if (ns < 2) break; } } ns = 0; if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2; if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; return (nscore - ((ns > 0) ? ns : 0)); } int SuggestMgr::equalfirstletter(char * s1, const char * s2) { if (utf8) { w_char su1[MAXSWL]; w_char su2[MAXSWL]; // decapitalize dictionary word if (complexprefixes) { int l1 = u8_u16(su1, MAXSWL, s1); int l2 = u8_u16(su2, MAXSWL, s2); if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; } else { u8_u16(su1, 1, s1); u8_u16(su2, 1, s2); if (*((short *)su1) == *((short *)su2)) return 1; } } else { if (complexprefixes) { int l1 = strlen(s1); int l2 = strlen(s2); if (*(s2+l1-1) == *(s2+l2-1)) return 1; } else { if (*s1 == *s2) return 1; } } return 0; } int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) { int num = 0; int diff = 0; int diffpos[2]; *is_swap = 0; if (utf8) { w_char su1[MAXSWL]; w_char su2[MAXSWL]; int l1 = u8_u16(su1, MAXSWL, s1); int l2 = u8_u16(su2, MAXSWL, s2); for (int i = 0; (i < l1) && (i < l2); i++) { if (((short *) su1)[i] == ((short *) su2)[i]) { num++; } else { if (diff < 2) diffpos[diff] = i; diff++; } } if ((diff == 2) && (l1 == l2) && (((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) && (((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1; } else { int i; for (i = 0; (*(s1+i) != 0) && (*(s2+i) != 0); i++) { if (*(s1+i) == *(s2+i)) { 
num++; } else { if (diff < 2) diffpos[diff] = i; diff++; } } if ((diff == 2) && (*(s1+i) == 0) && (*(s2+i) == 0) && (*(s1+diffpos[0]) == *(s2+diffpos[1])) && (*(s1+diffpos[1]) == *(s2+diffpos[0]))) *is_swap = 1; } return num; } int SuggestMgr::mystrlen(const char * word) { if (utf8) { w_char w[MAXSWL]; return u8_u16(w, MAXSWL, word); } else return strlen(word); } // sort in decreasing order of score void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) { int m = 1; while (m < n) { int j = m; while (j > 0) { if (rsc[j-1] < rsc[j]) { int sctmp = rsc[j-1]; char * wdtmp = rword[j-1]; rsc[j-1] = rsc[j]; rword[j-1] = rword[j]; rsc[j] = sctmp; rword[j] = wdtmp; j--; } else break; } m++; } return; } // longest common subsequence void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) { int n, m; w_char su[MAXSWL]; w_char su2[MAXSWL]; char * b; char * c; int i; int j; if (utf8) { m = u8_u16(su, MAXSWL, s); n = u8_u16(su2, MAXSWL, s2); } else { m = strlen(s); n = strlen(s2); } c = (char *) malloc((m + 1) * (n + 1)); b = (char *) malloc((m + 1) * (n + 1)); for (i = 1; i <= m; i++) c[i*(n+1)] = 0; for (j = 0; j <= n; j++) c[j] = 0; for (i = 1; i <= m; i++) { for (j = 1; j <= n; j++) { if ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)) || (!utf8) && ((*(s+i-1)) == (*(s2+j-1)))) { c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1; b[i*(n+1) + j] = LCS_UPLEFT; } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) { c[i*(n+1) + j] = c[(i-1)*(n+1) + j]; b[i*(n+1) + j] = LCS_UP; } else { c[i*(n+1) + j] = c[i*(n+1) + j-1]; b[i*(n+1) + j] = LCS_LEFT; } } } *result = b; free(c); *l1 = m; *l2 = n; } int SuggestMgr::lcslen(const char * s, const char* s2) { int m; int n; int i; int j; char * result; int len = 0; lcs(s, s2, &m, &n, &result); i = m; j = n; while ((i != 0) && (j != 0)) { if (result[i*(n+1) + j] == LCS_UPLEFT) { len++; i--; j--; } else if (result[i*(n+1) + j] == LCS_UP) { i--; } else j--; } if (result) free(result); return len; }