Added MySpell library to project

This commit is contained in:
Ianos Gnatiuc 2006-03-11 16:21:45 +00:00
parent 6455734f3d
commit dbac4566d8
19 changed files with 7251 additions and 0 deletions

View File

@ -0,0 +1,393 @@
#include "license.readme"
#include <cctype>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "affentry.hxx"
#if !defined(_MSC_VER)
using namespace std;
#endif
extern char * mystrdup(const char * s);
extern char * myrevstrdup(const char * s);
// Build a prefix entry from the parsed affix description `dp`.
// The strip/appnd pointers are copied, not duplicated: this entry frees
// them in its destructor, so `dp` must not free them afterwards.
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
{
    // register affix manager
    pmyMgr = pmgr;

    // set up its initial values
    achar = dp->achar;         // char flag
    strip = dp->strip;         // string to strip
    appnd = dp->appnd;         // string to append
    stripl = dp->stripl;       // length of strip string
    appndl = dp->appndl;       // length of append string
    numconds = dp->numconds;   // number of conditions to match
    xpflg = dp->xpflg;         // cross product flag

    // then copy over all of the conditions
    memcpy(&conds[0], &dp->conds[0], SETSIZE * sizeof(conds[0]));

    // every list link starts out empty; flgnxt used to be left
    // uninitialized, so getFlgNxt() could return garbage before
    // setFlgNxt() was called
    next = NULL;
    nextne = NULL;
    nexteq = NULL;
    flgnxt = NULL;
}
// Release the two strings owned by this entry and reset its fields.
PfxEntry::~PfxEntry()
{
    achar = '\0';

    // the append string is released first, then the strip string
    if (appnd != NULL) {
        free(appnd);
    }
    if (strip != NULL) {
        free(strip);
    }

    // leave nothing dangling behind
    pmyMgr = NULL;
    appnd = NULL;
    strip = NULL;
}
// Add this prefix to `word`, assuming the caller wants it applied.
//   word - root word (expected to be `len` characters long)
//   len  - length of word
// Returns a newly allocated string (from mystrdup) holding the prefixed
// word, or NULL when the word is too short, a condition fails, or the
// result would not fit in the MAXWORDLEN work buffer.
char * PfxEntry::add(const char * word, int len)
{
    // the word must be longer than the part that gets stripped and must
    // supply one character for every condition
    if ((len <= stripl) || (len < numconds)) return NULL;

    // the result is appnd followed by the word minus its first stripl
    // characters; refuse anything that would overflow tword below
    if ((len - stripl + appndl) > MAXWORDLEN) return NULL;

    /* make sure all conditions match, reading from the front of the word */
    const unsigned char * cp = (const unsigned char *) word;
    for (int cond = 0; cond < numconds; cond++) {
        if ((conds[*cp++] & (1 << cond)) == 0) return NULL;
    }

    /* we have a match so add prefix */
    char tword[MAXWORDLEN+1];
    int tlen = 0;
    if (appndl) {
        strcpy(tword, appnd);
        tlen = appndl;
    }
    strcpy(tword + tlen, word + stripl);
    return mystrdup(tword);
}
// Check whether `word` is a dictionary root carrying this prefix.
//   word - candidate word; the prefix is 0 length or already matches
//          the beginning of the word
//   len  - length of word
// Returns the hash entry of the root word when it is in the dictionary
// with this affix flag (directly, or through a cross-product suffix
// check), otherwise NULL.
struct hentry * PfxEntry::check(const char * word, int len)
{
    struct hentry * he;           // hash entry of root word or NULL
    char tmpword[MAXWORDLEN+1];

    // length of the word once the prefix is removed
    int tmpl = len - appndl;

    // the remaining root must have positive length and, together with
    // the stripped characters added back, enough characters to meet
    // the number of conditions
    if ((tmpl <= 0) || (tmpl + stripl < numconds)) return NULL;

    // the rebuilt root is strip + remainder; refuse anything that
    // would overflow tmpword
    if ((tmpl + stripl) > MAXWORDLEN) return NULL;

    // generate new root word by removing prefix and adding
    // back any characters that would have been stripped
    if (stripl) strcpy(tmpword, strip);
    strcpy(tmpword + stripl, word + appndl);

    // now make sure all of the conditions on characters are met;
    // bit `cond` of conds[c] says whether c is allowed at position cond
    const unsigned char * cp = (const unsigned char *) tmpword;
    for (int cond = 0; cond < numconds; cond++) {
        if ((conds[*cp++] & (1 << cond)) == 0) return NULL;
    }

    // all conditions are met, so check if the resulting
    // root word is in the dictionary
    tmpl += stripl;
    if ((he = pmyMgr->lookup(tmpword)) != NULL) {
        if (TESTAFF(he->astr, achar, he->alen)) return he;
    }

    // prefix matched but no root word was found;
    // if XPRODUCT is allowed, try again but now
    // cross checked combined with a suffix
    if (xpflg & XPRODUCT) {
        he = pmyMgr->suffix_check(tmpword, tmpl, XPRODUCT, (AffEntry *)this);
        if (he) return he;
    }
    return NULL;
}
// Build a suffix entry from the parsed affix description `dp`.
// The strip/appnd pointers are copied, not duplicated: this entry frees
// them in its destructor, so `dp` must not free them afterwards.
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
{
    // register affix manager
    pmyMgr = pmgr;

    // set up its initial values
    achar = dp->achar;         // char flag
    strip = dp->strip;         // string to strip
    appnd = dp->appnd;         // string to append
    stripl = dp->stripl;       // length of strip string
    appndl = dp->appndl;       // length of append string
    numconds = dp->numconds;   // number of conditions to match
    xpflg = dp->xpflg;         // cross product flag

    // then copy over all of the conditions
    memcpy(&conds[0], &dp->conds[0], SETSIZE * sizeof(conds[0]));

    // reversed copy of the append string; returned by getKey()
    rappnd = myrevstrdup(appnd);

    // the list links were previously left uninitialized here (the
    // prefix constructor clears its links); start them out empty so
    // the getters never return garbage
    next = NULL;
    nextne = NULL;
    nexteq = NULL;
    flgnxt = NULL;
}
// Release the strings owned by this entry and reset its fields.
SfxEntry::~SfxEntry()
{
    achar = '\0';

    // release order: append string, its reversed copy, strip string
    if (appnd != NULL) {
        free(appnd);
    }
    if (rappnd != NULL) {
        free(rappnd);
    }
    if (strip != NULL) {
        free(strip);
    }

    pmyMgr = NULL;
    appnd = NULL;
    strip = NULL;
}
// Add this suffix to `word`, assuming the caller wants it applied.
//   word - root word (expected to be `len` characters long)
//   len  - length of word
// Returns a newly allocated string (from mystrdup) holding the suffixed
// word, or NULL when the word is too short, a condition fails, or the
// word/result would not fit in the MAXWORDLEN work buffer.
char * SfxEntry::add(const char * word, int len)
{
    // the word must be longer than the part that gets stripped and must
    // supply one character for every condition
    if ((len <= stripl) || (len < numconds)) return NULL;

    // both the copied word and the final result have to fit in tword
    if ((len > MAXWORDLEN) || ((len - stripl + appndl) > MAXWORDLEN)) return NULL;

    /* make sure all conditions match, reading backwards from the end */
    const unsigned char * cp = (const unsigned char *) (word + len);
    for (int cond = numconds; --cond >= 0; ) {
        if ((conds[*--cp] & (1 << cond)) == 0) return NULL;
    }

    /* we have a match so add suffix */
    char tword[MAXWORDLEN+1];
    strcpy(tword, word);

    // overwrite the stripped tail with the suffix, or just cut it off
    char * pp = tword + (len - stripl);
    if (appndl) {
        strcpy(pp, appnd);
    } else {
        *pp = '\0';
    }
    return mystrdup(tword);
}
// see if this suffix is present in the word
struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEntry* ppfx)
{
int tmpl; // length of tmpword
int cond; // condition beng examined
struct hentry * he; // hash entry pointer
unsigned char * cp;
char tmpword[MAXWORDLEN+1];
PfxEntry* ep = (PfxEntry *) ppfx;
// if this suffix is being cross checked with a prefix
// but it does not support cross products skip it
if ((optflags & XPRODUCT) != 0 && (xpflg & XPRODUCT) == 0)
return NULL;
// upon entry suffix is 0 length or already matches the end of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
tmpl = len - appndl;
if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
// generate new root word by removing suffix and adding
// back any characters that would have been stripped or
// or null terminating the shorter string
strcpy (tmpword, word);
cp = (unsigned char *)(tmpword + tmpl);
if (stripl) {
strcpy ((char *)cp, strip);
tmpl += stripl;
cp = (unsigned char *)(tmpword + tmpl);
} else *cp = '\0';
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
// this file for more info on exactly what is being
// tested
for (cond = numconds; --cond >= 0; ) {
if ((conds[*--cp] & (1 << cond)) == 0) break;
}
// if all conditions are met then check if resulting
// root word in the dictionary
if (cond < 0) {
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
if (TESTAFF(he->astr, achar , he->alen) &&
((optflags & XPRODUCT) == 0 ||
TESTAFF(he->astr, ep->getFlag(), he->alen))) return he;
}
}
}
return NULL;
}
#if 0
Appendix: Understanding Affix Code
An affix is either a prefix or a suffix attached to root words to make
other words.
Basically a Prefix or a Suffix is set of AffEntry objects
which store information about the prefix or suffix along
with supporting routines to check if a word has a particular
prefix or suffix or a combination.
The structure affentry is defined as follows:
struct affentry
{
unsigned char achar; // char used to represent the affix
char * strip; // string to strip before adding affix
char * appnd; // the affix string to add
short stripl; // length of the strip string
short appndl; // length of the affix string
short numconds; // the number of conditions that must be met
short xpflg; // flag: XPRODUCT- combine both prefix and suffix
char conds[SETSIZE]; // array which encodes the conditions to be met
};
Here is a suffix borrowed from the en_US.aff file. This file
is whitespace delimited.
SFX D Y 4
SFX D 0 d e
SFX D y ied [^aeiou]y
SFX D 0 ed [^ey]
SFX D 0 ed [aeiou]y
This information can be interpreted as follows:
The first line has 4 fields:
Field
-----
1 SFX - indicates this is a suffix
2 D - is the name of the character flag which represents this suffix
3 Y - indicates it can be combined with prefixes (cross product)
4 4 - indicates that sequence of 4 affentry structures are needed to
properly store the affix information
The remaining lines describe the unique information for the 4 SfxEntry
objects that make up this affix. Each line can be interpreted
as follows: (note fields 1 and 2 are as a check against line 1 info)
Field
-----
1 SFX - indicates this is a suffix
2 D - is the name of the character flag for this affix
3 y - the string of chars to strip off before adding affix
(a 0 here indicates the NULL string)
4 ied - the string of affix characters to add
5 [^aeiou]y - the conditions which must be met before the affix
can be applied
Field 5 is interesting. Since this is a suffix, field 5 tells us that
there are 2 conditions that must be met. The first condition is that
the next to the last character in the word must *NOT* be any of the
following "a", "e", "i", "o" or "u". The second condition is that
the last character of the word must end in "y".
So how can we encode this information concisely and be able to
test for both conditions in a fast manner? The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
using a character (cast to an unsigned char) of a string, we have 8 bits
of information we can store about that character. Specifically we
could use each bit to say if that character is allowed in any of the
last (or first for prefixes) 8 characters of the word.
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
For prefixes, it does this by setting bit 0 if that char is valid
in the first position, bit 1 if valid in the second position, and so on.
If a bit is not set, then that char is not valid for that position in the
word.
If working with suffixes bit 0 is used for the character closest
to the front, bit 1 for the next character towards the end, ...,
with bit numconds-1 representing the last char at the end of the string.
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
So to make this clearer, lets encode the conds array values for the
first two affentries for the suffix D described earlier.
For the first affentry:
numconds = 1 (only examine the last character)
conds['e'] = (1 << 0) (the word must end in an E)
all others are all 0
For the second affentry:
numconds = 2 (only examine the last two characters)
conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
where X is all characters *but* a, e, i, o, or u
conds['y'] = (1 << 1) (the last char must be a y)
all other bits for all other entries in the conds array are zero
#endif

View File

@ -0,0 +1,86 @@
#ifndef _AFFIX_HXX_
#define _AFFIX_HXX_
#include "atypes.hxx"
#include "baseaffix.hxx"
#include "affixmgr.hxx"
/* A Prefix Entry */
class PfxEntry : public AffEntry
{
private:
    // affix manager used for dictionary lookups and suffix cross checks
    AffixMgr* pmyMgr;

    // links to other prefix entries; maintained through the setters below
    PfxEntry* next;
    PfxEntry* nexteq;
    PfxEntry* nextne;
    PfxEntry* flgnxt;

public:
    PfxEntry(AffixMgr* pmgr, affentry* dp);
    ~PfxEntry();

    // look for a dictionary root of `word` that carries this prefix
    struct hentry* check(const char* word, int len);

    // build a newly allocated copy of `word` with this prefix applied
    char* add(const char* word, int len);

    // simple accessors
    bool allowCross() { return ((xpflg & XPRODUCT) != 0); }
    unsigned char getFlag() { return achar; }
    const char* getKey() { return appnd; }

    // list link getters
    PfxEntry* getNext() { return next; }
    PfxEntry* getNextNE() { return nextne; }
    PfxEntry* getNextEQ() { return nexteq; }
    PfxEntry* getFlgNxt() { return flgnxt; }

    // list link setters
    void setNext(PfxEntry* ptr) { next = ptr; }
    void setNextNE(PfxEntry* ptr) { nextne = ptr; }
    void setNextEQ(PfxEntry* ptr) { nexteq = ptr; }
    void setFlgNxt(PfxEntry* ptr) { flgnxt = ptr; }
};
/* A Suffix Entry */
class SfxEntry : public AffEntry
{
private:
    // affix manager used for dictionary lookups
    AffixMgr* pmyMgr;

    // reversed copy of the append string; this is what getKey() returns
    char* rappnd;

    // links to other suffix entries; maintained through the setters below
    SfxEntry* next;
    SfxEntry* nexteq;
    SfxEntry* nextne;
    SfxEntry* flgnxt;

public:
    SfxEntry(AffixMgr* pmgr, affentry* dp);
    ~SfxEntry();

    // look for a dictionary root of `word` that carries this suffix,
    // optionally cross checked against the prefix entry `ppfx`
    struct hentry* check(const char* word, int len, int optflags,
                         AffEntry* ppfx);

    // build a newly allocated copy of `word` with this suffix applied
    char* add(const char* word, int len);

    // simple accessors
    bool allowCross() { return ((xpflg & XPRODUCT) != 0); }
    unsigned char getFlag() { return achar; }
    const char* getKey() { return rappnd; }

    // list link getters
    SfxEntry* getNext() { return next; }
    SfxEntry* getNextNE() { return nextne; }
    SfxEntry* getNextEQ() { return nexteq; }
    SfxEntry* getFlgNxt() { return flgnxt; }

    // list link setters
    void setNext(SfxEntry* ptr) { next = ptr; }
    void setNextNE(SfxEntry* ptr) { nextne = ptr; }
    void setNextEQ(SfxEntry* ptr) { nexteq = ptr; }
    void setFlgNxt(SfxEntry* ptr) { flgnxt = ptr; }
};
#endif

1233
goldlib/myspell/affixmgr.cxx Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,66 @@
#ifndef _AFFIXMGR_HXX_
#define _AFFIXMGR_HXX_
#include "atypes.hxx"
#include "baseaffix.hxx"
#include "hashmgr.hxx"
#include <cstdio>
// Affix manager: declared here, implemented in affixmgr.cxx.
// Constructed from an affix file path and a HashMgr; PfxEntry/SfxEntry call
// back into it (lookup, suffix_check) and MySpell queries it for the try
// string, the encoding and the compound flag.
class AffixMgr
{
// four tables of SETSIZE entry pointers each
// NOTE(review): presumably pStart/sStart are prefix/suffix lists keyed by a
// character and pFlag/sFlag are keyed by affix flag - confirm in affixmgr.cxx
AffEntry * pStart[SETSIZE];
AffEntry * sStart[SETSIZE];
AffEntry * pFlag[SETSIZE];
AffEntry * sFlag[SETSIZE];
// presumably the HashMgr handed to the constructor - confirm in affixmgr.cxx
HashMgr * pHMgr;
// values exposed through get_try_string / get_encoding / get_compound
char * trystring;
char * encoding;
char * compound;
int cpdmin;
// replacement table and its entry count (see get_numrep / get_reptable)
int numrep;
replentry * reptable;
// map table and its entry count (see get_nummap / get_maptable)
int nummap;
mapentry * maptable;
bool nosplitsugs;
public:
AffixMgr(const char * affpath, HashMgr * ptr);
~AffixMgr();
// affix checks: each returns a hash entry on success; MySpell::check treats
// a NULL result from affix_check as "no match"
struct hentry * affix_check(const char * word, int len);
struct hentry * prefix_check(const char * word, int len);
// PfxEntry::check calls this with XPRODUCT and itself as ppfx
struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx);
int expand_rootword(struct guessword * wlst, int maxn,
const char * ts, int wl, const char * ap, int al);
// MySpell::check passes the first character of get_compound() as compound_flag
struct hentry * compound_check(const char * word, int len, char compound_flag);
// root word lookup used by PfxEntry::check and SfxEntry::check
struct hentry * lookup(const char * word);
int get_numrep();
struct replentry * get_reptable();
int get_nummap();
struct mapentry * get_maptable();
// MySpell keeps the returned encoding string and frees it in its destructor
char * get_encoding();
// MySpell's constructor frees the returned try string after use
char * get_try_string();
// MySpell::check treats a NULL result as "no compound flag configured"
char * get_compound();
bool get_nosplitsugs();
private:
// affix file parsing helpers (implemented in affixmgr.cxx)
int parse_file(const char * affpath);
int parse_try(char * line);
int parse_set(char * line);
int parse_cpdflag(char * line);
int parse_cpdmin(char * line);
int parse_reptable(char * line, FILE * af);
int parse_maptable(char * line, FILE * af);
int parse_affix(char * line, const char at, FILE * af);
void encodeit(struct affentry * ptr, char * cs);
int build_pfxlist(AffEntry* pfxptr);
int build_sfxlist(AffEntry* sfxptr);
int process_pfx_order();
int process_sfx_order();
};
#endif

View File

@ -0,0 +1,45 @@
#ifndef _ATYPES_HXX_
#define _ATYPES_HXX_
// number of entries in the per-character conds arrays and in the
// AffixMgr entry tables (one per unsigned char value)
#define SETSIZE 256
// NOTE(review): not referenced in the code visible here - presumably a limit on affix count
#define MAXAFFIXES 256
// size (minus terminator) of the word work buffers; MySpell::spell and
// MySpell::suggest reject words longer than MAXWORDLEN-1
#define MAXWORDLEN 100
// bit tested in xpflg / optflags to request or allow prefix+suffix cross products
#define XPRODUCT (1 << 0)
// NOTE(review): not referenced in the code visible here - presumably the affix file line buffer size
#define MAXLNLEN 1024
// search the first c bytes of flag string a for flag b; yields a non-null
// pointer when the flag is present (memchr semantics)
#define TESTAFF( a , b , c ) memchr((void *)(a), (int)(b), (size_t)(c) )
// Plain description of one affix rule. PfxEntry and SfxEntry copy every
// field in their constructors and take over the strip/appnd pointers.
struct affentry
{
char * strip; // string to strip before adding the affix
char * appnd; // the affix string to add
short stripl; // length of the strip string
short appndl; // length of the affix string
short numconds; // number of conditions that must be met
short xpflg; // cross product flag, tested against XPRODUCT
char achar; // char flag used to represent the affix
char conds[SETSIZE]; // condition bits, indexed by character value (cast to unsigned char)
};
// One pattern/replacement pair; AffixMgr::get_reptable returns an array of these.
// NOTE(review): presumably filled by AffixMgr::parse_reptable - confirm in affixmgr.cxx
struct replentry {
char * pattern;
char * replacement;
};
// One character-set entry; AffixMgr::get_maptable returns an array of these.
// NOTE(review): presumably len is the number of characters in set - confirm in affixmgr.cxx
struct mapentry {
char * set;
int len;
};
// One candidate word slot; AffixMgr::expand_rootword takes an array of these.
// NOTE(review): meaning of allow is not visible here - confirm in affixmgr.cxx
struct guessword {
char * word;
bool allow;
};
#endif

View File

@ -0,0 +1,17 @@
#ifndef _BASEAFF_HXX_
#define _BASEAFF_HXX_
// Data shared by prefix and suffix entries; PfxEntry and SfxEntry derive
// from this class and fill the fields from a struct affentry.
// SETSIZE is not defined in this header: atypes.hxx has to be included first.
class AffEntry
{
protected:
char * appnd; // the affix string to add
char * strip; // string to strip before adding the affix
short appndl; // length of the affix string
short stripl; // length of the strip string
short numconds; // number of conditions that must be met
short xpflg; // cross product flag, tested against XPRODUCT
char achar; // char flag used to represent the affix
char conds[SETSIZE]; // condition bits, indexed by character value (cast to unsigned char)
};
#endif

3850
goldlib/myspell/csutil.cxx Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,67 @@
#ifndef __CSUTILHXX__
#define __CSUTILHXX__
// First some base level utility routines
// remove end of line char(s)
void mychomp(char * s);
// duplicate string
char * mystrdup(const char * s);
// duplicate reverse of string
char * myrevstrdup(const char * s);
// parse into tokens with char delimiter
char * mystrsep(char ** sptr, const char delim);
// is one string a leading subset of another
int isSubset(const char * s1, const char * s2);
// character encoding information
// Case information for one character; MySpell::cleanword indexes a table of
// these by unsigned char value.
struct cs_info {
unsigned char ccase; // non-zero when the character counts as a capital in cleanword
unsigned char clower; // presumably the lowercase form of the character - confirm in csutil.cxx
unsigned char cupper; // presumably the uppercase form; cleanword treats cupper == clower as case-neutral
};
// Pairs an encoding name with a cs_info table.
// NOTE(review): presumably what get_current_cs searches - confirm in csutil.cxx
struct enc_entry {
const char * enc_name;
struct cs_info * cs_table;
};
// language to encoding default map
// Pairs a language name with its default encoding name.
// NOTE(review): presumably what get_default_enc searches - confirm in csutil.cxx
struct lang_map {
const char * lang;
const char * def_enc;
};
struct cs_info * get_current_cs(const char * es);
const char * get_default_enc(const char * lang);
// convert null terminated string to all caps using encoding
void enmkallcap(char * d, const char * p, const char * encoding);
// convert null terminated string to all lowercase using encoding
void enmkallsmall(char * d, const char * p, const char * encoding);
// convert null terminated string to have initial capital using encoding
void enmkinitcap(char * d, const char * p, const char * encoding);
// convert null terminated string to all caps
void mkallcap(char * p, const struct cs_info * csconv);
// convert null terminated string to all lowercase
void mkallsmall(char * p, const struct cs_info * csconv);
// convert null terminated string to have initial capital
void mkinitcap(char * p, const struct cs_info * csconv);
#endif

127
goldlib/myspell/dictmgr.cxx Normal file
View File

@ -0,0 +1,127 @@
#include <cstdlib>
#include <cstring>
#include <cctype>
#include <cstdio>
#include "dictmgr.hxx"
#if !defined(_MSC_VER)
using namespace std;
#endif
// some utility functions
extern void mychomp(char * s);
extern char * mystrdup(const char * s);
extern char * mystrsep(char ** stringp, const char delim);
// Allocate room for MAXDICTIONARIES entries and fill it from the
// dictionary list at `dictpath`, keeping only lines of type `etype`.
DictMgr::DictMgr(const char * dictpath, const char * etype)
{
    numdict = 0;
    pdentry = (dictentry *)malloc(MAXDICTIONARIES*sizeof(struct dictentry));

    // an allocation failure and an unreadable list both leave the
    // manager empty; no dictionary.lst found is okay
    if ((pdentry == NULL) || (parse_file(dictpath, etype) != 0)) {
        numdict = 0;
    }
}
// Free the strings of every loaded entry, then the entry array itself.
DictMgr::~DictMgr()
{
    if (pdentry != NULL) {
        for (int idx = 0; idx < numdict; ++idx) {
            dictentry * entry = pdentry + idx;

            if (entry->lang != NULL) {
                free(entry->lang);
                entry->lang = NULL;
            }
            if (entry->region != NULL) {
                free(entry->region);
                entry->region = NULL;
            }
            if (entry->filename != NULL) {
                free(entry->filename);
                entry->filename = NULL;
            }
        }

        free(pdentry);
        pdentry = NULL;
    }
    numdict = 0;
}
// Read in the list of etype entries and build up the structures that
// describe them.
//   dictpath - path of the dictionary list file
//   etype    - entry type; only lines whose first 4 characters match
//              it are parsed
// Each accepted line holds four space separated fields: type, language,
// region ("ANY" is stored as an empty string) and file name.
// Returns 1 when the list cannot be opened, 0 otherwise. Malformed
// lines are reported on stderr and skipped.
int DictMgr::parse_file(const char * dictpath, const char * etype)
{
    char line[MAXDICTENTRYLEN+1];
    dictentry * pdict = pdentry;

    // open the dictionary list file
    FILE * dictlst = fopen(dictpath,"r");
    if (!dictlst) {
        return 1;
    }

    // step one is to parse the dictionary list building up the
    // descriptive structures

    // read in each line ignoring any that dont start with etype
    while (fgets(line,MAXDICTENTRYLEN,dictlst)) {
        mychomp(line);

        /* parse in a dictionary entry */
        if (strncmp(line,etype,4) != 0) continue;
        if (numdict >= MAXDICTIONARIES) continue;

        // the entry array comes from malloc, so clear the slot first;
        // this lets a rejected line be cleaned up safely below
        pdict->lang = NULL;
        pdict->region = NULL;
        pdict->filename = NULL;

        char * tp = line;
        char * piece;
        int i = 0;
        while ((piece=mystrsep(&tp,' '))) {
            if (*piece != '\0') {
                switch(i) {
                    case 0: break;
                    case 1: pdict->lang = mystrdup(piece); break;
                    case 2: if (strcmp (piece, "ANY") == 0)
                                pdict->region = mystrdup("");
                            else
                                pdict->region = mystrdup(piece);
                            break;
                    case 3: pdict->filename = mystrdup(piece); break;
                    default: break;
                }
                i++;
            }
            free(piece);
        }

        if (i == 4) {
            numdict++;
            pdict++;
        } else {
            // the slot is reused by the next line, so release whatever
            // this line stored instead of leaking it
            if (pdict->lang) free(pdict->lang);
            if (pdict->region) free(pdict->region);
            if (pdict->filename) free(pdict->filename);
            pdict->lang = NULL;
            pdict->region = NULL;
            pdict->filename = NULL;

            fprintf(stderr,"dictionary list corruption in line \"%s\"\n",line);
            fflush(stderr);
        }
    }
    fclose(dictlst);
    return 0;
}
// Store the address of the dictionary entry array in *ppentry and
// return the number of valid entries in it.
int DictMgr::get_list(dictentry ** ppentry)
{
    *ppentry = pdentry;

    const int count = numdict;
    return count;
}

View File

@ -0,0 +1,31 @@
#ifndef _DICTMGR_HXX_
#define _DICTMGR_HXX_
// capacity of the dictentry array allocated by DictMgr's constructor
#define MAXDICTIONARIES 100
// size used for the line buffer when DictMgr::parse_file reads the list
#define MAXDICTENTRYLEN 1024
// One dictionary list entry as filled in by DictMgr::parse_file.
// All three strings are heap copies released by DictMgr's destructor.
struct dictentry {
char * filename; // fourth field of the list line
char * lang; // second field of the list line
char * region; // third field of the list line; stored as "" when the line says ANY
};
// Loads a dictionary list file and exposes the entries of one type.
class DictMgr
{
int numdict; // number of valid entries in pdentry
dictentry * pdentry; // array with room for MAXDICTIONARIES entries (malloc'ed), or NULL
public:
// reads the list at dictpath, keeping only lines that start with etype
DictMgr(const char * dictpath, const char * etype);
~DictMgr();
// stores the entry array in *ppentry and returns the number of entries
int get_list(dictentry** ppentry);
private:
// returns 1 when the list file cannot be opened, 0 otherwise
int parse_file(const char * dictpath, const char * etype);
};
#endif

View File

@ -0,0 +1,89 @@
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "myspell.hxx"
extern char * mystrdup(const char * s);
using namespace std;
// print the command line synopsis and terminate with a failure status
static void usage_and_exit()
{
    fprintf(stderr,"correct syntax is:\n");
    fprintf(stderr,"example affix_file dictionary_file file_of_words_to_check\n");
    exit(1);
}

// Example driver: spell checks every line of a word list and prints
// suggestions for the words that are not accepted.
//   arg1 - affix file, arg2 - dictionary file, arg3 - file of words to check
int
main(int argc, char** argv)
{
    /* first parse the command line options */
    if (argc < 4 || !argv[1] || !argv[2] || !argv[3]) {
        usage_and_exit();
    }
    char * af = mystrdup(argv[1]);
    char * df = mystrdup(argv[2]);
    char * wtc = mystrdup(argv[3]);

    /* open the words to check list */
    FILE* wtclst = fopen(wtc,"r");
    if (!wtclst) {
        fprintf(stderr,"Error - could not open file of words to check\n");
        exit(1);
    }

    MySpell * pMS= new MySpell(af,df);

    char buf[101];
    while(fgets(buf,100,wtclst)) {
        // drop the line terminator only when there is one; the last
        // line of a file may lack it and must keep its final character
        size_t k = strlen(buf);
        while (k > 0 && (buf[k-1] == '\n' || buf[k-1] == '\r')) {
            buf[--k] = '\0';
        }

        int dp = pMS->spell(buf);
        if (dp) {
            fprintf(stdout,"\"%s\" is okay\n",buf);
            fprintf(stdout,"\n");
        } else {
            fprintf(stdout,"\"%s\" is incorrect!\n",buf);
            fprintf(stdout," suggestions:\n");

            // suggest() can return 0 without storing a list, so start
            // from NULL to keep the free() below well defined
            char ** wlst = NULL;
            int ns = pMS->suggest(&wlst,buf);
            for (int i=0; i < ns; i++) {
                fprintf(stdout," ...\"%s\"\n",wlst[i]);
                free(wlst[i]);
            }
            fprintf(stdout,"\n");
            free(wlst);
        }
    }

    delete pMS;
    fclose(wtclst);
    free(wtc);
    free(df);
    free(af);
    return 0;
}

213
goldlib/myspell/hashmgr.cxx Normal file
View File

@ -0,0 +1,213 @@
#include "license.readme"
#if !defined(_MSC_VER)
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <fcntl.h>
#include <cstdio>
#include "hashmgr.hxx"
extern void mychomp(char * s);
extern char * mystrdup(const char *);
#if !defined(_MSC_VER)
using namespace std;
#endif
// Build a hash table from the munched word list at `tpath`.
// When loading fails the error code is reported on stderr and the
// manager is left empty (tableptr NULL, tablesize 0), so lookup() finds
// nothing and the destructor has nothing left to free.
HashMgr::HashMgr(const char * tpath)
{
    tablesize = 0;
    tableptr = NULL;
    int ec = load_tables(tpath);
    if (ec) {
        /* error condition - report it and discard the partial table */
        fprintf(stderr,"Hash Manager Error : %d\n",ec);
        fflush(stderr);
        if (tableptr) {
            // release every entry loaded before the failure; the head
            // of each column lives in the table, the rest are chained
            for (int i = 0; i < tablesize; i++) {
                struct hentry * head = &tableptr[i];
                struct hentry * pt = head->next;
                if (head->word) free(head->word);
                if (head->astr) free(head->astr);
                while (pt) {
                    struct hentry * nt = pt->next;
                    if (pt->word) free(pt->word);
                    if (pt->astr) free(pt->astr);
                    free(pt);
                    pt = nt;
                }
            }
            free(tableptr);
            // the pointer used to be left dangling here, which made the
            // destructor free it a second time
            tableptr = NULL;
        }
        tablesize = 0;
    }
}
// Free every word and affix string, every chained entry and finally
// the table itself.
HashMgr::~HashMgr()
{
    if (tableptr != NULL) {
        // go through the table column by column
        for (int col = 0; col < tablesize; ++col) {
            // the head of a column is stored inside the table, so only
            // its strings are released
            struct hentry * head = &tableptr[col];
            if (head->word) free(head->word);
            if (head->astr) free(head->astr);

            // chained entries were allocated one by one
            struct hentry * cur = head->next;
            while (cur != NULL) {
                struct hentry * following = cur->next;
                if (cur->word) free(cur->word);
                if (cur->astr) free(cur->astr);
                free(cur);
                cur = following;
            }
        }
        free(tableptr);
    }
    tablesize = 0;
}
// Look up a root word in the hash table.
// Returns the matching entry, or NULL when there is no table or the
// word is not stored.
struct hentry * HashMgr::lookup(const char *word) const
{
    if (!tableptr) return NULL;

    struct hentry * entry = &tableptr[hash(word)];

    // an unused head slot means the whole column is empty
    if (entry->word == NULL) return NULL;

    while (entry != NULL) {
        if (strcmp(word, entry->word) == 0) return entry;
        entry = entry->next;
    }
    return NULL;
}
// Add a word to the hash table (private).
//   word/wl - the word and its length
//   aff/al  - the affix flag string and its length
// Returns 0 on success and 1 when an allocation fails.
int HashMgr::add_word(const char * word, int wl, const char * aff, int al)
{
    struct hentry * slot = &tableptr[hash(word)];
    struct hentry * entry = slot;

    // a used head slot means the new entry has to be chained
    if (slot->word != NULL) {
        entry = (struct hentry *) malloc (sizeof(struct hentry));
        if (entry == NULL) return 1;
    }

    entry->wlen = wl;
    entry->alen = al;
    entry->word = mystrdup(word);
    entry->astr = mystrdup(aff);
    entry->next = NULL;

    if (entry != slot) {
        // append at the end of the column's chain
        struct hentry * tail = slot;
        while (tail->next != NULL) tail = tail->next;
        tail->next = entry;
    }

    // a non-empty string that could not be duplicated is an error
    if ((wl) && (entry->word == NULL)) return 1;
    if ((al) && (entry->astr == NULL)) return 1;
    return 0;
}
// Walk the hash table entry by entry.
//   col - current column, updated in place; a negative value restarts
//   hp  - entry returned by the previous call, or NULL to restart
// Returns the next entry, or NULL (with col reset to -1) at the end.
struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
{
    // reset to start
    if ((col < 0) || (hp == NULL)) {
        col = -1;
        hp = NULL;
    }

    // more entries chained in the current column
    if ((hp != NULL) && (hp->next != NULL)) {
        return hp->next;
    }

    // otherwise move on to the next non-blank column
    for (++col; col < tablesize; ++col) {
        struct hentry * head = &tableptr[col];
        if (head->word != NULL) return head;
    }

    // ran off the end of the table
    col = -1;
    return NULL;
}
// Load a munched word list and build a hash table on the fly.
// The first line of the file gives the table size; every following
// line is "word" or "word/affixflags".
// Returns 0 on success, otherwise:
//   1 - file could not be opened      2 - first line could not be read
//   3 - table allocation failed       4 - table size missing or invalid
//   5 - a word could not be added
// The file is closed on every path.
int HashMgr::load_tables(const char * tpath)
{
    // raw dictionary - munched file
    FILE * rawdict = fopen(tpath, "r");
    if (rawdict == NULL) return 1;

    // first read the first line of file to get hash table size
    char ts[MAXDELEN];
    if (! fgets(ts, MAXDELEN-1,rawdict)) {
        fclose(rawdict);
        return 2;
    }
    mychomp(ts);
    tablesize = atoi(ts);
    if (tablesize <= 0) {
        // zero, unparsable or negative sizes are all unusable
        tablesize = 0;
        fclose(rawdict);
        return 4;
    }
    tablesize = tablesize + 5;
    if ((tablesize %2) == 0) tablesize++;

    // allocate the hash table
    tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
    if (! tableptr) {
        fclose(rawdict);
        return 3;
    }

    // loop through all words on munch list and add to hash
    // table and create word and affix strings
    while (fgets(ts,MAXDELEN-1,rawdict)) {
        mychomp(ts);

        // split each line into word and affix char strings
        char * ap = strchr(ts,'/');
        int al = 0;
        if (ap) {
            *ap = '\0';
            ap++;
            al = (int) strlen(ap);
        }
        int wl = (int) strlen(ts);

        // add the word and its index
        if (add_word(ts,wl,ap,al)) {
            fclose(rawdict);
            return 5;
        }
    }
    fclose(rawdict);
    return 0;
}
// The hash function is a simple load and rotate algorithm: the first
// (up to) four characters are packed into the value, every further
// character is mixed in after a rotation, and the result is reduced
// modulo the table size.
int HashMgr::hash(const char * word) const
{
    long hv = 0;

    // load phase
    int loaded = 0;
    while ((loaded < 4) && (*word != 0)) {
        hv = (hv << 8) | (*word++);
        loaded++;
    }

    // rotate phase
    for ( ; *word != 0; ++word) {
        ROTATE(hv,ROTATE_LEN);
        hv ^= (*word);
    }

    return (unsigned long) hv % tablesize;
}

View File

@ -0,0 +1,27 @@
#ifndef _HASHMGR_HXX_
#define _HASHMGR_HXX_
#include "htypes.hxx"
// Hash table of dictionary root words, loaded from a munched word list.
class HashMgr
{
int tablesize; // number of columns in tableptr; 0 when nothing is loaded
struct hentry * tableptr; // calloc'ed array of column heads, or NULL
public:
// loads the word list at tpath; failures are reported on stderr
HashMgr(const char * tpath);
~HashMgr();
// returns the entry whose word equals the argument, or NULL
struct hentry * lookup(const char *) const;
// returns the column index for a word (value modulo tablesize)
int hash(const char *) const;
// iterates over all entries; returns NULL at the end
struct hentry * walk_hashtable(int & col, struct hentry * hp) const;
private:
HashMgr( const HashMgr & ); // not implemented
HashMgr &operator=( const HashMgr & ); // not implemented
// returns 0 on success, a non-zero error code otherwise
int load_tables(const char * tpath);
// returns 0 on success, 1 when an allocation fails
int add_word(const char * word, int wl, const char * ap, int al);
};
#endif

View File

@ -0,0 +1,20 @@
#ifndef _HTYPES_HXX_
#define _HTYPES_HXX_
// size of the line buffer used when reading the munched word list
#define MAXDELEN 256
// number of bits the hash value is rotated by for each extra character
#define ROTATE_LEN 5
// Shift v left by q bits in place and bring bits 32-q .. 31 of the old
// value back in at the bottom (a left rotation for a 32-bit quantity).
// q is parenthesized so that expression arguments expand correctly.
// NOTE: the expansion ends with its own ';' - kept for existing callers.
#define ROTATE(v,q) \
(v) = ((v) << (q)) | (((v) >> (32 - (q))) & ((1 << (q))-1));
// One hash table entry: a root word plus its affix flag string, chained
// to the other entries that hash to the same column.
struct hentry
{
short wlen; // length of word
short alen; // length of astr
char * word; // heap copy of the root word; NULL marks an unused column head
char * astr; // heap copy of the affix flag characters, searched with TESTAFF
struct hentry * next; // next entry in the same column, or NULL
};
#endif

View File

@ -0,0 +1,61 @@
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* NOTE: A special thanks and credit goes to Geoff Kuenning
* the creator of ispell. MySpell's affix algorithms were
* based on those of ispell which should be noted is
* copyright Geoff Kuenning et.al. and now available
* under a BSD style license. For more information on ispell
* and affix compression in general, please see:
* http://www.cs.ucla.edu/ficus-members/geoff/ispell.html
* (the home page for ispell)
*
* An almost complete rewrite of MySpell for use by
* the Mozilla project has been developed by David Einstein
* (Deinst@world.std.com). David and I are now
* working on parallel development tracks to help
* our respective projects (Mozilla and OpenOffice.org
* and we will maintain full affix file and dictionary
* file compatibility and work on merging our versions
* of MySpell back into a single tree. David has been
* a significant help in improving MySpell.
*
* Special thanks also go to La'szlo' Ne'meth
* <nemethl@gyorsposta.hu> who is the author of the
* Hungarian dictionary and who developed and contributed
* the code to support compound words in MySpell
* and fixed numerous problems with the encoding
* case conversion tables.
*
*/

302
goldlib/myspell/myspell.cxx Normal file
View File

@ -0,0 +1,302 @@
#include "license.readme"
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "myspell.hxx"
#if !defined(_MSC_VER)
using namespace std;
#endif
// Set up the hash, affix and suggestion managers for one dictionary.
//   affpath - affix file
//   dpath   - dictionary (munched word list) file
MySpell::MySpell(const char * affpath, const char * dpath)
{
    encoding = NULL;
    csconv = NULL;

    // the hash manager comes first: the affix manager needs access
    // to its lookup methods
    pHMgr = new HashMgr(dpath);
    pAMgr = new AffixMgr(affpath,pHMgr);

    // the affix manager supplies the preferred try string and the
    // dictionary encoding
    char * tryChars = pAMgr->get_try_string();
    encoding = pAMgr->get_encoding();
    csconv = get_current_cs(encoding);

    // the suggestion manager is set up last
    maxSug = 100;
    pSMgr = new SuggestMgr(tryChars, maxSug, pAMgr);

    // the try string is only needed during construction
    if (tryChars != NULL) {
        free(tryChars);
    }
}
// Destroy the three managers (suggestion, affix, hash - in that order)
// and release the encoding name.
MySpell::~MySpell()
{
    if (pSMgr != NULL) {
        delete pSMgr;
    }
    if (pAMgr != NULL) {
        delete pAMgr;
    }
    if (pHMgr != NULL) {
        delete pHMgr;
    }
    pSMgr = NULL;
    pAMgr = NULL;
    pHMgr = NULL;

    // the conversion table is not owned here, only forgotten
    csconv = NULL;

    if (encoding != NULL) {
        free(encoding);
    }
    encoding = NULL;
}
// Copy src to dest without its leading and trailing special characters.
//   dest     - receives the cleaned, null terminated word
//   src      - word to clean
//   pcaptype - receives NOCAP, INITCAP, ALLCAP or HUHCAP
//   pabbrev  - set to 1 when a period directly follows the cleaned
//              word, otherwise 0
// Returns the length of the cleaned word (0 when nothing is left).
int MySpell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev)
{
    // with the new breakiterator code this should not be needed anymore
    const char * special_chars = "._#$%&()* +,-/:;<=>[]\\^`{|}~\t \x0a\x0d\x01\'\"";

    const unsigned char * in = (const unsigned char *) src;
    unsigned char * out = (unsigned char *) dest;

    // step over any leading special characters
    for ( ; *in != '\0'; ++in) {
        if (strchr(special_chars, (int)(*in)) == NULL) break;
    }

    // ignore any trailing special characters and remember whether the
    // first one of them is a period
    *pabbrev = 0;
    int remaining = strlen((const char *)in);
    while ((remaining > 0) && (strchr(special_chars, (int)(in[remaining-1])) != NULL)) {
        --remaining;
    }
    if (in[remaining] == '.') *pabbrev = 1;

    // an empty word can't be an abbreviation and can't be capitalized
    if (remaining <= 0) {
        *pcaptype = NOCAP;
        *pabbrev = 0;
        *out = '\0';
        return 0;
    }

    // copy the word while counting capital and case-neutral characters
    const int nc = remaining;
    int ncap = 0;
    int nneutral = 0;
    for (int i = 0; i < nc; ++i) {
        const unsigned char ch = in[i];
        if (csconv[ch].ccase) ncap++;
        if (csconv[ch].cupper == csconv[ch].clower) nneutral++;
        out[i] = ch;
    }

    // remember to terminate the destination string
    out[nc] = '\0';

    // finally classify the capitalization
    if (ncap == 0) {
        *pcaptype = NOCAP;
    } else if ((ncap == 1) && csconv[(unsigned char)(*dest)].ccase) {
        *pcaptype = INITCAP;
    } else if ((ncap == nc) || ((ncap + nneutral) == nc)) {
        *pcaptype = ALLCAP;
    } else {
        *pcaptype = HUHCAP;
    }
    return nc;
}
// Spell check one word.
// Returns 1 when the word is accepted (or is empty after cleaning) and
// 0 when it is rejected or longer than MAXWORDLEN-1 characters.
int MySpell::spell(const char * word)
{
    char cw[MAXWORDLEN+1];
    char wspace[MAXWORDLEN+1];

    int wl = strlen(word);
    if (wl > (MAXWORDLEN - 1)) return 0;

    int captype = 0;
    int abbv = 0;
    wl = cleanword(cw, word, &captype, &abbv);
    if (wl == 0) return 1;

    char * rv = NULL;
    bool knownCap = true;

    switch(captype) {
        case HUHCAP:
        case NOCAP:
            // checked exactly as written
            rv = check(cw);
            break;

        case ALLCAP:
            // try all lowercase, then initial capital, then as written
            memcpy(wspace,cw,(wl+1));
            mkallsmall(wspace, csconv);
            rv = check(wspace);
            if (!rv) {
                mkinitcap(wspace, csconv);
                rv = check(wspace);
            }
            if (!rv) rv = check(cw);
            break;

        case INITCAP:
            // try all lowercase, then as written
            memcpy(wspace,cw,(wl+1));
            mkallsmall(wspace, csconv);
            rv = check(wspace);
            if (!rv) rv = check(cw);
            break;

        default:
            knownCap = false;
            break;
    }

    // every capitalization type falls back to the word with its
    // trailing period when the plain forms were not found
    if (knownCap && (abbv) && !(rv)) {
        memcpy(wspace,cw,wl);
        wspace[wl] = '.';
        wspace[wl+1] = '\0';
        rv = check(wspace);
    }

    return rv ? 1 : 0;
}
// Look a word up as a root word, then via affix stripping, then as a
// compound word (only when the affix manager reports a compound flag).
// Returns the matching hash entry's word, or NULL when nothing matches.
char * MySpell::check(const char * word)
{
    struct hentry * entry = NULL;

    // plain dictionary lookup first
    if (pHMgr) entry = pHMgr->lookup(word);
    if (entry) return entry->word;

    // without an affix manager there is nothing more to try
    if (!pAMgr) return NULL;

    // try stripping off affixes
    entry = pAMgr->affix_check(word, strlen(word));

    // try check compound word
    if ((entry == NULL) && (pAMgr->get_compound())) {
        entry = pAMgr->compound_check(word, strlen(word), (pAMgr->get_compound())[0]);
    }

    return entry ? entry->word : NULL;
}
// Build a list of suggestions for a (presumably misspelled) word.
//   slst - receives a calloc'ed array of malloc'ed strings when at least
//          one suggestion is found; it is set to NULL on every path that
//          returns 0, so the caller never sees an indeterminate pointer
//   word - NUL-terminated word to find suggestions for
// Returns the number of suggestions stored in *slst (0 when there are
// none, the word is too long, or memory ran out).
int MySpell::suggest(char*** slst, const char * word)
{
    char cw[MAXWORDLEN+1];
    char wspace[MAXWORDLEN+1];

    // make every early return below leave a well-defined result
    *slst = NULL;

    if (! pSMgr) return 0;
    int wl = strlen(word);
    if (wl > (MAXWORDLEN-1)) return 0;
    int captype = 0;
    int abbv = 0;
    wl = cleanword(cw, word, &captype, &abbv);
    if (wl == 0) return 0;

    int ns = 0;
    char ** wlst = (char **) calloc(maxSug, sizeof(char *));
    if (wlst == NULL) return 0;

    switch(captype) {
        case NOCAP: {
            ns = pSMgr->suggest(wlst, ns, cw);
            break;
        }

        case INITCAP: {
            // suggest for the lower-cased word and re-capitalize the results,
            // then add suggestions for the word as typed
            memcpy(wspace,cw,(wl+1));
            mkallsmall(wspace, csconv);
            ns = pSMgr->suggest(wlst, ns, wspace);
            if (ns > 0) {
                for (int j=0; j < ns; j++)
                    mkinitcap(wlst[j], csconv);
            }
            ns = pSMgr->suggest(wlst,ns,cw);
            break;
        }

        case HUHCAP: {
            ns = pSMgr->suggest(wlst, ns, cw);
            if (ns != -1) {
                memcpy(wspace,cw,(wl+1));
                mkallsmall(wspace, csconv);
                ns = pSMgr->suggest(wlst, ns, wspace);
            }
            break;
        }

        case ALLCAP: {
            // suggest for the lower-cased word and upper-case the results,
            // then add suggestions for the word as typed
            memcpy(wspace,cw,(wl+1));
            mkallsmall(wspace, csconv);
            ns = pSMgr->suggest(wlst, ns, wspace);
            if (ns > 0) {
                for (int j=0; j < ns; j++)
                    mkallcap(wlst[j], csconv);
            }
            if (ns != -1)
                ns = pSMgr->suggest(wlst, ns , cw);
            break;
        }
    }

    if (ns > 0) {
        *slst = wlst;
        return ns;
    }

    // try ngram approach since found nothing
    if (ns == 0) {
        ns = pSMgr->ngsuggest(wlst, cw, pHMgr);
        if (ns) {
            switch(captype) {
                case NOCAP:  break;
                case HUHCAP: break;
                case INITCAP: {
                    for (int j=0; j < ns; j++)
                        mkinitcap(wlst[j], csconv);
                }
                break;
                case ALLCAP: {
                    for (int j=0; j < ns; j++)
                        mkallcap(wlst[j], csconv);
                }
                break;
            }
            *slst = wlst;
            return ns;
        }
    }

    if (ns < 0) {
        // we ran out of memory - we should free up as much as possible
        for (int i=0;i<maxSug; i++)
            if (wlst[i] != NULL) free(wlst[i]);
    }
    free(wlst);
    *slst = NULL;
    return 0;
}
// Return the `encoding` string held by this object.
// The pointer refers to storage owned by the MySpell instance (the object
// frees `encoding` during its own cleanup), so the caller must not free it.
char * MySpell::get_dic_encoding()
{
return encoding;
}

View File

@ -0,0 +1,37 @@
#include "hashmgr.hxx"
#include "affixmgr.hxx"
#include "suggestmgr.hxx"
#include "csutil.hxx"
// Capitalization types reported by MySpell::cleanword() through pcaptype.
// no upper case character in the word
#define NOCAP 0
// exactly one upper case character, and it is the first character
#define INITCAP 1
// every character is upper case or has no case distinction
#define ALLCAP 2
// any other mix of upper and lower case
#define HUHCAP 3
#ifndef _MYSPELLMGR_HXX_
#define _MYSPELLMGR_HXX_
// Spell checker front end: combines a hash manager (root words), an affix
// manager (affix stripping and compound checks) and a suggestion manager.
class MySpell
{
// affix manager; used by check() for affix stripping and compound checks
AffixMgr* pAMgr;
// hash manager; used by check() for root word lookup and by suggest()
// for the n-gram fallback
HashMgr* pHMgr;
// suggestion manager; suggest() returns 0 when it is NULL
SuggestMgr* pSMgr;
// string returned by get_dic_encoding(); owned by this object
char * encoding;
// per-character case table (ccase / cupper / clower) indexed by unsigned char
struct cs_info * csconv;
// number of slots allocated for the suggestion list in suggest()
int maxSug;
public:
// NOTE(review): presumably the paths of the affix file and the dictionary
// file - confirm against the constructor definition
MySpell(const char * affpath, const char * dpath);
~MySpell();
// store a newly allocated suggestion list in *slst; returns its length
int suggest(char*** slst, const char * word);
// returns 1 when the word is accepted, 0 otherwise
int spell(const char *);
char * get_dic_encoding();
private:
// (dest, src, pcaptype, pabbrev): strip special characters and classify
int cleanword(char *, const char *, int *, int *);
// returns the matching dictionary word or NULL
char * check(const char *);
};
#endif

View File

@ -0,0 +1,539 @@
#include "license.readme"
#include <cstdlib>
#include <cctype>
#include <cstring>
#include <cstdio>
#include "suggestmgr.hxx"
#if !defined(_MSC_VER)
using namespace std;
#endif
extern char * mystrdup(const char *);
// Set up a suggestion manager.
//   tryme - characters to try when building candidate suggestions; a
//           private copy is kept in ctry
//   maxn  - maximum number of suggestions to generate
//   aptr  - affix manager used to validate candidates (may be NULL)
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
                       AffixMgr * aptr)
{
    // register affix manager and check in string of chars to
    // try when building candidate suggestions
    pAMgr = aptr;
    ctry = mystrdup(tryme);
    ctryl = 0;
    if (ctry)
        ctryl = strlen(ctry);
    maxSug = maxn;

    // split ("two words") suggestions stay enabled unless the affix manager
    // says otherwise; the getter's result must actually be stored
    nosplitsugs = false;
    if (pAMgr) nosplitsugs = pAMgr->get_nosplitsugs();
}
// Release the private copy of the try characters and reset all members.
// The affix manager is only referenced, never freed, by this class.
SuggestMgr::~SuggestMgr()
{
    free(ctry);     // free(NULL) is a no-op, so no guard is needed
    ctry = NULL;
    ctryl = 0;
    maxSug = 0;
    pAMgr = NULL;
}
// Generate suggestions for a misspelled word.
//   wlst - caller-provided array of char * slots to fill
//   ns   - number of suggestions already present in wlst
//   word - the misspelled word
// Each generator runs only while the list is neither full (maxSug) nor in
// the error state (-1).  Returns the new suggestion count, or -1 when a
// generator ran out of memory.
int SuggestMgr::suggest(char** wlst, int ns, const char * word)
{
    int count = ns;

    // perhaps we chose the wrong char from a related set
    if ((count >= maxSug) || (count <= -1)) return count;
    count = mapchars(wlst, word, count);

    // perhaps we made a typical fault of spelling
    if ((count >= maxSug) || (count <= -1)) return count;
    count = replchars(wlst, word, count);

    // did we forget to add a char
    if ((count >= maxSug) || (count <= -1)) return count;
    count = forgotchar(wlst, word, count);

    // did we swap the order of chars by mistake
    if ((count >= maxSug) || (count <= -1)) return count;
    count = swapchar(wlst, word, count);

    // did we add a char that should not be there
    if ((count >= maxSug) || (count <= -1)) return count;
    count = extrachar(wlst, word, count);

    // did we just hit the wrong key in place of a good char
    if ((count >= maxSug) || (count <= -1)) return count;
    count = badchar(wlst, word, count);

    // perhaps we forgot to hit space and two words ran together
    if (nosplitsugs) return count;
    if ((count >= maxSug) || (count <= -1)) return count;
    return twowords(wlst, word, count);
}
// Suggestions for when the wrong character was chosen out of a related set.
// Does nothing for words shorter than two characters, without an affix
// manager, or when the affix manager has no map table.
// Returns the updated suggestion count (as returned by map_related()).
int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
{
    const int wl = strlen(word);
    if ((wl >= 2) && pAMgr) {
        const int nummap = pAMgr->get_nummap();
        struct mapentry * maptable = pAMgr->get_maptable();
        if (maptable != NULL) {
            return map_related(word, 0, wlst, ns, maptable, nummap);
        }
    }
    return ns;
}
// Recursively replace each character of word, from position i onwards,
// with every member of the map set(s) that contain it, and add each
// resulting word that passes check() and is not yet listed to wlst.
//   word     - current candidate (characters before i are already chosen)
//   i        - index of the character being examined
//   wlst     - suggestion list being filled
//   ns       - number of suggestions currently in wlst
//   maptable - table of related-character sets
//   nummap   - number of entries in maptable
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap)
{
    // never continue from an error state: ns is used as an index into wlst
    if (ns < 0) return ns;

    char c = *(word + i);
    if (c == 0) {
        // whole word processed: add it when it is new and spelled correctly
        int cwrd = 1;
        for (int m=0; m < ns; m++)
            if (strcmp(word,wlst[m]) == 0) cwrd = 0;
        if ((cwrd) && check(word,strlen(word))) {
            if (ns < maxSug) {
                wlst[ns] = mystrdup(word);
                if (wlst[ns] == NULL) return -1;
                ns++;
            }
        }
        return ns;
    }

    int in_map = 0;
    for (int j = 0; j < nummap; j++) {
        if (strchr(maptable[j].set,c) != 0) {
            in_map = 1;
            char * newword = strdup(word);
            if (newword == NULL) return -1;
            for (int k = 0; k < maptable[j].len; k++) {
                *(newword + i) = *(maptable[j].set + k);
                ns = map_related(newword, (i+1), wlst, ns, maptable, nummap);
                // stop at once on failure instead of recursing with ns == -1
                if (ns < 0) break;
            }
            free(newword);
            if (ns < 0) return -1;
        }
    }

    // character belongs to no set: keep it and move on to the next one
    if (!in_map) {
        i++;
        ns = map_related(word, i, wlst, ns, maptable, nummap);
    }
    return ns;
}
// Suggestions for a typical fault of spelling that differs from the right
// form by more than one letter: every occurrence of each replacement-table
// pattern in word is swapped for its replacement, and the result is added
// to wlst when it is new and passes check().
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::replchars(char** wlst, const char * word, int ns)
{
    char candidate[MAXSWL];
    int wl = strlen(word);
    if (wl < 2 || ! pAMgr) return ns;
    int numrep = pAMgr->get_numrep();
    struct replentry* reptable = pAMgr->get_reptable();
    if (reptable==NULL) return ns;

    for (int i=0; i < numrep; i++ ) {
        const char * r = word;
        const int lenr = strlen(reptable[i].replacement);
        const int lenp = strlen(reptable[i].pattern);
        // an empty pattern matches everywhere, even at the terminator, and
        // would walk r past the end of the word
        if (lenp == 0) continue;

        // search every occurence of the pattern in the word
        while ((r=strstr(r, reptable[i].pattern)) != NULL) {
            const int prefix = (int)(r - word);
            const int tail = strlen(r + lenp);
            // check the size before anything is written to candidate
            if (prefix + lenr + tail >= MAXSWL) break;
            memcpy(candidate, word, prefix);
            memcpy(candidate + prefix, reptable[i].replacement, lenr);
            memcpy(candidate + prefix + lenr, r + lenp, tail + 1);

            int cwrd = 1;
            for (int k=0; k < ns; k++)
                if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
            if ((cwrd) && check(candidate,strlen(candidate))) {
                if (ns < maxSug) {
                    wlst[ns] = mystrdup(candidate);
                    if (wlst[ns] == NULL) return -1;
                    ns++;
                } else return ns;
            }
            r++; // search for the next letter
        }
    }
    return ns;
}
// Error is a wrong char in place of the correct one: every character of
// word is replaced in turn by every try character, and each result that is
// new and passes check() is added to wlst.
// Words that do not fit the MAXSWL candidate buffer are left alone.
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::badchar(char ** wlst, const char * word, int ns)
{
    char candidate[MAXSWL];
    int wl = strlen(word);
    // the copy below needs wl + 1 bytes
    if (wl >= MAXSWL) return ns;
    strcpy (candidate, word);

    // swap out each char one by one and try all the tryme
    // chars in its place to see if that makes a good word
    for (int i=0; i < wl; i++) {
        char tmpc = candidate[i];
        for (int j=0; j < ctryl; j++) {
            if (ctry[j] == tmpc) continue;
            candidate[i] = ctry[j];
            int cwrd = 1;
            for (int k=0; k < ns; k++)
                if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
            if ((cwrd) && check(candidate,wl)) {
                if (ns < maxSug) {
                    wlst[ns] = mystrdup(candidate);
                    if (wlst[ns] == NULL) return -1;
                    ns++;
                } else return ns;
            }
            candidate[i] = tmpc;
        }
    }
    return ns;
}
// Error is that the word has an extra letter it does not need: each
// character of word is omitted in turn, and each result that is new and
// passes check() is added to wlst.
// Words shorter than two characters, or too long for the MAXSWL candidate
// buffer, are left alone.
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
{
    char candidate[MAXSWL];
    int wl = strlen(word);
    if (wl < 2) return ns;
    // keep every write below inside candidate
    if (wl >= MAXSWL) return ns;

    // try omitting one char of word at a time: candidate starts as word
    // without its first char, and each step restores one char from the left
    strcpy (candidate, word + 1);
    const char * p = word;
    char * r = candidate;
    while (*p != 0) {
        int cwrd = 1;
        for (int k=0; k < ns; k++)
            if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
        if ((cwrd) && check(candidate,wl-1)) {
            if (ns < maxSug) {
                wlst[ns] = mystrdup(candidate);
                if (wlst[ns] == NULL) return -1;
                ns++;
            } else return ns;
        }
        *r++ = *p++;
    }
    return ns;
}
// Error is a missing letter: every try character is inserted before each
// letter of word and finally after the last one, and each result that is
// new and passes check() is added to wlst.
// Words that would not fit the MAXSWL candidate buffer with one extra
// character are left alone.
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
{
    char candidate[MAXSWL];
    int wl = strlen(word);
    // candidate holds the word plus one inserted char plus the terminator
    if (wl + 2 > MAXSWL) return ns;

    // leave one free slot in front of the word; the slot then moves to the
    // right by one letter per outer iteration
    strcpy(candidate + 1, word);
    const char * p = word;
    char * q = candidate;
    for (;;) {
        // try every tryme character in the free slot
        for (int i = 0; i < ctryl; i++) {
            *q = ctry[i];
            int cwrd = 1;
            for (int k=0; k < ns; k++)
                if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
            if ((cwrd) && check(candidate,wl+1)) {
                if (ns < maxSug) {
                    wlst[ns] = mystrdup(candidate);
                    if (wlst[ns] == NULL) return -1;
                    ns++;
                } else return ns;
            }
        }
        // the slot after the last letter has just been tried: done
        if (*p == 0) break;
        *q++ = *p++;
    }
    return ns;
}
// Error is that the input should have been two words: word is split after
// every character (except the last), and when both halves pass check() the
// pair, joined by a single space, is added to wlst.
// Words shorter than three characters, or too long to fit the MAXSWL
// candidate buffer together with the separator, are left alone.
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::twowords(char ** wlst, const char * word, int ns)
{
    char candidate[MAXSWL];
    int wl=strlen(word);
    if (wl < 3) return ns;
    // candidate holds the word plus the separator plus the terminator
    if (wl + 2 > MAXSWL) return ns;

    // leave one free slot in front of the word for the moving separator
    strcpy(candidate + 1, word);

    // split the string into two pieces after every char
    // if both pieces are good words make them a suggestion
    for (char * p = candidate + 1;  p[1] != '\0';  p++) {
        // shift one more letter into the first piece and terminate it
        p[-1] = *p;
        *p = '\0';
        if (check(candidate,strlen(candidate))) {
            if (check((p+1),strlen(p+1))) {
                *p = ' ';
                if (ns < maxSug) {
                    wlst[ns] = mystrdup(candidate);
                    if (wlst[ns] == NULL) return -1;
                    ns++;
                } else return ns;
            }
        }
    }
    return ns;
}
// Error is that adjacent letters were swapped: every pair of neighbouring
// characters of word is exchanged in turn, and each result that is new and
// passes check() is added to wlst.
// Words with fewer than two characters have nothing to swap; words too
// long for the MAXSWL candidate buffer are left alone.
// Returns the updated suggestion count, or -1 when out of memory.
int SuggestMgr::swapchar(char ** wlst, const char * word, int ns)
{
    char candidate[MAXSWL];
    int wl = strlen(word);
    // wl < 2 also keeps the loop test from reading past an empty string
    if (wl < 2) return ns;
    // the copy below needs wl + 1 bytes
    if (wl >= MAXSWL) return ns;

    // try swapping adjacent chars one by one
    strcpy(candidate, word);
    for (char * p = candidate;  p[1] != 0;  p++) {
        char tmpc = *p;
        *p = p[1];
        p[1] = tmpc;
        int cwrd = 1;
        for (int k=0; k < ns; k++)
            if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
        if ((cwrd) && check(candidate,wl)) {
            if (ns < maxSug) {
                wlst[ns] = mystrdup(candidate);
                if (wlst[ns] == NULL) return -1;
                ns++;
            } else return ns;
        }
        // undo the swap before moving to the next pair
        tmpc = *p;
        *p = p[1];
        p[1] = tmpc;
    }
    return ns;
}
// Generate a set of suggestions for very poorly spelled words using
// n-gram similarity.
//   wlst  - array that receives the suggestions (malloc'ed strings);
//           assumed to have room for maxSug entries, like the lists used
//           by the other generators - TODO confirm against the caller
//   word  - the misspelled word (ngram() modifies it temporarily and
//           restores it)
//   pHMgr - hash manager whose root words are scanned
// Returns the number of suggestions stored in wlst; 0 when there are none,
// when a manager is missing, or when memory ran out.
int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
{
    int i, j;
    int lval;
    int sc;
    int lp;

    // both the root word table and the affix expander are required
    if (! pHMgr || ! pAMgr) return 0;

    // exhaustively search through all root words
    // keeping track of the MAX_ROOTS most similar root words
    struct hentry * roots[MAX_ROOTS];
    int scores[MAX_ROOTS];
    for (i = 0; i < MAX_ROOTS; i++) {
        roots[i] = NULL;
        scores[i] = -100 * i;
    }
    lp = MAX_ROOTS - 1;   // index of the lowest score currently kept

    int n = strlen(word);

    struct hentry* hp = NULL;
    int col = -1;
    while ((hp = pHMgr->walk_hashtable(col, hp))) {
        sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
        if (sc > scores[lp]) {
            scores[lp] = sc;
            roots[lp] = hp;
            // find the new lowest score
            lval = sc;
            for (j=0; j < MAX_ROOTS; j++)
                if (scores[j] < lval) {
                    lp = j;
                    lval = scores[j];
                }
        }
    }

    // find minimum threshhold for a passable suggestion
    // mangle original word three different ways
    // and score them to generate a minimum acceptable score
    int thresh = 0;
    for (int sp = 1; sp < 4; sp++) {
        char * mw = strdup(word);
        if (mw == NULL) return 0;   // out of memory; nothing is owned yet
        for (int k=sp; k < n; k+=4) *(mw + k) = '*';
        thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
        free(mw);
    }
    thresh = thresh / 3;
    thresh--;

    // now expand affixes on each of these root words and
    // and use length adjusted ngram scores to select
    // possible suggestions
    char * guess[MAX_GUESS];
    int gscore[MAX_GUESS];
    for(i=0;i<MAX_GUESS;i++) {
        guess[i] = NULL;
        gscore[i] = -100 * i;
    }
    lp = MAX_GUESS - 1;   // index of the lowest guess score currently kept

    struct guessword * glst;
    glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
    if (! glst) return 0;

    for (i = 0; i < MAX_ROOTS; i++) {
        if (!roots[i]) continue;
        struct hentry * rp = roots[i];
        int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
                                        rp->astr, rp->alen);
        for (int k = 0; k < nw; k++) {
            sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
            if ((sc > thresh) && (sc > gscore[lp])) {
                // good enough: take ownership and evict the weakest guess
                if (guess[lp]) free (guess[lp]);
                gscore[lp] = sc;
                guess[lp] = glst[k].word;
                lval = sc;
                for (j=0; j < MAX_GUESS; j++)
                    if (gscore[j] < lval) {
                        lp = j;
                        lval = gscore[j];
                    }
            } else {
                // rejected for either reason: the expanded word is not kept
                // anywhere, so it must be released here
                free (glst[k].word);
            }
        }
    }
    free(glst);

    // now we are done generating guesses
    // sort in order of decreasing score and copy over
    bubblesort(&guess[0], &gscore[0], MAX_GUESS);
    int ns = 0;
    for (i=0; i < MAX_GUESS; i++) {
        if (guess[i]) {
            // drop a guess that is repeated further down the list
            int unique = 1;
            for (j=i+1; j < MAX_GUESS; j++)
                if (guess[j])
                    if (!strcmp(guess[i], guess[j])) unique = 0;
            // respect the list capacity used by every other generator
            if (unique && (ns < maxSug)) {
                wlst[ns++] = guess[i];
            } else {
                free(guess[i]);
            }
        }
    }
    return ns;
}
// See if a candidate suggestion is spelled correctly.
// Both root words (lookup) and words with affixes (affix_check) are tried
// through the affix manager.
// Returns 1 when the candidate is accepted, 0 otherwise (including when
// there is no affix manager).
int SuggestMgr::check(const char * word, int len)
{
    if (!pAMgr) return 0;

    struct hentry * found = pAMgr->lookup(word);
    if (found == NULL) found = pAMgr->affix_check(word,len);

    return (found != NULL) ? 1 : 0;
}
// Generate an n-gram score comparing s1 and s2.
// For gram sizes 1..n, counts how many substrings of s1 of that size occur
// in s2; counting stops after the first size with fewer than two hits.
// A length penalty selected by uselen is subtracted:
//   NGRAM_LONGER_WORSE - (len(s2) - len(s1)) - 2, when positive
//   NGRAM_ANY_MISMATCH - |len(s2) - len(s1)| - 2, when positive
//   anything else      - no penalty
// s1 is modified temporarily (a terminator is planted after each gram)
// and restored before returning.
int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
{
    const int len1 = strlen(s1);
    const int len2 = strlen(s2);

    int total = 0;
    for (int size = 1; size <= n; size++) {
        int hits = 0;
        for (int start = 0; start + size <= len1; start++) {
            // cut s1 right after the current gram, search, then restore
            char * stop = s1 + start + size;
            const char saved = *stop;
            *stop = '\0';
            if (strstr(s2, (s1 + start)) != NULL) hits++;
            *stop = saved;
        }
        total += hits;
        if (hits < 2) break;
    }

    int penalty = 0;
    if (uselen == NGRAM_LONGER_WORSE) penalty = (len2 - len1) - 2;
    if (uselen == NGRAM_ANY_MISMATCH) penalty = abs(len2 - len1) - 2;
    if (penalty < 0) penalty = 0;

    return total - penalty;
}
// Sort the n entries of rsc in decreasing order of score, moving the
// matching entries of rword along with them.  Entries with equal scores
// keep their relative order.
void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
{
    for (int m = 1; m < n; m++) {
        // sink entry m towards the front while it beats its predecessor
        for (int j = m; (j > 0) && (rsc[j-1] < rsc[j]); j--) {
            const int sctmp = rsc[j-1];
            rsc[j-1] = rsc[j];
            rsc[j] = sctmp;

            char * wdtmp = rword[j-1];
            rword[j-1] = rword[j];
            rword[j] = wdtmp;
        }
    }
}

View File

@ -0,0 +1,48 @@
#ifndef _SUGGESTMGR_HXX_
#define _SUGGESTMGR_HXX_
// size in bytes of the candidate buffers used by the suggestion generators
#define MAXSWL 100
// number of best-scoring root words kept by ngsuggest()
#define MAX_ROOTS 10
// capacity of the guessword list that ngsuggest() passes to expand_rootword()
#define MAX_WORDS 500
// number of best-scoring guesses kept by ngsuggest()
#define MAX_GUESS 10
// values for the uselen argument of ngram():
// no length penalty is applied for this value
#define NGRAM_IGNORE_LENGTH 0
// penalize only when s2 is longer than s1 by more than 2
#define NGRAM_LONGER_WORSE 1
// penalize any length difference between s1 and s2 greater than 2
#define NGRAM_ANY_MISMATCH 2
#include "atypes.hxx"
#include "affixmgr.hxx"
#include "hashmgr.hxx"
// Generates spelling suggestions: simple edits validated through the affix
// manager, plus an n-gram based fallback over the root word table.
class SuggestMgr
{
// private copy of the characters to try when building candidates
char * ctry;
// length of ctry (0 when ctry is NULL)
int ctryl;
// affix manager used to validate candidates; not owned by this class
AffixMgr* pAMgr;
// maximum number of entries the generators put into a suggestion list
int maxSug;
// when true, suggest() skips the twowords() generator
bool nosplitsugs;
public:
SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr);
~SuggestMgr();
// run the edit-based generators; returns the new count or -1 when out of memory
int suggest(char** wlst, int ns, const char * word);
// returns 1 when the word is accepted by the affix manager, else 0
int check(const char *, int);
// n-gram based suggestions; returns the number of entries stored in wlst
int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr);
private:
// each generator takes (wlst, word, ns) and returns the updated count,
// or -1 when out of memory
int replchars(char**, const char *, int);
int mapchars(char**, const char *, int);
int map_related(const char *, int, char ** wlst, int, const mapentry*, int);
int forgotchar(char **, const char *, int);
int swapchar(char **, const char *, int);
int extrachar(char **, const char *, int);
int badchar(char **, const char *, int);
int twowords(char **, const char *, int);
// similarity score of s1 against s2; s1 is modified temporarily and restored
int ngram(int n, char * s1, const char * s2, int uselen);
// sort both arrays in decreasing order of score
void bubblesort( char ** rwd, int * rsc, int n);
};
#endif