This repository has been archived on 2024-04-08. You can view files and clone it, but cannot push or open issues or pull requests.
deb-goldedplus/goldlib/myspell/affixmgr.cxx

1234 lines
31 KiB
C++
Raw Normal View History

#include "license.rea"
2006-03-11 16:21:45 +00:00
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include "affixmgr.hxx"
#include "affentry.hxx"
#if !defined(_MSC_VER)
using namespace std;
#endif
// First some base level utility routines
extern void mychomp(char * s);
extern char * mystrdup(const char * s);
extern char * myrevstrdup(const char * s);
extern char * mystrsep(char ** sptr, const char delim);
extern int isSubset(const char * s1, const char * s2);
2006-03-11 16:21:45 +00:00
// Construct an affix manager that uses the hash manager `ptr` for root word
// lookups and fills itself from the affix file `affpath`.
// A load failure is reported on stderr; construction still completes.
AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
{
    // remember the hash manager
    pHMgr = ptr;

    // nothing has been read from the aff file yet
    trystring = NULL;
    encoding = NULL;
    compound = NULL;
    reptable = NULL;
    maptable = NULL;
    numrep = 0;
    nummap = 0;
    nosplitsugs = false;
    cpdmin = 3;  // default value

    // all key-indexed and flag-indexed affix lists start out empty
    for (int slot = 0; slot < SETSIZE; ++slot) {
        pStart[slot] = NULL;
        sStart[slot] = NULL;
        pFlag[slot] = NULL;
        sFlag[slot] = NULL;
    }

    // load the affix data
    if (parse_file(affpath) != 0) {
        fprintf(stderr,"Failure loading aff file %s\n",affpath);
        fflush(stderr);
    }
}
// Release everything the manager owns: the prefix and suffix entry objects,
// the TRY / SET / COMPOUNDFLAG strings and the REP and MAP tables.
AffixMgr::~AffixMgr()
{
    // delete every key-ordered prefix chain; the flag index refers to the
    // same entry objects, so it is only cleared
    for (int slot = 0; slot < SETSIZE; ++slot) {
        pFlag[slot] = NULL;
        PfxEntry * entry = (PfxEntry *)pStart[slot];
        while (entry != NULL) {
            PfxEntry * following = entry->getNext();
            delete entry;
            entry = following;
        }
    }

    // same treatment for the suffix chains
    for (int slot = 0; slot < SETSIZE; ++slot) {
        sFlag[slot] = NULL;
        SfxEntry * entry = (SfxEntry *)sStart[slot];
        while (entry != NULL) {
            SfxEntry * following = entry->getNext();
            delete entry;
            entry = following;
        }
    }

    if (trystring != NULL) free(trystring);
    trystring = NULL;

    if (encoding != NULL) free(encoding);
    encoding = NULL;

    // character map table: free each set string, then the table itself
    if (maptable != NULL) {
        for (int m = 0; m < nummap; ++m) {
            free(maptable[m].set);
            maptable[m].set = NULL;
            maptable[m].len = 0;
        }
        free(maptable);
        maptable = NULL;
    }
    nummap = 0;

    // replacement table: free both strings of each entry, then the table
    if (reptable != NULL) {
        for (int r = 0; r < numrep; ++r) {
            free(reptable[r].pattern);
            free(reptable[r].replacement);
            reptable[r].pattern = NULL;
            reptable[r].replacement = NULL;
        }
        free(reptable);
        reptable = NULL;
    }
    numrep = 0;

    if (compound != NULL) free(compound);
    compound = NULL;

    // the hash manager is not owned here; just drop the reference
    pHMgr = NULL;
    cpdmin = 0;
}
// read in aff file and build up prefix and suffix entry objects
2006-03-11 16:21:45 +00:00
int AffixMgr::parse_file(const char * affpath)
{
// io buffers
char line[MAXLNLEN+1];
2006-03-11 16:21:45 +00:00
// affix type
char ft;
// open the affix file
FILE * afflst;
afflst = fopen(affpath,"r");
if (!afflst) {
fprintf(stderr,"Error - could not open affix description file %s\n",affpath);
return 1;
}
// step one is to parse the affix file building up the internal
// affix data structures
// read in each line ignoring any that do not
// start with a known line type indicator
while (fgets(line,MAXLNLEN,afflst)) {
mychomp(line);
/* parse in the try string */
if (strncmp(line,"TRY",3) == 0) {
if (parse_try(line)) {
return 1;
}
}
/* parse in the name of the character set used by the .dict and .aff */
if (strncmp(line,"SET",3) == 0) {
if (parse_set(line)) {
return 1;
}
}
/* parse in the flag used by the controlled compound words */
if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
if (parse_cpdflag(line)) {
return 1;
}
}
/* parse in the flag used by the controlled compound words */
if (strncmp(line,"COMPOUNDMIN",11) == 0) {
if (parse_cpdmin(line)) {
return 1;
}
}
/* parse in the typical fault correcting table */
if (strncmp(line,"REP",3) == 0) {
if (parse_reptable(line, afflst)) {
return 1;
}
}
/* parse in the related character map table */
if (strncmp(line,"MAP",3) == 0) {
if (parse_maptable(line, afflst)) {
return 1;
}
}
// parse this affix: P - prefix, S - suffix
ft = ' ';
if (strncmp(line,"PFX",3) == 0) ft = 'P';
if (strncmp(line,"SFX",3) == 0) ft = 'S';
if (ft != ' ') {
if (parse_affix(line, ft, afflst)) {
return 1;
}
}
// handle NOSPLITSUGS
if (strncmp(line,"NOSPLITSUGS",11) == 0)
nosplitsugs=(0==0);
}
fclose(afflst);
// now we can speed up performance greatly taking advantage of the
2006-03-11 16:21:45 +00:00
// relationship between the affixes and the idea of "subsets".
// View each prefix as a potential leading subset of another and view
// each suffix (reversed) as a potential trailing subset of another.
// To illustrate this relationship if we know the prefix "ab" is found in the
// word to examine, only prefixes that "ab" is a leading subset of need be examined.
// Furthermore is "ab" is not present then none of the prefixes that "ab" is
// is a subset need be examined.
// The same argument goes for suffix string that are reversed.
// Then to top this off why not examine the first char of the word to quickly
// limit the set of prefixes to examine (i.e. the prefixes to examine must
2006-03-11 16:21:45 +00:00
// be leading supersets of the first character of the word (if they exist)
2006-03-11 16:21:45 +00:00
// To take advantage of this "subset" relationship, we need to add two links
// from entry. One to take next if the current prefix is found (call it nexteq)
// and one to take next if the current prefix is not found (call it nextne).
// Since we have built ordered lists, all that remains is to properly intialize
2006-03-11 16:21:45 +00:00
// the nextne and nexteq pointers that relate them
process_pfx_order();
process_sfx_order();
return 0;
}
// we want to be able to quickly access prefix information
// both by prefix flag, and sorted by prefix string itself
2006-03-11 16:21:45 +00:00
// so we need to set up two indexes
int AffixMgr::build_pfxlist(AffEntry* pfxptr)
{
PfxEntry * ptr;
PfxEntry * pptr;
PfxEntry * ep = (PfxEntry*) pfxptr;
// get the right starting points
const char * key = ep->getKey();
const unsigned char flg = ep->getFlag();
// first index by flag which must exist
ptr = (PfxEntry*)pFlag[flg];
ep->setFlgNxt(ptr);
pFlag[flg] = (AffEntry *) ep;
// next index by affix string
// handle the special case of null affix string
if (strlen(key) == 0) {
// always inset them at head of list at element 0
ptr = (PfxEntry*)pStart[0];
ep->setNext(ptr);
pStart[0] = (AffEntry*)ep;
return 0;
}
// now handle the general case
unsigned char sp = *((const unsigned char *)key);
ptr = (PfxEntry*)pStart[sp];
2006-03-11 16:21:45 +00:00
/* handle the insert at top of list case */
if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) {
ep->setNext(ptr);
pStart[sp] = (AffEntry*)ep;
return 0;
}
/* otherwise find where it fits in order and insert it */
pptr = NULL;
for (; ptr != NULL; ptr = ptr->getNext()) {
if (strcmp( ep->getKey() , ptr->getKey() ) <= 0) break;
pptr = ptr;
}
pptr->setNext(ep);
ep->setNext(ptr);
return 0;
}
// Register a suffix entry in both suffix indexes:
//   sFlag[]  - chain of entries sharing the same affix flag
//   sStart[] - chain ordered by the entry's key, selected by the first
//              character of that key (slot 0 holds empty keys)
// Always returns 0.
int AffixMgr::build_sfxlist(AffEntry* sfxptr)
{
    SfxEntry * ep = (SfxEntry *) sfxptr;
    const char * key = ep->getKey();
    const unsigned char flg = ep->getFlag();

    // index by flag: push onto the front of that flag's chain
    ep->setFlgNxt((SfxEntry*)sFlag[flg]);
    sFlag[flg] = (AffEntry *) ep;

    // index by affix string: an empty key always goes to the head of slot 0
    if (*key == '\0') {
        ep->setNext((SfxEntry*)sStart[0]);
        sStart[0] = (AffEntry*)ep;
        return 0;
    }

    // normal case: walk the chain for the first key character until an entry
    // whose key is not smaller than ours is reached, then link in before it
    const unsigned char sp = *((const unsigned char *)key);
    SfxEntry * before = NULL;
    SfxEntry * cur = (SfxEntry*)sStart[sp];
    while ((cur != NULL) && (strcmp(ep->getKey(), cur->getKey()) > 0)) {
        before = cur;
        cur = cur->getNext();
    }

    ep->setNext(cur);
    if (before != NULL) {
        before->setNext(ep);
    } else {
        // nothing smaller precedes us: we are the new head of the chain
        sStart[sp] = (AffEntry*)ep;
    }
    return 0;
}
// initialize the PfxEntry links NextEQ and NextNE to speed searching
int AffixMgr::process_pfx_order()
{
PfxEntry* ptr;
// loop through each prefix list starting point
for (int i=1; i < SETSIZE; i++) {
ptr = (PfxEntry*)pStart[i];
// look through the remainder of the list
// and find next entry with affix that
2006-03-11 16:21:45 +00:00
// the current one is not a subset of
// mark that as destination for NextNE
// use next in list that you are a subset
// of as NextEQ
for (; ptr != NULL; ptr = ptr->getNext()) {
PfxEntry * nptr = ptr->getNext();
for (; nptr != NULL; nptr = nptr->getNext()) {
if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
}
ptr->setNextNE(nptr);
ptr->setNextEQ(NULL);
if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
2006-03-11 16:21:45 +00:00
ptr->setNextEQ(ptr->getNext());
}
// now clean up by adding smart search termination strings:
// if you are already a superset of the previous prefix
// but not a subset of the next, search can end here
// so set NextNE properly
ptr = (PfxEntry *) pStart[i];
for (; ptr != NULL; ptr = ptr->getNext()) {
PfxEntry * nptr = ptr->getNext();
PfxEntry * mptr = NULL;
for (; nptr != NULL; nptr = nptr->getNext()) {
if (! isSubset(ptr->getKey(),nptr->getKey())) break;
mptr = nptr;
}
if (mptr) mptr->setNextNE(NULL);
}
}
return 0;
}
// initialize the SfxEntry links NextEQ and NextNE to speed searching
int AffixMgr::process_sfx_order()
{
SfxEntry* ptr;
// loop through each prefix list starting point
for (int i=1; i < SETSIZE; i++) {
ptr = (SfxEntry *) sStart[i];
// look through the remainder of the list
// and find next entry with affix that
2006-03-11 16:21:45 +00:00
// the current one is not a subset of
// mark that as destination for NextNE
// use next in list that you are a subset
// of as NextEQ
for (; ptr != NULL; ptr = ptr->getNext()) {
SfxEntry * nptr = ptr->getNext();
for (; nptr != NULL; nptr = nptr->getNext()) {
if (! isSubset(ptr->getKey(),nptr->getKey())) break;
}
ptr->setNextNE(nptr);
ptr->setNextEQ(NULL);
if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
2006-03-11 16:21:45 +00:00
ptr->setNextEQ(ptr->getNext());
}
// now clean up by adding smart search termination strings:
// if you are already a superset of the previous suffix
// but not a subset of the next, search can end here
// so set NextNE properly
ptr = (SfxEntry *) sStart[i];
for (; ptr != NULL; ptr = ptr->getNext()) {
SfxEntry * nptr = ptr->getNext();
SfxEntry * mptr = NULL;
for (; nptr != NULL; nptr = nptr->getNext()) {
if (! isSubset(ptr->getKey(),nptr->getKey())) break;
mptr = nptr;
}
if (mptr) mptr->setNextNE(NULL);
}
}
return 0;
}
// takes aff file condition string and creates the
// conds array - please see the appendix at the end of the
// file affentry.cxx which describes what is going on here
// in much more detail
void AffixMgr::encodeit(struct affentry * ptr, char * cs)
{
unsigned char c;
int i, j, k;
unsigned char mbr[MAXLNLEN];
// now clear the conditions array */
for (i=0;i<SETSIZE;i++) ptr->conds[i] = (unsigned char) 0;
// now parse the string to create the conds array */
int nc = strlen(cs);
int neg = 0; // complement indicator
int grp = 0; // group indicator
int n = 0; // number of conditions
int ec = 0; // end condition indicator
int nm = 0; // number of member in group
// if no condition just return
if (strcmp(cs,".")==0) {
ptr->numconds = 0;
return;
}
i = 0;
while (i < nc) {
c = *((unsigned char *)(cs + i));
// start group indicator
if (c == '[') {
grp = 1;
c = 0;
}
// complement flag
if ((grp == 1) && (c == '^')) {
neg = 1;
c = 0;
}
// end goup indicator
if (c == ']') {
ec = 1;
c = 0;
}
// add character of group to list
if ((grp == 1) && (c != 0)) {
*(mbr + nm) = c;
nm++;
c = 0;
}
// end of condition
2006-03-11 16:21:45 +00:00
if (c != 0) {
ec = 1;
}
2006-03-11 16:21:45 +00:00
if (ec) {
if (grp == 1) {
if (neg == 0) {
// set the proper bits in the condition array vals for those chars
for (j=0;j<nm;j++) {
k = (unsigned int) mbr[j];
ptr->conds[k] = ptr->conds[k] | (1 << n);
}
} else {
// complement so set all of them and then unset indicated ones
for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
for (j=0;j<nm;j++) {
k = (unsigned int) mbr[j];
ptr->conds[k] = ptr->conds[k] & ~(1 << n);
}
}
neg = 0;
grp = 0;
2006-03-11 16:21:45 +00:00
nm = 0;
} else {
// not a group so just set the proper bit for this char
// but first handle special case of . inside condition
if (c == '.') {
// wild card character so set them all
for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
} else {
2006-03-11 16:21:45 +00:00
ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n);
}
}
n++;
ec = 0;
}
i++;
}
ptr->numconds = n;
return;
}
// check word for prefixes
struct hentry * AffixMgr::prefix_check (const char * word, int len)
{
struct hentry * rv= NULL;
2006-03-11 16:21:45 +00:00
// first handle the special case of 0 length prefixes
PfxEntry * pe = (PfxEntry *) pStart[0];
while (pe) {
rv = pe->check(word,len);
if (rv) return rv;
pe = pe->getNext();
}
2006-03-11 16:21:45 +00:00
// now handle the general case
unsigned char sp = *((const unsigned char *)word);
PfxEntry * pptr = (PfxEntry *)pStart[sp];
while (pptr) {
if (isSubset(pptr->getKey(),word)) {
rv = pptr->check(word,len);
if (rv) return rv;
pptr = pptr->getNextEQ();
} else {
pptr = pptr->getNextNE();
}
}
2006-03-11 16:21:45 +00:00
return NULL;
}
// check if compound word is correctly spelled
struct hentry * AffixMgr::compound_check (const char * word, int len, char compound_flag)
{
int i;
struct hentry * rv= NULL;
char * st;
char ch;
// handle case of string too short to be a piece of a compound word
2006-03-11 16:21:45 +00:00
if (len < cpdmin) return NULL;
st = mystrdup(word);
2006-03-11 16:21:45 +00:00
for (i=cpdmin; i < (len - (cpdmin-1)); i++) {
ch = st[i];
st[i] = '\0';
rv = lookup(st);
if (!rv) rv = affix_check(st,i);
if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
rv = lookup((word+i));
if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
free(st);
return rv;
}
rv = affix_check((word+i),strlen(word+i));
if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
free(st);
return rv;
}
rv = compound_check((word+i),strlen(word+i),compound_flag);
2006-03-11 16:21:45 +00:00
if (rv) {
free(st);
return rv;
}
2006-03-11 16:21:45 +00:00
}
st[i] = ch;
}
free(st);
return NULL;
}
2006-03-11 16:21:45 +00:00
// check word for suffixes
struct hentry * AffixMgr::suffix_check (const char * word, int len,
2006-03-11 16:21:45 +00:00
int sfxopts, AffEntry * ppfx)
{
struct hentry * rv = NULL;
// first handle the special case of 0 length suffixes
SfxEntry * se = (SfxEntry *) sStart[0];
while (se) {
rv = se->check(word,len, sfxopts, ppfx);
if (rv) return rv;
se = se->getNext();
}
2006-03-11 16:21:45 +00:00
// now handle the general case
char * tmpword = myrevstrdup(word);
unsigned char sp = *((const unsigned char *)tmpword);
SfxEntry * sptr = (SfxEntry *) sStart[sp];
while (sptr) {
if (isSubset(sptr->getKey(),tmpword)) {
rv = sptr->check(word,len, sfxopts, ppfx);
if (rv) {
free(tmpword);
return rv;
}
sptr = sptr->getNextEQ();
} else {
sptr = sptr->getNextNE();
}
}
2006-03-11 16:21:45 +00:00
free(tmpword);
return NULL;
}
// check if word with affixes is correctly spelled
struct hentry * AffixMgr::affix_check (const char * word, int len)
{
struct hentry * rv= NULL;
// check all prefixes (also crossed with suffixes if allowed)
2006-03-11 16:21:45 +00:00
rv = prefix_check(word, len);
if (rv) return rv;
// if still not found check all suffixes
rv = suffix_check(word, len, 0, NULL);
return rv;
}
int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
2006-03-11 16:21:45 +00:00
const char * ts, int wl, const char * ap, int al)
{
int nh=0;
// first add root word to list
if (nh < maxn) {
wlst[nh].word = mystrdup(ts);
wlst[nh].allow = (1 == 0);
nh++;
}
// handle suffixes
for (int i = 0; i < al; i++) {
unsigned char c = (unsigned char) ap[i];
SfxEntry * sptr = (SfxEntry *)sFlag[c];
while (sptr) {
char * newword = sptr->add(ts, wl);
if (newword) {
if (nh < maxn) {
wlst[nh].word = newword;
wlst[nh].allow = sptr->allowCross();
nh++;
} else {
free(newword);
}
}
sptr = (SfxEntry *)sptr ->getFlgNxt();
}
}
int n = nh;
// handle cross products of prefixes and suffixes
for (int j=1;j<n ;j++)
if (wlst[j].allow) {
for (int k = 0; k < al; k++) {
unsigned char c = (unsigned char) ap[k];
PfxEntry * cptr = (PfxEntry *) pFlag[c];
while (cptr) {
if (cptr->allowCross()) {
int l1 = strlen(wlst[j].word);
char * newword = cptr->add(wlst[j].word, l1);
if (newword) {
if (nh < maxn) {
wlst[nh].word = newword;
wlst[nh].allow = cptr->allowCross();
nh++;
} else {
free(newword);
}
}
}
cptr = (PfxEntry *)cptr ->getFlgNxt();
}
}
}
// now handle pure prefixes
for (int m = 0; m < al; m ++) {
unsigned char c = (unsigned char) ap[m];
PfxEntry * ptr = (PfxEntry *) pFlag[c];
while (ptr) {
char * newword = ptr->add(ts, wl);
if (newword) {
if (nh < maxn) {
wlst[nh].word = newword;
wlst[nh].allow = ptr->allowCross();
nh++;
} else {
free(newword);
}
2006-03-11 16:21:45 +00:00
}
ptr = (PfxEntry *)ptr ->getFlgNxt();
}
}
return nh;
}
// Return the number of entries in the replacement (REP) table.
// This is 0 until a REP header has been parsed by parse_reptable.
int AffixMgr::get_numrep()
{
return numrep;
}
// Return the replacement (REP) table, or NULL when none was loaded.
// The table stays owned by the manager.
struct replentry * AffixMgr::get_reptable()
{
    // an unloaded table is already represented by a NULL pointer
    return reptable;
}
// Return the number of entries in the character map (MAP) table.
// This is 0 until a MAP header has been parsed by parse_maptable.
int AffixMgr::get_nummap()
{
return nummap;
}
// Return the character map (MAP) table, or NULL when none was loaded.
// The table stays owned by the manager.
struct mapentry * AffixMgr::get_maptable()
{
    // an unloaded table is already represented by a NULL pointer
    return maptable;
}
// Return a newly allocated copy of the dictionary's text encoding name.
// When the aff file declared none, "ISO8859-1" is stored as the encoding
// first.  The copy comes from mystrdup.
char * AffixMgr::get_encoding()
{
    if (encoding == NULL) encoding = mystrdup("ISO8859-1");
    return mystrdup(encoding);
}
// Return a newly allocated copy (via mystrdup) of the TRY string used for
// suggestions, or NULL when the aff file declared none.
char * AffixMgr::get_try_string()
{
    if (trystring) return mystrdup(trystring);
    return NULL;
}
// Return a newly allocated copy (via mystrdup) of the compound word control
// flag string, or NULL when the aff file declared none.
char * AffixMgr::get_compound()
{
    if (compound) return mystrdup(compound);
    return NULL;
}
// Look up a root word through the registered hash manager.
// Returns NULL when no hash manager is registered.
struct hentry * AffixMgr::lookup(const char * word)
{
    if (pHMgr) return pHMgr->lookup(word);
    return NULL;
}
// Return the NOSPLITSUGS setting: true once parse_file has seen a line
// starting with NOSPLITSUGS in the aff file, false otherwise.
bool AffixMgr::get_nosplitsugs(void)
{
return nosplitsugs;
}
/* Parse a "TRY <chars>" line and store a copy of <chars> in trystring.
   Returns 0 on success, 1 when a TRY string was already set or the line
   carries no value. */
int AffixMgr::parse_try(char * line)
{
    if (trystring) {
        fprintf(stderr,"error: duplicate TRY strings\n");
        return 1;
    }

    int field = 0;   // index of the next non-empty token on the line
    int seen = 0;    // how many of the two required tokens were found
    char * cursor = line;

    for (char * tok = mystrsep(&cursor,' '); tok != NULL; tok = mystrsep(&cursor,' ')) {
        // empty tokens come from consecutive blanks and are skipped
        if (*tok != '\0') {
            if (field == 0) {
                // the keyword itself
                seen++;
            } else if (field == 1) {
                trystring = mystrdup(tok);
                seen++;
            }
            field++;
        }
        free(tok);
    }

    if (seen != 2) {
        fprintf(stderr,"error: missing TRY information\n");
        return 1;
    }
    return 0;
}
/* Parse a "SET <name>" line and store a copy of <name>, the character set
   used by the .dict and .aff files, in encoding.
   Returns 0 on success, 1 when an encoding was already set or the line
   carries no value. */
int AffixMgr::parse_set(char * line)
{
    if (encoding) {
        fprintf(stderr,"error: duplicate SET strings\n");
        return 1;
    }

    int field = 0;   // index of the next non-empty token on the line
    int seen = 0;    // how many of the two required tokens were found
    char * cursor = line;

    for (char * tok = mystrsep(&cursor,' '); tok != NULL; tok = mystrsep(&cursor,' ')) {
        // empty tokens come from consecutive blanks and are skipped
        if (*tok != '\0') {
            if (field == 0) {
                // the keyword itself
                seen++;
            } else if (field == 1) {
                encoding = mystrdup(tok);
                seen++;
            }
            field++;
        }
        free(tok);
    }

    if (seen != 2) {
        fprintf(stderr,"error: missing SET information\n");
        return 1;
    }
    return 0;
}
/* Parse a "COMPOUNDFLAG <flag>" line and store a copy of <flag>, the flag
   used by the controlled compound words, in compound.
   Returns 0 on success, 1 when a compound flag was already set or the line
   carries no value. */
int AffixMgr::parse_cpdflag(char * line)
{
    if (compound) {
        fprintf(stderr,"error: duplicate compound flags used\n");
        return 1;
    }

    int field = 0;   // index of the next non-empty token on the line
    int seen = 0;    // how many of the two required tokens were found
    char * cursor = line;

    for (char * tok = mystrsep(&cursor,' '); tok != NULL; tok = mystrsep(&cursor,' ')) {
        // empty tokens come from consecutive blanks and are skipped
        if (*tok != '\0') {
            if (field == 0) {
                // the keyword itself
                seen++;
            } else if (field == 1) {
                compound = mystrdup(tok);
                seen++;
            }
            field++;
        }
        free(tok);
    }

    if (seen != 2) {
        fprintf(stderr,"error: missing compound flag information\n");
        return 1;
    }
    return 0;
}
/* Parse a "COMPOUNDMIN <n>" line and store <n>, the minimum length of a
   compound word piece, in cpdmin.  A value outside 1..50 is replaced by
   the default of 3.
   Returns 0 on success, 1 when the line carries no value. */
int AffixMgr::parse_cpdmin(char * line)
{
    int field = 0;   // index of the next non-empty token on the line
    int seen = 0;    // how many of the two required tokens were found
    char * cursor = line;

    for (char * tok = mystrsep(&cursor,' '); tok != NULL; tok = mystrsep(&cursor,' ')) {
        // empty tokens come from consecutive blanks and are skipped
        if (*tok != '\0') {
            if (field == 0) {
                // the keyword itself
                seen++;
            } else if (field == 1) {
                cpdmin = atoi(tok);
                seen++;
            }
            field++;
        }
        free(tok);
    }

    if (seen != 2) {
        fprintf(stderr,"error: missing compound min information\n");
        return 1;
    }

    // fall back to the default for out-of-range values
    if ((cpdmin < 1) || (cpdmin > 50)) cpdmin = 3;
    return 0;
}
/* Parse the typical fault correcting (REP) table.

   `line` holds the header "REP <count>" and is reused as the read buffer
   for the <count> following "REP <pattern> <replacement>" lines, which are
   read from `af`.

   Returns 0 on success and 1 on any error (duplicate table, bad count,
   allocation failure, premature end of file, malformed entry).

   On failure the members are left consistent: numrep counts only the
   entries that were read completely and reptable is NULL when there are
   none.  The destructor frees numrep entries, so it must never see an
   entry whose pointers were not initialised. */
int AffixMgr::parse_reptable(char * line, FILE * af)
{
    if (numrep != 0) {
        fprintf(stderr,"error: duplicate REP tables used\n");
        return 1;
    }

    char * tp = line;
    char * piece;
    int i = 0;
    int np = 0;

    /* header line: keyword followed by the number of entries */
    while ((piece=mystrsep(&tp,' '))) {
        if (*piece != '\0') {
            switch(i) {
                case 0: { np++; break; }
                case 1: {
                    numrep = atoi(piece);
                    if (numrep < 1) {
                        fprintf(stderr,"incorrect number of entries in replacement table\n");
                        numrep = 0;
                        free(piece);
                        return 1;
                    }
                    reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
                    if (!reptable) {
                        fprintf(stderr,"error: out of memory for replacement table\n");
                        numrep = 0;
                        free(piece);
                        return 1;
                    }
                    /* make every entry safe to free before any is filled in */
                    for (int k = 0; k < numrep; k++) {
                        reptable[k].pattern = NULL;
                        reptable[k].replacement = NULL;
                    }
                    np++;
                    break;
                }
                default: break;
            }
            i++;
        }
        free(piece);
    }
    if (np != 2) {
        fprintf(stderr,"error: missing replacement table information\n");
        return 1;
    }

    /* now parse the numrep lines to read in the remainder of the table */
    for (int j = 0; j < numrep; j++) {
        /* running out of input before the table is complete is an error */
        int ok = (fgets(line,MAXLNLEN,af) != NULL);

        if (ok) {
            mychomp(line);
            tp = line;
            i = 0;
            while ((piece=mystrsep(&tp,' '))) {
                if (*piece != '\0') {
                    switch(i) {
                        case 0: {
                            /* every table line must start with the keyword */
                            if (strncmp(piece,"REP",3) != 0) ok = 0;
                            break;
                        }
                        case 1: { reptable[j].pattern = mystrdup(piece); break; }
                        case 2: { reptable[j].replacement = mystrdup(piece); break; }
                        default: break;
                    }
                    i++;
                }
                free(piece);
                if (!ok) break;
            }
        }

        if ((!ok) || (!(reptable[j].pattern)) || (!(reptable[j].replacement))) {
            fprintf(stderr,"error: replacement table is corrupt\n");
            /* drop the partial entry and keep only the complete ones */
            free(reptable[j].pattern);
            free(reptable[j].replacement);
            reptable[j].pattern = NULL;
            reptable[j].replacement = NULL;
            numrep = j;
            if (numrep == 0) {
                free(reptable);
                reptable = NULL;
            }
            return 1;
        }
    }
    return 0;
}
/* Parse the related character map (MAP) table.

   `line` holds the header "MAP <count>" and is reused as the read buffer
   for the <count> following "MAP <chars>" lines, which are read from `af`.

   Returns 0 on success and 1 on any error (duplicate table, bad count,
   allocation failure, premature end of file, malformed entry).

   On failure the members are left consistent: nummap counts only the
   entries that were read completely and maptable is NULL when there are
   none.  The destructor frees nummap entries, so it must never see an
   entry whose pointer was not initialised. */
int AffixMgr::parse_maptable(char * line, FILE * af)
{
    if (nummap != 0) {
        fprintf(stderr,"error: duplicate MAP tables used\n");
        return 1;
    }

    char * tp = line;
    char * piece;
    int i = 0;
    int np = 0;

    /* header line: keyword followed by the number of entries */
    while ((piece=mystrsep(&tp,' '))) {
        if (*piece != '\0') {
            switch(i) {
                case 0: { np++; break; }
                case 1: {
                    nummap = atoi(piece);
                    if (nummap < 1) {
                        fprintf(stderr,"incorrect number of entries in map table\n");
                        nummap = 0;
                        free(piece);
                        return 1;
                    }
                    maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
                    if (!maptable) {
                        fprintf(stderr,"error: out of memory for map table\n");
                        nummap = 0;
                        free(piece);
                        return 1;
                    }
                    /* make every entry safe to free before any is filled in */
                    for (int k = 0; k < nummap; k++) {
                        maptable[k].set = NULL;
                        maptable[k].len = 0;
                    }
                    np++;
                    break;
                }
                default: break;
            }
            i++;
        }
        free(piece);
    }
    if (np != 2) {
        fprintf(stderr,"error: missing map table information\n");
        return 1;
    }

    /* now parse the nummap lines to read in the remainder of the table */
    for (int j = 0; j < nummap; j++) {
        /* running out of input before the table is complete is an error */
        int ok = (fgets(line,MAXLNLEN,af) != NULL);

        if (ok) {
            mychomp(line);
            tp = line;
            i = 0;
            while ((piece=mystrsep(&tp,' '))) {
                if (*piece != '\0') {
                    switch(i) {
                        case 0: {
                            /* every table line must start with the keyword */
                            if (strncmp(piece,"MAP",3) != 0) ok = 0;
                            break;
                        }
                        case 1: {
                            maptable[j].set = mystrdup(piece);
                            maptable[j].len = strlen(maptable[j].set);
                            break;
                        }
                        default: break;
                    }
                    i++;
                }
                free(piece);
                if (!ok) break;
            }
        }

        if ((!ok) || (!(maptable[j].set)) || (!(maptable[j].len))) {
            fprintf(stderr,"error: map table is corrupt\n");
            /* drop the partial entry and keep only the complete ones */
            free(maptable[j].set);
            maptable[j].set = NULL;
            maptable[j].len = 0;
            nummap = j;
            if (nummap == 0) {
                free(maptable);
                maptable = NULL;
            }
            return 1;
        }
    }
    return 0;
}
int AffixMgr::parse_affix(char * line, const char at, FILE * af)
{
int numents = 0; // number of affentry structures to parse
char achar='\0'; // affix char identifier
short ff=0;
struct affentry * ptr= NULL;
struct affentry * nptr= NULL;
char * tp = line;
char * nl = line;
char * piece;
int i = 0;
// split affix header line into pieces
int np = 0;
while ((piece=mystrsep(&tp,' '))) {
if (*piece != '\0') {
switch(i) {
// piece 1 - is type of affix
case 0: { np++; break; }
2006-03-11 16:21:45 +00:00
// piece 2 - is affix char
case 1: { np++; achar = *piece; break; }
// piece 3 - is cross product indicator
2006-03-11 16:21:45 +00:00
case 2: { np++; if (*piece == 'Y') ff = XPRODUCT; break; }
// piece 4 - is number of affentries
case 3: {
2006-03-11 16:21:45 +00:00
np++;
numents = atoi(piece);
2006-03-11 16:21:45 +00:00
ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
ptr->xpflg = ff;
ptr->achar = achar;
break;
}
default: break;
}
i++;
}
free(piece);
}
// check to make sure we parsed enough pieces
if (np != 4) {
fprintf(stderr, "error: affix %c header has insufficient data in line %s\n",achar,nl);
free(ptr);
return 1;
}
2006-03-11 16:21:45 +00:00
// store away ptr to first affentry
nptr = ptr;
// now parse numents affentries for this affix
for (int j=0; j < numents; j++) {
fgets(nl,MAXLNLEN,af);
mychomp(nl);
tp = nl;
i = 0;
np = 0;
// split line into pieces
while ((piece=mystrsep(&tp,' '))) {
if (*piece != '\0') {
switch(i) {
// piece 1 - is type
case 0: {
2006-03-11 16:21:45 +00:00
np++;
if (nptr != ptr) nptr->xpflg = ptr->xpflg;
break;
}
// piece 2 - is affix char
case 1: {
2006-03-11 16:21:45 +00:00
np++;
if (*piece != achar) {
fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
fprintf(stderr, "error: possible incorrect count\n");
free(piece);
return 1;
}
if (nptr != ptr) nptr->achar = ptr->achar;
break;
}
// piece 3 - is string to strip or 0 for null
case 2: {
2006-03-11 16:21:45 +00:00
np++;
nptr->strip = mystrdup(piece);
nptr->stripl = strlen(nptr->strip);
if (strcmp(nptr->strip,"0") == 0) {
free(nptr->strip);
nptr->strip=mystrdup("");
nptr->stripl = 0;
}
break;
2006-03-11 16:21:45 +00:00
}
// piece 4 - is affix string or 0 for null
case 3: {
2006-03-11 16:21:45 +00:00
np++;
nptr->appnd = mystrdup(piece);
nptr->appndl = strlen(nptr->appnd);
if (strcmp(nptr->appnd,"0") == 0) {
free(nptr->appnd);
nptr->appnd=mystrdup("");
nptr->appndl = 0;
}
break;
2006-03-11 16:21:45 +00:00
}
// piece 5 - is the conditions descriptions
case 4: { np++; encodeit(nptr,piece); }
default: break;
}
i++;
}
free(piece);
}
// check to make sure we parsed enough pieces
if (np != 5) {
fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
free(ptr);
return 1;
}
nptr++;
}
2006-03-11 16:21:45 +00:00
// now create SfxEntry or PfxEntry objects and use links to
// build an ordered (sorted by affix string) list
nptr = ptr;
for (int k = 0; k < numents; k++) {
if (at == 'P') {
PfxEntry * pfxptr = new PfxEntry(this,nptr);
build_pfxlist((AffEntry *)pfxptr);
} else {
SfxEntry * sfxptr = new SfxEntry(this,nptr);
build_sfxlist((AffEntry *)sfxptr);
2006-03-11 16:21:45 +00:00
}
nptr++;
}
2006-03-11 16:21:45 +00:00
free(ptr);
return 0;
}