/*
 * Load a munched word list from wdlst into the global hash table.
 *
 * The first line of the stream holds the table size; it is padded by 5 and
 * forced to an odd value before tableptr is allocated.  Every following
 * line is stripped with mychomp(), duplicated and handed to add_word().
 *
 * Returns 0 on success, 2 when the size line cannot be read, and 3 when the
 * size is negative or an allocation fails.
 */
int load_tables(FILE * wdlst)
{
    char * ap;
    char ts[MAX_LN_LEN];

    /* first read the first line of file to get hash table size */
    if (! fgets(ts, MAX_LN_LEN-1, wdlst))
        return 2;
    mychomp(ts);
    tablesize = atoi(ts);

    /* a negative count is malformed input: reject it explicitly instead of
     * depending on calloc() to fail on the wrapped-around size */
    if (tablesize < 0)
        return 3;

    /* NOTE(review): a count within 6 of INT_MAX still overflows below;
     * guarding that needs <limits.h>, which is not visible from here */
    tablesize = tablesize + 5;
    if ((tablesize % 2) == 0)
        tablesize++;

    /* allocate the hash table */
    tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
    if (! tableptr)
        return 3;

    /* loop through all words on the munch list and add each one to the
     * hash table */
    while (fgets(ts, MAX_LN_LEN-1, wdlst)) {
        mychomp(ts);
        ap = mystrdup(ts);
        /* presumably mystrdup() yields NULL when out of memory; never pass
         * that on to add_word() */
        if (! ap)
            return 3;
        add_word(ap);
    }
    return 0;
}
int main(int argc, char** argv) { int i; int al, wl; FILE * wrdlst; FILE * afflst; char *wf, *af; char * ap; char ts[MAX_LN_LEN]; (void)argc; /* first parse the command line options */ /* arg1 - munched wordlist, arg2 - affix file */ if (argv[1]) { wf = mystrdup(argv[1]); } else { fprintf(stderr,"correct syntax is:\n"); fprintf(stderr,"unmunch dic_file affix_file\n"); exit(1); } if (argv[2]) { af = mystrdup(argv[2]); } else { fprintf(stderr,"correct syntax is:\n"); fprintf(stderr,"unmunch dic_file affix_file\n"); exit(1); } /* open the affix file */ afflst = fopen(af,"r"); if (!afflst) { fprintf(stderr,"Error - could not open affix description file\n"); exit(1); } /* step one is to parse the affix file building up the internal affix data structures */ numpfx = 0; numsfx = 0; fullstrip = 0; if (parse_aff_file(afflst)) { fprintf(stderr,"Error - in affix file loading\n"); exit(1); } fclose(afflst); fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx); /* affix file is now parsed so create hash table of wordlist on the fly */ /* open the wordlist */ wrdlst = fopen(wf,"r"); if (!wrdlst) { fprintf(stderr,"Error - could not open word list file\n"); exit(1); } /* skip over the hash table size */ if (! fgets(ts, MAX_LN_LEN-1,wrdlst)) { fclose(wrdlst); return 2; } mychomp(ts); while (fgets(ts,MAX_LN_LEN-1,wrdlst)) { mychomp(ts); /* split each line into word and affix char strings */ ap = strchr(ts,'/'); if (ap) { *ap = '\0'; ap++; al = strlen(ap); } else { al = 0; ap = NULL; } wl = strlen(ts); numwords = 0; wlist[numwords].word = mystrdup(ts); wlist[numwords].pallow = 0; numwords++; if (al) expand_rootword(ts,wl,ap); for (i=0; i < numwords; i++) { fprintf(stdout,"%s\n",wlist[i].word); free(wlist[i].word); wlist[i].word = NULL; wlist[i].pallow = 0; } } fclose(wrdlst); return 0; }
int parse_aff_file(FILE * afflst) { int i, j; int numents=0; char achar='\0'; short ff=0; char ft; struct affent * ptr= NULL; struct affent * nptr= NULL; char * line = malloc(MAX_LN_LEN); while (fgets(line,MAX_LN_LEN,afflst)) { mychomp(line); ft = ' '; fprintf(stderr,"parsing line: %s\n",line); if (strncmp(line,"FULLSTRIP",9) == 0) fullstrip = 1; if (strncmp(line,"PFX",3) == 0) ft = 'P'; if (strncmp(line,"SFX",3) == 0) ft = 'S'; if (ft != ' ') { char * tp = line; char * piece; ff = 0; i = 0; while ((piece=mystrsep(&tp,' '))) { if (*piece != '\0') { switch(i) { case 0: break; case 1: { achar = *piece; break; } case 2: { if (*piece == 'Y') ff = XPRODUCT; break; } case 3: { numents = atoi(piece); if ((numents < 0) || ((SIZE_MAX/sizeof(struct affent)) < numents)) { fprintf(stderr, "Error: too many entries: %d\n", numents); numents = 0; } else { ptr = malloc(numents * sizeof(struct affent)); ptr->achar = achar; ptr->xpflg = ff; fprintf(stderr,"parsing %c entries %d\n", achar,numents); } break; } default: break; } i++; } free(piece); } /* now parse all of the sub entries*/ nptr = ptr; for (j=0; j < numents; j++) { if (!fgets(line,MAX_LN_LEN,afflst)) return 1; mychomp(line); tp = line; i = 0; while ((piece=mystrsep(&tp,' '))) { if (*piece != '\0') { switch(i) { case 0: { if (nptr != ptr) { nptr->achar = ptr->achar; nptr->xpflg = ptr->xpflg; } break; } case 1: break; case 2: { nptr->strip = mystrdup(piece); nptr->stripl = strlen(nptr->strip); if (strcmp(nptr->strip,"0") == 0) { free(nptr->strip); nptr->strip=mystrdup(""); nptr->stripl = 0; } break; } case 3: { nptr->appnd = mystrdup(piece); nptr->appndl = strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); nptr->appnd=mystrdup(""); nptr->appndl = 0; } if (strchr(nptr->appnd, '/')) { char * addseparator = (char *) realloc(nptr->appnd, nptr->appndl + 2); if (addseparator) { nptr->appndl++; addseparator[nptr->appndl-1] = '|'; addseparator[nptr->appndl] = '\0'; nptr->appnd = addseparator; } } break; } 
case 4: { encodeit(nptr,piece);} fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd, nptr->appndl,nptr->strip,nptr->stripl); default: break; } i++; } free(piece); } nptr++; } if (ptr) { if (ft == 'P') { ptable[numpfx].aep = ptr; ptable[numpfx].num = numents; fprintf(stderr,"ptable %d num is %d flag %c\n",numpfx,ptable[numpfx].num,ptr->achar); numpfx++; } else if (ft == 'S') { stable[numsfx].aep = ptr; stable[numsfx].num = numents; fprintf(stderr,"stable %d num is %d flag %c\n",numsfx,stable[numsfx].num,ptr->achar); numsfx++; } ptr = NULL; } nptr = NULL; numents = 0; achar='\0'; } } free(line); return 0; }
void parse_aff_file(FILE * afflst) { int i, j; int numents = 0; char achar = '\0'; short ff=0; char ft; struct affent * ptr= NULL; struct affent * nptr= NULL; char * line = malloc(MAX_LN_LEN); while (fgets(line,MAX_LN_LEN,afflst)) { mychomp(line); ft = ' '; fprintf(stderr,"parsing line: %s\n",line); if (strncmp(line,"PFX",3) == 0) ft = 'P'; if (strncmp(line,"SFX",3) == 0) ft = 'S'; if (ft != ' ') { char * tp = line; char * piece; i = 0; ff = 0; while ((piece=mystrsep(&tp,' '))) { if (*piece != '\0') { switch(i) { case 0: break; case 1: { achar = *piece; break; } case 2: { if (*piece == 'Y') ff = XPRODUCT; break; } case 3: { numents = atoi(piece); ptr = malloc(numents * sizeof(struct affent)); ptr->achar = achar; ptr->xpflg = ff; fprintf(stderr,"parsing %c entries %d\n",achar,numents); break; } default: break; } i++; } free(piece); } /* now parse all of the sub entries*/ nptr = ptr; for (j=0; j < numents; j++) { fgets(line,MAX_LN_LEN,afflst); mychomp(line); tp = line; i = 0; while ((piece=mystrsep(&tp,' '))) { if (*piece != '\0') { switch(i) { case 0: { if (nptr != ptr) { nptr->achar = ptr->achar; nptr->xpflg = ptr->xpflg; } break; } case 1: break; case 2: { nptr->strip = mystrdup(piece); nptr->stripl = strlen(nptr->strip); if (strcmp(nptr->strip,"0") == 0) { free(nptr->strip); nptr->strip=mystrdup(""); nptr->stripl = 0; } break; } case 3: { nptr->appnd = mystrdup(piece); nptr->appndl = strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); nptr->appnd=mystrdup(""); nptr->appndl = 0; } break; } case 4: { encodeit(nptr,piece);} fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd, nptr->appndl,nptr->strip,nptr->stripl); default: break; } i++; } free(piece); } nptr++; } if (ft == 'P') { ptable[numpfx].aep = ptr; ptable[numpfx].num = numents; fprintf(stderr,"ptable %d num is %d\n",numpfx,ptable[numpfx].num); numpfx++; } else { stable[numsfx].aep = ptr; stable[numsfx].num = numents; fprintf(stderr,"stable %d num is 
%d\n",numsfx,stable[numsfx].num); numsfx++; } ptr = NULL; nptr = NULL; numents = 0; achar='\0'; } } free(line); }
// see if two-level suffix is present in the word char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, PfxEntry* ppfx, const FLAG needflag) { int tmpl; // length of tmpword unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; PfxEntry* ep = ppfx; char * st; char result[MAXLNLEN]; *result = '\0'; // if this suffix is being cross checked with a prefix // but it does not support cross products skip it if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) return NULL; // upon entry suffix is 0 length or already matches the end of the word. // So if the remaining root word has positive length // and if there are enough chars in root word and added back strip chars // to meet the number of characters conditions, then test it tmpl = len - appndl; if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or // or null terminating the shorter string strcpy (tmpword, word); cp = (unsigned char *)(tmpword + tmpl); if (stripl) { strcpy ((char *)cp, strip); tmpl += stripl; cp = (unsigned char *)(tmpword + tmpl); } else *cp = '\0'; // now make sure all of the conditions on characters // are met. 
Please see the appendix at the end of // this file for more info on exactly what is being // tested // if all conditions are met then recall suffix_check if (test_condition((char *) cp, (char *) tmpword)) { if (ppfx) { // handle conditional suffix if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { if (ppfx->getMorph()) { mystrcat(result, ppfx->getMorph(), MAXLNLEN); mystrcat(result, " ", MAXLNLEN); } mystrcat(result,st, MAXLNLEN); free(st); mychomp(result); } } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); if (st) { mystrcat(result, st, MAXLNLEN); free(st); mychomp(result); } } } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { mystrcat(result, st, MAXLNLEN); free(st); mychomp(result); } } if (*result) return mystrdup(result); } } return NULL; }