Exemple #1
0
/*
 * Build the word hash table from an already opened word-list stream.
 *
 * The first line of wdlst is parsed as a word count; the global tablesize
 * is set to that count plus 5, bumped to the next odd number if even, and
 * the global tableptr is allocated (zero-filled) with that many entries.
 * Every following line is chomped, duplicated and handed to add_word().
 *
 * Returns 0 on success,
 *         2 if the size line cannot be read,
 *         3 if the size is negative or a memory allocation fails.
 */
int load_tables(FILE * wdlst)
{
  char * ap;
  char ts[MAX_LN_LEN];
  int count;

  /* first read the first line of file to get hash table size */
  if (! fgets(ts, MAX_LN_LEN-1,wdlst)) return 2;
  mychomp(ts);
  count = atoi(ts);

  /* a negative count is malformed input; refuse it before it reaches
   * the size arithmetic and the allocator */
  if (count < 0) return 3;

  tablesize = count + 5;
  if ((tablesize %2) == 0) tablesize++;

  /* allocate the hash table */
  tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
  if (! tableptr) return 3;

  /* loop through all words in the list and add each one to the
   * hash table
   */
  while (fgets(ts,MAX_LN_LEN-1,wdlst)) {
    mychomp(ts);
    ap = mystrdup(ts);
    /* do not pass a failed duplication on to add_word() */
    if (! ap) return 3;
    add_word(ap);
  }
  return 0;
}
Exemple #2
0
/*
 * Entry point: expand a munched word list using an affix file.
 *
 * Usage: unmunch dic_file affix_file
 *
 * The affix file (argv[2]) is parsed first via parse_aff_file(). Then the
 * word list (argv[1]) is read line by line: the first line (the word
 * count) is skipped, and every other line is split at the first '/' into
 * a root word and an affix-flag string. The root word is stored in
 * wlist[0]; when flags are present expand_rootword() is called
 * (presumably appending the derived forms to wlist and advancing
 * numwords - confirm against its definition). Every wlist entry is then
 * written to stdout and released.
 *
 * Exits with status 1 on usage, file-open, affix-parsing or out-of-memory
 * errors; returns 2 if the word list has no first line; returns 0
 * otherwise.
 */
int main(int argc, char** argv)
{
  int i;
  int al, wl;

  FILE * wrdlst;
  FILE * afflst;

  char * ap;
  char ts[MAX_LN_LEN];

  /* first parse the command line options */
  /* arg1 - munched wordlist, arg2 - affix file */

  if (argc < 3 || !argv[1] || !argv[2]) {
    fprintf(stderr,"correct syntax is:\n");
    fprintf(stderr,"unmunch dic_file affix_file\n");
    exit(1);
  }

  /* open the affix file */
  afflst = fopen(argv[2],"r");
  if (!afflst) {
    fprintf(stderr,"Error - could not open affix description file\n");
    exit(1);
  }

  /* step one is to parse the affix file building up the internal
     affix data structures */

  numpfx = 0;
  numsfx = 0;
  fullstrip = 0;

  if (parse_aff_file(afflst)) {
    fprintf(stderr,"Error - in affix file loading\n");
    exit(1);
  }

  fclose(afflst);

  fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx);

  /* affix file is now parsed so process the wordlist on the fly */

  /* open the wordlist */
  wrdlst = fopen(argv[1],"r");
  if (!wrdlst) {
    fprintf(stderr,"Error - could not open word list file\n");
    exit(1);
  }

  /* skip over the hash table size; its content is not used here */
  if (! fgets(ts, MAX_LN_LEN-1,wrdlst)) {
    fclose(wrdlst);
    return 2;
  }

  while (fgets(ts,MAX_LN_LEN-1,wrdlst)) {
    mychomp(ts);
    /* split each line into word and affix char strings */
    ap = strchr(ts,'/');
    if (ap) {
      *ap = '\0';
      ap++;
      al = (int) strlen(ap);
    } else {
      al = 0;
      ap = NULL;
    }

    wl = (int) strlen(ts);

    /* the root word itself is always the first output entry */
    numwords = 0;
    wlist[numwords].word = mystrdup(ts);
    if (!wlist[numwords].word) {
      fprintf(stderr,"Error - out of memory\n");
      fclose(wrdlst);
      exit(1);
    }
    wlist[numwords].pallow = 0;
    numwords++;

    if (al)
       expand_rootword(ts,wl,ap);

    /* emit and release everything collected for this root word */
    for (i=0; i < numwords; i++) {
      fprintf(stdout,"%s\n",wlist[i].word);
      free(wlist[i].word);
      wlist[i].word = NULL;
      wlist[i].pallow = 0;
    }

  }

  fclose(wrdlst);
  return 0;
}
Exemple #3
0
int parse_aff_file(FILE * afflst)
{  
    int i, j;
    int numents=0;
    char achar='\0';
    short ff=0;
    char ft;
    struct affent * ptr= NULL;
    struct affent * nptr= NULL;
    char * line = malloc(MAX_LN_LEN);

    while (fgets(line,MAX_LN_LEN,afflst)) {
       mychomp(line);
       ft = ' ';
       fprintf(stderr,"parsing line: %s\n",line);
       if (strncmp(line,"FULLSTRIP",9) == 0) fullstrip = 1;
       if (strncmp(line,"PFX",3) == 0) ft = 'P';
       if (strncmp(line,"SFX",3) == 0) ft = 'S';
       if (ft != ' ') {
          char * tp = line;
          char * piece;
	  ff = 0;
          i = 0;
          while ((piece=mystrsep(&tp,' '))) {
             if (*piece != '\0') {
                 switch(i) {
                    case 0: break;
                    case 1: { achar = *piece; break; }
                    case 2: { if (*piece == 'Y') ff = XPRODUCT; break; }
                    case 3: { numents = atoi(piece); 
                              if ((numents < 0) ||
                                  ((SIZE_MAX/sizeof(struct affent)) < numents))
                              {
                                 fprintf(stderr,
                                     "Error: too many entries: %d\n", numents);
                                 numents = 0;
                              } else {
                                 ptr = malloc(numents * sizeof(struct affent));
                                 ptr->achar = achar;
                                 ptr->xpflg = ff;
                                 fprintf(stderr,"parsing %c entries %d\n",
                                         achar,numents);
                              }
                              break;
                            }
		    default: break;
                 }
                 i++;
             }
             free(piece);
          }
          /* now parse all of the sub entries*/
          nptr = ptr;
          for (j=0; j < numents; j++) {
             if (!fgets(line,MAX_LN_LEN,afflst)) return 1;
             mychomp(line);
             tp = line;
             i = 0;
             while ((piece=mystrsep(&tp,' '))) {
                if (*piece != '\0') {
                    switch(i) {
                       case 0: { if (nptr != ptr) {
                                   nptr->achar = ptr->achar;
                                   nptr->xpflg = ptr->xpflg;
                                 }
                                 break;
                               }
                       case 1: break;
                       case 2: { nptr->strip = mystrdup(piece);
                                 nptr->stripl = strlen(nptr->strip);
                                 if (strcmp(nptr->strip,"0") == 0) {
                                   free(nptr->strip);
                                   nptr->strip=mystrdup("");
				   nptr->stripl = 0;
                                 }
                                 break; 
                               }
                       case 3: { nptr->appnd = mystrdup(piece);
                                 nptr->appndl = strlen(nptr->appnd);
                                 if (strcmp(nptr->appnd,"0") == 0) {
                                   free(nptr->appnd);
                                   nptr->appnd=mystrdup("");
				   nptr->appndl = 0;
                                 }   
                                 if (strchr(nptr->appnd, '/')) {
                                    char * addseparator = (char *) realloc(nptr->appnd, nptr->appndl + 2);
                                    if (addseparator) {
                                      nptr->appndl++;
                                      addseparator[nptr->appndl-1] = '|';
                                      addseparator[nptr->appndl] = '\0';
                                      nptr->appnd = addseparator;
                                    }
                                 }
                                 break; 
                               }
                       case 4: { encodeit(nptr,piece);}
                               fprintf(stderr, "   affix: %s %d, strip: %s %d\n",nptr->appnd,
                                                   nptr->appndl,nptr->strip,nptr->stripl);
		       default: break;
                    }
                    i++;
                }
                free(piece);
             }
             nptr++;
          }
          if (ptr) {
             if (ft == 'P') {
                ptable[numpfx].aep = ptr;
                ptable[numpfx].num = numents;
                fprintf(stderr,"ptable %d num is %d flag %c\n",numpfx,ptable[numpfx].num,ptr->achar);
                numpfx++;
             } else if (ft == 'S') {
                stable[numsfx].aep = ptr;
                stable[numsfx].num = numents;
                fprintf(stderr,"stable %d num is %d flag %c\n",numsfx,stable[numsfx].num,ptr->achar);
                numsfx++;
             }
             ptr = NULL;
          }
          nptr = NULL;
          numents = 0;
          achar='\0';
       }
    }
    free(line);
    return 0;
}
Exemple #4
0
/*
 * Parse an affix description stream into the global affix tables.
 *
 * Lines starting with "PFX" or "SFX" are table headers whose
 * space-separated fields are: keyword, flag character, cross-product
 * marker ('Y' sets XPRODUCT) and the number of entries. The following
 * <count> lines are the entries; fields 2 and 3 are the strip and append
 * strings (a literal "0" means the empty string) and field 4 is passed to
 * encodeit(). A finished table is stored in ptable[numpfx] or
 * stable[numsfx] and the matching counter is incremented.
 *
 * The function cannot report errors to its caller. A bad entry count is
 * reported on stderr and the table is skipped; a table cut short by end
 * of input is registered with the entries read so far; on memory
 * exhaustion a message is printed, the table under construction is
 * released and parsing stops.
 *
 * NOTE(review): no bound check on numpfx/numsfx against the capacity of
 * ptable/stable is visible here - confirm the arrays are large enough.
 */
void parse_aff_file(FILE * afflst)
{
    int i, j;
    int numents = 0;
    char achar = '\0';
    short ff=0;
    char ft;
    struct affent * ptr= NULL;
    struct affent * nptr= NULL;
    char * line = malloc(MAX_LN_LEN);

    if (!line) {
       fprintf(stderr,"Error: out of memory\n");
       return;
    }

    while (fgets(line,MAX_LN_LEN,afflst)) {
       mychomp(line);
       ft = ' ';
       fprintf(stderr,"parsing line: %s\n",line);
       if (strncmp(line,"PFX",3) == 0) ft = 'P';
       if (strncmp(line,"SFX",3) == 0) ft = 'S';
       if (ft != ' ') {
          char * tp = line;
          char * piece;
          i = 0;
          ff = 0;
          /* each piece returned by mystrsep is released below;
             empty pieces (runs of spaces) do not advance the field index */
          while ((piece=mystrsep(&tp,' '))) {
             if (*piece != '\0') {
                 switch(i) {
                    case 0: break;
                    case 1: { achar = *piece; break; }
                    case 2: { if (*piece == 'Y') ff = XPRODUCT; break; }
                    case 3: { numents = atoi(piece);
                              /* reject counts that are negative or whose
                                 byte size would not fit in a size_t */
                              if ((numents < 0) ||
                                  ((((size_t)-1)/sizeof(struct affent)) < (size_t)numents))
                              {
                                 fprintf(stderr,
                                     "Error: too many entries: %d\n", numents);
                                 numents = 0;
                              } else {
                                 /* an empty table needs no storage; a
                                    zero-sized block must not be written to */
                                 if (numents > 0) {
                                    /* zero-filled so that unset strip/appnd
                                       pointers are NULL, not garbage */
                                    ptr = calloc(numents, sizeof(struct affent));
                                    if (!ptr) {
                                       free(piece);
                                       goto oom;
                                    }
                                    ptr->achar = achar;
                                    ptr->xpflg = ff;
                                 }
                                 fprintf(stderr,"parsing %c entries %d\n",achar,numents);
                              }
                              break;
                            }
                    default: break;
                 }
                 i++;
             }
             free(piece);
          }
          /* now parse all of the sub entries*/
          nptr = ptr;
          for (j=0; j < numents; j++) {
             if (!fgets(line,MAX_LN_LEN,afflst)) {
                /* input ended early: keep only the entries actually read
                   instead of re-parsing the stale buffer */
                fprintf(stderr,"Error: affix table truncated\n");
                numents = j;
                break;
             }
             mychomp(line);
             tp = line;
             i = 0;
             while ((piece=mystrsep(&tp,' '))) {
                if (*piece != '\0') {
                    switch(i) {
                       case 0: { /* entries after the first inherit the
                                    header's flag and cross-product bit */
                                 if (nptr != ptr) {
                                   nptr->achar = ptr->achar;
                                   nptr->xpflg = ptr->xpflg;
                                 }
                                 break;
                               }
                       case 1: break;
                       case 2: { nptr->strip = mystrdup(piece);
                                 if (!nptr->strip) { free(piece); goto oom; }
                                 nptr->stripl = strlen(nptr->strip);
                                 if (strcmp(nptr->strip,"0") == 0) {
                                   free(nptr->strip);
                                   nptr->strip=mystrdup("");
                                   if (!nptr->strip) { free(piece); goto oom; }
                                   nptr->stripl = 0;
                                 }
                                 break;
                               }
                       case 3: { nptr->appnd = mystrdup(piece);
                                 if (!nptr->appnd) { free(piece); goto oom; }
                                 nptr->appndl = strlen(nptr->appnd);
                                 if (strcmp(nptr->appnd,"0") == 0) {
                                   free(nptr->appnd);
                                   nptr->appnd=mystrdup("");
                                   if (!nptr->appnd) { free(piece); goto oom; }
                                   nptr->appndl = 0;
                                 }
                                 break;
                               }
                       case 4: { encodeit(nptr,piece);}
                               fprintf(stderr, "   affix: %s %d, strip: %s %d\n",nptr->appnd,
                                                   nptr->appndl,nptr->strip,nptr->stripl);
                               /* fallthrough */
                       default: break;
                    }
                    i++;
                }
                free(piece);
             }
             nptr++;
          }
          /* only register tables that actually own storage */
          if (ptr) {
             if (ft == 'P') {
                ptable[numpfx].aep = ptr;
                ptable[numpfx].num = numents;
                fprintf(stderr,"ptable %d num is %d\n",numpfx,ptable[numpfx].num);
                numpfx++;
             } else {
                stable[numsfx].aep = ptr;
                stable[numsfx].num = numents;
                fprintf(stderr,"stable %d num is %d\n",numsfx,stable[numsfx].num);
                numsfx++;
             }
          }
          ptr = NULL;
          nptr = NULL;
          numents = 0;
          achar='\0';
       }
    }
    free(line);
    return;

oom:
    fprintf(stderr,"Error: out of memory\n");
    /* release the table under construction; entries never reached are
       still zero-filled, so freeing their strip/appnd is harmless */
    if (ptr) {
       for (j=0; j < numents; j++) {
          free(ptr[j].strip);
          free(ptr[j].appnd);
       }
       free(ptr);
    }
    free(line);
}
// see if two-level suffix is present in the word
char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
    PfxEntry* ppfx, const FLAG needflag)
{
    int                 tmpl;            // length of tmpword
    unsigned char *     cp;
    char                tmpword[MAXWORDUTF8LEN + 4];
    PfxEntry* ep = ppfx;
    char * st;

    char result[MAXLNLEN];

    *result = '\0';

    // if this suffix is being cross checked with a prefix
    // but it does not support cross products skip it

    if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
        return NULL;

    // upon entry suffix is 0 length or already matches the end of the word.
    // So if the remaining root word has positive length
    // and if there are enough chars in root word and added back strip chars
    // to meet the number of characters conditions, then test it

    tmpl = len - appndl;

    if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
       (tmpl + stripl >= numconds)) {

            // generate new root word by removing suffix and adding
            // back any characters that would have been stripped or
            // or null terminating the shorter string

            strcpy (tmpword, word);
            cp = (unsigned char *)(tmpword + tmpl);
            if (stripl) {
                strcpy ((char *)cp, strip);
                tmpl += stripl;
                cp = (unsigned char *)(tmpword + tmpl);
            } else *cp = '\0';

            // now make sure all of the conditions on characters
            // are met.  Please see the appendix at the end of
            // this file for more info on exactly what is being
            // tested

            // if all conditions are met then recall suffix_check

            if (test_condition((char *) cp, (char *) tmpword)) {
                if (ppfx) {
                    // handle conditional suffix
                    if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
                        st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
                        if (st) {
                            if (ppfx->getMorph()) {
                                mystrcat(result, ppfx->getMorph(), MAXLNLEN);
                                mystrcat(result, " ", MAXLNLEN);
                            }
                            mystrcat(result,st, MAXLNLEN);
                            free(st);
                            mychomp(result);
                        }
                    } else {
                        st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
                        if (st) {
                            mystrcat(result, st, MAXLNLEN);
                            free(st);
                            mychomp(result);
                        }
                    }
                } else {
                        st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
                        if (st) {
                            mystrcat(result, st, MAXLNLEN);
                            free(st);
                            mychomp(result);
                        }
                }
                if (*result) return mystrdup(result);
            }
    }
    return NULL;
}