Beispiel #1
0
/*
 * Entry point of the "unmunch" tool.
 *
 * Usage: unmunch dic_file affix_file
 *
 * The affix file is parsed first.  The dictionary is then read line by
 * line; its first line (the hash table size) is skipped.  Every other
 * line is split at the first '/' into a word and an affix flag string.
 * The word itself and, when flags are present, whatever entries
 * expand_rootword() leaves in wlist[] are written to stdout, one per
 * line.
 *
 * Returns 0 on success and 2 when the dictionary has no first line.
 * Exits with status 1 on a usage error, when a file cannot be opened,
 * when the affix file fails to parse, or when a word cannot be
 * duplicated.
 */
int main(int argc, char** argv)
{
  int i;
  int al, wl;

  FILE * wrdlst;
  FILE * afflst;

  const char *wf, *af;
  char * ap;
  char ts[MAX_LN_LEN];

  /* first parse the command line options */
  /* arg1 - munched wordlist, arg2 - affix file */

  /* argc >= 3 guarantees argv[1] and argv[2] are valid strings, so
     argv is never indexed past its NULL terminator */
  if (argc < 3) {
    fprintf(stderr,"correct syntax is:\n");
    fprintf(stderr,"unmunch dic_file affix_file\n");
    exit(1);
  }

  /* the names are only read, so use them in place; no copy is needed
     and nothing has to be freed later */
  wf = argv[1];
  af = argv[2];

  /* open the affix file */
  afflst = fopen(af,"r");
  if (!afflst) {
    fprintf(stderr,"Error - could not open affix description file\n");
    exit(1);
  }

  /* step one is to parse the affix file building up the internal
     affix data structures */

  numpfx = 0;
  numsfx = 0;
  fullstrip = 0;

  if (parse_aff_file(afflst)) {
    fprintf(stderr,"Error - in affix file loading\n");
    fclose(afflst);
    exit(1);
  }

  fclose(afflst);

  fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx);

  /* affix file is now parsed so expand the wordlist on the fly */

  /* open the wordlist */
  wrdlst = fopen(wf,"r");
  if (!wrdlst) {
    fprintf(stderr,"Error - could not open word list file\n");
    exit(1);
  }

  /* skip over the hash table size */
  if (! fgets(ts, MAX_LN_LEN-1,wrdlst)) {
    fclose(wrdlst);
    return 2;
  }
  mychomp(ts);

  while (fgets(ts,MAX_LN_LEN-1,wrdlst)) {
    mychomp(ts);

    /* split each line into word and affix char strings */
    ap = strchr(ts,'/');
    if (ap) {
      *ap = '\0';
      ap++;
      al = (int) strlen(ap);
    } else {
      al = 0;
      ap = NULL;
    }

    wl = (int) strlen(ts);

    /* the root word is always the first entry of the output list */
    numwords = 0;
    wlist[numwords].word = mystrdup(ts);
    if (!wlist[numwords].word) {
      /* NOTE(review): assumes mystrdup() reports allocation failure by
         returning NULL - confirm against its definition.  Printing a
         NULL pointer with %s below would be undefined behaviour. */
      fprintf(stderr,"Error - out of memory\n");
      fclose(wrdlst);
      exit(1);
    }
    wlist[numwords].pallow = 0;
    numwords++;

    /* only lines that carry affix flags need expanding */
    if (al) {
      expand_rootword(ts,wl,ap);
    }

    /* print every collected word and release it so that wlist[] is
       clean for the next dictionary line */
    for (i=0; i < numwords; i++) {
      fprintf(stdout,"%s\n",wlist[i].word);
      free(wlist[i].word);
      wlist[i].word = NULL;
      wlist[i].pallow = 0;
    }

  }

  fclose(wrdlst);
  return 0;
}
Beispiel #2
0
/*
 * Entry point of the "munch" tool.
 *
 * Usage: munch word_list_file affix_file
 *
 * The affix file is parsed, then the word list is loaded into the hash
 * table via load_tables().  For every table entry, aff_chk() collects
 * candidate (root, prefix, suffix) combinations in roots[].  A
 * candidate is accepted only if every word produced by expanding the
 * root with the combined affix string is found by lookup().  Of the
 * accepted candidates the one with the shortest root wins; that root
 * is marked keep and receives the affix characters.  Entries with no
 * accepted candidate are marked keep themselves.
 *
 * Finally the number of kept entries is printed to stdout, followed by
 * each kept entry as "word/affixes" or "word".
 *
 * Returns 0 on success.  Exits with status 1 on a usage error, when a
 * file cannot be opened, when load_tables() fails, or when a word
 * cannot be duplicated.
 */
int main(int argc, char** argv)
{

  int i, j, k, n;
  int rl, p , nwl;
  int al;
  size_t room;

  FILE * wrdlst;
  FILE * afflst;

  const char *wf, *af;
  char *nword;
  char as[(MAX_PREFIXES + MAX_SUFFIXES)];
  char * ap;

  struct hentry * ep;
  struct hentry * ep1;
  struct affent * pfxp;
  struct affent * sfxp;

  /* first parse the command line options */
  /* arg1 - wordlist, arg2 - affix file */

  /* argc >= 3 guarantees argv[1] and argv[2] are valid strings, so
     argv is never indexed past its NULL terminator */
  if (argc < 3) {
    fprintf(stderr,"correct syntax is:\n");
    fprintf(stderr,"munch word_list_file affix_file\n");
    exit(1);
  }

  /* the names are only read, so use them in place; no copy is needed
     and nothing has to be freed later */
  wf = argv[1];
  af = argv[2];

  /* open the affix file */
  afflst = fopen(af,"r");
  if (!afflst) {
    fprintf(stderr,"Error - could not open affix description file\n");
    exit(1);
  }

  /* step one is to parse the affix file building up the internal
     affix data structures */

  numpfx = 0;
  numsfx = 0;

  parse_aff_file(afflst);
  fclose(afflst);

  fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx);

  /* affix file is now parsed so create hash table of wordlist on the fly */

  /* open the wordlist */
  wrdlst = fopen(wf,"r");
  if (!wrdlst) {
    fprintf(stderr,"Error - could not open word list file\n");
    exit(1);
  }

  if (load_tables(wrdlst)) {
    fprintf(stderr,"Error building hash tables\n");
    fclose(wrdlst);
    exit(1);
  }
  fclose(wrdlst);

  for (i=0; i< tablesize; i++) {
    ep = &tableptr[i];
    if (ep->word == NULL) continue;
    for (  ;  ep != NULL;  ep = ep->next) {
      numroots = 0;
      aff_chk(ep->word,strlen(ep->word));
      if (numroots) {
        /* now there might be a number of combinations */
        /* of prefixes and suffixes that might match this */
        /* word.  So how to choose?  As a first shot look */
        /* for the shortest remaining root word to */
        /* maximize the combinatorial power */

        /* but be careful, do not REQUIRE a specific combination */
        /* of a prefix and a suffix to generate the word since */
        /* that violates the rule that the root word with just */
        /* the prefix or just the suffix must also exist in the */
        /* wordlist as well */

        /* in fact because of the cross product issue, this is not a */
        /* simple choice since some combinations of previous */
        /* prefixes and new suffixes may not be valid. */
        /* The only way to know is to simply try them all */

        rl = 0;   /* length of the best root so far; valid once p != -1 */
        p = -1;   /* index of the best candidate, -1 while none accepted */

        for (j = 0; j < numroots; j++) {

          /* first collect the root word info and build up */
          /* the potential new affix string */
          nword = (roots[j].hashent)->word;
          nwl = (int) strlen(nword);
          ap = as;
          if (roots[j].prefix) *ap++ = (roots[j].prefix)->achar;
          if (roots[j].suffix) *ap++ = (roots[j].suffix)->achar;
          *ap = '\0';
          if ((roots[j].hashent)->affstr) {
            /* bytes left in as[], terminating NUL included */
            room = sizeof(as) - (size_t)(ap - as);
            if (strlen((roots[j].hashent)->affstr) >= room) {
              /* the combined affix string would overflow as[]; */
              /* this candidate cannot be evaluated, so skip it */
              continue;
            }
            strcpy(ap,(roots[j].hashent)->affstr);
          }
          al = (int) strlen(as);

          /* now expand the potential affix string to generate */
          /* all legal words and make sure they all exist in the */
          /* word list */
          numwords = 0;
          wlist[numwords].word = mystrdup(nword);
          if (!wlist[numwords].word) {
            /* NOTE(review): assumes mystrdup() reports allocation */
            /* failure by returning NULL - confirm against its */
            /* definition before relying on this path */
            fprintf(stderr,"Error - out of memory\n");
            exit(1);
          }
          wlist[numwords].pallow = 0;
          numwords++;
          n = 0;
          if (al) {
            expand_rootword(nword,nwl,as,al);
          }
          for (k=0; k<numwords; k++) {
            if (lookup(wlist[k].word)) n++;
            free(wlist[k].word);
            wlist[k].word = NULL;
            wlist[k].pallow = 0;
          }

          /* if all exist in word list then okay; the first accepted */
          /* candidate always wins, later ones only when strictly */
          /* shorter (no arbitrary upper bound on the root length) */
          if (n == numwords) {
            if (p == -1 || nwl < rl) {
              rl = nwl;
              p = j;
            }
          }
        }

        if (p != -1) {
          /* fold this word into the chosen root */
          ep1 = roots[p].hashent;
          pfxp = roots[p].prefix;
          sfxp = roots[p].suffix;
          ep1->keep = 1;
          if (pfxp != NULL) add_affix_char(ep1,pfxp->achar);
          if (sfxp != NULL) add_affix_char(ep1,sfxp->achar);
        } else {
          ep->keep = 1;
        }
      } else {
        ep->keep = 1;
      }
    }
  }

  /* now output only the words to keep along with affixes info */
  /* first count how many words that is */
  k = 0;
  for (i=0; i< tablesize; i++) {
    ep = &tableptr[i];
    if (ep->word == NULL) continue;
    for (  ;  ep != NULL;  ep = ep->next) {
      if (ep->keep > 0) k++;
    }
  }
  fprintf(stdout,"%d\n",k);

  /* then write the kept entries, with their affix string if any */
  for (i=0; i< tablesize; i++) {
    ep = &tableptr[i];
    if (ep->word == NULL) continue;
    for (  ;  ep != NULL;  ep = ep->next) {
      if (ep->keep > 0) {
        if (ep->affstr != NULL) {
          fprintf(stdout,"%s/%s\n",ep->word,ep->affstr);
        } else {
          fprintf(stdout,"%s\n",ep->word);
        }
      }
    }
  }
  return 0;
}