Beispiel #1
0
/** 
 * <JA>
 * @brief  単語N-gramをファイルから読み込んでセットアップする. 
 *
 * ARPA フォーマットで指定時は,LRファイルと RL ファイルの組合せで
 * 動作が異なる. LR のみ,あるいは RL のみ指定時は,それをそのまま読み込む. 
 * 双方とも指定されている場合は,RLをまず主モデルとして読み込んだ後,
 * LR の 2-gram だけを第1パス用に主モデルに追加読み込みする. 
 *
 * また,読み込み終了後,辞書上のN-gramエントリとのマッチングを取る. 
 * 
 * </JA>
 * <EN>
 * @brief  Read in word N-gram from file and setup for recognition.
 *
 * When N-gram is specified in ARPA format, the behavior depends on whether
 * N-grams are specified in "-nlr" and "-nrl".  When only one of them is
 * specified, this function simply reads it.  If both are specified,
 * it will read the RL model fully as a primary model, and additionally
 * read only the 2-gram part of the LR model as the first pass LM.
 *
 * Also, this function creates the mapping from dictionary words to LM entries.
 * 
 * </EN>
 *
 * @param lmconf [in] LM configuration variables
 * @param winfo [i/o] word dictionary that will be used with this N-gram.
 * each word in the dictionary will be assigned to an N-gram entry here.
 *
 * @return the newly created N-gram information data, or NULL on failure.
 * 
 */
static NGRAM_INFO *
initialize_ngram(JCONF_LM *lmconf, WORD_INFO *winfo)
{
  NGRAM_INFO *ngram;
  /* Start from FALSE: when neither a binary file nor any ARPA file is
     configured, no loader below runs, and that case must be reported as
     a failure instead of testing a variable that was never assigned. */
  boolean ret = FALSE;

  /* allocate new */
  ngram = ngram_info_new();

  /* load LM */
  if (lmconf->ngram_filename != NULL) {
    /* binary format */
    ret = init_ngram_bin(ngram, lmconf->ngram_filename);
  } else if (lmconf->ngram_filename_rl_arpa) {
    /* ARPA format with a backward N-gram: read it as the main model.
       If a forward N-gram is given as well, it is read additionally
       for the 1st pass (this is an old behavior). */
    ret = init_ngram_arpa(ngram, lmconf->ngram_filename_rl_arpa, DIR_RL);
    if (ret != FALSE && lmconf->ngram_filename_lr_arpa) {
      ret = init_ngram_arpa_additional(ngram, lmconf->ngram_filename_lr_arpa);
    }
  } else if (lmconf->ngram_filename_lr_arpa) {
    /* ARPA format with a forward N-gram only: read it as is */
    ret = init_ngram_arpa(ngram, lmconf->ngram_filename_lr_arpa, DIR_LR);
  }

  /* single failure exit for every loader above (and for "nothing to load") */
  if (ret == FALSE) {
    ngram_info_free(ngram);
    return NULL;
  }

  /* set unknown (=OOV) word id; only done when the configured name
     differs from UNK_WORD_DEFAULT */
  if (strcmp(lmconf->unknown_name, UNK_WORD_DEFAULT) != 0) {
    set_unknown_id(ngram, lmconf->unknown_name);
  }

  /* map dict item to N-gram entry */
  if (make_voca_ref(ngram, winfo) == FALSE) {
    ngram_info_free(ngram);
    return NULL;
  }

  /* post-fix EOS / BOS uni prob for SRILM */
  fix_uniprob_srilm(ngram, winfo);

  return(ngram);
}
Beispiel #2
0
/**
 * Report an option that lacks its argument and print the usage text.
 *
 * @param progname [in] program name, handed to usage()
 * @param optname [in] the option string whose argument is missing
 *
 * @return always -1, so that main() can return the result directly.
 */
static int
missing_option_argument(char *progname, char *optname)
{
  printf("Error: no argument for option \"%s\"\n", optname);
  usage(progname);
  return -1;
}

/**
 * Entry point: read an N-gram (binary, or ARPA backward and/or forward),
 * optionally convert the character code of the word names, and write the
 * result to the output file with ngram_write_bin().
 *
 * Options handled here (matched on the characters following '-'):
 *   -d file       binary N-gram input
 *   -nl... file   forward (LR) ARPA N-gram input
 *   -nr... file   backward (RL) ARPA N-gram input
 *   -c from to    enable character conversion of word names
 *   -s            set bos_eos_swap on the N-gram before reading ARPA files
 * The single non-option argument is the output file.
 *
 * @param argc [in] number of command line arguments
 * @param argv [in] command line arguments
 *
 * @return 0 on success, -1 on any error.
 */
int
main(int argc, char *argv[])
{
  FILE *fp;
  char header[512];
  time_t now;
  const char *timestr;
  int hlen;
  char *binfile, *lrfile, *rlfile, *outfile;
  int i;
  char *from_code, *to_code, *buf;
  char *optname;
  boolean charconv_enabled = FALSE;
  boolean force_swap = FALSE;
  WORD_ID w;

  binfile = lrfile = rlfile = outfile = NULL;
  from_code = to_code = NULL;
  if (argc <= 1) {
    usage(argv[0]);
    return -1;
  }

  for (i = 1; i < argc; i++) {
    if (argv[i][0] == '-') {
      /* remember the option itself: after ++i runs past the end,
         argv[i] is no longer a printable string */
      optname = argv[i];
      if (optname[1] == 'd') {
        if (++i >= argc) {
          return missing_option_argument(argv[0], optname);
        }
        binfile = argv[i];
      } else if (optname[1] == 'n') {
        switch (optname[2]) {
        case 'l':
          if (++i >= argc) {
            return missing_option_argument(argv[0], optname);
          }
          lrfile = argv[i];
          break;
        case 'r':
          if (++i >= argc) {
            return missing_option_argument(argv[0], optname);
          }
          rlfile = argv[i];
          break;
        default:
          printf("Error: no such option \"%s\"\n", optname);
          usage(argv[0]);
          return -1;
        }
      } else if (optname[1] == 'c') {
        /* -c takes two arguments: source and destination character code */
        if (++i >= argc) {
          return missing_option_argument(argv[0], optname);
        }
        from_code = strcpy((char*)mymalloc(strlen(argv[i])+1), argv[i]);
        if (++i >= argc) {
          free(from_code);
          return missing_option_argument(argv[0], optname);
        }
        to_code = strcpy((char*)mymalloc(strlen(argv[i])+1), argv[i]);
        charconv_enabled = TRUE;
      } else if (optname[1] == 's') {
        force_swap = TRUE;
      } else {
        /* reject unknown options the same way unknown "-n?" options are */
        printf("Error: no such option \"%s\"\n", optname);
        usage(argv[0]);
        return -1;
      }
    } else {
      if (outfile == NULL) {
        outfile = argv[i];
      } else {
        printf("Error: more than one output file\n");
        usage(argv[0]);
        return -1;
      }
    }
  }

  if (!outfile) {
    printf("Error: no output file specified\n");
    usage(argv[0]);
    return -1;
  }

  if (binfile) {
    if (lrfile || rlfile) {
      printf("Error: both binary file and ARPA file are specified\n");
      usage(argv[0]);
      return -1;
    }
    printf("bingram: %s\n", binfile);
  } else {
    if (rlfile) {
      printf("backward n-gram: %s\n", rlfile);
      if (lrfile) {
        printf("additional forward 2-gram for 1st pass: %s\n", lrfile);
      }
    } else if (lrfile) {
      printf("forward n-gram: %s\n", lrfile);
    } else {
      printf("Error: no input N-gram file specified\n");
      usage(argv[0]);
      return -1;
    }
  }

  printf("\nSTART LOADING\n\n");

  /* make header string; file names come from the command line, so the
     write is bounded by the buffer size */
  now = time(NULL);
  timestr = ctime(&now);
  if (timestr == NULL) {
    /* ctime() may fail; keep the trailing newline that ctime() would add */
    timestr = "(unknown time)\n";
  }
  if (binfile) {
    hlen = snprintf(header, sizeof(header), "converted at %s\nfrom bingram = %s\n", timestr, binfile);
  } else {
    if (rlfile && lrfile) {
      hlen = snprintf(header, sizeof(header), "converted at %s\nfrom n-gram = %s, LR 2-gram = %s\n", timestr, rlfile, lrfile);
    } else if (rlfile) {
      hlen = snprintf(header, sizeof(header), "converted at %s\nfrom n-gram = %s\n", timestr, rlfile);
    } else {
      hlen = snprintf(header, sizeof(header), "converted at %s\nfrom n-gram = %s\n", timestr, lrfile);
    }
  }
  if (hlen < 0) {
    /* formatting failed: fall back to an empty header */
    header[0] = '\0';
  } else if ((size_t)hlen >= sizeof(header)) {
    fprintf(stderr, "Warning: header string too long, truncated to %d bytes\n", (int)sizeof(header) - 1);
  }

  /* NOTE(review): "ngram" is not declared in this function; presumably a
     file-scope NGRAM_INFO pointer -- confirm against the rest of the file */
  ngram = ngram_info_new();
  if (binfile) {
    /* read in bingram */
    if (init_ngram_bin(ngram, binfile) == FALSE) return -1;
  } else {
    /* read in ARPA n-gram */
    if (force_swap) {
      ngram->bos_eos_swap = TRUE;
    }
    if (rlfile) {
      if (init_ngram_arpa(ngram, rlfile, DIR_RL) == FALSE) return -1;
      if (lrfile) {
        if (init_ngram_arpa_additional(ngram, lrfile) == FALSE) return -1;
      }
    } else if (lrfile) {
      if (init_ngram_arpa(ngram, lrfile, DIR_LR) == FALSE) return -1;
    }
  }

  print_ngram_info(stdout, ngram);

  if (charconv_enabled == TRUE) {
    /* do character conversion */
    if (charconv_setup(from_code, to_code) == -1) {
      fprintf(stderr, "failed to setup character convertsion\n");
      return -1;
    }
    buf = (char *)mymalloc(4096);
    for (w = 0; w < ngram->max_word_num; w++) {
      /* convert into the work buffer, then point the entry at a fresh
         copy allocated from ngram->mroot; the old pointer is overwritten */
      charconv(ngram->wname[w], buf, 4096);
      ngram->wname[w] = mybmalloc2(strlen(buf)+1, &(ngram->mroot));
      strcpy(ngram->wname[w], buf);
    }
    free(buf);
  }

  /* write in JULIUS binary format */
  if ((fp = fopen_writefile(outfile)) == NULL) {
    fprintf(stderr, "failed to open \"%s\"\n", outfile);
    return -1;
  }
  printf("\nWriting in v5 format to \"%s\"...\n", outfile);
  if (ngram_write_bin(fp, ngram, header) == FALSE) { /* failed */
    fprintf(stderr, "failed to write \"%s\"\n", outfile);
    /* do not leave the output handle open on the error path */
    fclose_writefile(fp);
    return -1;
  }
  fclose_writefile(fp);

  printf("completed\n");

  return 0;

}