/**
 * <EN>
 * @brief  Read in word N-gram from file and set it up for recognition.
 *
 * If a binary N-gram file is configured it is loaded as is.  Otherwise the
 * ARPA files are used, and the behavior depends on which of the LR
 * (forward) and RL (backward) files are given.  When only one of them is
 * specified, this function simply reads it.  When both are specified, the
 * RL model is read fully as the primary model, and then only the 2-gram
 * part of the LR model is additionally read as the first pass LM.
 *
 * After loading, the unknown (OOV) word id is set when a non-default
 * unknown word name is configured, a mapping from dictionary words to
 * N-gram entries is created, and the BOS / EOS unigram probabilities are
 * post-fixed for SRILM.
 * </EN>
 *
 * @param lmconf [in] LM configuration variables
 * @param winfo [i/o] word dictionary that will be used with this N-gram.
 * each word in the dictionary will be assigned to an N-gram entry here.
 *
 * @return the newly created N-gram information data, or NULL on failure.
 * NULL is also returned when @a lmconf names no N-gram file at all.
 */
static NGRAM_INFO *
initialize_ngram(JCONF_LM *lmconf, WORD_INFO *winfo)
{
  NGRAM_INFO *ngram;
  /* Start from FALSE: when no binary, RL or LR file is configured none of
     the loaders below runs, and the result must still be well defined
     (it was previously read uninitialized in that case). */
  boolean ret = FALSE;

  /* allocate new */
  ngram = ngram_info_new();

  /* load LM */
  if (lmconf->ngram_filename != NULL) {
    /* binary format */
    ret = init_ngram_bin(ngram, lmconf->ngram_filename);
  } else if (lmconf->ngram_filename_rl_arpa) {
    /* ARPA format, backward N-gram given: use it as the main model */
    ret = init_ngram_arpa(ngram, lmconf->ngram_filename_rl_arpa, DIR_RL);
    if (ret != FALSE && lmconf->ngram_filename_lr_arpa) {
      /* both specified: additionally use the forward 2-gram only,
         for the 1st pass (this is an old behavior) */
      ret = init_ngram_arpa_additional(ngram, lmconf->ngram_filename_lr_arpa);
    }
  } else if (lmconf->ngram_filename_lr_arpa) {
    /* ARPA format, forward N-gram only */
    ret = init_ngram_arpa(ngram, lmconf->ngram_filename_lr_arpa, DIR_LR);
  }

  /* single failure exit for every loading path above */
  if (ret == FALSE) {
    ngram_info_free(ngram);
    return NULL;
  }

  /* set unknown (=OOV) word id, only when the configured name differs
     from the default one */
  if (strcmp(lmconf->unknown_name, UNK_WORD_DEFAULT)) {
    set_unknown_id(ngram, lmconf->unknown_name);
  }

  /* map dict item to N-gram entry */
  if (make_voca_ref(ngram, winfo) == FALSE) {
    ngram_info_free(ngram);
    return NULL;
  }

  /* post-fix EOS / BOS uni prob for SRILM */
  fix_uniprob_srilm(ngram, winfo);

  return(ngram);
}
int main(int argc, char *argv[]) { FILE *fp; char header[512]; time_t now; char *binfile, *lrfile, *rlfile, *outfile; int i; char *from_code, *to_code, *buf; boolean charconv_enabled = FALSE; boolean force_swap = FALSE; WORD_ID w; binfile = lrfile = rlfile = outfile = NULL; from_code = to_code = NULL; if (argc <= 1) { usage(argv[0]); return -1; } for(i=1;i<argc;i++) { if (argv[i][0] == '-') { if (argv[i][1] == 'd') { if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } binfile = argv[i]; } else if (argv[i][1] == 'n') { switch(argv[i][2]) { case 'l': if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } lrfile = argv[i]; break; case 'r': if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } rlfile = argv[i]; break; default: printf("Error: no such option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } } else if (argv[i][1] == 'c') { if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); return -1; } from_code = strcpy((char*)mymalloc(strlen(argv[i])+1), argv[i]); if (++i >= argc) { printf("Error: no argument for option \"%s\"\n", argv[i]); usage(argv[0]); free(from_code); return -1; } to_code = strcpy((char*)mymalloc(strlen(argv[i])+1),argv[i]); charconv_enabled = TRUE; } else if (argv[i][1] == 's') { force_swap = TRUE; } } else { if (outfile == NULL) { outfile = argv[i]; } else { printf("Error: more than one output file\n"); usage(argv[0]); return -1; } } } if (!outfile) { printf("Error: no output file specified\n"); usage(argv[0]); return -1; } if (binfile) { if (lrfile || rlfile) { printf("Error: both binary file and ARPA file are specified\n"); usage(argv[0]); return -1; } printf("bingram: %s\n", binfile); } else { if (rlfile) { printf("backward n-gram: %s\n", rlfile); if (lrfile) { printf("additional forward 2-gram for 1st pass: %s\n", lrfile); } } else if (lrfile) 
{ printf("forward n-gram: %s\n", lrfile); } else { printf("Error: no input N-gram file specified\n"); usage(argv[0]); return -1; } } printf("\nSTART LOADING\n\n"); /* make header string */ now = time(NULL); if (binfile) { sprintf(header, "converted at %s\nfrom bingram = %s\n", ctime(&now), binfile); } else { if (rlfile && lrfile) { sprintf(header, "converted at %s\nfrom n-gram = %s, LR 2-gram = %s\n", ctime(&now), rlfile, lrfile); } else if (rlfile) { sprintf(header, "converted at %s\nfrom n-gram = %s\n", ctime(&now), rlfile); } else { sprintf(header, "converted at %s\nfrom n-gram = %s\n", ctime(&now), lrfile); } } ngram = ngram_info_new(); if (binfile) { /* read in bingram */ if (init_ngram_bin(ngram, binfile) == FALSE) return -1; } else { /* read in ARPA n-gram */ if (force_swap) { ngram->bos_eos_swap = TRUE; } if (rlfile) { if (init_ngram_arpa(ngram, rlfile, DIR_RL) == FALSE) return -1; if (lrfile) { if (init_ngram_arpa_additional(ngram, lrfile) == FALSE) return -1; } } else if (lrfile) { if (init_ngram_arpa(ngram, lrfile, DIR_LR) == FALSE) return -1; } } print_ngram_info(stdout, ngram); if (charconv_enabled == TRUE) { /* do character conversion */ if (charconv_setup(from_code, to_code) == -1) { fprintf(stderr, "failed to setup character convertsion\n"); return -1; } buf = (char *)mymalloc(4096); for (w = 0; w < ngram->max_word_num; w++) { charconv(ngram->wname[w], buf, 4096); ngram->wname[w] = mybmalloc2(strlen(buf)+1, &(ngram->mroot)); strcpy(ngram->wname[w], buf); } free(buf); } /* write in JULIUS binary format */ if ((fp = fopen_writefile(outfile)) == NULL) { fprintf(stderr, "failed to open \"%s\"\n", outfile); return -1; } printf("\nWriting in v5 format to \"%s\"...\n", outfile); if (ngram_write_bin(fp, ngram, header) == FALSE){/* failed */ fprintf(stderr, "failed to write \"%s\"\n",outfile); return -1; } fclose_writefile(fp); printf("completed\n"); return 0; }