/**
 * Read a word list file into a string-to-int hash table, assigning each
 * word the 1-based ID equal to its position in the file.
 *
 * Lines beginning with "##" are treated as comments and skipped.
 *
 * @param wlist_filename  Name of the word list file to read.
 * @param verbosity       If non-zero (and verbose_cmuclmtk is on), report
 *                        the number of words read to stderr.
 * @param p_word_id_ht    Pointer of the word ID hash table
 * @param p_n_wlist       Pointer of the size of wordlist
 */
void read_wlist_into_siht(char *wlist_filename, int verbosity,
                          sih_t *p_word_id_ht,
                          vocab_sz_t *p_n_wlist)
{
  static char rname[]="read_wlist_into_siht";
  FILE *wlist_fp = rr_iopen(wlist_filename);
  char wlist_entry[1024], word[256], *word_copy;
  vocab_sz_t entry_no = 0;

  while (fgets (wlist_entry, sizeof (wlist_entry), wlist_fp)) {
    if (strncmp(wlist_entry,"##",2) == 0)
      continue;

    /* BUGFIX: the original used an unbounded "%s" (overflow for words longer
       than 255 characters) and never checked the result, so a blank or
       whitespace-only line handed an uninitialized buffer to rr_salloc().
       Bound the conversion and skip lines with no word on them. */
    if (sscanf (wlist_entry, "%255s", word) != 1)
      continue;

    entry_no++;
    if (entry_no % 1000 == 0) {   /* progress indicator */
      fprintf(stdout,".");
      fflush(stdout);
    }

    warn_on_wrong_vocab_comments(wlist_entry);
    word_copy = rr_salloc(word);
    sih_add(p_word_id_ht, word_copy, entry_no);
    /* NOTE(review): freeing word_copy here is only safe if sih_add() makes
       its own copy of the string; if it stores the pointer, the hash table
       is left dangling.  Kept as in the original (// HLW) -- confirm
       sih_add()'s ownership semantics. */
    free(word_copy); // HLW
  }
  fprintf(stdout,"\n");
  fflush(stdout);
  rr_iclose(wlist_fp);

  if (verbose_cmuclmtk == 1) {
    if (verbosity)
      fprintf(stderr,"%s: a list of %d words was read from \"%s\".\n",
              rname,(int) entry_no,wlist_filename);
  }
  *p_n_wlist = entry_no;
}
int oe_02_main (int argc, char **argv) { ng_t ng; arpa_lm_t arpa_ng; char input_string[500]; int num_of_args; char *args[MAX_ARGS]; char *lm_filename_arpa; char *lm_filename_binary; flag told_to_quit; flag inconsistant_parameters; flag backoff_from_unk_inc; flag backoff_from_unk_exc; flag backoff_from_ccs_inc; flag backoff_from_ccs_exc; flag arpa_lm; flag binary_lm; flag include_unks; char *fb_list_filename; char *probs_stream_filename; char *annotation_filename; char *text_stream_filename; char *oov_filename; char *ccs_filename; int generate_size; int random_seed; double log_base; char wlist_entry[1024]; char current_cc[200]; vocab_sz_t current_cc_id; FILE *context_cues_fp; int n; /* Process command line */ report_version(&argc,argv); if (pc_flagarg(&argc, argv,"-help") || argc == 1 || (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) { oe_02_help_message(); exit(1); } lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa","")); if (strcmp(lm_filename_arpa,"")) arpa_lm = 1; else arpa_lm = 0; lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary","")); if (strcmp(lm_filename_binary,"")) binary_lm = 1; else binary_lm = 0; if (arpa_lm && binary_lm) quit(-1,"Error : Can't use both -arpa and -binary flags.\n"); if (!arpa_lm && !binary_lm) quit(-1,"Error : Must specify either a binary or an arpa format language model.\n"); ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context","")); if (binary_lm && strcmp(ccs_filename,"")) fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n"); pc_report_unk_args(&argc,argv,2); /* Load language model */ if (arpa_lm) { fprintf(stderr,"Reading in language model from file %s\n", lm_filename_arpa); load_arpa_lm(&arpa_ng,lm_filename_arpa); }else { fprintf(stderr,"Reading in language model from file %s\n", lm_filename_binary); load_lm(&ng,lm_filename_binary); } fprintf(stderr,"\nDone.\n"); n=arpa_lm? 
arpa_ng.n: ng.n; if (arpa_lm) { arpa_ng.context_cue = (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag)); arpa_ng.no_of_ccs = 0; if (strcmp(ccs_filename,"")) { context_cues_fp = rr_iopen(ccs_filename); while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) { if (strncmp(wlist_entry,"##",2)==0) continue; sscanf (wlist_entry, "%s ",current_cc); warn_on_wrong_vocab_comments(wlist_entry); if (sih_lookup(arpa_ng.vocab_ht,current_cc,¤t_cc_id) == 0) quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc); arpa_ng.context_cue[(unsigned short) current_cc_id] = 1; arpa_ng.no_of_ccs++; fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id); } rr_iclose(context_cues_fp); } } /* Process commands */ told_to_quit = 0; num_of_args = 0; while (!feof(stdin) && !told_to_quit) { printf("evallm : \n"); fgets(input_string, sizeof(input_string), stdin); if(strlen(input_string) < sizeof(input_string)-1) input_string[strlen(input_string)-1] = '\0'; //chop new-line else quit(1, "evallm input exceeds size of input buffer"); if (!feof(stdin)) { parse_comline(input_string,&num_of_args,args); log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0); backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc"); backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc"); backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc"); backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc"); include_unks = pc_flagarg(&num_of_args,args,"-include_unks"); fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list","")); text_stream_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-text","")); probs_stream_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-probs","")); annotation_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-annotate","")); oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs","")); 
generate_size = pc_intarg(&num_of_args,args,"-size",10000); random_seed = pc_intarg(&num_of_args,args,"-seed",-1); inconsistant_parameters = 0; if (backoff_from_unk_inc && backoff_from_unk_exc) { fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n"); fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n"); inconsistant_parameters = 1; } if (backoff_from_ccs_inc && backoff_from_ccs_exc) { fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n"); fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n"); inconsistant_parameters = 1; } if (num_of_args > 0) { if (!inconsistant_parameters) { if (!strcmp(args[0],"perplexity")) { compute_perplexity(&ng, &arpa_ng, text_stream_filename, probs_stream_filename, annotation_filename, oov_filename, fb_list_filename, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, arpa_lm, include_unks, log_base); }else /* do perplexity sentence by sentence [20090612] (air) */ if (!strcmp(args[0],"uttperp")) { FILE *uttfh,*tempfh; char utt[4096]; /* live dangerously... */ char tmpfil[128]; if ((uttfh = fopen(text_stream_filename,"r")) == NULL) { printf("Error: can't open %s\n",text_stream_filename); exit(1); } char *template = "uttperp_XXXXXX";// CHANGED HLW mkstemp(template);// CHANGED HLW
int oe_11_main(int argc,char** argv) { arpa_lm_t arpa_lm,lm1,lm2; char header1[MAX_HEADER]; char header2[MAX_HEADER]; flag backoff_from_unk_inc,backoff_from_unk_exc,backoff_from_ccs_inc,backoff_from_ccs_exc; char *lmfile1,*lmfile2,*newlmfile, *wtfile; char *fb_list_filename,*ccs_filename; fb_info *fb1,*fb2,*fb; double w1,w2; if (pc_flagarg(&argc, argv,"-help") || argc == 1) { oe_05_help_message(); exit(1); } lmfile1 = rr_salloc(pc_stringarg(&argc, argv,"-lm1","")); if (0 == strcmp(lmfile1, "")) { fprintf(stderr, "ERROR: Please specify a first input file with -lm1.\n"); oe_05_help_message(); } lmfile2 = rr_salloc(pc_stringarg(&argc, argv,"-lm2","")); if (0 == strcmp(lmfile2, "")) { fprintf(stderr, "ERROR: Please specify a second input file with -lm2.\n"); oe_05_help_message(); } newlmfile = rr_salloc(pc_stringarg(&argc, argv,"-lm","")); if (0 == strcmp(newlmfile, "")) { fprintf(stderr, "ERROR: Please specify a destination file with -lm.\n"); oe_05_help_message(); } fb_list_filename = rr_salloc(pc_stringarg(&argc, argv,"-forced_backoff","")); wtfile= rr_salloc(pc_stringarg(&argc, argv,"-weight","")); if (0 == strcmp(wtfile, "")) { fprintf(stderr, "ERROR: Please specify a weights file with -weight.\n"); oe_05_help_message(); } ccs_filename= rr_salloc(pc_stringarg(&argc, argv,"-context","")); backoff_from_unk_inc = pc_flagarg(&argc,argv,"-backoff_from_unk_inc"); backoff_from_ccs_inc = pc_flagarg(&argc,argv,"-backoff_from_ccs_inc"); backoff_from_unk_exc = pc_flagarg(&argc,argv,"-backoff_from_unk_exc"); backoff_from_ccs_exc = pc_flagarg(&argc,argv,"-backoff_from_ccs_exc"); robust_load_arpa_lm(&lm1,lmfile1,header1,MAX_HEADER); robust_load_arpa_lm(&lm2,lmfile2,header2,MAX_HEADER); load_weights(&w1,&w2,wtfile); printf ("\ncombine lms\n"); combine_lm(&arpa_lm,&lm1,&lm2); printf ("\nloading context cues.\n"); load_context_cue(&arpa_lm,ccs_filename); load_context_cue(&lm1,ccs_filename); load_context_cue(&lm2,ccs_filename); fb=gen_fb_list(arpa_lm.vocab_ht, arpa_lm.vocab_size, 
arpa_lm.vocab, arpa_lm.context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); fb1=gen_fb_list(lm1.vocab_ht, lm1.vocab_size, lm1.vocab, lm1.context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); fb2=gen_fb_list(lm2.vocab_ht, lm2.vocab_size, lm2.vocab, lm2.context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); printf ("\nrecaculate oov probabilities.\n"); recalc_oov_prob(&arpa_lm,&lm1,&lm2); printf ("\ncheck probabilities\n"); check_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2); printf ("\ncalculate interpolated probabilities\n"); calc_interpolated_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2); printf ("\ncalculate backoff weights\n"); calc_backoff_weight(&arpa_lm,fb); printf ("\nwrite interpolated lm\n"); write_interpolated_lm(&arpa_lm,newlmfile,header1,header2,2); printf ("\nfinished\n"); return 0; }
void combine_lm(arpa_lm_t *arpa_lm, arpa_lm_t *lm1, arpa_lm_t *lm2) { char *in_line; char *input_line; int i,j,k; int num_of_args; int pos_of_novelty; char *input_line_ptr_orig; char *word_copy; id__t *previous_ngram; id__t *current_ngram; vocab_sz_t temp_id; vocab_sz_t *pos_in_list; int previd; TBROWSE_UNION bru; char** words; words=(char**)NewArray(15,MAX_WORD,sizeof(char)); in_line = (char *) rr_malloc(1024*sizeof(char)); input_line = (char *) rr_malloc(1024*sizeof(char)); #import "OpenEarsStaticAnalysisToggle.h" #ifdef STATICANALYZEDEPENDENCIES #define __clang_analyzer__ 1 #endif #if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES) #undef __clang_analyzer__ input_line_ptr_orig = input_line; #endif /* Read number of each k-gram */ arpa_lm->table_sizes = (table_size_t *) rr_malloc(sizeof(table_size_t)*11); arpa_lm->num_kgrams = (ngram_sz_t *) rr_malloc(sizeof(ngram_sz_t)*11); calc_merged_ngram_num(arpa_lm, lm1, lm2); previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t)); current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t)); pos_in_list = (vocab_sz_t *) rr_malloc(sizeof(vocab_sz_t) * arpa_lm->n); ng_arpa_lm_alloc_struct(arpa_lm); /* Process 1-grams */ printf("Reading unigrams...\n"); i=0; begin_browse_union(lm1,lm2,1,&bru); while (get_next_ngram_union(words,&bru)) { word_copy = rr_salloc(words[0]); /* Do checks about open or closed vocab */ check_open_close_vocab(arpa_lm,word_copy,&i); } /* Process 2, ... 
, n-1 grams */ #import "OpenEarsStaticAnalysisToggle.h" #ifdef STATICANALYZEDEPENDENCIES #define __clang_analyzer__ 1 #endif #if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES) #undef __clang_analyzer__ previd = -1; for (i=2;i<=arpa_lm->n-1;i++) { printf("\nReading %d-grams...\n",i); previd = -1; j=0; for (k=0;k<=arpa_lm->n-1;k++) { pos_in_list[k] = 0; } begin_browse_union(lm1,lm2,i,&bru); while (get_next_ngram_union(words,&bru)) { /* Process line into all relevant temp_words */ num_of_args = 0; #endif sih_lookup(arpa_lm->vocab_ht,words[i-1],&temp_id); arpa_lm->word_id[i-1][j] = temp_id; show_dot(j); j++; if (j>arpa_lm->table_sizes[i-1]) { quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i); } /* Make sure that indexes in previous table point to the right thing. */ for (k=0;k<=i-1;k++) { previous_ngram[k] = current_ngram[k]; sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id); if (temp_id == 0 && strcmp(words[k],"<UNK>")) { quit(-1,"Error - found unknown word in n-gram file : %s\n", words[k]); } current_ngram[k] = temp_id; } /* Find position of novelty */ /*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/ if (j==1) pos_of_novelty=0; else { pos_of_novelty = i; for (k=0;k<=i-1;k++) { if (current_ngram[k] > previous_ngram[k]) { pos_of_novelty = k; k = arpa_lm->n; } else { if ((current_ngram[k] > previous_ngram[k]) && (j > 0)) { quit(-1,"Error : n-grams are not correctly ordered.\n"); } } } } if (pos_of_novelty == i && j != 1) quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n", i); if (pos_of_novelty != i-1) { if (i==2) { /* Deal with unigram pointers */ for (k = previd + 1; k <= current_ngram[0]; k++) { arpa_lm->ind[0][k] = new_index(j-1, arpa_lm->ptr_table[0], &(arpa_lm->ptr_table_size[0]), k); } previd = current_ngram[0]; }else { for (k=pos_of_novelty;k<=i-2;k++) { if (k == 0) { pos_in_list[0] = 
current_ngram[0]; } else { pos_in_list[k] = MIN(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]], arpa_lm->ptr_table[k-1], arpa_lm->ptr_table_size[k-1], pos_in_list[k-1]),pos_in_list[k]); while (arpa_lm->word_id[k][pos_in_list[k]] < current_ngram[k]) { pos_in_list[k]++; } } } for (k = previd + 1; k <= pos_in_list[i-2]; k++) { arpa_lm->ind[i-2][k] = new_index(j-1, arpa_lm->ptr_table[i-2], &(arpa_lm->ptr_table_size[i-2]), k); } previd = pos_in_list[i-2]; } } } /* Now need to tidy up pointers for bottom section of unigrams */ for (k = previd + 1; k <= arpa_lm->vocab_size; k++) { arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1], arpa_lm->ptr_table[0], &(arpa_lm->ptr_table_size[0]), k); } } printf("\nReading %d-grams...\n",arpa_lm->n); j = 0; previd = 0; arpa_lm->ind[arpa_lm->n-2][0] = 0; for (k=0;k<=arpa_lm->n-1;k++) { /* bug fixed by Wei Xu : this is a serious bug*/ pos_in_list[k] = 0; // pos_in_list[0] = 0; } begin_browse_union(lm1,lm2,arpa_lm->n,&bru); while (get_next_ngram_union(words,&bru)) { show_dot(j); sih_lookup(arpa_lm->vocab_ht,words[arpa_lm->n-1],&temp_id); arpa_lm->word_id[arpa_lm->n-1][j] = temp_id; j++; for (k=0;k<=arpa_lm->n-1;k++) { previous_ngram[k] = current_ngram[k]; sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id); if (temp_id == 0 && strcmp(words[k],"<UNK>")) { quit(-1,"Error - found unknown word in n-gram file : %s\n", words[k]); } current_ngram[k] = temp_id; } /* Find position of novelty */ /*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/ if (j==1) pos_of_novelty=0; else { pos_of_novelty = arpa_lm->n+1; for (k=0;k<=arpa_lm->n-1;k++) { if (current_ngram[k] > previous_ngram[k]) { pos_of_novelty = k; k = arpa_lm->n; }else { if ((current_ngram[k] > previous_ngram[k]) && (j>0)) { quit(-1,"Error : n-grams are not correctly ordered.\n"); } } } } if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) { quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n", arpa_lm->n); } if (pos_of_novelty != arpa_lm->n-1) { for 
(k=pos_of_novelty;k<=arpa_lm->n-2;k++) { if (k == 0) { pos_in_list[0] = current_ngram[0]; }else { pos_in_list[k] = MAX(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]], arpa_lm->ptr_table[k-1], arpa_lm->ptr_table_size[k-1], pos_in_list[k-1]),pos_in_list[k]); while (arpa_lm->word_id[k][pos_in_list[k]] < current_ngram[k]) { pos_in_list[k]++; } } } for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) { arpa_lm->ind[arpa_lm->n-2][k] = new_index(j-1, arpa_lm->ptr_table[arpa_lm->n-2], &(arpa_lm->ptr_table_size[arpa_lm->n-2]), k); } previd = pos_in_list[arpa_lm->n-2]; } if (j>arpa_lm->table_sizes[arpa_lm->n-1]) { quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n-1); } } /* Tidy up */ free(previous_ngram); free(current_ngram); free(in_line); free(input_line); DeleteArray(words); }