/**
 * Flag the context cues of an ARPA language model.
 *
 * Allocates lm->context_cue (one zeroed flag per entry of
 * lm->table_sizes[0]) and resets lm->no_of_ccs.  When ccs_filename is
 * non-empty, the file is read line by line: lines starting with "##" are
 * comments, and the first whitespace-delimited token of every other line
 * is taken as a context cue word.  Each cue must be present in
 * lm->vocab_ht; otherwise quit() is called with an error message.
 *
 * @param lm            language model whose context-cue table is filled in
 * @param ccs_filename  path of the context cues file, or "" for none
 */
void load_context_cue(arpa_lm_t* lm, char* ccs_filename)
{
  FILE *context_cues_fp;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  int parsed;

  lm->context_cue = (flag *) rr_calloc(lm->table_sizes[0],sizeof(flag));
  lm->no_of_ccs = 0;

  /* No context cues file given: leave every flag cleared. */
  if (ccs_filename == NULL || strcmp(ccs_filename,"") == 0)
    return;

  context_cues_fp = rr_iopen(ccs_filename);

  while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
    if (strncmp(wlist_entry,"##",2)==0)
      continue;

    /* The field width keeps a long token from overrunning current_cc. */
    parsed = sscanf (wlist_entry, "%199s",current_cc);
    warn_on_wrong_vocab_comments(wlist_entry);

    /* A line without any token (e.g. a blank line) names no cue; without
       this check the previous word would be counted a second time. */
    if (parsed != 1)
      continue;

    if (sih_lookup(lm->vocab_ht,current_cc,&current_cc_id) == 0)
      quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);

    /* Index with the full-width id: narrowing it to unsigned short would
       flag the wrong word for ids above 65535. */
    lm->context_cue[current_cc_id] = 1;
    lm->no_of_ccs++;
    fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,(long long) current_cc_id);
  }

  rr_iclose(context_cues_fp);
}
/**
 * Load the vocabulary of an n-gram model and flag its context cues.
 *
 * Creates ng->vocab_ht, fills it (together with ng->vocab and
 * ng->vocab_size) through read_voc(), and allocates ng->context_cue with
 * ng->vocab_size+1 zeroed flags.  When ng->context_set is true the
 * already-open stream ng->context_cues_fp is read line by line ("##"
 * lines are comments, the first token of any other line is a cue word)
 * and then closed.  Cue words missing from the vocabulary only produce a
 * warning.  Finally a warning is printed for each of "<s>", "<p>" and
 * "<art>" that is in the vocabulary but was not flagged as a context cue.
 *
 * @param ng         model being populated
 * @param verbosity  verbosity level forwarded to pc_message()/read_voc()
 */
void read_vocab(ng_t* ng, int verbosity)
{
  /* Markers that are expected to be context cues when they are in the
     vocabulary. */
  static char *expected_cues[] = { "<s>", "<p>", "<art>" };

  vocab_sz_t test_cc_id;
  vocab_sz_t current_cc_id;
  char current_cc[200];
  char wlist_entry[1024];
  int parsed;
  size_t i;

  pc_message(verbosity,2,"Reading vocabulary.\n");

  /* Don't change the parameters of sih_create, because that would change
     the binary layout of the .binlm file */
  ng->vocab_ht = sih_create(1000,0.5,2.0,1);

  read_voc(ng->vocab_filename,verbosity,ng->vocab_ht,&ng->vocab,&(ng->vocab_size));

  /* Determine which of the vocabulary words are context cues */
  ng->no_of_ccs = 0;
  ng->context_cue = (flag *) rr_calloc(ng->vocab_size+1,sizeof(flag));

  if (ng->context_set) {
    /* This should be tied to l889 to l894 in lm_combine.c */
    while (fgets (wlist_entry, sizeof (wlist_entry),ng->context_cues_fp)) {
      if (strncmp(wlist_entry,"##",2)==0)
        continue;

      /* The field width keeps a long token from overrunning current_cc. */
      parsed = sscanf (wlist_entry, "%199s",current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      /* A line without any token names no cue; skip it instead of
         re-processing whatever current_cc held before. */
      if (parsed != 1)
        continue;

      if (sih_lookup(ng->vocab_ht,current_cc,&current_cc_id) == 0)
        pc_message(verbosity,1,"Warning : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
      else {
        /* Index with the full-width id: narrowing it to unsigned short
           would flag the wrong word for ids above 65535. */
        ng->context_cue[current_cc_id] = 1;
        pc_message(verbosity,2,"Context cue word : %s id = %lld\n",current_cc,(long long) current_cc_id);
        ng->no_of_ccs++;
      }
    }
    rr_iclose(ng->context_cues_fp);
  }

  for (i = 0; i < sizeof(expected_cues)/sizeof(expected_cues[0]); i++) {
    if (sih_lookup(ng->vocab_ht,expected_cues[i],&test_cc_id) != 0 &&
        ng->context_cue[test_cc_id] == 0)
      fprintf(stderr,"WARNING: %s appears as a vocabulary item, but is not labelled as a\ncontext cue.\n",expected_cues[i]);
  }
}
int read_vocab(char* vocab_filename, int verbosity, struct idngram_hash_table* vocabulary, int M ) { FILE *vocab_file; int vocab_size; char temp_word[MAX_WORD_LENGTH]; char temp_word2[MAX_WORD_LENGTH]; vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary... \n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* printf("hey hey %s %d\n ", temp_word2, idngram_hash(temp_word2,M));*/ /* Check for repeated words in the vocabulary */ if (index2(vocabulary,temp_word2) != 0) warn_on_repeated_words(temp_word2); warn_on_wrong_vocab_comments(temp_word); vocab_size++; /* printf("%s %d\n ", temp_word2, idngram_hash(temp_word2,M));*/ add_to_idngram_hashtable(vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size); if(vocab_size == M){ quit(-1, "Number of entries reached the size of the hash. Run the program again with a larger has size -hash \n"); } } if (vocab_size > MAX_VOCAB_SIZE) fprintf(stderr,"text2idngram : vocab_size %d\n is larger than %d\n",vocab_size,MAX_VOCAB_SIZE); return 0; }
/**
 * Read a word list file into a string-to-id hash table.
 *
 * Lines starting with "##" are comments.  Every other line is one entry;
 * entries are numbered from 1 in file order and the first
 * whitespace-delimited token of the line is added to the table under that
 * number.  A '.' is printed to stdout every 1000 entries as a progress
 * indicator.
 *
 * @param wlist_filename  path of the word list file
 * @param verbosity       when non-zero (and verbose_cmuclmtk is 1) a
 *                        summary line is written to stderr
 * @param p_word_id_ht    hash table receiving the (word, id) pairs
 * @param p_n_wlist       receives the number of entries read
 */
void read_wlist_into_siht(char *wlist_filename, int verbosity,
			  sih_t *p_word_id_ht,  /** Pointer of the word ID hash table */
			  vocab_sz_t * p_n_wlist  /** Pointer of the size of wordlist */
			  )
{
  static char rname[]="read_wlist_into_siht";
  FILE *wlist_fp = rr_iopen(wlist_filename);
  char wlist_entry[1024], word[256], *word_copy;
  vocab_sz_t entry_no = 0;
  int parsed;

  while (fgets (wlist_entry, sizeof (wlist_entry), wlist_fp)) {
    if (strncmp(wlist_entry,"##",2) == 0)
      continue;

    entry_no++;

    if (entry_no%1000==0) {
      fprintf(stdout,".");
      fflush(stdout);
    }

    /* The field width keeps a long token from overrunning word[256]. */
    parsed = sscanf (wlist_entry, "%255s", word);
    warn_on_wrong_vocab_comments(wlist_entry);

    /* A line without a token keeps its entry number, so later ids are
       unchanged, but there is no word to register for it. */
    if (parsed != 1)
      continue;

    word_copy = rr_salloc(word);
    sih_add(p_word_id_ht, word_copy, entry_no);
    free(word_copy); /* HLW */
  }

  fprintf(stdout,"\n");
  fflush(stdout);

  rr_iclose(wlist_fp);

  if (verbose_cmuclmtk == 1) {
    if (verbosity)
      fprintf(stderr,"%s: a list of %d words was read from \"%s\".\n",
	      rname,(int) entry_no,wlist_filename);
  }

  *p_n_wlist = entry_no;
}
int oe_02_main (int argc, char **argv) { ng_t ng; arpa_lm_t arpa_ng; char input_string[500]; int num_of_args; char *args[MAX_ARGS]; char *lm_filename_arpa; char *lm_filename_binary; flag told_to_quit; flag inconsistant_parameters; flag backoff_from_unk_inc; flag backoff_from_unk_exc; flag backoff_from_ccs_inc; flag backoff_from_ccs_exc; flag arpa_lm; flag binary_lm; flag include_unks; char *fb_list_filename; char *probs_stream_filename; char *annotation_filename; char *text_stream_filename; char *oov_filename; char *ccs_filename; int generate_size; int random_seed; double log_base; char wlist_entry[1024]; char current_cc[200]; vocab_sz_t current_cc_id; FILE *context_cues_fp; int n; /* Process command line */ report_version(&argc,argv); if (pc_flagarg(&argc, argv,"-help") || argc == 1 || (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) { oe_02_help_message(); exit(1); } lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa","")); if (strcmp(lm_filename_arpa,"")) arpa_lm = 1; else arpa_lm = 0; lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary","")); if (strcmp(lm_filename_binary,"")) binary_lm = 1; else binary_lm = 0; if (arpa_lm && binary_lm) quit(-1,"Error : Can't use both -arpa and -binary flags.\n"); if (!arpa_lm && !binary_lm) quit(-1,"Error : Must specify either a binary or an arpa format language model.\n"); ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context","")); if (binary_lm && strcmp(ccs_filename,"")) fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n"); pc_report_unk_args(&argc,argv,2); /* Load language model */ if (arpa_lm) { fprintf(stderr,"Reading in language model from file %s\n", lm_filename_arpa); load_arpa_lm(&arpa_ng,lm_filename_arpa); }else { fprintf(stderr,"Reading in language model from file %s\n", lm_filename_binary); load_lm(&ng,lm_filename_binary); } fprintf(stderr,"\nDone.\n"); n=arpa_lm? 
arpa_ng.n: ng.n; if (arpa_lm) { arpa_ng.context_cue = (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag)); arpa_ng.no_of_ccs = 0; if (strcmp(ccs_filename,"")) { context_cues_fp = rr_iopen(ccs_filename); while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) { if (strncmp(wlist_entry,"##",2)==0) continue; sscanf (wlist_entry, "%s ",current_cc); warn_on_wrong_vocab_comments(wlist_entry); if (sih_lookup(arpa_ng.vocab_ht,current_cc,¤t_cc_id) == 0) quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc); arpa_ng.context_cue[(unsigned short) current_cc_id] = 1; arpa_ng.no_of_ccs++; fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id); } rr_iclose(context_cues_fp); } } /* Process commands */ told_to_quit = 0; num_of_args = 0; while (!feof(stdin) && !told_to_quit) { printf("evallm : \n"); fgets(input_string, sizeof(input_string), stdin); if(strlen(input_string) < sizeof(input_string)-1) input_string[strlen(input_string)-1] = '\0'; //chop new-line else quit(1, "evallm input exceeds size of input buffer"); if (!feof(stdin)) { parse_comline(input_string,&num_of_args,args); log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0); backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc"); backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc"); backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc"); backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc"); include_unks = pc_flagarg(&num_of_args,args,"-include_unks"); fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list","")); text_stream_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-text","")); probs_stream_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-probs","")); annotation_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-annotate","")); oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs","")); 
generate_size = pc_intarg(&num_of_args,args,"-size",10000); random_seed = pc_intarg(&num_of_args,args,"-seed",-1); inconsistant_parameters = 0; if (backoff_from_unk_inc && backoff_from_unk_exc) { fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n"); fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n"); inconsistant_parameters = 1; } if (backoff_from_ccs_inc && backoff_from_ccs_exc) { fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n"); fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n"); inconsistant_parameters = 1; } if (num_of_args > 0) { if (!inconsistant_parameters) { if (!strcmp(args[0],"perplexity")) { compute_perplexity(&ng, &arpa_ng, text_stream_filename, probs_stream_filename, annotation_filename, oov_filename, fb_list_filename, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, arpa_lm, include_unks, log_base); }else /* do perplexity sentence by sentence [20090612] (air) */ if (!strcmp(args[0],"uttperp")) { FILE *uttfh,*tempfh; char utt[4096]; /* live dangerously... */ char tmpfil[128]; if ((uttfh = fopen(text_stream_filename,"r")) == NULL) { printf("Error: can't open %s\n",text_stream_filename); exit(1); } char *template = "uttperp_XXXXXX";// CHANGED HLW mkstemp(template);// CHANGED HLW
int main(int argc, char *argv[]) { int verbosity; int vocab_size; FILE *vocab_file; int buffer_size; flag write_ascii; int max_files; int number_of_tempfiles; char *vocab_filename; char *idngram_filename; char temp_word[MAX_WORD_LENGTH]; char temp_word2[MAX_WORD_LENGTH]; char temp_word3[MAX_WORD_LENGTH]; flag contains_unks; int position_in_buffer; FILE *outfile; FILE *tempfile; FILE *non_unk_fp; ngram_rec *buffer; flag same_ngram; int i; int j; int fof_size; int size_of_rec; char temp_directory[1000]; char *temp_file_ext; /* Vocab hash table things */ struct idngram_hash_table vocabulary; unsigned long hash_size; unsigned long M; wordid_t *current_ngram; int current_count; wordid_t *sort_ngram; int sort_count; /* Process command line */ report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { /* Display help message */ help_message(); exit(1); } n = pc_intarg( &argc, argv, "-n",DEFAULT_N); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); fof_size = pc_intarg(&argc,argv,"-fof_size",10); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" )); if (!strcmp("",vocab_filename)) quit(-1,"Error : Must specify a vocabulary file.\n"); if (!strcmp("",idngram_filename)) quit(-1,"text2idngram : Error : Must specify idngram file.\n"); if (pc_flagarg(&argc,argv,"-compress")) temp_file_ext = salloc(".Z"); else { if (pc_flagarg(&argc,argv,"-gzip")) temp_file_ext = salloc(".gz"); else temp_file_ext = salloc(""); } strcpy(temp_directory, "cmuclmtk-XXXXXX"); if (mkdtemp(temp_directory) == NULL) { quit(-1, "Failed to create temporary folder: %s\n", strerror(errno)); } pc_report_unk_args(&argc,argv,verbosity); outfile = 
rr_fopen(idngram_filename,"wb"); pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"Output idngram : %s\n",idngram_filename); pc_message(verbosity,2,"Buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"n : %d\n",n); pc_message(verbosity,2,"FOF size : %d\n",fof_size); size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16); buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec))); fprintf(stderr,"buffer size = %d\n",buffer_size); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_idngram_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for vocabulary order */ if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n"); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) warn_on_repeated_words(temp_word); warn_on_wrong_vocab_comments(temp_word); vocab_size++; add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size); strcpy(temp_word3,temp_word2); } if (vocab_size > MAX_VOCAB_SIZE) quit(-1,"Error : Vocabulary size exceeds maximum.\n"); pc_message(verbosity,2,"Allocating memory for the buffer...\n"); buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec)); for (i=0;i<=buffer_size;i++) buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Open the "non-OOV" tempfile */ sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext); non_unk_fp = rr_fopen(temp_word,"w"); pc_message(verbosity,2,"Writing non-OOV counts to 
temporary file %s\n", temp_word); number_of_tempfiles = 1; current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Read text into buffer */ position_in_buffer = 0; while (!rr_feof(stdin)) { for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); current_ngram[i]=index2(&vocabulary,temp_word); } if (scanf("%d",¤t_count) != 1) if (!rr_feof(stdin)) quit(-1,"Error reading n-gram count from stdin.\n"); if (!rr_feof(stdin)) { contains_unks = 0; for (i=0;i<=n-1;i++) { if (!current_ngram[i]) contains_unks = 1; } if (contains_unks) { /* Write to buffer */ position_in_buffer++; if (position_in_buffer >= buffer_size) { /* Sort buffer */ pc_message(verbosity,2, "Sorting n-grams which include an OOV word...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); pc_message(verbosity,2,"Done.\n"); /* Write buffer to temporary file */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2, "Writing sorted OOV-counts buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=0;i<=position_in_buffer-2;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); rr_oclose(tempfile); position_in_buffer = 1; } for (i=0;i<=n-1;i++) 
buffer[position_in_buffer-1].word[i] = current_ngram[i]; buffer[position_in_buffer-1].count = current_count; }else { /* Write to temporary file */ for (i=0;i<=n-1;i++) rr_fwrite((char*)¤t_ngram[i],sizeof(wordid_t),1, non_unk_fp,"temporary n-gram ids"); rr_fwrite((char*)¤t_count,sizeof(int),1,non_unk_fp, "temporary n-gram counts"); } } } if (position_in_buffer > 0) { /* Only do this bit if we have actually seen some OOVs */ /* Sort final buffer */ pc_message(verbosity,2,"Sorting final buffer...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); /* Write final buffer */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=1;i<=position_in_buffer-1;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); fclose(tempfile); } /* Merge the temporary files, and output the result */ fclose(non_unk_fp); pc_message(verbosity,2,"Merging temporary files...\n"); merge_idngramfiles(1, number_of_tempfiles, temp_directory, temp_file_ext, max_files, outfile, write_ascii, fof_size, n); fclose(outfile); rmdir(temp_directory); pc_message(verbosity,0,"wngram2idngram : Done.\n"); return 0; }