void load_context_cue(arpa_lm_t* lm, char* ccs_filename)
{
  FILE* context_cues_fp;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;

  lm->context_cue = (flag *) rr_calloc(lm->table_sizes[0], sizeof(flag));
  lm->no_of_ccs = 0;

  if (strcmp(ccs_filename, "")) {
    context_cues_fp = rr_iopen(ccs_filename);

    while (fgets(wlist_entry, sizeof(wlist_entry), context_cues_fp)) {
      if (strncmp(wlist_entry, "##", 2) == 0)
        continue;
      sscanf(wlist_entry, "%s ", current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(lm->vocab_ht, current_cc, &current_cc_id) == 0)
        quit(-1, "Error : %s in the context cues file does not appear in the vocabulary.\n", current_cc);

      lm->context_cue[(unsigned short) current_cc_id] = 1;
      lm->no_of_ccs++;
      fprintf(stderr, "Context cue word : %s id = %lld\n",
              current_cc, (long long) current_cc_id);  /* cast so the argument matches %lld */
    }
    rr_iclose(context_cues_fp);
  }
}
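/*
 * Sketch of the .ccs (context cues) file format that the loop above
 * implies: one vocabulary word per line, with lines beginning "##"
 * treated as comments.  The words below are a hypothetical example
 * (they match the sentence/paragraph/article markers checked for in
 * read_vocab()):
 *
 *   ## context cues for a broadcast-news style vocabulary
 *   <s>
 *   <p>
 *   <art>
 */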
void calc_mem_req(ng_t *ng, flag is_ascii)
{
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int i, j;

  current_ngram.id_array = (id__t *) rr_malloc(sizeof(id__t) * ng->n);
  previous_ngram.id_array = (id__t *) rr_malloc(sizeof(id__t) * ng->n);
  ng_count = (count_t *) rr_calloc(ng->n, sizeof(count_t));
  current_ngram.n = ng->n;

  rewind(ng->id_gram_fp);

  while (!rr_feof(ng->id_gram_fp)) {

    for (i = 0; i <= ng->n - 1; i++)
      previous_ngram.id_array[i] = current_ngram.id_array[i];

    get_ngram(ng->id_gram_fp, &current_ngram, is_ascii);

    for (i = 0; i <= ng->n - 1; i++) {
      if (current_ngram.id_array[i] != previous_ngram.id_array[i]) {
        /* The n-gram differs from the previous one at position i, so
           positions i..n-1 all begin new entries; close off the old
           counts first, respecting the cutoffs. */
        for (j = i; j <= ng->n - 1; j++) {
          if (j > 0) {
            if (ng_count[j] > ng->cutoffs[j-1])
              ng->table_sizes[j]++;
          }
          ng_count[j] = current_ngram.count;
        }
        i = ng->n;
      } else
        ng_count[i] += current_ngram.count;
    }
  }

  for (i = 1; i <= ng->n - 1; i++) {
    if (ng_count[i] > ng->cutoffs[i-1])
      ng->table_sizes[i]++;
  }

  /* Add a fudge factor, as problems can crop up with having to cut
     off the last few n-grams. */
  for (i = 1; i <= ng->n - 1; i++)
    ng->table_sizes[i] += 10;

  rr_iclose(ng->id_gram_fp);
  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);
}
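/*
 * Worked illustration of what calc_mem_req() counts, assuming a sorted
 * id 2-gram stream (the word IDs A, B, C are hypothetical).  Given the
 * stream (A B):3  (A C):1  (B C):2 with cutoffs[0] = 1, the cutoff test
 * "count > cutoff" keeps (A B) and (B C) but drops (A C), so the loop
 * increments table_sizes[1] twice; the fudge loop then adds 10, for a
 * total growth of 12 entries in the bigram table.
 */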
void read_vocab(ng_t* ng, int verbosity)
{
  vocab_sz_t test_cc_id;
  vocab_sz_t current_cc_id;
  char current_cc[200];
  char wlist_entry[1024];

  pc_message(verbosity, 2, "Reading vocabulary.\n");

  /* Don't change the parameters of sih_create, because doing so would
     change the binary layout of the .binlm file. */
  ng->vocab_ht = sih_create(1000, 0.5, 2.0, 1);

  read_voc(ng->vocab_filename, verbosity, ng->vocab_ht,
           &ng->vocab, &(ng->vocab_size));

  /* Determine which of the vocabulary words are context cues */
  ng->no_of_ccs = 0;
  ng->context_cue = (flag *) rr_calloc(ng->vocab_size + 1, sizeof(flag));

  if (ng->context_set) {
    /* This should be kept in sync with ll. 889-894 in lm_combine.c */
    while (fgets(wlist_entry, sizeof(wlist_entry), ng->context_cues_fp)) {
      if (strncmp(wlist_entry, "##", 2) == 0)
        continue;
      sscanf(wlist_entry, "%s ", current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(ng->vocab_ht, current_cc, &current_cc_id) == 0)
        pc_message(verbosity, 1, "Warning : %s in the context cues file does not appear in the vocabulary.\n", current_cc);
      else {
        ng->context_cue[(unsigned short) current_cc_id] = 1;
        pc_message(verbosity, 2, "Context cue word : %s id = %d\n",
                   current_cc, (int) current_cc_id);
        ng->no_of_ccs++;
      }
    }
    rr_iclose(ng->context_cues_fp);
  }

  if (sih_lookup(ng->vocab_ht, "<s>", &test_cc_id) != 0)
    if (ng->context_cue[(unsigned short) test_cc_id] == 0)
      fprintf(stderr, "WARNING: <s> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

  if (sih_lookup(ng->vocab_ht, "<p>", &test_cc_id) != 0)
    if (ng->context_cue[(unsigned short) test_cc_id] == 0)
      fprintf(stderr, "WARNING: <p> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

  if (sih_lookup(ng->vocab_ht, "<art>", &test_cc_id) != 0)
    if (ng->context_cue[(unsigned short) test_cc_id] == 0)
      fprintf(stderr, "WARNING: <art> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
}
void load_weights(double* w1, double* w2, char* file)
{
  FILE* fp;
  fp = rr_iopen(file);

  /* "%*s%lf" skips a label token, then reads the weight value, so a
     successful conversion returns 1. */
  if (fscanf(fp, "%*s%lf", w1) != 1)
    quit(-1, "Error in reading weight file\n");
  if (fscanf(fp, "%*s%lf", w2) != 1)
    quit(-1, "Error in reading weight file\n");

  rr_iclose(fp);
}
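/* Minimal usage sketch for load_weights(), assuming a two-line weight
   file of the form "<label> <value>" per line (which is what the
   "%*s%lf" format above consumes).  The file name is hypothetical. */
#ifdef LOAD_WEIGHTS_DEMO
static void load_weights_demo(void)
{
  double w1, w2;
  load_weights(&w1, &w2, "interpolation.weights");  /* hypothetical path */
  /* For linear interpolation the two weights would typically sum to
     1.0, though load_weights() itself does not enforce that. */
  fprintf(stderr, "w1 = %f, w2 = %f\n", w1, w2);
}
#endif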
void read_wlist_into_siht(char *wlist_filename, int verbosity,
                          sih_t *p_word_id_ht,   /** Pointer to the word ID hash table */
                          vocab_sz_t *p_n_wlist  /** Pointer to the size of the word list */
                          )
{
  static char rname[] = "read_wlist_into_siht";
  FILE *wlist_fp = rr_iopen(wlist_filename);
  char wlist_entry[1024], word[256], *word_copy;
  vocab_sz_t entry_no = 0;

  while (fgets(wlist_entry, sizeof(wlist_entry), wlist_fp)) {
    if (strncmp(wlist_entry, "##", 2) == 0)
      continue;
    entry_no++;

    /* printf("entry no %lld, wlist_entry %s\n", entry_no, wlist_entry); */

    /* Progress indicator: one dot per 1000 entries */
    if (entry_no % 1000 == 0) {
      fprintf(stdout, ".");
      fflush(stdout);
    }

    sscanf(wlist_entry, "%s ", word);
    warn_on_wrong_vocab_comments(wlist_entry);
    word_copy = rr_salloc(word);
    sih_add(p_word_id_ht, word_copy, entry_no);
    free(word_copy); /* HLW */
  }
  fprintf(stdout, "\n");
  fflush(stdout);
  rr_iclose(wlist_fp);

  if (verbose_cmuclmtk == 1) {
    if (verbosity)
      fprintf(stderr, "%s: a list of %d words was read from \"%s\".\n",
              rname, (int) entry_no, wlist_filename);
  }
  *p_n_wlist = entry_no;
}
void read_voc(char *filename, int verbosity, sih_t *p_vocab_ht,
              char ***p_vocab, unsigned short *p_vocab_size)
     /* p_vocab == NULL means: build only a hash table */
{
  /* static char rname[] = "rd_voc"; */  /* Never used anyway! */
  char *pperiod;
  int vocab_size;
  vocab_sz_t n_wlist;  /* read_wlist_into_siht() takes a vocab_sz_t *, so
                          read into a correctly typed temporary rather
                          than punning an int's address */

  pperiod = rindex(filename, '.');
  if (pperiod == NULL)
    pperiod = filename - 1;

  if (strcmp(pperiod + 1, "vocab_ht") == 0) {  /* file == hash table */
    FILE *fp = rr_iopen(filename);
    sih_val_read_from_file(p_vocab_ht, fp, filename, verbosity);
    rr_iclose(fp);
    vocab_size = p_vocab_ht->nentries;
    if (p_vocab != NULL) {
      get_vocab_from_vocab_ht(p_vocab_ht, vocab_size, verbosity, p_vocab);
      (*p_vocab)[0] = salloc("<UNK>");
    }
  } else {                                     /* file == vocab (ascii) */
    read_wlist_into_siht(filename, verbosity, p_vocab_ht, &n_wlist);
    vocab_size = (int) n_wlist;
    if (p_vocab != NULL) {
      read_wlist_into_array(filename, verbosity, p_vocab, &vocab_size);
      (*p_vocab)[0] = salloc("<UNK>");
    }
  }
  if (p_vocab_size)
    *p_vocab_size = vocab_size;
}
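/*
 * Usage sketch for read_voc(), reflecting the two input conventions the
 * extension check above implies: a file ending in ".vocab_ht" is read
 * as a saved hash table, anything else as an ASCII word list.  The file
 * names are hypothetical.
 *
 *   sih_t ht;
 *   char **vocab;
 *   unsigned short vocab_size;
 *
 *   read_voc("wsj.vocab",    2, &ht, &vocab, &vocab_size);  // ascii list
 *   read_voc("wsj.vocab_ht", 2, &ht, NULL,   NULL);         // hash table only
 *
 * In both cases slot 0 of the returned vocabulary array is set to
 * "<UNK>" when an array is requested.
 */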
void compute_perplexity(ng_t *ng,
                        arpa_lm_t *arpa_ng,
                        char *text_stream_filename,
                        char *probs_stream_filename,
                        char *annotation_filename,
                        char *oov_filename,
                        char *fb_list_filename,
                        flag backoff_from_unk_inc,
                        flag backoff_from_unk_exc,
                        flag backoff_from_ccs_inc,
                        flag backoff_from_ccs_exc,
                        flag arpa_lm,
                        flag include_unks,
                        double log_base)
{
  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* Hope that's big enough */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Initialise the file pointers to prevent warnings from the compiler. */
  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;

  short_current_id = 0;
  found_unk_wrongly = 0;
  annotate = 0;
  bo_case = 0;

  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
                          (int) arpa_ng->vocab_size,
                          arpa_ng->vocab,
                          arpa_ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  } else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
                          (int) ng->vocab_size,
                          ng->vocab,
                          ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  }

  ngrams_hit = (int *) rr_calloc(n, sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *) * n);
  for (i = 0; i <= n - 1; i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char) * 1000);

  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid.  Note that the checks employed by the
     standard rr_fopen tools are not suitable here, since we don't want
     the program to terminate if the paths are not found. */

  if (!strcmp(text_stream_filename, "")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    return;
  }

  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename, "-")) {
    printf("Error : Can't open file %s for reading.\n", text_stream_filename);
    return;
  }

  out_probs = strcmp(probs_stream_filename, "");
  annotate = strcmp(annotation_filename, "");
  out_oovs = strcmp(oov_filename, "");

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n", text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
           probs_stream_filename);
  if (annotate)
    printf("Annotation will be written to file %s\n",
           annotation_filename);
  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
           oov_filename);
  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");
  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");
  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");
  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");
  if (strcmp(fb_list_filename, ""))
    printf("Will force back-off according to the contents of %s\n",
           fb_list_filename);
  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check for the existence of the files ourselves, as the rr
     functions will quit, which isn't what we want. */

  if (out_probs && strcmp(probs_stream_filename, "-")) {
    if ((temp_fp = fopen(probs_stream_filename, "w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n", probs_stream_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (annotate && strcmp(annotation_filename, "-")) {
    if ((temp_fp = fopen(annotation_filename, "w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n", annotation_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (out_oovs && strcmp(oov_filename, "-")) {
    if ((temp_fp = fopen(oov_filename, "w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n", oov_filename);
      return;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);
  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);
  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t) * (n - 1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    /* Shift the history of previous words along by one */
    if (total_words > 0) {
      if (total_words < n)
        strcpy(prev_words[total_words-1], current_word);
      else {
        for (i = 0; i <= n-3; i++)
          strcpy(prev_words[i], prev_words[i+1]);
        if (n > 1)
          strcpy(prev_words[n-2], current_word);
      }
    }

    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill the context with the right stuff */
    if (total_words > (n-1)) {
      for (i = 0; i <= context_length-2; i++)
        context[i] = context[i+1];
    }

    if (context_length != 0)
      context[context_length-1] = short_current_id;

    if (fscanf(text_stream_fp, "%s", current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
        printf("Error reading text file.\n");
        return;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      if (arpa_lm) {
        sih_lookup(arpa_ng->vocab_ht, current_word, &current_id);
        if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n", current_word);
        }
        if (current_id > arpa_ng->vocab_size)
          quit(-1, "Error : returned value from sih_lookup (%d) is too high.\n",
               (int) current_id);  /* was context[i], which is stale here */
        else
          short_current_id = current_id;
      } else {
        sih_lookup(ng->vocab_ht, current_word, &current_id);
        if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n", current_word);
        }
        if (current_id > ng->vocab_size)
          quit(-1, "Error : returned value from sih_lookup (%d) is too high.\n",
               (int) current_id);  /* was context[i], which is stale here */
        else
          short_current_id = current_id;
      }

      if (!found_unk_wrongly) {

        if (current_id == 0 && out_oovs)
          fprintf(oov_fp, "%s\n", current_word);

        if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
            || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

          if (include_unks || current_id != 0) {

            prob = calc_prob_of(short_current_id,
                                context,
                                context_length,
                                ng,
                                arpa_ng,
                                fb_list,
                                &bo_case,
                                &actual_context_length,
                                arpa_lm);

            if (prob <= 0.0 || prob > 1.0) {
              fprintf(stderr, "Warning : ");
              if (short_current_id == 0)
                fprintf(stderr, "P( <UNK> | ");
              else
                fprintf(stderr, "P( %s | ", current_word);
              for (i = 0; i <= actual_context_length-1; i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(stderr, "<UNK> ");
                else
                  fprintf(stderr, "%s ", prev_words[i]);
              }
              fprintf(stderr, ") = %g logprob = %g\n",
                      prob, log(prob) / log(log_base));
              fprintf(stderr, "bo_case == 0x%x, actual_context_length == %d\n",
                      bo_case, actual_context_length);  /* was the broken format "0x%dx" */
            }

            if (annotate) {
              if (short_current_id == 0)
                fprintf(annotation_fp, "P( <UNK> | ");
              else
                fprintf(annotation_fp, "P( %s | ", current_word);
              for (i = 0; i <= actual_context_length-1; i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(annotation_fp, "<UNK> ");
                else {
                  if (arpa_lm)
                    fprintf(annotation_fp, "%s ",
                            arpa_ng->vocab[context[i+context_length-actual_context_length]]);
                  else
                    fprintf(annotation_fp, "%s ",
                            ng->vocab[context[i+context_length-actual_context_length]]);
                }
              }
              fprintf(annotation_fp, ") = %g logprob = %f bo_case = ",
                      prob, log(prob) / log(log_base));
              decode_bo_case(bo_case, actual_context_length, annotation_fp);
            }

            /* Calculate the level to which we backed off */

            for (i = actual_context_length-1; i >= 0; i--) {
              int four_raise_i = 1 << (2*i);  /* PWP */

              /*
               * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
               * but was getting a divide-by-zero error on an Alpha
               * (it isn't clear to me why it should ever have done so).
               * Anyway, it is much faster to do in base 4.
               */
              if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
                ngrams_hit[i+1]++;
                i = -2;
              } else
                bo_case -= ((bo_case / four_raise_i) * four_raise_i);
            }

            if (i != -3)
              ngrams_hit[0]++;

            if (out_probs)
              fprintf(probs_stream_fp, "%g\n", prob);

            sum_log_prob += log10(prob);
          }

          if (current_id == 0 && !include_unks)
            excluded_unks++;
        }
        else {
          if (((!arpa_lm) && ng->context_cue[current_id])
              || (arpa_lm && arpa_ng->context_cue[current_id]))
            excluded_ccs++;
        }
        total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {  /* pow(x,y) = e**(y ln(x)) */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n",
           exp(-sum_log_prob / (total_words - excluded_ccs - excluded_unks) * log(10.0)),
           (-sum_log_prob / (total_words - excluded_ccs - excluded_unks) * log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
           total_words - excluded_ccs - excluded_unks);
    for (i = n; i >= 1; i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n", i, ngrams_hit[i-1],
             (float) 100 * ngrams_hit[i-1] / (total_words - excluded_ccs - excluded_unks));
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
           excluded_unks,
           (float) 100 * excluded_unks / (total_words - excluded_ccs),
           excluded_ccs);
  }

  rr_iclose(text_stream_fp);
  if (out_probs)
    rr_oclose(probs_stream_fp);
  if (annotate)
    rr_oclose(annotation_fp);
  if (out_oovs)
    rr_oclose(oov_fp);

  free(fb_list);
  free(context);
  free(ngrams_hit);
}
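/*
 * For reference, the quantities printed at the end of
 * compute_perplexity() follow directly from the accumulated base-10
 * log probability: with N = total_words - excluded_ccs - excluded_unks
 * scored words,
 *
 *   perplexity = 10^(-sum_log_prob / N) = exp(-(sum_log_prob / N) * ln 10)
 *   entropy    = log2(perplexity)       = -(sum_log_prob / N) * ln 10 / ln 2
 *
 * which is exactly what the exp()/log() expressions above evaluate.
 */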
int oe_02_main(int argc, char **argv)
{
  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process the command line */
  report_version(&argc, argv);

  if (pc_flagarg(&argc, argv, "-help")
      || argc == 1
      || (strcmp(argv[1], "-binary") && strcmp(argv[1], "-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv, "-arpa", ""));
  if (strcmp(lm_filename_arpa, ""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv, "-binary", ""));
  if (strcmp(lm_filename_binary, ""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1, "Error : Can't use both -arpa and -binary flags.\n");
  if (!arpa_lm && !binary_lm)
    quit(-1, "Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv, "-context", ""));
  if (binary_lm && strcmp(ccs_filename, ""))
    fprintf(stderr, "Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc, argv, 2);

  /* Load the language model */
  if (arpa_lm) {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_arpa);
    load_arpa_lm(&arpa_ng, lm_filename_arpa);
  } else {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_binary);
    load_lm(&ng, lm_filename_binary);
  }

  fprintf(stderr, "\nDone.\n");

  n = arpa_lm ? arpa_ng.n : ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = (flag *) rr_calloc(arpa_ng.table_sizes[0], sizeof(flag));
    arpa_ng.no_of_ccs = 0;

    if (strcmp(ccs_filename, "")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets(wlist_entry, sizeof(wlist_entry), context_cues_fp)) {
        if (strncmp(wlist_entry, "##", 2) == 0)
          continue;
        sscanf(wlist_entry, "%s ", current_cc);
        warn_on_wrong_vocab_comments(wlist_entry);

        if (sih_lookup(arpa_ng.vocab_ht, current_cc, &current_cc_id) == 0)
          quit(-1, "Error : %s in the context cues file does not appear in the vocabulary.\n", current_cc);

        arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
        arpa_ng.no_of_ccs++;
        fprintf(stderr, "Context cue word : %s id = %lld\n",
                current_cc, (long long) current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if (strlen(input_string) < sizeof(input_string) - 1)
      input_string[strlen(input_string) - 1] = '\0';  /* chop the new-line */
    else
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string, &num_of_args, args);

      log_base = pc_doublearg(&num_of_args, args, "-log_base", 10.0);
      backoff_from_unk_inc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args, args, "-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-backoff_from_list", ""));
      text_stream_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-text", ""));
      probs_stream_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-probs", ""));
      annotation_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-annotate", ""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-oovs", ""));
      generate_size = pc_intarg(&num_of_args, args, "-size", 10000);
      random_seed = pc_intarg(&num_of_args, args, "-seed", -1);

      inconsistant_parameters = 0;
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
        inconsistant_parameters = 1;
      }
      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
        inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {
        if (!inconsistant_parameters) {
          if (!strcmp(args[0], "perplexity")) {
            compute_perplexity(&ng,
                               &arpa_ng,
                               text_stream_filename,
                               probs_stream_filename,
                               annotation_filename,
                               oov_filename,
                               fb_list_filename,
                               backoff_from_unk_inc,
                               backoff_from_unk_exc,
                               backoff_from_ccs_inc,
                               backoff_from_ccs_exc,
                               arpa_lm,
                               include_unks,
                               log_base);
          } else if (!strcmp(args[0], "uttperp")) {
            /* do perplexity sentence by sentence [20090612] (air) */
            FILE *uttfh, *tempfh;
            char utt[4096];  /* live dangerously... */
            char tmpfil[128];

            if ((uttfh = fopen(text_stream_filename, "r")) == NULL) {
              printf("Error: can't open %s\n", text_stream_filename);
              exit(1);
            }
            /* CHANGED HLW; note the template must be a writable array,
               not a string-literal pointer, since mkstemp() modifies
               it in place. */
            char template[] = "uttperp_XXXXXX";
            mkstemp(template);  /* CHANGED HLW */
int oe_15_main(int argc, char **argv)
{
  FILE **fin;
  ngram *ng;
  ngram outng;
  flag *done, finished;
  int i, j, nfiles;

  /* Process the command line */
  report_version(&argc, argv);
  procComLine(&argc, argv);
  if (argc < 2) {
    printUsage(argv[0]);
    exit(1);
  }
  nfiles = argc - 1;

  /* Allocate memory */
  fin  = (FILE **) rr_malloc(sizeof(FILE *) * nfiles);
  done = (flag *) rr_malloc(sizeof(flag) * nfiles);
  ng   = (ngram *) rr_malloc(sizeof(ngram) * nfiles);
  for (i = 0; i < nfiles; i++) {
    ng[i].id_array = (id__t *) rr_calloc(n, sizeof(id__t));
    ng[i].n = n;
  }
  outng.id_array = (id__t *) rr_calloc(n, sizeof(id__t));
  outng.n = n;

  /* Open the input files */
  for (i = 0; i < nfiles; i++)
    fin[i] = rr_iopen(argv[i+1]);

  /* Read the first n-gram from each file */
  for (i = 0; i < nfiles; i++) {
    done[i] = 0;
    if (!get_ngram(fin[i], &ng[i], ascii_in))
      done[i] = 1;
  }

  finished = 0;
  while (!finished) {
    /* Set outng to the maximum possible n-gram */
    for (i = 0; i < n; i++)
      outng.id_array[i] = MAX_VOCAB_SIZE;

    /* Find the smallest n-gram */
    for (i = 0; i < nfiles; i++) {
      if (!done[i])
        if (cmp_ngram(&outng, &ng[i]) > 0)
          for (j = 0; j < n; j++)
            outng.id_array[j] = ng[i].id_array[j];
    }

    /* Add the counts of all equal n-grams, advancing each file that
       contributed */
    outng.count = 0;
    for (i = 0; i < nfiles; i++) {
      if (!done[i]) {
        if (cmp_ngram(&outng, &ng[i]) == 0) {
          outng.count += ng[i].count;
          if (!get_ngram(fin[i], &ng[i], ascii_in)) {
            /* Check whether all files are done */
            done[i] = 1;
            finished = 1;
            for (j = 0; j < nfiles; j++)
              if (!done[j])
                finished = 0;
          }
        }
      }
    }

    write_ngram(stdout, &outng, ascii_out);
  }

  for (i = 0; i < nfiles; i++)
    rr_iclose(fin[i]);

  fprintf(stderr, "mergeidngram : Done.\n");

  return 0;
}
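/* A plausible shape for cmp_ngram(), shown only to make the merge loop
   above self-explanatory: it is assumed to compare two equal-length
   n-grams lexicographically by word ID, returning <0, 0 or >0 in the
   manner of strcmp().  The real definition lives elsewhere in the
   toolkit and may differ in detail. */
#ifdef CMP_NGRAM_SKETCH
static int cmp_ngram_sketch(const ngram *a, const ngram *b)
{
  int i;
  for (i = 0; i < a->n; i++) {
    if (a->id_array[i] < b->id_array[i]) return -1;
    if (a->id_array[i] > b->id_array[i]) return 1;
  }
  return 0;  /* identical word-ID sequence */
}
#endif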
void merge_tempfiles(int start_file,
                     int end_file,
                     char *temp_file_root,
                     char *temp_file_ext,
                     int max_files,
                     FILE *outfile,
                     int n,
                     int verbosity)
{
  FILE *new_temp_file;
  char *new_temp_filename;
  FILE **temp_file;
  char **temp_filename;
  char **current_ngram;
  char smallest_ngram[1000];
  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  char temp_word[500];
  int i, j;

  pc_message(verbosity, 2, "Merging temp files %d through %d...\n",
             start_file, end_file);

  /*
   * If we try to do more than max_files, then merge into groups,
   * then merge the groups recursively.
   */
  if (end_file - start_file + 1 > max_files) {
    int new_start_file, new_end_file;
    int n_file_groups = 1 + (end_file - start_file) / max_files;

    fprintf(stderr, "%d files to do, in %d groups\n",
            end_file - start_file, n_file_groups);

    new_temp_filename = (char *) rr_malloc(300 * sizeof(char));

    /*
     * These n_file_groups sets of files will be done in groups of
     * max_files batches each, as temp files numbered
     * end_file+1 ... end_file+n_file_groups,
     * and then these will be merged into the final result.
     */
    for (i = 0; i < n_file_groups; i++) {
      /* do files i*max_files through min((i+1)*max_files-1, end_file) */
      new_start_file = start_file + (i * max_files);
      new_end_file = start_file + ((i+1) * max_files) - 1;
      if (new_end_file > end_file)
        new_end_file = end_file;

      sprintf(new_temp_filename, "%s/%d%s",  /* was "%hu" with an int argument */
              temp_file_root, end_file + i + 1, temp_file_ext);
      new_temp_file = rr_oopen(new_temp_filename);
      merge_tempfiles(new_start_file, new_end_file,
                      temp_file_root, temp_file_ext,
                      max_files, new_temp_file, n, verbosity);
      rr_oclose(new_temp_file);  /* opened with rr_oopen(), so close with rr_oclose() */
    }
    merge_tempfiles(end_file + 1, end_file + n_file_groups,
                    temp_file_root, temp_file_ext,
                    max_files, outfile, n, verbosity);
    return;
  }

  /*
   * We know we are now doing <= max_files.
   */
  temp_file = (FILE **) rr_malloc((end_file + 1) * sizeof(FILE *));
  temp_filename = (char **) rr_malloc((end_file + 1) * sizeof(char *));
  for (i = start_file; i <= end_file; i++)
    temp_filename[i] = (char *) rr_malloc(300 * sizeof(char));
  current_ngram = (char **) rr_malloc((end_file + 1) * sizeof(char *));
  for (i = start_file; i <= end_file; i++)
    current_ngram[i] = (char *) rr_malloc(1000 * sizeof(char));
  current_ngram_count = (int *) rr_malloc((end_file + 1) * sizeof(int));
  finished = (flag *) rr_malloc(sizeof(flag) * (end_file + 1));

  /* Open all the temp files for reading */
  for (i = start_file; i <= end_file; i++) {
    sprintf(temp_filename[i], "%s/%d%s",  /* was "%hu" with an int argument */
            temp_file_root, i, temp_file_ext);
    temp_file[i] = rr_iopen(temp_filename[i]);
  }

  /* Now go through the files simultaneously, and write out the
     appropriate n-gram counts to the output file. */
  for (i = start_file; i <= end_file; i++) {
    finished[i] = 0;
    if (!rr_feof(temp_file[i])) {
      for (j = 0; j <= n - 1; j++) {
        if (fscanf(temp_file[i], "%s", temp_word) != 1) {
          if (!rr_feof(temp_file[i]))
            quit(-1, "Error reading temp file %s\n", temp_filename[i]);
        } else {
          if (j == 0)
            strcpy(current_ngram[i], temp_word);
          else {
            strcat(current_ngram[i], " ");
            strcat(current_ngram[i], temp_word);
          }
        }
      }
      if (fscanf(temp_file[i], "%d", &current_ngram_count[i]) != 1) {
        if (!rr_feof(temp_file[i]))
          quit(-1, "Error reading temp file %s\n", temp_filename[i]);
      }
    }
  }

  all_finished = 0;
  while (!all_finished) {

    /* Find the smallest current n-gram */
    strcpy(smallest_ngram, "");
    for (i = start_file; i <= end_file; i++) {
      if (!finished[i]) {
        if (strcmp(smallest_ngram, current_ngram[i]) > 0 ||
            (smallest_ngram[0] == '\0'))
          strcpy(smallest_ngram, current_ngram[i]);
      }
    }

    /* For each of the files that are currently holding this n-gram,
       add its count to the temporary count, and read in a new n-gram
       from the files. */
    temp_count = 0;
    for (i = start_file; i <= end_file; i++) {
      if (!finished[i]) {
        if (!strcmp(smallest_ngram, current_ngram[i])) {
          temp_count += current_ngram_count[i];
          if (!rr_feof(temp_file[i])) {
            for (j = 0; j <= n - 1; j++) {
              if (fscanf(temp_file[i], "%s", temp_word) != 1) {
                if (!rr_feof(temp_file[i]))
                  quit(-1, "Error reading temp file %s\n", temp_filename[i]);
              } else {
                if (j == 0)
                  strcpy(current_ngram[i], temp_word);
                else {
                  strcat(current_ngram[i], " ");
                  strcat(current_ngram[i], temp_word);
                }
              }
            }
            if (fscanf(temp_file[i], "%d", &current_ngram_count[i]) != 1) {
              if (!rr_feof(temp_file[i]))
                quit(-1, "Error reading temp file count %s\n", temp_filename[i]);
            }
          }

          /*
           * PWP: Note that the fscanf may have changed the state of
           * temp_file[i], so we re-ask the question rather than just
           * doing an "else".
           */
          if (rr_feof(temp_file[i])) {
            finished[i] = 1;
            all_finished = 1;
            for (j = start_file; j <= end_file; j++) {
              if (!finished[j])
                all_finished = 0;
            }
          }
        }
      }
    }

    /*
     * PWP: We cannot conditionalize this on (!all_finished), because
     * if we do we may lose the very last count.  (Consider the case
     * when several files have run out of data, but the last couple
     * still hold the final count.)
     */
    if (fprintf(outfile, "%s %d\n", smallest_ngram, temp_count) < 0)
      quit(-1, "Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
  }

  for (i = start_file; i <= end_file; i++) {
    rr_iclose(temp_file[i]);
    remove(temp_filename[i]);
  }
  free(temp_file);
  for (i = start_file; i <= end_file; i++)
    free(temp_filename[i]);
  free(temp_filename);
  for (i = start_file; i <= end_file; i++)
    free(current_ngram[i]);
  free(current_ngram);
  free(current_ngram_count);
  free(finished);
}
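/*
 * Hypothetical invocation sketch for merge_tempfiles(): merge the temp
 * files named "<root>/1<ext>" ... "<root>/32<ext>", with at most 16
 * open at once (so one level of recursive grouping), writing merged
 * 3-gram counts to stdout at verbosity 2.  All names here are
 * assumptions for illustration only.
 *
 *   merge_tempfiles(1, 32, "/tmp/idngram", ".tmp", 16, stdout, 3, 2);
 */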
int main(int argc, char **argv)
{
  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;  /* was int; sih_lookup() takes a vocab_sz_t * */
  FILE *context_cues_fp;
  int n;
  int generate_size = 10000;
  int random_seed;

  /* Process the command line */
  report_version(&argc, argv);

  if (pc_flagarg(&argc, argv, "-help") ||
      argc == 1 ||
      (strcmp(argv[1], "-binary") && strcmp(argv[1], "-arpa"))) {
    fprintf(stderr, "evallm : Evaluate a language model.\n");
    fprintf(stderr, "Usage : evallm [ -binary .binlm | \n");
    fprintf(stderr, "                 -arpa .arpa [ -context .ccs ] ]\n");
    exit(1);
  }

  lm_filename_arpa = salloc(pc_stringarg(&argc, argv, "-arpa", ""));
  if (strcmp(lm_filename_arpa, ""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = salloc(pc_stringarg(&argc, argv, "-binary", ""));
  if (strcmp(lm_filename_binary, ""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1, "Error : Can't use both -arpa and -binary flags.\n");
  if (!arpa_lm && !binary_lm)
    quit(-1, "Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = salloc(pc_stringarg(&argc, argv, "-context", ""));
  if (binary_lm && strcmp(ccs_filename, ""))
    fprintf(stderr, "Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc, argv, 2);

  /* Load the language model */
  if (arpa_lm) {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_arpa);
    load_arpa_lm(&arpa_ng, lm_filename_arpa);
  } else {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_binary);
    load_lm(&ng, lm_filename_binary);
  }

  fprintf(stderr, "\nDone.\n");

  if (!arpa_lm)
    n = ng.n;
  else
    n = arpa_ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = (flag *) rr_calloc(arpa_ng.table_sizes[0], sizeof(flag));
    arpa_ng.no_of_ccs = 0;

    if (strcmp(ccs_filename, "")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets(wlist_entry, sizeof(wlist_entry), context_cues_fp)) {
        if (strncmp(wlist_entry, "##", 2) == 0)
          continue;
        sscanf(wlist_entry, "%s ", current_cc);

        if (strncmp(wlist_entry, "#", 1) == 0) {
          fprintf(stderr, "\n\n===========================================================\n");
          fprintf(stderr, ":\nWARNING: line assumed NOT a comment:\n");
          fprintf(stderr, ">>> %s <<<\n", wlist_entry);
          fprintf(stderr, "    '%s' will be included in the context cues list\n", current_cc);
          fprintf(stderr, "    (comments must start with '##')\n");
          fprintf(stderr, "===========================================================\n\n");
        }

        if (sih_lookup(arpa_ng.vocab_ht, current_cc, &current_cc_id) == 0)
          quit(-1, "Error : %s in the context cues file does not appear in the vocabulary.\n", current_cc);

        arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
        arpa_ng.no_of_ccs++;
        fprintf(stderr, "Context cue word : %s id = %d\n",
                current_cc, (int) current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : ");
    /* gets() is unsafe (no bounds check), so read with fgets() and
       strip the trailing newline instead. */
    if (fgets(input_string, sizeof(input_string), stdin))
      input_string[strcspn(input_string, "\n")] = '\0';

    if (!feof(stdin)) {
      parse_comline(input_string, &num_of_args, args);

      random_seed = pc_intarg(&num_of_args, args, "-seed", -1);
      generate_size = pc_intarg(&num_of_args, args, "-size", 10000);
      log_base = pc_doublearg(&num_of_args, args, "-log_base", 10.0);
      backoff_from_unk_inc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args, args, "-include_unks");
      fb_list_filename = salloc(pc_stringarg(&num_of_args, args, "-backoff_from_list", ""));
      text_stream_filename = salloc(pc_stringarg(&num_of_args, args, "-text", ""));
      probs_stream_filename = salloc(pc_stringarg(&num_of_args, args, "-probs", ""));
      annotation_filename = salloc(pc_stringarg(&num_of_args, args, "-annotate", ""));
      oov_filename = salloc(pc_stringarg(&num_of_args, args, "-oovs", ""));

      inconsistant_parameters = 0;
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
        inconsistant_parameters = 1;
      }
      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
        inconsistant_parameters = 1;
      }

      if (num_of_args > 0 && !inconsistant_parameters) {
        if (!strcmp(args[0], "perplexity")) {
          compute_perplexity(&ng,
                             &arpa_ng,
                             text_stream_filename,
                             probs_stream_filename,
                             annotation_filename,
                             oov_filename,
                             fb_list_filename,
                             backoff_from_unk_inc,
                             backoff_from_unk_exc,
                             backoff_from_ccs_inc,
                             backoff_from_ccs_exc,
                             arpa_lm,
                             include_unks,
                             log_base);
        } else if (!strcmp(args[0], "validate")) {
          if (num_of_args != n)
            fprintf(stderr, "Error : must specify %d words of context.\n", n - 1);
          else {
            /* Assume the last n-1 parameters form the context */
            validate(&ng,
                     &arpa_ng,
                     &(args[num_of_args - n + 1]),
                     backoff_from_unk_inc,
                     backoff_from_unk_exc,
                     backoff_from_ccs_inc,
                     backoff_from_ccs_exc,
                     arpa_lm,
                     fb_list_filename);
          }
        } else if (!strcmp(args[0], "stats")) {
          if (arpa_lm)
            display_arpa_stats(&arpa_ng);
          else
            display_stats(&ng);
        } else if (!strcmp(args[0], "quit")) {
          told_to_quit = 1;
        } else if (!strcmp(args[0], "generate")) {
          if (arpa_lm)
            generate_words(NULL, &arpa_ng, generate_size, random_seed, text_stream_filename);
          else
            generate_words(&ng, NULL, generate_size, random_seed, text_stream_filename);
        } else if (!strcmp(args[0], "help")) {
          printf("The user may specify one of the following commands: \n");
          printf("\n");
          printf(" - perplexity\n");
          printf("\n");
          printf("Computes the perplexity of a given text. May optionally specify words\n");
          printf("from which to force back-off.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("perplexity -text .text\n");
          printf("         [ -probs .fprobs ]\n");
          printf("         [ -oovs .oov_file ]\n");
          printf("         [ -annotate .annotation_file ] \n");
          printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
          printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
          printf("         [ -backoff_from_list .fblist ]\n");
          printf("         [ -include_unks ]\n");
          printf("\n");
          printf(" - validate\n");
          printf("\n");
          printf("Calculate the sum of the probabilities of all the words in the\n");
          printf("vocabulary given the context specified by the user.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
          printf("           -backoff_from_list .fblist ]\n");
          printf("         [ -forced_backoff_inc | -forced_back_off_exc ] \n");
          printf("           word1 word2 ... word_(n-1)\n");
          printf("\n");
          printf("Where n is the n in n-gram. \n");
          printf("\n");
          printf(" - help\n");
          printf("\n");
          printf("Displays this help message.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("help\n");
          printf("\n");
          printf(" - quit\n");
          printf("\n");
          printf("Exits the program.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("quit\n");
        } else {
          fprintf(stderr, "Unknown command : %s\nType 'help'\n", args[0]);
        }
      }
    }
  }

  fprintf(stderr, "evallm : Done.\n");
  exit(0);
}
int main(int argc, char **argv)
{
  int i, j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method;  /* Method used to decide how much memory to
                            allocate for the count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;     /* Array holding the number of occurrences of
                            the current 1-gram, 2-gram, ... , n-gram.
                            Size depends on a #define in general.h */
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;
  flag displayed_oov_warning;  /** Whether the OOV warning has been shown */

  /* ------------------ Process the command line --------------------- */

  report_version(&argc, argv);

  if (argc == 1 || pc_flagarg(&argc, argv, "-help")) {
    /* Display the help message */
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv, "-verbosity", DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng = init_ng(&argc, argv, verbosity);
    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);

    if (!strcmp(ng->id_gram_filename, "-") && mem_alloc_method == TWO_PASSES)
      quit(-1, "Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");

    is_ascii = set_lmformat(pc_flagarg(&argc, argv, "-ascii_input"),
                            pc_flagarg(&argc, argv, "-bin_input"),
                            ng);

    /* Report the parameters */
    report_param(verbosity, ng, is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc, argv, verbosity);
  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng, verbosity);

  /* --------- Allocate space for the table_size array ---------- */
  init_ng_table_size(ng, mem_alloc_method, is_ascii, verbosity, buffer_size);

  /* ----------- Allocate memory for the tree structure -------------- */

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *) * ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *) * ng->n);
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *) * ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);
  } else {
    for (i = 0; i <= ng->n - 1; i++)
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size + 1,
                                                 sizeof(count_t));
    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),
                                                ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n", ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *) * ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *) * ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t) *
                                                  ng->table_sizes[0]);
  } else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *) * ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t) *
                                                 ng->table_sizes[0]);
  }

  ng->ind = (index__t **) rr_malloc(sizeof(index__t *) * ng->n);

  /* First table */
  if (ng->four_byte_counts)
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0], sizeof(count_t));
  else
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0], sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t) *
                                            ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t) *
                                                ng->table_sizes[0]);

  if (ng->n >= 2)
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0], sizeof(index__t));

  for (i = 1; i <= ng->n - 2; i++) {
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t) * ng->table_sizes[i]);

    if (ng->four_byte_counts)
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t) * ng->table_sizes[i]);
    else
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t) * ng->table_sizes[i]);

    if (ng->four_byte_alphas)
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t) * ng->table_sizes[i]);
    else
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t) * ng->table_sizes[i]);

    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t) * ng->table_sizes[i]);

    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) +
                  sizeof(index__t) + sizeof(id__t);
    if (ng->four_byte_alphas)
      mem_alloced += 4;
    mem_alloced *= ng->table_sizes[i];

    pc_message(verbosity, 2, "Allocated %d bytes to table for %d-grams.\n",
               mem_alloced, i + 1);
  }

  ng->word_id[ng->n-1] = (id__t *) rr_malloc(sizeof(id__t) * ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts)
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t) * ng->table_sizes[ng->n-1]);
  else
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t) * ng->table_sizes[ng->n-1]);

  pc_message(verbosity, 2, "Allocated (%d+%d) bytes to table for %d-grams.\n",
             (int) (ng->four_byte_counts ? sizeof(count_t) : sizeof(count_ind_t)),
             (int) (sizeof(id__t) * ng->table_sizes[ng->n-1]), ng->n);

  /* Allocate memory for the table for the first byte of the indices */
  ng_allocate_ptr_table(ng, NULL, 0);

  /* Allocate memory for the alpha array */
  ng->alpha_array = (double *) rr_malloc(sizeof(double) * ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for the frequency-of-frequency information */
  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *) * ng->n);
  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read the n-grams into the tree */
  pc_message(verbosity, 2, "Processing id n-gram file.\n");
  pc_message(verbosity, 2, "20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for the n-gram id arrays */
  current_ngram.id_array = (id__t *) rr_calloc(ng->n, sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n, sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;

  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n, sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n, sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process the first n-gram */
  get_ngram(ng->id_gram_fp, &current_ngram, is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks) {
    /* Stop looking if there are no more n-grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp, &current_ngram, is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);
  }

  for (i = 0; i <= ng->n - 2; i++) {
    ng->ind[i][0] = new_index(0, ng->ptr_table[i], &(ng->ptr_table_size[i]), 0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng, ng->n-1, current_ngram.count);

  store_normal_count(ng, 0, current_ngram.count, ng->n-1);

  if (current_ngram.count <= ng->cutoffs[ng->n-2])
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram, &current_ngram, ng->n);

  prev_id1 = current_ngram.id_array[0];

  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp, &current_ngram, is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
        contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);

      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

        /* Test for where this n-gram differs from the last one - do we
           have an out-of-order n-gram? */
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram, &previous_ngram, ng->n, nlines);

        nlines++;
        show_idngram_nlines(nlines, verbosity);

        /* Add the new n-gram as soon as it is encountered. */
        /* If all of the positions 2,3,...,n of the n-gram are context
           cues, then ignore the n-gram. */
        if (ng->n > 1) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng, ng->n-1, current_ngram.count);
          store_normal_count(ng, ng->num_kgrams[ng->n-1], current_ngram.count, ng->n-1);

          ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
          ng->num_kgrams[ng->n-1]++;

          if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
            quit(-1, "\nMore than %d %d-grams needed to be stored.  Rerun with a higher table size.\n",
                 ng->table_sizes[ng->n-1], ng->n);
        }

        /* Deal with new 2,3,...,(n-1)-grams */
        for (i = ng->n-2; i >= MAX(1, pos_of_novelty); i--) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng, i, ng_count[i]);

          if (ng_count[i] <= ng->cutoffs[i-1])
            ng->num_kgrams[i]--;
          else
            store_normal_count(ng, ng->num_kgrams[i]-1, ng_count[i], i);

          ng_count[i] = current_ngram.count;
          ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
          ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
                                                    ng->ptr_table[i],
                                                    &(ng->ptr_table_size[i]),
                                                    ng->num_kgrams[i]);
          ng->num_kgrams[i]++;

          if (ng->num_kgrams[i] >= ng->table_sizes[i])
            quit(-1, "More than %d %d-grams needed to be stored.  Rerun with a higher table size.\n",
                 ng->table_sizes[i], i+1);
        }

        for (i = 0; i <= pos_of_novelty-1; i++)
          ng_count[i] += current_ngram.count;

        /* Deal with new 1-grams */
        if (pos_of_novelty == 0) {
          if (ng->n > 1) {
            for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
              ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
                                        ng->ptr_table[0],
                                        &(ng->ptr_table_size[0]),
                                        i);
            }
            prev_id1 = current_ngram.id_array[0];
          }

          NG_DISC_METH(ng)->update_freq_of_freq(ng, 0, ng_count[0]);

          if (!ng->context_cue[previous_ngram.id_array[0]]) {
            ng->n_unigrams += ng_count[0];
            store_normal_count(ng, previous_ngram.id_array[0], ng_count[0], 0);
          }
          store_marginal_count(ng, previous_ngram.id_array[0], ng_count[0], 0);

          ng_count[0] = current_ngram.count;
        }

        if (current_ngram.count <= ng->cutoffs[ng->n-2])
          ng->num_kgrams[ng->n-1]--;

        ngram_copy(&previous_ngram, &current_ngram, ng->n);

      } else {
        if (!displayed_oov_warning) {
          pc_message(verbosity, 2, "Warning : id n-gram stream contains OOVs (these n-grams will be ignored).\n");
          displayed_oov_warning = 1;
        }
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  /* Flush out the final counts that are still pending */
  for (i = ng->n-2; i >= 1; i--) {
    NG_DISC_METH(ng)->update_freq_of_freq(ng, i, ng_count[i]);
    if (ng_count[i] <= ng->cutoffs[i-1])
      ng->num_kgrams[i]--;
    else
      store_normal_count(ng, ng->num_kgrams[i]-1, ng_count[i], i);
  }

  NG_DISC_METH(ng)->update_freq_of_freq(ng, 0, ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng, current_ngram.id_array[0], ng_count[0], 0);
  }
  store_marginal_count(ng, current_ngram.id_array[0], ng_count[0], 0);

  if (ng->n > 1) {
    for (i = current_ngram.id_array[0]+1; i <= ng->vocab_size; i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
                                ng->ptr_table[0],
                                &(ng->ptr_table_size[0]),
                                current_ngram.id_array[0]);
  }

  /* The idngram reading is complete at this point */
  pc_message(verbosity, 2, "\n");

  /* Impose a minimum unigram count, if required */
  if (ng->min_unicount > 0) {
    int nchanged = 0;

    for (i = ng->first_id; i <= ng->vocab_size; i++) {
      if ((return_count(ng->four_byte_counts,
                        ng->count_table[0],
                        ng->count[0],
                        ng->count4[0],
                        i) < ng->min_unicount) && !ng->context_cue[i]) {
        /* There was a bug in V2's switch.  Look at the segment for
           ABSOLUTE. */
        NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng, i);
        ng->n_unigrams += (ng->min_unicount - ng->count[0][i]);
        store_normal_count(ng, i, ng->min_unicount, 0);
        nchanged++;
      }
    }

    if (nchanged > 0)
      pc_message(verbosity, 2,
                 "Unigram counts of %d words were bumped up to %d.\n",
                 nchanged, ng->min_unicount);
  }

  /* Count zeroton information for the unigrams */
  ng->freq_of_freq[0][0] = 0;
  for (i = ng->first_id; i <= ng->vocab_size; i++) {
    if (return_count(ng->four_byte_counts,
                     ng->count_table[0],
                     ng->count[0],
                     ng->count4[0],
                     i) == 0)
      ng->freq_of_freq[0][0]++;
  }

  if (ng->discounting_method == GOOD_TURING) {
    for (i = 0; i <= ng->n-1; i++)
      for (j = 1; j <= ng->fof_size[i]; j++)
        pc_message(verbosity, 3, "fof[%d][%d] = %d\n", i, j, ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity, 2, "Calculating discounted counts.\n");
  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);

  /* Smooth the unigram distribution, to give some mass to zerotons */
  compute_unigram(ng, verbosity);

  /* Increment contexts if using Good-Turing discounting.  No need
     otherwise, since all values are discounted anyway. */
  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity, 2, "Incrementing contexts...\n");
    for (i = ng->n-1; i >= 1; i--)
      increment_context(ng, i, verbosity);
  }

  /* Calculate the back-off weights */
  pc_message(verbosity, 2, "Calculating back-off weights...\n");
  for (i = 1; i <= ng->n-1; i++)
    compute_back_off(ng, i, verbosity);

  if (!ng->four_byte_alphas)
    pc_message(verbosity, 3, "Number of out-of-range alphas = %d\n",
               ng->size_of_alpha_array);

  /* Write out the language model */
  pc_message(verbosity, 2, "Writing out language model...\n");
  if (ng->write_arpa)
    write_arpa_lm(ng, verbosity);
  if (ng->write_bin)
    write_bin_lm(ng, verbosity);

  pc_message(verbosity, 0, "idngram2lm : Done.\n");

  return 0;
}