/*
 * compute_back_off : compute the back-off weights (alphas) for every
 * (n-1)-gram context of an n-gram language model.
 *
 * For each context, alpha = discount_mass / leftout_bo_prob, where
 * discount_mass is the probability mass freed by discounting the
 * conditional probabilities of the n-grams seen in that context, and
 * leftout_bo_prob is the back-off probability mass of those same words.
 *
 * ng        - the n-gram model being built (tables are read; bo_weight /
 *             bo_weight4 for level n-1 are written)
 * n         - the n-gram level whose contexts get weights (1 == unigram
 *             contexts for bigram back-off, etc.)
 * verbosity - passed through to pc_message()/bo_ng_prob()
 */
void compute_back_off(ng_t *ng,int n, int verbosity) {

  int *current_pos;     /* per-level cursor into each k-gram table, 0..n */
  int *end_pos;         /* per-level last index belonging to current parent */
  id__t *sought_ngram;  /* scratch word-id array handed to bo_ng_prob() */
  int current_table;    /* depth of the traversal, 1..n */
  int ng_count;
  int i;
  double sum_cond_prob; /* running sum of discounted cond. probs in context */
  double sum_bo_prob;   /* running sum of back-off probs of the same words */
  double discounted_ngcount;
  double cond_prob;
  double bo_prob;
  double discount_mass;
  double leftout_bo_prob;
  double alpha;
  int bo_case;

  sum_cond_prob = 0.0;
  sum_bo_prob = 0.0;

  /* For the sake of warning-free compilation... */
  discounted_ngcount = 0.0;

  current_pos = (int *)rr_calloc(n+1,sizeof(int));
  sought_ngram = (id__t *) rr_calloc(n+1,sizeof(id__t));
  end_pos = (int *)rr_calloc(n+1,sizeof(int));

  /* Process the tree so that we get all the n-grams out in the right
     order. Depth-first walk driven by current_table: descend while there
     are children, accumulate at level n, pop when a range is exhausted. */

  for (current_pos[0]=ng->first_id;
       current_pos[0]<=ng->vocab_size;
       current_pos[0]++) {

    /* Unigrams with zero count are handled separately at the bottom. */
    if (return_count(ng->four_byte_counts,
                     ng->count_table[0],
                     ng->marg_counts,
                     ng->marg_counts4,
                     current_pos[0]) > 0) {

      current_table = 1;

      /* Range of 2-grams whose first word is current_pos[0]: ends just
         before the next word's first index (or at the table end for the
         last word). */
      if (current_pos[0] == ng->vocab_size) {
        end_pos[1] = ng->num_kgrams[1]-1;
      } else {
        end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
                                    ng->ptr_table[0],
                                    ng->ptr_table_size[0],
                                    current_pos[0]+1)-1;
      }

      while (current_table > 0) {

        if (current_table == n) {

          /* Deepest level: accumulate probability mass for the current
             (n-1)-gram context. */
          if (current_pos[n] <= end_pos[n]){

            ng_count = return_count(ng->four_byte_counts,
                                    ng->count_table[n],
                                    ng->count[n],
                                    ng->count4[n],
                                    current_pos[n]);

            switch (ng->discounting_method) {
            case GOOD_TURING:
              /* Only counts inside the discounting range are scaled. */
              if (ng_count <= ng->disc_range[n]) {
                discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
              } else {
                discounted_ngcount = ng_count;
              }
              break;
            case LINEAR:
              discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
              break;
            case ABSOLUTE:
              discounted_ngcount = ng_count - ng->abs_disc_const[n];
              break;
            case WITTEN_BELL:
              /* Witten-Bell needs the context's marginal count; unigram
                 marginals live in a separate table, hence the n==1 case. */
              if (n==1) {
                discounted_ngcount =
                  ((double) return_count(ng->four_byte_counts,
                                         ng->count_table[0],
                                         ng->marg_counts,
                                         ng->marg_counts4,
                                         current_pos[0]) * ng_count) /
                  (return_count(ng->four_byte_counts,
                                ng->count_table[0],
                                ng->marg_counts,
                                ng->marg_counts4,
                                current_pos[0]) +
                   num_of_types(0,current_pos[0],ng));
              } else {
                discounted_ngcount =
                  ((double) return_count(ng->four_byte_counts,
                                         ng->count_table[n-1],
                                         ng->count[n-1],
                                         ng->count4[n-1],
                                         current_pos[n-1])* ng_count) /
                  (return_count(ng->four_byte_counts,
                                ng->count_table[n-1],
                                ng->count[n-1],
                                ng->count4[n-1],
                                current_pos[n-1]) +
                   num_of_types(n-1,current_pos[n-1],ng));
              }
              break;
            }

            /* cond_prob = discounted count / marginal count of context. */
            if (n==1) {
              cond_prob = ((double) discounted_ngcount /
                           return_count(ng->four_byte_counts,
                                        ng->count_table[0],
                                        ng->marg_counts,
                                        ng->marg_counts4,
                                        current_pos[0]));
            } else {
              cond_prob = ((double) discounted_ngcount /
                           return_count(ng->four_byte_counts,
                                        ng->count_table[n-1],
                                        ng->count[n-1],
                                        ng->count4[n-1],
                                        current_pos[n-1]));
            }
            sum_cond_prob += cond_prob;

            /* Fill up sought ngram array with correct stuff */
            for (i=1;i<=n;i++) {
              sought_ngram[i-1] = ng->word_id[i][current_pos[i]];
            }

            /* Accumulate the lower-order (back-off) probability of the
               same word sequence. */
            bo_ng_prob(n-1,sought_ngram,ng,verbosity,&bo_prob,&bo_case);
            sum_bo_prob += bo_prob;

            current_pos[n]++;

          } else {

            /* Exhausted this context's n-grams: compute its alpha. */
            discount_mass = 1.0 - sum_cond_prob;

            if (discount_mass < 1e-10) {
              /* Clamp tiny/negative mass to zero and warn. */
              discount_mass = 0.0;
              pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
                         ng->vocab[current_pos[0]],current_pos[0]);
              for (i=1;i<=n-1;i++) {
                pc_message(verbosity,2,"%s(id %d) ",
                           ng->vocab[ng->word_id[i][current_pos[i]]],
                           ng->word_id[i][current_pos[i]]);
              }
              pc_message(verbosity,2,
                         "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",sum_cond_prob);
            }

            leftout_bo_prob = 1.0 - sum_bo_prob;
            if (leftout_bo_prob < 1e-10) {
              leftout_bo_prob = 0.0;
            }

            if (leftout_bo_prob > 0.0) {
              alpha = discount_mass / leftout_bo_prob;
            } else {
              alpha = 0.0; /* Will not be used. Should happen very rarely. */
              pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
                         ng->vocab[current_pos[0]],current_pos[0]);
              for (i=1;i<=n-1;i++) {
                pc_message(verbosity,2,"%s(id %d) ",
                           ng->vocab[ng->word_id[i][current_pos[i]]],
                           ng->word_id[i][current_pos[i]]);
              }
              pc_message(verbosity,2,
                         "is set to 0.\nMay cause problems with zero probabilities.\n");
            }

            /* Store alpha either as a raw double or quantized to a
               16-bit index into the shared alpha array. */
            if (ng->four_byte_alphas) {
              ng->bo_weight4[n-1][current_pos[n-1]] = alpha;
            } else {
              ng->bo_weight[n-1][current_pos[n-1]] =
                short_alpha(alpha,
                            ng->alpha_array,
                            &(ng->size_of_alpha_array),
                            65535 - ng->out_of_range_alphas,
                            ng->min_alpha,
                            ng->max_alpha);
            }

            /* Finished current (n-1)-gram */
            sum_cond_prob = 0.0;
            sum_bo_prob = 0.0;
            current_table--;
            if (current_table > 0) {
              current_pos[current_table]++;
            }
          }
        } else {

          /* Intermediate level: descend into the next level if children
             remain, otherwise pop back up to the parent. */
          if (current_pos[current_table] <= end_pos[current_table]) {
            current_table++;
            if (current_pos[current_table-1] == ng->num_kgrams[current_table-1]-1) {
              end_pos[current_table] = ng->num_kgrams[current_table]-1;
            } else {
              end_pos[current_table] =
                get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],
                               ng->ptr_table[current_table-1],
                               ng->ptr_table_size[current_table-1],
                               current_pos[current_table-1]+1)-1;
            }
          } else {
            current_table--;
            if (current_table > 0) {
              current_pos[current_table]++;
            }
          }
        }
      }
    }

    /* Now deal with zeroton unigrams */
    else {
      if (n == 1) {
        /* A zeroton unigram context gets a unit back-off weight. */
        if (ng->four_byte_alphas) {
          ng->bo_weight4[0][current_pos[0]] = 1.0;
        } else {
          ng->bo_weight[0][current_pos[0]] =
            short_alpha(1.0,
                        ng->alpha_array,
                        &(ng->size_of_alpha_array),
                        65535 - ng->out_of_range_alphas,
                        ng->min_alpha,
                        ng->max_alpha);
        }
      }
    }
  }

  free(end_pos);
  free(current_pos);
  free(sought_ngram);
}
/* generate_words implements the "generate" command in the evallm command loop. It can take the following flags (yes, I know I should add this to the help text in evallm.c): -text filename The name of the file where text will be generated. -size 10000 An optional flag specifying the number of generated words. If unspecified, it will default to 10000. -seed 1237688 An optional flag specifying a seed for the random number generator. If unspecified, it will be set to the time in seconds since the epoch. Caveats: 1) It currently only works for binary language model files. 2) It currently assumes the language model is a trigram, although it probably wouldn't take much effort to make it more general. 3) When generating the first word, it currently tries to use an initial history of "<s> <s>" if "<s>" is in the vocabulary and just picks the word at index 1 otherwise. Something better could probably be done in this case (e.g. looking for other context cue words in the vocab list). 4) Certain platforms appear to use something other than RAND_MAX for the maximum number that can be generated by random(). This could cause strange things to happen, but could be fixed very easily by replacing RAND_MAX by the appropriate platform-specific constant. 
*/
void generate_words(ng_t *png,arpa_lm_t *pang,
                    int num_words,int random_seed,char *output_filename) {

  FILE *output_file;
  int i,j,bo_case,initial_history_id;
  id__t sought_trigram[3];
  double p,acc,trigram_prob;
  vocab_sz_t lm_vocab_sz;
  char** lm_vocab;

  if(png!=NULL && pang!=NULL)
    quit(-1,"Confused by multiple input type.\n");

  /* BUG FIX: previously, if neither model was supplied, lm_vocab_sz and
     lm_vocab were read uninitialized below (undefined behavior). */
  if(png==NULL && pang==NULL)
    quit(-1,"No language model supplied.\n");

  if(png!=NULL){
    lm_vocab_sz=png->vocab_size;
    lm_vocab=png->vocab;
  }

  if(pang!=NULL){
    quit(-1,"Currently doesn't support arpa input, please use the binary format created by idngram2lm.\n");
    lm_vocab_sz=pang->vocab_size;
    lm_vocab=pang->vocab;
  }

  if (!(output_file=fopen(output_filename,"w"))) {
    fprintf(stderr,"Error: could not open %s for writing.\n",output_filename);
    fprintf(stderr,
            "Syntax: generate -seed seed_of_random_generator -size size_of_file -text output text file \n");
    return;
  }

  /* A seed of -1 means "derive one from the clock". */
  if (random_seed==-1)
    random_seed=(unsigned int)time(NULL);
  srandom(random_seed);
  printf("Using %d as a random seed.\n",random_seed);

  /* Try to start the history with "<s> <s>"; fall back to word 1 if
     "<s>" is not in the vocabulary (see comment above the function). */
  initial_history_id = -1;
  for (i=0; i<lm_vocab_sz; ++i){
    if (!strcmp(BEGIN_OF_SENTENCE_STRING,lm_vocab[i])){
      initial_history_id = i;
      fprintf(stderr,"Found %s in the vocabulary at index %d.\n",
              BEGIN_OF_SENTENCE_STRING,i);
      break;
    }
  }
  if (initial_history_id == -1) {
    fprintf(stderr,"Did not find %s in the vocabulary.\n",
            BEGIN_OF_SENTENCE_STRING);
    /* Okay, we should do something much more intelligent here than just
       picking a vocabulary word, but I don't have the time right now and
       the corpora I use have "<s>" and should hit this case anyway. */
    initial_history_id=1;
  }

  sought_trigram[0]=initial_history_id;
  sought_trigram[1]=initial_history_id;
  fprintf(stderr,"Using an initial history of \"%s %s\"\n",
          lm_vocab[sought_trigram[0]],lm_vocab[sought_trigram[1]]);

  for (i=1; i<=num_words; ++i) {

    /* Sample w3 from Pr(w3|w1,w2) by inverse-CDF search.  Sweeping from
       the front for p<.5 and from the back otherwise halves the expected
       number of probability evaluations. */
    p=((double)random())/RAND_MAX; /* This is platform-specific and needs to be fixed */

    if (p<.5){
      acc=0.0;
      for (j=0; j<=lm_vocab_sz; ++j){
        sought_trigram[2]=j;
        bo_ng_prob(2, sought_trigram, png, DEFAULT_VERBOSITY,
                   &trigram_prob, &bo_case);
        acc+=trigram_prob;
        if (p<=acc)
          break;
      }
      if (p>acc)
        fprintf(stderr,
                "WARNING: The sum over w3 of Pr(w3|%s,%s) was %f,"
                "which was less than the randomly generated number %f.\n",
                lm_vocab[sought_trigram[0]],lm_vocab[sought_trigram[1]],acc,p);
    } else {
      acc=1.0;
      for (j=lm_vocab_sz; j>=0; --j){
        sought_trigram[2]=j;
        bo_ng_prob(2, sought_trigram, png, DEFAULT_VERBOSITY,
                   &trigram_prob, &bo_case);
        acc-=trigram_prob;
        if (p>=acc)
          break;
      }
      if (p<acc)
        fprintf(stderr,
                "WARNING: 1-(sum over w3 of Pr(w3|%s,%s) was %f,"
                "which was greater than the randomly generated number %f.\n",
                lm_vocab[sought_trigram[0]],lm_vocab[sought_trigram[1]],acc,p);
    }

    fprintf(output_file,"%s ",lm_vocab[sought_trigram[2]]);
    if (!(i%10000))
      printf("%d words output.\n",i);

    /* Shift the trigram history window. */
    sought_trigram[0] = sought_trigram[1];
    sought_trigram[1] = sought_trigram[2];
  }
  fprintf(output_file,"\n");

  /* BUG FIX: the output file was previously opened but never closed,
     leaking the FILE handle and relying on exit-time flushing. */
  fclose(output_file);
}
double calc_prob_of(id__t sought_word, id__t *context, int context_length, ng_t *ng, arpa_lm_t *arpa_ng, fb_info *fb_list, int *bo_case, int *acl, flag arpa_lm) { int i; flag exc_back_off; int most_recent_fb; int actual_context_length; id__t *sought_ngram; double prob; exc_back_off = 0; if (arpa_lm) { if (sought_word == 0 && arpa_ng->vocab_type == CLOSED_VOCAB) { quit(-1,"Error : Cannot generate probability for <UNK> since this is a closed \nvocabulary model.\n"); } } else { if (sought_word == 0 && ng->vocab_type == CLOSED_VOCAB) { quit(-1,"Error : Cannot generate probability for <UNK> since this is a closed \nvocabulary model.\n"); } } most_recent_fb = -1; /* Find most recent word in the forced back-off list */ for (i=context_length-1;i>=0;i--) { if (fb_list[context[i]].backed_off) { most_recent_fb = i; if (fb_list[context[i]].inclusive) { exc_back_off = 0; } else { exc_back_off = 1; } i = -2; } } actual_context_length = context_length - most_recent_fb -1; if (!exc_back_off && most_recent_fb != -1) { actual_context_length++; } sought_ngram = (id__t *) rr_malloc(sizeof(id__t)*(actual_context_length+1)); for (i=0;i<=actual_context_length-1;i++) { if (exc_back_off) { sought_ngram[i] = context[i+most_recent_fb+1]; } else { if (most_recent_fb == -1) { sought_ngram[i] = context[i+most_recent_fb+1]; } else { sought_ngram[i] = context[i+most_recent_fb]; } } } sought_ngram[actual_context_length] = sought_word; if (arpa_lm) { arpa_bo_ng_prob(actual_context_length, sought_ngram, arpa_ng, 2, /* Verbosity */ &prob, bo_case); } else { bo_ng_prob(actual_context_length, sought_ngram, ng, 2, /* Verbosity */ &prob, bo_case); } *acl = actual_context_length; free(sought_ngram); return(prob); }