void compute_unigram(ng_t *ng, int verbosity)
{
  int i;
  int count;
  int n_zerotons;
  int num_of_types;
  double floatN;
  double prob;
  double total_prob;
  double discount_mass;
  double total_zeroton_mass;
  double prob_zeroton;
  double prob_singleton;
  double leftover_mass;

  /* Make sure that we don't have a type 2 vocab and an UNK */
  if (ng->vocab_type == OPEN_VOCAB_2 &&
      return_count(ng->four_byte_counts, ng->count_table[0],
                   ng->count[0], ng->count4[0], 0) != 0)
    quit(-1, "Error : Open vocabulary type 2 requested, but there were OOVs in the \ntraining data.\n");

  if (ng->vocab_type == CLOSED_VOCAB)
    ng->uni_probs[0] = 1e-99;

  /* Make sure all context cues have a zero count */
  if (ng->no_of_ccs > 0) {
    for (i = ng->first_id; i <= ng->vocab_size; i++) {
      if (ng->context_cue[i] &&
          return_count(ng->four_byte_counts, ng->count_table[0],
                       ng->count[0], ng->count4[0], i) != 0)
        quit(-1, "Error : Context cue word has a non zero count.\n");
    }
  }

  /* Compute the discounted unigram, and the total */
  floatN = (double) ng->n_unigrams;

  total_prob = 0.0;

  num_of_types = 0;
  for (i = ng->first_id; i <= ng->vocab_size; i++) {
    if (return_count(ng->four_byte_counts, ng->count_table[0],
                     ng->count[0], ng->count4[0], i) > 0)
      num_of_types++;
  }

  for (i = ng->first_id; i <= ng->vocab_size; i++) {
    count = return_count(ng->four_byte_counts, ng->count_table[0],
                         ng->count[0], ng->count4[0], i);
    prob = count / floatN;

    switch (ng->discounting_method) {
    case GOOD_TURING:
      if (count > 0 && count <= ng->disc_range[0])
        prob *= ng->gt_disc_ratio[0][count];
      else if (count == 0)
        prob = 1e-99;
      break;
    case LINEAR:
      if (count > 0)
        prob *= ng->lin_disc_ratio[0];
      else
        prob = 1e-99;
      break;
    case ABSOLUTE:
      if (count > 0)
        prob *= (count - ng->abs_disc_const[0]) / count;
      else
        prob = 1e-99;
      break;
    case WITTEN_BELL:
      if (count > 0)
        prob *= floatN / (floatN + num_of_types);
      else
        prob = 1e-99;
      break;
    }

    pc_message(verbosity, 4, "   prob[%d] = %.8g count = %d \n", i, prob, count);

    ng->uni_probs[i] = prob;
    total_prob += prob;
  }

  /* Compute the discount mass */
  discount_mass = 1.0 - total_prob;

  pc_message(verbosity, 2, "Unigram's discount mass is %g (n1/N = %g)\n",
             discount_mass, ng->freq_of_freq[0][1] / floatN);

  if (discount_mass < 1e-10 && discount_mass != 0.0) {
    discount_mass = 0.0;
    pc_message(verbosity, 2, "Discount mass was rounded to zero.\n");
  }

  /* Compute P(zeroton) & assign it to all zerotons (except context cues) */
  leftover_mass = discount_mass;
  n_zerotons = ng->freq_of_freq[0][0] - ng->no_of_ccs;

  if (n_zerotons > 0 && discount_mass > 0.0) {
    total_zeroton_mass = discount_mass;
    if (ng->vocab_type == OPEN_VOCAB_2)
      total_zeroton_mass = (1.0 - ng->oov_fraction) * discount_mass;

    prob_zeroton = total_zeroton_mass / n_zerotons;
    prob_singleton = 1 / floatN;

    switch (ng->discounting_method) {
    case GOOD_TURING:
      if (ng->disc_range[0] >= 1)
        prob_singleton *= ng->gt_disc_ratio[0][1];
      break;
    case LINEAR:
      prob_singleton *= ng->lin_disc_ratio[0];
      break;
    case ABSOLUTE:
      prob_singleton *= (1 - ng->abs_disc_const[0]);
      break;
    case WITTEN_BELL:
      prob_singleton *= floatN / (floatN + num_of_types);
      break;
    }

    pc_message(verbosity, 2, "%d zerotons, P(zeroton) = %g P(singleton) = %g\n",
               n_zerotons, prob_zeroton, prob_singleton);

    if (prob_zeroton > ng->zeroton_fraction * prob_singleton) {
      prob_zeroton = ng->zeroton_fraction * prob_singleton;
      pc_message(verbosity, 1,
                 "P(zeroton) was reduced to %.10f (%.3f of P(singleton))\n",
                 prob_zeroton, ng->zeroton_fraction);
    }

    for (i = ng->first_id; i <= ng->vocab_size; i++) {
      if (return_count(ng->four_byte_counts, ng->count_table[0],
                       ng->count[0], ng->count4[0], i) == 0 &&
          !ng->context_cue[i])
        ng->uni_probs[i] = prob_zeroton;
    }

    total_zeroton_mass = n_zerotons * prob_zeroton;
    leftover_mass = discount_mass - total_zeroton_mass;
  }

  /* Do renormalisation due to UNK */
  if (ng->vocab_type == OPEN_VOCAB_2) {
    ng->uni_probs[0] += leftover_mass;
    if (ng->uni_probs[0] <= 0.0)
      ng->uni_probs[0] = 1e-99;
  } else {
    if (fabs(leftover_mass) > 1e-10) {
      for (i = ng->first_id; i <= ng->vocab_size; i++)
        ng->uni_probs[i] /= (1.0 - leftover_mass);
      if (fabs(leftover_mass) > 1e-8)
        pc_message(verbosity, 1,
                   "Unigram was renormalized to absorb a mass of %g\n",
                   leftover_mass);
    }
  }

  pc_message(verbosity, 1, "prob[UNK] = %g\n", ng->uni_probs[0]);

  if (n_zerotons > 0 && discount_mass <= 0.0)
    pc_message(verbosity, 1,
               "WARNING: %d non-context-cue words have zero probability\n\n",
               n_zerotons);

  if (verbosity >= 4) {
    fprintf(stderr, "THE FINAL UNIGRAM:\n");
    for (i = ng->first_id; i <= ng->vocab_size; i++)
      fprintf(stderr, " unigram[%d]=%g\n", i, ng->uni_probs[i]);
  }

  /* Test resulting unigram for consistency */
  total_prob = 0.0;
  for (i = ng->first_id; i <= ng->vocab_size; i++)
    total_prob += ng->uni_probs[i];

  if (fabs(1.0 - total_prob) > 1e-6)
    quit(-1, "ERROR: sum[P(w)] = %.10f\n", total_prob);

  if (fabs(1.0 - total_prob) > 1e-9)
    pc_message(verbosity, 1, "WARNING: sum[P(w)] = %.10f\n\n", total_prob);

  /* Precompute logprobs */
  for (i = ng->first_id; i <= ng->vocab_size; i++)
    ng->uni_log_probs[i] = log(ng->uni_probs[i]);
}
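
/*
 * Illustrative sketch, not part of the toolkit: in the GOOD_TURING branch
 * above, gt_disc_ratio[0][r] plays the role of the classic Good-Turing
 * discount ratio d_r = r_star / r, where r_star = (r+1) * n_{r+1} / n_r
 * and n_r is the number of words seen exactly r times; the discounted
 * unigram is then P(w) = d_r * r / N for a word seen r times (within
 * disc_range).  The helper below only shows that arithmetic; its name
 * and the direct use of a freq-of-freq array are hypothetical, and it
 * omits the renormalisation the toolkit applies when building the table.
 */
static double good_turing_ratio(int r, const int *n_r)
{
  /* d_r = ((r+1) * n_{r+1}) / (r * n_r); caller must ensure r > 0 and
     n_r[r] > 0 */
  return ((double) (r + 1) * n_r[r + 1]) / ((double) r * n_r[r]);
}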
void write_arpa_lm(ng_t *ng, int verbosity)
{
  int *current_pos;
  int *end_pos;
  ngram_sz_t i;
  double log_10_of_e = 1.0 / log(10.0);

  /* HEADER */
  pc_message(verbosity, 1, "ARPA-style %d-gram will be written to %s\n",
             ng->n, ng->arpa_filename);

  write_arpa_copyright(ng->arpa_fp, ng->n, ng->vocab_size,
                       ng->vocab[1], ng->vocab[2], ng->vocab[3]);

  display_vocabtype(ng->vocab_type, ng->oov_fraction, ng->arpa_fp);
  display_discounting_method(ng, ng->arpa_fp);
  write_arpa_format(ng->arpa_fp, ng->n);
  write_arpa_num_grams(ng->arpa_fp, ng, NULL, 0);
  write_arpa_k_gram_header(ng->arpa_fp, 1);

  for (i = ng->first_id; i <= (int) ng->vocab_size; i++) {
    double log10_uniprob;
    double log10_alpha;
    double alpha;

    log10_uniprob = ng->uni_log_probs[i] * log_10_of_e;
    if (ng->uni_probs[i] <= 0.0)
      log10_uniprob = BAD_LOG_PROB;

    alpha = ng_double_alpha(ng, 0, i);
    if (alpha > 0.0)
      log10_alpha = log10(alpha);
    else
      log10_alpha = BAD_LOG_PROB;

    fprintf(ng->arpa_fp, "%.4f %s", log10_uniprob, ng->vocab[i]);
    if (ng->n > 1)
      fprintf(ng->arpa_fp, "\t%.4f\n", log10_alpha);
    else
      fprintf(ng->arpa_fp, "\n");
  }

  current_pos = (int *) rr_malloc(ng->n * sizeof(int));
  end_pos = (int *) rr_malloc(ng->n * sizeof(int));

  /* Print 2-gram, ..., (n-1)-gram info. */
  for (i = 1; i <= ng->n - 1; i++) {

    /* Print out the (i+1)-gram */
    int current_table, j;
    count_t ngcount, marg_count;
    double discounted_ngcount;
    double ngprob, log_10_ngprob, ngalpha, log_10_ngalpha;

    /* Initialise variables for the sake of warning-free compilation */
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
    discounted_ngcount = 0.0;
    log_10_ngalpha = 0.0;
#endif

    write_arpa_k_gram_header(ng->arpa_fp, i + 1);

    /* Go through the n-gram list in order */
    for (j = 0; j <= ng->n - 1; j++) {
      current_pos[j] = 0;
      end_pos[j] = 0;
    }

    for (current_pos[0] = ng->first_id;
         current_pos[0] <= (int) ng->vocab_size;
         current_pos[0]++) {

      if (return_count(ng->four_byte_counts, ng->count_table[0],
                       ng->marg_counts, ng->marg_counts4,
                       current_pos[0]) > 0) {

        current_table = 1;

        if (current_pos[0] == (int) ng->vocab_size)
          end_pos[1] = (int) ng->num_kgrams[1] - 1;
        else {
          end_pos[1] = get_full_index(ng->ind[0][current_pos[0] + 1],
                                      ng->ptr_table[0],
                                      ng->ptr_table_size[0],
                                      current_pos[0] + 1) - 1;
        }

        while (current_table > 0) {

          /* fprintf(stderr, "i %d, current_pos[i] %d, end_pos[i] %d\n",
                     i, current_pos[i], end_pos[i]);
             fflush(stderr); */

          if (current_table == i) {

            if (current_pos[i] <= end_pos[i]) {

              /* fprintf(stderr, "%d\n", ng->count[i][current_pos[i]]);
                 fprintf(stderr, "%d\n",
                         ng->count_table[i][ng->count[i][current_pos[i]]]); */

              ngcount = return_count(ng->four_byte_counts, ng->count_table[i],
                                     ng->count[i], ng->count4[i],
                                     current_pos[i]);

              if (i == 1) {
                marg_count = return_count(ng->four_byte_counts,
                                          ng->count_table[0],
                                          ng->marg_counts, ng->marg_counts4,
                                          current_pos[0]);
              } else {
                marg_count = return_count(ng->four_byte_counts,
                                          ng->count_table[i - 1],
                                          ng->count[i - 1], ng->count4[i - 1],
                                          current_pos[i - 1]);
              }

              if (ng->disc_meth == NULL)
                ng->disc_meth = (disc_meth_t *)
                  disc_meth_init(ng->discounting_method);

              assert(ng->disc_meth);
              discounted_ngcount =
                NG_DISC_METH(ng)->dump_discounted_ngram_count(ng, i, ngcount,
                                                              marg_count,
                                                              current_pos);

              ngprob = (double) discounted_ngcount / marg_count;

              if (ngprob > 1.0) {
                fprintf(stderr,
                        "discounted_ngcount = %f marg_count = %d %d %d %d\n",
                        discounted_ngcount, marg_count,
                        current_pos[0], current_pos[1], current_pos[2]);
                quit(-1, "Error : probability of ngram is greater than one.\n");
              }

              if (ngprob > 0.0)
                log_10_ngprob = log10(ngprob);
              else
                log_10_ngprob = BAD_LOG_PROB;

              if (i <= ng->n - 2) {
                ngalpha = ng_double_alpha(ng, i, current_pos[i]);
                if (ngalpha > 0.0)
                  log_10_ngalpha = log10(ngalpha);
                else
                  log_10_ngalpha = BAD_LOG_PROB;
              }

              // BEGIN HLW VERSION
              // If the overall entry is a trigram and it's going to end
              // with <s>, skip it -- HLW
              if (strstr(ng->vocab[current_pos[0]], "</s>") == NULL &&
                  (i <= 1 ||
                   (i > 1 &&
                    strstr(ng->vocab[(unsigned int) ng->word_id[i][current_pos[i]]],
                           "<s>") == NULL))) {
                fprintf(ng->arpa_fp, "%.4f ", log_10_ngprob);
                fprintf(ng->arpa_fp, "%s ", ng->vocab[current_pos[0]]);
                for (j = 1; j <= i; j++) {
                  fprintf(ng->arpa_fp, "%s ",
                          ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]);
                }
                if (i <= ng->n - 2)
                  fprintf(ng->arpa_fp, "%.4f\n", log_10_ngalpha);
                else
                  fprintf(ng->arpa_fp, "\n");
              } else {
                // something is being skipped -- HLW
                if (i == 0)
                  skipped_unigrams++;
                else if (i == 1)
                  skipped_bigrams++;
                else if (i == 2)
                  skipped_trigrams++;
              }
              // END HLW VERSION

              // PREVIOUS VERSION:
              /*
              if (i <= ng->n-2) {
                ngalpha = ng_double_alpha(ng, i, current_pos[i]);
                if (ngalpha > 0.0)
                  log_10_ngalpha = log10(ngalpha);
                else
                  log_10_ngalpha = BAD_LOG_PROB;
              }
              fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob);
              fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]);
              for (j=1;j<=i;j++){
                // fprintf(stderr, "j %d, ng->wordid[j] %u, current_pos[j] %d, ng->word_id[j][current_pos[j]] %u\n",j, ng->word_id[j], current_pos[j], ng->word_id[j][current_pos[j]]);
                fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]);
              }
              if (i <= ng->n-2)
                fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha);
              else
                fprintf(ng->arpa_fp,"\n");
              */

              current_pos[i]++;
            } else {
              current_table--;
              if (current_table > 0)
                current_pos[current_table]++;
            }
          } else {
            if (current_pos[current_table] <= end_pos[current_table]) {
              current_table++;
              if (current_pos[current_table - 1] ==
                  (int) ng->num_kgrams[current_table - 1] - 1)
                end_pos[current_table] =
                  (int) ng->num_kgrams[current_table] - 1;
              else {
                end_pos[current_table] =
                  get_full_index(ng->ind[current_table - 1][current_pos[current_table - 1] + 1],
                                 ng->ptr_table[current_table - 1],
                                 ng->ptr_table_size[current_table - 1],
                                 current_pos[current_table - 1] + 1) - 1;
              }
            } else {
              current_table--;
              if (current_table > 0)
                current_pos[current_table]++;
            }
          }
        }
      }
    }
  }

  free(current_pos);
  free(end_pos);

  fprintf(ng->arpa_fp, "\n\\end\\\n");

  rr_oclose(ng->arpa_fp);

  // BEGIN HLW ADDITION
  // Now that the file is complete, let's go back and replace the
  // placeholder ngram counts with the real final counts -- HLW
  final_ngram_count_replacement(ng->n, ng);

  unigram_count = 0;
  bigram_count = 0;
  trigram_count = 0;
  skipped_unigrams = 0;
  skipped_bigrams = 0;
  skipped_trigrams = 0;
  // END HLW ADDITION
}
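
/*
 * Illustrative sketch, not part of the toolkit: each body line written
 * above has the shape "log10(P(wk|w1..wk-1)) w1 ... wk [log10(alpha)]",
 * with the back-off weight column present for every order except the
 * highest.  The helper below shows that layout for one hypothetical
 * bigram entry of a trigram model; emit_arpa_bigram and its arguments
 * are made-up names, and it assumes this file's existing stdio/math
 * includes.
 */
static void emit_arpa_bigram(FILE *fp, double prob, const char *w1,
                             const char *w2, double alpha)
{
  /* log10 of the conditional probability, the words, then log10 of the
     back-off weight attached to the context (w1, w2) */
  fprintf(fp, "%.4f %s %s %.4f\n", log10(prob), w1, w2, log10(alpha));
}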
void compute_back_off(ng_t *ng, int n, int verbosity)
{
  int *current_pos;
  int *end_pos;
  id__t *sought_ngram;
  int current_table;
  int ng_count;
  int i;
  double sum_cond_prob;
  double sum_bo_prob;
  double discounted_ngcount;
  double cond_prob;
  double bo_prob;
  double discount_mass;
  double leftout_bo_prob;
  double alpha;
  int bo_case;

  sum_cond_prob = 0.0;
  sum_bo_prob = 0.0;

  /* For the sake of warning-free compilation... */
  discounted_ngcount = 0.0;

  current_pos = (int *) rr_calloc(n + 1, sizeof(int));
  sought_ngram = (id__t *) rr_calloc(n + 1, sizeof(id__t));
  end_pos = (int *) rr_calloc(n + 1, sizeof(int));

  /* Process the tree so that we get all the n-grams out in the right
     order. */

  for (current_pos[0] = ng->first_id;
       current_pos[0] <= ng->vocab_size;
       current_pos[0]++) {

    if (return_count(ng->four_byte_counts, ng->count_table[0],
                     ng->marg_counts, ng->marg_counts4,
                     current_pos[0]) > 0) {

      current_table = 1;

      if (current_pos[0] == ng->vocab_size)
        end_pos[1] = ng->num_kgrams[1] - 1;
      else {
        end_pos[1] = get_full_index(ng->ind[0][current_pos[0] + 1],
                                    ng->ptr_table[0],
                                    ng->ptr_table_size[0],
                                    current_pos[0] + 1) - 1;
      }

      while (current_table > 0) {

        if (current_table == n) {

          if (current_pos[n] <= end_pos[n]) {

            ng_count = return_count(ng->four_byte_counts, ng->count_table[n],
                                    ng->count[n], ng->count4[n],
                                    current_pos[n]);

            switch (ng->discounting_method) {
            case GOOD_TURING:
              if (ng_count <= ng->disc_range[n])
                discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
              else
                discounted_ngcount = ng_count;
              break;
            case LINEAR:
              discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
              break;
            case ABSOLUTE:
              discounted_ngcount = ng_count - ng->abs_disc_const[n];
              break;
            case WITTEN_BELL:
              if (n == 1)
                discounted_ngcount =
                  ((double) return_count(ng->four_byte_counts,
                                         ng->count_table[0],
                                         ng->marg_counts, ng->marg_counts4,
                                         current_pos[0]) * ng_count) /
                  (return_count(ng->four_byte_counts, ng->count_table[0],
                                ng->marg_counts, ng->marg_counts4,
                                current_pos[0]) +
                   num_of_types(0, current_pos[0], ng));
              else
                discounted_ngcount =
                  ((double) return_count(ng->four_byte_counts,
                                         ng->count_table[n - 1],
                                         ng->count[n - 1], ng->count4[n - 1],
                                         current_pos[n - 1]) * ng_count) /
                  (return_count(ng->four_byte_counts, ng->count_table[n - 1],
                                ng->count[n - 1], ng->count4[n - 1],
                                current_pos[n - 1]) +
                   num_of_types(n - 1, current_pos[n - 1], ng));
              break;
            }

            if (n == 1) {
              cond_prob = (double) discounted_ngcount /
                return_count(ng->four_byte_counts, ng->count_table[0],
                             ng->marg_counts, ng->marg_counts4,
                             current_pos[0]);
            } else {
              cond_prob = (double) discounted_ngcount /
                return_count(ng->four_byte_counts, ng->count_table[n - 1],
                             ng->count[n - 1], ng->count4[n - 1],
                             current_pos[n - 1]);
            }

            sum_cond_prob += cond_prob;

            /* Fill up sought ngram array with correct stuff */
            for (i = 1; i <= n; i++)
              sought_ngram[i - 1] = ng->word_id[i][current_pos[i]];

            bo_ng_prob(n - 1, sought_ngram, ng, verbosity, &bo_prob, &bo_case);
            sum_bo_prob += bo_prob;

            current_pos[n]++;
          } else {
            discount_mass = 1.0 - sum_cond_prob;

            if (discount_mass < 1e-10) {
              discount_mass = 0.0;
              pc_message(verbosity, 2,
                         "Warning : Back off weight for %s(id %d) ",
                         ng->vocab[current_pos[0]], current_pos[0]);
              for (i = 1; i <= n - 1; i++)
                pc_message(verbosity, 2, "%s(id %d) ",
                           ng->vocab[ng->word_id[i][current_pos[i]]],
                           ng->word_id[i][current_pos[i]]);
              pc_message(verbosity, 2,
                         "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",
                         sum_cond_prob);
            }

            leftout_bo_prob = 1.0 - sum_bo_prob;
            if (leftout_bo_prob < 1e-10)
              leftout_bo_prob = 0.0;

            if (leftout_bo_prob > 0.0)
              alpha = discount_mass / leftout_bo_prob;
            else {
              alpha = 0.0;	/* Will not be used.  Should happen very
                                   rarely. */
              pc_message(verbosity, 2,
                         "Warning : Back off weight for %s(id %d) ",
                         ng->vocab[current_pos[0]], current_pos[0]);
              for (i = 1; i <= n - 1; i++)
                pc_message(verbosity, 2, "%s(id %d) ",
                           ng->vocab[ng->word_id[i][current_pos[i]]],
                           ng->word_id[i][current_pos[i]]);
              pc_message(verbosity, 2,
                         "is set to 0.\nMay cause problems with zero probabilities.\n");
            }

            if (ng->four_byte_alphas)
              ng->bo_weight4[n - 1][current_pos[n - 1]] = alpha;
            else
              ng->bo_weight[n - 1][current_pos[n - 1]] =
                short_alpha(alpha,
                            ng->alpha_array,
                            &(ng->size_of_alpha_array),
                            65535 - ng->out_of_range_alphas,
                            ng->min_alpha,
                            ng->max_alpha);

            /* Finished current (n-1)-gram */
            sum_cond_prob = 0.0;
            sum_bo_prob = 0.0;

            current_table--;
            if (current_table > 0)
              current_pos[current_table]++;
          }
        } else {
          if (current_pos[current_table] <= end_pos[current_table]) {
            current_table++;
            if (current_pos[current_table - 1] ==
                ng->num_kgrams[current_table - 1] - 1)
              end_pos[current_table] = ng->num_kgrams[current_table] - 1;
            else
              end_pos[current_table] =
                get_full_index(ng->ind[current_table - 1][current_pos[current_table - 1] + 1],
                               ng->ptr_table[current_table - 1],
                               ng->ptr_table_size[current_table - 1],
                               current_pos[current_table - 1] + 1) - 1;
          } else {
            current_table--;
            if (current_table > 0)
              current_pos[current_table]++;
          }
        }
      }
    }
    /* Now deal with zeroton unigrams */
    else {
      if (n == 1) {
        if (ng->four_byte_alphas)
          ng->bo_weight4[0][current_pos[0]] = 1.0;
        else
          ng->bo_weight[0][current_pos[0]] =
            short_alpha(1.0,
                        ng->alpha_array,
                        &(ng->size_of_alpha_array),
                        65535 - ng->out_of_range_alphas,
                        ng->min_alpha,
                        ng->max_alpha);
      }
    }
  }

  free(end_pos);
  free(current_pos);
  free(sought_ngram);
}
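
/*
 * Illustrative sketch, not part of the toolkit: the weight computed
 * above is the classic Katz back-off alpha for a context h,
 *
 *   alpha(h) = (1 - sum over seen w of P(w|h))
 *            / (1 - sum over seen w of P_backoff(w|h)),
 *
 * so the probability mass removed by discounting is redistributed over
 * unseen words in proportion to the lower-order model.  katz_alpha and
 * its argument names are hypothetical; the toolkit accumulates the two
 * sums as sum_cond_prob and sum_bo_prob while walking the tree.
 */
static double katz_alpha(double sum_cond_prob, double sum_bo_prob)
{
  double discount_mass = 1.0 - sum_cond_prob;   /* mass freed by discounting */
  double leftout_bo_prob = 1.0 - sum_bo_prob;   /* backoff mass of unseen words */
  return (leftout_bo_prob > 0.0) ? discount_mass / leftout_bo_prob : 0.0;
}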
int main(int argc, char **argv)
{
  int i, j;
  ng_t *ng;
  int verbosity;
  int mem_alloc_method;	/* Method used to decide how much memory to
                           allocate for count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;	/* Array indicating the number of occurrences of
                           the current 1-gram, 2-gram, ..., n-gram.
                           Size depends on #define in general.h */
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;

  flag displayed_oov_warning;	/* Display OOV warning */

  /* ------------------ Process command line --------------------- */

  report_version(&argc, argv);

  if (argc == 1 || pc_flagarg(&argc, argv, "-help")) {
    /* Display help message */
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv, "-verbosity", DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng = init_ng(&argc, argv, verbosity);

    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);

    if (!strcmp(ng->id_gram_filename, "-") && mem_alloc_method == TWO_PASSES)
      quit(-1, "Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");

    is_ascii = set_lmformat(pc_flagarg(&argc, argv, "-ascii_input"),
                            pc_flagarg(&argc, argv, "-bin_input"),
                            ng);

    /* Report parameters */
    report_param(verbosity, ng, is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc, argv, verbosity);
  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng, verbosity);

  /* --------------- Allocate space for the table_size array --------- */
  init_ng_table_size(ng, mem_alloc_method, is_ascii, verbosity, buffer_size);

  /* ----------- Allocate memory for tree structure -------------- */
  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *) * ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *) * ng->n);
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *) * ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t),
                                             ng->table_sizes[0]);
  } else {
    for (i = 0; i <= ng->n - 1; i++)
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size + 1,
                                                 sizeof(count_t));

    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),
                                                ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n", ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *) * ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *) * ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t) *
                                                  ng->table_sizes[0]);
  } else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *) * ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t) *
                                                 ng->table_sizes[0]);
  }

  ng->ind = (index__t **) rr_malloc(sizeof(index__t *) * ng->n);

  /* First table */
  if (ng->four_byte_counts)
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],
                                          sizeof(count_t));
  else
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],
                                             sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t) *
                                            ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t) *
                                                ng->table_sizes[0]);

  if (ng->n >= 2)
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],
                                        sizeof(index__t));

  for (i = 1; i <= ng->n - 2; i++) {
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t) * ng->table_sizes[i]);

    if (ng->four_byte_counts)
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t) *
                                            ng->table_sizes[i]);
    else
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t) *
                                               ng->table_sizes[i]);

    if (ng->four_byte_alphas)
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t) *
                                                    ng->table_sizes[i]);
    else
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t) *
                                                   ng->table_sizes[i]);

    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t) *
                                        ng->table_sizes[i]);

    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) +
      sizeof(index__t) + sizeof(id__t);
    if (ng->four_byte_alphas)
      mem_alloced += 4;
    mem_alloced *= ng->table_sizes[i];

    pc_message(verbosity, 2, "Allocated %d bytes to table for %d-grams.\n",
               mem_alloced, i + 1);
  }

  ng->word_id[ng->n - 1] = (id__t *)
    rr_malloc(sizeof(id__t) * ng->table_sizes[ng->n - 1]);

  if (ng->four_byte_counts)
    ng->count4[ng->n - 1] = (count_t *)
      rr_malloc(sizeof(count_t) * ng->table_sizes[ng->n - 1]);
  else
    ng->count[ng->n - 1] = (count_ind_t *)
      rr_malloc(sizeof(count_ind_t) * ng->table_sizes[ng->n - 1]);

  pc_message(verbosity, 2, "Allocated (%d+%d) bytes to table for %d-grams.\n",
             ng->four_byte_counts ? sizeof(count_t) : sizeof(count_ind_t),
             sizeof(id__t) * ng->table_sizes[ng->n - 1], ng->n);

  /* Allocate memory for table for first-byte of indices */
  ng_allocate_ptr_table(ng, NULL, 0);

  /* Allocate memory for alpha array */
  ng->alpha_array = (double *) rr_malloc(sizeof(double) *
                                         ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for frequency of frequency information */
  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *) * ng->n);
  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read n-grams into the tree */
  pc_message(verbosity, 2, "Processing id n-gram file.\n");
  pc_message(verbosity, 2, "20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for ngrams id arrays */
  current_ngram.id_array = (id__t *) rr_calloc(ng->n, sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n, sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;

  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n, sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n, sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process first n-gram */
  get_ngram(ng->id_gram_fp, &current_ngram, is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks) {
    /* Stop looking if there are no more n-grams.  Of course, this means
       training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp, &current_ngram, is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);
  }

  for (i = 0; i <= ng->n - 2; i++) {
    ng->ind[i][0] = new_index(0, ng->ptr_table[i],
                              &(ng->ptr_table_size[i]), 0);
    ng->word_id[i + 1][0] = current_ngram.id_array[i + 1];
    ng->num_kgrams[i + 1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng, ng->n - 1, current_ngram.count);

  store_normal_count(ng, 0, current_ngram.count, ng->n - 1);

  if (current_ngram.count <= ng->cutoffs[ng->n - 2])
    ng->num_kgrams[ng->n - 1]--;

  ngram_copy(&previous_ngram, &current_ngram, ng->n);

  prev_id1 = current_ngram.id_array[0];

  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp, &current_ngram, is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
        contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);

      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

        /* Test for where this ngram differs from the last one - do we
           have an out-of-order ngram? */
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,
                                                   &previous_ngram,
                                                   ng->n, nlines);

        nlines++;
        show_idngram_nlines(nlines, verbosity);

        /* Add new n-gram as soon as it is encountered */
        /* If all of the positions 2, 3, ..., n of the n-gram are
           context cues then ignore the n-gram. */
        if (ng->n > 1) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng, ng->n - 1,
                                                current_ngram.count);
          store_normal_count(ng, ng->num_kgrams[ng->n - 1],
                             current_ngram.count, ng->n - 1);

          ng->word_id[ng->n - 1][ng->num_kgrams[ng->n - 1]] =
            current_ngram.id_array[ng->n - 1];
          ng->num_kgrams[ng->n - 1]++;

          if (ng->num_kgrams[ng->n - 1] >= ng->table_sizes[ng->n - 1])
            quit(-1, "\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",
                 ng->table_sizes[ng->n - 1], ng->n);
        }

        /* Deal with new 2,3,...,(n-1)-grams */
        for (i = ng->n - 2; i >= MAX(1, pos_of_novelty); i--) {

          NG_DISC_METH(ng)->update_freq_of_freq(ng, i, ng_count[i]);

          if (ng_count[i] <= ng->cutoffs[i - 1])
            ng->num_kgrams[i]--;
          else
            store_normal_count(ng, ng->num_kgrams[i] - 1, ng_count[i], i);

          ng_count[i] = current_ngram.count;
          ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
          ng->ind[i][ng->num_kgrams[i]] =
            new_index(ng->num_kgrams[i + 1] - 1,
                      ng->ptr_table[i],
                      &(ng->ptr_table_size[i]),
                      ng->num_kgrams[i]);
          ng->num_kgrams[i]++;

          if (ng->num_kgrams[i] >= ng->table_sizes[i])
            quit(-1, "More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",
                 ng->table_sizes[i], i + 1);
        }

        for (i = 0; i <= pos_of_novelty - 1; i++)
          ng_count[i] += current_ngram.count;

        /* Deal with new 1-grams */
        if (pos_of_novelty == 0) {

          if (ng->n > 1) {
            for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
              ng->ind[0][i] = new_index(ng->num_kgrams[1] - 1,
                                        ng->ptr_table[0],
                                        &(ng->ptr_table_size[0]),
                                        i);
            }
            prev_id1 = current_ngram.id_array[0];
          }

          NG_DISC_METH(ng)->update_freq_of_freq(ng, 0, ng_count[0]);

          if (!ng->context_cue[previous_ngram.id_array[0]]) {
            ng->n_unigrams += ng_count[0];
            store_normal_count(ng, previous_ngram.id_array[0],
                               ng_count[0], 0);
          }

          store_marginal_count(ng, previous_ngram.id_array[0],
                               ng_count[0], 0);

          ng_count[0] = current_ngram.count;
        }

        if (current_ngram.count <= ng->cutoffs[ng->n - 2])
          ng->num_kgrams[ng->n - 1]--;

        ngram_copy(&previous_ngram, &current_ngram, ng->n);

      } else {
        if (!displayed_oov_warning) {
          pc_message(verbosity, 2, "Warning : id n-gram stream contains OOVs (n-grams will be ignored).\n");
          displayed_oov_warning = 1;
        }
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  for (i = ng->n - 2; i >= 1; i--) {
    NG_DISC_METH(ng)->update_freq_of_freq(ng, i, ng_count[i]);
    if (ng_count[i] <= ng->cutoffs[i - 1])
      ng->num_kgrams[i]--;
    else
      store_normal_count(ng, ng->num_kgrams[i] - 1, ng_count[i], i);
  }

  NG_DISC_METH(ng)->update_freq_of_freq(ng, 0, ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng, current_ngram.id_array[0], ng_count[0], 0);
  }

  store_marginal_count(ng, current_ngram.id_array[0], ng_count[0], 0);

  if (ng->n > 1) {
    for (i = current_ngram.id_array[0] + 1; i <= ng->vocab_size; i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
                                ng->ptr_table[0],
                                &(ng->ptr_table_size[0]),
                                current_ngram.id_array[0]);
  }

  /* The idngram reading is completed at this point */
  pc_message(verbosity, 2, "\n");

  /* Impose a minimum unigram count, if required */
  if (ng->min_unicount > 0) {

    int nchanged = 0;

    for (i = ng->first_id; i <= ng->vocab_size; i++) {
      if (return_count(ng->four_byte_counts, ng->count_table[0],
                       ng->count[0], ng->count4[0], i) < ng->min_unicount &&
          !ng->context_cue[i]) {
        /* There was a bug in V2's switch.  Look at segment for ABSOLUTE */
        NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng, i);
        ng->n_unigrams += (ng->min_unicount - ng->count[0][i]);
        store_normal_count(ng, i, ng->min_unicount, 0);
        nchanged++;
      }
    }

    if (nchanged > 0)
      pc_message(verbosity, 2,
                 "Unigram counts of %d words were bumped up to %d.\n",
                 nchanged, ng->min_unicount);
  }

  /* Count zeroton information for unigrams */
  ng->freq_of_freq[0][0] = 0;

  for (i = ng->first_id; i <= ng->vocab_size; i++) {
    if (return_count(ng->four_byte_counts, ng->count_table[0],
                     ng->count[0], ng->count4[0], i) == 0)
      ng->freq_of_freq[0][0]++;
  }

  if (ng->discounting_method == GOOD_TURING) {
    for (i = 0; i <= ng->n - 1; i++)
      for (j = 1; j <= ng->fof_size[i]; j++)
        pc_message(verbosity, 3, "fof[%d][%d] = %d\n", i, j,
                   ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity, 2, "Calculating discounted counts.\n");

  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);

  /* Smooth unigram distribution, to give some mass to zerotons */
  compute_unigram(ng, verbosity);

  /* Increment contexts if using Good-Turing discounting.  No need
     otherwise, since all values are discounted anyway. */
  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity, 2, "Incrementing contexts...\n");
    for (i = ng->n - 1; i >= 1; i--)
      increment_context(ng, i, verbosity);
  }

  /* Calculate back-off weights */
  pc_message(verbosity, 2, "Calculating back-off weights...\n");
  for (i = 1; i <= ng->n - 1; i++)
    compute_back_off(ng, i, verbosity);

  if (!ng->four_byte_alphas)
    pc_message(verbosity, 3, "Number of out of range alphas = %d\n",
               ng->size_of_alpha_array);

  /* Write out LM */
  pc_message(verbosity, 2, "Writing out language model...\n");

  if (ng->write_arpa)
    write_arpa_lm(ng, verbosity);

  if (ng->write_bin)
    write_bin_lm(ng, verbosity);

  pc_message(verbosity, 0, "idngram2lm : Done.\n");

  return 0;
}
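
/*
 * Illustrative sketch, not part of the toolkit: the ASCII id n-gram
 * stream consumed by the main loop above is a sorted list of records,
 * each holding n word ids followed by a count; for a trigram model a
 * line such as "4 27 133 9" means the id sequence (4, 27, 133) occurred
 * 9 times.  read_ascii_idngram below is a hypothetical stand-in for the
 * toolkit's get_ngram, reading one such record.
 */
static int read_ascii_idngram(FILE *fp, int n, int *ids, int *count)
{
  int i;
  for (i = 0; i < n; i++)
    if (fscanf(fp, "%d", &ids[i]) != 1)
      return 0;			/* end of stream or malformed record */
  return fscanf(fp, "%d", count) == 1;
}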