/*
 * Print a human-readable summary of the language model `ng` to stderr:
 * the model order and vocabulary size, the first three vocabulary entries,
 * the context cues, the vocabulary type, the storage width of the back-off
 * weights, the number of k-grams per order and the discounting method.
 *
 * Nothing is returned and `ng` is only read here (the helper functions
 * display_vocabtype / display_discounting_method are defined elsewhere).
 */
void display_stats(ng_t *ng) {
  int i;

  fprintf(stderr,"This is a %hu-gram language model, based on a vocabulary of %lld words,\n",ng->n,ng->vocab_size);
  fprintf(stderr," which begins \"%s\", \"%s\", \"%s\"...\n",ng->vocab[1],ng->vocab[2],ng->vocab[3]);

  /* Both the singular and the plural message end the line, so that the
     optional list of cues printed below always starts on a fresh line. */
  if (ng->no_of_ccs == 1) {
    fprintf(stderr,"There is 1 context cue.\n");
  } else {
    fprintf(stderr,"There are %d context cues.\n",ng->no_of_ccs);
  }

  /* Only enumerate the cues when there are a handful of them. */
  if (ng->no_of_ccs > 0 && ng->no_of_ccs < 10) {
    if (ng->no_of_ccs == 1) {
      fprintf(stderr,"This is : ");
    } else {
      fprintf(stderr,"These are : ");
    }
    for (i=ng->first_id;i<=(int)ng->vocab_size;i++) {
      if (ng->context_cue[i]) {
        fprintf(stderr,"\"%s\" ",ng->vocab[i]);
      }
    }
    fprintf(stderr,"\n");
  }

  display_vocabtype(ng->vocab_type,ng->oov_fraction,stderr);

  if (ng->four_byte_alphas) {
    fprintf(stderr,"The back-off weights are stored in four bytes.\n");
  } else {
    fprintf(stderr,"The back-off weights are stored in two bytes.\n");
  }

  /* num_kgrams is indexed by (order - 1). */
  for (i=2;i<=ng->n;i++) {
    fprintf(stderr,"The %d-gram component was based on %d %d-grams.\n",i,(int)ng->num_kgrams[i-1],i);
  }

  display_discounting_method(ng,stderr);
}
/*
 * Write the language model `ng` to ng->arpa_fp as an ARPA-style text file.
 *
 *   ng        - the model; ng->arpa_fp must already be open for writing.
 *               ng->disc_meth is lazily initialised here if it is NULL.
 *   verbosity - passed to pc_message() for the progress message.
 *
 * Steps performed:
 *   1. header (copyright, vocabulary type, discounting method, format
 *      description, n-gram counts),
 *   2. the 1-gram section,
 *   3. the 2-gram ... n-gram sections,
 *   4. the "\end\" marker; the stream is then closed with rr_oclose(),
 *   5. final_ngram_count_replacement() is called and the file-level
 *      n-gram / skipped-n-gram counters are reset to zero.
 *
 * Entries whose first word contains "</s>", or (for orders above 2) whose
 * last word contains "<s>", are not written; they are tallied in
 * skipped_bigrams / skipped_trigrams instead.
 *
 * Calls quit() if a computed n-gram probability exceeds 1.
 */
void write_arpa_lm(ng_t *ng,int verbosity) {
  int *current_pos;
  int *end_pos;
  ngram_sz_t i;
  /* Factor applied to the stored unigram log probabilities to express them
     in base 10 (presumably they are natural logs -- confirm against the
     code that fills uni_log_probs). */
  double log_10_of_e = 1.0 / log(10.0);

  /* HEADER */

  pc_message(verbosity,1,"ARPA-style %d-gram will be written to %s\n",ng->n,ng->arpa_filename);

  write_arpa_copyright(ng->arpa_fp,ng->n,ng->vocab_size,
                       ng->vocab[1],ng->vocab[2],ng->vocab[3]);

  display_vocabtype(ng->vocab_type,ng->oov_fraction, ng->arpa_fp);
  display_discounting_method(ng,ng->arpa_fp);
  write_arpa_format(ng->arpa_fp,ng->n);
  write_arpa_num_grams(ng->arpa_fp,ng,NULL,0);
  write_arpa_k_gram_header(ng->arpa_fp,1);

  /* 1-GRAMS: one line per vocabulary entry, with a back-off weight column
     only when the model has higher orders to back off from. */
  for (i=ng->first_id; i<= (int) ng->vocab_size;i++) {
    double log10_uniprob;
    double log10_alpha;
    double alpha;

    log10_uniprob = ng->uni_log_probs[i]*log_10_of_e;
    if (ng->uni_probs[i]<=0.0)
      log10_uniprob = BAD_LOG_PROB;

    alpha=ng_double_alpha(ng,0,i);
    if(alpha > 0.0)
      log10_alpha = log10(alpha);
    else
      log10_alpha = BAD_LOG_PROB;

    fprintf(ng->arpa_fp,"%.4f %s",log10_uniprob,ng->vocab[i]);
    if (ng->n>1)
      fprintf(ng->arpa_fp,"\t%.4f\n",log10_alpha);
    else
      fprintf(ng->arpa_fp,"\n");
  }

  /* One cursor and one upper bound per level of the n-gram tables. */
  current_pos = (int *) rr_malloc(ng->n*sizeof(int));
  end_pos = (int *) rr_malloc(ng->n*sizeof(int));

  /* Print the 2-gram ... n-gram sections; iteration i writes the
     (i+1)-grams. Back-off weights are written for every order but the
     highest (i <= n-2). */
  for (i=1;i<=ng->n-1;i++) {
    int current_table, j;
    count_t ngcount, marg_count;
    double discounted_ngcount;
    double ngprob, log_10_ngprob, ngalpha, log_10_ngalpha;

    /* Initialise variables for the sake of warning-free compilation */
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
    discounted_ngcount = 0.0;
    log_10_ngalpha = 0.0;
#endif

    write_arpa_k_gram_header(ng->arpa_fp,i+1);

    /* Go through the n-gram list in order */
    for (j=0;j<=ng->n-1;j++) {
      current_pos[j] = 0;
      end_pos[j] = 0;
    }

    for (current_pos[0]=ng->first_id;
         current_pos[0]<=(int) ng->vocab_size;
         current_pos[0]++) {

      /* Words with a zero marginal count have no successors to print. */
      if (return_count(ng->four_byte_counts,
                       ng->count_table[0],
                       ng->marg_counts,
                       ng->marg_counts4,
                       current_pos[0]) > 0) {

        current_table = 1;

        /* Range of level-1 entries under this word: it ends just before
           the index recorded for the next word, or at the end of the
           table for the last word. */
        if (current_pos[0] == (int) ng->vocab_size)
          end_pos[1] = (int ) ng->num_kgrams[1]-1;
        else {
          end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
                                      ng->ptr_table[0],
                                      ng->ptr_table_size[0],
                                      current_pos[0]+1)-1;
        }

        /* Depth-first walk over the tables below this word;
           current_table is the level currently being scanned. */
        while (current_table > 0) {

          if (current_table == i) {

            if (current_pos[i] <= end_pos[i]) {

              /* We are at the level being printed: emit this entry. */
              ngcount = return_count(ng->four_byte_counts,
                                     ng->count_table[i],
                                     ng->count[i],
                                     ng->count4[i],
                                     current_pos[i]);

              /* The count of the context comes from the marginal counts
                 for bigrams, and from the parent table otherwise. */
              if (i==1) {
                marg_count = return_count(ng->four_byte_counts,
                                          ng->count_table[0],
                                          ng->marg_counts,
                                          ng->marg_counts4,
                                          current_pos[0]);
              }else {
                marg_count = return_count(ng->four_byte_counts,
                                          ng->count_table[i-1],
                                          ng->count[i-1],
                                          ng->count4[i-1],
                                          current_pos[i-1]);
              }

              if(ng->disc_meth==NULL)
                ng->disc_meth=(disc_meth_t*) disc_meth_init(ng->discounting_method);

              assert(ng->disc_meth);

              discounted_ngcount =
                NG_DISC_METH(ng)->dump_discounted_ngram_count(ng,i,ngcount,marg_count,current_pos);

              ngprob = (double) discounted_ngcount / marg_count;

              if (ngprob > 1.0) {
                /* current_pos holds only ng->n entries, so print at most
                   the first three positions that actually exist (a bigram
                   model has just two). */
                fprintf(stderr,
                        "discounted_ngcount = %f marg_count = %d",
                        discounted_ngcount,(int) marg_count);
                for (j=0;j<=2 && j<=ng->n-1;j++)
                  fprintf(stderr," %d",current_pos[j]);
                fprintf(stderr,"\n");
                quit(-1,"Error : probablity of ngram is greater than one.\n");
              }

              if (ngprob > 0.0)
                log_10_ngprob = log10(ngprob);
              else
                log_10_ngprob = BAD_LOG_PROB;

              if (i <= ng->n-2) {
                ngalpha = ng_double_alpha(ng, i, current_pos[i]);
                if (ngalpha > 0.0)
                  log_10_ngalpha = log10(ngalpha);
                else
                  log_10_ngalpha = BAD_LOG_PROB;
              }

              /* HLW: skip entries whose first word contains "</s>", and
                 entries above bigram order whose last word contains
                 "<s>". */
              if (strstr(ng->vocab[current_pos[0]],"</s>") == NULL &&
                  (i <= 1 ||
                   strstr(ng->vocab[(unsigned int) ng->word_id[i][current_pos[i]]],"<s>") == NULL)) {

                fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob);
                fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]);
                for (j=1;j<=i;j++){
                  fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]);
                }

                if (i <= ng->n-2){
                  fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha);
                }
                else{
                  fprintf(ng->arpa_fp,"\n");
                }
              }
              else {
                /* Something is being skipped -- HLW. i is at least 1 in
                   this loop, so a unigram can never be skipped here. */
                if(i==1) {
                  skipped_bigrams++;
                }
                else if (i==2) {
                  skipped_trigrams++;
                }
              }

              current_pos[i]++;
            }else {
              /* This level is exhausted: climb back up and advance the
                 parent cursor. */
              current_table--;
              if (current_table > 0)
                current_pos[current_table]++;
            }
          }else {

            if (current_pos[current_table] <= end_pos[current_table]) {
              /* Descend one level and work out where the children of the
                 current entry end. */
              current_table++;
              if (current_pos[current_table-1] == (int) ng->num_kgrams[current_table-1]-1)
                end_pos[current_table] = (int) ng->num_kgrams[current_table]-1;
              else {
                end_pos[current_table] =
                  get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],
                                 ng->ptr_table[current_table-1],
                                 ng->ptr_table_size[current_table-1],
                                 current_pos[current_table-1]+1) - 1;
              }
            }else {
              current_table--;
              if (current_table > 0)
                current_pos[current_table]++;
            }
          }
        }
      }
    }
  }

  free(current_pos);
  free(end_pos);

  fprintf(ng->arpa_fp,"\n\\end\\\n");
  rr_oclose(ng->arpa_fp);

  /* HLW addition: now that the file is complete, go back and replace the
     placeholder n-gram counts with the real final counts, then reset the
     counters for the next run. */
  final_ngram_count_replacement(ng->n,ng);
  unigram_count = 0;
  bigram_count = 0;
  trigram_count = 0;
  skipped_unigrams = 0;
  skipped_bigrams = 0;
  skipped_trigrams = 0;
}