int oe_03_main (int argc, char **argv) { flag first_ngram; int n; fof_sz_t fof_size; flag is_ascii; int verbosity; fof_t **fof_array; ngram_sz_t *num_kgrams; ngram current_ngram; ngram previous_ngram; count_t *ng_count; int pos_of_novelty; int nlines; int i; report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { oe_04_help_message(); exit(1); } is_ascii = pc_flagarg(&argc, argv,"-ascii_input"); n = pc_intarg(&argc, argv,"-n",3); fof_size = pc_intarg(&argc, argv,"-fof_size",50); verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY); pc_report_unk_args(&argc,argv,verbosity); pc_message(verbosity,2,"n = %d\n",n); pc_message(verbosity,2,"fof_size = %d\n",fof_size); current_ngram.n = n; previous_ngram.n = n; pos_of_novelty = n; fof_array = (fof_t **) rr_malloc(sizeof(fof_t *) * (n-1)); for (i=0;i<=n-2;i++) fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t)); num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t)); ng_count = (count_t *) rr_calloc(n-1,sizeof(count_t)); current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t)); previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t)); pc_message(verbosity,2,"Processing id n-gram file.\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); nlines = 0; first_ngram = 1; while (!rr_feof(stdin)) { if (!first_ngram) ngram_copy(&previous_ngram,¤t_ngram,n); if (get_ngram(stdin,¤t_ngram,is_ascii)) { nlines++; show_idngram_nlines(nlines, verbosity); /* Test for where this ngram differs from last - do we have an out-of-order ngram? */ if (!first_ngram) pos_of_novelty = ngram_find_pos_of_novelty(¤t_ngram,&previous_ngram,n,nlines); else pos_of_novelty = 0; /* Add new N-gram */ num_kgrams[n-2]++; if (current_ngram.count <= fof_size) fof_array[n-2][current_ngram.count]++; if (!first_ngram) { for (i=n-2;i>=MAX(1,pos_of_novelty);i--) { num_kgrams[i-1]++; if (ng_count[i-1] <= fof_size) fof_array[i-1][ng_count[i-1]]++; ng_count[i-1] = current_ngram.count; } } else { for (i=n-2;i>=MAX(1,pos_of_novelty);i--) ng_count[i-1] = current_ngram.count; } for (i=0;i<=pos_of_novelty-2;i++) ng_count[i] += current_ngram.count; if (first_ngram) first_ngram = 0; } } /* Process last ngram */ for (i=n-2;i>=MAX(1,pos_of_novelty);i--) { num_kgrams[i-1]++; if (ng_count[i-1] <= fof_size) { fof_array[i-1][ng_count[i-1]]++; } ng_count[i-1] = current_ngram.count; } #import "OpenEarsStaticAnalysisToggle.h" #ifdef STATICANALYZEDEPENDENCIES #define __clang_analyzer__ 1 #endif #if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES) #undef __clang_analyzer__ for (i=0;i<=pos_of_novelty-2;i++) ng_count[i] += current_ngram.count; display_fof_array(num_kgrams,fof_array,fof_size,stderr, n); #endif pc_message(verbosity,0,"idngram2stats : Done.\n"); exit(0); }
int main(int argc, char **argv) { int i,j; ng_t* ng; int verbosity; int mem_alloc_method; /* Method used to decide how much memory to allocate for count tables */ int buffer_size; flag is_ascii; ngram current_ngram; ngram previous_ngram; count_t *ng_count; /* Array indicating the number of occurrances of the current 1-gram, 2-gram, ... ,n-gram Size depends on #define in general.h */ int nlines; int pos_of_novelty; int prev_id1; flag contains_unks; int mem_alloced; flag displayed_oov_warning; /** Display OOV warning */ /* ------------------ Process command line --------------------- */ report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { /* Display help message */ help_message(); exit(1); } verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY); /* Initialization */ { ng=init_ng( &argc, argv, verbosity ); mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size); if (!strcmp(ng->id_gram_filename,"-") && mem_alloc_method == TWO_PASSES) quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n"); is_ascii = set_lmformat(pc_flagarg(&argc,argv,"-ascii_input"), pc_flagarg(&argc,argv,"-bin_input"), ng); /* Report parameters */ report_param(verbosity,ng, is_ascii, mem_alloc_method, buffer_size); pc_report_unk_args(&argc,argv,verbosity); } /* --------------- Read in the vocabulary -------------- */ read_vocab(ng,verbosity); /* --------------- Allocate space for the table_size array --------- */ init_ng_table_size(ng, mem_alloc_method, is_ascii, verbosity, buffer_size ); /* ----------- Allocate memory for tree structure -------------- */ ng->count = NULL; ng->count4 = NULL; ng->marg_counts = NULL; ng->marg_counts4 = NULL; ng->count_table = NULL; ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n); ng->count4 = (count_t **) rr_malloc(sizeof(count_t *)*ng->n); ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n); if (ng->four_byte_counts) { ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]); }else { for (i=0;i<=ng->n-1;i++) ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size+1, sizeof(count_t)); ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),ng->table_sizes[0]); fprintf(stderr, "table_size %d\n",ng->table_sizes[0]); fflush(stderr); } ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n); if (ng->four_byte_alphas) { ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n); ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)* ng->table_sizes[0]); }else { ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n); ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)* ng->table_sizes[0]); } ng->ind = (index__t **) rr_malloc(sizeof(index__t *)*ng->n); /* First table */ if (ng->four_byte_counts) ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],sizeof(count_t)); else ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],sizeof(count_ind_t)); ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)* ng->table_sizes[0]); ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)* ng->table_sizes[0]); if (ng->n >=2) ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],sizeof(index__t)); for (i=1;i<=ng->n-2;i++) { ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[i]); if (ng->four_byte_counts) ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[i]); else ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[i]); if (ng->four_byte_alphas) ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*ng->table_sizes[i]); else ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*ng->table_sizes[i]); ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng->table_sizes[i]); mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) + sizeof(index__t) + sizeof(id__t); if (ng->four_byte_alphas) mem_alloced += 4; mem_alloced *= ng->table_sizes[i]; pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n", mem_alloced,i+1); } ng->word_id[ng->n-1] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[ng->n-1]); if (ng->four_byte_counts) ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[ng->n-1]); else ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[ng->n-1]); pc_message(verbosity,2,"Allocated (%d+%d) bytes to table for %d-grams.\n", ng->four_byte_counts?sizeof(count_t):sizeof(count_ind_t), sizeof(id__t)*ng->table_sizes[ng->n-1],ng->n); /* Allocate memory for table for first-byte of indices */ ng_allocate_ptr_table(ng,NULL,0); /* Allocate memory for alpha array */ ng->alpha_array = (double *) rr_malloc(sizeof(double)*ng->out_of_range_alphas); ng->size_of_alpha_array = 0; /* Allocate memory for frequency of frequency information */ ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *)*ng->n); NG_DISC_METH(ng)->allocate_freq_of_freq(ng); /* Read n-grams into the tree */ pc_message(verbosity,2,"Processing id n-gram file.\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); /* Allocate space for ngrams id arrays */ current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t)); previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t)); current_ngram.n = ng->n; previous_ngram.n = ng->n; ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n,sizeof(ngram_sz_t)); ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t)); nlines = 1; ng->n_unigrams = 0; /* Process first n-gram */ get_ngram(ng->id_gram_fp,¤t_ngram,is_ascii); contains_unks = ngram_chk_contains_unks(¤t_ngram,ng->n); /* Skip over any unknown words. They will come first, because <UNK> always has a word ID of zero. */ while (ng->vocab_type == CLOSED_VOCAB && contains_unks){ /* Stop looking if there are no more N-Grams. Of course, this means training will fail, since there are no unigrams. */ if (get_ngram(ng->id_gram_fp,¤t_ngram,is_ascii) == 0) break; contains_unks = ngram_chk_contains_unks(¤t_ngram,ng->n); } for (i=0;i<=ng->n-2;i++) { ng->ind[i][0] = new_index(0,ng->ptr_table[i],&(ng->ptr_table_size[i]),0); ng->word_id[i+1][0] = current_ngram.id_array[i+1]; ng->num_kgrams[i+1]++; ng_count[i] = current_ngram.count; } ng_count[0] = current_ngram.count; NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count); store_normal_count(ng,0,current_ngram.count,ng->n-1); if (current_ngram.count <= ng->cutoffs[ng->n-2]) ng->num_kgrams[ng->n-1]--; ngram_copy(&previous_ngram,¤t_ngram,ng->n); prev_id1 = current_ngram.id_array[0]; displayed_oov_warning = 0; while (!rr_feof(ng->id_gram_fp)) { if (get_ngram(ng->id_gram_fp,¤t_ngram,is_ascii)) { if (ng->vocab_type == CLOSED_VOCAB) contains_unks=ngram_chk_contains_unks(¤t_ngram,ng->n); if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) { /* Test for where this ngram differs from last - do we have an out-of-order ngram? */ pos_of_novelty = ngram_find_pos_of_novelty(¤t_ngram,&previous_ngram,ng->n,nlines); nlines++; show_idngram_nlines(nlines, verbosity); /* Add new n-gram as soon as it is encountered */ /* If all of the positions 2,3,...,n of the n-gram are context cues then ignore the n-gram. */ if (ng->n > 1) { NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count); store_normal_count(ng,ng->num_kgrams[ng->n-1],current_ngram.count,ng->n-1); ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1]; ng->num_kgrams[ng->n-1]++; if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1]) quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[ng->n-1],ng->n); } /* Deal with new 2,3,...,(n-1)-grams */ for (i=ng->n-2;i>=MAX(1,pos_of_novelty);i--) { NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]); if (ng_count[i] <= ng->cutoffs[i-1]) ng->num_kgrams[i]--; else store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i); ng_count[i] = current_ngram.count; ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i]; ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1, ng->ptr_table[i], &(ng->ptr_table_size[i]), ng->num_kgrams[i]); ng->num_kgrams[i]++; if (ng->num_kgrams[i] >= ng->table_sizes[i]) quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[i],i+1); } for (i=0;i<=pos_of_novelty-1;i++) ng_count[i] += current_ngram.count; /* Deal with new 1-grams */ if (pos_of_novelty == 0) { if (ng->n>1) { for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) { ng->ind[0][i] = new_index(ng->num_kgrams[1]-1, ng->ptr_table[0], &(ng->ptr_table_size[0]), i); } prev_id1 = current_ngram.id_array[0]; } NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]); if (!ng->context_cue[previous_ngram.id_array[0]]) { ng->n_unigrams += ng_count[0]; store_normal_count(ng,previous_ngram.id_array[0],ng_count[0],0); } store_marginal_count(ng,previous_ngram.id_array[0],ng_count[0],0); ng_count[0] = current_ngram.count; } if (current_ngram.count <= ng->cutoffs[ng->n-2]) ng->num_kgrams[ng->n-1]--; ngram_copy(&previous_ngram,¤t_ngram,ng->n); }else { if (!displayed_oov_warning){ pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n"); displayed_oov_warning = 1; } } } } rr_iclose(ng->id_gram_fp); for (i=ng->n-2;i>=1;i--) { NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]); if (ng_count[i] <= ng->cutoffs[i-1]) ng->num_kgrams[i]--; else store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i); } NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]); if (!ng->context_cue[current_ngram.id_array[0]]) { ng->n_unigrams += ng_count[0]; store_normal_count(ng,current_ngram.id_array[0],ng_count[0],0); } store_marginal_count(ng,current_ngram.id_array[0],ng_count[0],0); if (ng->n>1) { for (i=current_ngram.id_array[0]+1;i<=ng->vocab_size;i++) ng->ind[0][i] = new_index(ng->num_kgrams[1], ng->ptr_table[0], &(ng->ptr_table_size[0]), current_ngram.id_array[0]); } /* The idngram reading is completed at this point */ pc_message(verbosity,2,"\n"); /* Impose a minimum unigram count, if required */ if (ng->min_unicount > 0) { int nchanged= 0; for (i=ng->first_id;i<=ng->vocab_size;i++) { if ((return_count(ng->four_byte_counts, ng->count_table[0], ng->count[0], ng->count4[0], i) < ng->min_unicount) && !ng->context_cue[i]) { /* There was a bug in V2's switch. Look at segment for ABSOLUTE */ NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng,i); ng->n_unigrams += (ng->min_unicount - ng->count[0][i]); store_normal_count(ng,i,ng->min_unicount,0); nchanged++; } } if (nchanged > 0) pc_message(verbosity,2, "Unigram counts of %d words were bumped up to %d.\n", nchanged,ng->min_unicount); } /* Count zeroton information for unigrams */ ng->freq_of_freq[0][0] = 0; for (i=ng->first_id;i<=ng->vocab_size;i++) { if (return_count(ng->four_byte_counts, ng->count_table[0], ng->count[0], ng->count4[0], i) == 0) { ng->freq_of_freq[0][0]++; } } if (ng->discounting_method == GOOD_TURING) { for (i=0;i<=ng->n-1;i++) for (j=1;j<=ng->fof_size[i];j++) pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng->freq_of_freq[i][j]); } pc_message(verbosity,2,"Calculating discounted counts.\n"); NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity); /* Smooth unigram distribution, to give some mass to zerotons */ compute_unigram(ng,verbosity); /* Increment Contexts if using Good-Turing discounting-> No need otherwise, since all values are discounted anyway. */ if (ng->discounting_method == GOOD_TURING) { pc_message(verbosity,2,"Incrementing contexts...\n"); for (i=ng->n-1;i>=1;i--) increment_context(ng,i,verbosity); } /* Calculate back-off weights */ pc_message(verbosity,2,"Calculating back-off weights...\n"); for (i=1;i<=ng->n-1;i++) compute_back_off(ng,i,verbosity); if (!ng->four_byte_alphas) pc_message(verbosity,3,"Number of out of range alphas = %d\n", ng->size_of_alpha_array); /* Write out LM */ pc_message(verbosity,2,"Writing out language model...\n"); if (ng->write_arpa) write_arpa_lm(ng,verbosity); if (ng->write_bin) write_bin_lm(ng,verbosity); pc_message(verbosity,0,"idngram2lm : Done.\n"); return 0; }