Пример #1
0
/*
 * Entry point of the idngram2lm tool (name taken from the final
 * "idngram2lm : Done." message).
 *
 * Steps, in order:
 *   1. parse the command line and initialise the ng_t model (init_ng);
 *   2. read the vocabulary and size the per-order tables;
 *   3. allocate the count, back-off weight, index and word-id tables;
 *   4. stream the id n-gram file, accumulating counts for every n-gram
 *      order together with frequency-of-frequency statistics;
 *   5. optionally enforce a minimum unigram count;
 *   6. compute discounts, unigram probabilities and back-off weights;
 *   7. write the model in ARPA and/or binary format.
 *
 * Returns 0 on normal completion.  Calls exit(1) after printing the help
 * message, and calls quit(-1, ...) on the fatal conditions detected below.
 */
int main(int argc, char **argv) {

  int i,j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method; /* Method used to decide how much memory to
                           allocate for count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count; /* Array indicating the number of occurrences of
                        the current 1-gram, 2-gram, ... ,n-gram
                        Size depends on #define in general.h
                     */
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;

  flag displayed_oov_warning; /* Set once the OOV warning has been shown,
                                 so that it is printed only once */

  /*  ------------------  Process command line --------------------- */

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    /* Display help message */
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng=init_ng(
               &argc,
               argv,
               verbosity
               );

    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);

    /* A two-pass allocation needs to read the input twice, which is not
       possible when the id n-grams arrive on stdin ("-"). */
    if (!strcmp(ng->id_gram_filename,"-") && mem_alloc_method == TWO_PASSES)
      quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");

    is_ascii = set_lmformat(pc_flagarg(&argc,argv,"-ascii_input"),
                            pc_flagarg(&argc,argv,"-bin_input"),
                            ng);

    /* Report parameters */
    report_param(verbosity,ng,
                 is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc,argv,verbosity);

  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng,verbosity);

  /* --------------- Allocate space for the table_size array --------- */
  init_ng_table_size(ng,
                     mem_alloc_method,
                     is_ascii,
                     verbosity,
                     buffer_size
                     );

  /* ----------- Allocate memory for tree structure -------------- */

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  /* One slot per n-gram order; the per-order arrays are filled in below. */
  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);

  }else {
    /* Counts are stored as indices into a per-order count table. */
    for (i=0;i<=ng->n-1;i++)
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size+1,
                                                 sizeof(count_t));

    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n",ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
                                                  ng->table_sizes[0]);
  }else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
                                                 ng->table_sizes[0]);
  }

  ng->ind = (index__t **)  rr_malloc(sizeof(index__t *)*ng->n);

  /* First table */
  if (ng->four_byte_counts)
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],sizeof(count_t));
  else
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
                                            ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
                                                ng->table_sizes[0]);

  if (ng->n >=2)
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],sizeof(index__t));

  /* Tables for the 2-gram ... (n-1)-gram levels: each needs word ids,
     counts, back-off weights and an index into the next level. */
  for (i=1;i<=ng->n-2;i++) {
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[i]);

    if (ng->four_byte_counts)
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[i]);
    else
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[i]);

    if (ng->four_byte_alphas)
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*ng->table_sizes[i]);
    else
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*ng->table_sizes[i]);

    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng->table_sizes[i]);

    /* Per-entry byte estimate used only for the progress message below. */
    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) +
                  sizeof(index__t) + sizeof(id__t);

    if (ng->four_byte_alphas)
      mem_alloced += 4;

    mem_alloced *= ng->table_sizes[i];

    pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
               mem_alloced,i+1);

  }

  /* The highest-order table needs only word ids and counts. */
  ng->word_id[ng->n-1] = (id__t *)
    rr_malloc(sizeof(id__t)*ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts)
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[ng->n-1]);
  else
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[ng->n-1]);

  /* sizeof expressions have type size_t; cast them so that the arguments
     really match the %d conversions in the format string. */
  pc_message(verbosity,2,"Allocated (%d+%d) bytes to table for %d-grams.\n",
             (int) (ng->four_byte_counts?sizeof(count_t):sizeof(count_ind_t)),
             (int) (sizeof(id__t)*ng->table_sizes[ng->n-1]),ng->n);

  /* Allocate memory for table for first-byte of indices */

  ng_allocate_ptr_table(ng,NULL,0);

  /* Allocate memory for alpha array */

  ng->alpha_array = (double *) rr_malloc(sizeof(double)*ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for frequency of frequency information */

  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *)*ng->n);

  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read n-grams into the tree */
  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for ngrams id arrays */

  current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;

  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process first n-gram */
  get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks){
    /* Stop looking if there are no more N-Grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);
  }

  /* Seed every level of the tree with the first n-gram. */
  for (i=0;i<=ng->n-2;i++) {
    ng->ind[i][0] = new_index(0,ng->ptr_table[i],&(ng->ptr_table_size[i]),0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

  store_normal_count(ng,0,current_ngram.count,ng->n-1);

  /* An n-gram at or below the cutoff gives its slot back; the next
     n-gram will overwrite it.
     NOTE(review): for n == 1 this reads cutoffs[-1]; confirm that n >= 2
     is enforced elsewhere. */
  if (current_ngram.count <= ng->cutoffs[ng->n-2])
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram,&current_ngram,ng->n);

  prev_id1 = current_ngram.id_array[0];

  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
        contains_unks=ngram_chk_contains_unks(&current_ngram,ng->n);

      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

        /* Test for where this ngram differs from last - do we have an
           out-of-order ngram? */
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,ng->n,nlines);

        nlines++;
        show_idngram_nlines(nlines, verbosity);

        /* Add new n-gram as soon as it is encountered */
        /* If all of the positions 2,3,...,n of the n-gram are context
           cues then ignore the n-gram.
           NOTE(review): no such check is visible in this function. */

        if (ng->n > 1) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

          store_normal_count(ng,ng->num_kgrams[ng->n-1],current_ngram.count,ng->n-1);

          ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
          ng->num_kgrams[ng->n-1]++;

          if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
            quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[ng->n-1],ng->n);
        }
        /* Deal with new 2,3,...,(n-1)-grams */

        for (i=ng->n-2;i>=MAX(1,pos_of_novelty);i--) {

          /* The previous (i+1)-gram is now complete: record its total. */
          NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

          if (ng_count[i] <= ng->cutoffs[i-1])
            ng->num_kgrams[i]--;
          else
            store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

          /* Start accumulating the new (i+1)-gram. */
          ng_count[i] = current_ngram.count;
          ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
          ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
                                                    ng->ptr_table[i],
                                                    &(ng->ptr_table_size[i]),
                                                    ng->num_kgrams[i]);
          ng->num_kgrams[i]++;

          if (ng->num_kgrams[i] >= ng->table_sizes[i])
            quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[i],i+1);
        }

        /* Orders whose prefix did not change keep accumulating. */
        for (i=0;i<=pos_of_novelty-1;i++)
          ng_count[i] += current_ngram.count;

        /* Deal with new 1-grams */

        if (pos_of_novelty == 0) {
          if (ng->n>1) {
            /* Every word id between the previous first word and this one
               (inclusive) gets an index entry pointing at the current
               end of the 2-gram table. */
            for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
              ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
                                        ng->ptr_table[0],
                                        &(ng->ptr_table_size[0]),
                                        i);
            }
            prev_id1 = current_ngram.id_array[0];
          }

          NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

          /* Context cues keep a marginal count but no normal count. */
          if (!ng->context_cue[previous_ngram.id_array[0]]) {
            ng->n_unigrams += ng_count[0];
            store_normal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
          }

          store_marginal_count(ng,previous_ngram.id_array[0],ng_count[0],0);

          ng_count[0] = current_ngram.count;
        }

        if (current_ngram.count <= ng->cutoffs[ng->n-2])
          ng->num_kgrams[ng->n-1]--;

        ngram_copy(&previous_ngram,&current_ngram,ng->n);

      }else {
        if (!displayed_oov_warning){
          pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
          displayed_oov_warning = 1;
        }
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  /* Flush the counts still being accumulated for the last n-gram. */
  for (i=ng->n-2;i>=1;i--) {

    NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

    if (ng_count[i] <= ng->cutoffs[i-1])
      ng->num_kgrams[i]--;
    else
      store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

  }

  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng,current_ngram.id_array[0],ng_count[0],0);
  }

  store_marginal_count(ng,current_ngram.id_array[0],ng_count[0],0);

  if (ng->n>1) {
    /* Index entries for the word ids after the last first-word seen. */
    for (i=current_ngram.id_array[0]+1;i<=ng->vocab_size;i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
                                ng->ptr_table[0],
                                &(ng->ptr_table_size[0]),
                                current_ngram.id_array[0]);
  }

  /* The idngram reading is completed at this point */
  pc_message(verbosity,2,"\n");

  /* Impose a minimum unigram count, if required */

  if (ng->min_unicount > 0) {

    int nchanged= 0;

    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if ((return_count(ng->four_byte_counts,
                        ng->count_table[0],
                        ng->count[0],
                        ng->count4[0],
                        i) < ng->min_unicount) && !ng->context_cue[i]) {

        /* There was a bug in V2's switch.  Look at segment for ABSOLUTE */
        NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng,i);

        /* Obtain the current count through return_count(), exactly as
           the test above does.  Reading ng->count[0][i] directly is
           wrong: in four-byte mode ng->count[0] is never allocated, and
           in two-byte mode it holds a table index rather than a count. */
        ng->n_unigrams += (ng->min_unicount -
                           return_count(ng->four_byte_counts,
                                        ng->count_table[0],
                                        ng->count[0],
                                        ng->count4[0],
                                        i));
        store_normal_count(ng,i,ng->min_unicount,0);
        nchanged++;
      }
    }

    if (nchanged > 0)
      pc_message(verbosity,2,
                 "Unigram counts of %d words were bumped up to %d.\n",
                 nchanged,ng->min_unicount);
  }

  /* Count zeroton information for unigrams */

  ng->freq_of_freq[0][0] = 0;

  for (i=ng->first_id;i<=ng->vocab_size;i++) {
    if (return_count(ng->four_byte_counts,
                     ng->count_table[0],
                     ng->count[0],
                     ng->count4[0],
                     i) == 0) {
      ng->freq_of_freq[0][0]++;
    }
  }

  if (ng->discounting_method == GOOD_TURING) {
    for (i=0;i<=ng->n-1;i++)
      for (j=1;j<=ng->fof_size[i];j++)
        pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity,2,"Calculating discounted counts.\n");

  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);

  /* Smooth unigram distribution, to give some mass to zerotons */
  compute_unigram(ng,verbosity);

  /* Increment Contexts if using Good-Turing discounting-> No need otherwise,
     since all values are discounted anyway. */

  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity,2,"Incrementing contexts...\n");

    for (i=ng->n-1;i>=1;i--)
      increment_context(ng,i,verbosity);
  }

  /* Calculate back-off weights */

  pc_message(verbosity,2,"Calculating back-off weights...\n");

  for (i=1;i<=ng->n-1;i++)
    compute_back_off(ng,i,verbosity);

  if (!ng->four_byte_alphas)
    pc_message(verbosity,3,"Number of out of range alphas = %d\n",
               ng->size_of_alpha_array);

  /* Write out LM */

  pc_message(verbosity,2,"Writing out language model...\n");

  if (ng->write_arpa)
    write_arpa_lm(ng,verbosity);

  if (ng->write_bin)
    write_bin_lm(ng,verbosity);

  pc_message(verbosity,0,"idngram2lm : Done.\n");

  return 0;
}
Пример #2
0
/*
 * Write the language model held in `ng` to ng->arpa_fp in ARPA text format.
 *
 * The header and the 1-gram section are written first.  Then, for each
 * higher order (i+1)-gram with i = 1 .. n-1, the n-gram tree is walked in
 * order and one line is written per entry: the log10 probability, the
 * words, and (for all but the highest order) the log10 back-off weight.
 *
 * Entries whose first word contains "</s>", and entries of order three or
 * more whose last word contains "<s>", are skipped and tallied in the
 * file-level skipped_* counters (HLW modification).
 *
 * After the "\end\" marker the stream is closed,
 * final_ngram_count_replacement() is called and the file-level counters
 * are reset to zero.
 *
 * ng         model to write; ng->arpa_fp must be open for writing.
 * verbosity  verbosity level forwarded to pc_message().
 *
 * Calls quit(-1, ...) if a computed n-gram probability exceeds 1.
 */
void write_arpa_lm(ng_t *ng,int verbosity) {
    
    int *current_pos;   /* current entry index at each tree level */
    int *end_pos;       /* last entry index of the active range per level */
    ngram_sz_t i;
    double log_10_of_e = 1.0 / log(10.0);
    
    /* HEADER */
    
    pc_message(verbosity,1,"ARPA-style %d-gram will be written to %s\n",ng->n,ng->arpa_filename);
    
    write_arpa_copyright(ng->arpa_fp,ng->n,ng->vocab_size, ng->vocab[1],ng->vocab[2],ng->vocab[3]);
    
    display_vocabtype(ng->vocab_type,ng->oov_fraction, ng->arpa_fp);  
    display_discounting_method(ng,ng->arpa_fp);
    write_arpa_format(ng->arpa_fp,ng->n);
    write_arpa_num_grams(ng->arpa_fp,ng,NULL,0);
    write_arpa_k_gram_header(ng->arpa_fp,1);
    
    /* 1-gram section */
    for (i=ng->first_id; i<= (int) ng->vocab_size;i++) {
        
        double log10_uniprob;
        double log10_alpha;
        double alpha;
        
        /* uni_log_probs is scaled by 1/ln(10) to obtain a base-10 log. */
        log10_uniprob = ng->uni_log_probs[i]*log_10_of_e;
        
        if (ng->uni_probs[i]<=0.0)
            log10_uniprob = BAD_LOG_PROB;
        
        alpha=ng_double_alpha(ng,0,i);
        
        if(alpha > 0.0)
            log10_alpha = log10(alpha);
        else
            log10_alpha = BAD_LOG_PROB;
        
        fprintf(ng->arpa_fp,"%.4f %s",log10_uniprob,ng->vocab[i]);
        if (ng->n>1)
            fprintf(ng->arpa_fp,"\t%.4f\n",log10_alpha);
        else
            fprintf(ng->arpa_fp,"\n");
    }
    
    current_pos = (int *) rr_malloc(ng->n*sizeof(int));
    end_pos = (int *) rr_malloc(ng->n*sizeof(int)); 
    
    /* Print 2-gram, ... (n-1)-gram info. */
    
    for (i=1;i<=ng->n-1;i++) {
        
        /* Print out the (i+1)-gram */
        
        int current_table, j;
        count_t ngcount, marg_count;
        double discounted_ngcount;    
        double ngprob, log_10_ngprob, ngalpha, log_10_ngalpha;
        
        /* Initialise variables for the sake of warning-free compilation */
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
        
        discounted_ngcount = 0.0;
        log_10_ngalpha = 0.0;
#endif
        write_arpa_k_gram_header(ng->arpa_fp,i+1);
        
        /* Go through the n-gram list in order */
        
        for (j=0;j<=ng->n-1;j++) {
            current_pos[j] = 0;
            end_pos[j] = 0;
        }
        
        for (current_pos[0]=ng->first_id;
             current_pos[0]<=(int) ng->vocab_size;
             current_pos[0]++) {
            
            /* Only first words with a non-zero marginal count are expanded. */
            if (return_count(ng->four_byte_counts,
                             ng->count_table[0], 
                             ng->marg_counts,
                             ng->marg_counts4,
                             current_pos[0]) > 0) {
                
                current_table = 1;
                
                /* The 2-gram range of this word ends just before the
                   range of the next word (or at the end of the table). */
                if (current_pos[0] == (int) ng->vocab_size)
                    end_pos[1] = (int ) ng->num_kgrams[1]-1;
                else {
                    end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
                                                ng->ptr_table[0],
                                                ng->ptr_table_size[0],
                                                current_pos[0]+1)-1;
                }
                
                /* Walk down to level i and back up; current_table is
                   the level currently being iterated. */
                while (current_table > 0) {
                    
                    if (current_table == i) {
                        
                        if (current_pos[i] <= end_pos[i]) {
                            
                            ngcount = return_count(ng->four_byte_counts,
                                                   ng->count_table[i],
                                                   ng->count[i],
                                                   ng->count4[i],
                                                   current_pos[i]);
                            
                            /* The context count is the marginal count for
                               2-grams and the normal count of the parent
                               entry for higher orders. */
                            if (i==1) {
                                marg_count = return_count(ng->four_byte_counts,
                                                          ng->count_table[0], 
                                                          ng->marg_counts,
                                                          ng->marg_counts4,
                                                          current_pos[0]);
                            }else {
                                marg_count = return_count(ng->four_byte_counts,
                                                          ng->count_table[i-1],
                                                          ng->count[i-1],
                                                          ng->count4[i-1],
                                                          current_pos[i-1]);
                            }
                            
                            if(ng->disc_meth==NULL)
                                ng->disc_meth=(disc_meth_t*) disc_meth_init(ng->discounting_method);
                            
                            assert(ng->disc_meth);
                            discounted_ngcount = 
                            NG_DISC_METH(ng)->dump_discounted_ngram_count(ng,i,ngcount,marg_count,current_pos);
                            
                            ngprob = (double) discounted_ngcount / marg_count;
                            
                            if (ngprob > 1.0) {
                                /* current_pos holds only ng->n entries, so
                                   index 2 exists only for trigram or higher
                                   models; print -1 in its place otherwise
                                   instead of reading past the array. */
                                fprintf(stderr,
                                        "discounted_ngcount = %f marg_count = %d %d %d %d\n",
                                        discounted_ngcount,marg_count,current_pos[0],
                                        current_pos[1],
                                        (ng->n > 2) ? current_pos[2] : -1);
                                quit(-1,"Error : probablity of ngram is greater than one.\n");
                            }
                            
                            if (ngprob > 0.0) 
                                log_10_ngprob = log10(ngprob);
                            else 
                                log_10_ngprob = BAD_LOG_PROB;
                            
                            /* Back-off weights exist for every order
                               except the highest. */
                            if (i <= ng->n-2) {
                                ngalpha = ng_double_alpha(ng, i, current_pos[i]);
                                
                                if (ngalpha > 0.0)
                                    log_10_ngalpha = log10(ngalpha);
                                else
                                    log_10_ngalpha = BAD_LOG_PROB;
                            }
                            // BEGIN HLW VERSION
                            if(((strstr (ng->vocab[current_pos[0]],"</s>")) == NULL)&&((i <= 1) || ((i > 1) && ((strstr (ng->vocab[(unsigned int) ng->word_id[i][current_pos[i]]],"<s>")) == NULL)))) { // if the overall entry is a trigram and it's going to end with <s>, skip it -- HLW
                                
                                fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob);
                                fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]);
                                for (j=1;j<=i;j++){
                                    
                                    fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]);
                                }
                                
                                if (i <= ng->n-2){
                                    fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha);
                                } else{
                                    fprintf(ng->arpa_fp,"\n");
                                }
                            } else {
                                // something is being skipped  -- HLW
                                // (i starts at 1 in this loop, so the
                                // i==0 branch is never taken here)
                                if(i==0) {
                                    skipped_unigrams++;
                                } else if(i==1) {
                                    skipped_bigrams++;
                                } else if (i==2) {
                                    skipped_trigrams++;
                                }
                            }
                            // END HLW VERSION
                            
                            current_pos[i]++;        
                        }else {
                            /* Range exhausted: go up one level and advance. */
                            current_table--;
                            if (current_table > 0)
                                current_pos[current_table]++;
                        }
                    }else {
                        
                        if (current_pos[current_table] <= end_pos[current_table]) {
                            /* Descend: work out where the child range of
                               the current entry ends. */
                            current_table++;
                            if (current_pos[current_table-1] == (int) ng->num_kgrams[current_table-1]-1)
                                end_pos[current_table] = (int) ng->num_kgrams[current_table]-1;
                            else {
                                end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],
                                                                        ng->ptr_table[current_table-1],
                                                                        ng->ptr_table_size[current_table-1],
                                                                        current_pos[current_table-1]+1) - 1;
                            }
                        }else {
                            current_table--;
                            if (current_table > 0)
                                current_pos[current_table]++;
                        }
                    }
                }
            }
        }
    } 
    
    free(current_pos);
    free(end_pos);
    
    fprintf(ng->arpa_fp,"\n\\end\\\n");
    
    rr_oclose(ng->arpa_fp);
    
    // BEGIN HLW ADDITION
    
    // Now that the file is complete, let's go back and replace the placeholder ngram counts with the real final counts  -- HLW
    
    final_ngram_count_replacement(ng->n,ng);
    
    unigram_count = 0;
    bigram_count = 0;
    trigram_count = 0;
    skipped_unigrams = 0;
    skipped_bigrams = 0;
    skipped_trigrams = 0;
    
    // END HLW ADDITION
} 
Пример #3
0
/*
 * Print a summary of the run parameters held in `ng`.
 *
 * Every line is emitted through pc_message() at priority 2.
 *
 * verbosity         verbosity level forwarded to pc_message().
 * ng                model whose settings are reported.
 * is_ascii          non-zero: input reported as ascii, otherwise binary.
 * mem_alloc_method  TWO_PASSES, BUFFER or SPECIFIED; any other value
 *                   prints no allocation detail.
 * buffer_size       size in MB, printed only for the BUFFER method.
 */
void report_param(int verbosity, ng_t *ng, 
		  flag is_ascii,
		  flag mem_alloc_method,
		  int buffer_size
		  )
{
  int k;

  /* --- input and output files --- */
  pc_message(verbosity,2,"  n : %d\n",ng->n);
  pc_message(verbosity,2,"  Input file : %s",ng->id_gram_filename);
  if (!is_ascii) {
    pc_message(verbosity,2,"     (binary format)\n");
  } else {
    pc_message(verbosity,2,"     (ascii format)\n");
  }

  pc_message(verbosity,2,"  Output files :\n");

  if (ng->write_arpa) {
    pc_message(verbosity,2,"     ARPA format   : %s\n",ng->arpa_filename);
  }
  if (ng->write_bin) {
    pc_message(verbosity,2,"     Binary format : %s\n",ng->bin_filename);
  }

  pc_message(verbosity,2,"  Vocabulary file : %s\n",ng->vocab_filename);
  if (ng->context_set) {
    pc_message(verbosity,2,"  Context cues file : %s\n",ng->context_cues_filename);
  }

  /* --- cutoffs: one entry per order from 2 up to n --- */
  pc_message(verbosity,2,"  Cutoffs :\n     ");
  k = 0;
  while (k <= ng->n-2) {
    pc_message(verbosity,2,"%d-gram : %d     ",k+2,ng->cutoffs[k]);
    k++;
  }

  pc_message(verbosity,2,"\n");

  /* --- vocabulary type --- */
  if (ng->vocab_type == CLOSED_VOCAB) {
    pc_message(verbosity,2,"  Vocabulary type : Closed\n");
  } else if (ng->vocab_type == OPEN_VOCAB_1) {
    pc_message(verbosity,2,"  Vocabulary type : Open - type 1\n");
  } else if (ng->vocab_type == OPEN_VOCAB_2) {
    pc_message(verbosity,2,"  Vocabulary type : Open - type 2\n");
    pc_message(verbosity,2,"     OOV fraction = %g\n",ng->oov_fraction);
  }

  /* --- count storage --- */
  pc_message(verbosity,2,"  Minimum unigram count : %d\n",ng->min_unicount);
  pc_message(verbosity,2,"  Zeroton fraction : %g\n",ng->zeroton_fraction);
  if (!ng->four_byte_counts) {
    pc_message(verbosity,2,"  Counts will be stored in two bytes.\n");
    pc_message(verbosity,2,"  Count table size : %d\n",ng->count_table_size);
  } else {
    pc_message(verbosity,2,"  Counts will be stored in four bytes.\n");
  }

  /* --- discounting: the method describes itself --- */
  pc_message(verbosity,2,"  Discounting method : ");
  NG_DISC_METH(ng)->verbose_method(ng, verbosity);

  /* --- memory allocation strategy --- */
  pc_message(verbosity,2,"  Memory allocation for tree structure : \n");
  if (mem_alloc_method == TWO_PASSES) {
    pc_message(verbosity,2,"     Perform a preliminary pass over the id n-gram file to determine \n     the amount of memory to allocate\n");
  } else if (mem_alloc_method == BUFFER) {
    pc_message(verbosity,2,"     Allocate %d MB of memory, shared equally between all n-gram tables.\n",buffer_size);
  } else if (mem_alloc_method == SPECIFIED) {
    pc_message(verbosity,2,"     Memory requirement specified.\n          ");
    k = 0;
    while (k <= ng->n-2) {
      /* table_sizes[0] is the unigram table, hence the k+1 offset */
      pc_message(verbosity,2,"%d-gram : %d     ",k+2,ng->table_sizes[k+1]);
      k++;
    }
    pc_message(verbosity,2,"\n");
  }

  /* --- back-off weight storage --- */
  pc_message(verbosity,2,"  Back-off weight storage : \n");

  if (!ng->four_byte_alphas) {
    pc_message(verbosity,2,"     Back-off weights will be stored in two bytes.\n");
    pc_message(verbosity,2,"        Minimum back-off weight : %g\n",ng->min_alpha);
    pc_message(verbosity,2,"        Maximum back-off weight : %g\n",ng->max_alpha);
    pc_message(verbosity,2,"        Maximum number of out of range back-off weights : %d\n",ng->out_of_range_alphas);
  } else {
    pc_message(verbosity,2,"     Back-off weights will be stored in four bytes.\n");
  }

}