Пример #1
0
/* Compute the back-off weights (alphas) for all (n-1)-gram contexts of an
   n-gram model.  For each context, the discounted conditional probabilities
   of all n-grams sharing that context are summed (sum_cond_prob), along with
   the back-off probabilities of the corresponding (n-1)-grams (sum_bo_prob);
   the alpha for the context is then
       alpha = (1 - sum_cond_prob) / (1 - sum_bo_prob)
   and is stored in ng->bo_weight4 (4-byte) or, quantized via short_alpha(),
   in ng->bo_weight.  The n-gram tree is traversed depth-first so that all
   n-grams sharing a context are visited consecutively.
   Parameters:
     ng        - the language model under construction (counts already loaded)
     n         - the n-gram level whose contexts get back-off weights
     verbosity - passed through to pc_message / bo_ng_prob */
void compute_back_off(ng_t *ng,int n, int verbosity) {

  int *current_pos;
  int *end_pos;
  id__t *sought_ngram;
  int current_table;
  int ng_count;
  int i;
  double sum_cond_prob;
  double sum_bo_prob;
  double discounted_ngcount;
  double cond_prob;
  double bo_prob;
  double discount_mass;
  double leftout_bo_prob;
  double alpha;

  int bo_case;

  sum_cond_prob = 0.0;
  sum_bo_prob = 0.0;

  /* For the sake of warning-free compilation... */

  discounted_ngcount = 0.0;
  
  /* current_pos[k] / end_pos[k] track the traversal position and the last
     valid index within table level k; rr_calloc presumably zero-fills and
     aborts on allocation failure (project allocator). */
  current_pos = (int *)rr_calloc(n+1,sizeof(int));
  sought_ngram = (id__t *) rr_calloc(n+1,sizeof(id__t));
  end_pos = (int *)rr_calloc(n+1,sizeof(int)); 
  
  /* Process the tree so that we get all the n-grams out in the right
     order. */
  
  /* Outer loop: one pass per unigram id (ids run first_id..vocab_size
     inclusive). */
  for (current_pos[0]=ng->first_id;
       current_pos[0]<=ng->vocab_size;
       current_pos[0]++) {
    
    /* Only unigrams with a non-zero marginal count root any n-grams. */
    if (return_count(ng->four_byte_counts,
		     ng->count_table[0],
		     ng->marg_counts,
		     ng->marg_counts4,
		     current_pos[0]) > 0) {

      current_table = 1;
      
      /* end_pos[1]: last bigram index under this unigram.  For the final
         unigram there is no successor pointer, so use the table size. */
      if (current_pos[0] == ng->vocab_size) {
	end_pos[1] = ng->num_kgrams[1]-1;
      }
      else {
 	end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
				    ng->ptr_table[0],
				    ng->ptr_table_size[0],
				    current_pos[0]+1)-1;
      }

      /* Depth-first walk: descend to level n, accumulate over each
         context's n-grams, then back out (current_table--). */
      while (current_table > 0) {

	if (current_table == n) {

	  /* At the deepest level: accumulate probabilities for every
	     n-gram of the current context. */
	  if (current_pos[n] <= end_pos[n]){

	    ng_count = return_count(ng->four_byte_counts,
				    ng->count_table[n],
				    ng->count[n],
				    ng->count4[n],
				    current_pos[n]);

	    /* Apply the model's discounting scheme to the raw count. */
	    switch (ng->discounting_method) {
	    case GOOD_TURING:
	      if (ng_count <= ng->disc_range[n]) {
		discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
	      }
	      else {
		discounted_ngcount = ng_count;
	      }
	      break;
	    case LINEAR:
	      discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
	      break;
	    case ABSOLUTE:
	      discounted_ngcount = ng_count - ng->abs_disc_const[n];
	      break;
	    case WITTEN_BELL:
	      /* Witten-Bell: scale by context_count/(context_count + types).
	         The unigram context (n==1) reads its count from the marginal
	         tables rather than count[n-1]. */
	      if (n==1) {

		discounted_ngcount = ((double) 
				      return_count(ng->four_byte_counts,
						   ng->count_table[0],
						   ng->marg_counts,
						   ng->marg_counts4,
						   current_pos[0]) * ng_count)
		  / (return_count(ng->four_byte_counts,
				  ng->count_table[0],
				  ng->marg_counts,
				  ng->marg_counts4,
				  current_pos[0]) + 
		     num_of_types(0,current_pos[0],ng));
	      }
	      else {
		
		discounted_ngcount = ((double) 
				      return_count(ng->four_byte_counts,
						   ng->count_table[n-1],
						   ng->count[n-1],
						   ng->count4[n-1],
						   current_pos[n-1])* ng_count)
		  / (return_count(ng->four_byte_counts,
				  ng->count_table[n-1],
				  ng->count[n-1],
				  ng->count4[n-1],
				  current_pos[n-1]) + 
		     num_of_types(n-1,current_pos[n-1],ng));

	      }	  
	      
	      break;
	    }

	    /* cond_prob = discounted count / context count; again the
	       unigram context uses the marginal count tables. */
	    if (n==1) {
	      cond_prob = ((double) discounted_ngcount / 
			   return_count(ng->four_byte_counts,
					ng->count_table[0],
					ng->marg_counts,
					ng->marg_counts4,
					current_pos[0]));
	    }
	    else {
	      cond_prob = ((double) discounted_ngcount /  
			   return_count(ng->four_byte_counts,
					ng->count_table[n-1],
					ng->count[n-1],
					ng->count4[n-1],
					current_pos[n-1]));

	    }
	    sum_cond_prob += cond_prob;

	    /* Fill up sought ngram array with correct stuff */

	    for (i=1;i<=n;i++) {
	      sought_ngram[i-1] = ng->word_id[i][current_pos[i]];
	    }


	    /* Accumulate the (n-1)-gram back-off probability of this
	       n-gram's suffix. */
	    bo_ng_prob(n-1,sought_ngram,ng,verbosity,&bo_prob,&bo_case);
	    sum_bo_prob += bo_prob;
	    current_pos[n]++;			
					       
	  }
	  else {

	    /* All n-grams of this context consumed: compute its alpha. */
	    discount_mass = 1.0 - sum_cond_prob;

	    /* Clamp tiny/negative masses (floating-point residue) to zero
	       and warn, since a zero back-off weight can zero out
	       probabilities downstream. */
	    if (discount_mass < 1e-10) {
	      discount_mass = 0.0;
	      pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
			 ng->vocab[current_pos[0]],current_pos[0]);
	      for (i=1;i<=n-1;i++) {
		pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
	      }
	      pc_message(verbosity,2,
			 "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",sum_cond_prob);
	    }

	    leftout_bo_prob = 1.0 - sum_bo_prob;
	    if (leftout_bo_prob < 1e-10) {
	      leftout_bo_prob = 0.0;
	    }

	    if (leftout_bo_prob > 0.0) {
	      alpha = discount_mass / leftout_bo_prob;
	    }
	    else {
	      alpha = 0.0;	/* Will not be used. Should happen very rarely. */
	      pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
			 ng->vocab[current_pos[0]],current_pos[0]);
	      for (i=1;i<=n-1;i++) {
		pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
	      }
	      pc_message(verbosity,2,
			 "is set to 0.\nMay cause problems with zero probabilities.\n");

	    }
	  
	    /* Store the alpha for this context: full precision, or
	       quantized into the shared alpha array (65535 minus the
	       out-of-range slots appears to be the usable code range). */
	    if (ng->four_byte_alphas) {
	      ng->bo_weight4[n-1][current_pos[n-1]] = alpha;
	    }
	    else {
	      ng->bo_weight[n-1][current_pos[n-1]] = 
		short_alpha(alpha,
			    ng->alpha_array,
			    &(ng->size_of_alpha_array),
			    65535 - ng->out_of_range_alphas,
			    ng->min_alpha,
			    ng->max_alpha);
	    }
	  
	    /* Finished current (n-1)-gram */

	    sum_cond_prob = 0.0;
	    sum_bo_prob = 0.0;
	    current_table--;
	    if (current_table > 0) {
	      current_pos[current_table]++;
	    }
	  }
	}
	else {

	  /* Intermediate level: descend into the next table if entries
	     remain, computing the child range, otherwise back out. */
	  if (current_pos[current_table] <= end_pos[current_table]) {
	    current_table++;
	    if (current_pos[current_table-1] == ng->num_kgrams[current_table-1]-1) {
	      end_pos[current_table] = ng->num_kgrams[current_table]-1;
	    }
	    else {
	      end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],ng->ptr_table[current_table-1],ng->ptr_table_size[current_table-1],current_pos[current_table-1]+1)-1;
	    }
	  }
	  else {
	    current_table--;
	    if (current_table > 0) {
	      current_pos[current_table]++;
	    }
	  }
	}
      }
    }

    /* Now deal with zeroton unigrams */

    else {
      /* Unseen unigram context at n==1: its back-off weight is exactly 1
         (no probability mass was discounted away). */
      if (n == 1) {
	if (ng->four_byte_alphas) {
	  ng->bo_weight4[0][current_pos[0]] = 1.0;
	}
	else {
	  ng->bo_weight[0][current_pos[0]] = 
	    short_alpha(1.0,
			ng->alpha_array,
			&(ng->size_of_alpha_array),
			65535 - ng->out_of_range_alphas,
			ng->min_alpha,
			ng->max_alpha);
	}
      }
    }
  }
  free(end_pos);
  free(current_pos);
  free(sought_ngram);
  
}
Пример #2
0
/* generate_words implements the "generate" command in the evallm command loop.

   It can take the following flags (yes, I know I should add this to the help
   text in evallm.c):
   -text filename    The name of the file where text will be generated.
   -size 10000       An optional flag specifying the number of generated words.
                     If unspecified, it will default to 10000.
   -seed 1237688     An optional flag specifying a seed for the random number
                     generator.  If unspecified, it will be set to the time in
                     seconds since the epoch.

   Caveats:
   1) It currently only works for binary language model files.

   2) It currently assumes the language model is a trigram, although it 
   probably wouldn't take much effort to make it more general.

   3) When generating the first word, it currently tries to use an initial 
   history of "<s> <s>" if "<s>" is in the vocabulary and just picks the word 
   at index 1 otherwise.  Something better could probably be done in this case
   (e.g. looking for other context cue words in the vocab list).

   4) Certain platforms appear to use something other than RAND_MAX for the
   maximum number that can be generated by random().  This could cause strange
   things to happen, but could be fixed very easily by replacing RAND_MAX by
   the appropriate platform-specific constant.
*/
/* Generate num_words words of random text from the language model and write
   them to output_filename.  Sampling draws p uniformly in [0,1] and walks the
   trigram distribution Pr(w3|w1,w2) either from the front (p < .5) or the
   back (p >= .5) of the vocabulary until the cumulative mass crosses p.
   Parameters:
     png             - binary-format model (the only format supported)
     pang            - ARPA-format model (currently rejected with an error)
     num_words       - number of words to generate
     random_seed     - RNG seed; -1 means "seed from the current time"
     output_filename - file the generated text is written to */
void generate_words(ng_t *png,arpa_lm_t *pang, int num_words,int random_seed,char *output_filename)
{
  FILE *output_file;
  int i,j,bo_case,initial_history_id;
  id__t sought_trigram[3];
  double p,acc,trigram_prob;
  vocab_sz_t lm_vocab_sz;
  char** lm_vocab;

  if(png!=NULL && pang!=NULL)
    quit(-1,"Confused by multiple input type.\n");

  /* BUG FIX: previously, when both png and pang were NULL, lm_vocab and
     lm_vocab_sz were left uninitialized and then dereferenced below
     (undefined behavior).  Fail loudly instead. */
  if(png==NULL && pang==NULL)
    quit(-1,"No language model supplied to generate from.\n");

  if(png!=NULL){
    lm_vocab_sz=png->vocab_size;
    lm_vocab=png->vocab;
  }

  if(pang!=NULL){
    quit(-1,"Currently doesn't support arpa input, please use the binary format created by idngram2lm.\n");
    lm_vocab_sz=pang->vocab_size;
    lm_vocab=pang->vocab;
  }

  if (!(output_file=fopen(output_filename,"w"))) {
    fprintf(stderr,"Error: could not open %s for writing.\n",output_filename);
    fprintf(stderr, "Syntax: generate -seed seed_of_random_generator -size size_of_file -text output text file \n");

    return;
  }

  if (random_seed==-1)
    random_seed=(unsigned int)time(NULL);
  
  srandom(random_seed);

  printf("Using %d as a random seed.\n",random_seed);

  /* Locate "<s>" in the vocabulary to seed the history; fall back to
     index 1 when absent. */
  initial_history_id = -1;
  for (i=0; i<lm_vocab_sz; ++i){
    if (!strcmp(BEGIN_OF_SENTENCE_STRING,lm_vocab[i])){
      initial_history_id = i;
      fprintf(stderr,"Found %s in the vocabulary at index %d.\n",
	      BEGIN_OF_SENTENCE_STRING,i);
      break;
    }
  }

  if (initial_history_id == -1)    {
    fprintf(stderr,"Did not find %s in the vocabulary.\n",
	    BEGIN_OF_SENTENCE_STRING);
      /* Okay, we should do something much more intelligent here than
         just picking a vocabulary word, but I don't have the time right
         now and the corpora I use have "<s>" and should hit this case 
         anyway.
      */
    initial_history_id=1;
  }

  sought_trigram[0]=initial_history_id;
  sought_trigram[1]=initial_history_id;
  fprintf(stderr,"Using an initial history of \"%s %s\"\n",
	  lm_vocab[sought_trigram[0]],lm_vocab[sought_trigram[1]]);   

  for (i=1; i<=num_words; ++i) {
    p=((double)random())/RAND_MAX; /* This is platform-specific and needs to be fixed */

    /* Walk the distribution from whichever end is closer to p, so on
       average only half the vocabulary is scanned per word. */
    if (p<.5){
      acc=0.0;
      for (j=0; j<=lm_vocab_sz; ++j){
	sought_trigram[2]=j;
	bo_ng_prob(2, sought_trigram, png, DEFAULT_VERBOSITY,
		   &trigram_prob, &bo_case);
	acc+=trigram_prob;
	if (p<=acc) break;
      }

      if (p>acc) 
	fprintf(stderr, "WARNING: The sum over w3 of Pr(w3|%s,%s) was %f,"
		"which was less than the randomly generated number %f.\n",
		lm_vocab[sought_trigram[0]],lm_vocab[sought_trigram[1]],acc,p);
    }else {
      acc=1.0;
      for (j=lm_vocab_sz; j>=0; --j){
	sought_trigram[2]=j;
	bo_ng_prob(2, sought_trigram, png, DEFAULT_VERBOSITY,
		   &trigram_prob, &bo_case);
	acc-=trigram_prob;
	if (p>=acc) break;
      }

      if (p<acc)
	fprintf(stderr, "WARNING: 1-(sum over w3 of Pr(w3|%s,%s) was %f,"
		"which was greater than the randomly generated number %f.\n",
		lm_vocab[sought_trigram[0]],lm_vocab[sought_trigram[1]],acc,p);
    }

    fprintf(output_file,"%s ",lm_vocab[sought_trigram[2]]);
    if (!(i%10000))
      printf("%d words output.\n",i);
    /* Shift the history window forward by one word. */
    sought_trigram[0] = sought_trigram[1];
    sought_trigram[1] = sought_trigram[2];
  }
  fprintf(output_file,"\n");

  /* BUG FIX: the output stream was never closed, leaking the handle and
     risking loss of buffered output.  fclose also flushes; report failure. */
  if (fclose(output_file) != 0)
    fprintf(stderr,"Error: could not close %s after writing.\n",
	    output_filename);
}
Пример #3
0
/* Return the back-off probability of sought_word given context, honoring any
   forced back-off requests in fb_list.  The effective (possibly truncated)
   context length is written to *acl and the back-off classification to
   *bo_case.  Exactly one of ng / arpa_ng is consulted, selected by arpa_lm. */
double calc_prob_of(id__t sought_word,
		    id__t *context,
		    int context_length,
		    ng_t *ng,
		    arpa_lm_t *arpa_ng,
		    fb_info *fb_list,
		    int *bo_case,
		    int *acl,
		    flag arpa_lm) {

  int pos;
  flag exclusive_bo;          /* forced back-off excludes the trigger word */
  int fb_index;               /* position of most recent forced-back-off word */
  int eff_ctx_len;            /* length of the context actually used */
  int copy_offset;            /* shift applied when copying the context */
  id__t *query;               /* n-gram handed to the probability routines */
  double prob;

  exclusive_bo = 0;

  /* <UNK> has no probability in a closed-vocabulary model. */
  if (arpa_lm) {
    if (sought_word == 0 && arpa_ng->vocab_type == CLOSED_VOCAB) {
      quit(-1,"Error : Cannot generate probability for <UNK> since this is a closed \nvocabulary model.\n");
    }   
  }
  else {
    if (sought_word == 0 && ng->vocab_type == CLOSED_VOCAB) {
      quit(-1,"Error : Cannot generate probability for <UNK> since this is a closed \nvocabulary model.\n");
    }
  }

  /* Scan backwards for the most recent word flagged for forced back-off. */
  fb_index = -1;
  for (pos = context_length - 1; pos >= 0; pos--) {
    if (fb_list[context[pos]].backed_off) {
      fb_index = pos;
      exclusive_bo = fb_list[context[pos]].inclusive ? 0 : 1;
      break;
    }
  }

  /* Truncate the context at the forced-back-off point; an inclusive
     back-off keeps the trigger word itself in the context. */
  eff_ctx_len = context_length - fb_index - 1;
  if (!exclusive_bo && fb_index != -1)
    eff_ctx_len++;

  query = (id__t *) rr_malloc(sizeof(id__t)*(eff_ctx_len+1));

  /* With no forced back-off, or an exclusive one, copy starting just past
     the trigger (or from position 0 when fb_index == -1 — same formula);
     an inclusive back-off starts at the trigger word itself. */
  copy_offset = (exclusive_bo || fb_index == -1) ? fb_index + 1 : fb_index;
  for (pos = 0; pos < eff_ctx_len; pos++)
    query[pos] = context[pos + copy_offset];

  query[eff_ctx_len] = sought_word;

  if (arpa_lm) {
    arpa_bo_ng_prob(eff_ctx_len,
		    query,
		    arpa_ng,
		    2,       /* Verbosity */
		    &prob,
		    bo_case);
  }
  else {
    bo_ng_prob(eff_ctx_len,
	       query,
	       ng,
	       2,       /* Verbosity */
	       &prob,
	       bo_case);
  }

  *acl = eff_ctx_len;

  free(query);

  return(prob);
}