Code example #1
File: idngram2lm.c  Project: 10v/cmusphinx
void init_ng_table_size(ng_t *ng, 
		        flag mem_alloc_method, 
		        flag is_ascii,
		        int verbosity,
		        int buffer_size
		        )
{
  int middle_size;
  int end_size;

  if (ng->n>1) {
    switch(mem_alloc_method) {

    case TWO_PASSES: 
      ng->table_sizes = (table_size_t *) rr_calloc(ng->n,sizeof(table_size_t));
      pc_message(verbosity,2,"Calculating memory requirement.\n");
      calc_mem_req(ng,is_ascii);
      break;
    case BUFFER:
      ng->table_sizes = (table_size_t *) rr_malloc(ng->n*sizeof(table_size_t));
      middle_size = sizeof(count_ind_t) + sizeof(bo_weight_t) + 
	sizeof(index__t) + sizeof(id__t);
      end_size = sizeof(count_ind_t) + sizeof(id__t);

      if (ng->four_byte_alphas)
	middle_size += 2;

      if (ng->four_byte_counts) {
	middle_size += 2;
	end_size += 2;
      }   

      guess_mem(buffer_size,
		middle_size,
		end_size,
		ng->n,
		ng->table_sizes,
		verbosity);
      break;
    case SPECIFIED:
      break;
    }  
  }else{
    ng->table_sizes = (table_size_t *) rr_calloc(1,sizeof(table_size_t));
  }

  ng->table_sizes[0] = ng->vocab_size+1;

}		
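
The BUFFER branch above derives per-entry byte sizes (middle_size for tables 2..n-1, end_size for the final table) and hands them to guess_mem together with a memory budget. A minimal standalone sketch of that sizing idea, assuming a simple even-split policy; the toolkit's actual guess_mem may allocate differently:

#include <stdio.h>

/* Hypothetical even-split policy: divide a byte budget across the
   2-gram .. n-gram tables by per-entry size. Illustrative only. */
static void guess_mem_sketch(long long budget_bytes, int middle_size,
                             int end_size, int n, int *table_sizes)
{
  long long per_table = budget_bytes / (n - 1);   /* tables 2..n */
  int i;
  for (i = 1; i <= n - 2; i++)
    table_sizes[i] = (int) (per_table / middle_size);
  table_sizes[n - 1] = (int) (per_table / end_size);
}

int main(void)
{
  int sizes[3] = {0, 0, 0};   /* sizes[0] stays vocab_size+1, as above */
  guess_mem_sketch(100LL * 1024 * 1024, 12, 6, 3, sizes);
  printf("2-gram slots: %d, 3-gram slots: %d\n", sizes[1], sizes[2]);
  return 0;
}
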
Code example #2
File: lm_combine.c  Project: QuinnEbert/zedom8or
void load_context_cue(arpa_lm_t* lm, char* ccs_filename)
{
  FILE* context_cues_fp;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;

  lm->context_cue = (flag *) rr_calloc(lm->table_sizes[0],sizeof(flag));    
  lm->no_of_ccs = 0;
  if (strcmp(ccs_filename,"")) {
    context_cues_fp = rr_iopen(ccs_filename);
    while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
      if (strncmp(wlist_entry,"##",2)==0) continue;
      sscanf (wlist_entry, "%s ",current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);
	
      if (sih_lookup(lm->vocab_ht,current_cc,&current_cc_id) == 0)
	quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
      
      lm->context_cue[(unsigned short) current_cc_id] = 1;
      lm->no_of_ccs++;
      fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id);
    }
    rr_iclose(context_cues_fp);
  }
}
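
The parsing convention used here, where lines beginning with "##" are comments and the first whitespace-delimited token of any other line is the cue word, recurs in several snippets below. A self-contained sketch of just that convention, using hypothetical input lines:

#include <stdio.h>
#include <string.h>

int main(void)
{
  /* Hypothetical .ccs lines: "##" starts a comment, otherwise the
     first whitespace-delimited token is the context cue word. */
  const char *lines[] = { "## header comment", "<s>", "<art>  trailing text" };
  char word[200];
  size_t i;

  for (i = 0; i < sizeof lines / sizeof lines[0]; i++) {
    if (strncmp(lines[i], "##", 2) == 0) continue;  /* skip comments */
    if (sscanf(lines[i], "%199s", word) == 1)
      printf("context cue: %s\n", word);
  }
  return 0;
}
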
Code example #3
File: calc_mem_req.c  Project: AIDman/Kaldi
void calc_mem_req(ng_t *ng,flag is_ascii) {

  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int i,j;

  current_ngram.id_array = (id__t *) rr_malloc(sizeof(id__t)*ng->n);
  previous_ngram.id_array = (id__t *) rr_malloc(sizeof(id__t)*ng->n);
  
  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));

  current_ngram.n = ng->n;

  rewind(ng->id_gram_fp);

  while (!rr_feof(ng->id_gram_fp)) {
    for (i=0;i<=ng->n-1;i++) {
      previous_ngram.id_array[i]=current_ngram.id_array[i];
    }
    get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);
    for (i=0;i<=ng->n-1;i++) {
      if (current_ngram.id_array[i] != previous_ngram.id_array[i]) {
	for (j=i;j<=ng->n-1;j++) {
	  if (j>0) {
	    if (ng_count[j] > ng->cutoffs[j-1]) {
	      ng->table_sizes[j]++;
	    }
	  }
	  ng_count[j] =  current_ngram.count;
	}
	i=ng->n;
      }
      else {
	ng_count[i] += current_ngram.count;
      }
    }
  }

  for (i=1;i<=ng->n-1;i++) {
    if (ng_count[i] > ng->cutoffs[i-1]) {
      ng->table_sizes[i]++;
    }
  }

  /* Add a fudge factor, as problems can crop up with having to
     cut-off last few n-grams. */

  for (i=1;i<=ng->n-1;i++) {
    ng->table_sizes[i]+=10;
  }

  rr_iclose(ng->id_gram_fp);
  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);

}
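
The scan above counts, for each k, how many distinct k-gram prefixes occur in the sorted id-n-gram stream by locating the first position where each n-gram differs from its predecessor. A standalone sketch of the same scan with hard-coded trigram ids; count accumulation and cutoffs are dropped for brevity, so this is effectively the cutoff-0 case:

#include <stdio.h>

#define N 3   /* trigram model */

int main(void)
{
  /* Sorted id trigrams as they would arrive from the .idngram file. */
  int grams[4][N] = { {1,2,3}, {1,2,4}, {1,5,3}, {2,2,3} };
  int prev[N] = {-1,-1,-1};
  int table_sizes[N] = {0,0,0};
  int g, i, j;

  for (g = 0; g < 4; g++) {
    for (i = 0; i < N; i++) {
      if (grams[g][i] != prev[i]) {          /* position of novelty */
        for (j = i; j < N; j++)
          table_sizes[j]++;                  /* new entry at every deeper level */
        break;                               /* plays the role of "i = ng->n" */
      }
    }
    for (i = 0; i < N; i++)
      prev[i] = grams[g][i];
  }

  for (i = 0; i < N; i++)
    printf("%d-gram table entries: %d\n", i + 1, table_sizes[i]);
  return 0;
}
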
Code example #4
File: ngram.c  Project: ramshasank/codebak
void ng_allocate_ptr_table(ng_t * ng, arpa_lm_t *arpa_ng, flag is_arpa )
{
    int i;
    if(is_arpa) {
        assert(arpa_ng);
        arpa_ng->ptr_table = (ptr_tab_t **) rr_malloc(sizeof(ptr_tab_t *)*arpa_ng->n);
        arpa_ng->ptr_table_size = (ptr_tab_sz_t *) rr_calloc(arpa_ng->n,sizeof(ptr_tab_sz_t));

        for (i=0; i<=arpa_ng->n-1; i++)
            arpa_ng->ptr_table[i] = (ptr_tab_t *) rr_calloc(65535,sizeof(ptr_tab_t));
    } else {
        assert(ng);
        ng->ptr_table = (ptr_tab_t **) rr_malloc(sizeof(ptr_tab_t *)*ng->n);
        ng->ptr_table_size = (ptr_tab_sz_t *) rr_calloc(ng->n,sizeof(ptr_tab_sz_t));

        for (i=0; i<=ng->n-1; i++)
            ng->ptr_table[i] = (ptr_tab_t *) rr_calloc(65535,sizeof(ptr_tab_t));
    }

}
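
ng_allocate_ptr_table reserves a 65535-entry ptr_tab_t array per n-gram level, which is exactly the 16-bit index space. A plausible reading (an assumption for illustration, not the toolkit's documented scheme) is that these tables let full 32-bit table positions be reconstructed from compact 16-bit stored indices, in the spirit of this sketch:

#include <stdio.h>

int main(void)
{
  /* Assumed scheme (not confirmed from the toolkit source): keep full
     32-bit positions only in a small base table and store each index
     as a 16-bit offset from its base, roughly halving index memory. */
  unsigned int   bases[] = { 0, 65536, 131072, 196608 };
  unsigned int   full    = 200000;                       /* real position */
  unsigned short offset  = (unsigned short)(full - bases[full >> 16]);
  unsigned int   rebuilt = bases[full >> 16] + offset;

  printf("full=%u stored offset=%u rebuilt=%u\n", full, offset, rebuilt);
  return 0;
}
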
Code example #5
File: idngram2lm.c  Project: 10v/cmusphinx
void read_vocab(ng_t* ng, int verbosity)
{
  vocab_sz_t test_cc_id;
  vocab_sz_t current_cc_id;
  char current_cc[200];
  char wlist_entry[1024];

  pc_message(verbosity,2,"Reading vocabulary.\n");

  /* Don't change the parameter of sih_create, because it will change
     the binary layout of the .binlm file */

  ng->vocab_ht =
    sih_create(1000,0.5,2.0,1);

  read_voc(ng->vocab_filename,verbosity,ng->vocab_ht,&ng->vocab,&(ng->vocab_size));
  
  /* Determine which of the vocabulary words are context cues */

  ng->no_of_ccs = 0;
  ng->context_cue = (flag *) rr_calloc(ng->vocab_size+1,sizeof(flag));

  if (ng->context_set) {
    /* This should be tied to l889 to l894 in lm_combine.c
     */
    while (fgets (wlist_entry, sizeof (wlist_entry),ng->context_cues_fp)) {
      if (strncmp(wlist_entry,"##",2)==0) continue;
      sscanf (wlist_entry, "%s ",current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(ng->vocab_ht,current_cc,&current_cc_id) == 0) 
	pc_message(verbosity,1,"Warning : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
      else {
	ng->context_cue[(unsigned short) current_cc_id] = 1;
	pc_message(verbosity,2,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
	ng->no_of_ccs++;
      }
    }
    rr_iclose(ng->context_cues_fp);
  }

  if ((sih_lookup(ng->vocab_ht,"<s>",&test_cc_id) != 0)) 
    if (ng->context_cue[(unsigned short) test_cc_id] == 0) 
      fprintf(stderr,"WARNING: <s> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

  if ((sih_lookup(ng->vocab_ht,"<p>",&test_cc_id) != 0)) 
    if (ng->context_cue[(unsigned short) test_cc_id] == 0) 
      fprintf(stderr,"WARNING: <p> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

  if ((sih_lookup(ng->vocab_ht,"<art>",&test_cc_id) != 0)) 
    if (ng->context_cue[(unsigned short) test_cc_id] == 0) 
      fprintf(stderr,"WARNING: <art> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

}
Code example #6
File: perplexity.c  Project: 10v/cmusphinx
void compute_perplexity(ng_t *ng,
			arpa_lm_t *arpa_ng,
			char *text_stream_filename,
			char *probs_stream_filename,
			char *annotation_filename,
			char *oov_filename,
			char *fb_list_filename,
			flag backoff_from_unk_inc,
			flag backoff_from_unk_exc,
			flag backoff_from_ccs_inc,
			flag backoff_from_ccs_exc,
			flag arpa_lm,
			flag include_unks,
			double log_base) {

  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* Hope that's big enough */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Initialise file pointers to prevent warnings from the compiler. */

  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;

  short_current_id = 0;

  found_unk_wrongly = 0;

  annotate = 0;

  bo_case = 0;

  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
			  (int) arpa_ng->vocab_size,
			  arpa_ng->vocab,
			  arpa_ng->context_cue,
			  backoff_from_unk_inc,
			  backoff_from_unk_exc,
			  backoff_from_ccs_inc,
			  backoff_from_ccs_exc,
			  fb_list_filename);
  }else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
			  (int) ng->vocab_size,
			  ng->vocab,
			  ng->context_cue,
			  backoff_from_unk_inc,
			  backoff_from_unk_exc,
			  backoff_from_ccs_inc,
			  backoff_from_ccs_exc,
			  fb_list_filename);
  }
  
  ngrams_hit = (int *) rr_calloc(n,sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *)*n);
  for (i=0;i<=n-1;i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);
  
  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid. Note that the checks employed by the
     standard rr_fopen tools are not suitable here, since we don't
     want the program to terminate if the paths are not found. */

  if (!strcmp(text_stream_filename,"")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    return;
  }

  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
    printf("Error : Can't open file %s for reading.\n",text_stream_filename);
    return;
  }

  out_probs = strcmp(probs_stream_filename,"");
  annotate = strcmp(annotation_filename,"");
  out_oovs = strcmp(oov_filename,"");

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n",text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
	    probs_stream_filename);

  if (annotate)
    printf("Annotation will be written to file %s\n",
	    annotation_filename);

  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
	    oov_filename);

  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");

  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");

  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");

  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");

  if (strcmp(fb_list_filename,"")) 
    printf("Will force back-off according to the contents of %s\n",
	    fb_list_filename);

  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check for existance of files, as rr functions will quit, which isn't
     what we want */

  if (out_probs && strcmp(probs_stream_filename,"-")) {
    if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (annotate && strcmp(annotation_filename,"-")) {
    if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",annotation_filename);
      return;
    }
    fclose(temp_fp);
  }
    
  if (out_oovs && strcmp(oov_filename,"-")) {
    if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",oov_filename);
      return;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);

  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);

  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    if (total_words > 0) {
      if (total_words < n)
	strcpy(prev_words[total_words-1],current_word);
      else {
	for (i=0;i<=n-3;i++)
	  strcpy(prev_words[i],prev_words[i+1]);

	if (n>1)
	  strcpy(prev_words[n-2],current_word);
      }
    }

    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill context with right stuff */

    if (total_words > (n-1)) {
      for (i=0;i<=context_length-2;i++)
	context[i] = context[i+1];
    }

    if (context_length != 0)
      context[context_length-1] = short_current_id;

    if (fscanf(text_stream_fp,"%s",current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
	printf("Error reading text file.\n");
	return;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      if (arpa_lm) {
	sih_lookup(arpa_ng->vocab_ht,current_word,&current_id);
	if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
	  found_unk_wrongly = 1;
	  printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
	}
	if (current_id > arpa_ng->vocab_size)
	  quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",context[i]); 
	else
	  short_current_id = current_id;

      }else {
	sih_lookup(ng->vocab_ht,current_word,&current_id);
	if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
	  found_unk_wrongly = 1;
	  printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
	}
	if (current_id > ng->vocab_size)
	  quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",context[i]); 
	else
	  short_current_id = current_id;

      }
    
      if (!found_unk_wrongly) {

	if (current_id == 0 && out_oovs)
	  fprintf(oov_fp,"%s\n",current_word);

	if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
	    || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

	  if (include_unks || current_id != 0) {

	    prob = calc_prob_of(short_current_id,
				context,
				context_length,
				ng,
				arpa_ng,
				fb_list,
				&bo_case,
				&actual_context_length,
				arpa_lm);


	    if (prob<= 0.0 || prob > 1.0) {
	      fprintf(stderr,"Warning : ");
	      if (short_current_id == 0)
		fprintf(stderr,"P( <UNK> | ");
	      else
		fprintf(stderr,"P( %s | ",current_word);
	  
	      for (i=0;i<=actual_context_length-1;i++) {
		if (context[i+context_length-actual_context_length] == 0)
		  fprintf(stderr,"<UNK> ");
		else
		  fprintf(stderr,"%s ",prev_words[i]);
	      }
	      fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
	      fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n",
		      bo_case, actual_context_length);
	    }
	  
	    if (annotate) {
	      if (short_current_id == 0)
		fprintf(annotation_fp,"P( <UNK> | ");
	      else 
		fprintf(annotation_fp,"P( %s | ",current_word);
	  
	      for (i=0;i<=actual_context_length-1;i++) {
		if (context[i+context_length-actual_context_length] == 0)
		  fprintf(annotation_fp,"<UNK> ");
		else {
		  if (arpa_lm)
		    fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
		  else
		    fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
		}
	      }
	      fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
	      decode_bo_case(bo_case,actual_context_length,annotation_fp);
	    }

	    /* Calculate level to which we backed off */

	    for (i=actual_context_length-1;i>=0;i--) {
 	      int four_raise_i = 1<<(2*i);  /* PWP */
 
 	      /*
 	       * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
 	       * but was getting a divide-by-zero error on an Alpha
 	       * (it isn't clear to me why it should ever have done so)
 	       * Anyway, it is much faster to do in base-4.
 	       */

	      if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
		ngrams_hit[i+1]++;
		i = -2;
	      }else
		bo_case -= ((bo_case / four_raise_i) * four_raise_i);
	    }
  
	    if (i != -3) 
	      ngrams_hit[0]++;

	    if (out_probs)
	      fprintf(probs_stream_fp,"%g\n",prob);
      
	    sum_log_prob += log10(prob);
			  
	  }

          if (current_id == 0 && !include_unks)
            excluded_unks++;
	}       
	else {
	  if (((!arpa_lm) && ng->context_cue[current_id]) || 
	      (arpa_lm && arpa_ng->context_cue[current_id]))
	    excluded_ccs++;
	}
	total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {      /*  pow(x,y) = e**(y  ln(x)) */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n", 
	    exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) * 
		log(10.0)),
	   (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) * 
	    log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
	   total_words-excluded_ccs-excluded_unks);
    for(i=n;i>=1;i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n",i,ngrams_hit[i-1],
	     (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
	   excluded_unks,
	   (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);
    
  }

  rr_iclose(text_stream_fp);

  if (out_probs)
    rr_oclose(probs_stream_fp);
  if (annotate)
    rr_oclose(annotation_fp);
  if (out_oovs)
    rr_oclose(oov_fp);

  free (fb_list);
  free (context);
  free (ngrams_hit);
}
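
The "level to which we backed off" bookkeeping near the end decodes bo_case as a base-4 number, one two-bit digit per context position, which is why the PWP comment replaced pow(3,i) with a shift. A standalone mirror of that decoding loop, returning the n-gram order that was actually hit:

#include <stdio.h>

/* Mirrors the decoding loop above: scan context positions from the
   longest, treating bo_case as base-4 digits (four_raise_i == 4^i). */
static int ngram_order_hit(int bo_case, int context_length)
{
  int i;
  for (i = context_length - 1; i >= 0; i--) {
    int four_raise_i = 1 << (2 * i);
    if (bo_case == 0 || bo_case / four_raise_i == 0)
      return i + 2;                    /* corresponds to ngrams_hit[i+1]++ */
    bo_case %= four_raise_i;           /* same as subtracting the digit   */
  }
  return 1;                            /* fell through: ngrams_hit[0]++   */
}

int main(void)
{
  printf("%d\n", ngram_order_hit(0, 2));   /* trigram hit  -> 3 */
  printf("%d\n", ngram_order_hit(4, 2));   /* bigram hit   -> 2 */
  printf("%d\n", ngram_order_hit(5, 2));   /* unigram only -> 1 */
  return 0;
}
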
Code example #7
File: evallm.c  Project: JonGBowen/GoodVibes
int oe_02_main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  
  if (!arpa_lm && !binary_lm)
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,""))
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  n=arpa_lm?
    arpa_ng.n:
    ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);

	warn_on_wrong_vocab_comments(wlist_entry);
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0)
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if(strlen(input_string) < sizeof(input_string)-1)
      input_string[strlen(input_string)-1] = '\0'; //chop new-line
    else 
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list",""));
    
      text_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs",""));

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }else
	    /* do perplexity sentence by sentence [20090612] (air) */
	    if (!strcmp(args[0],"uttperp")) {
	      FILE *uttfh,*tempfh;
	      char utt[4096]; /* live dangerously... */
	      char tmpfil[128];
	      if ((uttfh = fopen(text_stream_filename,"r")) == NULL) {
		printf("Error: can't open %s\n",text_stream_filename);
		exit(1);
	      }
            char *template = "uttperp_XXXXXX";// CHANGED HLW
            mkstemp(template);// CHANGED HLW
Code example #8
File: lm_combine.c  Project: QuinnEbert/zedom8or
void combine_lm(arpa_lm_t *arpa_lm, arpa_lm_t *lm1, arpa_lm_t *lm2)
{
	char *in_line;
	char *input_line;
	int i,j,k;
	int num_of_args;
	int pos_of_novelty;
	char *input_line_ptr_orig;
	char *word_copy;
	id__t *previous_ngram;
	id__t *current_ngram;
	vocab_sz_t temp_id;
	vocab_sz_t *pos_in_list;
	int previd;
	TBROWSE_UNION bru;
	char** words;
	
	words=(char**)NewArray(15,MAX_WORD,sizeof(char));
	
	in_line = (char *) rr_malloc(1024*sizeof(char));
	input_line = (char *) rr_malloc(1024*sizeof(char));
#import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__	
	input_line_ptr_orig = input_line;
#endif	
	
	/* Read number of each k-gram */
	
	arpa_lm->table_sizes = (table_size_t *) rr_malloc(sizeof(table_size_t)*11);
    
	arpa_lm->num_kgrams = (ngram_sz_t *) rr_malloc(sizeof(ngram_sz_t)*11);
		
	calc_merged_ngram_num(arpa_lm, lm1, lm2);
	
	previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
	current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));

	pos_in_list = (vocab_sz_t *) rr_malloc(sizeof(vocab_sz_t) * arpa_lm->n);
	ng_arpa_lm_alloc_struct(arpa_lm);
	
	/* Process 1-grams */
	
	printf("Reading unigrams...\n");
	
	i=0;
	
	begin_browse_union(lm1,lm2,1,&bru);
	
	while (get_next_ngram_union(words,&bru)) {
	  word_copy = rr_salloc(words[0]);
	  /* Do checks about open or closed vocab */
	  check_open_close_vocab(arpa_lm,word_copy,&i);
	}
	
	/* Process 2, ... , n-1 grams */
#import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__	
	previd = -1;
	
	for (i=2;i<=arpa_lm->n-1;i++) {
		
		printf("\nReading %d-grams...\n",i);
		
		previd = -1;
		
		j=0;
		
		for (k=0;k<=arpa_lm->n-1;k++) {
			pos_in_list[k] = 0;
		}
		
		begin_browse_union(lm1,lm2,i,&bru);
		while (get_next_ngram_union(words,&bru)) {
			
			/* Process line into all relevant temp_words */			
			num_of_args = 0;						
#endif		
			sih_lookup(arpa_lm->vocab_ht,words[i-1],&temp_id);
			arpa_lm->word_id[i-1][j] = temp_id;
			
			show_dot(j);
			
			j++;
			if (j>arpa_lm->table_sizes[i-1]) {
				quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i);
			}
			
			/* Make sure that indexes in previous table point to 
			the right thing. */
			
			for (k=0;k<=i-1;k++) {
				previous_ngram[k] = current_ngram[k];
				sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id);
				if (temp_id == 0 && strcmp(words[k],"<UNK>")) {
					quit(-1,"Error - found unknown word in n-gram file : %s\n",
						words[k]);
				}
				current_ngram[k] = temp_id;
			}
			
			/* Find position of novelty */
			
			/*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/
			if (j==1) pos_of_novelty=0;
			else {
				pos_of_novelty = i;
				
				for (k=0;k<=i-1;k++) {
					if (current_ngram[k] > previous_ngram[k]) {
						pos_of_novelty = k;
						k = arpa_lm->n;
					}
					else {
						if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) { /* '<': the '>' case was taken above, so '>' here was dead code */
							quit(-1,"Error : n-grams are not correctly ordered.\n");
						}
					}
				}
			}
			
			if (pos_of_novelty == i && j != 1)
			  quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
			       i);
			
			if (pos_of_novelty != i-1) {
				if (i==2) {
					/* Deal with unigram pointers */
					
					for (k = previd + 1; k <= current_ngram[0]; k++) {
						arpa_lm->ind[0][k] = new_index(j-1,
							arpa_lm->ptr_table[0],
							&(arpa_lm->ptr_table_size[0]),
							k);
					}
					previd = current_ngram[0];
				}else {
					
					for (k=pos_of_novelty;k<=i-2;k++) {
						if (k == 0) {
							pos_in_list[0] = current_ngram[0];
						}
						else {
							pos_in_list[k] = 
								MIN(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
								arpa_lm->ptr_table[k-1],   
								arpa_lm->ptr_table_size[k-1],   
								pos_in_list[k-1]),pos_in_list[k]);
							while (arpa_lm->word_id[k][pos_in_list[k]] < 
								current_ngram[k]) {
								pos_in_list[k]++;
							}
						}
					}
					for (k = previd + 1; k <= pos_in_list[i-2]; k++) {
						arpa_lm->ind[i-2][k] = 
							new_index(j-1,
							arpa_lm->ptr_table[i-2],
							&(arpa_lm->ptr_table_size[i-2]),
							k);
					}
					previd = pos_in_list[i-2];	    
				}
			}
		}
	
		/* Now need to tidy up pointers for bottom section of unigrams */
	
		for (k = previd + 1; k <= arpa_lm->vocab_size; k++) {
			arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1],
				arpa_lm->ptr_table[0],
				&(arpa_lm->ptr_table_size[0]),
				k);
		}      
	
	}
  
	printf("\nReading %d-grams...\n",arpa_lm->n);
	
	j = 0;
	previd = 0;
	
	arpa_lm->ind[arpa_lm->n-2][0] = 0;
	
	for (k=0;k<=arpa_lm->n-1;k++) {
		/* bug fixed by Wei Xu : this is a serious bug*/
		pos_in_list[k] = 0;
		//    pos_in_list[0] = 0;
	}
	
	begin_browse_union(lm1,lm2,arpa_lm->n,&bru);
	while (get_next_ngram_union(words,&bru)) {

	  show_dot(j);
	  
		sih_lookup(arpa_lm->vocab_ht,words[arpa_lm->n-1],&temp_id);
		
		arpa_lm->word_id[arpa_lm->n-1][j] = temp_id;
		
		j++;
		
		for (k=0;k<=arpa_lm->n-1;k++) {
			previous_ngram[k] = current_ngram[k];
			sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id);
			if (temp_id == 0 && strcmp(words[k],"<UNK>")) {
				quit(-1,"Error - found unknown word in n-gram file : %s\n",
					words[k]);
			}
			current_ngram[k] = temp_id;
		}
		
		/* Find position of novelty */
		
		/*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/
		if (j==1) pos_of_novelty=0;
		else {
			pos_of_novelty = arpa_lm->n+1;
			
			for (k=0;k<=arpa_lm->n-1;k++) {
				if (current_ngram[k] > previous_ngram[k]) {
					pos_of_novelty = k;
					k = arpa_lm->n;
				}else {
					if ((current_ngram[k] < previous_ngram[k]) && (j>0)) { /* '<': ordering check, as above */
						quit(-1,"Error : n-grams are not correctly ordered.\n");
					}
				}
			}
		}
		
		if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
			quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
				arpa_lm->n);
		}
		
		if (pos_of_novelty != arpa_lm->n-1) {
			
			for (k=pos_of_novelty;k<=arpa_lm->n-2;k++) {
				if (k == 0) {
					pos_in_list[0] = current_ngram[0];
				}else {
					pos_in_list[k] = 
						MAX(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
						arpa_lm->ptr_table[k-1],   
						arpa_lm->ptr_table_size[k-1],   
						pos_in_list[k-1]),pos_in_list[k]);
					while (arpa_lm->word_id[k][pos_in_list[k]] < 
						current_ngram[k]) {
						pos_in_list[k]++;
					}
				}
			}
			for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) {
				arpa_lm->ind[arpa_lm->n-2][k] = 
					new_index(j-1,
					arpa_lm->ptr_table[arpa_lm->n-2],
					&(arpa_lm->ptr_table_size[arpa_lm->n-2]),
					k);
			}
			previd = pos_in_list[arpa_lm->n-2];
		}
		
		if (j>arpa_lm->table_sizes[arpa_lm->n-1]) {
			quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n-1);
		}
	}
	
	
	
	/* Tidy up */
	
	
	free(previous_ngram);
	free(current_ngram);
	free(in_line);
	free(input_line);
	DeleteArray(words);
  
}
Code example #9
File: mergeidngram.c  Project: JonGBowen/GoodVibes
int oe_15_main( int argc, char **argv )
{
  FILE **fin;
  ngram *ng;
  ngram outng;
  flag *done, finished;
  int i, j, nfiles;

  /* Process the command line */
  report_version(&argc,argv);
  procComLine( &argc, argv ) ;
  if( argc < 2 ) {
    printUsage( argv[0] ) ;
    exit( 1 ) ;
  }
  nfiles = argc - 1;

  /* allocate memory */
  fin = (FILE **) rr_malloc( sizeof( FILE *) * nfiles );
  done = (flag *) rr_malloc( sizeof( flag ) * nfiles );
  ng = (ngram *) rr_malloc( sizeof( ngram ) * nfiles );

  for( i = 0; i < nfiles; i++ ) {
    ng[i].id_array = (id__t *) rr_calloc( n, sizeof( id__t ) );
    ng[i].n = n;
  }
  outng.id_array = (id__t *) rr_calloc( n, sizeof( id__t ) );
  outng.n = n;

  /* open the input files */
  for( i = 0; i < nfiles; i++ )
    fin[i] = rr_iopen( argv[i+1] );

  /* read first ngram from each file */
  for( i = 0; i < nfiles; i++ ) {
    done[i] = 0;
    if ( !get_ngram( fin[i], &ng[i], ascii_in ) )
      done[i] = 1;
  }

  finished = 0;
  while ( !finished ) {
  /* set outng to max possible */
    for( i = 0; i < n; i++ )
      outng.id_array[i] = MAX_VOCAB_SIZE;
    
    /* find smallest ngram */
    for( i = 0; i < nfiles; i++ ) {
      if ( !done[i] ) 
	if ( cmp_ngram( &outng, &ng[i] ) > 0 ) 
	  for( j = 0; j < n; j++ ) outng.id_array[j] = ng[i].id_array[j];
    }
    
    outng.count = 0;
    for( i = 0; i < nfiles; i++ ) {
      if ( !done[i] ) {
	/* add counts of equal ngrams */
	if ( cmp_ngram( &outng, &ng[i] ) == 0 ) {
	  outng.count += ng[i].count;
	  if ( !get_ngram( fin[i], &ng[i], ascii_in ) ) {
	    /* check if all files done */
	    done[i] = 1;
	    finished = 1;
	    for( j = 0; j < nfiles; j++ ) 
	      if ( ! done[j] ) finished = 0;
	  }
	}
      }
    }

    write_ngram( stdout, &outng, ascii_out );
  }

  for( i = 0; i < nfiles; i++ )
    rr_iclose( fin[i] );

  fprintf(stderr,"mergeidngram : Done.\n");

  return( 0 );
}
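
The merge loop picks the smallest current n-gram across all input files, sums the counts of every file positioned on an equal n-gram, and advances those files. A standalone sketch of the same k-way merge over hard-coded sorted (id, count) streams instead of files:

#include <stdio.h>

#define NFILES 2
#define SENTINEL 999999   /* stands in for MAX_VOCAB_SIZE / end of stream */

int main(void)
{
  /* Two sorted (id, count) streams, hard-coded instead of read from files. */
  int ids[NFILES][4]    = { {1, 3, 7, SENTINEL}, {3, 5, 7, SENTINEL} };
  int counts[NFILES][4] = { {2, 1, 4, 0},        {6, 2, 1, 0} };
  int pos[NFILES] = {0, 0};

  for (;;) {
    int i, smallest = SENTINEL, total = 0;
    for (i = 0; i < NFILES; i++)                 /* find smallest current id */
      if (ids[i][pos[i]] < smallest)
        smallest = ids[i][pos[i]];
    if (smallest == SENTINEL)
      break;                                     /* all streams exhausted */
    for (i = 0; i < NFILES; i++) {               /* sum counts of equal ids */
      if (ids[i][pos[i]] == smallest) {
        total += counts[i][pos[i]];
        pos[i]++;
      }
    }
    printf("%d %d\n", smallest, total);
  }
  return 0;
}
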
Code example #10
File: idngram2stats.c  Project: QuinnEbert/zedom8or
int oe_03_main (int argc, char **argv) {

  flag first_ngram;
  int n;
  fof_sz_t fof_size;
  flag is_ascii;
  int verbosity;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int pos_of_novelty;
  int nlines;
  int i;

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    oe_04_help_message();
    exit(1);
  }

  is_ascii = pc_flagarg(&argc, argv,"-ascii_input");
  n = pc_intarg(&argc, argv,"-n",3);
  fof_size = pc_intarg(&argc, argv,"-fof_size",50);
  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  pc_report_unk_args(&argc,argv,verbosity);

  pc_message(verbosity,2,"n        = %d\n",n);
  pc_message(verbosity,2,"fof_size = %d\n",fof_size);

  current_ngram.n = n;
  previous_ngram.n = n;
  pos_of_novelty = n;
  
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *) * (n-1));
  for (i=0;i<=n-2;i++) 
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(n-1,sizeof(count_t));

  current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));

  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  nlines = 0;
  first_ngram = 1;
  
  while (!rr_feof(stdin)) {
    
    if (!first_ngram)
      ngram_copy(&previous_ngram,&current_ngram,n);

    if (get_ngram(stdin,&current_ngram,is_ascii)) {

      nlines++;
      show_idngram_nlines(nlines, verbosity);
    
      /* Test for where this ngram differs from last - do we have an
	 out-of-order ngram? */
    
      if (!first_ngram)
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,n,nlines);
      else
        pos_of_novelty = 0;

      /* Add new N-gram */
     
      num_kgrams[n-2]++;
      if (current_ngram.count <= fof_size) 
	fof_array[n-2][current_ngram.count]++;

      if (!first_ngram) {
	for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	  num_kgrams[i-1]++;
	  if (ng_count[i-1] <= fof_size) 
	    fof_array[i-1][ng_count[i-1]]++;
	  
	  ng_count[i-1] = current_ngram.count;
	}
      } else {
	for (i=n-2;i>=MAX(1,pos_of_novelty);i--) 
	  ng_count[i-1] = current_ngram.count;
      }
	
      for (i=0;i<=pos_of_novelty-2;i++) 
	ng_count[i] += current_ngram.count;
	
      if (first_ngram)
        first_ngram = 0;
    }
  }

  /* Process last ngram */

  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
    num_kgrams[i-1]++;
    if (ng_count[i-1] <= fof_size) {
      fof_array[i-1][ng_count[i-1]]++;
    }
    ng_count[i-1] = current_ngram.count;
  }
 #import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__ 
  for (i=0;i<=pos_of_novelty-2;i++)
    ng_count[i] += current_ngram.count;

  display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
#endif
  pc_message(verbosity,0,"idngram2stats : Done.\n");

  exit(0);
  
}
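
The fof_array built here is a frequency-of-frequencies table: fof_array[k-2][c] counts how many distinct k-grams occurred exactly c times, up to fof_size. A minimal standalone version for a single n-gram order, with hypothetical counts:

#include <stdio.h>

#define FOF_SIZE 5

int main(void)
{
  /* Hypothetical per-n-gram counts; fof[c] = number of distinct n-grams
     seen exactly c times. These statistics drive Good-Turing discounting. */
  int counts[] = { 1, 1, 2, 1, 3, 2, 1, 8 };
  int fof[FOF_SIZE + 1] = { 0 };
  int i, num = (int)(sizeof counts / sizeof counts[0]);

  for (i = 0; i < num; i++)
    if (counts[i] <= FOF_SIZE)
      fof[counts[i]]++;

  for (i = 1; i <= FOF_SIZE; i++)
    printf("n-grams seen %d time(s): %d\n", i, fof[i]);
  return 0;
}
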
Code example #11
File: compute_back_off.c  Project: AIDman/Kaldi
void compute_back_off(ng_t *ng,int n, int verbosity) {

  int *current_pos;
  int *end_pos;
  id__t *sought_ngram;
  int current_table;
  int ng_count;
  int i;
  double sum_cond_prob;
  double sum_bo_prob;
  double discounted_ngcount;
  double cond_prob;
  double bo_prob;
  double discount_mass;
  double leftout_bo_prob;
  double alpha;

  int bo_case;

  sum_cond_prob = 0.0;
  sum_bo_prob = 0.0;

  /* For the sake of warning-free compilation... */

  discounted_ngcount = 0.0;
  
  current_pos = (int *)rr_calloc(n+1,sizeof(int));
  sought_ngram = (id__t *) rr_calloc(n+1,sizeof(id__t));
  end_pos = (int *)rr_calloc(n+1,sizeof(int)); 
  
  /* Process the tree so that we get all the n-grams out in the right
     order. */
  
  for (current_pos[0]=ng->first_id;
       current_pos[0]<=ng->vocab_size;
       current_pos[0]++) {
    
    if (return_count(ng->four_byte_counts,
		     ng->count_table[0],
		     ng->marg_counts,
		     ng->marg_counts4,
		     current_pos[0]) > 0) {

      current_table = 1;
      
      if (current_pos[0] == ng->vocab_size) {
	end_pos[1] = ng->num_kgrams[1]-1;
      }
      else {
 	end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
				    ng->ptr_table[0],
				    ng->ptr_table_size[0],
				    current_pos[0]+1)-1;
      }

      while (current_table > 0) {

	if (current_table == n) {

	  if (current_pos[n] <= end_pos[n]){

	    ng_count = return_count(ng->four_byte_counts,
				    ng->count_table[n],
				    ng->count[n],
				    ng->count4[n],
				    current_pos[n]);

	    switch (ng->discounting_method) {
	    case GOOD_TURING:
	      if (ng_count <= ng->disc_range[n]) {
		discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
	      }
	      else {
		discounted_ngcount = ng_count;
	      }
	      break;
	    case LINEAR:
	      discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
	      break;
	    case ABSOLUTE:
	      discounted_ngcount = ng_count - ng->abs_disc_const[n];
	      break;
	    case WITTEN_BELL:
	      if (n==1) {

		discounted_ngcount = ((double) 
				      return_count(ng->four_byte_counts,
						   ng->count_table[0],
						   ng->marg_counts,
						   ng->marg_counts4,
						   current_pos[0]) * ng_count)
		  / (return_count(ng->four_byte_counts,
				  ng->count_table[0],
				  ng->marg_counts,
				  ng->marg_counts4,
				  current_pos[0]) + 
		     num_of_types(0,current_pos[0],ng));
	      }
	      else {
		
		discounted_ngcount = ((double) 
				      return_count(ng->four_byte_counts,
						   ng->count_table[n-1],
						   ng->count[n-1],
						   ng->count4[n-1],
						   current_pos[n-1])* ng_count)
		  / (return_count(ng->four_byte_counts,
				  ng->count_table[n-1],
				  ng->count[n-1],
				  ng->count4[n-1],
				  current_pos[n-1]) + 
		     num_of_types(n-1,current_pos[n-1],ng));

	      }	  
	      
	      break;
	    }

	    if (n==1) {
	      cond_prob = ((double) discounted_ngcount / 
			   return_count(ng->four_byte_counts,
					ng->count_table[0],
					ng->marg_counts,
					ng->marg_counts4,
					current_pos[0]));
	    }
	    else {
	      cond_prob = ((double) discounted_ngcount /  
			   return_count(ng->four_byte_counts,
					ng->count_table[n-1],
					ng->count[n-1],
					ng->count4[n-1],
					current_pos[n-1]));

	    }
	    sum_cond_prob += cond_prob;

	    /* Fill up sought ngram array with correct stuff */

	    for (i=1;i<=n;i++) {
	      sought_ngram[i-1] = ng->word_id[i][current_pos[i]];
	    }


	    bo_ng_prob(n-1,sought_ngram,ng,verbosity,&bo_prob,&bo_case);
	    sum_bo_prob += bo_prob;
	    current_pos[n]++;			
					       
	  }
	  else {

	    discount_mass = 1.0 - sum_cond_prob;

	    if (discount_mass < 1e-10) {
	      discount_mass = 0.0;
	      pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
			 ng->vocab[current_pos[0]],current_pos[0]);
	      for (i=1;i<=n-1;i++) {
		pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
	      }
	      pc_message(verbosity,2,
			 "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",sum_cond_prob);
	    }

	    leftout_bo_prob = 1.0 - sum_bo_prob;
	    if (leftout_bo_prob < 1e-10) {
	      leftout_bo_prob = 0.0;
	    }

	    if (leftout_bo_prob > 0.0) {
	      alpha = discount_mass / leftout_bo_prob;
	    }
	    else {
	      alpha = 0.0;	/* Will not be used. Should happen very rarely. */
	      pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
			 ng->vocab[current_pos[0]],current_pos[0]);
	      for (i=1;i<=n-1;i++) {
		pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
	      }
	      pc_message(verbosity,2,
			 "is set to 0.\nMay cause problems with zero probabilities.\n");

	    }
	  
	    if (ng->four_byte_alphas) {
	      ng->bo_weight4[n-1][current_pos[n-1]] = alpha;
	    }
	    else {
	      ng->bo_weight[n-1][current_pos[n-1]] = 
		short_alpha(alpha,
			    ng->alpha_array,
			    &(ng->size_of_alpha_array),
			    65535 - ng->out_of_range_alphas,
			    ng->min_alpha,
			    ng->max_alpha);
	    }
	  
	    /* Finished current (n-1)-gram */

	    sum_cond_prob = 0.0;
	    sum_bo_prob = 0.0;
	    current_table--;
	    if (current_table > 0) {
	      current_pos[current_table]++;
	    }
	  }
	}
	else {

	  if (current_pos[current_table] <= end_pos[current_table]) {
	    current_table++;
	    if (current_pos[current_table-1] == ng->num_kgrams[current_table-1]-1) {
	      end_pos[current_table] = ng->num_kgrams[current_table]-1;
	    }
	    else {
	      end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],ng->ptr_table[current_table-1],ng->ptr_table_size[current_table-1],current_pos[current_table-1]+1)-1;
	    }
	  }
	  else {
	    current_table--;
	    if (current_table > 0) {
	      current_pos[current_table]++;
	    }
	  }
	}
      }
    }

    /* Now deal with zeroton unigrams */

    else {
      if (n == 1) {
	if (ng->four_byte_alphas) {
	  ng->bo_weight4[0][current_pos[0]] = 1.0;
	}
	else {
	  ng->bo_weight[0][current_pos[0]] = 
	    short_alpha(1.0,
			ng->alpha_array,
			&(ng->size_of_alpha_array),
			65535 - ng->out_of_range_alphas,
			ng->min_alpha,
			ng->max_alpha);
	}
      }
    }
  }
  free(end_pos);
  free(current_pos);
  free(sought_ngram);
  
}
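
Once the discounted conditional probabilities of a context have been summed, the back-off weight is the leftover discount mass divided by the back-off probability mass not already covered, with tiny residues clamped to zero exactly as above. A standalone numeric sketch with assumed probability sums:

#include <stdio.h>

int main(void)
{
  /* Assumed example sums for one context; the real values come from the
     tree traversal above. */
  double sum_cond_prob = 0.85;   /* discounted conditional probs, summed */
  double sum_bo_prob   = 0.60;   /* back-off probs of the same n-grams   */

  double discount_mass   = 1.0 - sum_cond_prob;
  double leftout_bo_prob = 1.0 - sum_bo_prob;
  double alpha;

  if (discount_mass   < 1e-10) discount_mass   = 0.0;  /* clamp residue */
  if (leftout_bo_prob < 1e-10) leftout_bo_prob = 0.0;

  alpha = (leftout_bo_prob > 0.0) ? discount_mass / leftout_bo_prob : 0.0;
  printf("alpha = %.4f\n", alpha);   /* 0.15 / 0.40 = 0.3750 */
  return 0;
}
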
Code example #12
File: ac_lmfunc_impl.c  Project: Ankit77/cmusphinx
void merge_idngramfiles (int start_file, 
		      int end_file, 
		      char *temp_file_root,
		      char *temp_file_ext,
		      int max_files,
		      FILE *outfile,
		      flag write_ascii,
		      int fof_size,
		      int n_order) {
  FILE *new_temp_file;
  char temp_string[1000];
  char *new_temp_filename;
  
  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;

  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int i,j;
  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;
  
  n = n_order;
  
  pos_of_novelty = n; /* Simply for warning-free compilation */
  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;
  
  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
  temp_filename = (char **) rr_malloc(sizeof(char *) * 
				      (end_file-start_file+1));

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) * 
						(end_file-start_file+1));
  for (i=0;i<=end_file-start_file;i++) 
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));

  finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++) 
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  if (end_file-start_file+1 > max_files) {
    sprintf(temp_string,"%s/%hu%s",temp_file_root,
	    end_file+1,temp_file_ext);
    new_temp_filename = salloc(temp_string);
    new_temp_file = rr_oopen(new_temp_filename);
    merge_tempfiles(start_file,start_file+max_files-1,
		    temp_file_root,temp_file_ext,max_files,
		    new_temp_file,write_ascii,0);
    merge_tempfiles(start_file+max_files,end_file+1,
		    temp_file_root,temp_file_ext,max_files,
		    outfile,write_ascii,0);
  }else {

    /* Open all the temp files for reading */
    for (i=0;i<=end_file-start_file;i++) {
      sprintf(temp_string,"%s/%hu%s",temp_file_root,
	      i+start_file,temp_file_ext);
      temp_filename[i] = salloc(temp_string);
      temp_file[i] = rr_iopen(temp_filename[i]);
    }
    
    /* Now go through the files simultaneously, and write out the appropriate
       ngram counts to the output file. */

    for (i=end_file-start_file;i>=0;i--) {
      finished[i] = 0;
      if (!rr_feof(temp_file[i])) {
	for (j=0;j<=n-1;j++) {
	  rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
		   temp_file[i],"temporary n-gram ids",0);
	}    
	rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
		 temp_file[i],"temporary n-gram counts",0);
      }
    }
    
    all_finished = 0;

    while (!all_finished) {

      /* Find the smallest current ngram */
      for (i=0;i<=n-1;i++) 
	smallest_ngram[i] = MAX_WORDID;

      for (i=0;i<=end_file-start_file;i++) {
	if (!finished[i]) {
	  if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
	    for (j=0;j<n;j++)
	      smallest_ngram[j] = current_ngram[i][j];
	  }
	}
      }

#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 smallest_ngram[i] by definition cannot contain any value
	 greater than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      for (i=0;i<=n-1;i++) {
	if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
	  quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
	}
      }
#endif
	  
      /* For each of the files that are currently holding this ngram,
	 add its count to the temporary count, and read in a new ngram
	 from the files. */

      temp_count = 0;

      for (i=0;i<=end_file-start_file;i++) {
	if (!finished[i]) {
	  if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
	    temp_count = temp_count + current_ngram_count[i];
	    if (!rr_feof(temp_file[i])) {
	      for (j=0;j<=n-1;j++) {
		rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
			 temp_file[i],"temporary n-gram ids",0);
	      }
	      rr_fread((char*)&current_ngram_count[i],sizeof(int),1,
		       temp_file[i],"temporary n-gram count",0);
	    }else {
	      finished[i] = 1;
	      all_finished = 1;
	      for (j=0;j<=end_file-start_file;j++) {
		if (!finished[j]) 
		  all_finished = 0;
	      }
	    }
	  }
	}
      }
      
      if (write_ascii) {
	for (i=0;i<=n-1;i++) {

	  if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0) 
	    {
	      quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
	    }
	}
	if (fprintf(outfile,"%d\n",temp_count) < 0)  
	  quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");

      }else {
	for (i=0;i<=n-1;i++) {
	  rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
		    outfile,"n-gram ids");
	}
	rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");		   
      }

      if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */
	
	/* Code from idngram2stats */	
	pos_of_novelty = n;
	for (i=0;i<=n-1;i++) {
	  if (smallest_ngram[i] > previous_ngram[i]) {
	    pos_of_novelty = i;
	    i=n;
	  }
	}
	  
	/* Add new N-gram */
	  
	num_kgrams[n-2]++;
	if (temp_count <= fof_size)
	  fof_array[n-2][temp_count]++;

	if (!first_ngram) {
	  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	    num_kgrams[i-1]++;
	    if (ng_count[i-1] <= fof_size) {
	      fof_array[i-1][ng_count[i-1]]++;
	    }
	    ng_count[i-1] = temp_count;
	  }
	}else {
	  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	    ng_count[i-1] = temp_count;
	  }
	  first_ngram = 0;
	}
	  
	for (i=0;i<=pos_of_novelty-2;i++)
	  ng_count[i] += temp_count;

	for (i=0;i<=n-1;i++)
	  previous_ngram[i]=smallest_ngram[i];

      }
    }
    
    for (i=0;i<=end_file-start_file;i++) {
      fclose(temp_file[i]);
      remove(temp_filename[i]); 
    }
    
  }    

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size)
	fof_array[i-1][ng_count[i-1]]++;

      ng_count[i-1] = temp_count;
    }
    
    for (i=0;i<=pos_of_novelty-2;i++)
      ng_count[i] += temp_count;

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);

  }

}
Code example #13
int main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id; /* sih_lookup writes through a vocab_sz_t pointer */
  FILE *context_cues_fp;
  int n;
  int generate_size = 10000;
  int random_seed;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
   fprintf(stderr,"evallm : Evaluate a language model.\n");
   fprintf(stderr,"Usage : evallm [ -binary .binlm | \n");
   fprintf(stderr,"                 -arpa .arpa [ -context .ccs ] ]\n");
   exit(1);
  }

  lm_filename_arpa = salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,"")) {
    arpa_lm = 1;
  }
  else {
    arpa_lm = 0;
  }

  lm_filename_binary = salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,"")) {
    binary_lm = 1;
  }
  else {
    binary_lm = 0;
  }

  if (arpa_lm && binary_lm) {
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  }
  
  if (!arpa_lm && !binary_lm) {
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");
  }

  ccs_filename = salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,"")) {
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");
  }

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }
  else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  if (!arpa_lm) {
    n=ng.n;
  }
  else {
    n=arpa_ng.n;
  }

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);
	if (strncmp(wlist_entry,"#",1)==0) {
	  fprintf(stderr,"\n\n===========================================================\n");
	  fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
	  fprintf(stderr,     ">>> %s <<<\n",wlist_entry);
	  fprintf(stderr,     "         '%s' will be included in the context cues list\n",current_cc);
	  fprintf(stderr,     "         (comments must start with '##')\n");
	  fprintf(stderr,"===========================================================\n\n");
	}
	
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0) {
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	}
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : ");
    if (fgets(input_string, sizeof(input_string), stdin) != NULL)
      input_string[strcspn(input_string, "\n")] = '\0'; /* gets() is unsafe (removed in C11); strip the newline fgets keeps */

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

  
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = salloc(pc_stringarg(&num_of_args,args,
					     "-backoff_from_list",""));
    
      text_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = salloc(pc_stringarg(&num_of_args,args,"-oovs",""));


      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {
      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }
	  else {
	    if (!strcmp(args[0],"validate")) {

	      if (num_of_args != n) {
		fprintf(stderr,"Error : must specify %d words of context.\n",
			n-1);
	      }
	      else {
	      
		/* Assume last n-1 parameters form context */
	      
		validate(&ng,
			 &arpa_ng,
			 &(args[num_of_args-n+1]),
			 backoff_from_unk_inc,
			 backoff_from_unk_exc,
			 backoff_from_ccs_inc,
			 backoff_from_ccs_exc,
			 arpa_lm,
			 fb_list_filename);
	      }
	    }
	    else {
	      if (!strcmp(args[0],"stats")) {
		if (arpa_lm) {
		  display_arpa_stats(&arpa_ng);
		}
		else {
		  display_stats(&ng);
		}
	      }
	      else {
		if (!strcmp(args[0],"quit")) {
		  told_to_quit=1;
		}
        else if (!strcmp(args[0],"generate")) {

        if(arpa_lm)
          generate_words(NULL,&arpa_ng,generate_size,random_seed,text_stream_filename);
        else
          generate_words(&ng,NULL,generate_size,random_seed,text_stream_filename);

      }
		else {
		  if (!strcmp(args[0],"help")) {

		    printf("The user may specify one of the following commands: \n");
		    printf("\n");
		    printf(" - perplexity\n");
		    printf("\n");
		    printf("Computes the perplexity of a given text. May optionally specify words\n");
		    printf("from which to force back-off.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("perplexity -text .text\n");
		    printf("         [ -probs .fprobs ]\n");
		    printf("         [ -oovs .oov_file ]\n");
		    printf("         [ -annotate .annotation_file ]         \n");
		    printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
		    printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
		    printf("         [ -backoff_from_list .fblist ]\n");
		    printf("         [ -include_unks ]\n");
		    printf("\n");
		    printf(" - validate\n");
		    printf("       \n");
		    printf("Calculate the sum of the probabilities of all the words in the\n");
		    printf("vocabulary given the context specified by the user.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
		    printf("           -backoff_from_list .fblist ]\n");
		    printf("         [ -forced_backoff_inc | -forced_back_off_exc ]      \n");
		    printf("           word1 word2 ... word_(n-1)\n");
		    printf("\n");
		    printf("Where n is the n in n-gram. \n");
		    printf("\n");
		    printf(" - help\n");
		    printf("\n");
		    printf("Displays this help message.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("help\n");
		    printf("\n");
		    printf(" - quit\n");
		    printf("\n");
		    printf("Exits the program.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("quit\n");	

		  } 
	      
		  else {
		    fprintf(stderr,"Unknown command : %s\nType \'help\'\n",
			    args[0]);
		  }
		}
	      }
	    }
	  }
	}
      }
    }    
  }

  fprintf(stderr,"evallm : Done.\n");

  exit(0);
  
}
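
For orientation: the dispatcher above is the tail of evallm's interactive command loop. A session driving it might look like this (a sketch; the prompt and file names are illustrative):

  evallm : perplexity -text test.text -probs test.fprobs
  evallm : stats
  evallm : quit

The command name arrives in args[0]; the pc_* helpers have already stripped the flags, so only the command and its positional words remain when the strcmp chain runs.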
Code example #14
File: idngram2lm.c Project: 10v/cmusphinx
ng_t * init_ng(
	     int* argc,
	     char **argv,
	     int verbosity
	     )
{
  int i;
  ng_t* ng;
  
  ng=(ng_t*) rr_calloc(1,sizeof(ng_t));

  ng->disc_meth=NULL;
  /* -n */
  ng->n = pc_intarg(argc, argv,"-n",DEFAULT_N); 

  if (ng->n<1) 
    quit(-1,"Error: Value of n must be larger than zero.\n");

  /* -cutoffs */
  ng->cutoffs = (cutoff_t *) pc_shortarrayarg(argc, argv, "-cutoffs",ng->n-1,ng->n-1);

  if (ng->cutoffs == NULL) 
    ng->cutoffs = (cutoff_t *) rr_calloc((ng->n-1)+1,sizeof(cutoff_t)); /* +1 to allow for the correction applied when writing in write_lms.c */

  for (i=0;i<=ng->n-3;i++) {
    if (ng->cutoffs[i+1] < ng->cutoffs[i]) {
      quit(-1,"Error - cutoffs for (n+1)-gram must be greater than or equal to those for \nn-gram. You have %d-gram cutoff = %d > %d-gram cutoff = %d.\n",i+2,ng->cutoffs[i],i+3,ng->cutoffs[i+1]);
    }
  }

  /* -min_unicount */
  ng->min_unicount = pc_intarg(argc, argv, "-min_unicount",0);

  /* -idngram */
  ng->id_gram_filename = salloc(pc_stringarg(argc, argv,"-idngram",""));

  if (!strcmp(ng->id_gram_filename,""))
    quit(-1,"Error: id ngram file not specified. Use the -idngram flag.\n");

  /* -arpa & -bin */
  ng->arpa_filename = salloc(pc_stringarg(argc, argv,"-arpa",""));
  ng->bin_filename = salloc(pc_stringarg(argc, argv,"-binary",""));
  
  ng->write_arpa = strcmp("",ng->arpa_filename);
  ng->write_bin = strcmp("",ng->bin_filename);
  
  if (!(ng->write_arpa || ng->write_bin)) 
    quit(-1,"Error : must specify either an arpa, or a binary output file.\n");

  ng->count_table_size = DEFAULT_COUNT_TABLE_SIZE;

  /* -vocab */
  ng->vocab_filename = salloc(pc_stringarg(argc,argv,"-vocab",""));
 
  if (!strcmp("",ng->vocab_filename))
    quit(-1,"Error : vocabulary file not specified. Use the -vocab option.\n");  

  /* -context */
  ng->context_cues_filename = salloc(pc_stringarg(argc,argv,"-context",""));

  ng->context_set = strcmp("", ng->context_cues_filename);

  /* -vocab_type */
  ng->vocab_type = pc_intarg(argc,argv,"-vocab_type",1);
  
  /* -oov_fraction */
  ng->oov_fraction = pc_doublearg(argc, argv,"-oov_fraction",-1.0);


  if (ng->oov_fraction == -1.0)
    ng->oov_fraction=DEFAULT_OOV_FRACTION;
  else {
    if (ng->vocab_type != 2)
      pc_message(verbosity,1,"Warning : OOV fraction specified, but will not be used, since vocab type is not 2.\n");
  }

  if (ng->vocab_type == 0) 
    ng->first_id = 1;
  else
    ng->first_id = 0;

  /* Allow both "min_alpha" etc and "min_bo_weight" etc as valid
     syntax. The "bo_weight" form is preferred, but the "alpha" form is
     maintained as it was present in version 2.00 */

  ng->min_alpha = pc_doublearg(argc,argv,"-min_alpha",DEFAULT_MIN_ALPHA);
  ng->max_alpha = pc_doublearg(argc,argv,"-max_alpha",DEFAULT_MAX_ALPHA);
  ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_alphas",
				      DEFAULT_OUT_OF_RANGE_ALPHAS);

  ng->min_alpha = pc_doublearg(argc,argv,"-min_bo_weight",ng->min_alpha);
  ng->max_alpha = pc_doublearg(argc,argv,"-max_bo_weight",ng->max_alpha);
  ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_bo_weights",
				      ng->out_of_range_alphas);
  
  if (ng->min_alpha >= ng->max_alpha)
    quit(-1,"Error : Minimum of alpha range must be less than the maximum.\n");


  init_ng_disc_method(ng,
		      pc_flagarg(argc, argv,"-linear"),
		      pc_flagarg(argc,argv,"-absolute"),
		      pc_flagarg(argc,argv,"-witten_bell"),
		      pc_flagarg(argc,argv,"-good_turing"));		      
		      
  ng->disc_range = (unsigned short *) pc_shortarrayarg(argc, argv, "-disc_ranges",ng->n,ng->n);

  ng->disc_range_set = (ng->disc_range != NULL);

  if (ng->discounting_method == GOOD_TURING) {
    if (!ng->disc_range_set) {
      ng->disc_range = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng->n);
      ng->disc_range[0] = DEFAULT_DISC_RANGE_1;
      for (i=1;i<=ng->n-1;i++) 
	ng->disc_range[i] = DEFAULT_DISC_RANGE_REST;
    }
    ng->fof_size = (fof_sz_t *) rr_malloc(sizeof(fof_sz_t) * ng->n);
    for (i=0;i<=ng->n-1;i++) 
      ng->fof_size[i] = ng->disc_range[i]+1;

  }else {
    if (ng->disc_range_set) 
      pc_message(verbosity,2,"Warning : discount ranges specified will be ignored, since they only apply\nto Good Turing discounting.\n");
  }

  ng->four_byte_alphas = !(pc_flagarg(argc, argv, "-two_byte_alphas") || 
			   pc_flagarg(argc, argv, "-two_byte_bo_weights"));

  ng->four_byte_counts = pc_flagarg(argc, argv, "-four_byte_counts");
  if(ng->four_byte_counts){
      pc_message(verbosity,2,"Using Four byte counts.\n");
  }

  ng->zeroton_fraction = pc_doublearg(argc,argv,"-zeroton_fraction",1.0);

  /* Attempt to open all the files that we will need for input and
     output. It is better to do it here than to spend a few hours of
     CPU processing id-gram counts, only to find that the output path
     is invalid. */

  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);

  /* Vocab is read by Roni's function which does the file opening for
     us, so no need to do it here. Don't need to worry about time
     being lost if file doesn't exist, since vocab is first thing to
     be read anyway. */

  if (ng->context_set)
    ng->context_cues_fp = rr_iopen(ng->context_cues_filename);

  if (ng->write_arpa)
    ng->arpa_fp = rr_oopen(ng->arpa_filename);

  if (ng->write_bin) 
    ng->bin_fp = rr_oopen(ng->bin_filename);

  return ng;
}
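
init_ng only parses flags and opens the input/output files; a command line exercising the options it reads might be (a sketch; file names hypothetical):

  idngram2lm -idngram corpus.idngram -vocab corpus.vocab -arpa corpus.arpa \
             -n 3 -good_turing -cutoffs 1 1

At least one of -arpa and -binary is required, per the write_arpa/write_bin check above.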
Code example #15
File: idngram2lm.c Project: 10v/cmusphinx
int main(int argc, char **argv) {

  int i,j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method; /* Method used to decide how much memory to 
			   allocate for count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count; /* Array indicating the number of occurrences of 
			   the current 1-gram, 2-gram, ... ,n-gram 
			   Size depends on #define in general.h
			*/  
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;

  flag displayed_oov_warning; /* Whether the OOV warning has been displayed */

  /*  ------------------  Process command line --------------------- */

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng=init_ng(
	    &argc,
	    argv,
	    verbosity
	    );
    
    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);
    
    if (!strcmp(ng->id_gram_filename,"-") && mem_alloc_method == TWO_PASSES)
      quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");
    
    is_ascii = set_lmformat(pc_flagarg(&argc,argv,"-ascii_input"),
			    pc_flagarg(&argc,argv,"-bin_input"),
			    ng);  

    /* Report parameters */
    report_param(verbosity,ng,
		 is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc,argv,verbosity);

  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng,verbosity);
       		     
  /* --------------- Allocate space for the table_size array --------- */
  init_ng_table_size(ng, 
		     mem_alloc_method,
		     is_ascii,
		     verbosity,
		     buffer_size
		     );

  /* ----------- Allocate memory for tree structure -------------- */
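
  /* Layout sketch, inferred from the allocations below: level i of
     the trie is a set of parallel arrays of size table_sizes[i].
     Every level has a count array (count[i] or count4[i]); levels
     0..n-2 also carry a back-off weight (bo_weight[i] or
     bo_weight4[i]) and ind[i], which locates the first child at
     level i+1; the last level keeps only word ids and counts.  When
     four_byte_counts is off, count[i] holds two-byte indices into
     count_table[i], the table of distinct count values seen. */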

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);    
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);

  }else {
    for (i=0;i<=ng->n-1;i++) 
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size+1,
						sizeof(count_t));

    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n",ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
						ng->table_sizes[0]);
  }else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
						ng->table_sizes[0]);
  }

  ng->ind = (index__t **)  rr_malloc(sizeof(index__t *)*ng->n);

  /* First table */
  if (ng->four_byte_counts) 
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],sizeof(count_t));
  else 
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
					   ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
					       ng->table_sizes[0]);

  if (ng->n >=2) 
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],sizeof(index__t));

  for (i=1;i<=ng->n-2;i++) {    
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[i]);

    if (ng->four_byte_counts) 
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[i]);
    else 
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[i]);

    if (ng->four_byte_alphas) 
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*ng->table_sizes[i]);
    else 
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*ng->table_sizes[i]);
    
    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng->table_sizes[i]);

    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) + 
		sizeof(index__t) + sizeof(id__t);
    
    if (ng->four_byte_alphas) 
      mem_alloced += 4;
   
    mem_alloced *= ng->table_sizes[i];
    
    pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
	       mem_alloced,i+1);
    
  }

  ng->word_id[ng->n-1] = (id__t *) 
    rr_malloc(sizeof(id__t)*ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts) 
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[ng->n-1]);    
  else 
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[ng->n-1]);

  pc_message(verbosity,2,"Allocated (%d+%d) bytes to table for %d-grams.\n",
	     ng->four_byte_counts?sizeof(count_t):sizeof(count_ind_t),
	     sizeof(id__t)*ng->table_sizes[ng->n-1],ng->n);
  
  /* Allocate memory for table for first-byte of indices */

  ng_allocate_ptr_table(ng,NULL,0);

  /* Allocate memory for alpha array */

  ng->alpha_array = (double *) rr_malloc(sizeof(double)*ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for frequency of frequency information */

  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *)*ng->n);

  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read n-grams into the tree */
  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for ngrams id arrays */

  current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;
  
  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process first n-gram */  
  get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks){
    /* Stop looking if there are no more N-Grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);
  }

  for (i=0;i<=ng->n-2;i++) {
    ng->ind[i][0] = new_index(0,ng->ptr_table[i],&(ng->ptr_table_size[i]),0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

  store_normal_count(ng,0,current_ngram.count,ng->n-1);

  if (current_ngram.count <= ng->cutoffs[ng->n-2]) 
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram,&current_ngram,ng->n);

  prev_id1 = current_ngram.id_array[0];
    
  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
	contains_unks=ngram_chk_contains_unks(&current_ngram,ng->n);
    
      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

	/* Test for where this ngram differs from last - do we have an
	   out-of-order ngram? */
	pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,ng->n,nlines);
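	/* pos_of_novelty is the first position at which the two
	   n-grams differ: e.g. (hypothetical ids) previous = 4 7 9
	   and current = 4 8 2 give pos_of_novelty = 1.  Positions to
	   its right close their subtrees below; positions to its
	   left merely accumulate counts. */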
    
	nlines++; 
	show_idngram_nlines(nlines, verbosity);
    
	/* Add new n-gram as soon as it is encountered */
	/* If all of the positions 2,3,...,n of the n-gram are context
	   cues then ignore the n-gram. */
    
	if (ng->n > 1) {
	  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);
	        
	  store_normal_count(ng,ng->num_kgrams[ng->n-1],current_ngram.count,ng->n-1);
	  
	  ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
	  ng->num_kgrams[ng->n-1]++;	  
	  
	  if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
	    quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[ng->n-1],ng->n);
	}
	/* Deal with new 2,3,...,(n-1)-grams */
      
	for (i=ng->n-2;i>=MAX(1,pos_of_novelty);i--) {

	  NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);
	  
	  if (ng_count[i] <= ng->cutoffs[i-1]) 
	    ng->num_kgrams[i]--;
	  else
	    store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

	  ng_count[i] = current_ngram.count;
	  ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
	  ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
						    ng->ptr_table[i],
						    &(ng->ptr_table_size[i]),
						    ng->num_kgrams[i]);
	  ng->num_kgrams[i]++;
	
	  if (ng->num_kgrams[i] >= ng->table_sizes[i])
	    quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[i],i+1);	  
	}
      
	for (i=0;i<=pos_of_novelty-1;i++) 
	  ng_count[i] += current_ngram.count;
      
	/* Deal with new 1-grams */
      
	if (pos_of_novelty == 0) {
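	  /* A new first word has started: back-fill ind[0] for any
	     intervening word ids that begin no n-gram, then store the
	     completed unigram count (context-cue words are kept out
	     of n_unigrams). */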
	  if (ng->n>1) {
	    for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
	      ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
				       ng->ptr_table[0],
				       &(ng->ptr_table_size[0]),
				       i);
	    }
	    prev_id1 = current_ngram.id_array[0];
	  }

	  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

	  if (!ng->context_cue[previous_ngram.id_array[0]]) {
	    ng->n_unigrams += ng_count[0];
	    store_normal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
	  }

	  store_marginal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
		      
	  ng_count[0] = current_ngram.count;
	}

	if (current_ngram.count <= ng->cutoffs[ng->n-2]) 
	  ng->num_kgrams[ng->n-1]--;

	ngram_copy(&previous_ngram,&current_ngram,ng->n);

      }else {
	if (!displayed_oov_warning){
	  pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
	  displayed_oov_warning = 1;
	}
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  for (i=ng->n-2;i>=1;i--) {

    NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

    if (ng_count[i] <= ng->cutoffs[i-1]) 
      ng->num_kgrams[i]--;
    else 
      store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);
      
  }
  
  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng,current_ngram.id_array[0],ng_count[0],0);
  }

  store_marginal_count(ng,current_ngram.id_array[0],ng_count[0],0);

  if (ng->n>1) {
    for (i=current_ngram.id_array[0]+1;i<=ng->vocab_size;i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
				ng->ptr_table[0],
				&(ng->ptr_table_size[0]),
				current_ngram.id_array[0]);
  }

  /* The idngram reading is completed at this point */
  pc_message(verbosity,2,"\n");

  /* Impose a minimum unigram count, if required */

  if (ng->min_unicount > 0) {

    int nchanged= 0;

    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if ((return_count(ng->four_byte_counts,
			ng->count_table[0],
			ng->count[0],
			ng->count4[0],
			i) < ng->min_unicount) && !ng->context_cue[i]) {

	/* There was a bug in V2's switch.  Look at segment for ABSOLUTE */
	NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng,i);
	ng->n_unigrams += (ng->min_unicount - ng->count[0][i]);
	store_normal_count(ng,i,ng->min_unicount,0);
	nchanged++;
      }
    }

    if (nchanged > 0) 
      pc_message(verbosity,2,
		 "Unigram counts of %d words were bumped up to %d.\n",
		 nchanged,ng->min_unicount);
  }

  /* Count zeroton information for unigrams */

  ng->freq_of_freq[0][0] = 0;
  
  for (i=ng->first_id;i<=ng->vocab_size;i++) {
    if (return_count(ng->four_byte_counts,
		     ng->count_table[0],
		     ng->count[0],
		     ng->count4[0],
		     i) == 0) {
      ng->freq_of_freq[0][0]++;
    }
  }  

  if (ng->discounting_method == GOOD_TURING) {
    for (i=0;i<=ng->n-1;i++) 
      for (j=1;j<=ng->fof_size[i];j++) 
	pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity,2,"Calculating discounted counts.\n");

  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);
     
  /* Smooth unigram distribution, to give some mass to zerotons */     
  compute_unigram(ng,verbosity);

  /* Increment contexts if using Good-Turing discounting. No need otherwise,
     since all values are discounted anyway. */

  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity,2,"Incrementing contexts...\n");  

    for (i=ng->n-1;i>=1;i--) 
      increment_context(ng,i,verbosity);      
  }

  /* Calculate back-off weights */

  pc_message(verbosity,2,"Calculating back-off weights...\n");

  for (i=1;i<=ng->n-1;i++) 
    compute_back_off(ng,i,verbosity);

  if (!ng->four_byte_alphas) 
    pc_message(verbosity,3,"Number of out of range alphas = %d\n",
	       ng->size_of_alpha_array);

  /* Write out LM */

  pc_message(verbosity,2,"Writing out language model...\n");

  if (ng->write_arpa)
    write_arpa_lm(ng,verbosity);

  if (ng->write_bin) 
    write_bin_lm(ng,verbosity);

  pc_message(verbosity,0,"idngram2lm : Done.\n");

  return 0;    
}
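
Taken together, examples #14 and #15 cover idngram2lm's training path end to end. In the CMU-Cambridge toolkit, the input it consumes is typically produced by a pipeline something like (a sketch; file names hypothetical):

  text2wfreq < corpus.txt | wfreq2vocab > corpus.vocab
  text2idngram -vocab corpus.vocab < corpus.txt > corpus.idngram

followed by the idngram2lm invocation sketched after example #14.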