void load_context_cue(arpa_lm_t* lm, char* ccs_filename)
{
  FILE* context_cues_fp;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;

  lm->context_cue = (flag *) rr_calloc(lm->table_sizes[0], sizeof(flag));
  lm->no_of_ccs = 0;

  if (strcmp(ccs_filename, "")) {
    context_cues_fp = rr_iopen(ccs_filename);

    while (fgets(wlist_entry, sizeof(wlist_entry), context_cues_fp)) {
      if (strncmp(wlist_entry, "##", 2) == 0)
        continue;
      sscanf(wlist_entry, "%s ", current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(lm->vocab_ht, current_cc, &current_cc_id) == 0)
        quit(-1, "Error : %s in the context cues file does not appear in the vocabulary.\n", current_cc);

      lm->context_cue[(unsigned short) current_cc_id] = 1;
      lm->no_of_ccs++;
      fprintf(stderr, "Context cue word : %s id = %lld\n",
              current_cc, (long long) current_cc_id);  /* cast so the argument matches %lld */
    }
    rr_iclose(context_cues_fp);
  }
}
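/*
 * Sketch of the .ccs (context cues) file format that the loop above
 * implies: one vocabulary word per line, with lines beginning "##"
 * treated as comments.  The words below are a hypothetical example
 * (they match the sentence/paragraph/article markers checked for in
 * read_vocab()):
 *
 *   ## context cues for a broadcast-news style vocabulary
 *   <s>
 *   <p>
 *   <art>
 */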
void calc_mem_req(ng_t *ng, flag is_ascii)
{
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int i, j;

  current_ngram.id_array = (id__t *) rr_malloc(sizeof(id__t) * ng->n);
  previous_ngram.id_array = (id__t *) rr_malloc(sizeof(id__t) * ng->n);
  ng_count = (count_t *) rr_calloc(ng->n, sizeof(count_t));
  current_ngram.n = ng->n;

  rewind(ng->id_gram_fp);

  while (!rr_feof(ng->id_gram_fp)) {

    for (i = 0; i <= ng->n - 1; i++)
      previous_ngram.id_array[i] = current_ngram.id_array[i];

    get_ngram(ng->id_gram_fp, &current_ngram, is_ascii);

    for (i = 0; i <= ng->n - 1; i++) {
      if (current_ngram.id_array[i] != previous_ngram.id_array[i]) {
        /* The n-gram differs from the previous one at position i, so
           positions i..n-1 all begin new entries; close off the old
           counts first, respecting the cutoffs. */
        for (j = i; j <= ng->n - 1; j++) {
          if (j > 0) {
            if (ng_count[j] > ng->cutoffs[j-1])
              ng->table_sizes[j]++;
          }
          ng_count[j] = current_ngram.count;
        }
        i = ng->n;
      } else
        ng_count[i] += current_ngram.count;
    }
  }

  for (i = 1; i <= ng->n - 1; i++) {
    if (ng_count[i] > ng->cutoffs[i-1])
      ng->table_sizes[i]++;
  }

  /* Add a fudge factor, as problems can crop up with having to cut
     off the last few n-grams. */
  for (i = 1; i <= ng->n - 1; i++)
    ng->table_sizes[i] += 10;

  rr_iclose(ng->id_gram_fp);
  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);
}
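/*
 * Worked illustration of what calc_mem_req() counts, assuming a sorted
 * id 2-gram stream (the word IDs A, B, C are hypothetical).  Given the
 * stream (A B):3  (A C):1  (B C):2 with cutoffs[0] = 1, the cutoff test
 * "count > cutoff" keeps (A B) and (B C) but drops (A C), so the loop
 * increments table_sizes[1] twice; the fudge loop then adds 10, for a
 * total growth of 12 entries in the bigram table.
 */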
void read_vocab(ng_t* ng, int verbosity)
{
  vocab_sz_t test_cc_id;
  vocab_sz_t current_cc_id;
  char current_cc[200];
  char wlist_entry[1024];

  pc_message(verbosity, 2, "Reading vocabulary.\n");

  /* Don't change the parameters of sih_create, because doing so would
     change the binary layout of the .binlm file. */
  ng->vocab_ht = sih_create(1000, 0.5, 2.0, 1);

  read_voc(ng->vocab_filename, verbosity, ng->vocab_ht,
           &ng->vocab, &(ng->vocab_size));

  /* Determine which of the vocabulary words are context cues */
  ng->no_of_ccs = 0;
  ng->context_cue = (flag *) rr_calloc(ng->vocab_size + 1, sizeof(flag));

  if (ng->context_set) {
    /* This should be kept in sync with ll. 889-894 in lm_combine.c */
    while (fgets(wlist_entry, sizeof(wlist_entry), ng->context_cues_fp)) {
      if (strncmp(wlist_entry, "##", 2) == 0)
        continue;
      sscanf(wlist_entry, "%s ", current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(ng->vocab_ht, current_cc, &current_cc_id) == 0)
        pc_message(verbosity, 1, "Warning : %s in the context cues file does not appear in the vocabulary.\n", current_cc);
      else {
        ng->context_cue[(unsigned short) current_cc_id] = 1;
        pc_message(verbosity, 2, "Context cue word : %s id = %d\n",
                   current_cc, (int) current_cc_id);
        ng->no_of_ccs++;
      }
    }
    rr_iclose(ng->context_cues_fp);
  }

  if (sih_lookup(ng->vocab_ht, "<s>", &test_cc_id) != 0)
    if (ng->context_cue[(unsigned short) test_cc_id] == 0)
      fprintf(stderr, "WARNING: <s> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

  if (sih_lookup(ng->vocab_ht, "<p>", &test_cc_id) != 0)
    if (ng->context_cue[(unsigned short) test_cc_id] == 0)
      fprintf(stderr, "WARNING: <p> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");

  if (sih_lookup(ng->vocab_ht, "<art>", &test_cc_id) != 0)
    if (ng->context_cue[(unsigned short) test_cc_id] == 0)
      fprintf(stderr, "WARNING: <art> appears as a vocabulary item, but is not labelled as a\ncontext cue.\n");
}
void load_weights(double* w1, double* w2, char* file)
{
  FILE* fp;
  fp = rr_iopen(file);

  /* "%*s%lf" skips a label token, then reads the weight value, so a
     successful conversion returns 1. */
  if (fscanf(fp, "%*s%lf", w1) != 1)
    quit(-1, "Error in reading weight file\n");
  if (fscanf(fp, "%*s%lf", w2) != 1)
    quit(-1, "Error in reading weight file\n");

  rr_iclose(fp);
}
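/* Minimal usage sketch for load_weights(), assuming a two-line weight
   file of the form "<label> <value>" per line (which is what the
   "%*s%lf" format above consumes).  The file name is hypothetical. */
#ifdef LOAD_WEIGHTS_DEMO
static void load_weights_demo(void)
{
  double w1, w2;
  load_weights(&w1, &w2, "interpolation.weights");  /* hypothetical path */
  /* For linear interpolation the two weights would typically sum to
     1.0, though load_weights() itself does not enforce that. */
  fprintf(stderr, "w1 = %f, w2 = %f\n", w1, w2);
}
#endif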
void read_wlist_into_siht(char *wlist_filename, int verbosity,
                          sih_t *p_word_id_ht,   /** Pointer to the word ID hash table */
                          vocab_sz_t *p_n_wlist  /** Pointer to the size of the word list */
                          )
{
  static char rname[] = "read_wlist_into_siht";
  FILE *wlist_fp = rr_iopen(wlist_filename);
  char wlist_entry[1024], word[256], *word_copy;
  vocab_sz_t entry_no = 0;

  while (fgets(wlist_entry, sizeof(wlist_entry), wlist_fp)) {
    if (strncmp(wlist_entry, "##", 2) == 0)
      continue;
    entry_no++;

    /* printf("entry no %lld, wlist_entry %s\n", entry_no, wlist_entry); */

    /* Progress indicator: one dot per 1000 entries */
    if (entry_no % 1000 == 0) {
      fprintf(stdout, ".");
      fflush(stdout);
    }

    sscanf(wlist_entry, "%s ", word);
    warn_on_wrong_vocab_comments(wlist_entry);
    word_copy = rr_salloc(word);
    sih_add(p_word_id_ht, word_copy, entry_no);
    free(word_copy); /* HLW */
  }
  fprintf(stdout, "\n");
  fflush(stdout);
  rr_iclose(wlist_fp);

  if (verbose_cmuclmtk == 1) {
    if (verbosity)
      fprintf(stderr, "%s: a list of %d words was read from \"%s\".\n",
              rname, (int) entry_no, wlist_filename);
  }
  *p_n_wlist = entry_no;
}
void read_voc(char *filename, int verbosity, sih_t *p_vocab_ht,
              char ***p_vocab, unsigned short *p_vocab_size)
     /* p_vocab == NULL means: build only a hash table */
{
  /* static char rname[] = "rd_voc"; */  /* Never used anyway! */
  char *pperiod;
  int vocab_size;
  vocab_sz_t n_wlist;  /* read_wlist_into_siht() takes a vocab_sz_t *, so
                          read into a correctly typed temporary rather
                          than punning an int's address */

  pperiod = rindex(filename, '.');
  if (pperiod == NULL)
    pperiod = filename - 1;

  if (strcmp(pperiod + 1, "vocab_ht") == 0) {  /* file == hash table */
    FILE *fp = rr_iopen(filename);
    sih_val_read_from_file(p_vocab_ht, fp, filename, verbosity);
    rr_iclose(fp);
    vocab_size = p_vocab_ht->nentries;
    if (p_vocab != NULL) {
      get_vocab_from_vocab_ht(p_vocab_ht, vocab_size, verbosity, p_vocab);
      (*p_vocab)[0] = salloc("<UNK>");
    }
  } else {                                     /* file == vocab (ascii) */
    read_wlist_into_siht(filename, verbosity, p_vocab_ht, &n_wlist);
    vocab_size = (int) n_wlist;
    if (p_vocab != NULL) {
      read_wlist_into_array(filename, verbosity, p_vocab, &vocab_size);
      (*p_vocab)[0] = salloc("<UNK>");
    }
  }
  if (p_vocab_size)
    *p_vocab_size = vocab_size;
}
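/*
 * Usage sketch for read_voc(), reflecting the two input conventions the
 * extension check above implies: a file ending in ".vocab_ht" is read
 * as a saved hash table, anything else as an ASCII word list.  The file
 * names are hypothetical.
 *
 *   sih_t ht;
 *   char **vocab;
 *   unsigned short vocab_size;
 *
 *   read_voc("wsj.vocab",    2, &ht, &vocab, &vocab_size);  // ascii list
 *   read_voc("wsj.vocab_ht", 2, &ht, NULL,   NULL);         // hash table only
 *
 * In both cases slot 0 of the returned vocabulary array is set to
 * "<UNK>" when an array is requested.
 */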
void compute_perplexity(ng_t *ng,
                        arpa_lm_t *arpa_ng,
                        char *text_stream_filename,
                        char *probs_stream_filename,
                        char *annotation_filename,
                        char *oov_filename,
                        char *fb_list_filename,
                        flag backoff_from_unk_inc,
                        flag backoff_from_unk_exc,
                        flag backoff_from_ccs_inc,
                        flag backoff_from_ccs_exc,
                        flag arpa_lm,
                        flag include_unks,
                        double log_base)
{
  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* Hope that's big enough */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Initialise the file pointers to prevent warnings from the compiler. */
  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;

  short_current_id = 0;
  found_unk_wrongly = 0;
  annotate = 0;
  bo_case = 0;

  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
                          (int) arpa_ng->vocab_size,
                          arpa_ng->vocab,
                          arpa_ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  } else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
                          (int) ng->vocab_size,
                          ng->vocab,
                          ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  }

  ngrams_hit = (int *) rr_calloc(n, sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *) * n);
  for (i = 0; i <= n - 1; i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char) * 1000);

  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid.  Note that the checks employed by the
     standard rr_fopen tools are not suitable here, since we don't want
     the program to terminate if the paths are not found. */

  if (!strcmp(text_stream_filename, "")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    return;
  }

  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename, "-")) {
    printf("Error : Can't open file %s for reading.\n", text_stream_filename);
    return;
  }

  out_probs = strcmp(probs_stream_filename, "");
  annotate = strcmp(annotation_filename, "");
  out_oovs = strcmp(oov_filename, "");

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n", text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
           probs_stream_filename);
  if (annotate)
    printf("Annotation will be written to file %s\n",
           annotation_filename);
  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
           oov_filename);
  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");
  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");
  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");
  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");
  if (strcmp(fb_list_filename, ""))
    printf("Will force back-off according to the contents of %s\n",
           fb_list_filename);
  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check for the existence of the files ourselves, as the rr
     functions will quit, which isn't what we want. */

  if (out_probs && strcmp(probs_stream_filename, "-")) {
    if ((temp_fp = fopen(probs_stream_filename, "w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n", probs_stream_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (annotate && strcmp(annotation_filename, "-")) {
    if ((temp_fp = fopen(annotation_filename, "w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n", annotation_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (out_oovs && strcmp(oov_filename, "-")) {
    if ((temp_fp = fopen(oov_filename, "w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n", oov_filename);
      return;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);
  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);
  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t) * (n - 1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    /* Shift the history of previous words along by one */
    if (total_words > 0) {
      if (total_words < n)
        strcpy(prev_words[total_words-1], current_word);
      else {
        for (i = 0; i <= n-3; i++)
          strcpy(prev_words[i], prev_words[i+1]);
        if (n > 1)
          strcpy(prev_words[n-2], current_word);
      }
    }

    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill the context with the right stuff */
    if (total_words > (n-1)) {
      for (i = 0; i <= context_length-2; i++)
        context[i] = context[i+1];
    }

    if (context_length != 0)
      context[context_length-1] = short_current_id;

    if (fscanf(text_stream_fp, "%s", current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
        printf("Error reading text file.\n");
        return;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      if (arpa_lm) {
        sih_lookup(arpa_ng->vocab_ht, current_word, &current_id);
        if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n", current_word);
        }
        if (current_id > arpa_ng->vocab_size)
          quit(-1, "Error : returned value from sih_lookup (%d) is too high.\n",
               (int) current_id);  /* was context[i], which is stale here */
        else
          short_current_id = current_id;
      } else {
        sih_lookup(ng->vocab_ht, current_word, &current_id);
        if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n", current_word);
        }
        if (current_id > ng->vocab_size)
          quit(-1, "Error : returned value from sih_lookup (%d) is too high.\n",
               (int) current_id);  /* was context[i], which is stale here */
        else
          short_current_id = current_id;
      }

      if (!found_unk_wrongly) {

        if (current_id == 0 && out_oovs)
          fprintf(oov_fp, "%s\n", current_word);

        if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
            || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

          if (include_unks || current_id != 0) {

            prob = calc_prob_of(short_current_id,
                                context,
                                context_length,
                                ng,
                                arpa_ng,
                                fb_list,
                                &bo_case,
                                &actual_context_length,
                                arpa_lm);

            if (prob <= 0.0 || prob > 1.0) {
              fprintf(stderr, "Warning : ");
              if (short_current_id == 0)
                fprintf(stderr, "P( <UNK> | ");
              else
                fprintf(stderr, "P( %s | ", current_word);
              for (i = 0; i <= actual_context_length-1; i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(stderr, "<UNK> ");
                else
                  fprintf(stderr, "%s ", prev_words[i]);
              }
              fprintf(stderr, ") = %g logprob = %g\n",
                      prob, log(prob) / log(log_base));
              fprintf(stderr, "bo_case == 0x%x, actual_context_length == %d\n",
                      bo_case, actual_context_length);  /* was the broken format "0x%dx" */
            }

            if (annotate) {
              if (short_current_id == 0)
                fprintf(annotation_fp, "P( <UNK> | ");
              else
                fprintf(annotation_fp, "P( %s | ", current_word);
              for (i = 0; i <= actual_context_length-1; i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(annotation_fp, "<UNK> ");
                else {
                  if (arpa_lm)
                    fprintf(annotation_fp, "%s ",
                            arpa_ng->vocab[context[i+context_length-actual_context_length]]);
                  else
                    fprintf(annotation_fp, "%s ",
                            ng->vocab[context[i+context_length-actual_context_length]]);
                }
              }
              fprintf(annotation_fp, ") = %g logprob = %f bo_case = ",
                      prob, log(prob) / log(log_base));
              decode_bo_case(bo_case, actual_context_length, annotation_fp);
            }

            /* Calculate the level to which we backed off */

            for (i = actual_context_length-1; i >= 0; i--) {
              int four_raise_i = 1 << (2*i);  /* PWP */

              /*
               * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
               * but was getting a divide-by-zero error on an Alpha
               * (it isn't clear to me why it should ever have done so).
               * Anyway, it is much faster to do in base 4.
               */
              if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
                ngrams_hit[i+1]++;
                i = -2;
              } else
                bo_case -= ((bo_case / four_raise_i) * four_raise_i);
            }

            if (i != -3)
              ngrams_hit[0]++;

            if (out_probs)
              fprintf(probs_stream_fp, "%g\n", prob);

            sum_log_prob += log10(prob);
          }

          if (current_id == 0 && !include_unks)
            excluded_unks++;
        }
        else {
          if (((!arpa_lm) && ng->context_cue[current_id])
              || (arpa_lm && arpa_ng->context_cue[current_id]))
            excluded_ccs++;
        }
        total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {  /* pow(x,y) = e**(y ln(x)) */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n",
           exp(-sum_log_prob / (total_words - excluded_ccs - excluded_unks) * log(10.0)),
           (-sum_log_prob / (total_words - excluded_ccs - excluded_unks) * log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
           total_words - excluded_ccs - excluded_unks);
    for (i = n; i >= 1; i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n", i, ngrams_hit[i-1],
             (float) 100 * ngrams_hit[i-1] / (total_words - excluded_ccs - excluded_unks));
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
           excluded_unks,
           (float) 100 * excluded_unks / (total_words - excluded_ccs),
           excluded_ccs);
  }

  rr_iclose(text_stream_fp);
  if (out_probs)
    rr_oclose(probs_stream_fp);
  if (annotate)
    rr_oclose(annotation_fp);
  if (out_oovs)
    rr_oclose(oov_fp);

  free(fb_list);
  free(context);
  free(ngrams_hit);
}
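/*
 * For reference, the quantities printed at the end of
 * compute_perplexity() follow directly from the accumulated base-10
 * log probability: with N = total_words - excluded_ccs - excluded_unks
 * scored words,
 *
 *   perplexity = 10^(-sum_log_prob / N) = exp(-(sum_log_prob / N) * ln 10)
 *   entropy    = log2(perplexity)       = -(sum_log_prob / N) * ln 10 / ln 2
 *
 * which is exactly what the exp()/log() expressions above evaluate.
 */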
int oe_02_main(int argc, char **argv)
{
  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process the command line */
  report_version(&argc, argv);

  if (pc_flagarg(&argc, argv, "-help")
      || argc == 1
      || (strcmp(argv[1], "-binary") && strcmp(argv[1], "-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv, "-arpa", ""));
  if (strcmp(lm_filename_arpa, ""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv, "-binary", ""));
  if (strcmp(lm_filename_binary, ""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1, "Error : Can't use both -arpa and -binary flags.\n");
  if (!arpa_lm && !binary_lm)
    quit(-1, "Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv, "-context", ""));
  if (binary_lm && strcmp(ccs_filename, ""))
    fprintf(stderr, "Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc, argv, 2);

  /* Load the language model */
  if (arpa_lm) {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_arpa);
    load_arpa_lm(&arpa_ng, lm_filename_arpa);
  } else {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_binary);
    load_lm(&ng, lm_filename_binary);
  }

  fprintf(stderr, "\nDone.\n");

  n = arpa_lm ? arpa_ng.n : ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = (flag *) rr_calloc(arpa_ng.table_sizes[0], sizeof(flag));
    arpa_ng.no_of_ccs = 0;

    if (strcmp(ccs_filename, "")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets(wlist_entry, sizeof(wlist_entry), context_cues_fp)) {
        if (strncmp(wlist_entry, "##", 2) == 0)
          continue;
        sscanf(wlist_entry, "%s ", current_cc);
        warn_on_wrong_vocab_comments(wlist_entry);

        if (sih_lookup(arpa_ng.vocab_ht, current_cc, &current_cc_id) == 0)
          quit(-1, "Error : %s in the context cues file does not appear in the vocabulary.\n", current_cc);

        arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
        arpa_ng.no_of_ccs++;
        fprintf(stderr, "Context cue word : %s id = %lld\n",
                current_cc, (long long) current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if (strlen(input_string) < sizeof(input_string) - 1)
      input_string[strlen(input_string) - 1] = '\0';  /* chop the new-line */
    else
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string, &num_of_args, args);

      log_base = pc_doublearg(&num_of_args, args, "-log_base", 10.0);
      backoff_from_unk_inc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args, args, "-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-backoff_from_list", ""));
      text_stream_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-text", ""));
      probs_stream_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-probs", ""));
      annotation_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-annotate", ""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args, args, "-oovs", ""));
      generate_size = pc_intarg(&num_of_args, args, "-size", 10000);
      random_seed = pc_intarg(&num_of_args, args, "-seed", -1);

      inconsistant_parameters = 0;
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
        inconsistant_parameters = 1;
      }
      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
        inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {
        if (!inconsistant_parameters) {
          if (!strcmp(args[0], "perplexity")) {
            compute_perplexity(&ng,
                               &arpa_ng,
                               text_stream_filename,
                               probs_stream_filename,
                               annotation_filename,
                               oov_filename,
                               fb_list_filename,
                               backoff_from_unk_inc,
                               backoff_from_unk_exc,
                               backoff_from_ccs_inc,
                               backoff_from_ccs_exc,
                               arpa_lm,
                               include_unks,
                               log_base);
          } else if (!strcmp(args[0], "uttperp")) {
            /* do perplexity sentence by sentence [20090612] (air) */
            FILE *uttfh, *tempfh;
            char utt[4096];  /* live dangerously... */
            char tmpfil[128];

            if ((uttfh = fopen(text_stream_filename, "r")) == NULL) {
              printf("Error: can't open %s\n", text_stream_filename);
              exit(1);
            }
            /* CHANGED HLW; note the template must be a writable array,
               not a string-literal pointer, since mkstemp() modifies
               it in place. */
            char template[] = "uttperp_XXXXXX";
            mkstemp(template);  /* CHANGED HLW */
int oe_15_main(int argc, char **argv)
{
  FILE **fin;
  ngram *ng;
  ngram outng;
  flag *done, finished;
  int i, j, nfiles;

  /* Process the command line */
  report_version(&argc, argv);
  procComLine(&argc, argv);
  if (argc < 2) {
    printUsage(argv[0]);
    exit(1);
  }
  nfiles = argc - 1;

  /* Allocate memory */
  fin  = (FILE **) rr_malloc(sizeof(FILE *) * nfiles);
  done = (flag *) rr_malloc(sizeof(flag) * nfiles);
  ng   = (ngram *) rr_malloc(sizeof(ngram) * nfiles);
  for (i = 0; i < nfiles; i++) {
    ng[i].id_array = (id__t *) rr_calloc(n, sizeof(id__t));
    ng[i].n = n;
  }
  outng.id_array = (id__t *) rr_calloc(n, sizeof(id__t));
  outng.n = n;

  /* Open the input files */
  for (i = 0; i < nfiles; i++)
    fin[i] = rr_iopen(argv[i+1]);

  /* Read the first n-gram from each file */
  for (i = 0; i < nfiles; i++) {
    done[i] = 0;
    if (!get_ngram(fin[i], &ng[i], ascii_in))
      done[i] = 1;
  }

  finished = 0;
  while (!finished) {
    /* Set outng to the maximum possible n-gram */
    for (i = 0; i < n; i++)
      outng.id_array[i] = MAX_VOCAB_SIZE;

    /* Find the smallest n-gram */
    for (i = 0; i < nfiles; i++) {
      if (!done[i])
        if (cmp_ngram(&outng, &ng[i]) > 0)
          for (j = 0; j < n; j++)
            outng.id_array[j] = ng[i].id_array[j];
    }

    /* Add the counts of all equal n-grams, advancing each file that
       contributed */
    outng.count = 0;
    for (i = 0; i < nfiles; i++) {
      if (!done[i]) {
        if (cmp_ngram(&outng, &ng[i]) == 0) {
          outng.count += ng[i].count;
          if (!get_ngram(fin[i], &ng[i], ascii_in)) {
            /* Check whether all files are done */
            done[i] = 1;
            finished = 1;
            for (j = 0; j < nfiles; j++)
              if (!done[j])
                finished = 0;
          }
        }
      }
    }

    write_ngram(stdout, &outng, ascii_out);
  }

  for (i = 0; i < nfiles; i++)
    rr_iclose(fin[i]);

  fprintf(stderr, "mergeidngram : Done.\n");

  return 0;
}
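/* A plausible shape for cmp_ngram(), shown only to make the merge loop
   above self-explanatory: it is assumed to compare two equal-length
   n-grams lexicographically by word ID, returning <0, 0 or >0 in the
   manner of strcmp().  The real definition lives elsewhere in the
   toolkit and may differ in detail. */
#ifdef CMP_NGRAM_SKETCH
static int cmp_ngram_sketch(const ngram *a, const ngram *b)
{
  int i;
  for (i = 0; i < a->n; i++) {
    if (a->id_array[i] < b->id_array[i]) return -1;
    if (a->id_array[i] > b->id_array[i]) return 1;
  }
  return 0;  /* identical word-ID sequence */
}
#endif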
void merge_tempfiles(int start_file,
                     int end_file,
                     char *temp_file_root,
                     char *temp_file_ext,
                     int max_files,
                     FILE *outfile,
                     int n,
                     int verbosity)
{
  FILE *new_temp_file;
  char *new_temp_filename;
  FILE **temp_file;
  char **temp_filename;
  char **current_ngram;
  char smallest_ngram[1000];
  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  char temp_word[500];
  int i, j;

  pc_message(verbosity, 2, "Merging temp files %d through %d...\n",
             start_file, end_file);

  /*
   * If we try to do more than max_files, then merge into groups,
   * then merge the groups recursively.
   */
  if (end_file - start_file + 1 > max_files) {
    int new_start_file, new_end_file;
    int n_file_groups = 1 + (end_file - start_file) / max_files;

    fprintf(stderr, "%d files to do, in %d groups\n",
            end_file - start_file, n_file_groups);

    new_temp_filename = (char *) rr_malloc(300 * sizeof(char));

    /*
     * These n_file_groups sets of files will be done in groups of
     * max_files batches each, as temp files numbered
     * end_file+1 ... end_file+n_file_groups,
     * and then these will be merged into the final result.
     */
    for (i = 0; i < n_file_groups; i++) {
      /* do files i*max_files through min((i+1)*max_files-1, end_file) */
      new_start_file = start_file + (i * max_files);
      new_end_file = start_file + ((i+1) * max_files) - 1;
      if (new_end_file > end_file)
        new_end_file = end_file;

      sprintf(new_temp_filename, "%s/%d%s",  /* was "%hu" with an int argument */
              temp_file_root, end_file + i + 1, temp_file_ext);
      new_temp_file = rr_oopen(new_temp_filename);
      merge_tempfiles(new_start_file, new_end_file,
                      temp_file_root, temp_file_ext,
                      max_files, new_temp_file, n, verbosity);
      rr_oclose(new_temp_file);  /* opened with rr_oopen(), so close with rr_oclose() */
    }
    merge_tempfiles(end_file + 1, end_file + n_file_groups,
                    temp_file_root, temp_file_ext,
                    max_files, outfile, n, verbosity);
    return;
  }

  /*
   * We know we are now doing <= max_files.
   */
  temp_file = (FILE **) rr_malloc((end_file + 1) * sizeof(FILE *));
  temp_filename = (char **) rr_malloc((end_file + 1) * sizeof(char *));
  for (i = start_file; i <= end_file; i++)
    temp_filename[i] = (char *) rr_malloc(300 * sizeof(char));
  current_ngram = (char **) rr_malloc((end_file + 1) * sizeof(char *));
  for (i = start_file; i <= end_file; i++)
    current_ngram[i] = (char *) rr_malloc(1000 * sizeof(char));
  current_ngram_count = (int *) rr_malloc((end_file + 1) * sizeof(int));
  finished = (flag *) rr_malloc(sizeof(flag) * (end_file + 1));

  /* Open all the temp files for reading */
  for (i = start_file; i <= end_file; i++) {
    sprintf(temp_filename[i], "%s/%d%s",  /* was "%hu" with an int argument */
            temp_file_root, i, temp_file_ext);
    temp_file[i] = rr_iopen(temp_filename[i]);
  }

  /* Now go through the files simultaneously, and write out the
     appropriate n-gram counts to the output file. */
  for (i = start_file; i <= end_file; i++) {
    finished[i] = 0;
    if (!rr_feof(temp_file[i])) {
      for (j = 0; j <= n - 1; j++) {
        if (fscanf(temp_file[i], "%s", temp_word) != 1) {
          if (!rr_feof(temp_file[i]))
            quit(-1, "Error reading temp file %s\n", temp_filename[i]);
        } else {
          if (j == 0)
            strcpy(current_ngram[i], temp_word);
          else {
            strcat(current_ngram[i], " ");
            strcat(current_ngram[i], temp_word);
          }
        }
      }
      if (fscanf(temp_file[i], "%d", &current_ngram_count[i]) != 1) {
        if (!rr_feof(temp_file[i]))
          quit(-1, "Error reading temp file %s\n", temp_filename[i]);
      }
    }
  }

  all_finished = 0;
  while (!all_finished) {

    /* Find the smallest current n-gram */
    strcpy(smallest_ngram, "");
    for (i = start_file; i <= end_file; i++) {
      if (!finished[i]) {
        if (strcmp(smallest_ngram, current_ngram[i]) > 0 ||
            (smallest_ngram[0] == '\0'))
          strcpy(smallest_ngram, current_ngram[i]);
      }
    }

    /* For each of the files that are currently holding this n-gram,
       add its count to the temporary count, and read in a new n-gram
       from the files. */
    temp_count = 0;
    for (i = start_file; i <= end_file; i++) {
      if (!finished[i]) {
        if (!strcmp(smallest_ngram, current_ngram[i])) {
          temp_count += current_ngram_count[i];
          if (!rr_feof(temp_file[i])) {
            for (j = 0; j <= n - 1; j++) {
              if (fscanf(temp_file[i], "%s", temp_word) != 1) {
                if (!rr_feof(temp_file[i]))
                  quit(-1, "Error reading temp file %s\n", temp_filename[i]);
              } else {
                if (j == 0)
                  strcpy(current_ngram[i], temp_word);
                else {
                  strcat(current_ngram[i], " ");
                  strcat(current_ngram[i], temp_word);
                }
              }
            }
            if (fscanf(temp_file[i], "%d", &current_ngram_count[i]) != 1) {
              if (!rr_feof(temp_file[i]))
                quit(-1, "Error reading temp file count %s\n", temp_filename[i]);
            }
          }

          /*
           * PWP: Note that the fscanf may have changed the state of
           * temp_file[i], so we re-ask the question rather than just
           * doing an "else".
           */
          if (rr_feof(temp_file[i])) {
            finished[i] = 1;
            all_finished = 1;
            for (j = start_file; j <= end_file; j++) {
              if (!finished[j])
                all_finished = 0;
            }
          }
        }
      }
    }

    /*
     * PWP: We cannot conditionalize this on (!all_finished), because
     * if we do we may lose the very last count.  (Consider the case
     * when several files have run out of data, but the last couple
     * still hold the final count.)
     */
    if (fprintf(outfile, "%s %d\n", smallest_ngram, temp_count) < 0)
      quit(-1, "Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
  }

  for (i = start_file; i <= end_file; i++) {
    rr_iclose(temp_file[i]);
    remove(temp_filename[i]);
  }
  free(temp_file);
  for (i = start_file; i <= end_file; i++)
    free(temp_filename[i]);
  free(temp_filename);
  for (i = start_file; i <= end_file; i++)
    free(current_ngram[i]);
  free(current_ngram);
  free(current_ngram_count);
  free(finished);
}
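/*
 * Hypothetical invocation sketch for merge_tempfiles(): merge the temp
 * files named "<root>/1<ext>" ... "<root>/32<ext>", with at most 16
 * open at once (so one level of recursive grouping), writing merged
 * 3-gram counts to stdout at verbosity 2.  All names here are
 * assumptions for illustration only.
 *
 *   merge_tempfiles(1, 32, "/tmp/idngram", ".tmp", 16, stdout, 3, 2);
 */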
int main(int argc, char **argv)
{
  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;  /* was int; sih_lookup() takes a vocab_sz_t * */
  FILE *context_cues_fp;
  int n;
  int generate_size = 10000;
  int random_seed;

  /* Process the command line */
  report_version(&argc, argv);

  if (pc_flagarg(&argc, argv, "-help") ||
      argc == 1 ||
      (strcmp(argv[1], "-binary") && strcmp(argv[1], "-arpa"))) {
    fprintf(stderr, "evallm : Evaluate a language model.\n");
    fprintf(stderr, "Usage : evallm [ -binary .binlm | \n");
    fprintf(stderr, "                 -arpa .arpa [ -context .ccs ] ]\n");
    exit(1);
  }

  lm_filename_arpa = salloc(pc_stringarg(&argc, argv, "-arpa", ""));
  if (strcmp(lm_filename_arpa, ""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = salloc(pc_stringarg(&argc, argv, "-binary", ""));
  if (strcmp(lm_filename_binary, ""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1, "Error : Can't use both -arpa and -binary flags.\n");
  if (!arpa_lm && !binary_lm)
    quit(-1, "Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = salloc(pc_stringarg(&argc, argv, "-context", ""));
  if (binary_lm && strcmp(ccs_filename, ""))
    fprintf(stderr, "Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc, argv, 2);

  /* Load the language model */
  if (arpa_lm) {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_arpa);
    load_arpa_lm(&arpa_ng, lm_filename_arpa);
  } else {
    fprintf(stderr, "Reading in language model from file %s\n", lm_filename_binary);
    load_lm(&ng, lm_filename_binary);
  }

  fprintf(stderr, "\nDone.\n");

  if (!arpa_lm)
    n = ng.n;
  else
    n = arpa_ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = (flag *) rr_calloc(arpa_ng.table_sizes[0], sizeof(flag));
    arpa_ng.no_of_ccs = 0;

    if (strcmp(ccs_filename, "")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets(wlist_entry, sizeof(wlist_entry), context_cues_fp)) {
        if (strncmp(wlist_entry, "##", 2) == 0)
          continue;
        sscanf(wlist_entry, "%s ", current_cc);

        if (strncmp(wlist_entry, "#", 1) == 0) {
          fprintf(stderr, "\n\n===========================================================\n");
          fprintf(stderr, ":\nWARNING: line assumed NOT a comment:\n");
          fprintf(stderr, ">>> %s <<<\n", wlist_entry);
          fprintf(stderr, "    '%s' will be included in the context cues list\n", current_cc);
          fprintf(stderr, "    (comments must start with '##')\n");
          fprintf(stderr, "===========================================================\n\n");
        }

        if (sih_lookup(arpa_ng.vocab_ht, current_cc, &current_cc_id) == 0)
          quit(-1, "Error : %s in the context cues file does not appear in the vocabulary.\n", current_cc);

        arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
        arpa_ng.no_of_ccs++;
        fprintf(stderr, "Context cue word : %s id = %d\n",
                current_cc, (int) current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : ");
    /* gets() is unsafe (no bounds check), so read with fgets() and
       strip the trailing newline instead. */
    if (fgets(input_string, sizeof(input_string), stdin))
      input_string[strcspn(input_string, "\n")] = '\0';

    if (!feof(stdin)) {
      parse_comline(input_string, &num_of_args, args);

      random_seed = pc_intarg(&num_of_args, args, "-seed", -1);
      generate_size = pc_intarg(&num_of_args, args, "-size", 10000);
      log_base = pc_doublearg(&num_of_args, args, "-log_base", 10.0);
      backoff_from_unk_inc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args, args, "-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args, args, "-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args, args, "-include_unks");
      fb_list_filename = salloc(pc_stringarg(&num_of_args, args, "-backoff_from_list", ""));
      text_stream_filename = salloc(pc_stringarg(&num_of_args, args, "-text", ""));
      probs_stream_filename = salloc(pc_stringarg(&num_of_args, args, "-probs", ""));
      annotation_filename = salloc(pc_stringarg(&num_of_args, args, "-annotate", ""));
      oov_filename = salloc(pc_stringarg(&num_of_args, args, "-oovs", ""));

      inconsistant_parameters = 0;
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
        inconsistant_parameters = 1;
      }
      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
        fprintf(stderr, "Error : Cannot specify both exclusive and inclusive forced backoff.\n");
        fprintf(stderr, "Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
        inconsistant_parameters = 1;
      }

      if (num_of_args > 0 && !inconsistant_parameters) {
        if (!strcmp(args[0], "perplexity")) {
          compute_perplexity(&ng,
                             &arpa_ng,
                             text_stream_filename,
                             probs_stream_filename,
                             annotation_filename,
                             oov_filename,
                             fb_list_filename,
                             backoff_from_unk_inc,
                             backoff_from_unk_exc,
                             backoff_from_ccs_inc,
                             backoff_from_ccs_exc,
                             arpa_lm,
                             include_unks,
                             log_base);
        } else if (!strcmp(args[0], "validate")) {
          if (num_of_args != n)
            fprintf(stderr, "Error : must specify %d words of context.\n", n - 1);
          else {
            /* Assume the last n-1 parameters form the context */
            validate(&ng,
                     &arpa_ng,
                     &(args[num_of_args - n + 1]),
                     backoff_from_unk_inc,
                     backoff_from_unk_exc,
                     backoff_from_ccs_inc,
                     backoff_from_ccs_exc,
                     arpa_lm,
                     fb_list_filename);
          }
        } else if (!strcmp(args[0], "stats")) {
          if (arpa_lm)
            display_arpa_stats(&arpa_ng);
          else
            display_stats(&ng);
        } else if (!strcmp(args[0], "quit")) {
          told_to_quit = 1;
        } else if (!strcmp(args[0], "generate")) {
          if (arpa_lm)
            generate_words(NULL, &arpa_ng, generate_size, random_seed, text_stream_filename);
          else
            generate_words(&ng, NULL, generate_size, random_seed, text_stream_filename);
        } else if (!strcmp(args[0], "help")) {
          printf("The user may specify one of the following commands: \n");
          printf("\n");
          printf(" - perplexity\n");
          printf("\n");
          printf("Computes the perplexity of a given text. May optionally specify words\n");
          printf("from which to force back-off.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("perplexity -text .text\n");
          printf("         [ -probs .fprobs ]\n");
          printf("         [ -oovs .oov_file ]\n");
          printf("         [ -annotate .annotation_file ] \n");
          printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
          printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
          printf("         [ -backoff_from_list .fblist ]\n");
          printf("         [ -include_unks ]\n");
          printf("\n");
          printf(" - validate\n");
          printf("\n");
          printf("Calculate the sum of the probabilities of all the words in the\n");
          printf("vocabulary given the context specified by the user.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
          printf("           -backoff_from_list .fblist ]\n");
          printf("         [ -forced_backoff_inc | -forced_back_off_exc ] \n");
          printf("           word1 word2 ... word_(n-1)\n");
          printf("\n");
          printf("Where n is the n in n-gram. \n");
          printf("\n");
          printf(" - help\n");
          printf("\n");
          printf("Displays this help message.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("help\n");
          printf("\n");
          printf(" - quit\n");
          printf("\n");
          printf("Exits the program.\n");
          printf("\n");
          printf("Syntax: \n");
          printf("\n");
          printf("quit\n");
        } else {
          fprintf(stderr, "Unknown command : %s\nType 'help'\n", args[0]);
        }
      }
    }
  }

  fprintf(stderr, "evallm : Done.\n");
  exit(0);
}
int main(int argc, char **argv)
{
  int i, j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method;  /* Method used to decide how much memory to
                            allocate for the count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;     /* Array holding the number of occurrences of
                            the current 1-gram, 2-gram, ... , n-gram.
                            Size depends on a #define in general.h */
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;
  flag displayed_oov_warning;  /** Whether the OOV warning has been shown */

  /* ------------------ Process the command line --------------------- */

  report_version(&argc, argv);

  if (argc == 1 || pc_flagarg(&argc, argv, "-help")) {
    /* Display the help message */
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv, "-verbosity", DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng = init_ng(&argc, argv, verbosity);
    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);

    if (!strcmp(ng->id_gram_filename, "-") && mem_alloc_method == TWO_PASSES)
      quit(-1, "Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");

    is_ascii = set_lmformat(pc_flagarg(&argc, argv, "-ascii_input"),
                            pc_flagarg(&argc, argv, "-bin_input"),
                            ng);

    /* Report the parameters */
    report_param(verbosity, ng, is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc, argv, verbosity);
  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng, verbosity);

  /* --------- Allocate space for the table_size array ---------- */
  init_ng_table_size(ng, mem_alloc_method, is_ascii, verbosity, buffer_size);

  /* ----------- Allocate memory for the tree structure -------------- */

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *) * ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *) * ng->n);
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *) * ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);
  } else {
    for (i = 0; i <= ng->n - 1; i++)
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size + 1,
                                                 sizeof(count_t));
    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),
                                                ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n", ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *) * ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *) * ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t) *
                                                  ng->table_sizes[0]);
  } else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *) * ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t) *
                                                 ng->table_sizes[0]);
  }

  ng->ind = (index__t **) rr_malloc(sizeof(index__t *) * ng->n);

  /* First table */
  if (ng->four_byte_counts)
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0], sizeof(count_t));
  else
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0], sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t) *
                                            ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t) *
                                                ng->table_sizes[0]);

  if (ng->n >= 2)
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0], sizeof(index__t));

  for (i = 1; i <= ng->n - 2; i++) {
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t) * ng->table_sizes[i]);

    if (ng->four_byte_counts)
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t) * ng->table_sizes[i]);
    else
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t) * ng->table_sizes[i]);

    if (ng->four_byte_alphas)
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t) * ng->table_sizes[i]);
    else
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t) * ng->table_sizes[i]);

    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t) * ng->table_sizes[i]);

    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) +
                  sizeof(index__t) + sizeof(id__t);
    if (ng->four_byte_alphas)
      mem_alloced += 4;
    mem_alloced *= ng->table_sizes[i];

    pc_message(verbosity, 2, "Allocated %d bytes to table for %d-grams.\n",
               mem_alloced, i + 1);
  }

  ng->word_id[ng->n-1] = (id__t *) rr_malloc(sizeof(id__t) * ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts)
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t) * ng->table_sizes[ng->n-1]);
  else
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t) * ng->table_sizes[ng->n-1]);

  pc_message(verbosity, 2, "Allocated (%d+%d) bytes to table for %d-grams.\n",
             (int) (ng->four_byte_counts ? sizeof(count_t) : sizeof(count_ind_t)),
             (int) (sizeof(id__t) * ng->table_sizes[ng->n-1]), ng->n);

  /* Allocate memory for the table for the first byte of the indices */
  ng_allocate_ptr_table(ng, NULL, 0);

  /* Allocate memory for the alpha array */
  ng->alpha_array = (double *) rr_malloc(sizeof(double) * ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for the frequency-of-frequency information */
  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *) * ng->n);
  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read the n-grams into the tree */
  pc_message(verbosity, 2, "Processing id n-gram file.\n");
  pc_message(verbosity, 2, "20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for the n-gram id arrays */
  current_ngram.id_array = (id__t *) rr_calloc(ng->n, sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n, sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;

  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n, sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n, sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process the first n-gram */
  get_ngram(ng->id_gram_fp, &current_ngram, is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks) {
    /* Stop looking if there are no more n-grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp, &current_ngram, is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);
  }

  for (i = 0; i <= ng->n - 2; i++) {
    ng->ind[i][0] = new_index(0, ng->ptr_table[i], &(ng->ptr_table_size[i]), 0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng, ng->n-1, current_ngram.count);

  store_normal_count(ng, 0, current_ngram.count, ng->n-1);

  if (current_ngram.count <= ng->cutoffs[ng->n-2])
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram, &current_ngram, ng->n);

  prev_id1 = current_ngram.id_array[0];

  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp, &current_ngram, is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
        contains_unks = ngram_chk_contains_unks(&current_ngram, ng->n);

      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

        /* Test for where this n-gram differs from the last one - do we
           have an out-of-order n-gram? */
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram, &previous_ngram, ng->n, nlines);

        nlines++;
        show_idngram_nlines(nlines, verbosity);

        /* Add the new n-gram as soon as it is encountered. */
        /* If all of the positions 2,3,...,n of the n-gram are context
           cues, then ignore the n-gram. */
        if (ng->n > 1) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng, ng->n-1, current_ngram.count);
          store_normal_count(ng, ng->num_kgrams[ng->n-1], current_ngram.count, ng->n-1);

          ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
          ng->num_kgrams[ng->n-1]++;

          if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
            quit(-1, "\nMore than %d %d-grams needed to be stored.  Rerun with a higher table size.\n",
                 ng->table_sizes[ng->n-1], ng->n);
        }

        /* Deal with new 2,3,...,(n-1)-grams */
        for (i = ng->n-2; i >= MAX(1, pos_of_novelty); i--) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng, i, ng_count[i]);

          if (ng_count[i] <= ng->cutoffs[i-1])
            ng->num_kgrams[i]--;
          else
            store_normal_count(ng, ng->num_kgrams[i]-1, ng_count[i], i);

          ng_count[i] = current_ngram.count;
          ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
          ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
                                                    ng->ptr_table[i],
                                                    &(ng->ptr_table_size[i]),
                                                    ng->num_kgrams[i]);
          ng->num_kgrams[i]++;

          if (ng->num_kgrams[i] >= ng->table_sizes[i])
            quit(-1, "More than %d %d-grams needed to be stored.  Rerun with a higher table size.\n",
                 ng->table_sizes[i], i+1);
        }

        for (i = 0; i <= pos_of_novelty-1; i++)
          ng_count[i] += current_ngram.count;

        /* Deal with new 1-grams */
        if (pos_of_novelty == 0) {
          if (ng->n > 1) {
            for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
              ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
                                        ng->ptr_table[0],
                                        &(ng->ptr_table_size[0]),
                                        i);
            }
            prev_id1 = current_ngram.id_array[0];
          }

          NG_DISC_METH(ng)->update_freq_of_freq(ng, 0, ng_count[0]);

          if (!ng->context_cue[previous_ngram.id_array[0]]) {
            ng->n_unigrams += ng_count[0];
            store_normal_count(ng, previous_ngram.id_array[0], ng_count[0], 0);
          }
          store_marginal_count(ng, previous_ngram.id_array[0], ng_count[0], 0);

          ng_count[0] = current_ngram.count;
        }

        if (current_ngram.count <= ng->cutoffs[ng->n-2])
          ng->num_kgrams[ng->n-1]--;

        ngram_copy(&previous_ngram, &current_ngram, ng->n);

      } else {
        if (!displayed_oov_warning) {
          pc_message(verbosity, 2, "Warning : id n-gram stream contains OOVs (these n-grams will be ignored).\n");
          displayed_oov_warning = 1;
        }
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  /* Flush out the final counts that are still pending */
  for (i = ng->n-2; i >= 1; i--) {
    NG_DISC_METH(ng)->update_freq_of_freq(ng, i, ng_count[i]);
    if (ng_count[i] <= ng->cutoffs[i-1])
      ng->num_kgrams[i]--;
    else
      store_normal_count(ng, ng->num_kgrams[i]-1, ng_count[i], i);
  }

  NG_DISC_METH(ng)->update_freq_of_freq(ng, 0, ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng, current_ngram.id_array[0], ng_count[0], 0);
  }
  store_marginal_count(ng, current_ngram.id_array[0], ng_count[0], 0);

  if (ng->n > 1) {
    for (i = current_ngram.id_array[0]+1; i <= ng->vocab_size; i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
                                ng->ptr_table[0],
                                &(ng->ptr_table_size[0]),
                                current_ngram.id_array[0]);
  }

  /* The idngram reading is complete at this point */
  pc_message(verbosity, 2, "\n");

  /* Impose a minimum unigram count, if required */
  if (ng->min_unicount > 0) {
    int nchanged = 0;

    for (i = ng->first_id; i <= ng->vocab_size; i++) {
      if ((return_count(ng->four_byte_counts,
                        ng->count_table[0],
                        ng->count[0],
                        ng->count4[0],
                        i) < ng->min_unicount) && !ng->context_cue[i]) {
        /* There was a bug in V2's switch.  Look at the segment for
           ABSOLUTE. */
        NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng, i);
        ng->n_unigrams += (ng->min_unicount - ng->count[0][i]);
        store_normal_count(ng, i, ng->min_unicount, 0);
        nchanged++;
      }
    }

    if (nchanged > 0)
      pc_message(verbosity, 2,
                 "Unigram counts of %d words were bumped up to %d.\n",
                 nchanged, ng->min_unicount);
  }

  /* Count zeroton information for the unigrams */
  ng->freq_of_freq[0][0] = 0;
  for (i = ng->first_id; i <= ng->vocab_size; i++) {
    if (return_count(ng->four_byte_counts,
                     ng->count_table[0],
                     ng->count[0],
                     ng->count4[0],
                     i) == 0)
      ng->freq_of_freq[0][0]++;
  }

  if (ng->discounting_method == GOOD_TURING) {
    for (i = 0; i <= ng->n-1; i++)
      for (j = 1; j <= ng->fof_size[i]; j++)
        pc_message(verbosity, 3, "fof[%d][%d] = %d\n", i, j, ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity, 2, "Calculating discounted counts.\n");
  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);

  /* Smooth the unigram distribution, to give some mass to zerotons */
  compute_unigram(ng, verbosity);

  /* Increment contexts if using Good-Turing discounting.  No need
     otherwise, since all values are discounted anyway. */
  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity, 2, "Incrementing contexts...\n");
    for (i = ng->n-1; i >= 1; i--)
      increment_context(ng, i, verbosity);
  }

  /* Calculate the back-off weights */
  pc_message(verbosity, 2, "Calculating back-off weights...\n");
  for (i = 1; i <= ng->n-1; i++)
    compute_back_off(ng, i, verbosity);

  if (!ng->four_byte_alphas)
    pc_message(verbosity, 3, "Number of out-of-range alphas = %d\n",
               ng->size_of_alpha_array);

  /* Write out the language model */
  pc_message(verbosity, 2, "Writing out language model...\n");
  if (ng->write_arpa)
    write_arpa_lm(ng, verbosity);
  if (ng->write_bin)
    write_bin_lm(ng, verbosity);

  pc_message(verbosity, 0, "idngram2lm : Done.\n");

  return 0;
}