예제 #1
0
/**
 * Read a word list (vocabulary) file into a string -> id hash table.
 *
 * Every line that does not start with "##" counts as one entry.  Entries are
 * numbered from 1 in file order, and the first whitespace-delimited token of
 * the line is registered in the hash table under that number.  A dot is
 * written to stdout every 1000 entries as a progress indicator.
 *
 * A line that contains no token at all (e.g. a blank line) still consumes an
 * entry number, so the ids of the following words are unaffected, but nothing
 * is registered for it and a warning is printed.  Tokens longer than 255
 * characters are truncated to fit the local buffer.
 *
 * @param wlist_filename  Path of the word list; opened with rr_iopen().
 * @param verbosity       Non-zero to report the number of entries read on
 *                        stderr (only while the global verbose_cmuclmtk is 1).
 * @param p_word_id_ht    Hash table that receives the word -> id mapping.
 * @param p_n_wlist       Receives the number of entries read.
 */
void read_wlist_into_siht(char *wlist_filename, int verbosity,  
                          sih_t *p_word_id_ht,  /** Pointer of the word ID hash table */
                          vocab_sz_t * p_n_wlist  /** Pointer of the size of wordlist */
                          )
{

    static char rname[]="read_wlist_into_siht";

    FILE   *wlist_fp = rr_iopen(wlist_filename);
    char   wlist_entry[1024], word[256], *word_copy;
    vocab_sz_t    entry_no = 0;
    int    n_tokens;
    
    while (fgets (wlist_entry, sizeof (wlist_entry), wlist_fp)) {
        /* Lines starting with "##" are comments and do not get an id. */
        if (strncmp(wlist_entry,"##",2) == 0) continue;
        entry_no++;
        
        if(entry_no%1000==0){
            fprintf(stdout,".");
            fflush(stdout);
        }

        /* The field width keeps the token (plus its terminating NUL) inside
           word[256]; the input line itself may be up to 1023 characters. */
        n_tokens = sscanf (wlist_entry, "%255s ", word);
        warn_on_wrong_vocab_comments(wlist_entry);

        if (n_tokens != 1) {
            /* Nothing was parsed, so `word` holds no valid data for this
               line; registering it would map stale or uninitialised text. */
            fprintf(stderr,"%s: warning: entry %lld of \"%s\" contains no word; not added.\n",
                    rname,(long long) entry_no,wlist_filename);
            continue;
        }

        /* NOTE(review): the copy is released right after sih_add(); this is
           only safe if sih_add() stores its own copy of the string - confirm
           against the sih_add() implementation. */
        word_copy = rr_salloc(word);
        sih_add(p_word_id_ht, word_copy, entry_no);
        free(word_copy); // HLW
    }
    fprintf(stdout,"\n");
    fflush(stdout);
    rr_iclose(wlist_fp);

    if (verbose_cmuclmtk == 1 && verbosity) {
        fprintf(stderr,"%s: a list of %d words was read from \"%s\".\n",
                rname,(int) entry_no,wlist_filename);
    }
    *p_n_wlist = entry_no;
}
예제 #2
0
int oe_02_main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  
  if (!arpa_lm && !binary_lm)
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,""))
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  n=arpa_lm?
    arpa_ng.n:
    ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);

	warn_on_wrong_vocab_comments(wlist_entry);
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0)
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if(strlen(input_string) < sizeof(input_string)-1)
      input_string[strlen(input_string)-1] = '\0'; //chop new-line
    else 
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list",""));
    
      text_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs",""));

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }else
	    /* do perplexity sentence by sentence [20090612] (air) */
	    if (!strcmp(args[0],"uttperp")) {
	      FILE *uttfh,*tempfh;
	      char utt[4096]; /* live dangerously... */
	      char tmpfil[128];
	      if ((uttfh = fopen(text_stream_filename,"r")) == NULL) {
		printf("Error: can't open %s\n",text_stream_filename);
		exit(1);
	      }
            char *template = "uttperp_XXXXXX";// CHANGED HLW
            mkstemp(template);// CHANGED HLW
예제 #3
0
/**
 * Command-line entry point that combines two ARPA language models into one.
 *
 * Required options: -lm1 and -lm2 (input models), -lm (output model) and
 * -weight (weights file).  Optional: -forced_backoff, -context and the four
 * -backoff_from_{unk,ccs}_{inc,exc} flags.  With -help, with no arguments, or
 * when a required option is missing, the help text is printed and the
 * process exits with status 1.
 *
 * The visible pipeline is: load both models, load the two weights, build the
 * merged model structure, load context cues and forced-backoff lists for all
 * three models, then recompute OOV probabilities, check probabilities,
 * interpolate, compute backoff weights and write the result.
 *
 * @param argc  Argument count (consumed by the pc_* argument parsers).
 * @param argv  Argument vector.
 * @return 0 on completion.
 */
int oe_11_main(int argc,char** argv)
{
	arpa_lm_t arpa_lm,lm1,lm2;
	char header1[MAX_HEADER];
	char header2[MAX_HEADER];
	flag backoff_from_unk_inc,backoff_from_unk_exc,backoff_from_ccs_inc,backoff_from_ccs_exc;
	char *lmfile1,*lmfile2,*newlmfile, *wtfile;
	char *fb_list_filename,*ccs_filename;
	fb_info *fb1,*fb2,*fb;
	double w1,w2;
	
	/* NOTE(review): this entry point is named oe_11_* but uses the oe_05_*
	   help text - confirm that this is the intended help message. */
	if (pc_flagarg(&argc, argv,"-help") || argc == 1) {
	  oe_05_help_message();
	  exit(1);
	}
	
	/* Required options: a missing one is a usage error.  Stop here instead
	   of carrying on with an empty file name, as the -help branch does. */
	lmfile1 = rr_salloc(pc_stringarg(&argc, argv,"-lm1",""));
	if (0 == strcmp(lmfile1, "")) {
		fprintf(stderr, "ERROR: Please specify a first input file with -lm1.\n");
		oe_05_help_message();
		exit(1);
	}
	lmfile2 = rr_salloc(pc_stringarg(&argc, argv,"-lm2",""));
	if (0 == strcmp(lmfile2, "")) {
		fprintf(stderr, "ERROR: Please specify a second input file with -lm2.\n");
		oe_05_help_message();
		exit(1);
	}
	newlmfile = rr_salloc(pc_stringarg(&argc, argv,"-lm",""));
	if (0 == strcmp(newlmfile, "")) {
		fprintf(stderr, "ERROR: Please specify a destination file with -lm.\n");
		oe_05_help_message();
		exit(1);
	}
	/* Optional: defaults to the empty string when not given. */
	fb_list_filename = rr_salloc(pc_stringarg(&argc, argv,"-forced_backoff",""));
	wtfile= rr_salloc(pc_stringarg(&argc, argv,"-weight",""));
	if (0 == strcmp(wtfile, "")) {
		fprintf(stderr, "ERROR: Please specify a weights file with -weight.\n");
		oe_05_help_message();
		exit(1);
	}
	/* Optional: defaults to the empty string when not given. */
	ccs_filename= rr_salloc(pc_stringarg(&argc, argv,"-context",""));

	backoff_from_unk_inc = pc_flagarg(&argc,argv,"-backoff_from_unk_inc");
	backoff_from_ccs_inc = pc_flagarg(&argc,argv,"-backoff_from_ccs_inc");
	backoff_from_unk_exc = pc_flagarg(&argc,argv,"-backoff_from_unk_exc");
	backoff_from_ccs_exc = pc_flagarg(&argc,argv,"-backoff_from_ccs_exc");
  
	/* Load both input models, keeping each file's header text. */
	robust_load_arpa_lm(&lm1,lmfile1,header1,MAX_HEADER);
	robust_load_arpa_lm(&lm2,lmfile2,header2,MAX_HEADER);
	
	load_weights(&w1,&w2,wtfile);

	printf ("\ncombine lms\n");
	combine_lm(&arpa_lm,&lm1,&lm2);

	/* The same context-cue file is applied to the merged model and to both
	   inputs. */
	printf ("\nloading context cues.\n");
	load_context_cue(&arpa_lm,ccs_filename);
	load_context_cue(&lm1,ccs_filename);
	load_context_cue(&lm2,ccs_filename);

	/* One forced-backoff list per model, all built from the same flags and
	   list file. */
	fb=gen_fb_list(arpa_lm.vocab_ht,
		arpa_lm.vocab_size,
		arpa_lm.vocab,
		arpa_lm.context_cue,
		backoff_from_unk_inc,
		backoff_from_unk_exc,
		backoff_from_ccs_inc,
		backoff_from_ccs_exc,
		fb_list_filename);

	fb1=gen_fb_list(lm1.vocab_ht,
		lm1.vocab_size,
		lm1.vocab,
		lm1.context_cue,
		backoff_from_unk_inc,
		backoff_from_unk_exc,
		backoff_from_ccs_inc,
		backoff_from_ccs_exc,
		fb_list_filename);

	fb2=gen_fb_list(lm2.vocab_ht,
		lm2.vocab_size,
		lm2.vocab,
		lm2.context_cue,
		backoff_from_unk_inc,
		backoff_from_unk_exc,
		backoff_from_ccs_inc,
		backoff_from_ccs_exc,
		fb_list_filename);
	
	printf ("\nrecaculate oov probabilities.\n");
	recalc_oov_prob(&arpa_lm,&lm1,&lm2);

	printf ("\ncheck probabilities\n");
	check_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2);

	printf ("\ncalculate interpolated probabilities\n");
	calc_interpolated_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2);

	printf ("\ncalculate backoff weights\n");
	calc_backoff_weight(&arpa_lm,fb);

	/* NOTE(review): the trailing 2 is passed through unchanged; its meaning
	   is defined by write_interpolated_lm() - confirm there. */
	printf ("\nwrite interpolated lm\n");
	write_interpolated_lm(&arpa_lm,newlmfile,header1,header2,2);

	printf ("\nfinished\n");

	return 0;
}
예제 #4
0
/**
 * Build the table structure of a merged language model from two inputs.
 *
 * The function allocates the size tables of `arpa_lm`, calls
 * calc_merged_ngram_num() and ng_arpa_lm_alloc_struct(), and then walks the
 * union of the n-grams of `lm1` and `lm2` (begin_browse_union() /
 * get_next_ngram_union()) in three passes:
 *   1. unigrams: each word is handed to check_open_close_vocab();
 *   2. k-grams for k = 2 .. n-1: the id of the last word is stored in
 *      word_id[k-1][] and the index table ind[k-2][] of the level above is
 *      filled with the position of the first successor;
 *   3. n-grams: same bookkeeping for the top level.
 * The process is aborted through quit() when a word is not in the merged
 * vocabulary, when an n-gram is repeated, or when more n-grams are produced
 * than table_sizes[] allows.
 *
 * @param arpa_lm  Output model; its tables are allocated and filled here.
 * @param lm1      First input model.
 * @param lm2      Second input model.
 */
void combine_lm(arpa_lm_t *arpa_lm, arpa_lm_t *lm1, arpa_lm_t *lm2)
{
	char *in_line;
	char *input_line;
	int i,j,k;
	int num_of_args;
	int pos_of_novelty;
	char *input_line_ptr_orig;
	char *word_copy;
	id__t *previous_ngram;
	id__t *current_ngram;
	vocab_sz_t temp_id;
	vocab_sz_t *pos_in_list;
	int previd;
	TBROWSE_UNION bru;
	char** words;
	
	/* Scratch array that receives the words of the n-gram being visited. */
	words=(char**)NewArray(15,MAX_WORD,sizeof(char));
	
	/* NOTE(review): in_line / input_line are allocated and freed but never
	   read or written in between in this function. */
	in_line = (char *) rr_malloc(1024*sizeof(char));
	input_line = (char *) rr_malloc(1024*sizeof(char));
	/* The guard below hides the (otherwise unused) assignment from the clang
	   static analyzer unless STATICANALYZEDEPENDENCIES is defined.  #import
	   is the Objective-C / clang include-once directive. */
#import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__	
	input_line_ptr_orig = input_line;
#endif	
	
	/* Read number of each k-gram */
	
	arpa_lm->table_sizes = (table_size_t *) rr_malloc(sizeof(table_size_t)*11);
    
	arpa_lm->num_kgrams = (ngram_sz_t *) rr_malloc(sizeof(ngram_sz_t)*11);
		
	/* Presumably sets arpa_lm->n and the per-order counts used below -
	   TODO confirm in calc_merged_ngram_num(). */
	calc_merged_ngram_num(arpa_lm, lm1, lm2);
	
	/* Word ids of the previously visited and the current n-gram. */
	previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
	current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));

	/* Per-level cursor into the word_id tables. */
	pos_in_list = (vocab_sz_t *) rr_malloc(sizeof(vocab_sz_t) * arpa_lm->n);
	ng_arpa_lm_alloc_struct(arpa_lm);
	
	/* Process 1-grams */
	
	printf("Reading unigrams...\n");
	
	i=0;
	
	begin_browse_union(lm1,lm2,1,&bru);
	
	while (get_next_ngram_union(words,&bru)) {
	  /* The copy is not freed here; presumably check_open_close_vocab()
	     keeps the pointer - TODO confirm. */
	  word_copy = rr_salloc(words[0]);
	  /* Do checks about open or closed vocab */
	  check_open_close_vocab(arpa_lm,word_copy,&i);
	}
	
	/* Process 2, ... , n-1 grams */
	/* NOTE(review): the conditional region below contains the opening of the
	   for and while loops.  If the #if condition is ever false, the braces
	   of this function no longer balance - confirm that this file is never
	   compiled in that configuration. */
#import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__	
	previd = -1;
	
	for (i=2;i<=arpa_lm->n-1;i++) {
		
		printf("\nReading %d-grams...\n",i);
		
		previd = -1;
		
		j=0;
		
		for (k=0;k<=arpa_lm->n-1;k++) {
			pos_in_list[k] = 0;
		}
		
		begin_browse_union(lm1,lm2,i,&bru);
		while (get_next_ngram_union(words,&bru)) {
			
			/* Process line into all relevant temp_words */			
			num_of_args = 0;						
#endif		
			/* Store the id of the last word of this i-gram. */
			sih_lookup(arpa_lm->vocab_ht,words[i-1],&temp_id);
			arpa_lm->word_id[i-1][j] = temp_id;
			
			show_dot(j);
			
			j++;
			if (j>arpa_lm->table_sizes[i-1]) {
				quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i);
			}
			
			/* Make sure that indexes in previous table point to 
			the right thing. */
			
			for (k=0;k<=i-1;k++) {
				previous_ngram[k] = current_ngram[k];
				sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id);
				/* id 0 is only acceptable for the literal "<UNK>". */
				if (temp_id == 0 && strcmp(words[k],"<UNK>")) {
					quit(-1,"Error - found unknown word in n-gram file : %s\n",
						words[k]);
				}
				current_ngram[k] = temp_id;
			}
			
			/* Find position of novelty */
			
			/*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/
			if (j==1) pos_of_novelty=0;
			else {
				pos_of_novelty = i;
				
				for (k=0;k<=i-1;k++) {
					if (current_ngram[k] > previous_ngram[k]) {
						pos_of_novelty = k;
						/* Force the loop to terminate. */
						k = arpa_lm->n;
					}
					else {
						/* NOTE(review): this condition is the same as the
						   one of the enclosing if, so the ordering check
						   can never fire; a '<' comparison may have been
						   intended - confirm before changing. */
						if ((current_ngram[k] > previous_ngram[k]) && (j > 0)) {
							quit(-1,"Error : n-grams are not correctly ordered.\n");
						}
					}
				}
			}
			
			/* No position differed from the previous i-gram: duplicate. */
			if (pos_of_novelty == i && j != 1)
			  quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
			       i);
			
			/* Only a change in the history (not just in the last word)
			   requires new entries in the index table of the level above. */
			if (pos_of_novelty != i-1) {
				if (i==2) {
					/* Deal with unigram pointers */
					
					for (k = previd + 1; k <= current_ngram[0]; k++) {
						arpa_lm->ind[0][k] = new_index(j-1,
							arpa_lm->ptr_table[0],
							&(arpa_lm->ptr_table_size[0]),
							k);
					}
					previd = current_ngram[0];
				}else {
					
					/* Re-locate the history in each level from the first
					   changed position downwards. */
					for (k=pos_of_novelty;k<=i-2;k++) {
						if (k == 0) {
							pos_in_list[0] = current_ngram[0];
						}
						else {
							/* NOTE(review): MIN is used here while the
							   corresponding n-gram pass further down uses
							   MAX - confirm which one is intended. */
							pos_in_list[k] = 
								MIN(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
								arpa_lm->ptr_table[k-1],   
								arpa_lm->ptr_table_size[k-1],   
								pos_in_list[k-1]),pos_in_list[k]);
							while (arpa_lm->word_id[k][pos_in_list[k]] < 
								current_ngram[k]) {
								pos_in_list[k]++;
							}
						}
					}
					for (k = previd + 1; k <= pos_in_list[i-2]; k++) {
						arpa_lm->ind[i-2][k] = 
							new_index(j-1,
							arpa_lm->ptr_table[i-2],
							&(arpa_lm->ptr_table_size[i-2]),
							k);
					}
					previd = pos_in_list[i-2];	    
				}
			}
		}
	
		/* Now need to tidy up pointers for bottom section of unigrams */
	
		for (k = previd + 1; k <= arpa_lm->vocab_size; k++) {
			arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1],
				arpa_lm->ptr_table[0],
				&(arpa_lm->ptr_table_size[0]),
				k);
		}      
	
	}
  
	printf("\nReading %d-grams...\n",arpa_lm->n);
	
	j = 0;
	previd = 0;
	
	arpa_lm->ind[arpa_lm->n-2][0] = 0;
	
	for (k=0;k<=arpa_lm->n-1;k++) {
		/* bug fixed by Wei Xu : this is a serious bug*/
		pos_in_list[k] = 0;
		//    pos_in_list[0] = 0;
	}
	
	begin_browse_union(lm1,lm2,arpa_lm->n,&bru);
	while (get_next_ngram_union(words,&bru)) {

	  show_dot(j);
	  
		/* Store the id of the last word of this n-gram. */
		sih_lookup(arpa_lm->vocab_ht,words[arpa_lm->n-1],&temp_id);
		
		arpa_lm->word_id[arpa_lm->n-1][j] = temp_id;
		
		j++;
		
		for (k=0;k<=arpa_lm->n-1;k++) {
			previous_ngram[k] = current_ngram[k];
			sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id);
			/* id 0 is only acceptable for the literal "<UNK>". */
			if (temp_id == 0 && strcmp(words[k],"<UNK>")) {
				quit(-1,"Error - found unknown word in n-gram file : %s\n",
					words[k]);
			}
			current_ngram[k] = temp_id;
		}
		
		/* Find position of novelty */
		
		/*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/
		if (j==1) pos_of_novelty=0;
		else {
			pos_of_novelty = arpa_lm->n+1;
			
			for (k=0;k<=arpa_lm->n-1;k++) {
				if (current_ngram[k] > previous_ngram[k]) {
					pos_of_novelty = k;
					/* Force the loop to terminate. */
					k = arpa_lm->n;
				}else {
					/* NOTE(review): same condition as the enclosing if, so
					   this ordering check is unreachable; a '<' comparison
					   may have been intended - confirm before changing. */
					if ((current_ngram[k] > previous_ngram[k]) && (j>0)) {
						quit(-1,"Error : n-grams are not correctly ordered.\n");
					}
				}
			}
		}
		
		/* No position differed from the previous n-gram: duplicate. */
		if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
			quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
				arpa_lm->n);
		}
		
		if (pos_of_novelty != arpa_lm->n-1) {
			
			for (k=pos_of_novelty;k<=arpa_lm->n-2;k++) {
				if (k == 0) {
					pos_in_list[0] = current_ngram[0];
				}else {
					pos_in_list[k] = 
						MAX(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
						arpa_lm->ptr_table[k-1],   
						arpa_lm->ptr_table_size[k-1],   
						pos_in_list[k-1]),pos_in_list[k]);
					while (arpa_lm->word_id[k][pos_in_list[k]] < 
						current_ngram[k]) {
						pos_in_list[k]++;
					}
				}
			}
			for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) {
				arpa_lm->ind[arpa_lm->n-2][k] = 
					new_index(j-1,
					arpa_lm->ptr_table[arpa_lm->n-2],
					&(arpa_lm->ptr_table_size[arpa_lm->n-2]),
					k);
			}
			previd = pos_in_list[arpa_lm->n-2];
		}
		
		/* table_sizes[n-1] bounds the n-gram table, so the order reported
		   in the message is n (as in the k-gram pass, which reports i for
		   table_sizes[i-1]). */
		if (j>arpa_lm->table_sizes[arpa_lm->n-1]) {
			quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n);
		}
	}
	
	
	
	/* Tidy up */
	
	
	free(previous_ngram);
	free(current_ngram);
	free(pos_in_list);
	free(in_line);
	free(input_line);
	DeleteArray(words);
  
}