Exemplo n.º 1
0
/* Warn (on stderr) if `token` is present in the vocabulary but has not
 * been flagged as a context cue.  Helper for read_vocab(); must be called
 * after ng->context_cue has been populated. */
static void warn_if_unlabelled_cc(ng_t* ng, const char* token)
{
  vocab_sz_t test_cc_id;

  if (sih_lookup(ng->vocab_ht,(char*)token,&test_cc_id) != 0) {
    if (ng->context_cue[test_cc_id] == 0)
      fprintf(stderr,"WARNING: %s appears as a vocabulary item, but is not labelled as a\ncontext cue.\n",token);
  }
}

/* Read the vocabulary file into ng->vocab_ht / ng->vocab and, if a context
 * cues file was supplied (ng->context_set), mark each cue word in
 * ng->context_cue.  Warns about the conventional cue tokens <s>, <p> and
 * <art> when they exist in the vocabulary but are not labelled as cues. */
void read_vocab(ng_t* ng, int verbosity)
{
  vocab_sz_t current_cc_id;
  char current_cc[200];
  char wlist_entry[1024];

  pc_message(verbosity,2,"Reading vocabulary.\n");

  /* Don't change the parameter of sih_create, because it will change
     the binary layout of the .binlm file */

  ng->vocab_ht =
    sih_create(1000,0.5,2.0,1);

  read_voc(ng->vocab_filename,verbosity,ng->vocab_ht,&ng->vocab,&(ng->vocab_size));

  /* Determine which of the vocabulary words are context cues */

  ng->no_of_ccs = 0;
  ng->context_cue = (flag *) rr_calloc(ng->vocab_size+1,sizeof(flag));

  if (ng->context_set) {
    /* This should be tied to l889 to l894 in lm_combine.c */
    while (fgets (wlist_entry, sizeof (wlist_entry),ng->context_cues_fp)) {
      if (strncmp(wlist_entry,"##",2)==0) continue;  /* comment line */
      /* BUG FIX: bound the conversion -- current_cc holds 199 chars + NUL. */
      sscanf (wlist_entry, "%199s ",current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(ng->vocab_ht,current_cc,&current_cc_id) == 0)
	pc_message(verbosity,1,"Warning : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
      else {
	/* BUG FIX: no (unsigned short) cast -- vocabularies larger than
	   65535 words would have had their cue ids truncated, flagging
	   the wrong entry. */
	ng->context_cue[current_cc_id] = 1;
	pc_message(verbosity,2,"Context cue word : %s id = %lld\n",current_cc,(long long)current_cc_id);
	ng->no_of_ccs++;
      }
    }
    rr_iclose(ng->context_cues_fp);
  }

  /* Conventional context-cue tokens: warn if present but unlabelled. */
  warn_if_unlabelled_cc(ng,"<s>");
  warn_if_unlabelled_cc(ng,"<p>");
  warn_if_unlabelled_cc(ng,"<art>");
}
Exemplo n.º 2
0
/* Load the context-cue word list from ccs_filename (if non-empty) and mark
 * the corresponding entries in lm->context_cue.  Each non-comment line's
 * first token must be a vocabulary word; otherwise the program quits.
 * Lines beginning with "##" are treated as comments. */
void load_context_cue(arpa_lm_t* lm, char* ccs_filename)
{
  FILE* context_cues_fp;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;

  lm->context_cue = (flag *) rr_calloc(lm->table_sizes[0],sizeof(flag));
  lm->no_of_ccs = 0;
  if (strcmp(ccs_filename,"")) {
    context_cues_fp = rr_iopen(ccs_filename);
    while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
      if (strncmp(wlist_entry,"##",2)==0) continue;  /* comment line */
      /* BUG FIX: bound the conversion -- current_cc holds 199 chars + NUL. */
      sscanf (wlist_entry, "%199s ",current_cc);
      warn_on_wrong_vocab_comments(wlist_entry);

      if (sih_lookup(lm->vocab_ht,current_cc,&current_cc_id) == 0)
	quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);

      /* BUG FIX: no (unsigned short) cast -- ids above 65535 would have
	 been truncated, flagging the wrong vocabulary entry. */
      lm->context_cue[current_cc_id] = 1;
      lm->no_of_ccs++;
      /* Cast so the value matches the %lld specifier regardless of the
	 underlying vocab_sz_t typedef. */
      fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,(long long)current_cc_id);
    }
    rr_iclose(context_cues_fp);
  }
}
Exemplo n.º 3
0
/* Return the LM probability of words[k-1] given the k-1 preceding words.
 * Words absent from the vocabulary map to id 0 (sih_lookup leaves 0 for
 * misses), i.e. they are treated as <UNK>. */
double calc_prob(char** words,int k,arpa_lm_t* lm,fb_info* fb_list)
{
  id__t id[MAX_K];
  int bo_case,acl;
  vocab_sz_t index;
  int i;

  /* BUG FIX: guard the fixed-size id[] buffer; previously k > MAX_K
     silently overflowed the stack. */
  if (k < 1 || k > MAX_K)
    quit(-1,"calc_prob : number of words (%d) must be between 1 and %d.\n",k,MAX_K);

  for (i=0;i<k;i++) {
    sih_lookup(lm->vocab_ht,words[i],&index);  /* 0 => OOV */
    id[i]=index;
  }

  return calc_prob_of(id[k-1],id,k-1,NULL,lm,fb_list,&bo_case,&acl,TRUE);
}
Exemplo n.º 4
0
/* Compute the perplexity of a language model with respect to a text stream.
 *
 * Exactly one of ng (binary LM) / arpa_ng (ARPA LM) is used, selected by
 * the arpa_lm flag.  Optionally writes a per-word probability stream,
 * an annotation stream and the encountered OOV words to files (empty
 * filename disables each).  The backoff_from_* flags force inclusive or
 * exclusive back-off from OOVs / context cues; fb_list_filename may supply
 * an explicit forced-back-off list.  include_unks makes OOVs count towards
 * the perplexity; context cues are always excluded.  log_base is the base
 * used when reporting log-probabilities in warnings/annotations.
 * Results are printed to stdout; on recoverable input errors the function
 * prints a message and returns without computing anything. */
void compute_perplexity(ng_t *ng,
			arpa_lm_t *arpa_ng,
			char *text_stream_filename,
			char *probs_stream_filename,
			char *annotation_filename,
			char *oov_filename,
			char *fb_list_filename,
			flag backoff_from_unk_inc,
			flag backoff_from_unk_exc,
			flag backoff_from_ccs_inc,
			flag backoff_from_ccs_exc,
			flag arpa_lm,
			flag include_unks,
			double log_base) {

  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* Hope that's big enough */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Initialise file pointers to prevent warnings from the compiler. */

  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;

  short_current_id = 0;

  found_unk_wrongly = 0;

  annotate = 0;

  bo_case = 0;

  /* Build the forced-back-off list from whichever model is in use. */
  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
			  (int) arpa_ng->vocab_size,
			  arpa_ng->vocab,
			  arpa_ng->context_cue,
			  backoff_from_unk_inc,
			  backoff_from_unk_exc,
			  backoff_from_ccs_inc,
			  backoff_from_ccs_exc,
			  fb_list_filename);
  }else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
			  (int) ng->vocab_size,
			  ng->vocab,
			  ng->context_cue,
			  backoff_from_unk_inc,
			  backoff_from_unk_exc,
			  backoff_from_ccs_inc,
			  backoff_from_ccs_exc,
			  fb_list_filename);
  }

  /* ngrams_hit[i] counts predictions resolved at the (i+1)-gram level. */
  ngrams_hit = (int *) rr_calloc(n,sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *)*n);
  for (i=0;i<=n-1;i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);

  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid. Note that the checks employed by the
     standard rr_fopen tools are not suitable here, since we don't
     want the program to terminate if the paths are not found. */

  if (!strcmp(text_stream_filename,"")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    return;
  }

  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
    printf("Error : Can't open file %s for reading.\n",text_stream_filename);
    return;
  }

  out_probs = strcmp(probs_stream_filename,"");
  annotate = strcmp(annotation_filename,"");
  out_oovs = strcmp(oov_filename,"");

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n",text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
	    probs_stream_filename);

  if (annotate)
    printf("Annotation will be written to file %s\n",
	    annotation_filename);

  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
	    oov_filename);

  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");

  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");

  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");

  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");

  if (strcmp(fb_list_filename,""))
    printf("Will force back-off according to the contents of %s\n",
	    fb_list_filename);

  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check for existance of files, as rr functions will quit, which isn't
     what we want */

  if (out_probs && strcmp(probs_stream_filename,"-")) {
    if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (annotate && strcmp(annotation_filename,"-")) {
    if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",annotation_filename);
      return;
    }
    fclose(temp_fp);
  }

  if (out_oovs && strcmp(oov_filename,"-")) {
    if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",oov_filename);
      return;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);

  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);

  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    /* Shift the history window of previous words along by one. */
    if (total_words > 0) {
      if (total_words < n)
	strcpy(prev_words[total_words-1],current_word);
      else {
	for (i=0;i<=n-3;i++)
	  strcpy(prev_words[i],prev_words[i+1]);

	if (n>1)
	  strcpy(prev_words[n-2],current_word);
      }
    }

    /* Context grows until it reaches the full n-1 words. */
    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill context with right stuff */

    if (total_words > (n-1)) {
      for (i=0;i<=context_length-2;i++)
	context[i] = context[i+1];
    }

    if (context_length != 0)
      context[context_length-1] = short_current_id;

    /* BUG FIX: bound the conversion -- current_word holds 999 chars + NUL;
       an unbounded %s could overflow it. */
    if (fscanf(text_stream_fp,"%999s",current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
	printf("Error reading text file.\n");
	return;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      /* Map the word to its id; id 0 means OOV. */
      if (arpa_lm) {
	sih_lookup(arpa_ng->vocab_ht,current_word,&current_id);
	if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
	  found_unk_wrongly = 1;
	  printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
	}
	/* BUG FIX: report the offending id (current_id), not context[i]
	   whose index i is stale at this point. */
	if (current_id > arpa_ng->vocab_size)
	  quit(-1,"Error : returned value from sih_lookup (%lld) is too high.\n",(long long)current_id);
	else
	  short_current_id = current_id;

      }else {
	sih_lookup(ng->vocab_ht,current_word,&current_id);
	if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
	  found_unk_wrongly = 1;
	  printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
	}
	/* BUG FIX: same as above -- report current_id. */
	if (current_id > ng->vocab_size)
	  quit(-1,"Error : returned value from sih_lookup (%lld) is too high.\n",(long long)current_id);
	else
	  short_current_id = current_id;

      }

      if (!found_unk_wrongly) {

	if (current_id == 0 && out_oovs)
	  fprintf(oov_fp,"%s\n",current_word);

	/* Context cues never contribute to the perplexity. */
	if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
	    || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

	  if (include_unks || current_id != 0) {

	    prob = calc_prob_of(short_current_id,
				context,
				context_length,
				ng,
				arpa_ng,
				fb_list,
				&bo_case,
				&actual_context_length,
				arpa_lm);


	    if (prob<= 0.0 || prob > 1.0) {
	      fprintf(stderr,"Warning : ");
	      if (short_current_id == 0)
		fprintf(stderr,"P( <UNK> | ");
	      else
		fprintf(stderr,"P( %s | ",current_word);

	      for (i=0;i<=actual_context_length-1;i++) {
		if (context[i+context_length-actual_context_length] == 0)
		  fprintf(stderr,"<UNK> ");
		else
		  fprintf(stderr,"%s ",prev_words[i]);
	      }
	      fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
	      /* BUG FIX: "0x%dx" printed e.g. "0x12x"; use a real hex
		 conversion. */
	      fprintf(stderr,"bo_case == 0x%x, actual_context_length == %d\n",
		      bo_case, actual_context_length);
	    }

	    if (annotate) {
	      if (short_current_id == 0)
		fprintf(annotation_fp,"P( <UNK> | ");
	      else
		fprintf(annotation_fp,"P( %s | ",current_word);

	      for (i=0;i<=actual_context_length-1;i++) {
		if (context[i+context_length-actual_context_length] == 0)
		  fprintf(annotation_fp,"<UNK> ");
		else {
		  if (arpa_lm)
		    fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
		  else
		    fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
		}
	      }
	      fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
	      decode_bo_case(bo_case,actual_context_length,annotation_fp);
	    }

	    /* Calculate level to which we backed off */

	    for (i=actual_context_length-1;i>=0;i--) {
 	      int four_raise_i = 1<<(2*i);  /* PWP */

 	      /*
 	       * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
 	       * but was getting a divide-by-zero error on an Alpha
 	       * (it isn't clear to me why it should ever have done so)
 	       * Anyway, it is much faster to do in base-4.
 	       */

	      if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
		ngrams_hit[i+1]++;
		i = -2;       /* sentinel: a level was credited */
	      }else
		bo_case -= ((bo_case / four_raise_i) * four_raise_i);
	    }

	    /* Loop fell through without crediting a level => unigram hit.
	       (i ends at -1 normally, -3 when the sentinel fired.) */
	    if (i != -3)
	      ngrams_hit[0]++;

	    if (out_probs)
	      fprintf(probs_stream_fp,"%g\n",prob);

	    sum_log_prob += log10(prob);

	  }

          if (current_id == 0 && !include_unks)
            excluded_unks++;
	}
	else {
	  if (((!arpa_lm) && ng->context_cue[current_id]) ||
	      (arpa_lm && arpa_ng->context_cue[current_id]))
	    excluded_ccs++;
	}
	total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {      /*  pow(x,y) = e**(y  ln(x)) */
    /* NOTE(review): if every word was excluded the divisor below is zero --
       presumably callers never feed an empty/cue-only text; verify. */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n",
	    exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
		log(10.0)),
	   (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
	    log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
	   total_words-excluded_ccs-excluded_unks);
    for(i=n;i>=1;i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n",i,ngrams_hit[i-1],
	     (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
	   excluded_unks,
	   (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);

  }

  rr_iclose(text_stream_fp);

  if (out_probs)
    rr_oclose(probs_stream_fp);
  if (annotate)
    rr_oclose(annotation_fp);
  if (out_oovs)
    rr_oclose(oov_fp);

  free (fb_list);
  free (context);
  free (ngrams_hit);
}
Exemplo n.º 5
0
int oe_02_main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  
  if (!arpa_lm && !binary_lm)
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,""))
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  n=arpa_lm?
    arpa_ng.n:
    ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);

	warn_on_wrong_vocab_comments(wlist_entry);
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0)
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if(strlen(input_string) < sizeof(input_string)-1)
      input_string[strlen(input_string)-1] = '\0'; //chop new-line
    else 
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list",""));
    
      text_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs",""));

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }else
	    /* do perplexity sentence by sentence [20090612] (air) */
	    if (!strcmp(args[0],"uttperp")) {
	      FILE *uttfh,*tempfh;
	      char utt[4096]; /* live dangerously... */
	      char tmpfil[128];
	      if ((uttfh = fopen(text_stream_filename,"r")) == NULL) {
		printf("Error: can't open %s\n",text_stream_filename);
		exit(1);
	      }
            char *template = "uttperp_XXXXXX";// CHANGED HLW
            mkstemp(template);// CHANGED HLW
Exemplo n.º 6
0
/* Merge two ARPA-format language models (lm1, lm2) into arpa_lm.
 *
 * Walks the union of n-grams of each order (via begin_browse_union /
 * get_next_ngram_union, which yield n-grams in sorted order), fills in
 * arpa_lm's word-id tables and maintains the compressed index pointers
 * between successive n-gram tables.  Quits on unknown words, repeated
 * n-grams, mis-ordered input or table-size overflow.
 * Preprocessor blocks referencing OpenEarsStaticAnalysisToggle.h are
 * analyzer toggles from the original source and are preserved as-is. */
void combine_lm(arpa_lm_t *arpa_lm, arpa_lm_t *lm1, arpa_lm_t *lm2)
{
	char *in_line;        /* unused scratch buffer, kept for layout parity */
	char *input_line;
	int i,j,k;
	int num_of_args;
	int pos_of_novelty;
	char *input_line_ptr_orig;
	char *word_copy;
	id__t *previous_ngram;
	id__t *current_ngram;
	vocab_sz_t temp_id;
	vocab_sz_t *pos_in_list;
	int previd;
	TBROWSE_UNION bru;
	char** words;

	words=(char**)NewArray(15,MAX_WORD,sizeof(char));

	in_line = (char *) rr_malloc(1024*sizeof(char));
	input_line = (char *) rr_malloc(1024*sizeof(char));
#import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
	input_line_ptr_orig = input_line;
#endif

	/* Read number of each k-gram */

	arpa_lm->table_sizes = (table_size_t *) rr_malloc(sizeof(table_size_t)*11);

	arpa_lm->num_kgrams = (ngram_sz_t *) rr_malloc(sizeof(ngram_sz_t)*11);

	/* Sizes the merged tables; also sets arpa_lm->n. */
	calc_merged_ngram_num(arpa_lm, lm1, lm2);

	previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
	current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));

	pos_in_list = (vocab_sz_t *) rr_malloc(sizeof(vocab_sz_t) * arpa_lm->n);
	ng_arpa_lm_alloc_struct(arpa_lm);

	/* Process 1-grams */

	printf("Reading unigrams...\n");

	i=0;

	begin_browse_union(lm1,lm2,1,&bru);

	while (get_next_ngram_union(words,&bru)) {
	  word_copy = rr_salloc(words[0]);
	  /* Do checks about open or closed vocab */
	  check_open_close_vocab(arpa_lm,word_copy,&i);
	}

	/* Process 2, ... , n-1 grams */
#import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
	previd = -1;

	for (i=2;i<=arpa_lm->n-1;i++) {

		printf("\nReading %d-grams...\n",i);

		previd = -1;

		j=0;

		for (k=0;k<=arpa_lm->n-1;k++) {
			pos_in_list[k] = 0;
		}

		begin_browse_union(lm1,lm2,i,&bru);
		while (get_next_ngram_union(words,&bru)) {

			/* Process line into all relevant temp_words */
			num_of_args = 0;
#endif
			sih_lookup(arpa_lm->vocab_ht,words[i-1],&temp_id);
			arpa_lm->word_id[i-1][j] = temp_id;

			show_dot(j);

			j++;
			if (j>arpa_lm->table_sizes[i-1]) {
				quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[i-1],i);
			}

			/* Make sure that indexes in previous table point to
			the right thing. */

			for (k=0;k<=i-1;k++) {
				previous_ngram[k] = current_ngram[k];
				sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id);
				if (temp_id == 0 && strcmp(words[k],"<UNK>")) {
					quit(-1,"Error - found unknown word in n-gram file : %s\n",
						words[k]);
				}
				current_ngram[k] = temp_id;
			}

			/* Find position of novelty */

			/*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/
			if (j==1) pos_of_novelty=0;
			else {
				pos_of_novelty = i;

				for (k=0;k<=i-1;k++) {
					if (current_ngram[k] > previous_ngram[k]) {
						pos_of_novelty = k;
						k = arpa_lm->n;    /* break out of the loop */
					}
					else {
						/* BUG FIX: the original repeated the '>' test here,
						   which is unreachable in this branch; '<' detects
						   genuinely mis-ordered n-grams as intended. */
						if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
							quit(-1,"Error : n-grams are not correctly ordered.\n");
						}
					}
				}
			}

			if (pos_of_novelty == i && j != 1)
			  quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
			       i);

			if (pos_of_novelty != i-1) {
				if (i==2) {
					/* Deal with unigram pointers */

					for (k = previd + 1; k <= current_ngram[0]; k++) {
						arpa_lm->ind[0][k] = new_index(j-1,
							arpa_lm->ptr_table[0],
							&(arpa_lm->ptr_table_size[0]),
							k);
					}
					previd = current_ngram[0];
				}else {

					for (k=pos_of_novelty;k<=i-2;k++) {
						if (k == 0) {
							pos_in_list[0] = current_ngram[0];
						}
						else {
							pos_in_list[k] =
								MIN(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
								arpa_lm->ptr_table[k-1],
								arpa_lm->ptr_table_size[k-1],
								pos_in_list[k-1]),pos_in_list[k]);
							while (arpa_lm->word_id[k][pos_in_list[k]] <
								current_ngram[k]) {
								pos_in_list[k]++;
							}
						}
					}
					for (k = previd + 1; k <= pos_in_list[i-2]; k++) {
						arpa_lm->ind[i-2][k] =
							new_index(j-1,
							arpa_lm->ptr_table[i-2],
							&(arpa_lm->ptr_table_size[i-2]),
							k);
					}
					previd = pos_in_list[i-2];
				}
			}
		}

		/* Now need to tidy up pointers for bottom section of unigrams */
		/* NOTE(review): this tidy-up always writes ind[0] with
		   num_kgrams[1] even for i > 2 -- looks like it should target
		   ind[i-2]/num_kgrams[i-1]; behaviour preserved, verify for n > 3. */

		for (k = previd + 1; k <= arpa_lm->vocab_size; k++) {
			arpa_lm->ind[0][k] = new_index(arpa_lm->num_kgrams[1],
				arpa_lm->ptr_table[0],
				&(arpa_lm->ptr_table_size[0]),
				k);
		}

	}

	printf("\nReading %d-grams...\n",arpa_lm->n);

	j = 0;
	previd = 0;

	arpa_lm->ind[arpa_lm->n-2][0] = 0;

	for (k=0;k<=arpa_lm->n-1;k++) {
		/* bug fixed by Wei Xu : this is a serious bug*/
		pos_in_list[k] = 0;
		//    pos_in_list[0] = 0;
	}

	begin_browse_union(lm1,lm2,arpa_lm->n,&bru);
	while (get_next_ngram_union(words,&bru)) {

	  show_dot(j);

		sih_lookup(arpa_lm->vocab_ht,words[arpa_lm->n-1],&temp_id);

		arpa_lm->word_id[arpa_lm->n-1][j] = temp_id;

		j++;

		for (k=0;k<=arpa_lm->n-1;k++) {
			previous_ngram[k] = current_ngram[k];
			sih_lookup(arpa_lm->vocab_ht,words[k],&temp_id);
			if (temp_id == 0 && strcmp(words[k],"<UNK>")) {
				quit(-1,"Error - found unknown word in n-gram file : %s\n",
					words[k]);
			}
			current_ngram[k] = temp_id;
		}

		/* Find position of novelty */

		/*bug fixed, for the first ngram, pos_of novelty should be 0 - Wei Xu*/
		if (j==1) pos_of_novelty=0;
		else {
			pos_of_novelty = arpa_lm->n+1;

			for (k=0;k<=arpa_lm->n-1;k++) {
				if (current_ngram[k] > previous_ngram[k]) {
					pos_of_novelty = k;
					k = arpa_lm->n;    /* break out of the loop */
				}else {
					/* BUG FIX: same dead '>' comparison as in the
					   lower-order loop; '<' detects mis-ordering. */
					if ((current_ngram[k] < previous_ngram[k]) && (j>0)) {
						quit(-1,"Error : n-grams are not correctly ordered.\n");
					}
				}
			}
		}

		if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
			quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
				arpa_lm->n);
		}

		if (pos_of_novelty != arpa_lm->n-1) {

			for (k=pos_of_novelty;k<=arpa_lm->n-2;k++) {
				if (k == 0) {
					pos_in_list[0] = current_ngram[0];
				}else {
					pos_in_list[k] =
						MAX(get_full_index(arpa_lm->ind[k-1][pos_in_list[k-1]],
						arpa_lm->ptr_table[k-1],
						arpa_lm->ptr_table_size[k-1],
						pos_in_list[k-1]),pos_in_list[k]);
					while (arpa_lm->word_id[k][pos_in_list[k]] <
						current_ngram[k]) {
						pos_in_list[k]++;
					}
				}
			}
			for (k = previd + 1; k <= pos_in_list[arpa_lm->n-2]; k++) {
				arpa_lm->ind[arpa_lm->n-2][k] =
					new_index(j-1,
					arpa_lm->ptr_table[arpa_lm->n-2],
					&(arpa_lm->ptr_table_size[arpa_lm->n-2]),
					k);
			}
			previd = pos_in_list[arpa_lm->n-2];
		}

		if (j>arpa_lm->table_sizes[arpa_lm->n-1]) {
			quit(-1,"Error - Header information in ARPA format language model is incorrect.\nMore than %d %d-grams needed to be stored.\n",arpa_lm->table_sizes[arpa_lm->n-1],arpa_lm->n-1);
		}
	}



	/* Tidy up */


	free(previous_ngram);
	free(current_ngram);
	free(in_line);
	free(input_line);
	DeleteArray(words);

}
Exemplo n.º 7
0
void main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  int current_cc_id;
  FILE *context_cues_fp;
  int n;
  int generate_size = 10000;
  int random_seed;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
   fprintf(stderr,"evallm : Evaluate a language model.\n");
   fprintf(stderr,"Usage : evallm [ -binary .binlm | \n");
   fprintf(stderr,"                 -arpa .arpa [ -context .ccs ] ]\n");
   exit(1);
  }

  lm_filename_arpa = salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,"")) {
    arpa_lm = 1;
  }
  else {
    arpa_lm = 0;
  }

  lm_filename_binary = salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,"")) {
    binary_lm = 1;
  }
  else {
    binary_lm = 0;
  }

  if (arpa_lm && binary_lm) {
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  }
  
  if (!arpa_lm && !binary_lm) {
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");
  }

  ccs_filename = salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,"")) {
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");
  }

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }
  else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  if (!arpa_lm) {
    n=ng.n;
  }
  else {
    n=arpa_ng.n;
  }

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);
	if (strncmp(wlist_entry,"#",1)==0) {
	  fprintf(stderr,"\n\n===========================================================\n");
	  fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
	  fprintf(stderr,     ">>> %s <<<\n",wlist_entry);
	  fprintf(stderr,     "         '%s' will be included in the context cues list\n",current_cc);
	  fprintf(stderr,     "         (comments must start with '##')\n");
	  fprintf(stderr,"===========================================================\n\n");
	}
	
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0) {
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	}
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : ");
    gets(input_string);

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

  
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = salloc(pc_stringarg(&num_of_args,args,
					     "-backoff_from_list",""));
    
      text_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = salloc(pc_stringarg(&num_of_args,args,"-oovs",""));


      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {
      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }
	  else {
	    if (!strcmp(args[0],"validate")) {

	      if (num_of_args != n) {
		fprintf(stderr,"Error : must specify %d words of context.\n",
			n-1);
	      }
	      else {
	      
		/* Assume last n-1 parameters form context */
	      
		validate(&ng,
			 &arpa_ng,
			 &(args[num_of_args-n+1]),
			 backoff_from_unk_inc,
			 backoff_from_unk_exc,
			 backoff_from_ccs_inc,
			 backoff_from_ccs_exc,
			 arpa_lm,
			 fb_list_filename);
	      }
	    }
	    else {
	      if (!strcmp(args[0],"stats")) {
		if (arpa_lm) {
		  display_arpa_stats(&arpa_ng);
		}
		else {
		  display_stats(&ng);
		}
	      }
	      else {
		if (!strcmp(args[0],"quit")) {
		  told_to_quit=1;
		}
        else if (!strcmp(args[0],"generate")) {

        if(arpa_lm)
          generate_words(NULL,&arpa_ng,generate_size,random_seed,text_stream_filename);
        else
          generate_words(&ng,NULL,generate_size,random_seed,text_stream_filename);

      }
		else {
		  if (!strcmp(args[0],"help")) {

		    printf("The user may specify one of the following commands: \n");
		    printf("\n");
		    printf(" - perplexity\n");
		    printf("\n");
		    printf("Computes the perplexity of a given text. May optionally specify words\n");
		    printf("from which to force back-off.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("perplexity -text .text\n");
		    printf("         [ -probs .fprobs ]\n");
		    printf("         [ -oovs .oov_file ]\n");
		    printf("         [ -annotate .annotation_file ]         \n");
		    printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
		    printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
		    printf("         [ -backoff_from_list .fblist ]\n");
		    printf("         [ -include_unks ]\n");
		    printf("\n");
		    printf(" - validate\n");
		    printf("       \n");
		    printf("Calculate the sum of the probabilities of all the words in the\n");
		    printf("vocabulary given the context specified by the user.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
		    printf("           -backoff_from_list .fblist ]\n");
		    printf("         [ -forced_backoff_inc | -forced_back_off_exc ]      \n");
		    printf("           word1 word2 ... word_(n-1)\n");
		    printf("\n");
		    printf("Where n is the n in n-gram. \n");
		    printf("\n");
		    printf(" - help\n");
		    printf("\n");
		    printf("Displays this help message.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("help\n");
		    printf("\n");
		    printf(" - quit\n");
		    printf("\n");
		    printf("Exits the program.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("quit\n");	

		  } 
	      
		  else {
		    fprintf(stderr,"Unknown command : %s\nType \'help\'\n",
			    args[0]);
		  }
		}
	      }
	    }
	  }
	}
      }
    }    
  }

  fprintf(stderr,"evallm : Done.\n");

  exit(0);
  
}