void main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  int current_cc_id;
  FILE *context_cues_fp;
  int n;
  int generate_size = 10000;
  int random_seed;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
   fprintf(stderr,"evallm : Evaluate a language model.\n");
   fprintf(stderr,"Usage : evallm [ -binary .binlm | \n");
   fprintf(stderr,"                 -arpa .arpa [ -context .ccs ] ]\n");
   exit(1);
  }

  lm_filename_arpa = salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,"")) {
    arpa_lm = 1;
  }
  else {
    arpa_lm = 0;
  }

  lm_filename_binary = salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,"")) {
    binary_lm = 1;
  }
  else {
    binary_lm = 0;
  }

  if (arpa_lm && binary_lm) {
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  }
  
  if (!arpa_lm && !binary_lm) {
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");
  }

  ccs_filename = salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,"")) {
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");
  }

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }
  else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  if (!arpa_lm) {
    n=ng.n;
  }
  else {
    n=arpa_ng.n;
  }

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);
	if (strncmp(wlist_entry,"#",1)==0) {
	  fprintf(stderr,"\n\n===========================================================\n");
	  fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
	  fprintf(stderr,     ">>> %s <<<\n",wlist_entry);
	  fprintf(stderr,     "         '%s' will be included in the context cues list\n",current_cc);
	  fprintf(stderr,     "         (comments must start with '##')\n");
	  fprintf(stderr,"===========================================================\n\n");
	}
	
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0) {
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	}
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : ");
    gets(input_string);

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

  
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = salloc(pc_stringarg(&num_of_args,args,
					     "-backoff_from_list",""));
    
      text_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = salloc(pc_stringarg(&num_of_args,args,"-oovs",""));


      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {
      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }
	  else {
	    if (!strcmp(args[0],"validate")) {

	      if (num_of_args != n) {
		fprintf(stderr,"Error : must specify %d words of context.\n",
			n-1);
	      }
	      else {
	      
		/* Assume last n-1 parameters form context */
	      
		validate(&ng,
			 &arpa_ng,
			 &(args[num_of_args-n+1]),
			 backoff_from_unk_inc,
			 backoff_from_unk_exc,
			 backoff_from_ccs_inc,
			 backoff_from_ccs_exc,
			 arpa_lm,
			 fb_list_filename);
	      }
	    }
	    else {
	      if (!strcmp(args[0],"stats")) {
		if (arpa_lm) {
		  display_arpa_stats(&arpa_ng);
		}
		else {
		  display_stats(&ng);
		}
	      }
	      else {
		if (!strcmp(args[0],"quit")) {
		  told_to_quit=1;
		}
        else if (!strcmp(args[0],"generate")) {

        if(arpa_lm)
          generate_words(NULL,&arpa_ng,generate_size,random_seed,text_stream_filename);
        else
          generate_words(&ng,NULL,generate_size,random_seed,text_stream_filename);

      }
		else {
		  if (!strcmp(args[0],"help")) {

		    printf("The user may specify one of the following commands: \n");
		    printf("\n");
		    printf(" - perplexity\n");
		    printf("\n");
		    printf("Computes the perplexity of a given text. May optionally specify words\n");
		    printf("from which to force back-off.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("perplexity -text .text\n");
		    printf("         [ -probs .fprobs ]\n");
		    printf("         [ -oovs .oov_file ]\n");
		    printf("         [ -annotate .annotation_file ]         \n");
		    printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
		    printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
		    printf("         [ -backoff_from_list .fblist ]\n");
		    printf("         [ -include_unks ]\n");
		    printf("\n");
		    printf(" - validate\n");
		    printf("       \n");
		    printf("Calculate the sum of the probabilities of all the words in the\n");
		    printf("vocabulary given the context specified by the user.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
		    printf("           -backoff_from_list .fblist ]\n");
		    printf("         [ -forced_backoff_inc | -forced_back_off_exc ]      \n");
		    printf("           word1 word2 ... word_(n-1)\n");
		    printf("\n");
		    printf("Where n is the n in n-gram. \n");
		    printf("\n");
		    printf(" - help\n");
		    printf("\n");
		    printf("Displays this help message.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("help\n");
		    printf("\n");
		    printf(" - quit\n");
		    printf("\n");
		    printf("Exits the program.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("quit\n");	

		  } 
	      
		  else {
		    fprintf(stderr,"Unknown command : %s\nType \'help\'\n",
			    args[0]);
		  }
		}
	      }
	    }
	  }
	}
      }
    }    
  }

  fprintf(stderr,"evallm : Done.\n");

  exit(0);
  
}
Beispiel #2
0
int oe_02_main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  
  if (!arpa_lm && !binary_lm)
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,""))
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  n=arpa_lm?
    arpa_ng.n:
    ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);

	warn_on_wrong_vocab_comments(wlist_entry);
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0)
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if(strlen(input_string) < sizeof(input_string)-1)
      input_string[strlen(input_string)-1] = '\0'; //chop new-line
    else 
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list",""));
    
      text_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs",""));

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }else
	    /* do perplexity sentence by sentence [20090612] (air) */
	    if (!strcmp(args[0],"uttperp")) {
	      FILE *uttfh,*tempfh;
	      char utt[4096]; /* live dangerously... */
	      char tmpfil[128];
	      if ((uttfh = fopen(text_stream_filename,"r")) == NULL) {
		printf("Error: can't open %s\n",text_stream_filename);
		exit(1);
	      }
            char *template = "uttperp_XXXXXX";// CHANGED HLW
            mkstemp(template);// CHANGED HLW
Beispiel #3
0
ng_t * init_ng(
	     int* argc,
	     char **argv,
	     int verbosity
	     )
{
  int i;
  ng_t* ng;
  
  ng=(ng_t*) rr_calloc(1,sizeof(ng_t));

  ng->disc_meth=NULL;
  /* -n */
  ng->n = pc_intarg(argc, argv,"-n",DEFAULT_N); 

  if (ng->n<1) 
    quit(-1,"Error: Value of n must be larger than zero.\n");

  /* -cutoffs */
  ng->cutoffs = (cutoff_t *) pc_shortarrayarg(argc, argv, "-cutoffs",ng->n-1,ng->n-1);

  if (ng->cutoffs == NULL) 
    ng->cutoffs = (cutoff_t *) rr_calloc((ng->n-1)+1,sizeof(cutoff_t)); /* +1 for the sake of the correction in writing in write_lms.c */

  for (i=0;i<=ng->n-3;i++) {
    if (ng->cutoffs[i+1] < ng->cutoffs[i]) {
      quit(-1,"Error - cutoffs for (n+1)-gram must be greater than or equal to those for \nn-gram. You have %d-gram cutoff = %d > %d-gram cutoff = %d.\n",i+2,ng->cutoffs[i],i+3,ng->cutoffs[i+1]);
    }
  }

  /* -min_unicount */
  ng->min_unicount = pc_intarg(argc, argv, "-min_unicount",0);

  /* -idngram */
  ng->id_gram_filename = salloc(pc_stringarg(argc, argv,"-idngram",""));

  if (!strcmp(ng->id_gram_filename,""))
    quit(-1,"Error: id ngram file not specified. Use the -idngram flag.\n");

  /* -arpa & -bin */
  ng->arpa_filename = salloc(pc_stringarg(argc, argv,"-arpa",""));
  ng->bin_filename = salloc(pc_stringarg(argc, argv,"-binary",""));
  
  ng->write_arpa = strcmp("",ng->arpa_filename);
  ng->write_bin = strcmp("",ng->bin_filename);
  
  if (!(ng->write_arpa || ng->write_bin)) 
    quit(-1,"Error : must specify either an arpa, or a binary output file.\n");

  ng->count_table_size = DEFAULT_COUNT_TABLE_SIZE;

  /* -vocab */
  ng->vocab_filename = salloc(pc_stringarg(argc,argv,"-vocab",""));
 
  if (!strcmp("",ng->vocab_filename))
    quit(-1,"Error : vocabulary file not specified. Use the -vocab option.\n");  

  /* -context */
  ng->context_cues_filename = salloc(pc_stringarg(argc,argv,"-context",""));

  ng->context_set = strcmp("", ng->context_cues_filename);

  /* -vocab_type */
  ng->vocab_type = pc_intarg(argc,argv,"-vocab_type",1);
  
  /* -oov_fraction */
  ng->oov_fraction = pc_doublearg(argc, argv,"-oov_fraction",-1.0);


  if (ng->oov_fraction == -1.0)
    ng->oov_fraction=DEFAULT_OOV_FRACTION;
  else {
    if (ng->vocab_type != 2)
      pc_message(verbosity,1,"Warning : OOV fraction specified, but will not be used, since vocab type is not 2.\n");
  }

  if (ng->vocab_type == 0) 
    ng->first_id = 1;
  else
    ng->first_id = 0;

  /* Allow both "min_alpha" etc and "min_bo_weight" etc as valid
     syntax. The "bo_weight" form is preferred, but the "alpha" form is
     maintained as it was present in version 2.00 */

  ng->min_alpha = pc_doublearg(argc,argv,"-min_alpha",DEFAULT_MIN_ALPHA);
  ng->max_alpha = pc_doublearg(argc,argv,"-max_alpha",DEFAULT_MAX_ALPHA);
  ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_alphas",
				      DEFAULT_OUT_OF_RANGE_ALPHAS);

  ng->min_alpha = pc_doublearg(argc,argv,"-min_bo_weight",ng->min_alpha);
  ng->max_alpha = pc_doublearg(argc,argv,"-max_bo_weight",ng->max_alpha);
  ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_bo_weights",
				      ng->out_of_range_alphas);
  
  if (ng->min_alpha >= ng->max_alpha)
    quit(-1,"Error : Minimum of alpha range must be less than the maximum.\n");


  init_ng_disc_method(ng,
		      pc_flagarg(argc, argv,"-linear"),
		      pc_flagarg(argc,argv,"-absolute"),
		      pc_flagarg(argc,argv,"-witten_bell"),
		      pc_flagarg(argc,argv,"-good_turing"));		      
		      
  ng->disc_range = (unsigned short *) pc_shortarrayarg(argc, argv, "-disc_ranges",ng->n,ng->n);

  ng->disc_range_set = (ng->disc_range != NULL);

  if (ng->discounting_method == GOOD_TURING) {
    if (!ng->disc_range_set) {
      ng->disc_range = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng->n);
      ng->disc_range[0] = DEFAULT_DISC_RANGE_1;
      for (i=1;i<=ng->n-1;i++) 
	ng->disc_range[i] = DEFAULT_DISC_RANGE_REST;
    }
    ng->fof_size = (fof_sz_t *) rr_malloc(sizeof(fof_sz_t) * ng->n);
    for (i=0;i<=ng->n-1;i++) 
      ng->fof_size[i] = ng->disc_range[i]+1;

  }else {
    if (ng->disc_range_set) 
      pc_message(verbosity,2,"Warning : discount ranges specified will be ignored, since they only apply\nto Good Turing discounting.\n");
  }

  ng->four_byte_alphas = !(pc_flagarg(argc, argv, "-two_byte_alphas") || 
			   pc_flagarg(argc, argv, "-two_byte_bo_weights"));

  ng->four_byte_counts = pc_flagarg(argc, argv, "-four_byte_counts");
  if(ng->four_byte_counts){
      pc_message(verbosity,2,"Using Four byte counts.\n");
  }

  ng->zeroton_fraction = pc_doublearg(argc,argv,"-zeroton_fraction",1.0);

  /* Attempt to open all the files that we will need for input and
     output. It is better to do it here than to spend a few hours of
     CPU processing id-gram counts, only to find that the output path
     is invalid. */

  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);

  /* Vocab is read by Roni's function which does the file opening for
     us, so no need to do it here. Don't need to worry about time
     being lost if file doesn't exist, since vocab is first thing to
     be read anyway. */

  if (ng->context_set)
    ng->context_cues_fp = rr_iopen(ng->context_cues_filename);

  if (ng->write_arpa)
    ng->arpa_fp = rr_oopen(ng->arpa_filename);

  if (ng->write_bin) 
    ng->bin_fp = rr_oopen(ng->bin_filename);

  return ng;
}