Пример #1
0
/*
 * check_prob
 *
 * Recursively verify that a_p holds a non-negative entry for every tuple
 * obtained by extending `tuple` with 1..n letters taken from `alpha`.
 * When add_x is TRUE the last letter of alpha (X) is not used to extend
 * tuples, and add_x_tuples() is called for every tuple that is present.
 *
 * Returns NULL when nothing is missing.  Otherwise returns a heap-allocated
 * string naming the first missing tuple; ownership of that string passes to
 * the caller.  All other scratch buffers are released on every path.
 */
static char *check_prob(
  BOOLEAN add_x,				/* add x-tuples if TRUE */
  double *a_p,					/* tuple->prob assoc. array */
  int n,					/* widest tuple */
  char *tuple,					/* call with "" */
  int tuplew,					/* tuple width; call with 0 */
  char *alpha 					/* alphabet */
) 
{
  int i;
  char *t = NULL, *x = NULL; 			/* tuple and x-tuple */
  char *missing = NULL;				/* missing tuple */
  int ti;					/* index of tuple */

  if (n==0) return(NULL);			/* everything is OK */

  /* room for the current tuple, one more letter and the terminator */
  Resize(t, tuplew+2, char);
  Resize(x, tuplew+2, char);
  
  for(i=0; alpha[i+add_x]; i++) {		/* ignore last letter (X) */
    /* append letter to tuple */
    strcpy(t, tuple); t[tuplew] = alpha[i]; t[tuplew+1] = '\0';
    /* check that tuple exists */
    ti = s2i(t);				/* index of tuple */
    if (a_p[ti] < 0) {
      /* hand the buffer itself to the caller; clear t so it is not freed */
      missing = t;
      t = NULL;
      break;
    }
    /* add the current tuple's probability to matching X-containing tuples */
    if (add_x) add_x_tuples(a_p, t, tuplew+1, ti, x, 0, 0);
    /* recur */
    missing = check_prob(add_x, a_p, n-1, t, tuplew+1, alpha);
    if (missing) break;				/* propagate deeper failure */
  } /* letter */

  /* single exit: release scratch space whether or not a tuple was missing */
  if (t) { myfree(t); }
  if (x) { myfree(x); }				/* allocated even if !add_x */

  return(missing);				/* NULL if all found */
} /* check_prob */
Пример #2
0
 // Returns true only when both check_dist(function, alpha, beta, result, pol)
 // and check_prob(function, p, result, pol) return true.  check_prob is not
 // evaluated when check_dist fails.
 inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
 {
   if (!check_dist(function, alpha, beta, result, pol))
   {
     return false; // distribution parameters rejected; skip the probability check
   }
   return check_prob(function, p, result, pol);
 } // bool check_dist_and_prob
Пример #3
0
/*
 * oe_11_main
 *
 * Command-line driver that loads two ARPA language models, combines them,
 * and writes the interpolated model.
 *
 * Arguments read from argv:
 *   -lm1 <file>        first input model            (required)
 *   -lm2 <file>        second input model           (required)
 *   -lm <file>         destination model            (required)
 *   -weight <file>     weights file                 (required)
 *   -forced_backoff <file>, -context <file>         (optional, default "")
 *   -backoff_from_unk_inc / _exc, -backoff_from_ccs_inc / _exc   (flags)
 *
 * With -help, or with no arguments at all, the help text is printed and the
 * process exits with status 1.  A missing required argument prints an error
 * plus the help text and also exits with status 1, instead of continuing
 * with an empty filename.
 *
 * Returns 0 after the interpolated model has been written.
 */
int oe_11_main(int argc,char** argv)
{
	arpa_lm_t arpa_lm,lm1,lm2;
	char header1[MAX_HEADER];
	char header2[MAX_HEADER];
	flag backoff_from_unk_inc,backoff_from_unk_exc,backoff_from_ccs_inc,backoff_from_ccs_exc;
	char *lmfile1,*lmfile2,*newlmfile, *wtfile;
	char *fb_list_filename,*ccs_filename;
	fb_info *fb1,*fb2,*fb;
	double w1,w2;
	
	if (pc_flagarg(&argc, argv,"-help") || argc == 1) {
		oe_05_help_message();
		exit(1);
	}
	
	/* Required arguments: an empty string means the option was not given. */
	lmfile1 = rr_salloc(pc_stringarg(&argc, argv,"-lm1",""));
	if (0 == strcmp(lmfile1, "")) {
		fprintf(stderr, "ERROR: Please specify a first input file with -lm1.\n");
		oe_05_help_message();
		exit(1);
	}
	lmfile2 = rr_salloc(pc_stringarg(&argc, argv,"-lm2",""));
	if (0 == strcmp(lmfile2, "")) {
		fprintf(stderr, "ERROR: Please specify a second input file with -lm2.\n");
		oe_05_help_message();
		exit(1);
	}
	newlmfile = rr_salloc(pc_stringarg(&argc, argv,"-lm",""));
	if (0 == strcmp(newlmfile, "")) {
		fprintf(stderr, "ERROR: Please specify a destination file with -lm.\n");
		oe_05_help_message();
		exit(1);
	}
	/* Optional: forced back-off list file. */
	fb_list_filename = rr_salloc(pc_stringarg(&argc, argv,"-forced_backoff",""));
	wtfile= rr_salloc(pc_stringarg(&argc, argv,"-weight",""));
	if (0 == strcmp(wtfile, "")) {
		fprintf(stderr, "ERROR: Please specify a weights file with -weight.\n");
		oe_05_help_message();
		exit(1);
	}
	/* Optional: context cue file. */
	ccs_filename= rr_salloc(pc_stringarg(&argc, argv,"-context",""));

	backoff_from_unk_inc = pc_flagarg(&argc,argv,"-backoff_from_unk_inc");
	backoff_from_ccs_inc = pc_flagarg(&argc,argv,"-backoff_from_ccs_inc");
	backoff_from_unk_exc = pc_flagarg(&argc,argv,"-backoff_from_unk_exc");
	backoff_from_ccs_exc = pc_flagarg(&argc,argv,"-backoff_from_ccs_exc");
  
	/* Load both input models together with their headers. */
	robust_load_arpa_lm(&lm1,lmfile1,header1,MAX_HEADER);
	robust_load_arpa_lm(&lm2,lmfile2,header2,MAX_HEADER);
	
	load_weights(&w1,&w2,wtfile);

	printf ("\ncombine lms\n");
	combine_lm(&arpa_lm,&lm1,&lm2);

	printf ("\nloading context cues.\n");
	load_context_cue(&arpa_lm,ccs_filename);
	load_context_cue(&lm1,ccs_filename);
	load_context_cue(&lm2,ccs_filename);

	/* One forced back-off list per model, all built from the same options. */
	fb=gen_fb_list(arpa_lm.vocab_ht,
		arpa_lm.vocab_size,
		arpa_lm.vocab,
		arpa_lm.context_cue,
		backoff_from_unk_inc,
		backoff_from_unk_exc,
		backoff_from_ccs_inc,
		backoff_from_ccs_exc,
		fb_list_filename);

	fb1=gen_fb_list(lm1.vocab_ht,
		lm1.vocab_size,
		lm1.vocab,
		lm1.context_cue,
		backoff_from_unk_inc,
		backoff_from_unk_exc,
		backoff_from_ccs_inc,
		backoff_from_ccs_exc,
		fb_list_filename);

	fb2=gen_fb_list(lm2.vocab_ht,
		lm2.vocab_size,
		lm2.vocab,
		lm2.context_cue,
		backoff_from_unk_inc,
		backoff_from_unk_exc,
		backoff_from_ccs_inc,
		backoff_from_ccs_exc,
		fb_list_filename);
	
	printf ("\nrecaculate oov probabilities.\n");
	recalc_oov_prob(&arpa_lm,&lm1,&lm2);

	printf ("\ncheck probabilities\n");
	check_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2);

	printf ("\ncalculate interpolated probabilities\n");
	calc_interpolated_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2);

	printf ("\ncalculate backoff weights\n");
	calc_backoff_weight(&arpa_lm,fb);

	printf ("\nwrite interpolated lm\n");
	write_interpolated_lm(&arpa_lm,newlmfile,header1,header2,2);

	printf ("\nfinished\n");

	return 0;
}
Пример #4
0
/*
 * read_markov_model
 *
 * Build an array of conditional probabilities for a Markov background
 * model, either from letter frequencies or from a probability file.
 *
 * If `freq` is non-NULL it takes precedence: a newly allocated array of
 * `alen` entries is filled from freq (plus 1.0 for X when add_x is TRUE),
 * optionally averaged over reverse complements, and returned.
 * NOTE(review): *order is not written on this path - confirm that callers
 * do not rely on it when passing freq.
 *
 * Otherwise `pfile` is read line by line.  Lines starting with '#' are
 * skipped; every other line must contain exactly two fields, a tuple and
 * its probability in [0,1].  After reading, check_prob() verifies that no
 * required tuple is missing, *order is set to (widest tuple - 1), and the
 * result of get_cond_prob() is returned.
 *
 * Any input error (neither pfile nor freq given, unreadable file, malformed
 * line, probability outside [0,1], illegal character, model larger than
 * MAX_BACK_SIZE, missing tuple) prints a message to stderr and exits with
 * status 1.
 */
extern double *read_markov_model( 
  char *pfile, 					/* name of probability file */
  double *freq,					/* letter frequencies */
  char *alpha,					/* alphabet expected */
  BOOLEAN add_x,				/* add x-tuples if TRUE */
  BOOLEAN rc,					/* average reverse complements*/
  int *order					/* order of model read */
) 
{
  int i;					/* index into array */
  double a_p[MAX_BACK_SIZE];			/* tuple-prob array */
  double *a_cp=NULL; 				/* conditional prob. array */
  FILE *pfilep;					/* file pointer to file */
  char *line=NULL;				/* line buffer */
  char **fields=NULL;				/* fields of line */
  int nfields;					/* number of fields in line */
  int line_no=0;				/* line number */
  char *tuple;					/* the tuple */
  double p;					/* the probability */
  int maxw=0;					/* maximum tuple width */
  int alen=strlen(alpha);			/* length of alphabet */
  int ntuples;					/* number of tuples */

  /* check input */
  if (!pfile && !freq) {
    fprintf(stderr, "read_markov_model error: specify pfile or freq\n");
    exit(1);
  }

  /* add 'X' to the alphabet if requested; the caller's string is untouched */
  if (add_x) {
    char *tmp = NULL;
    Resize(tmp, alen+2, char);
    strcpy(tmp, alpha);
    tmp[alen] = 'X'; tmp[alen+1] = '\0';
    alpha = tmp;
    alen++; 
  }

  /* setup the mapping from ascii to integer and back */
  setup_index(alpha);

  /* use the frequencies if given */
  if (freq) {					/* frequencies given */
    Resize(a_cp, alen, double);
    for (i=0; i<alen-add_x; i++) RND(freq[i], 8, a_cp[i]);
    if (add_x) a_cp[i] = 1.0;			/* Pr(X) */
    /* average reverse complement probabilities together if requested */
    if (rc) average_rc(add_x, a_cp, 1, "", 0, alpha); 
    return(a_cp);
  }

  /* initialize probability array; -1 marks "no probability read" */
  for (i=0; i<MAX_BACK_SIZE; i++) a_p[i] = -1;

  /* read in the probabilities and save indexed by uppercase tuple name */
  if (!(pfilep = fopen(pfile, "r"))) {
    fprintf(stderr, "Unable to open file %s for reading.\n", pfile);
    exit(1);
  }

  /*fprintf(stderr, "Reading background probabilities...\n");*/
  while (1) {					/* read file */
    int len, index;
    line_no++;
    Getline(pfilep, line, len);			/* read next line */
    if (!line) break;				/* at EOF */
    if (line[0] == '#') continue;		/* skip comments */
    Split(line, fields, nfields);		/* get tuple and prob */
    if (nfields != 2) {
      fprintf(stderr, 
        "Formatting error in file %s line %d: %s\n", pfile, line_no, line);
      exit(1);
    }
    tuple = fields[0];
    p = atof(fields[1]);
    if (p<0 || p>1) {
      fprintf(stderr, "Illegal probability in file %s line %d: %s\n", 
        pfile, line_no, line);
      /* fatal like every other input error; never store a bad value */
      exit(1);
    }
    len = strlen(tuple);
    maxw = MAX(len, maxw);
    index = s2i(tuple);
    if (index < 0) {
      fprintf(stderr, "Illegal character in word `%s' in file %s line %d: %s\n",
        tuple, pfile, line_no, line);
      exit(1);
    }
    if (index >= MAX_BACK_SIZE) {
      /* report how many tuples a model of this width needs */
      for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i);
      fprintf(stderr, "Background model too large.  Use smaller model or increase \nMAX_BACK_SIZE to at least %d in background.h and recompile.\n", ntuples);
      exit(1);
    }
    a_p[index] = p;				/* store probability */
  }
  fclose(pfilep);

  /* check that all necessary probabilities are defined */
  tuple = check_prob(add_x, a_p, maxw, "", 0, alpha); 
  if (tuple) { 
    fprintf(stderr, "File %s gives no probability for %s.\n", pfile, 
      tuple);
    exit(1);
  }

  *order = maxw - 1;				/* order of Markov model */

  /* average reverse complement probabilities together if requested */
  if (rc) average_rc(add_x, a_p, maxw, "", 0, alpha); 

  /* get conditional probabilities for all tuples of width 1..maxw */
  for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i);
  a_cp = get_cond_prob(a_p, ntuples);

  /* print the probabilities */
#ifdef DEBUG
  print_prob(a_cp, maxw, "", 0, alpha);
#endif

  return(a_cp);					/* return conditionals */
} /* read_markov_model */