/*
 * check_prob
 *
 * Recursively verify that the tuple->probability array holds a value for
 * every tuple of width 1..n that extends `tuple` with letters of `alpha`.
 * An entry is treated as undefined when it is negative.
 *
 * When add_x is TRUE the last letter of `alpha` is skipped by the loop
 * (the loop condition looks one position ahead), and add_x_tuples() is
 * called for every tuple that is found.
 *
 * Returns NULL when every required tuple has a probability; otherwise
 * returns the first missing tuple as a string in a buffer obtained via
 * Resize().  The caller owns that buffer.
 *
 * All scratch buffers are released on every path; only the buffer that is
 * handed back to the caller is kept alive.
 */
static char *check_prob(
  BOOLEAN add_x,                /* add x-tuples if TRUE */
  double *a_p,                  /* tuple->prob assoc. array */
  int n,                        /* widest tuple */
  char *tuple,                  /* call with "" */
  int tuplew,                   /* tuple width; call with 0 */
  char *alpha                   /* alphabet */
)
{
  int i;
  char *t = NULL, *x = NULL;    /* tuple and x-tuple */
  char *missing = NULL;         /* missing tuple (result) */
  int ti;                       /* index of tuple */

  if (n==0) return(NULL);       /* everything is OK */

  Resize(t, tuplew+2, char);
  Resize(x, tuplew+2, char);

  for (i=0; alpha[i+add_x]; i++) {      /* ignore last letter (X) */
    /* append letter to tuple */
    strcpy(t, tuple);
    t[tuplew] = alpha[i];
    t[tuplew+1] = '\0';

    /* check that tuple exists */
    ti = s2i(t);                        /* index of tuple */
    if (a_p[ti] < 0) {
      missing = t;                      /* hand this buffer to the caller */
      break;
    }

    /* add the current tuple's probability to matching X-containing tuples */
    if (add_x) add_x_tuples(a_p, t, tuplew+1, ti, x, 0, 0);

    /* recur; the callee copies t, so t may be freed after it returns */
    missing = check_prob(add_x, a_p, n-1, t, tuplew+1, alpha);
    if (missing) break;
  } /* letter */

  /* free t unless it is the string being returned */
  if (missing != t) {
    myfree(t);
  }
  /* x is always allocated above, so always release it */
  myfree(x);

  return(missing);                      /* NULL if all found */
} /* check_prob */
// Combined validation: succeeds only if check_dist(function, alpha, beta,
// result, pol) and then check_prob(function, p, result, pol) both succeed.
// check_prob is not evaluated when check_dist fails.
// NOTE(review): RealType and Policy are not declared in the visible text;
// presumably template parameters of an enclosing declaration - confirm.
inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
{
   if (!check_dist(function, alpha, beta, result, pol))
   {
      return false;
   }
   if (!check_prob(function, p, result, pol))
   {
      return false;
   }
   return true;
} // bool check_dist_and_prob
int oe_11_main(int argc,char** argv) { arpa_lm_t arpa_lm,lm1,lm2; char header1[MAX_HEADER]; char header2[MAX_HEADER]; flag backoff_from_unk_inc,backoff_from_unk_exc,backoff_from_ccs_inc,backoff_from_ccs_exc; char *lmfile1,*lmfile2,*newlmfile, *wtfile; char *fb_list_filename,*ccs_filename; fb_info *fb1,*fb2,*fb; double w1,w2; if (pc_flagarg(&argc, argv,"-help") || argc == 1) { oe_05_help_message(); exit(1); } lmfile1 = rr_salloc(pc_stringarg(&argc, argv,"-lm1","")); if (0 == strcmp(lmfile1, "")) { fprintf(stderr, "ERROR: Please specify a first input file with -lm1.\n"); oe_05_help_message(); } lmfile2 = rr_salloc(pc_stringarg(&argc, argv,"-lm2","")); if (0 == strcmp(lmfile2, "")) { fprintf(stderr, "ERROR: Please specify a second input file with -lm2.\n"); oe_05_help_message(); } newlmfile = rr_salloc(pc_stringarg(&argc, argv,"-lm","")); if (0 == strcmp(newlmfile, "")) { fprintf(stderr, "ERROR: Please specify a destination file with -lm.\n"); oe_05_help_message(); } fb_list_filename = rr_salloc(pc_stringarg(&argc, argv,"-forced_backoff","")); wtfile= rr_salloc(pc_stringarg(&argc, argv,"-weight","")); if (0 == strcmp(wtfile, "")) { fprintf(stderr, "ERROR: Please specify a weights file with -weight.\n"); oe_05_help_message(); } ccs_filename= rr_salloc(pc_stringarg(&argc, argv,"-context","")); backoff_from_unk_inc = pc_flagarg(&argc,argv,"-backoff_from_unk_inc"); backoff_from_ccs_inc = pc_flagarg(&argc,argv,"-backoff_from_ccs_inc"); backoff_from_unk_exc = pc_flagarg(&argc,argv,"-backoff_from_unk_exc"); backoff_from_ccs_exc = pc_flagarg(&argc,argv,"-backoff_from_ccs_exc"); robust_load_arpa_lm(&lm1,lmfile1,header1,MAX_HEADER); robust_load_arpa_lm(&lm2,lmfile2,header2,MAX_HEADER); load_weights(&w1,&w2,wtfile); printf ("\ncombine lms\n"); combine_lm(&arpa_lm,&lm1,&lm2); printf ("\nloading context cues.\n"); load_context_cue(&arpa_lm,ccs_filename); load_context_cue(&lm1,ccs_filename); load_context_cue(&lm2,ccs_filename); fb=gen_fb_list(arpa_lm.vocab_ht, arpa_lm.vocab_size, 
arpa_lm.vocab, arpa_lm.context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); fb1=gen_fb_list(lm1.vocab_ht, lm1.vocab_size, lm1.vocab, lm1.context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); fb2=gen_fb_list(lm2.vocab_ht, lm2.vocab_size, lm2.vocab, lm2.context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); printf ("\nrecaculate oov probabilities.\n"); recalc_oov_prob(&arpa_lm,&lm1,&lm2); printf ("\ncheck probabilities\n"); check_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2); printf ("\ncalculate interpolated probabilities\n"); calc_interpolated_prob(&arpa_lm,&lm1,&lm2,fb1,fb2,w1,w2); printf ("\ncalculate backoff weights\n"); calc_backoff_weight(&arpa_lm,fb); printf ("\nwrite interpolated lm\n"); write_interpolated_lm(&arpa_lm,newlmfile,header1,header2,2); printf ("\nfinished\n"); return 0; }
/*
 * read_markov_model
 *
 * Build an array of conditional probabilities either from a table of
 * letter frequencies (freq) or from a probability file (pfile).  When freq
 * is non-NULL it takes precedence and pfile is not read.
 *
 * Frequency path: each freq[i] is rounded with RND(.., 8, ..) into a newly
 * allocated array of alen entries; if add_x is set the extra 'X' entry is
 * set to 1.0.  *order is not written on this path.
 *
 * File path: every non-comment line must hold exactly two fields, a tuple
 * and a probability in [0,1].  Lines starting with '#' are skipped.  After
 * reading, check_prob() confirms that every tuple up to the widest one
 * seen has a probability, *order is set to (widest width - 1), and the
 * result of get_cond_prob() is returned.
 *
 * If rc is set, average_rc() is applied before the result is produced.
 *
 * Any input error (neither pfile nor freq given, unreadable file, bad
 * format, probability outside [0,1], unknown letter, model too large,
 * missing tuple) prints a message to stderr and terminates the process
 * with exit(1).
 */
extern double *read_markov_model(
  char *pfile,                  /* name of probability file */
  double *freq,                 /* letter frequencies */
  char *alpha,                  /* alphabet expected */
  BOOLEAN add_x,                /* add x-tuples if TRUE */
  BOOLEAN rc,                   /* average reverse complements*/
  int *order                    /* order of model read */
)
{
  int i;                        /* index into array */
  double a_p[MAX_BACK_SIZE];    /* tuple-prob array */
  double *a_cp=NULL;            /* conditional prob. array */
  FILE *pfilep;                 /* file pointer to file */
  char *line=NULL;              /* line buffer */
  char **fields=NULL;           /* fields of line */
  int nfields;                  /* number of fields in line */
  int line_no=0;                /* line number */
  char *tuple;                  /* the tuple */
  double p;                     /* the probability */
  int maxw=0;                   /* maximum tuple width */
  int alen=strlen(alpha);       /* length of alphabet */
  int ntuples;                  /* number of tuples */

  /* check input */
  if (!pfile && !freq) {
    fprintf(stderr, "read_markov_model error: specify pfile or freq\n");
    exit(1);
  }

  /* add 'X' to the alphabet if requested */
  if (add_x) {
    char *tmp = NULL;
    Resize(tmp, alen+2, char);
    strcpy(tmp, alpha);
    tmp[alen] = 'X';
    tmp[alen+1] = '\0';
    alpha = tmp;
    alen++;
  }

  /* setup the mapping from ascii to integer and back */
  setup_index(alpha);

  /* use the frequencies if given */
  if (freq) {                   /* frequencies given */
    Resize(a_cp, alen, double);
    for (i=0; i<alen-add_x; i++) RND(freq[i], 8, a_cp[i]);
    if (add_x) a_cp[i] = 1.0;   /* Pr(X) */
    /* average reverse complement probabilities together if requested */
    if (rc) average_rc(add_x, a_cp, 1, "", 0, alpha);
    return(a_cp);
  }

  /* initialize probability array; a negative entry means "undefined" */
  for (i=0; i<MAX_BACK_SIZE; i++) a_p[i] = -1;

  /* read in the probabilities and save indexed by uppercase tuple name */
  if (!(pfilep = fopen(pfile, "r"))) {
    fprintf(stderr, "Unable to open file %s for reading.\n", pfile);
    exit(1);
  }

  /*fprintf(stderr, "Reading background probabilities...\n");*/
  while (1) {                   /* read file */
    int len, index;

    line_no++;
    Getline(pfilep, line, len); /* read next line */
    if (!line) break;           /* at EOF */
    if (line[0] == '#') continue;       /* skip comments */

    Split(line, fields, nfields);       /* get tuple and prob */
    if (nfields != 2) {
      fprintf(stderr, "Formatting error in file %s line %d: %s\n", pfile, line_no, line);
      exit(1);
    }
    tuple = fields[0];
    p = atof(fields[1]);

    /*
     * Reject values outside [0,1].  Written as a negated range test so that
     * NaN is rejected too.  This must be fatal: a negative value would
     * collide with the "undefined" sentinel in a_p, and a value above 1
     * would silently corrupt the model.
     */
    if (!(p >= 0 && p <= 1)) {
      fprintf(stderr, "Illegal probability in file %s line %d: %s\n", pfile, line_no, line);
      exit(1);
    }

    len = strlen(tuple);
    maxw = MAX(len, maxw);
    index = s2i(tuple);
    if (index < 0) {
      fprintf(stderr, "Illegal character in word `%s' in file %s line %d: %s\n", tuple, pfile, line_no, line);
      exit(1);
    }
    if (index >= MAX_BACK_SIZE) {
      /* report how many entries a model of this width needs */
      for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i);
      fprintf(stderr, "Background model too large. Use smaller model or increase \nMAX_BACK_SIZE to at least %d in background.h and recompile.\n", ntuples);
      exit(1);
    }
    a_p[index] = p;             /* store probability */
  }
  fclose(pfilep);

  /* check that all necessary probabilities are defined */
  tuple = check_prob(add_x, a_p, maxw, "", 0, alpha);
  if (tuple) {
    fprintf(stderr, "File %s gives no probability for %s.\n", pfile, tuple);
    exit(1);
  }
  *order = maxw - 1;            /* order of Markov model */

  /* average reverse complement probabilities together if requested */
  if (rc) average_rc(add_x, a_p, maxw, "", 0, alpha);

  /* get conditional probabilities */
  for (i=1, ntuples=0; i<=maxw; i++) ntuples+= pow(alen, i);
  a_cp = get_cond_prob(a_p, ntuples);

  /* print the probabilities */
#ifdef DEBUG
  print_prob(a_cp, maxw, "", 0, alpha);
#endif

  return(a_cp);                 /* return conditionals */
} /* read_markov_model */