/* utest_ReadWrite()
 *
 * Round-trip test of p7_bg_Write() and p7_bg_Read(): sample a random
 * frequency vector for a randomly chosen alphabet, write it to a named
 * tmpfile, wipe the frequencies in <bg>, read the file back, and require
 * that the recovered frequencies match the sampled ones. Any failure
 * aborts through esl_fatal().
 */
static void
utest_ReadWrite(ESL_RANDOMNESS *rng)
{
  char          msg[]       = "bg Read/Write unit test failed";
  char          tmpfile[32] = "esltmpXXXXXX";
  FILE         *fp          = NULL;
  ESL_ALPHABET *abc         = NULL;   /* random alphabet choice eslRNA..eslDICE */
  float        *fq          = NULL;
  P7_BG        *bg          = NULL;

  abc = esl_alphabet_Create(esl_rnd_Roll(rng, 5) + 1);
  if (abc == NULL) esl_fatal(msg);

  bg = p7_bg_Create(abc);
  if (bg == NULL) esl_fatal(msg);

  fq = malloc(sizeof(float) * abc->K);
  if (fq == NULL) esl_fatal(msg);

  /* Resample until every frequency is >= 0.001:
   * small p's will get rounded off and fail FCompare()
   */
  for (;;)
    {
      if (esl_dirichlet_FSampleUniform(rng, abc->K, fq) != eslOK) esl_fatal(msg);
      if (! (esl_vec_FMin(fq, abc->K) < 0.001)) break;
    }
  esl_vec_FCopy(fq, abc->K, bg->f);

  /* Write the sampled frequencies out... */
  if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg);
  if (p7_bg_Write(fp, bg)             != eslOK) esl_fatal(msg);
  fclose(fp);

  /* ...zero them in <bg>, so a successful compare must come from the read... */
  esl_vec_FSet(bg->f, bg->abc->K, 0.0);

  /* ...and read them back. */
  if (p7_bg_Read(tmpfile, bg, NULL)                   != eslOK) esl_fatal(msg);
  if (esl_vec_FCompare(fq, bg->f, bg->abc->K, 0.01)   != eslOK) esl_fatal(msg);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  free(fq);
  remove(tmpfile);
}
/* Function: p7_Seqmodel() * Synopsis: Make a profile HMM from a single sequence. * * Purpose: Make a profile HMM from a single sequence, for * probabilistic Smith/Waterman alignment, HMMER3-style. * * The query is digital sequence <dsq> of length <M> * residues in alphabet <abc>, named <name>. * * The scoring system is given by <Q>, <f>, <popen>, and * <pextend>. <Q> is a $K \times K$ matrix giving * conditional residue probabilities $P(a \mid b)}$; these * are typically obtained by reverse engineering a score * matrix like BLOSUM62. <f> is a vector of $K$ background * frequencies $p_a$. <popen> and <pextend> are the * probabilities assigned to gap-open ($t_{MI}$ and * $t_{MD}$) and gap-extend ($t_{II}$ and $t_{DD}$) * transitions. * * The <p7H_SINGLE> flag is set on the <hmm>. Model * configuration (<p7_profile_Config(), friends> detects * this flag. <B->Mk> entry transitions include a match * state occupancy term for profile HMMs, but for single * queries, that <occ[]> term is assumed 1.0 for all * positions. See commentary in modelconfig.c. * * Args: * * Returns: <eslOK> on success, and a newly allocated HMM is returned * in <ret_hmm>. * * Throws: <eslEMEM> on allocation error, and <*ret_hmm> is <NULL>. */ int p7_Seqmodel(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int M, char *name, ESL_DMATRIX *Q, float *f, double popen, double pextend, P7_HMM **ret_hmm) { int status; P7_HMM *hmm = NULL; char *logmsg = "[HMM created from a query sequence]"; int k; if ((hmm = p7_hmm_Create(M, abc)) == NULL) { status = eslEMEM; goto ERROR; } for (k = 0; k <= M; k++) { /* Use rows of P matrix as source of match emission vectors */ if (k > 0) esl_vec_D2F(Q->mx[(int) dsq[k]], abc->K, hmm->mat[k]); /* Set inserts to background for now. This will be improved. 
*/ esl_vec_FCopy(f, abc->K, hmm->ins[k]); hmm->t[k][p7H_MM] = 1.0 - 2 * popen; hmm->t[k][p7H_MI] = popen; hmm->t[k][p7H_MD] = popen; hmm->t[k][p7H_IM] = 1.0 - pextend; hmm->t[k][p7H_II] = pextend; hmm->t[k][p7H_DM] = 1.0 - pextend; hmm->t[k][p7H_DD] = pextend; } /* Deal w/ special stuff at node M, overwriting a little of what we * just did. */ hmm->t[M][p7H_MM] = 1.0 - popen; hmm->t[M][p7H_MD] = 0.; hmm->t[M][p7H_DM] = 1.0; hmm->t[M][p7H_DD] = 0.; /* Add mandatory annotation */ p7_hmm_SetName(hmm, name); p7_hmm_AppendComlog(hmm, 1, &logmsg); hmm->nseq = 1; p7_hmm_SetCtime(hmm); hmm->checksum = 0; hmm->flags |= p7H_SINGLE; *ret_hmm = hmm; return eslOK; ERROR: if (hmm != NULL) p7_hmm_Destroy(hmm); *ret_hmm = NULL; return status; }
/* Function:  p7_profile_Copy()
 * Synopsis:  Copy a profile.
 *
 * Purpose:   Copies profile <src> to profile <dst>, where <dst>
 *            has already been allocated to be of sufficient size,
 *            and has the same alphabet.
 *
 *            Scores (<tsc>, <rsc>, <xsc>), length configuration
 *            (<L>, <nj>, <pglocal>), annotation strings, statistical
 *            parameters, cutoffs, composition, and file offsets are
 *            all copied. Any <name>, <acc>, <desc> previously held by
 *            <dst> are freed and replaced by duplicates of <src>'s.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error; <eslEINVAL> if <dst> is too small
 *            to fit <src> or is for a different alphabet.
 *            On an allocation error, <dst> has been partially
 *            overwritten, but its <name>, <acc>, <desc> fields are
 *            each either a valid string or <NULL>, so <dst> can still
 *            be destroyed safely.
 */
int
p7_profile_Copy(const P7_PROFILE *src, P7_PROFILE *dst)
{
  int x,z;
  int status;

  if (src->M > dst->allocM)               ESL_EXCEPTION(eslEINVAL, "destination profile is too small to hold a copy of source profile");
  if (src->abc->type != dst->abc->type)   ESL_EXCEPTION(eslEINVAL, "destination profile has different alphabet than source");

  dst->M = src->M;

  /* Score tables */
  esl_vec_FCopy(src->tsc, (src->M+1)*p7P_NTRANS, dst->tsc);
  for (x = 0; x < src->abc->Kp;   x++) esl_vec_FCopy(src->rsc[x], (src->M+1)*p7P_NR, dst->rsc[x]);
  for (x = 0; x < p7P_NXSTATES;   x++) esl_vec_FCopy(src->xsc[x], p7P_NXTRANS,       dst->xsc[x]);

  dst->L       = src->L;
  dst->nj      = src->nj;
  dst->pglocal = src->pglocal;

  /* Release the old annotation strings and NULL the pointers right away.
   * If one of the esl_strdup() calls below fails, we return early; without
   * the reset, the fields not yet reassigned would still point at freed
   * memory and a later destroy of <dst> would free them a second time.
   */
  free(dst->name);  dst->name = NULL;
  free(dst->acc);   dst->acc  = NULL;
  free(dst->desc);  dst->desc = NULL;

  if ((status = esl_strdup(src->name, -1, &(dst->name))) != eslOK) return status;
  if ((status = esl_strdup(src->acc,  -1, &(dst->acc)))  != eslOK) return status;
  if ((status = esl_strdup(src->desc, -1, &(dst->desc))) != eslOK) return status;

  strcpy(dst->rf,        src->rf);         /* RF is optional: if it's not set, *rf=0, and strcpy still works fine */
  strcpy(dst->mm,        src->mm);         /* MM is also optional annotation */
  strcpy(dst->cs,        src->cs);         /* CS is also optional annotation */
  strcpy(dst->consensus, src->consensus);  /* consensus though is always present on a valid profile */

  for (z = 0; z < p7_NEVPARAM; z++) dst->evparam[z] = src->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) dst->cutoff[z]  = src->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) dst->compo[z]   = src->compo[z];

  for (x = 0; x < p7_NOFFSETS; ++x) dst->offs[x] = src->offs[x];
  dst->roff       = src->roff;
  dst->eoff       = src->eoff;

  dst->max_length = src->max_length;
  return eslOK;
}
/* Function:  cp9_Copy()
 * Synopsis:  Copy a CM plan 9 HMM.
 *
 * Purpose:   Copy the contents of cp9 hmm <src> into the already
 *            allocated cp9 hmm <dst>. <dst> must have the same
 *            number of nodes <M> as <src>.
 *
 *            No check is made that <src> is properly normalized.
 *            Integer bit scores are copied only when <src> has its
 *            <CPLAN9_HASBITS> flag raised; otherwise the score
 *            arrays of <dst> are left untouched.
 *
 *            The per-node EL lists (<el_from_idx>, <el_from_cmnd>)
 *            are newly allocated in <dst> for every node that has a
 *            positive <el_from_ct>.
 *
 * Returns:   <eslOK> on success;
 *            <eslEINVAL> if <src->M> and <dst->M> differ.
 *
 * Throws:    the status set by <ESL_ALLOC()> on allocation failure.
 */
int
cp9_Copy(const CP9_t *src, CP9_t *dst)
{
  int status;
  int k;
  int M;

  if (src->M != dst->M) return eslEINVAL;
  M = src->M;

  dst->abc = src->abc;

  /* Probability parameters, node by node */
  for (k = 0; k <= M; k++)
    {
      esl_vec_FCopy(src->t[k],   cp9_NTRANS,  dst->t[k]);
      esl_vec_FCopy(src->mat[k], src->abc->K, dst->mat[k]);
      esl_vec_FCopy(src->ins[k], src->abc->K, dst->ins[k]);
    }
  esl_vec_FCopy(src->begin, M+1, dst->begin);
  esl_vec_FCopy(src->end,   M+1, dst->end);

  /* Integer scores are only meaningful in a logoddsified model */
  if (src->flags & CPLAN9_HASBITS)
    {
      esl_vec_ICopy(src->bsc_mem, M+1, dst->bsc_mem);
      esl_vec_ICopy(src->esc_mem, M+1, dst->esc_mem);

      /* exploit linear-memory of these 2d arrays */
      esl_vec_ICopy(src->tsc_mem, cp9_NTRANS   * (M+1), dst->tsc_mem);
      esl_vec_ICopy(src->msc_mem, src->abc->Kp * (M+1), dst->msc_mem);
      esl_vec_ICopy(src->isc_mem, src->abc->Kp * (M+1), dst->isc_mem);
      esl_vec_ICopy(src->otsc,    cp9O_NTRANS  * (M+1), dst->otsc);
    }

  /* EL info */
  dst->el_self   = src->el_self;
  dst->el_selfsc = src->el_selfsc;
  esl_vec_ICopy(src->has_el,     M+1, dst->has_el);
  esl_vec_ICopy(src->el_from_ct, M+2, dst->el_from_ct);
  for (k = 0; k <= M+1; k++)
    {
      if (src->el_from_ct[k] <= 0) continue;

      ESL_ALLOC(dst->el_from_idx[k],  sizeof(int) * src->el_from_ct[k]);
      ESL_ALLOC(dst->el_from_cmnd[k], sizeof(int) * src->el_from_ct[k]);
      esl_vec_ICopy(src->el_from_idx[k],  src->el_from_ct[k], dst->el_from_idx[k]);
      esl_vec_ICopy(src->el_from_cmnd[k], src->el_from_ct[k], dst->el_from_cmnd[k]);
    }

  /* Null model parameters and flags */
  dst->null2_omega = src->null2_omega;
  dst->null3_omega = src->null3_omega;
  esl_vec_FCopy(src->null, src->abc->K, dst->null);
  dst->p1    = src->p1;
  dst->flags = src->flags;

  return eslOK;

 ERROR:
  return status;
}
/* Function:  p7_bg_SetFilter()
 * Synopsis:  Configure filter HMM with new model composition.
 * Incept:    SRE, Fri Dec  5 09:08:15 2008 [Janelia]
 *
 * Purpose:   Parameterize the two-state "filter HMM" held in <bg>.
 *            The filter HMM is an experimental step of the
 *            acceleration pipeline that tries to discard biased
 *            composition sequences early, before the expensive
 *            domain postprocessing. It does not affect the final
 *            score of a sequence that passes the whole pipeline.
 *
 *            It does not yet work as well as one would hope, and is
 *            an area of future work: the real goal is a better null
 *            model of protein sequences and their biases, built into
 *            the flanking (NCJ) states of the profile.
 *
 *            <compo> is the average residue composition of the model
 *            (from an HMM, a profile, or an optimized profile), and
 *            <M> is the model length in nodes.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 *
 * Xref:      J4/25: generalized to use composition vector, not
 *            specifically an HMM.
 *
 * Note:      Although this looks like a two-state HMM with a fixed
 *            L=400 length expectation in state 0, it is set up as a
 *            model conditional on the target length L: both states
 *            move to E with probability 1.0 here, and the P(L) term
 *            is added in p7_bg_FilterScore(). So the length
 *            distribution is not "all wrong"; don't panic.
 */
int
p7_bg_SetFilter(P7_BG *bg, int M, const float *compo)
{
  float len_null = 400.0;             /* mean length in state 0 of filter HMM (normal background) */
  float len_bias = (float) M / 8.0;   /* mean length in state 1 of filter HMM (biased segment)    */

  /* Transitions. The 1.0 transition to E in both states means the
   * length distribution is set externally.
   */
  bg->fhmm->t[0][0] = len_null / (len_null+1.0f);
  bg->fhmm->t[0][1] = 1.0f     / (len_null+1.0f);
  bg->fhmm->t[0][2] = 1.0f;

  bg->fhmm->t[1][0] = 1.0f     / (len_bias+1.0f);
  bg->fhmm->t[1][1] = len_bias / (len_bias+1.0f);
  bg->fhmm->t[1][2] = 1.0f;

  /* Emissions: state 0 is the normal iid model, state 1 is the
   * potentially biased model composition.
   */
  esl_vec_FCopy(bg->f, bg->abc->K, bg->fhmm->e[0]);
  esl_vec_FCopy(compo, bg->abc->K, bg->fhmm->e[1]);

  /* Start probabilities */
  bg->fhmm->pi[0] = 0.999;
  bg->fhmm->pi[1] = 0.001;

  esl_hmm_Configure(bg->fhmm, bg->f);
  return eslOK;
}
int esl_vec_FLogValidate(float *vec, int n, float tol, char *errbuf) { int status; float *expvec = NULL; if (errbuf) *errbuf = 0; if (n == 0) return eslOK; ESL_ALLOC(expvec, sizeof(float)*n); esl_vec_FCopy(vec, n, expvec); esl_vec_FExp(expvec, n); if ((status = esl_vec_FValidate(expvec, n, tol, errbuf)) != eslOK) goto ERROR; free(expvec); return eslOK; ERROR: if (expvec != NULL) free(expvec); return eslOK; }
static void utest_pvectors(void) { char *msg = "pvector unit test failed"; double p1[4] = { 0.25, 0.25, 0.25, 0.25 }; double p2[4]; double p3[4]; float p1f[4]; float p2f[4] = { 0.0, 0.5, 0.5, 0.0 }; float p3f[4]; int n = 4; double result; esl_vec_D2F(p1, n, p1f); esl_vec_F2D(p2f, n, p2); if (esl_vec_DValidate(p1, n, 1e-12, NULL) != eslOK) esl_fatal(msg); if (esl_vec_FValidate(p1f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p1, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p1f, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p2, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p2f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p2, p1, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FRelEntropy(p2f, p1f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p1, p2, n); if (result != eslINFINITY) esl_fatal(msg); result = esl_vec_FRelEntropy(p1f, p2f, n); if (result != eslINFINITY) esl_fatal(msg); esl_vec_DLog(p2, n); if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg); esl_vec_DExp(p2, n); if (p2[0] != 0.) esl_fatal(msg); esl_vec_FLog(p2f, n); if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); esl_vec_FExp(p2f, n); if (p2f[0] != 0.) 
esl_fatal(msg); esl_vec_DCopy(p2, n, p3); esl_vec_DScale(p3, n, 10.); esl_vec_DNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DLog(p3, n); result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DIncrement(p3, n, 2.0); esl_vec_DLogNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_FCopy(p2f, n, p3f); esl_vec_FScale(p3f, n, 10.); esl_vec_FNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FLog(p3f, n); result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FIncrement(p3f, n, 2.0); esl_vec_FLogNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); return; }
/**
 * int main(int argc, char **argv)
 * Main driver
 *
 * Takes two command line arguments, <input hmmfile> and <output hmmfile>.
 * For each HMM read from the input file, the transition vectors of nodes
 * 1..M-1 are summed, the match (first 3), insert (next 2), and delete
 * (last 2) parts of that sum are each renormalized, and the result is
 * written back into every one of those nodes. The modified HMM is
 * validated, saved to the output file in ASCII format, and one line of
 * summary statistics is printed to stdout.
 *
 * Every failure (bad command line, unreadable input, unwritable output,
 * invalid or unsaveable HMM) is reported through a message before the
 * program stops.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;      /* command line processing                   */
  ESL_ALPHABET    *abc        = NULL;
  char            *hmmfile    = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp        = NULL;
  FILE            *outhmmfp;               /* HMM output file handle                    */
  P7_HMM          *hmm        = NULL;
  P7_BG           *bg         = NULL;
  int              nhmm;
  double           x;
  float            KL;
  int              status;
  char             errbuf[eslERRBUFSIZE];
  float            average_internal_transitions[ p7H_NTRANSITIONS ];
  int              k;
  char             errmsg[eslERRBUFSIZE];

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE)
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2)
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL)
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL)
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);

  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Initializations: open the output HMM file for writing.
   * <status> is eslOK at this point, so returning it would exit with a
   * success code and no message; fail loudly instead.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL)
    p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Sum the transition vectors of nodes 1..M-1 */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for (k = 1; k < hmm->M; k++)
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);

      /* Renormalize each group of the summed vector separately */
      esl_vec_FNorm(average_internal_transitions,     3);   /* Match transitions  */
      esl_vec_FNorm(average_internal_transitions + 3, 2);   /* Insert transitions */
      esl_vec_FNorm(average_internal_transitions + 5, 2);   /* Delete transitions */

      /* Ok now set them. */
      for (k = 1; k < hmm->M; k++)
        esl_vec_FCopy(average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k]);

      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001)) != eslOK)
        p7_Fail("HMM %s failed validation after its transitions were averaged (status %d):\n%s\n", hmm->name, status, errmsg);

      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK)
        p7_Fail("HMM save failed (status %d) while writing to %s\n", status, outhmmfile);

      p7_MeanPositionRelativeEntropy(hmm, bg, &x);
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
             nhmm,
             hmm->name,
             hmm->acc == NULL ? "-" : hmm->acc,
             hmm->nseq,
             hmm->eff_nseq,
             hmm->M,
             p7_MeanMatchRelativeEntropy(hmm, bg),
             p7_MeanMatchInfo(hmm, bg),
             x,
             KL);

      /* p7_MeanForwardScore(hmm, bg)); */

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  if (outhmmfp != NULL) fclose(outhmmfp);
  esl_getopts_Destroy(go);
  exit(0);
}
/* process_workunit()
 *
 * This is the routine that actually does the work.
 *
 * A work unit consists of one HMM, <hmm>.
 * The result is the <scores> array, which contains an array of N scores;
 * caller provides this memory.
 * How those scores are generated is controlled by the application configuration in <cfg>.
 *
 * Args:    go      - command line options; selects profile mode, algorithm,
 *                    vector vs. reference implementation, and sequence length <-L>
 *          cfg     - application configuration: alphabet, background, RNG, and N
 *          errbuf  - receives "allocation failure" if the return status is <eslEMEM>
 *          hmm     - the model to score
 *          scores  - RETURN: <cfg->N> scores, (sc - nullsc) / eslCONST_LOG2
 *          alilens - RETURN: <cfg->N> alignment lengths; only written when <-a> is set
 *
 * Returns: <eslOK> on success; the status set by <ESL_ALLOC()> on
 *          allocation failure.
 */
static int
process_workunit(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens)
{
  int             L   = esl_opt_GetInteger(go, "-L");
  P7_PROFILE     *gm  = NULL;
  P7_OPROFILE    *om  = NULL;
  P7_REFMX       *rmx = NULL;
  P7_CHECKPTMX   *cx  = NULL;
  P7_FILTERMX    *fx  = NULL;
  P7_TRACE       *tr  = NULL;
  ESL_DSQ        *dsq = NULL;
  int             i;
  int             scounts[p7T_NSTATETYPES]; /* state usage counts from a trace */
  float           sc;
  float           nullsc;
  int             status;
  P7_HARDWARE    *hw;

  /* NOTE(review): <hw> is not released anywhere in this function. */
  if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure");

  /* Optionally set a custom background, determined by model composition;
   * an experimental hack.
   */
  if (esl_opt_GetBoolean(go, "--bgcomp"))
    {
      float *p = NULL;
      float  KL;

      p7_hmm_CompositionKLDist(hmm, cfg->bg, &KL, &p);
      esl_vec_FCopy(p, cfg->abc->K, cfg->bg->f);
      free(p);   /* the composition vector is ours to free once copied into <bg> */
    }

  /* Create and configure our generic profile, as requested */
  gm = p7_profile_Create(hmm->M, cfg->abc);
  if (esl_opt_GetBoolean(go, "--multi"))
    {
      if      (esl_opt_GetBoolean(go, "--dual"))    { p7_profile_Config      (gm, hmm, cfg->bg);    }
      else if (esl_opt_GetBoolean(go, "--local"))   { p7_profile_ConfigLocal (gm, hmm, cfg->bg, L); }
      else if (esl_opt_GetBoolean(go, "--glocal"))  { p7_profile_ConfigGlocal(gm, hmm, cfg->bg, L); }
    }
  else if (esl_opt_GetBoolean(go, "--uni"))
    {
      if      (esl_opt_GetBoolean(go, "--dual"))    { p7_profile_ConfigCustom   (gm, hmm, cfg->bg, L, 0.0, 0.5); }
      else if (esl_opt_GetBoolean(go, "--local"))   { p7_profile_ConfigUnilocal (gm, hmm, cfg->bg, L);           }
      else if (esl_opt_GetBoolean(go, "--glocal"))  { p7_profile_ConfigUniglocal(gm, hmm, cfg->bg, L);           }
    }

  p7_profile_SetLength(gm, L);
  p7_bg_SetLength(cfg->bg, L);

  if (esl_opt_GetBoolean(go, "--x-no-lengthmodel")) elide_length_model(gm, cfg->bg);

  /* Allocate DP matrix for <gm>. */
  rmx = p7_refmx_Create(gm->M, L);

  /* Create and configure the vectorized profile, if needed;
   * and allocate its DP matrix.
   * <om> is still NULL when it is created, so the SIMD flavor for the
   * new profile has to come from <hw>, not from <om> itself.
   */
  if (esl_opt_GetBoolean(go, "--vector"))
    {
      om = p7_oprofile_Create(gm->M, cfg->abc, hw->simd);
      p7_oprofile_Convert(gm, om);
      cx = p7_checkptmx_Create(gm->M, L, ESL_MBYTES(32), om->simd);
      fx = p7_filtermx_Create(gm->M, om->simd);
    }

  /* Remaining allocation */
  ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
  tr = p7_trace_Create();

  /* Collect scores from N random sequences of length L  */
  for (i = 0; i < cfg->N; i++)
    {
      esl_rsq_xfIID(cfg->r, cfg->bg->f, cfg->abc->K, L, dsq);
      sc = eslINFINITY;

      /* Vectorized implementations of Viterbi, MSV may overflow.
       * In this case, they'll leave sc=eslINFINITY.
       * Then we fail over to the nonvector "generic" implementation.
       * That's why this next block isn't an if/else.
       */
      if (esl_opt_GetBoolean(go, "--vector"))
        {
          if      (esl_opt_GetBoolean(go, "--vit")) p7_ViterbiFilter(dsq, L, om, fx, &sc);
          else if (esl_opt_GetBoolean(go, "--fwd")) p7_ForwardFilter(dsq, L, om, cx, &sc);
          else if (esl_opt_GetBoolean(go, "--msv")) p7_MSVFilter    (dsq, L, om, fx, &sc);
        }

      /* If we tried a vector calculation above but it overflowed,
       * or if we're to do --generic DP calculations, sc==eslINFINITY now;
       * hence the if condition here:
       */
      if (sc == eslINFINITY)
        {
          if      (esl_opt_GetBoolean(go, "--fwd"))  p7_ReferenceForward(dsq, L, gm, rmx,     &sc); /* any mode: dual,local,glocal; gm's config takes care of this */
          else if (esl_opt_GetBoolean(go, "--vit"))  p7_ReferenceViterbi(dsq, L, gm, rmx, tr, &sc); /* local-only mode. cmdline opts processing has already assured that --local set */
          else if (esl_opt_GetBoolean(go, "--msv"))  p7_Die("We used to be able to do a generic MSV algorithm - but no longer");
        }

      /* Optional: get Viterbi alignment length too. */
      if (esl_opt_GetBoolean(go, "-a"))  /* -a only works with Viterbi; getopts has checked this already; <tr> must be valid */
        {
          p7_trace_GetStateUseCounts(tr, scounts);

          /* there's various ways we could counts "alignment length".
           * Here we'll use the total length of model used, in nodes: M+D states.
           * score vs al would gives us relative entropy / model position.
           */
          /* alilens[i] = scounts[p7T_D] + scounts[p7T_I]; SRE: temporarily testing this instead */
          alilens[i] = scounts[p7T_ML] + scounts[p7T_DL] + scounts[p7T_IL] +
                       scounts[p7T_MG] + scounts[p7T_DG] + scounts[p7T_IG];
          p7_trace_Reuse(tr);
        }

      p7_bg_NullOne(cfg->bg, dsq, L, &nullsc);
      scores[i] = (sc - nullsc) / eslCONST_LOG2;

      if (cx) p7_checkptmx_Reuse(cx);
      if (fx) p7_filtermx_Reuse(fx);
      p7_refmx_Reuse(rmx);
    }
  status      = eslOK;
  /* deliberate flowthru */
 ERROR:
  if (dsq != NULL) free(dsq);
  p7_checkptmx_Destroy(cx);
  p7_filtermx_Destroy(fx);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_refmx_Destroy(rmx);
  p7_trace_Destroy(tr);
  if (status == eslEMEM) sprintf(errbuf, "allocation failure");
  return status;
}
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the
 *            <P7_BG> object <bg>.
 *
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            The file format, as parsed here: '#' starts a comment.
 *            The first token is the alphabet type. Each following
 *            line holds exactly two tokens, a single canonical
 *            residue letter and its nonnegative probability. Every
 *            one of the <K> canonical residues must appear exactly
 *            once, and the probabilities must sum to 1.0 (they are
 *            renormalized before being stored).
 *
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 *
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;
  int             n     = 0;      /* number of residue lines parsed so far */
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status =  esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);

  /* Parse into a scratch vector, so <bg> stays untouched on any error.
   * -1.0 marks "no probability seen yet for this residue".
   */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* First field: the residue letter */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if (toklen != 1 ||   ! esl_abc_CIsCanonical(bg->abc, *tok))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)
        ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* Second field: its probability */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok)) ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);

      /* A negative value is not a probability. Rejecting it here also
       * protects the -1.0 "unset" sentinel used for duplicate detection,
       * and stops a set of values that merely sums to 1.0 from passing.
       */
      if (fq[x] < 0.0f)
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a nonnegative probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* Nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K)
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK)
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);

  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  if (fq)  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Calculate the "null2" model for the envelope covered by
 *            the posterior probability matrix <pp> for model <gm>.
 *            The null2 odds emission probabilities
 *            $\frac{f'(x)}{f(x)}$ are returned in <null2>, which the
 *            caller provides with room for at least <alphabet->Kp>
 *            residues.
 *
 *            The expectation method is meant for envelopes in simple,
 *            well resolved regions: regions holding a single
 *            envelope, where no stochastic traceback clustering was
 *            needed.
 *
 *            The caller must have calculated <pp> for the envelope
 *            only, so that its rows are numbered <1..Ld> for an
 *            envelope <ienv..jenv> of length <Ld=jenv-ienv+1>.
 *
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> contains the null2 scores. Row 0
 *            of <pp> has been used as temp space, and is left holding
 *            what this routine accumulated there for the M,I,N,C,J
 *            states.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  int      M      = gm->M;
  int      Ld     = pp->L;
  float  **dp     = pp->dp;    /* presumably what the MMX()/IMX() macros index: keep this name */
  float   *xmx    = pp->xmx;   /* presumably what the XMX() macro indexes: keep this name      */
  int      K      = gm->abc->K;
  int      ncells = (M+1)*p7G_NSCELLS;   /* width of one main-state row of <pp> */
  float    neglogLd;
  float    xfactor;
  int      x;                  /* over symbols 0..K-1                       */
  int      i;                  /* over offset envelope dsq positions 1..Ld  */
  int      k;                  /* over model M states 1..M, I states 1..M-1 */

  /* Calculate expected # of times that each emitting state was used
   * in generating the Ld residues in this domain.
   * Row 0 of <pp> is used to hold these numbers: start from row 1,
   * then add in rows 2..Ld.
   */
  esl_vec_FCopy(dp[1],             ncells,      dp[0]);
  esl_vec_FCopy(xmx+p7G_NXCELLS,   p7G_NXCELLS, xmx);
  for (i = 2; i <= Ld; i++)
    {
      esl_vec_FAdd(dp[0], dp[i],               ncells);
      esl_vec_FAdd(xmx,   xmx+i*p7G_NXCELLS,   p7G_NXCELLS);
    }

  /* Convert those expected #'s to log frequencies; these we'll use as
   * the log posterior weights.
   */
  neglogLd = -log((float)Ld);
  esl_vec_FLog(dp[0], ncells);
  esl_vec_FLog(xmx,   p7G_NXCELLS);
  esl_vec_FIncrement(dp[0], ncells,      neglogLd);
  esl_vec_FIncrement(xmx,   p7G_NXCELLS, neglogLd);

  /* Calculate null2's log odds emission probabilities, by taking
   * posterior weighted sum over all emission vectors used in paths
   * explaining the domain.
   * This is dog-slow; a point for future optimization.
   */
  xfactor = XMX(0,p7G_N);
  xfactor = p7_FLogsum(xfactor, XMX(0,p7G_C));
  xfactor = p7_FLogsum(xfactor, XMX(0,p7G_J));

  esl_vec_FSet(null2, K, -eslINFINITY);
  for (x = 0; x < K; x++)
    {
      float acc = null2[x];

      for (k = 1; k < M; k++)
        {
          acc = p7_FLogsum(acc, MMX(0,k) + p7P_MSC(gm, k, x));
          acc = p7_FLogsum(acc, IMX(0,k) + p7P_ISC(gm, k, x));
        }
      /* the loop above leaves k at its exit value for the final match term */
      acc = p7_FLogsum(acc, MMX(0,M) + p7P_MSC(gm, k, x));
      acc = p7_FLogsum(acc, xfactor);

      null2[x] = acc;
    }

  esl_vec_FExp(null2, K);
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(gm->abc, null2); /* does not set gap, nonres, missing  */
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */

  return eslOK;
}