/* Function: p7_sparsemask_Compare() * Synopsis: Compare two sparse masks for equality. * * Purpose: Compare <sm1> and <sm2>; return <eslOK> if they * are equal, <eslFAIL> if they are not. */ int p7_sparsemask_Compare_avx512(const P7_SPARSEMASK *sm1, const P7_SPARSEMASK *sm2) { #ifdef HAVE_AVX512 char msg[] = "P7_SPARSEMASK comparison failed"; int i; int s; if(sm2->simd != AVX512){ ESL_FAIL(eslFAIL, NULL, "Can't compare sparsemasks generated for different SIMD instruction sets"); } if ( (sm1->L != sm2->L) || (sm1->M != sm2->M) || (sm1->S_AVX_512 != sm2->S_AVX_512) || (sm1->nrow_AVX_512 != sm2->nrow_AVX_512) || (sm1->ncells_AVX_512 != sm2->ncells_AVX_512)) ESL_FAIL(eslFAIL, NULL, msg); for (s = 0; s <= sm1->S_AVX_512+1; s++) { if (sm1->seg_AVX_512[s].ia != sm2->seg_AVX_512[s].ia) ESL_FAIL(eslFAIL, NULL, msg); if (sm1->seg_AVX_512[s].ib != sm2->seg_AVX_512[s].ib) ESL_FAIL(eslFAIL, NULL, msg); } if ( esl_vec_ICompare(sm1->n_AVX_512, sm2->n_AVX_512, sm1->L+1) != eslOK) ESL_FAIL(eslFAIL, NULL, msg); for (i = 0; i <= sm1->L; i++) if ( esl_vec_ICompare(sm1->k_AVX_512[i], sm2->k_AVX_512[i], sm1->n_AVX_512[i]) != eslOK) ESL_FAIL(eslFAIL, NULL, msg); return eslOK; #endif #ifndef HAVE_AVX512 return eslENORESULT; #endif }
/* Function:  esl_rmx_ValidateQ()
 * Incept:    SRE, Sun Mar 11 10:30:50 2007 [Janelia]
 *
 * Purpose:   Validates an instantaneous rate matrix <Q> for a
 *            continuous-time Markov process, whose elements $q_{ij}$
 *            represent instantaneous transition rates $i \rightarrow
 *            j$.
 *
 *            Rows satisfy the condition that
 *            $q_{ii} = -\sum_{i \neq j} q_{ij}$, and also
 *            that $q_{ij} \geq 0$ for all $j \neq i$. As a
 *            consequence, a diagonal element may not be positive.
 *
 *            <tol> specifies the floating-point tolerance to which
 *            the row condition must hold: <fabs(sum + q_ii) <= tol>,
 *            where <sum> is the sum of the off-diagonal elements.
 *
 *            <errbuf> is an optional error message buffer. The caller
 *            may pass <NULL> or a pointer to a buffer of at least
 *            <eslERRBUFSIZE> characters.
 *
 * Args:      Q      - rate matrix to validate
 *            tol    - floating-point tolerance (0.00001, for example)
 *            errbuf - OPTIONAL: ptr to an error buffer of at least
 *                     <eslERRBUFSIZE> characters.
 *
 * Returns:   <eslOK> on successful validation.
 *            <eslFAIL> on failure, and if a non-<NULL> <errbuf> was
 *            provided by the caller, a message describing
 *            the reason for the failure is put there.
 *
 * Throws:    <eslEINVAL> if <Q> is not of type <eslGENERAL>, or is
 *            not square.
 */
int
esl_rmx_ValidateQ(ESL_DMATRIX *Q, double tol, char *errbuf)
{
  int    i,j;
  double qi;                    /* sum of off-diagonal elements in row i */

  if (Q->type != eslGENERAL) ESL_EXCEPTION(eslEINVAL, "Q must be type eslGENERAL to be validated");
  if (Q->n    != Q->m)       ESL_EXCEPTION(eslEINVAL, "a rate matrix Q must be square");

  for (i = 0; i < Q->n; i++)
    {
      qi = 0.;
      for (j = 0; j < Q->m; j++)
        {
          if (i != j)
            {
              if (Q->mx[i][j] < 0.) ESL_FAIL(eslFAIL, errbuf, "offdiag elem %d,%d < 0", i, j);
              qi += Q->mx[i][j];
            }
          else
            {
              /* The diagonal is minus the off-diagonal sum, so it must be <= 0.
               * The message now states the condition that was actually tested.
               */
              if (Q->mx[i][j] > 0.) ESL_FAIL(eslFAIL, errbuf, "diag elem %d,%d > 0", i, j);
            }
        }
      if (fabs(qi + Q->mx[i][i]) > tol) ESL_FAIL(eslFAIL, errbuf, "row %d does not sum to 0.0", i);
    }
  return eslOK;
}
/* Function: p7_sparsemask_Validate() * Synopsis: Validate a P7_SPARSEMASK sparse DP mask. * * Purpose: Validate the contents of sparse mask <sm>. * Return <eslOK> if it passes. Return <eslFAIL> * if it fails, and set <errbuf> to contain an * explanation, if caller provides a non-<NULL> * <errbuf>. * * Args: sm - sparse DP mask to validate * errbuf - [eslERRBUFSIZE] space for an error msg; or NULL * * Returns: <eslOK> on success; <errbuf>, if provided, is set * to an empty string "\0". * * <eslFAIL> on failure; <errbuf>, if provided, contains an * informative error message. * * Note: We don't check for all possible invalidity; the goal of a * Validate() is primarily to catch any future problems * similar to past problems that we've already run across * in debugging/testing. */ int p7_sparsemask_Validate_avx512(const P7_SPARSEMASK *sm, char *errbuf) { #ifdef HAVE_AVX512 int g, i; if (errbuf) errbuf[0] = '\0'; if ( sm->L < 1) ESL_FAIL(eslFAIL, errbuf, "L must be >=1"); if ( sm->M < 1) ESL_FAIL(eslFAIL, errbuf, "M must be >=1"); if ( sm->S_AVX_512 < 0) ESL_FAIL(eslFAIL, errbuf, "S must be >=0"); for (g = 1; g <= sm->S_AVX_512; g++) { if (sm->seg_AVX_512[g-1].ib >= sm->seg_AVX_512[g].ia) ESL_FAIL(eslFAIL, errbuf, "seg %d overlaps with previous one", g); // Note boundary condition, seg[0].ib=-1 if (sm->seg_AVX_512[g].ia > sm->seg_AVX_512[g].ib) ESL_FAIL(eslFAIL, errbuf, "ia..ib are not in order for seg %d", g); if (sm->seg_AVX_512[g].ia < 1 || sm->seg_AVX_512[g].ia > sm->L) ESL_FAIL(eslFAIL, errbuf, "ia[%d] is invalid", g); if (sm->seg_AVX_512[g].ib < 1 || sm->seg_AVX_512[g].ib > sm->L) ESL_FAIL(eslFAIL, errbuf, "ib[%d] is invalid", g); for (i = sm->seg_AVX_512[g-1].ib+1; i < sm->seg_AVX_512[g].ia; i++) // Note boundary condition. 
Sentinel seg[0].ib == -1, so (i = seg[0]+1) means 0 if (sm->n_AVX_512[i] != 0) ESL_FAIL(eslFAIL, errbuf, "n[i] != 0 for i unmarked, not in sparse segment"); for (i = sm->seg_AVX_512[g].ia; i <= sm->seg_AVX_512[g].ib; i++) if (sm->n_AVX_512[i] == 0) ESL_FAIL(eslFAIL, errbuf, "n[i] == 0 for i supposedly marked in sparse seg"); } for (i = sm->seg_AVX_512[sm->S_AVX_512].ib+1; i <= sm->L; i++) if (sm->n_AVX_512[i] != 0) ESL_FAIL(eslFAIL, errbuf, "n[i] != 0 for i unmarked, not in sparse segment"); return eslOK; #endif #ifndef HAVE_AVX512 return eslENORESULT; #endif }
int p7_masstrace_Compare(const P7_MASSTRACE *mte, const P7_MASSTRACE *mta, float tol) { char msg[] = "masstrace object comparison failed"; int i,k; if (mte->L != mta->L) ESL_FAIL(eslFAIL, NULL, msg); if (mte->M != mta->M) ESL_FAIL(eslFAIL, NULL, msg); if (mte->i0 != mta->i0) ESL_FAIL(eslFAIL, NULL, msg); if (mte->k0 != mta->k0) ESL_FAIL(eslFAIL, NULL, msg); if (mte->st0 != mta->st0) ESL_FAIL(eslFAIL, NULL, msg); if (mte->imass && mta->imass) { for (i = 1; i <= mte->L; i++) { if (mte->imass[i] == 0.0 && mta->imass[i] > 0.0) ESL_FAIL(eslFAIL, NULL, msg); if (esl_FCompareAbs(mte->imass[i], mta->imass[i], tol) != eslOK) ESL_FAIL(eslFAIL, NULL, msg); } } for (k = 1; k <= mte->M; k++) { if (mte->kmass[k] == 0.0 && mta->kmass[k] > 0.0) ESL_FAIL(eslFAIL, NULL, msg); if (esl_FCompareAbs(mte->kmass[k], mta->kmass[k], tol) != eslOK) ESL_FAIL(eslFAIL, NULL, msg); } return eslOK; }
/* init_master_cfg()
 * Called by either master version, mpi or serial.
 *
 * Already set:
 *    cfg->hmmfile  - command line arg
 * Sets:
 *    cfg->hfp      - open HMM stream
 *    cfg->ofp      - open output stream (-o file, or stdout)
 *    cfg->survfp   - open survival plot file      (--pfile, if given)
 *    cfg->efp      - open E vs. E plot file       (--efile, if given)
 *    cfg->ffp      - open filter power data file  (--ffile, if given)
 *    cfg->xfp      - open binary score file       (--xfile, if given)
 *    cfg->alfp     - open alignment length file   (--afile, if given)
 *
 * The optional file pointers are only assigned when their option is
 * set, so error handling relies on the caller having initialized them
 * to NULL.
 *
 * Errors in the MPI master here are considered to be "recoverable",
 * in the sense that we'll try to delay output of the error message
 * until we've cleanly shut down the worker processes. Therefore
 * errors return (code, errmsg) by the ESL_FAIL mech.
 */
static int
init_master_cfg(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf)
{
  char *path;                   /* file name taken from an option, or NULL */
  int   status;

  /* Input: the HMM file. */
  status = p7_hmmfile_OpenE(cfg->hmmfile, NULL, &(cfg->hfp), NULL);
  if (status != eslOK)
    {
      if (status == eslENOTFOUND) ESL_FAIL(eslFAIL, errbuf, "Failed to open HMM file %s for reading.\n",                      cfg->hmmfile);
      if (status == eslEFORMAT)   ESL_FAIL(eslFAIL, errbuf, "File %s does not appear to be in a recognized HMM format.\n",    cfg->hmmfile);
      ESL_FAIL(eslFAIL, errbuf, "Unexpected error %d in opening HMM file %s.\n", status, cfg->hmmfile);
    }

  /* Main output: defaults to stdout. */
  path = esl_opt_GetString(go, "-o");
  if (path == NULL)
    cfg->ofp = stdout;
  else if ((cfg->ofp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open -o output file %s\n", path);

  /* Optional auxiliary outputs: each is opened only if its option was set. */
  path = esl_opt_GetString(go, "--pfile");
  if (path != NULL && (cfg->survfp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --pfile output file %s\n", path);

  path = esl_opt_GetString(go, "--efile");
  if (path != NULL && (cfg->efp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --efile output file %s\n", path);

  path = esl_opt_GetString(go, "--ffile");
  if (path != NULL && (cfg->ffp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --ffile output file %s\n", path);

  path = esl_opt_GetString(go, "--xfile");
  if (path != NULL && (cfg->xfp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --xfile output file %s\n", path);

  path = esl_opt_GetString(go, "--afile");
  if (path != NULL && (cfg->alfp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --afile output file %s\n", path);

  return eslOK;
}
int p7_hit_Validate(const P7_HIT *hit, char *errbuf) { int d; int status; if (hit->name == NULL) ESL_FAIL(eslFAIL, errbuf, "name cannot be NULL"); if (isnan(hit->sortkey) || isnan(hit->score) || isnan(hit->pre_score) || isnan(hit->sum_score) || isnan(hit->lnP) || isnan(hit->pre_lnP) || isnan(hit->sum_lnP) || isnan(hit->nexpected)) ESL_FAIL(eslFAIL, errbuf, "NaN found"); if ( (hit->flags & (! ( p7_IS_REPORTED | p7_IS_INCLUDED | p7_IS_NEW | p7_IS_DROPPED | p7_IS_DUPLICATE))) != 0) ESL_FAIL(eslFAIL, errbuf, "unrecognized flag is up"); if (hit->ndom < 0) ESL_FAIL(eslFAIL, errbuf, "negative ndom"); if (hit->noverlaps < 0 || hit->noverlaps > hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad noverlaps"); if (hit->nreported < 0 || hit->nreported > hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad nreported"); if (hit->nincluded < 0 || hit->nincluded > hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad nincluded"); if (hit->best_domain < 0 || hit->best_domain >= hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad best_domain"); for (d = 0; d < hit->ndom; d++) if (( status = p7_domain_Validate(&(hit->dcl[d]), errbuf)) != eslOK) return status; return eslOK; }
/* output_filter_power()
 *
 * Used for testing whether the filters (MSV scores, Viterbi scores)
 * have the power they're supposed to have: for example, if the MSV
 * filter is set at a P-value threshold of 0.02, ~2% of sequences
 * should get through, regardless of things like model and target
 * sequence length.
 *
 * Writes one line to <cfg->ffp>, suitable for constructing histograms
 * over many HMMs for a particular choice of hmmsim'ed L and N targets:
 *   <hmm name>  <# of seqs passing threshold>  <fraction of seqs passing threshold>
 *
 * P-values come from the HMM's own calibration: a Gumbel survival
 * function for --vit and --msv, an exponential one for --fwd.
 *
 * Returns <eslOK> on success; <eslEINVAL> (with <errbuf> set) if none
 * of --vit, --msv, --fwd is on.
 *
 * SRE, Thu Apr  9 08:57:32 2009 [Janelia] xref J4/133
 */
static int
output_filter_power(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores)
{
  double pthresh = esl_opt_GetReal(go, "--pthresh"); /* P-value threshold set for the filter score    */
  double location;                                   /* mu (Gumbel) or tau (exponential) parameter    */
  double lambda;                                     /* lambda parameter of the score distribution    */
  double pval;                                       /* P-value of one score                          */
  int    use_gumbel;                                 /* TRUE: Gumbel tail; FALSE: exponential tail    */
  int    npass = 0;                                  /* number of scores that pass the P threshold    */
  int    idx;                                        /* counter over scores                           */

  /* Select the calibration parameters that match the score type. */
  if (esl_opt_GetBoolean(go, "--vit"))
    {
      location   = hmm->evparam[p7_VMU];
      lambda     = hmm->evparam[p7_VLAMBDA];
      use_gumbel = TRUE;
    }
  else if (esl_opt_GetBoolean(go, "--msv"))
    {
      location   = hmm->evparam[p7_MMU];
      lambda     = hmm->evparam[p7_MLAMBDA];
      use_gumbel = TRUE;
    }
  else if (esl_opt_GetBoolean(go, "--fwd"))
    {
      location   = hmm->evparam[p7_FTAU];
      lambda     = hmm->evparam[p7_FLAMBDA];
      use_gumbel = FALSE;
    }
  else ESL_FAIL(eslEINVAL, errbuf, "can only use --ffile with viterbi, msv, or fwd scores");

  /* Count the scores whose P-value is at or below the threshold. */
  idx = 0;
  while (idx < cfg->N)
    {
      if (use_gumbel) pval = esl_gumbel_surv(scores[idx], location, lambda);
      else            pval = esl_exp_surv   (scores[idx], location, lambda);

      if (pval <= pthresh) npass++;
      idx++;
    }

  fprintf(cfg->ffp, "%s\t%d\t%.4f\n", hmm->name, npass, (double) npass / (double) cfg->N);
  return eslOK;
}
/* map_rfpos_to_apos
 *
 * Given an MSA, determine the alignment position of each non-gap RF
 * (reference) position. The <abc> is only needed to classify RF
 * characters: a column is an RF position if its RF character is not
 * a gap, not a missing-data character, and not a nonresidue.
 *
 * On success, the caller receives three results (and owns the two
 * arrays):
 *   ret_i_am_rf[0..alen-1]   TRUE if alignment position apos is an RF
 *                            position, else FALSE
 *   ret_rf2a_map[0..rflen-1] apos (0..alen-1) of the non-gap RF
 *                            position rfpos+1
 *   ret_rflen                number of RF positions
 *
 * Returns eslOK on success; eslEINVAL if msa->rf is NULL; on
 * allocation failure, the failing status, with nothing allocated left
 * behind. <errbuf> is filled on any failure.
 */
static int
map_rfpos_to_apos(ESL_MSA *msa, ESL_ALPHABET *abc, char *errbuf, int64_t alen, int **ret_i_am_rf, int **ret_rf2a_map, int *ret_rflen)
{
  int  status;
  int *is_rf = NULL;            /* [0..alen-1] TRUE/FALSE per column    */
  int *map   = NULL;            /* [0..nrf-1]  column of each RF pos    */
  int  nrf   = 0;               /* number of RF columns                 */
  int  next  = 0;               /* next free slot in <map>              */
  int  col   = 0;               /* counter over alignment columns       */

  /* contract check */
  if (msa->rf == NULL)
    ESL_FAIL(eslEINVAL, errbuf, "Error, trying to map RF positions to alignment positions, but msa->rf is NULL.");

  /* Classify every column once, counting RF columns as we go.
   * esl_abc_CIsResidue() is deliberately not used: it would return
   * FALSE for 'x' with RNA and DNA.
   */
  ESL_ALLOC(is_rf, sizeof(int) * alen);
  for (col = 0; col < alen; col++)
    {
      if (esl_abc_CIsGap(abc, msa->rf[col])     ||
          esl_abc_CIsMissing(abc, msa->rf[col]) ||
          esl_abc_CIsNonresidue(abc, msa->rf[col]))
        {
          is_rf[col] = FALSE;
        }
      else
        {
          is_rf[col] = TRUE;
          nrf++;
        }
    }

  /* Build the RF position -> alignment position map. */
  ESL_ALLOC(map, sizeof(int) * nrf);
  for (col = 0; col < alen; col++)
    if (is_rf[col]) map[next++] = col;

  *ret_i_am_rf  = is_rf;
  *ret_rf2a_map = map;
  *ret_rflen    = nrf;
  return eslOK;

 ERROR:
  free(is_rf);
  free(map);
  ESL_FAIL(status, errbuf, "Error, out of memory while mapping RF positions to alignment positions.");
}
/* Function:  esl_rmx_ValidateP()
 * Incept:    SRE, Sun Mar 11 10:30:50 2007 [Janelia]
 *
 * Purpose:   Validates a conditional probability matrix <P>, whose
 *            elements $P_{ij}$ represent conditional probabilities
 *            $P(j \mid i)$; for example in a first-order Markov
 *            chain, or a continuous-time Markov transition process
 *            where <P> is for a particular $t$.
 *
 *            Each row must sum to one, and each element $P_{ij}$
 *            must be a probability, $0 \leq P_{ij} \leq 1$.
 *
 *            <tol> is the floating-point tolerance on the row sums:
 *            <fabs(sum-1.0) <= tol>.
 *
 *            <errbuf> is an optional error message buffer. The caller
 *            may pass <NULL> or a pointer to a buffer of at least
 *            <eslERRBUFSIZE> characters.
 *
 * Args:      P      - matrix to validate
 *            tol    - floating-point tolerance (0.00001, for example)
 *            errbuf - OPTIONAL: ptr to an error buffer of at least
 *                     <eslERRBUFSIZE> characters.
 *
 * Returns:   <eslOK> on successful validation.
 *            <eslFAIL> on failure, and if a non-<NULL> <errbuf> was
 *            provided by the caller, a message describing
 *            the reason for the failure is put there.
 *
 * Throws:    <eslEINVAL> if <P> is not of type <eslGENERAL>.
 */
int
esl_rmx_ValidateP(ESL_DMATRIX *P, double tol, char *errbuf)
{
  int    row, col;
  double rowsum;

  if (P->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "P must be type eslGENERAL to be validated");

  for (row = 0; row < P->n; row++)
    {
      /* The row total is checked before the individual elements. */
      rowsum = esl_vec_DSum(P->mx[row], P->m);
      if (fabs(rowsum - 1.0) > tol)
        ESL_FAIL(eslFAIL, errbuf, "row %d does not sum to 1.0", row);

      for (col = 0; col < P->m; col++)
        {
          if (P->mx[row][col] < 0.0 || P->mx[row][col] > 1.0)
            ESL_FAIL(eslFAIL, errbuf, "element %d,%d is not a probability (%f)", row, col, P->mx[row][col]);
        }
    }
  return eslOK;
}
/* Function: cp9_GetNCalcsPerResidue()
 * Date:     EPN, Thu Jan 17 06:12:37 2008
 *
 * Purpose:  Estimate the number of DP calculations per residue for a
 *           CP9 HMM, as (number of transitions) * (model length M),
 *           reported in millions. The transition count starts at
 *           NHMMSTATETYPES squared and grows by one for each of the
 *           local begin, local end, and EL flags that is set.
 *
 * Returns:  eslOK on success, eslEINCOMPAT on contract violation
 *           (<cp9> or <ret_cp9_ncalcs_per_res> is NULL).
 *           <ret_cp9_ncalcs_per_res> set as millions of DP
 *           calculations per residue for the CP9 HMM.
 */
int
cp9_GetNCalcsPerResidue(CP9_t *cp9, char *errbuf, float *ret_cp9_ncalcs_per_res)
{
  int   ntrans;                 /* transitions evaluated per model node */
  float mcalcs;                 /* millions of calcs per residue        */

  /* contract checks */
  if (cp9 == NULL)
    ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_GetNCalcsPerRes(), cp9 == NULL.");
  if (ret_cp9_ncalcs_per_res == NULL)
    ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_GetNCalcsPerRes(), ret_cp9_ncalcs_per_res == NULL.");

  /* 3*3 = 9 transitions in global mode; each local feature adds one */
  ntrans = NHMMSTATETYPES * NHMMSTATETYPES;
  if (cp9->flags & CPLAN9_LOCAL_BEGIN) ntrans += 1;
  if (cp9->flags & CPLAN9_LOCAL_END)   ntrans += 1;
  if (cp9->flags & CPLAN9_EL)          ntrans += 1;

  /* convert to millions of calcs per residue */
  mcalcs = (ntrans * cp9->M) / 1000000.;

  *ret_cp9_ncalcs_per_res = mcalcs;
  return eslOK;
}
/* init_master_cfg()
 * Called by masters, mpi or serial.
 *
 * Already set:
 *    cfg->hmmfile     - command line arg 1
 *    cfg->alifile     - command line arg 2
 *    cfg->postmsafile - option -O (default NULL)
 *    cfg->fmt         - format of alignment file
 * Sets:
 *    cfg->ofp         - open output file (-o), or stdout
 *    cfg->afp         - open alignment file
 *    cfg->abc         - digital alphabet
 *    cfg->hmmfp       - open HMM file
 *    cfg->postmsafp   - open MSA resave file, or NULL
 *
 * Also prints the output header and the tabular results header.
 *
 * Errors in the MPI master here are considered to be "recoverable",
 * in the sense that we'll try to delay output of the error message
 * until we've cleanly shut down the worker processes. Therefore
 * errors return (code, errmsg) by the ESL_FAIL mech.
 *
 * Returns eslOK on success. On failure, returns a non-OK code and
 * fills <errmsg>: eslFAIL when an output file cannot be opened,
 * otherwise the status reported by the failing Easel call.
 */
static int
init_master_cfg(const ESL_GETOPTS *go, struct cfg_s *cfg, char *errmsg)
{
  int status;

  if (esl_opt_GetString(go, "-o") != NULL)
    {
      if ((cfg->ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
        ESL_FAIL(eslFAIL, errmsg, "Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
    }
  else cfg->ofp = stdout;

  status = esl_msafile_Open(cfg->alifile, cfg->fmt, NULL, &(cfg->afp));
  if      (status == eslENOTFOUND) ESL_FAIL(status, errmsg, "Alignment file %s doesn't exist or is not readable\n", cfg->alifile);
  else if (status == eslEFORMAT)   ESL_FAIL(status, errmsg, "Couldn't determine format of alignment %s\n", cfg->alifile);
  else if (status != eslOK)        ESL_FAIL(status, errmsg, "Alignment file open failed with error %d\n", status);

  if      (esl_opt_GetBoolean(go, "--amino")) cfg->abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))   cfg->abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))   cfg->abc = esl_alphabet_Create(eslRNA);
  else
    {
      int type;
      status = esl_msafile_GuessAlphabet(cfg->afp, &type);
      if      (status == eslEAMBIGUOUS) ESL_FAIL(status, errmsg, "Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", cfg->alifile);
      else if (status == eslEFORMAT)    ESL_FAIL(status, errmsg, "Alignment file parse failed: %s\n", cfg->afp->errbuf);
      else if (status == eslENODATA)    ESL_FAIL(status, errmsg, "Alignment file %s is empty\n", cfg->alifile);
      else if (status != eslOK)         ESL_FAIL(status, errmsg, "Failed to read alignment file %s\n", cfg->alifile);
      cfg->abc = esl_alphabet_Create(type);
    }
  esl_msafile_SetDigital(cfg->afp, cfg->abc);

  /* <status> is eslOK whenever we get here, so it must not be used as
   * the failure code for the fopen() calls below: doing so would
   * report success to the caller. Use eslFAIL, as for the -o file.
   */
  if ((cfg->hmmfp = fopen(cfg->hmmfile, "w")) == NULL)
    ESL_FAIL(eslFAIL, errmsg, "Failed to open HMM file %s for writing", cfg->hmmfile);

  if (cfg->postmsafile != NULL)
    {
      if ((cfg->postmsafp = fopen(cfg->postmsafile, "w")) == NULL)
        ESL_FAIL(eslFAIL, errmsg, "Failed to open MSA resave file %s for writing", cfg->postmsafile);
    }
  else cfg->postmsafp = NULL;

  output_header(go, cfg);

  /* with msa == NULL, output_result() prints the tabular results header, if needed */
  output_result(cfg, errmsg, 0, NULL, NULL, NULL, 0.0);
  return eslOK;
}
/* Function:  p7_anchors_Validate()
 * Synopsis:  Validate an anchor set object.
 *
 * Purpose:   Validates anchor set <anch>, which holds anchors 1..D
 *            plus sentinels at <0> and <D+1>.
 *
 *            If the <M>,<L> dimensions are provided, the sentinels
 *            are validated against them. If <M> or <L> is unknown it
 *            can be passed as 0, and it is then recovered from the
 *            sentinels in <anch> -- which of course depends on the
 *            sentinels being valid, so is a weaker test.
 *
 * Args:      anch   - anchors to validate
 *            L      - sequence length if known; else 0
 *            M      - profile length if known; else 0
 *            errbuf - optional error message, allocated for eslERRBUFSIZE; or NULL
 *
 * Returns:   <eslOK> on success.
 *            <eslFAIL> on failure, and if <errbuf> was provided, it contains
 *            an informative error message.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_anchors_Validate(P7_ANCHORS *anch, int L, int M, char *errbuf)
{
  int D = anch->D;
  int d;

  /* Missing dimensions are read back from the sentinels:
   * a[D+1].i0 holds L+1, and a[0].k0 holds M+1.
   */
  if (L == 0) L = anch->a[D+1].i0 - 1;
  if (M == 0) M = anch->a[0].k0   - 1;

  /* i0 must be strictly increasing across 0..D+1, sentinels included. */
  for (d = 0; d <= D; d++)
    {
      if (anch->a[d].i0 >= anch->a[d+1].i0)
        ESL_FAIL(eslFAIL, errbuf, "i0 anchors not sorted");
    }

  /* Each real anchor must lie inside the L x M grid. */
  for (d = 1; d <= D; d++)
    {
      if (anch->a[d].i0 < 1 || anch->a[d].i0 > L) ESL_FAIL(eslFAIL, errbuf, "i0 %d not in range 1..L", d);
      if (anch->a[d].k0 < 1 || anch->a[d].k0 > M) ESL_FAIL(eslFAIL, errbuf, "k0 %d not in range 1..M", d);
    }

  /* Sentinel values. */
  if (anch->a[0].i0   != 0   || anch->a[0].k0   != M+1) ESL_FAIL(eslFAIL, errbuf, "sentinel 0 invalid");
  if (anch->a[D+1].i0 != L+1 || anch->a[D+1].k0 != 0)   ESL_FAIL(eslFAIL, errbuf, "sentinel D+1 invalid");

  return eslOK;
}
/* set_msa_name()
 * Make sure the alignment has a name; this name will
 * then be transferred to the model.
 *
 * We can only do this for a single alignment in a file. For multi-MSA
 * files, each MSA is required to have a name already.
 *
 * Priority is:
 *      1. Use -n <name> if set, overriding any name the alignment might already have.
 *      2. Use alignment's existing name, if non-NULL.
 *      3. Make a name, from alignment file name without path and without filename extension
 *         (e.g. "/usr/foo/globins.slx" gets named "globins")
 * If none of these succeeds, return <eslEINVAL>.
 *
 * If a multiple MSA database (e.g. Stockholm/Pfam), and we encounter
 * an MSA that doesn't already have a name, return <eslEINVAL> if nali > 1.
 * (We don't know we're in a multiple MSA database until we're on the second
 * alignment.)
 *
 * If we're in MPI mode, we assume we're in a multiple MSA database,
 * even on the first alignment.
 *
 * Because we can't tell whether we've got more than one
 * alignment 'til we're on the second one, these fatal errors
 * only happen after the first HMM has already been built.
 * Oh well.
 *
 * Returns <eslOK> on success; <eslEINVAL> with <errbuf> filled for the
 * naming failures above; or the status of a failing Easel call.
 */
static int
set_msa_name(struct cfg_s *cfg, char *errbuf, ESL_MSA *msa)
{
  char *name = NULL;
  int   status;

  if (cfg->do_mpi == FALSE && cfg->nali == 1) /* first (only?) HMM in file: */
    {
      if (cfg->hmmName != NULL)
        {
          if ((status = esl_msa_SetName(msa, cfg->hmmName)) != eslOK) return status;
        }
      else if (msa->name != NULL)
        {
          cfg->nnamed++;
        }
      else if (! cfg->afp->do_stdin)
        {
          if ((status = esl_FileTail(cfg->afp->fname, TRUE, &name)) != eslOK) return status; /* TRUE=nosuffix */

          /* <name> is our temporary copy; release it whether or not
           * esl_msa_SetName() succeeds, so a failure doesn't leak it.
           */
          status = esl_msa_SetName(msa, name);
          free(name);
          if (status != eslOK) return status;
        }
      else ESL_FAIL(eslEINVAL, errbuf, "Failed to set model name: msa has no name, no msa filename, and no -n");
    }
  else
    {
      if      (cfg->hmmName != NULL) ESL_FAIL(eslEINVAL, errbuf, "Oops. Wait. You can't use -n with an alignment database.");
      else if (msa->name    != NULL) cfg->nnamed++;
      else                           ESL_FAIL(eslEINVAL, errbuf, "Oops. Wait. I need name annotation on each alignment in a multi MSA file; failed on #%d", cfg->nali+1);

      /* special kind of failure: the *first* alignment didn't have a name, and we used the filename to
       * construct one; now that we see a second alignment, we realize this was a boo-boo
       */
      if (cfg->nnamed != cfg->nali)  ESL_FAIL(eslEINVAL, errbuf, "Oops. Wait. I need name annotation on each alignment in a multi MSA file; first MSA didn't have one");
    }
  return eslOK;
}
int p7_tophits_Validate(const P7_TOPHITS *th, char *errbuf) { int i; int idx; int status; if (th->is_sorted_by_sortkey || th->is_sorted_by_seqidx) { for (i = 0; i < th->N; i++) { idx = th->hit[i] - th->unsrt; /* i.e., by ptr arithmetic: #i in sorted list is #idx in unsorted list */ if (idx < 0 || idx >= th->N) ESL_FAIL(eslFAIL, errbuf, "sorted hit number %d points to bad address", i); /* TestSample() currently doesn't sort its sampled hit array, so we don't test for proper sortedness */ } } if (th->nreported < 0 || th->nreported > th->N) ESL_FAIL(eslFAIL, errbuf, "bad nreported field"); if (th->nincluded < 0 || th->nincluded > th->N) ESL_FAIL(eslFAIL, errbuf, "bad nreported field"); if (th->is_sorted_by_sortkey && th->is_sorted_by_seqidx) ESL_FAIL(eslFAIL, errbuf, "both sort type flags are up"); for (i = 0; i < th->N; i++) if (( status = p7_hit_Validate( &(th->unsrt[i]), errbuf)) != eslOK) return status; return eslOK; }
/* relative_weights():
 * Set the msa->wgt vector, using the relative weighting algorithm
 * selected in <bld->wgt_strategy>:
 *    p7_WGT_NONE   - every sequence gets weight 1.0
 *    p7_WGT_GIVEN  - weights already in <msa> are left untouched
 *    p7_WGT_PB     - esl_msaweight_PB()
 *    p7_WGT_GSC    - esl_msaweight_GSC()
 *    p7_WGT_BLOSUM - esl_msaweight_BLOSUM(), using <bld->wid>
 *
 * Returns eslOK on success. If the weighting routine fails, returns
 * its status and leaves a message in <bld->errbuf>. Throws
 * eslEINCONCEIVABLE on an unrecognized strategy.
 */
static int
relative_weights(P7_BUILDER *bld, ESL_MSA *msa)
{
  int status = eslOK;

  switch (bld->wgt_strategy)
    {
    case p7_WGT_NONE:   esl_vec_DSet(msa->wgt, msa->nseq, 1.);             break;
    case p7_WGT_GIVEN:  /* nothing to do: keep the weights as given */     break;
    case p7_WGT_PB:     status = esl_msaweight_PB(msa);                    break;
    case p7_WGT_GSC:    status = esl_msaweight_GSC(msa);                   break;
    case p7_WGT_BLOSUM: status = esl_msaweight_BLOSUM(msa, bld->wid);      break;
    default:            ESL_EXCEPTION(eslEINCONCEIVABLE, "no such weighting strategy");
    }

  if (status != eslOK)
    ESL_FAIL(status, bld->errbuf, "failed to set relative weights in alignment");
  return eslOK;
}
/* validate_msa:
 * SRE, Thu Dec  3 16:10:31 2009 [J5/119; bug #h70 fix]
 *
 * HMMER uses a convention for missing data characters: they
 * indicate that a sequence is a fragment.  (See
 * esl_msa_MarkFragments()).
 *
 * Because of the way these fragments will be handled in tracebacks,
 * we reject any alignment that uses missing data characters in any
 * other way.
 *
 * Each aligned sequence must therefore read, over columns 1..alen, as
 * (missing)* (non-missing)* (missing)*. We consume those three runs
 * in order and require that this uses up the whole sequence.
 *
 * Returns eslOK if every sequence passes; eslEINVAL, with a message
 * in <bld->errbuf>, at the first one that doesn't.
 *
 * This validation step costs negligible time.
 */
static int
validate_msa(P7_BUILDER *bld, ESL_MSA *msa)
{
  int     idx;
  int64_t apos;

  for (idx = 0; idx < msa->nseq; idx++)
    {
      /* The bound on <apos> is tested before ax[idx][apos] is read,
       * so the scan never looks at a column past alen.
       */
      apos = 1;
      while (apos <= msa->alen &&   esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;
      while (apos <= msa->alen && ! esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;
      while (apos <= msa->alen &&   esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;

      if (apos != msa->alen+1)
        ESL_FAIL(eslEINVAL, bld->errbuf, "msa %s; sequence %s\nhas missing data chars (~) other than at fragment edges", msa->name, msa->sqname[idx]);
    }
  return eslOK;
}
/* map_sub_msas
 *
 * msa1 and msa2 contain the same named sequences, msa1 contains a superset
 * of the columns in msa2. Determine which of the msa1 columns the msa2
 * columns correspond to.
 *
 * Columns are matched greedily, left to right: msa2 column apos2 is
 * assigned to the first not-yet-consumed msa1 column that is identical
 * to it in every sequence.
 *
 * On success, <*ret_msa1_to_msa2_mask> is a newly allocated,
 * NUL-terminated string of length msa1->alen that the caller must
 * free. Character apos1-1 is '1' if msa1 column apos1 was matched to
 * an msa2 column, else '0'.
 *
 * Returns eslOK on success. Returns eslEINVAL, with <errbuf> filled,
 * if either alignment is not digital, if msa1 is not longer than msa2,
 * if the sequence counts or names differ, or if some msa2 column can't
 * be found in msa1. On allocation failure, returns the failing status.
 * Nothing is left allocated on any failure.
 */
static int
map_sub_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, char **ret_msa1_to_msa2_mask)
{
  int   status;
  int   apos1, apos2;           /* counters over alignment position in msa1, msa2 respectively */
  int   i;
  char *mask = NULL;

  /* Contract checks. All of them run before anything is allocated, so
   * a failure here has nothing to clean up.
   */
  if (! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if (! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa2 (%s) not digitized.\n", esl_opt_GetString(go, "--submap"));
  if (msa1->alen <= msa2->alen)         ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() alignment length for msa1 (%" PRId64 ") <= length for msa2 (%" PRId64 ")\n", msa1->alen, msa2->alen);

  /* both alignments must have same 'named' sequences in same order */
  if (msa1->nseq != msa2->nseq)         ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 has %d sequences, msa2 has %d sequences\n", msa1->nseq, msa2->nseq);
  for (i = 0; i < msa1->nseq; i++)
    {
      if (strcmp(msa1->sqname[i], msa2->sqname[i]) != 0)
        ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 seq %d is named %s, msa2 seq %d is named %s\n", i, msa1->sqname[i], i, msa2->sqname[i]);
    }

  ESL_ALLOC(mask, sizeof(char) * (msa1->alen+1));
  for (apos1 = 0; apos1 < msa1->alen; apos1++) mask[apos1] = '0';
  mask[msa1->alen] = '\0';

  /* Greedy left-to-right matching. The loop stops as soon as either
   * alignment is exhausted, so neither index can run off the end of
   * its alignment.
   */
  apos1 = 1;
  apos2 = 1;
  while (apos1 <= msa1->alen && apos2 <= msa2->alen)
    {
      for (i = 0; i < msa1->nseq; i++)
        if (msa1->ax[i][apos1] != msa2->ax[i][apos2]) break;

      if (i == msa1->nseq)      /* found a match */
        {
          mask[apos1-1] = '1';
          apos2++;
        }
      apos1++;                  /* matched or not, this msa1 column is consumed */
    }

  /* Once every msa2 column is placed, any msa1 columns still left are
   * simply unmapped; they keep their '0' in the mask.
   */
  if (apos2 == msa2->alen+1) apos1 = msa1->alen+1;

  if ((apos1 != (msa1->alen+1)) || (apos2 != (msa2->alen+1)))
    {
      free(mask);
      ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas(), failure mapping alignments, end of loop apos1-1 = %d (msa1->alen: %" PRId64 ") and apos2-1 = %d (msa2->alen: %" PRId64 ")\n", apos1-1, msa1->alen, apos2-1, msa2->alen);
    }

  *ret_msa1_to_msa2_mask = mask;
  return eslOK;

 ERROR:
  free(mask);
  return status;
}
/* Function: DispatchSqBlockAlignment() * Date: EPN, Fri Dec 30 14:59:43 2011 * * Purpose: Given a CM and a block of sequences, align the * sequence(s) using the appropriate alignment function and * return relevant data for eventual output in <ret_dataA>. * This function simply calls DispatchSqAlignment() serially * for each sequence in the block, and creates an array * of the <ret_data> DispatchSqAlignment() returns. * * Currently <mode>, <cp9b_valid> and <pass_idx> values sent * to DispatchSqAlignment() are hard-coded to * TRMODE_UNKNOWN, FALSE, and PLI_PASS_5P_AND_3P_FORCE (if * cm->align_opts & CM_ALIGN_TRUNC) or PLI_PASS_STD_ANY (if * (! cm->align_opts & CM_ALIGN_TRUNC)). This is because * this function is only used by the alignment pipeline, in * which these values are correct. If this changes, we may * want caller to pass in an array of modes, cp9b_valids and * pass_idx values, one per sq. * * If (cm->flags & CM_ALIGN_XTAU) we'll potentially tighten * HMM bands until the required DP matrices are below out * limit (<mxsize>). cm->maxtau is the max allowed tau value * during this iterative band tightening, and cm->xtau is * the factor by which we multiply cm->tau at each iteration * during band tightening. * * Args: cm - the covariance model * errbuf - char buffer for reporting errors * sq_block - block of sequences to align * mxsize - max size in Mb of allowable DP mx * w - stopwatch for timing individual stages * w_tot - stopwatch for timing total time per seq * r - RNG, req'd if CM_ALIGN_SAMPLE, can be NULL otherwise * ret_dataA - RETURN: newly created array of CM_ALNDATA objects * * Returns: eslOK on success; * eslEINCOMPAT on contract violation, errbuf is filled; * eslEMEM if we run out of memory; * <ret_dataA> is alloc'ed and filled with sq_block->count CM_ALNDATA objects. 
*/ int DispatchSqBlockAlignment(CM_t *cm, char *errbuf, ESL_SQ_BLOCK *sq_block, float mxsize, ESL_STOPWATCH *w, ESL_STOPWATCH *w_tot, ESL_RANDOMNESS *r, CM_ALNDATA ***ret_dataA) { int status; /* easel status */ int j; /* counter over parsetrees */ CM_ALNDATA **dataA = NULL; /* CM_ALNDATA array we'll create and return */ ESL_SQ *sqp; /* ptr to a ESL_SQ */ int pass_idx; /* pass_idx passed to DispatchSqAlignment() */ char mode; /* mode passed to DispatchSqAlignment() */ int cp9b_valid; /* passed to DispatchSqAlignment() */ ESL_ALLOC(dataA, sizeof(CM_ALNDATA *) * ESL_MAX(1, sq_block->count)); // avoid 0 malloc for(j = 0; j < sq_block->count; j++) dataA[j] = NULL; /* DispatchSqAligment() needs a mode, pipeline pass index, and * knowledge of whether cm->cp9b are valid for sequence to align * (see note in 'Purpose' above). Currently the relevant values * for these are as follows: */ mode = TRMODE_UNKNOWN; pass_idx = (cm->align_opts & CM_ALIGN_TRUNC) ? PLI_PASS_5P_AND_3P_FORCE : PLI_PASS_STD_ANY; cp9b_valid = FALSE; /* main loop: for each sequence, call DispatchSqAlignment() to do the work */ for(j = 0; j < sq_block->count; j++) { sqp = sq_block->list + j; if((status = DispatchSqAlignment(cm, errbuf, sqp, sq_block->first_seqidx + j, mxsize, mode, pass_idx, cp9b_valid, w, w_tot, r, &(dataA[j]))) != eslOK) goto ERROR; } *ret_dataA = dataA; return eslOK; ERROR: if(dataA != NULL) { for(j = 0; j < sq_block->count; j++) { if(dataA[j] != NULL) cm_alndata_Destroy(dataA[j], FALSE); } free(dataA); } *ret_dataA = NULL; if(status == eslEMEM) ESL_FAIL(status, errbuf, "DispatchSqBlockAlignment(), out of memory"); else return status; /* errbuf was filled by DispatchSqAlignment() */ }
/* output_result()
 *
 * Emit the results for one alignment/model pair.
 *
 * When <msa> is NULL, only the two-line header of the tabular output is
 * written to cfg->ofp and eslOK is returned. The header and the data line
 * are kept in the same function so labels and columns stay in sync.
 *
 * Otherwise <hmm> is validated (tolerance 0.0001), saved in ASCII form to
 * cfg->hmmfp, and one tabular line (index, name, nseq, alen, M, eff_nseq,
 * relative entropy per position, description) is written to cfg->ofp. If
 * both cfg->postmsafp and <postmsa> are non-NULL, <postmsa> is also written
 * there in Stockholm format; the status of that write is not checked.
 *
 * Returns eslOK on success; the status from p7_hmm_Validate() if validation
 * fails; or the status from p7_hmmfile_WriteASCII() (with "HMM save failed"
 * in <errbuf>) if the model cannot be saved.
 */
static int
output_result(const struct cfg_s *cfg, char *errbuf, int msaidx, ESL_MSA *msa, P7_HMM *hmm, ESL_MSA *postmsa, double entropy)
{
  int         status;
  const char *aliname;  /* msa->name, or "" when the alignment is unnamed        */
  const char *alidesc;  /* msa->desc, or "" when the alignment has no description */

  /* Special case: header request. */
  if (msa == NULL)
    {
      fprintf(cfg->ofp, "#%4s %-20s %5s %5s %5s %8s %6s %s\n", " idx", "name", "nseq", "alen", "mlen", "eff_nseq", "re/pos", "description");
      fprintf(cfg->ofp, "#%4s %-20s %5s %5s %5s %8s %6s %s\n", "----", "--------------------", "-----", "-----", "-----", "--------", "------", "-----------");
      return eslOK;
    }

  status = p7_hmm_Validate(hmm, errbuf, 0.0001);
  if (status != eslOK) return status;

  status = p7_hmmfile_WriteASCII(cfg->hmmfp, -1, hmm);
  if (status != eslOK) ESL_FAIL(status, errbuf, "HMM save failed");

  aliname = (msa->name == NULL) ? "" : msa->name;
  alidesc = (msa->desc == NULL) ? "" : msa->desc;

  /* idx name nseq alen M eff_nseq re/pos description */
  fprintf(cfg->ofp, "%-5d %-20s %5d %5" PRId64 " %5d %8.2f %6.3f %s\n", msaidx, aliname, msa->nseq, msa->alen, hmm->M, hmm->eff_nseq, entropy, alidesc);

  /* Optional posterior-annotated alignment output. */
  if (cfg->postmsafp == NULL || postmsa == NULL) return eslOK;
  esl_msa_Write(cfg->postmsafp, postmsa, eslMSAFILE_STOCKHOLM);
  return eslOK;
}
/* Function: GrowCP9Matrix()
 *
 * Purpose:  Reallocate a CP9 dp matrix, if necessary, for a seq of
 *           length N (N+1 rows), or for 2 rows if we're scanning in
 *           memory efficient mode (in that case N == 1).
 *
 *           Note: unlike HMMER, M never changes, so we only have
 *           to worry about increasing the number of cells.
 *
 *           Returns individual ptrs to the matrix components
 *           as a convenience.
 *
 *           This function allocates the requested matrix regardless
 *           of its size.
 *
 *           If kmin and kmax are BOTH non-NULL, the matrix is laid out
 *           as a banded matrix: row i gets (kmax[i] - kmin[i] + 1)
 *           cells. In this case N must be the length of the sequence.
 *           If the caller wants a non-banded matrix of (N+1)*(M+1)
 *           cells, pass kmin = kmax = NULL.
 *
 * Args:     mx     - an already allocated matrix to grow.
 *           errbuf - buffer for an error message
 *           N      - seq length to allocate for; N+1 rows
 *           M      - size of model, contract enforces this must == mx->M
 *           kmin   - OPTIONAL: [0.1..i..N] minimum k for residue i
 *           kmax   - OPTIONAL: [0.1..i..N] maximum k for residue i
 *           mmx, imx, dmx, elmx, erow
 *                  - OPTIONAL RETURN: ptrs to the mx components as a
 *                    convenience; any of them may be NULL.
 *
 * Return:   eslOK on success;
 *           eslEINCOMPAT if the contract is violated (M mismatch, N < 0);
 *           the ESL_RALLOC() failure status if reallocation fails.
 *           errbuf is filled on any failure. mx is (re)allocated here.
 */
int
GrowCP9Matrix(CP9_MX *mx, char *errbuf, int N, int M, int *kmin, int *kmax, int ***mmx, int ***imx, int ***dmx, int ***elmx, int **erow)
{
  int   status;
  void *p;                 /* scratch pointer required by ESL_RALLOC() */
  int   i;
  int   ncells_needed = 0; /* cells needed per component for this N    */
  int   do_banded;         /* TRUE if both kmin and kmax were supplied */
  int   cur_ncells = 0;    /* running cell offset of the current row   */
  int   do_reallocate;     /* TRUE if current allocation is too small  */

  if(mx->M != M) ESL_FAIL(eslEINCOMPAT, errbuf, "GrowCP9Matrix(), mx->M: %d != M passed in: %d\n", mx->M, M);
  if(N < 0)      ESL_FAIL(eslEINCOMPAT, errbuf, "GrowCP9Matrix(), N: %d < 0\n", N);

  /* Banded mode requires BOTH band arrays. The previous test
   * (kmax == NULL) was inverted: it dereferenced a NULL kmax below when
   * only kmin was given, and never enabled banding when both were given.
   */
  do_banded = (kmin != NULL && kmax != NULL) ? TRUE : FALSE;

  if(do_banded) {
    for (i = 0; i <= N; i++) ncells_needed += (kmax[i] - kmin[i] + 1);
  }
  else ncells_needed = (N+1) * (M+1);

  do_reallocate = (ncells_needed <= mx->ncells_allocated) ? FALSE : TRUE;

  /* NOTE(review): the row-pointer arrays (mx->mmx etc.) are only resized
   * when the cell count grows. A banded call with a larger N but no more
   * cells than currently allocated would index rows 1..N without resizing
   * them - confirm callers cannot trigger this.
   */
  if(do_reallocate) { /* we need more space */
    ESL_RALLOC(mx->mmx,  p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->imx,  p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->dmx,  p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->elmx, p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->erow, p, sizeof(int)   * (N+1));
    ESL_RALLOC(mx->mmx_mem,  p, sizeof(int) * ncells_needed);
    ESL_RALLOC(mx->imx_mem,  p, sizeof(int) * ncells_needed);
    ESL_RALLOC(mx->dmx_mem,  p, sizeof(int) * ncells_needed);
    ESL_RALLOC(mx->elmx_mem, p, sizeof(int) * ncells_needed);
    mx->ncells_allocated = ncells_needed;

    /* update size */
    mx->size_Mb  = (float) sizeof(CP9_MX);
    mx->size_Mb += (float) (sizeof(int *) * (N+1) * 4);         /* mx->*mx ptrs */
    mx->size_Mb += (float) (sizeof(int) * (ncells_needed * 4)); /* mx->*mx_mem */
    mx->size_Mb += (float) (sizeof(int) * (N+1));               /* mx->erow */
    mx->size_Mb /= 1000000.;
  }

  if(do_banded || do_reallocate) { /* rearrange pointers */
    mx->mmx[0]  = mx->mmx_mem;
    mx->imx[0]  = mx->imx_mem;
    mx->dmx[0]  = mx->dmx_mem;
    mx->elmx[0] = mx->elmx_mem;

    if(do_banded) {
      /* row i starts where the cells of rows 0..i-1 end */
      cur_ncells = kmax[0] - kmin[0] + 1;
      for (i = 1; i <= N; i++) {
        mx->mmx[i]  = mx->mmx[0]  + cur_ncells;
        mx->imx[i]  = mx->imx[0]  + cur_ncells;
        mx->dmx[i]  = mx->dmx[0]  + cur_ncells;
        mx->elmx[i] = mx->elmx[0] + cur_ncells;
        cur_ncells += kmax[i] - kmin[i] + 1;
      }
    }
    else { /* non-banded, we only get here if we reallocated */
      for (i = 1; i <= N; i++) {
        mx->mmx[i]  = mx->mmx[0]  + (i*(M+1));
        mx->imx[i]  = mx->imx[0]  + (i*(M+1));
        mx->dmx[i]  = mx->dmx[0]  + (i*(M+1));
        mx->elmx[i] = mx->elmx[0] + (i*(M+1));
      }
    }
  }

  mx->rows = N;
  mx->kmin = kmin; /* could be NULL */
  mx->kmax = kmax; /* could be NULL */
  mx->ncells_valid = ncells_needed;

  if (mmx  != NULL) *mmx  = mx->mmx;
  if (imx  != NULL) *imx  = mx->imx;
  if (dmx  != NULL) *dmx  = mx->dmx;
  if (elmx != NULL) *elmx = mx->elmx;
  if (erow != NULL) *erow = mx->erow;
  return eslOK;

 ERROR:
  ESL_FAIL(status, errbuf, ("GrowCP9Matrix(), memory reallocation error."));
}
/* dump_infocontent_info
 *
 * Write a per-column information content table for an alignment to the
 * open stream <fp>.
 *
 * For each of the <alen> columns the table lists the 1-based alignment
 * position, the fraction of counts that are in the first abc->K (residue)
 * slots of abc_ct[apos] relative to those plus slot abc->K, and the
 * information content: the entropy of a uniform distribution over abc->K
 * symbols minus the entropy of the column's normalized residue counts.
 * If <i_am_rf> is non-NULL an extra leading field gives the 1-based
 * reference position for columns flagged in <i_am_rf>, or "-" otherwise.
 *
 * <use_weights>, <nali>, <nseq>, <msa_name> (may be NULL) and <alifile>
 * are only reported in the header.
 *
 * Returns eslOK on success. On allocation failure, returns eslEINVAL with
 * "out of memory" in <errbuf>.
 */
static int
dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int       status;
  int       col;                 /* 0-based alignment column                     */
  int       nrf     = 0;         /* number of reference columns seen so far      */
  double    uniform_ent;         /* entropy of the uniform background            */
  double   *uniform = NULL;      /* uniform background distribution (temporary)  */
  double   *freq    = NULL;      /* normalized residue frequencies of one column */
  double    nres;                /* summed residue counts of one column          */
  double    frac_nongap;
  double    info;
  const int have_rf = (i_am_rf != NULL);

  /* Background entropy: uniform distribution over the K residues. */
  ESL_ALLOC(uniform, sizeof(double) * abc->K);
  esl_vec_DSet(uniform, abc->K, 1./(abc->K));
  uniform_ent = esl_vec_DEntropy(uniform, abc->K);
  free(uniform);

  ESL_ALLOC(freq, sizeof(double) * abc->K);

  /* Header */
  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if (msa_name != NULL)
    fprintf(fp, "# Alignment name: %s\n", msa_name);
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if (use_weights)
    fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n");
  else
    fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n");
  fprintf(fp, "#\n");

  if (have_rf)
    {
      fprintf(fp, "# %7s %7s %10s %10s\n", "rfpos", "alnpos", "freqnongap", "info(bits)");
      fprintf(fp, "# %7s %7s %10s %10s\n", "-------", "-------", "----------", "----------");
    }
  else
    {
      fprintf(fp, "# %7s %10s %10s\n", "alnpos", "freqnongap", "info(bits)");
      fprintf(fp, "# %7s %10s %10s\n", "-------", "----------", "----------");
    }

  /* One line per alignment column */
  for (col = 0; col < alen; col++)
    {
      if (have_rf)
        {
          if (i_am_rf[col]) fprintf(fp, " %7d", ++nrf);
          else              fprintf(fp, " %7s", "-");
        }

      nres = esl_vec_DSum(abc_ct[col], abc->K);
      esl_vec_DCopy(abc_ct[col], abc->K, freq);
      esl_vec_DNorm(freq, abc->K);

      frac_nongap = nres / (nres + abc_ct[col][abc->K]);
      info        = uniform_ent - esl_vec_DEntropy(freq, abc->K);
      fprintf(fp, " %7d %10.8f %10.8f\n", col+1, frac_nongap, info);
    }
  fprintf(fp, "//\n");

  free(freq);
  return eslOK;

 ERROR:
  ESL_FAIL(eslEINVAL, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
/* dump_posterior_sequence_info
 *
 * Write per-sequence posterior probability (PP) statistics to <fp>.
 *
 * For every sequence i with msa->pp[i] != NULL, each aligned PP character
 * is converted to a class index with get_pp_idx() and tallied in one of 12
 * classes. The output line gives the sequence index and name, the number
 * of characters in the first 11 classes ("nnongap"), the count for each of
 * those 11 classes, and an average PP computed as the count-weighted mean
 * of a representative value per class (0.025, 0.10, ..., 0.90, 0.975).
 * The 12th class is tallied but left out of the per-sequence output.
 *
 * Returns eslOK on success, or eslEFORMAT (message in <errbuf>) as soon as
 * get_pp_idx() rejects a PP character; output written up to that point is
 * not undone.
 */
static int
dump_posterior_sequence_info(FILE *fp, ESL_MSA *msa, int nali, char *alifile, char *errbuf)
{
  const int   nclasses    = 12;             /* all PP classes                       */
  const int   nresclass   = 11;             /* classes reported per sequence        */
  const char  ppsymbols[] = "0123456789*."; /* column labels, one per class         */
  const float ppmid[11]   = { 0.025, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.975 };
  int         counts[12];                   /* per-class tallies for one sequence   */
  int         seqidx, cls, col;             /* counters: sequences, classes, columns */
  int         ppidx;
  int         nnongap;
  double      sum;

  /* Header */
  fprintf(fp, "# Posterior probability stats per sequence:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if (msa->name != NULL)
    fprintf(fp, "# Alignment name: %s\n", msa->name);
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);

  fprintf(fp, "# %7s %-40s %7s", "seqidx", "seqname", "nnongap");
  for (cls = 0; cls < nresclass; cls++)
    fprintf(fp, " %7c", ppsymbols[cls]);
  fprintf(fp, " %7s\n", "avgPP");

  fprintf(fp, "# %7s %40s %7s", "-------", "----------------------------------------", "-------");
  for (cls = 0; cls < nresclass; cls++)
    fprintf(fp, " %7s", "-------");
  fprintf(fp, " %7s\n", "-------");

  /* One line per sequence that carries PP annotation */
  for (seqidx = 0; seqidx < msa->nseq; seqidx++)
    {
      if (msa->pp[seqidx] == NULL) continue;

      fprintf(fp, " %7d %-40s", seqidx+1, msa->sqname[seqidx]);
      sum = 0.;
      esl_vec_ISet(counts, nclasses, 0);

      for (col = 0; col < msa->alen; col++)
        {
          ppidx = get_pp_idx(msa->abc, msa->pp[seqidx][col]);
          if (ppidx == -1) ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[seqidx][col]);
          counts[ppidx]++;
        }

      nnongap = esl_vec_ISum(counts, 11);
      fprintf(fp, " %7d", nnongap);

      for (cls = 0; cls < nresclass; cls++)
        {
          fprintf(fp, " %7d", counts[cls]);
          sum += (float) counts[cls] * ppmid[cls];
        }
      fprintf(fp, " %.5f\n", sum / (float) nnongap);
    }
  fprintf(fp, "//\n");

  return eslOK;
}
/* dump_insert_info
 *
 * Given a digitized MSA with RF annotation, print how many insertions
 * follow each reference (flagged in <i_am_rf>) column.
 *
 * A column apos (1..alen) with i_am_rf[apos-1] == 0 is an insert column;
 * its residues are credited to the reference position that precedes it
 * (rfpos 0 = before the first reference column). For every rfpos that has
 * at least one inserting sequence, one line reports: rfpos, the (possibly
 * weighted) number of sequences with an insert there, that number divided
 * by msa->nseq, and the (possibly weighted) total inserted residues
 * divided by the number of inserting sequences.
 *
 * Args:  fp          - open output stream
 *        msa         - the alignment; must be digital and have msa->rf
 *        use_weights - TRUE to weight sequences by msa->wgt[]
 *        nali        - alignment index, reported in the header
 *        i_am_rf     - [0..alen-1] nonzero for reference columns
 *        alifile     - alignment file name, reported in the header
 *        errbuf      - buffer for an error message
 *
 * Returns eslOK on success; eslEINVAL or eslEINCOMPAT on contract
 * violation (errbuf filled); the ESL_ALLOC() failure status on allocation
 * failure. All memory allocated here is released on every return path.
 */
static int
dump_insert_info(FILE *fp, ESL_MSA *msa, int use_weights, int nali, int *i_am_rf, char *alifile, char *errbuf)
{
  int      status;
  int      apos, rfpos;
  double **ict       = NULL; /* [0..rfpos..alen][0..i..nseq-1] insert residues of seq i after rfpos */
  double  *total_ict = NULL; /* [0..rfpos..alen+1] weighted insert residues after rfpos, all seqs   */
  int      i;
  int      rflen;
  double   seqwt;            /* weight of current sequence */
  double   nseq;             /* (weighted) number of seqs with an insert after rfpos */

  /* contract check */
  if(! (msa->flags & eslMSA_DIGITAL)) ESL_XFAIL(eslEINVAL, errbuf, "in dump_insert_info(), msa must be digitized.");
  if(msa->rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "No #=GC RF markup in alignment, it is needed for --iinfo.");
  if(i_am_rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "internal error, dump_insert_info() i_am_rf is NULL.");
  if(use_weights && msa->wgt == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "dump_insert_info(): use_weights==TRUE but msa->wgt == NULL");

  ESL_ALLOC(total_ict, sizeof(double) * (msa->alen+2));
  esl_vec_DSet(total_ict, (msa->alen+2), 0.);

  ESL_ALLOC(ict, sizeof(double *) * (msa->alen+2));
  /* NULL every row first so the ERROR path can free a partially built table */
  for(i = 0; i <= msa->alen+1; i++) ict[i] = NULL;
  for(i = 0; i <= msa->alen; i++)
    {
      ESL_ALLOC(ict[i], sizeof(double) * (msa->nseq));
      esl_vec_DSet(ict[i], (msa->nseq), 0.);
    }

  fprintf(fp, "# Insert information:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# rfpos is the nongap RF position after which insertions occur\n");
  fprintf(fp, "# An rfpos of '0' indicates insertions before the first nongap RF position\n");
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");
  fprintf(fp, "# %8s %10s %8s %8s\n", "rfpos", "nseq w/ins", "freq ins", "avg len");
  fprintf(fp, "# %8s %10s %8s %8s\n", "--------", "----------", "--------", "--------");

  /* Tally inserted residues per (rfpos, sequence). ict[][] holds raw
   * residue counts; only total_ict[] is weighted. */
  rfpos = 0;
  for(apos = 1; apos <= msa->alen; apos++) {
    if(i_am_rf[apos-1]) rfpos++;
    else {
      for(i = 0; i < msa->nseq; i++) {
        seqwt = use_weights ? msa->wgt[i] : 1.0;
        if(esl_abc_XIsResidue(msa->abc, msa->ax[i][apos])) {
          ict[rfpos][i]++;
          total_ict[rfpos] += seqwt;
        }
      }
    }
  }
  rflen = rfpos; /* number of reference columns seen */

  /* Report every rfpos (0..rflen) after which at least one sequence inserts */
  for(rfpos = 0; rfpos <= rflen; rfpos++) {
    nseq = 0.;
    for(i = 0; i < msa->nseq; i++) {
      if(ict[rfpos][i] >= 1) {
        seqwt = use_weights ? msa->wgt[i] : 1.0;
        nseq += seqwt;
      }
    }
    if(nseq > 0.)
      fprintf(fp, " %8d %10.1f %8.6f %8.3f\n", rfpos, nseq, nseq / (float) msa->nseq, ((float) total_ict[rfpos] / (float) nseq));
  }
  fprintf(fp, "//\n");

  for(i = 0; i <= msa->alen; i++) free(ict[i]);
  free(ict);
  free(total_ict);
  return eslOK;

 ERROR:
  /* Reached from the contract checks (nothing allocated yet) or from a
   * failed ESL_ALLOC(); release whatever exists. */
  if(ict != NULL) {
    for(i = 0; i <= msa->alen; i++) free(ict[i]);
    free(ict);
  }
  free(total_ict);
  return status;
}
/* map_msas
 *
 * Align msa1 and msa2: for each column in msa1, determine the
 * corresponding column in msa2. This implementation requires:
 *   - msa1 and msa2 are both digitized;
 *   - msa1 and msa2 contain exactly the same sequences in the same order.
 *     Note: the seqs in msa1 and msa2 do not have to have the same names.
 *
 * Uses a DP algorithm similar to Needleman-Wunsch, but aligning two
 * alignment columns at a time instead of two residues. The score of
 * pairing column apos1 with column apos2 is the number of sequences whose
 * residue in apos1 of msa1 is the same residue that sits in apos2 of msa2.
 *
 * Unless -q is set, a table of the mapped column pairs is printed to
 * stdout during traceback. Coverage statistics are always printed, and
 * map2masks() is called to write any mask files requested on the
 * command line.
 *
 * On success, <*ret_msa1_to_msa2_map> is a newly allocated array
 * [0..alen1]: element apos1 (1..alen1) is the msa2 column that msa1
 * column apos1 maps to, or 0 if it maps to no column; element 0 is -1
 * (invalid). The caller receives ownership of the array.
 *
 * Returns eslOK on success; eslEINVAL (errbuf filled) if an msa is not
 * digital, the raw sequences differ, or the traceback is inconsistent;
 * otherwise the failing status of map_rfpos_to_apos(), ESL_ALLOC() or
 * map2masks().
 *
 * NOTE(review): the early ESL_FAIL()/return exits and the ERROR path do
 * not free the arrays allocated in this function - presumably acceptable
 * because the caller treats any failure as fatal; confirm.
 */
static int map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map)
{
  int status;
  int **one2two; /* [0..apos1..alen1][0..apos2..alen2] number of sequences whose residue in msa1
                  * column apos1 is aligned in msa2 column apos2 */
  int *rf2a_map1 = NULL; /* msa1 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa1->rf == NULL */
  int *rf2a_map2 = NULL; /* msa2 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa2->rf == NULL */
  int *a2rf_map1 = NULL; /* msa1 map of alignment columns to reference columns, NULL if msa1->rf == NULL */
  int *a2rf_map2 = NULL; /* msa2 map of alignment columns to reference columns, NULL if msa2->rf == NULL */
  int apos1, apos2; /* counters over alignment position in msa1, msa2 respectively */
  int alen1, alen2; /* alignment lengths */
  int rfpos1, rfpos2; /* counters over reference positions */
  int rflen1, rflen2; /* reference (non-gap RF) lengths */
  int **mx; /* [0..apos1..alen1][0..apos2..alen2] dp matrix, score of max scoring aln
             * of columns 1..apos1 in msa1 and 1..apos2 in msa2 */
  int **tb; /* [0..apos1..alen1][0..apos2..alen2] traceback ptrs: DIAG, VERT or HORZ; -1 marks cell [0][0] */
  char *seq1, *seq2; /* temporary strings for ensuring dealigned sequences in msa1 and msa2 are identical */
  int64_t len1, len2; /* length of seq1, seq2 */
  int isgap1, isgap2; /* is this residue a gap in msa1, msa2? */
  int i; /* counter over sequences */
  int *res1_per_apos; /* [0..apos..alen1] sum over msa2 columns of one2two[apos][], i.e. msa1 residues in column apos */
  int sc; /* max score of full path (alignment) through dp mx */
  int tb_sc; /* score of traceback, should equal sc */
  int *one2two_map; /* [0..a..alen1] the alignment, msa2 column that column apos1 in msa1 maps to */
  int total_res = 0; /* total number of residues in msa1 */
  float coverage; /* fraction of total_res that are within mapped msa2 columns from one2two_map,
                   * this is tb_sc / total_res */
  int total_cres1=0; /* total number of residues in reference positions in msa1 */
  int covered_cres1 = 0; /* number of residues in reference positions in msa1 that also appear in the corresponding
                          * mapped column of msa2 */
  int be_quiet = esl_opt_GetBoolean(go, "-q");
  int *choices; /* [0..NCHOICES-1] candidate scores for the current dp cell, indexed by DIAG/VERT/HORZ */
  int i_choice; /* index of the best entry in choices[] */

  /* contract check */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2));
  alen1 = msa1->alen;
  alen2 = msa2->alen;

  /* Map reference (non-gap RF) columns to alignment positions, for whichever msa has RF annotation */
  rflen1 = rflen2 = 0;
  if(msa1->rf != NULL) if((status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) goto ERROR;
  if(msa2->rf != NULL) if((status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) goto ERROR;
  if(! be_quiet) {
    printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 1), alen1);
    printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 2), alen2);
  }

  /* collect counts in one2two[i][j]: number of sequences for which the residue aligned in msa1
   * column i is aligned in msa2 alignment column j.
   */
  ESL_ALLOC(seq1, sizeof(char) * (alen1+1));
  ESL_ALLOC(seq2, sizeof(char) * (alen2+1));
  ESL_ALLOC(one2two, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(one2two[apos1], (alen2+1), 0);
  }
  total_res = 0;
  for(i = 0; i < msa1->nseq; i++) {
    /* ensure raw (unaligned) seq i in the 2 msas is the same */
    esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1);
    esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */
    esl_strdealign(seq1, seq1, "-_.~", &len1);
    esl_strdealign(seq2, seq2, "-_.~", &len2);
    if(len1 != len2) {
      ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n", i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2);
    }
    if(strncmp(seq1, seq2, len1) != 0) ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
    total_res += len1;

    /* Walk both aligned versions of seq i in parallel, skipping gaps on either side;
     * each time both are at the same residue, credit that (apos1, apos2) column pair.
     * NOTE(review): the loop runs while EITHER index is in range, so one index can
     * sit at alen+1 while the other catches up - presumably this relies on position
     * alen+1 of a digital sequence being a non-gap sentinel; confirm. */
    apos1 = apos2 = 1;
    while((apos1 <= alen1) || (apos2 <= alen2)) {
      isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]);
      isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]);
      if ( isgap1 && isgap2) { apos1++; apos2++; }
      else if ( isgap1 && !isgap2) { apos1++; }
      else if (!isgap1 && isgap2) { apos2++; }
      else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) {
        one2two[apos1++][apos2++]++;
        /* two2one[apos2][apos1]++; */
      }
    }
  }

  /******************************************************************
   * DP alignment of msa1 to msa2
   * dp matrix: mx[apos1][apos2] apos1=1..alen1, apos2=1..alen2 (apos1=0 || apos2=0 is invalid)
   * mx[apos1][apos2] = score of maximal alignment for apos1'=1..apos1, apos2'=1..apos2 INCLUDING
   *                    apos1 and apos2. Score is number of residues from msa1 columns
   *                    1..apos1 that exist in their respective aligned columns in msa2 (the growing
   *                    maximally scoring alignment).
   */

  /******************************************************************
   * initialization
   */
  ESL_ALLOC(mx, sizeof(int *) * (alen1+1));
  ESL_ALLOC(tb, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1));
    ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(mx[apos1], (alen2+1), 0);
    esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */
    tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==1 */
  }
  esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==1 */
  tb[0][0] = -2; /* all alignments must end here */
  ESL_ALLOC(res1_per_apos, sizeof(int) * (alen1+1));
  esl_vec_ISet(res1_per_apos, (alen1+1), 0);
  mx[0][0] = 0;
  tb[0][0] = -1; /* last cell, special value that stops the traceback loop */

  /*****************************************************************
   * recursion: each cell takes the best of a DIAG move (pair apos1 with
   * apos2, gaining one2two[apos1][apos2]), a VERT move (from apos2-1) or
   * a HORZ move (from apos1-1); the chosen move is recorded in tb.
   */
  ESL_ALLOC(choices, sizeof(int) * NCHOICES);
  for(apos1 = 1; apos1 <= alen1; apos1++) {
    for(apos2 = 1; apos2 <= alen2; apos2++) {
      choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2];
      choices[VERT] = mx[ apos1 ][(apos2-1)];
      choices[HORZ] = mx[(apos1-1)][ apos2 ];
      i_choice = esl_vec_IArgMax(choices, NCHOICES);
      mx[apos1][apos2] = choices[i_choice];
      tb[apos1][apos2] = i_choice;
      res1_per_apos[apos1] += one2two[apos1][apos2];
      /*printf("mx[%3d][%3d]: %5d (%d)\n", apos1, apos2, mx[apos1][apos2], tb[apos1][apos2]);*/
    }
  }
  free(choices);

  /* total residues sitting in reference columns of msa1 (only known if msa1 has RF) */
  total_cres1 = 0;
  if(rf2a_map1 != NULL) {
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]];
  }

  /*****************************************************************
   * traceback
   */
  sc = mx[alen1][alen2];
  if(!be_quiet) {
    /* printf("score %d\n", sc);*/
    /* table header; its layout depends on which of the two msas have RF annotation */
    if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
      printf("# %12s %12s %22s\n", " msa 1 ", " msa 2 ", "");
      printf("# %12s %12s %22s\n", "------------", "------------", "");
      printf("# %5s %5s %5s %5s %22s\n", "rfpos", "apos", "rfpos", "apos", " num common residues");
      printf("# %5s %5s %5s %5s %22s\n", "-----", "-----", "-----", "-----", "---------------------");
    }
    else if(a2rf_map1 != NULL) {
      printf("# %12s %5s %22s\n", " msa 1 ", "msa 2", "");
      printf("# %12s %5s %22s\n", "------------", "-----", "");
      printf("# %5s %5s %5s %22s\n", "rfpos", "apos", "apos", " num common residues");
      printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else if (a2rf_map2 != NULL) {
      printf("# %5s %12s %22s\n", "msa 1", " msa 2 ", "");
      printf("# %5s %12s %22s\n", "-----", "------------", "");
      printf("# %5s %5s %5s %22s\n", "apos", "rfpos", "apos", " num common residues");
      printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else {
      printf("# %5s %5s %22s\n", "msa 1", "msa 2", "");
      printf("# %5s %5s %22s\n", "-----", "-----", "");
      printf("# %5s %5s %22s\n", "apos", "apos", " num common residues");
      printf("# %5s %5s %22s\n", "-----", "-----", "---------------------");
    }
  }

  /* traceback, and build one2two_map[]; columns never reached by a DIAG move keep the value 0 */
  apos1 = alen1;
  apos2 = alen2;
  tb_sc = 0;
  covered_cres1 = 0;
  ESL_ALLOC(one2two_map, sizeof(int) * (alen1+1));
  esl_vec_ISet(one2two_map, (alen1+1), 0);
  one2two_map[0] = -1; /* invalid */
  while(tb[apos1][apos2] != -1) {
    if(tb[apos1][apos2] == DIAG) { /* diagonal move: apos1 maps to apos2 */
      rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1];
      rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2];
      if(!be_quiet) {
        /* one table row per mapped pair; an rfpos of -1 is printed as "-" */
        if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
          if(rfpos1 == -1 && rfpos2 == -1) {
            printf(" %5s %5d --> %5s %5d %5d / %5d (%.4f)\n", "-", apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else if (rfpos1 == -1) {
            printf(" %5s %5d --> %5d %5d %5d / %5d (%.4f)\n", "-", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else if (rfpos2 == -1) {
            printf(" %5d %5d --> %5s %5d %5d / %5d (%.4f)\n", rfpos1, apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else {
            printf(" %5d %5d --> %5d %5d %5d / %5d (%.4f)\n", rfpos1, apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
        }
        else if(a2rf_map1 != NULL) {
          if (rfpos1 == -1) {
            printf(" %5s %5d --> %5d %5d / %5d (%.4f)\n", "-", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else {
            printf(" %5d %5d --> %5d %5d / %5d (%.4f)\n", rfpos1, apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
        }
        else if (a2rf_map2 != NULL) {
          if (rfpos2 == -1) {
            printf(" %5d --> %5s %5d %5d / %5d (%.4f)\n", apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else {
            printf(" %5d --> %5d %5d %5d / %5d (%.4f)\n", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
        }
        else {
          printf(" %5d --> %5d %5d / %5d (%.4f)\n", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
        }
      }
      tb_sc += one2two[apos1][apos2];
      one2two_map[apos1] = apos2;
      if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */
      apos1--; apos2--;
    }
    else if(tb[apos1][apos2] == VERT) {
      apos2--; /* vertical move */
    }
    else if(tb[apos1][apos2] == HORZ) {
      apos1--; /* horizontal move */
    }
    else if(tb[apos1][apos2] != -1) /* shouldn't happen */
      ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]);
  }
  /* done DP code **********************************/

  if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc);
  if(tb_sc != sc) ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc);
  coverage = (float) tb_sc / (float) total_res;
  printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));

  /* print masks if nec */
  if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) return status;

  /* clean up and return; one2two_map is not freed here, it is handed to the caller */
  for(apos1 = 0; apos1 <= alen1; apos1++) { free(mx[apos1]); free(tb[apos1]); }
  free(mx);
  free(tb);
  for(apos1 = 0; apos1 <= alen1; apos1++) free(one2two[apos1]);
  free(one2two);
  free(res1_per_apos);
  if(rf2a_map1 != NULL) free(rf2a_map1);
  if(rf2a_map2 != NULL) free(rf2a_map2);
  if(a2rf_map1 != NULL) free(a2rf_map1);
  if(a2rf_map2 != NULL) free(a2rf_map2);
  free(seq1);
  free(seq2);
  *ret_msa1_to_msa2_map = one2two_map;
  return eslOK;

 ERROR:
  return status;
}
/* map2masks * * Given a map of alignment columns in msa1 to alignment columns * to msa2, construct and output masks as per command-line options. * * Args: msa1_to_msa2_map: [1..apos..msa1->alen]: '0': msa1 apos maps to a gap in msa2 (doesn't map to any column in msa2) * 'x': msa1 apos maps to posn x in msa2 (x>0) */ static int map2masks(const ESL_GETOPTS *go, char *errbuf, int alen1, int alen2, int *a2rf_map1, int *a2rf_map2, int *rf2a_map1, int *rf2a_map2, int rflen1, int rflen2, int *msa1_to_msa2_map) { int status; int apos1, apos2; /* counters over alignment position in msa1, msa2 respectively */ int rfpos1, rfpos2; /* counters over reference positions */ int num_ones; /* number of 1s in current mask */ int num_zeroes; /* number of 0s in current mask */ FILE *fp; char *mask = NULL; if(esl_opt_GetString(go, "--mask-a2a")) { if ((fp = fopen(esl_opt_GetString(go, "--mask-a2a"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-a2a mask output file %s", esl_opt_GetString(go, "--mask-a2a")); /* construct mask as follows: * mask[0..apos1..alen1-1] = '1' if column apos1+1 maps to an alignment column of msa2 * = '0' if column apos1+1 maps to a gap in msa2 (doesn't map to any column in msa2) */ ESL_ALLOC(mask, sizeof(char) * (alen1+1)); num_ones = num_zeroes = 0; for(apos1 = 1; apos1 <= alen1; apos1++) { if(msa1_to_msa2_map[apos1] == 0) { mask[(apos1-1)] = '0'; num_zeroes++; } else { mask[(apos1-1)] = '1'; num_ones++; } } mask[alen1] = '\0'; fprintf(fp, "%s\n", mask); free(mask); fclose(fp); printf("# Mask of 1/0s with 1 indicating aln column in %s maps to aln column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-a2a"), (num_ones+num_zeroes), num_ones, num_zeroes); } if(esl_opt_GetString(go, "--mask-a2rf")) { if (a2rf_map2 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-a2rf, <msafile2> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 2)); if 
((fp = fopen(esl_opt_GetString(go, "--mask-a2rf"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-a2rf mask output file %s\n", esl_opt_GetString(go, "--mask-a2rf")); /* construct mask as follows: * mask[0..apos1..alen1-1] = '1' if column apos1+1 maps to a reference column (non-gap in RF) of msa2 * = '0' if column apos1+1 maps to a gap (doesn't map to any column in msa2) or an insert (gap in RF) in msa2 */ ESL_ALLOC(mask, sizeof(char) * (alen1+1)); num_ones = num_zeroes = 0; for(apos1 = 1; apos1 <= alen1; apos1++) { apos2 = msa1_to_msa2_map[apos1]; if(apos2 == 0) { mask[(apos1-1)] = '0'; num_zeroes++; } /* apos1 doesn't map to any column in msa2 */ else { rfpos2 = a2rf_map2[apos2]; if(rfpos2 <= 0) { mask[(apos1-1)] = '0'; num_zeroes++; } /* apos1 maps to a gap RF (insert) in msa2 */ else { mask[(apos1-1)] = '1'; num_ones++; } /* apos1 maps to a non-gap RF (reference) column in msa2 */ } } mask[alen1] = '\0'; fprintf(fp, "%s\n", mask); free(mask); fclose(fp); printf("# Mask of 1/0s with 1 indicating aln column in %s maps to reference (non-gap RF) column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-a2rf"), (num_ones+num_zeroes), num_ones, num_zeroes); } if(esl_opt_GetString(go, "--mask-rf2a")) { if (a2rf_map1 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-rf2a, <msafile1> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 1)); if ((fp = fopen(esl_opt_GetString(go, "--mask-rf2a"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-rf2a mask output file %s\n", esl_opt_GetString(go, "--mask-rf2a")); /* construct mask as follows: * mask[0..rfpos1..rflen-1] = '1' if non-gap RF msa1 column rfpos1+1 maps to an alignment column of msa2 * = '0' if non-gap RF msa1 column rfpos1+1 maps to a gap in msa2 (doesn't map to any column in msa2) */ ESL_ALLOC(mask, sizeof(char) * (rflen1+1)); num_ones = num_zeroes = 0; for(rfpos1 = 1; rfpos1 <= 
rflen1; rfpos1++) { apos1 = rf2a_map1[rfpos1]; apos2 = msa1_to_msa2_map[apos1]; if(apos2 == 0) { mask[(rfpos1-1)] = '0'; num_zeroes++; } else { mask[(rfpos1-1)] = '1'; num_ones++; } } mask[rflen1] = '\0'; fprintf(fp, "%s\n", mask); free(mask); fclose(fp); printf("# Mask of 1/0s with 1 indicating reference (non-gap RF) column in %s maps to aln column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-rf2a"), (num_ones+num_zeroes), num_ones, num_zeroes); } if(esl_opt_GetString(go, "--mask-rf2rf")) { if (a2rf_map1 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-rf2rf, <msafile1> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 1)); if (a2rf_map2 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-rf2rf, <msafile2> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 2)); if ((fp = fopen(esl_opt_GetString(go, "--mask-rf2rf"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-rf2rf mask output file %s\n", esl_opt_GetString(go, "--mask-rf2rf")); /* construct mask as follows: * mask[0..apos1..alen-1] = '1' if column apos1+1 maps to a reference column (non-gap in RF) of msa2 * = '0' if column apos1+1 maps to a gap (doesn't map to any column in msa2) or an insert (gap in RF) in msa2 */ ESL_ALLOC(mask, sizeof(char) * (alen1+1)); num_ones = num_zeroes = 0; for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) { apos1 = rf2a_map1[rfpos1]; apos2 = msa1_to_msa2_map[apos1]; if(apos2 == 0) { mask[(rfpos1-1)] = '0'; num_zeroes++; } /* rfpos1 doesn't map to any column in msa2 */ else { rfpos2 = a2rf_map2[apos2]; if(rfpos2 <= 0) { mask[(rfpos1-1)] = '0'; num_zeroes++; } /* rfpos1 maps to a gap RF (insert) in msa2 */ else { mask[(rfpos1-1)] = '1'; num_ones++; } /* rfpos1 maps to a non-gap RF (reference) column in msa2 */ } } mask[rflen1] = '\0'; fprintf(fp, "%s\n", mask); free(mask); fclose(fp); printf("# Mask of 1/0s with 1 indicating reference (non-gap RF) 
column in %s maps to reference (non-gap RF) column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-rf2rf"), (num_ones+num_zeroes), num_ones, num_zeroes); } return eslOK; ERROR: ESL_FAIL(eslEMEM, errbuf, "map2masks(): memory allocation error."); return status; /* NEVERREACHED */ }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile1= NULL; /* alignment 1 file name */ char *alifile2= NULL; /* alignment 2 file name */ int fmt; /* format code for alifiles */ ESLX_MSAFILE *afp1 = NULL; /* open alignment file 1 */ ESLX_MSAFILE *afp2 = NULL; /* open alignment file 2 */ ESL_MSA *msa1 = NULL; /* multiple sequence alignment 1 */ ESL_MSA *msa2 = NULL; /* multiple sequence alignment 2 */ int status; /* easel return code */ char errbuf[eslERRBUFSIZE*4]; int *msa1_to_msa2_map; /* map from <msafile1> to <msafile2> */ char *sub_msa1_to_msa2_mask; /* with --sub the map from <msafile1> to <msafile2> in mask form */ FILE *subfp = NULL; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile1 = esl_opt_GetArg(go, 1); alifile2 = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA files ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) 
abc = esl_alphabet_Create(eslRNA); if ( (status = eslx_msafile_Open(&abc, alifile1, NULL, fmt, NULL, &afp1)) != eslOK) eslx_msafile_OpenFailure(afp1, status); if ( (status = eslx_msafile_Open(&abc, alifile2, NULL, fmt, NULL, &afp2)) != eslOK) eslx_msafile_OpenFailure(afp2, status); /****************************************************************** * Read first alignment from each file, we only use the first one ******************************************************************/ if ((status = eslx_msafile_Read(afp1, &msa1)) != eslOK) eslx_msafile_ReadFailure(afp1, status); if ((status = eslx_msafile_Read(afp2, &msa2)) != eslOK) eslx_msafile_ReadFailure(afp2, status); /* map the alignments in msa1 and msa2 */ if(! esl_opt_IsOn(go, "--submap")) { if((status = map_msas(go, errbuf, msa1, msa2, &msa1_to_msa2_map)) != eslOK) goto ERROR; free(msa1_to_msa2_map); } /* --submap: if nec, map <msafile1> to a subset of it's own columns in <msafile2> */ else { /* --submap was enabled */ if ((subfp = fopen(esl_opt_GetString(go, "--submap"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --submap output file %s\n", esl_opt_GetString(go, "--submap")); if((status = map_sub_msas(go, errbuf, msa1, msa2, &sub_msa1_to_msa2_mask)) != eslOK) goto ERROR; fprintf(subfp, "%s\n", sub_msa1_to_msa2_mask); fclose(subfp); subfp = NULL; printf("# Mask of 1/0s with 1 indicating aln column in %s maps to a column in %s saved to file %s.\n", alifile1, alifile2, esl_opt_GetString(go, "--submap")); free(sub_msa1_to_msa2_mask); } /* Cleanup, normal return */ eslx_msafile_Close(afp1); eslx_msafile_Close(afp2); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); return 0; ERROR: if (afp1) eslx_msafile_Close(afp1); if (afp2) eslx_msafile_Close(afp2); if (go) esl_getopts_Destroy(go); if (msa1) esl_msa_Destroy(msa1); if (msa2) esl_msa_Destroy(msa2); if (subfp) fclose(subfp); esl_fatal(errbuf); return 1; /* never reached */ }
int p7_masstrace_Validate(const P7_MASSTRACE *mt, char *errbuf) { float tol = 1e-3; int i,k; if (mt->L <= 0) ESL_FAIL(eslFAIL, errbuf, "L=0"); if (mt->M <= 0) ESL_FAIL(eslFAIL, errbuf, "L=0"); if (mt->i0 < 1 || mt->i0 > mt->L) ESL_FAIL(eslFAIL, errbuf, "i0 range"); if (mt->k0 < 1 || mt->k0 > mt->M) ESL_FAIL(eslFAIL, errbuf, "k0 range"); if ( ! p7_trace_IsMain(mt->st0)) ESL_FAIL(eslFAIL, errbuf, "st0 not {MID}{LG}"); if (mt->imass && mt->imass[0] != 0.) ESL_FAIL(eslFAIL, errbuf, "imass[0] not 0"); if (mt->imass && mt->imass[mt->i0] != 1.) ESL_FAIL(eslFAIL, errbuf, "imass[i0] not 1"); if (mt->imass && mt->imass[mt->L+1] != 0.) ESL_FAIL(eslFAIL, errbuf, "imass[L+1] not 0"); if (mt->kmass[0] != 0.) ESL_FAIL(eslFAIL, errbuf, "kmass[0] not 0"); if (mt->kmass[mt->k0] != 1.) ESL_FAIL(eslFAIL, errbuf, "kmass[k0] not 1"); if (mt->kmass[mt->M+1] != 0.) ESL_FAIL(eslFAIL, errbuf, "kmass[M+1] not 0"); if (mt->imass) { for (i = 0; i <= mt->L+1; i++) if (!isfinite(mt->imass[i]) || mt->imass[i] < 0.0 || mt->imass[i] > 1+tol) ESL_FAIL(eslFAIL, errbuf, "imass[%d] isn't a probability: %f\n", i, mt->imass[i]); } for (k = 0; k <= mt->M+1; k++) if (!isfinite(mt->kmass[k]) || mt->kmass[k] < 0.0 || mt->kmass[k] > 1+tol) ESL_FAIL(eslFAIL, errbuf, "kmass[%d] isn't a probability: %f\n", k, mt->kmass[k]); return eslOK; }
/* Function: esl_min_ConjugateGradientDescent() * Incept: SRE, Wed Jun 22 08:49:42 2005 [St. Louis] * * Purpose: n-dimensional minimization by conjugate gradient descent. * * An initial point is provided by <x>, a vector of <n> * components. The caller also provides a function <*func()> that * compute the objective function f(x) when called as * <(*func)(x, n, prm)>, and a function <*dfunc()> that can * compute the gradient <dx> at <x> when called as * <(*dfunc)(x, n, prm, dx)>, given an allocated vector <dx> * to put the derivative in. Any additional data or fixed * parameters that these functions require are passed by * the void pointer <prm>. * * The first step of each iteration is to try to bracket * the minimum along the current direction. The initial step * size is controlled by <u[]>; the first step will not exceed * <u[i]> for any dimension <i>. (You can think of <u> as * being the natural "units" to use along a graph axis, if * you were plotting the objective function.) * * The caller also provides an allocated workspace sufficient to * hold four allocated n-vectors. (4 * sizeof(double) * n). * * Iterations continue until the objective function has changed * by less than a fraction <tol>. This should not be set to less than * sqrt(<DBL_EPSILON>). * * Upon return, <x> is the minimum, and <ret_fx> is f(x), * the function value at <x>. * * Args: x - an initial guess n-vector; RETURN: x at the minimum * u - "units": maximum initial step size along gradient when bracketing. * n - dimensionality of all vectors * *func() - function for computing objective function f(x) * *dfunc() - function for computing a gradient at x * prm - void ptr to any data/params func,dfunc need * tol - convergence criterion applied to f(x) * wrk - allocated 4xn-vector for workspace * ret_fx - optRETURN: f(x) at the minimum * * Returns: <eslOK> on success. * * Throws: <eslENOHALT> if it fails to converge in MAXITERATIONS. 
* <eslERANGE> if the minimum is not finite, which may * indicate a problem in the implementation or choice of <*func()>. * * Xref: STL9/101. */ int esl_min_ConjugateGradientDescent(double *x, double *u, int n, double (*func)(double *, int, void *), void (*dfunc)(double *, int, void *, double *), void *prm, double tol, double *wrk, double *ret_fx) { double oldfx; double coeff; int i, i1; double *dx, *cg, *w1, *w2; double cvg; double fa,fb,fc; double ax,bx,cx; double fx; dx = wrk; cg = wrk + n; w1 = wrk + 2*n; w2 = wrk + 3*n; oldfx = (*func)(x, n, prm); /* init the objective function */ /* Bail out if the function is +/-inf: this can happen if the caller * has screwed something up, or has chosen a bad start point. */ if (oldfx == eslINFINITY || oldfx == -eslINFINITY) ESL_EXCEPTION(eslERANGE, "minimum not finite"); if (dfunc != NULL) { (*dfunc)(x, n, prm, dx); /* find the current negative gradient, - df(x)/dxi */ esl_vec_DScale(dx, n, -1.0); } else numeric_derivative(x, u, n, func, prm, 1e-4, dx); /* resort to brute force */ esl_vec_DCopy(dx, n, cg); /* and make that the first conjugate direction, cg */ /* (failsafe) convergence test: a zero direction can happen, * and it either means we're stuck or we're finished (most likely stuck) */ for (i1 = 0; i1 < n; i1++) if (cg[i1] != 0.) break; if (i1 == n) { if (ret_fx != NULL) *ret_fx = oldfx; return eslOK; } for (i = 0; i < MAXITERATIONS; i++) { /* Figure out the initial step size. */ bx = fabs(u[0] / cg[0]); for (i1 = 1; i1 < n; i1++) { cx = fabs(u[i1] / cg[i1]); if (cx < bx) bx = cx; } /* Bracket the minimum. */ bracket(x, cg, n, bx, func, prm, w1, &ax, &bx, &cx, &fa, &fb, &fc); /* Minimize along the line given by the conjugate gradient <cg> */ brent(x, cg, n, func, prm, ax, cx, 1e-3, 1e-8, w2, NULL, &fx); esl_vec_DCopy(w2, n, x); /* Bail out if the function is now +/-inf: this can happen if the caller * has screwed something up. 
*/ if (fx == eslINFINITY || fx == -eslINFINITY) ESL_EXCEPTION(eslERANGE, "minimum not finite"); /* Find the negative gradient at that point (temporarily in w1) */ if (dfunc != NULL) { (*dfunc)(x, n, prm, w1); esl_vec_DScale(w1, n, -1.0); } else numeric_derivative(x, u, n, func, prm, 1e-4, w1); /* resort to brute force */ /* Calculate the Polak-Ribiere coefficient */ for (coeff = 0., i1 = 0; i1 < n; i1++) coeff += (w1[i1] - dx[i1]) * w1[i1]; coeff /= esl_vec_DDot(dx, dx, n); /* Calculate the next conjugate gradient direction in w2 */ esl_vec_DCopy(w1, n, w2); esl_vec_DAddScaled(w2, cg, coeff, n); /* Finishing set up for next iteration: */ esl_vec_DCopy(w1, n, dx); esl_vec_DCopy(w2, n, cg); /* Now: x is the current point; * fx is the function value at that point; * dx is the current gradient at x; * cg is the current conjugate gradient direction. */ /* Main convergence test. 1e-9 factor is fudging the case where our * minimum is at exactly f()=0. */ cvg = 2.0 * fabs((oldfx-fx)) / (1e-10 + fabs(oldfx) + fabs(fx)); // fprintf(stderr, "(%d): Old f() = %.9f New f() = %.9f Convergence = %.9f\n", i, oldfx, fx, cvg); // fprintf(stdout, "(%d): Old f() = %.9f New f() = %.9f Convergence = %.9f\n", i, oldfx, fx, cvg); #if eslDEBUGLEVEL >= 2 printf("\nesl_min_ConjugateGradientDescent():\n"); printf("new point: "); for (i1 = 0; i1 < n; i1++) printf("%g ", x[i1]); printf("\nnew gradient: "); for (i1 = 0; i1 < n; i1++) printf("%g ", dx[i1]); numeric_derivative(x, u, n, func, prm, 1e-4, w1); printf("\n(numeric grad): "); for (i1 = 0; i1 < n; i1++) printf("%g ", w1[i1]); printf("\nnew direction: "); for (i1 = 0; i1 < n; i1++) printf("%g ", cg[i1]); printf("\nOld f() = %g New f() = %g Convergence = %g\n\n", oldfx, fx, cvg); #endif if (cvg <= tol) break; /* Second (failsafe) convergence test: a zero direction can happen, * and it either means we're stuck or we're finished (most likely stuck) */ for (i1 = 0; i1 < n; i1++) if (cg[i1] != 0.) 
break; if (i1 == n) break; oldfx = fx; } if (ret_fx != NULL) *ret_fx = fx; if (i == MAXITERATIONS) ESL_FAIL(eslENOHALT, NULL, " "); // ESL_EXCEPTION(eslENOHALT, "Failed to converge in ConjugateGradientDescent()"); return eslOK; }
/* Function: DispatchSqAlignment() * Date: EPN, Thu Jan 12 14:47:26 2012 * * Purpose: Given a CM and a sequence, align the sequence(s) using * the appropriate alignment function and return relevant * data for eventual output in <ret_data>. * * This function can be called from either an alignment * pipeline (i.e. cmalign) or a search/scan pipeline * (i.e. cmsearch or cmscan). <idx> is the (overloaded) flag * for determining which, if -1, we're a search/scan * pipeline. This is only relevant because in a search/scan * pipeline we don't care about determining spos/epos so we * don't call ParsetreeToCMBounds(). * * If (cm->flags & CM_ALIGN_XTAU) we'll potentially tighten * HMM bands until the required DP matrices are below out * limit (<mxsize>). cm->maxtau is the max allowed tau value * during this iterative band tightening, and cm->xtau is * the factor by which we multiply cm->tau at each iteration * during band tightening. * * Args: cm - the covariance model * errbuf - char buffer for reporting errors * sq - sequence to align * idx - index of sequence (may be used to reorder data later) * mxsize - max size in Mb of allowable DP mx * mode - preset mode of alignment (TRMODE_UNKNOWN if unknown) * pass_idx - pipeline pass index, determines trunc penalty * cp9b_valid - TRUE if cm->cp9b are valid, don't compute HMM bands * w - stopwatch for timing individual stages, can be NULL * w_tot - stopwatch for timing total time per seq, can be NULL * r - RNG, req'd if CM_ALIGN_SAMPLE, can be NULL otherwise * ret_data - RETURN: newly created CM_ALNDATA object * * Returns: eslOK on success; * eslEINCOMPAT on contract violation, errbuf is filled; * eslEMEM if we run out of memory; * <ret_data> is alloc'ed and filled. 
*/ int DispatchSqAlignment(CM_t *cm, char *errbuf, ESL_SQ *sq, int64_t idx, float mxsize, char mode, int pass_idx, int cp9b_valid, ESL_STOPWATCH *w, ESL_STOPWATCH *w_tot, ESL_RANDOMNESS *r, CM_ALNDATA **ret_data) { int status; /* easel status */ CM_ALNDATA *data = NULL; /* CM_ALNDATA we'll create and fill */ float sc = 0.; /* score from alignment function */ float pp = 0.; /* average PP from alignment function */ Parsetree_t *tr = NULL; /* ptr to a parsetree */ char *ppstr = NULL; /* ptr to a PP string */ float secs_bands = 0.; /* seconds elapsed for band calculation */ float secs_aln = 0.; /* seconds elapsed for alignment calculation */ float mb_tot = 0.; /* size of all DP matrices used for alignment */ double tau = -1.; /* tau used for calculating bands */ float thresh1 = -1.; /* cp9b->thresh1 used for calculating bands */ float thresh2 = -1.; /* cp9b->thresh2 used for calculating bands */ int spos = -1; /* start posn: first non-gap CM consensus position */ int epos = -1; /* end posn: final non-gap CM consensus position */ double save_tau = cm->tau; /* cm->tau upon entrance, we restore before leaving */ float save_thresh1 = (cm->cp9b == NULL) ? -1. : cm->cp9b->thresh1; float save_thresh2 = (cm->cp9b == NULL) ? -1. : cm->cp9b->thresh2; /* alignment options */ int do_nonbanded = (cm->align_opts & CM_ALIGN_NONBANDED) ? TRUE : FALSE; int do_qdb = (cm->align_opts & CM_ALIGN_QDB) ? TRUE : FALSE; int do_hbanded = (do_nonbanded || do_qdb) ? FALSE : TRUE; int do_optacc = (cm->align_opts & CM_ALIGN_OPTACC) ? TRUE : FALSE; int do_sample = (cm->align_opts & CM_ALIGN_SAMPLE) ? TRUE : FALSE; int do_post = (cm->align_opts & CM_ALIGN_POST) ? TRUE : FALSE; int do_sub = (cm->align_opts & CM_ALIGN_SUB) ? TRUE : FALSE; int do_small = (cm->align_opts & CM_ALIGN_SMALL) ? TRUE : FALSE; int do_trunc = (cm->align_opts & CM_ALIGN_TRUNC) ? TRUE : FALSE; int do_xtau = (cm->align_opts & CM_ALIGN_XTAU) ? 
TRUE : FALSE; int doing_search = FALSE; #if eslDEBUGLEVEL >= 1 printf("in DispatchSqAlignment() %s\n", sq->name); printf("\tdo_nonbanded: %d\n", do_nonbanded); printf("\tdo_optacc: %d\n", do_optacc); printf("\tdo_sample: %d\n", do_sample); printf("\tdo_post: %d\n", do_post); printf("\tdo_sub: %d\n", do_sub); printf("\tdo_small: %d\n", do_small); printf("\tdo_trunc: %d\n", do_trunc); printf("\tdo_qdb: %d\n", do_qdb); printf("\tdoing_search: %d\n", doing_search); #endif /* sub-mode specific variables (wouldn't be needed if sub mode were not supported) */ CM_t *orig_cm = cm; /* pointer to the original CM */ CM_t *sub_cm = NULL; /* the sub CM */ CMSubMap_t *submap = NULL; /* map from mother CM to sub CM, and vice versa */ Parsetree_t *full_tr = NULL; /* converted parsetree to full CM */ /* contract check */ if(do_small && do_hbanded) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do small and HMM banded alignment"); if(do_small && do_optacc) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do small and opt acc alignment"); if(do_post && do_small) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do PP and small alignment"); if(do_optacc && do_sample) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to sample and do optacc alignment"); if(do_sub && do_small) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do sub and small alignment"); if(do_sub && do_trunc) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do sub and truncated alignment"); if(do_sample && r == NULL) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to sample but RNG r == NULL"); if(do_xtau && ! 
do_hbanded) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to multiply tau without HMM banded alignment"); if(do_xtau && cp9b_valid) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to multiply tau but HMM bands already valid"); if(do_qdb && do_nonbanded) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do qdb and nonbanded alignment"); if(do_qdb && do_trunc) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs and truncated alignment"); /* qdb + trunc combo disallowed only b/c no function exists for it yet */ if(do_qdb && (! do_small)) ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs but not divide and conquer"); /* qdb + small combo disallowed b/c only non-HMM banded non-small alignment functions are not set up to use QDBs */ if(do_qdb && cm->qdbinfo == NULL) { ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs but cm->qdbinfo is NULL"); } if(do_qdb && (cm->qdbinfo->dmin2 == NULL || cm->qdbinfo->dmax2 == NULL)) { ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs but cm->qdbinfo is NULL"); } if(do_trunc && (! 
cm_pli_PassAllowsTruncation(pass_idx))) { ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do truncated alignment, but pass_idx doesn't allow truncation (PLI_PASS_STD_ANY)"); } if(pass_idx == PLI_PASS_STD_ANY && (mode == TRMODE_L || mode == TRMODE_R || mode == TRMODE_T)) { ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() mode is L, R, or T, but pass_idx is PLI_PASS_STD_ANY"); } if(w_tot != NULL) esl_stopwatch_Start(w_tot); /* do sub-mode specific pre-alignment steps, if nec */ if(do_sub) { if((status = sub_alignment_prep(cm, errbuf, sq, &submap, &sub_cm)) != eslOK) goto ERROR; cm = sub_cm; } if(w != NULL) esl_stopwatch_Start(w); /* do small D&C alignment, if nec */ if(do_small) { if(do_trunc) { sc = TrCYK_DnC(cm, sq->dsq, sq->L, 0, 1, sq->L, pass_idx, FALSE, &tr); /* FALSE: don't reproduce 1.0 behavior */ mb_tot = 4. * CYKNonQDBSmallMbNeeded(cm, sq->L); /* not sure how accurate this is */ } else { /* with QDB, always use dmin2/dmax2, the looser of the two sets of QDBs in cm->qdbinfo */ sc = CYKDivideAndConquer(cm, sq->dsq, sq->L, 0, 1, sq->L, &tr, (do_qdb) ? cm->qdbinfo->dmin2 : NULL, (do_qdb) ? cm->qdbinfo->dmax2 : NULL); mb_tot = CYKNonQDBSmallMbNeeded(cm, sq->L); } } else { /* do_small is FALSE */ if(do_nonbanded || do_qdb) { /* do not use HMM bands */ if(do_trunc) { if((status = cm_TrAlignSizeNeeded(cm, errbuf, sq->L, mxsize, do_sample, do_post, NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR; if((status = cm_TrAlign(cm, errbuf, sq->dsq, sq->L, mxsize, mode, pass_idx, do_optacc, do_sample, cm->trnb_mx, cm->trnb_shmx, cm->trnb_omx, cm->trnb_emx, r, do_post ? &ppstr : NULL, &tr, NULL, &pp, &sc)) != eslOK) goto ERROR; } else { if((status = cm_AlignSizeNeeded(cm, errbuf, sq->L, mxsize, do_sample, do_post, NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR; if((status = cm_Align(cm, errbuf, sq->dsq, sq->L, mxsize, do_optacc, do_sample, cm->nb_mx, cm->nb_shmx, cm->nb_omx, cm->nb_emx, r, do_post ? 
&ppstr : NULL, &tr, &pp, &sc)) != eslOK) goto ERROR; } } else { /* use HMM bands */ if(! cp9b_valid) { if(do_xtau) { /* multiply tau (if nec) until required mx is below Mb limit (mxsize) */ if((status = cp9_IterateSeq2Bands(cm, errbuf, sq->dsq, 1, sq->L, pass_idx, mxsize, doing_search, do_sample, do_post, cm->maxtau, NULL)) != eslOK) goto ERROR; } else { if((status = cp9_Seq2Bands(cm, errbuf, cm->cp9_mx, cm->cp9_bmx, cm->cp9_bmx, sq->dsq, 1, sq->L, cm->cp9b, doing_search, pass_idx, 0)) != eslOK) goto ERROR; } if(w != NULL) esl_stopwatch_Stop(w); secs_bands = (w == NULL) ? 0. : w->elapsed; tau = cm->tau; thresh1 = cm->cp9b->thresh1; thresh2 = cm->cp9b->thresh2; /* note: we don't set these three if cp9b_valid is TRUE */ } if(w != NULL) esl_stopwatch_Start(w); if(do_trunc) { if((status = cm_TrAlignSizeNeededHB(cm, errbuf, sq->L, mxsize, do_sample, do_post, NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR; if((status = cm_TrAlignHB(cm, errbuf, sq->dsq, sq->L, mxsize, mode, pass_idx, do_optacc, do_sample, cm->trhb_mx, cm->trhb_shmx, cm->trhb_omx, cm->trhb_emx, r, do_post ? &ppstr : NULL, &tr, NULL, &pp, &sc)) != eslOK) goto ERROR; } else { if((status = cm_AlignSizeNeededHB(cm, errbuf, sq->L, mxsize, do_sample, do_post, NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR; if((status = cm_AlignHB(cm, errbuf, sq->dsq, sq->L, mxsize, do_optacc, do_sample, cm->hb_mx, cm->hb_shmx, cm->hb_omx, cm->hb_emx, r, do_post ? 
&ppstr : NULL, &tr, &pp, &sc)) != eslOK) goto ERROR; } /* add size of CP9 matrices used for calculating bands */ mb_tot += ((float) cm->cp9_mx->ncells_valid * sizeof(int)) / 1000000.; mb_tot += ((float) cm->cp9_bmx->ncells_valid * sizeof(int)) / 1000000.; if(do_sub) { /* add size of original CM's CP9 matrices used for calculating start/end position */ mb_tot += ((float) orig_cm->cp9_mx->ncells_valid * sizeof(int)) / 1000000.; mb_tot += ((float) orig_cm->cp9_bmx->ncells_valid * sizeof(int)) / 1000000.; } } } if(w != NULL) esl_stopwatch_Stop(w); secs_aln = (w == NULL) ? 0. : w->elapsed; if(do_sub) { /* convert sub cm parsetree to a full CM parsetree */ if((status = sub_cm2cm_parsetree(orig_cm, cm, &full_tr, tr, submap, 0)) != eslOK) ESL_XFAIL(status, errbuf, "out of memory, converting sub parsetree to full parsetree"); /* free sub data structures, we're done with them */ FreeParsetree(tr); tr = full_tr; FreeCM(cm); cm = orig_cm; FreeSubMap(submap); submap = NULL; } /* determine start and end points of the parsetree, * but only if we're not in a search/scan pipeline */ if(idx != -1) { /* we're not in a search/scan pipeline */ if((status = ParsetreeToCMBounds(cm, tr, TRUE, TRUE, errbuf, NULL, NULL, NULL, NULL, &spos, &epos)) != eslOK) goto ERROR; } /* create and fill data */ ESL_ALLOC(data, sizeof(CM_ALNDATA)); data->sq = sq; data->idx = idx; data->tr = tr; data->sc = sc; data->pp = (do_post) ? pp : 0.; data->ppstr = (do_post) ? ppstr : NULL; data->spos = spos; data->epos = epos; data->secs_bands = (do_nonbanded) ? 0. : secs_bands; data->secs_aln = secs_aln; data->mb_tot = mb_tot; data->tau = tau; data->thresh1 = thresh1; data->thresh2 = thresh2; if(w_tot != NULL) esl_stopwatch_Stop(w_tot); data->secs_tot = (w_tot == NULL) ? 0. 
: w_tot->elapsed; *ret_data = data; cm->tau = save_tau; if(cm->cp9b != NULL) { cm->cp9b->thresh1 = save_thresh1; cm->cp9b->thresh2 = save_thresh2; } return eslOK; ERROR: cm->tau = save_tau; if(cm->cp9b != NULL) { cm->cp9b->thresh1 = save_thresh1; cm->cp9b->thresh2 = save_thresh2; } if(data != NULL) cm_alndata_Destroy(data, FALSE); *ret_data = NULL; if(status == eslEMEM) ESL_FAIL(status, errbuf, "DispatchSqAlignment(), out of memory"); return status; }
/**
 * int main(int argc, char **argv)
 * Main driver.
 *
 * Reads every HMM from <input hmmfile>. For each one it:
 *   - sums the transition vectors t[k] over k = 1..M-1;
 *   - normalizes the match (3), insert (2) and delete (2) parts of that
 *     sum separately;
 *   - copies the resulting average back into t[1..M-1];
 *   - validates the HMM and writes it in ASCII to <output hmmfile>;
 *   - prints one line of summary statistics to stdout.
 *
 * Exits with status 0 on success. A bad command line exits with status 1
 * after printing usage. Failure to open either file, to validate an
 * averaged HMM, or to write it is reported through p7_Fail(); failures
 * while reading the input are reported through esl_fatal().
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;      /* command line processing                   */
  ESL_ALPHABET    *abc        = NULL;
  char            *hmmfile    = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp        = NULL;
  FILE            *outhmmfp   = NULL;      /* HMM output file handle                  */
  P7_HMM          *hmm        = NULL;
  P7_BG           *bg         = NULL;
  int              nhmm;
  double           x;
  float            KL;
  int              status;
  char             errbuf[eslERRBUFSIZE];
  float            average_internal_transitions[ p7H_NTRANSITIONS ];
  int              k;
  char             errmsg[eslERRBUFSIZE];

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE)
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2)
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL)
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL)
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);

  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Initializations: open the output HMM file for writing.
   * <status> is eslOK at this point, so the failure has to be reported
   * explicitly; returning <status> would make main() exit 0 silently.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL)
    p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Accumulate the transition vectors of nodes 1..M-1 */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);
      }
      /* Match transitions */
      esl_vec_FNorm(average_internal_transitions, 3);
      /* Insert transitions */
      esl_vec_FNorm(average_internal_transitions + 3, 2);
      /* Delete transitions */
      esl_vec_FNorm(average_internal_transitions + 5, 2);
      /* Ok now set them. */
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] );
      }

      /* Report validation and write failures instead of returning quietly */
      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001))       != eslOK) p7_Fail("HMM failed validation after averaging transitions: %s\n", errmsg);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm))   != eslOK) p7_Fail("HMM save failed\n");

      p7_MeanPositionRelativeEntropy(hmm, bg, &x);
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
             nhmm,
             hmm->name,
             hmm->acc == NULL ? "-" : hmm->acc,
             hmm->nseq,
             hmm->eff_nseq,
             hmm->M,
             p7_MeanMatchRelativeEntropy(hmm, bg),
             p7_MeanMatchInfo(hmm, bg),
             x,
             KL);

      /*	     p7_MeanForwardScore(hmm, bg)); */

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  if (outhmmfp != NULL) fclose(outhmmfp);
  esl_getopts_Destroy(go);
  exit(0);
}