/* validate_msa:
 * SRE, Thu Dec 3 16:10:31 2009 [J5/119; bug #h70 fix]
 *
 * HMMER uses a convention for missing data characters: they
 * indicate that a sequence is a fragment. (See
 * esl_msa_MarkFragments()).
 *
 * Because of the way these fragments will be handled in tracebacks,
 * we reject any alignment that uses missing data characters in any
 * other way.
 *
 * Each aligned sequence is scanned left to right over columns
 * 1..alen as three consecutive runs:
 *    1. leading missing-data symbols   (may be empty)
 *    2. non-missing symbols            (may be empty)
 *    3. trailing missing-data symbols  (may be empty)
 * If the scan does not finish exactly at column alen+1, a missing
 * data symbol was followed by a non-missing one somewhere after the
 * body began, i.e. missing data occurs somewhere other than at the
 * edges, and the alignment is rejected.
 *
 * The column bound is tested before the symbol is read, so the scan
 * never looks at ax[idx][alen+1] and does not depend on what is
 * stored there.
 *
 * Args:    bld - builder; only bld->errbuf is used, as the
 *                destination of the failure message.
 *          msa - alignment to check; reads nseq, alen, abc, ax[][],
 *                name and sqname[].
 *
 * Returns: <eslOK> if every sequence passes. Otherwise leaves via
 *          ESL_FAIL() with <eslEINVAL> and a message naming the
 *          alignment and the first offending sequence.
 *
 * NOTE(review): msa->name is passed to a %s conversion; confirm that
 * callers guarantee it is non-NULL at this point.
 *
 * This validation step costs negligible time.
 */
static int
validate_msa(P7_BUILDER *bld, ESL_MSA *msa)
{
  int     idx;
  int64_t apos;

  for (idx = 0; idx < msa->nseq; idx++)
    {
      apos = 1;
      /* 1. leading run of missing data (left fragment edge) */
      while (apos <= msa->alen &&   esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;
      /* 2. sequence body: anything that is not missing data */
      while (apos <= msa->alen && ! esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;
      /* 3. trailing run of missing data (right fragment edge) */
      while (apos <= msa->alen &&   esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;

      /* Stopping short of alen+1 means the third run hit a non-missing symbol. */
      if (apos != msa->alen+1)
        ESL_FAIL(eslEINVAL, bld->errbuf, "msa %s; sequence %s\nhas missing data chars (~) other than at fragment edges", msa->name, msa->sqname[idx]);
    }

  return eslOK;
}
/* Function:  p7_Fastmodelmaker()
 *
 * Purpose:   Heuristic model construction.
 *            Decide which alignment columns become MATCH columns
 *            using a simple occupancy rule, then hand the result to
 *            matassign2hmm() to build the model.
 *
 *            For each column, the weights of sequences that have a
 *            residue there are summed, as are the weights of
 *            sequences that have either a residue or a gap there.
 *            A column is assigned MATCH when the residue weight is
 *            positive and (residue weight) / (residue+gap weight)
 *            is >= <symfrac>. For instance, with <symfrac> = 0.5,
 *            columns with >= 50% residues are assigned to match...
 *            roughly speaking.
 *
 *            "Roughly speaking" because sequences may be weighted
 *            in the input <msa>, and because missing data symbols
 *            contribute to neither sum, in order to deal with
 *            sequence fragments.
 *
 *            The <msa> must be in digital mode.
 *
 *            If the caller wants to designate any sequences as
 *            fragments, it does so by converting all N-terminal and
 *            C-terminal flanking gap symbols to missing data symbols.
 *
 *            NOTE: p7_Fastmodelmaker() will slightly revise the
 *            alignment if the assignment of columns implies
 *            DI and ID transitions.
 *
 *            The HMM is returned in counts form (ready for applying
 *            Dirichlet priors as the next step), along with a fake
 *            traceback for each training sequence. Both are produced
 *            by matassign2hmm().
 *
 *            Models must have at least one node, so if the <msa>
 *            defined no consensus columns, a <eslENORESULT> error is
 *            returned.
 *
 * Args:      msa     - multiple sequence alignment
 *            symfrac - threshold for residue occupancy; >= assigns MATCH
 *            bld     - holds information on regions requiring masking,
 *                      optionally NULL -> no masking. (Not referenced
 *                      in this function's body.)
 *            ret_hmm - RETURN: counts-form HMM
 *            opt_tr  - optRETURN: array of tracebacks for aseq's
 *
 * Return:    <eslOK> on success. ret_hmm and opt_tr allocated here,
 *            and must be free'd by the caller (FreeTrace(tr[i]), free(tr),
 *            FreeHMM(hmm)).
 *
 *            Returns <eslENORESULT> if no consensus columns were annotated;
 *            in this case, <ret_hmm> and <opt_tr> are returned NULL.
 *
 * Throws:    <eslEMEM> on allocation failure; <eslEINVAL> if the
 *            <msa> isn't in digital mode.
 */
int
p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int    status;             /* return status flag                    */
  int   *matassign = NULL;   /* per-column MATCH flags, indexed 1..alen */
  int    seqi;               /* index over sequences                  */
  int    col;                /* index over aligned columns            */
  float  res_wgt;            /* weighted residue count in a column    */
  float  occ_wgt;            /* weighted residue+gap count in a column */

  if ((msa->flags & eslMSA_DIGITAL) == 0) ESL_XEXCEPTION(eslEINVAL, "need digital MSA");

  /* One flag per column; slot 0 is allocated but unused. */
  ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));

  /* Weighted occupancy test, column by column. */
  for (col = 1; col <= msa->alen; col++)
    {
      res_wgt = 0.;
      occ_wgt = 0.;

      for (seqi = 0; seqi < msa->nseq; seqi++)
        {
          if (esl_abc_XIsResidue(msa->abc, msa->ax[seqi][col]))
            {
              occ_wgt += msa->wgt[seqi];
              res_wgt += msa->wgt[seqi];
            }
          else if (esl_abc_XIsGap(msa->abc, msa->ax[seqi][col]))
            {
              occ_wgt += msa->wgt[seqi];
            }
          /* any other symbol, missing data included, counts toward neither sum */
        }

      matassign[col] = (res_wgt > 0. && res_wgt / occ_wgt >= symfrac) ? TRUE : FALSE;
    }

  /* Once we have matassign calculated, modelmakers behave
   * the same; matassign2hmm() does this stuff (traceback construction,
   * trace counting) and sets up ret_hmm and opt_tr.
   */
  status = matassign2hmm(msa, matassign, ret_hmm, opt_tr);
  if (status != eslOK)
    {
      fprintf (stderr, "hmm construction error during trace counting\n");
      goto ERROR;
    }

  free(matassign);
  return eslOK;

 ERROR:
  free(matassign);   /* still NULL if we failed before allocating; free(NULL) is a no-op */
  return status;
}
/* Function:  p7_Alimask_MakeModel2AliMap()
 * Synopsis:  Compute map of coordinate in the alignment corresponding to each model position.
 *
 * Purpose:   Walk alignment columns 1..alen in order and record, in
 *            <map>, the column number of every column that belongs
 *            to the model. Entries are written to map[0], map[1], ...
 *            and hold 1-based alignment column numbers.
 *
 *            With <do_hand> set, a column is included when its RF
 *            annotation character is not a gap. Otherwise a column
 *            is included when its weighted residue count is positive
 *            and (weighted residues) / (weighted residues + gaps)
 *            is >= <symfrac>; missing data symbols count toward
 *            neither sum.
 *
 * Args:      msa     - The alignment for which the mapped model is to be computed. We assume
 *                      the MSA has already been manipulated to account for model building
 *                      flags (e.g. weighting).
 *            do_hand - TRUE when the model is to follow a hand-built RF line (which must be
 *                      part of the file). If msa->rf is NULL, p7_Fail() is called.
 *            symfrac - if weighted occupancy is >= this value, include the column in the model.
 *            map     - int array into which the map values will be stored. Calling function
 *                      must allocate (msa->alen+1) ints.
 *
 * Returns:   The number of mapped model positions.
 */
int
p7_Alimask_MakeModel2AliMap(ESL_MSA *msa, int do_hand, float symfrac, int *map )
{
  int    nmapped = 0;   /* model positions recorded so far        */
  int    col;           /* index over aligned columns, 1..alen    */
  int    seqi;          /* index over sequences                   */
  float  res_wgt;       /* weighted residue count in a column     */
  float  occ_wgt;       /* weighted residue+gap count in a column */

  if (do_hand)
    {
      /* NOTE(review): p7_Fail() presumably does not return; otherwise rf would be dereferenced below. */
      if (msa->rf == NULL) p7_Fail("Model file does not contain an RF line, required for --hand.\n");

      /* Watch for off-by-one: columns are 1..alen, rf is [0..alen-1]. */
      for (col = 1; col <= msa->alen; col++)
        if (! esl_abc_CIsGap(msa->abc, msa->rf[col-1]))
          map[nmapped++] = col;

      return nmapped;
    }

  /* No hand annotation: apply the weighted occupancy rule to each column. */
  for (col = 1; col <= msa->alen; col++)
    {
      res_wgt = 0.;
      occ_wgt = 0.;

      for (seqi = 0; seqi < msa->nseq; seqi++)
        {
          if (esl_abc_XIsResidue(msa->abc, msa->ax[seqi][col]))
            {
              occ_wgt += msa->wgt[seqi];
              res_wgt += msa->wgt[seqi];
            }
          else if (esl_abc_XIsGap(msa->abc, msa->ax[seqi][col]))
            {
              occ_wgt += msa->wgt[seqi];
            }
          /* any other symbol, missing data included, counts toward neither sum */
        }

      if (res_wgt > 0. && res_wgt / occ_wgt >= symfrac)
        map[nmapped++] = col;
    }

  return nmapped;
}