/* Function: p7_Builder() * Synopsis: Build a new HMM from an MSA. * * Purpose: Take the multiple sequence alignment <msa> and a build configuration <bld>, * and build a new HMM. * * Effective sequence number determination and calibration steps require * additionally providing a null model <bg>. * * Args: bld - build configuration * msa - multiple sequence alignment * bg - null model * opt_hmm - optRETURN: new HMM * opt_trarr - optRETURN: array of faux tracebacks, <0..nseq-1> * opt_gm - optRETURN: profile corresponding to <hmm> * opt_om - optRETURN: optimized profile corresponding to <gm> * opt_postmsa - optRETURN: RF-annotated, possibly modified MSA * * Returns: <eslOK> on success. The new HMM is optionally returned in * <*opt_hmm>, along with optional returns of an array of faux tracebacks * for each sequence in <*opt_trarr>, the annotated MSA used to construct * the model in <*opt_postmsa>, a configured search profile in * <*opt_gm>, and an optimized search profile in <*opt_om>. These are * all optional returns because the caller may, for example, be interested * only in an optimized profile, or may only be interested in the HMM. * * Returns <eslENORESULT> if no consensus columns were annotated. * Returns <eslEFORMAT> on MSA format problems, such as a missing RF annotation * line in hand architecture construction. On any returned error, * <bld->errbuf> contains an informative error message. * * Throws: <eslEMEM> on allocation error. * <eslEINVAL> if relative weights couldn't be calculated from <msa>. * * Xref: J4/30. */ int p7_Builder(P7_BUILDER *bld, ESL_MSA *msa, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE ***opt_trarr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om, ESL_MSA **opt_postmsa) { int i,j; uint32_t checksum = 0; /* checksum calculated for the input MSA. hmmalign --mapali verifies against this. */ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; P7_TRACE ***tr_ptr = (opt_trarr != NULL || opt_postmsa != NULL) ? 
&tr : NULL; int status; if ((status = validate_msa (bld, msa)) != eslOK) goto ERROR; if ((status = esl_msa_Checksum (msa, &checksum)) != eslOK) ESL_XFAIL(status, bld->errbuf, "Failed to calculate checksum"); if ((status = relative_weights (bld, msa)) != eslOK) goto ERROR; if ((status = esl_msa_MarkFragments(msa, bld->fragthresh)) != eslOK) goto ERROR; if ((status = build_model (bld, msa, &hmm, tr_ptr)) != eslOK) goto ERROR; //Ensures that the weighted-average I->I count <= bld->max_insert_len //(MI currently contains the number of observed insert-starts) if (bld->max_insert_len>0) for (i=1; i<hmm->M; i++ ) hmm->t[i][p7H_II] = ESL_MIN(hmm->t[i][p7H_II], bld->max_insert_len*hmm->t[i][p7H_MI]); if ((status = effective_seqnumber (bld, msa, hmm, bg)) != eslOK) goto ERROR; if ((status = parameterize (bld, hmm)) != eslOK) goto ERROR; if ((status = annotate (bld, msa, hmm)) != eslOK) goto ERROR; if ((status = calibrate (bld, hmm, bg, opt_gm, opt_om)) != eslOK) goto ERROR; if ((status = make_post_msa (bld, msa, hmm, tr, opt_postmsa)) != eslOK) goto ERROR; //force masked positions to background (it'll be close already, so no relevant impact on weighting) if (hmm->mm != NULL) for (i=1; i<hmm->M; i++ ) if (hmm->mm[i] == 'm') for (j=0; j<hmm->abc->K; j++) hmm->mat[i][j] = bg->f[j]; if ( bld->abc->type == eslDNA || bld->abc->type == eslRNA ) { if (bld->w_len > 0) hmm->max_length = bld->w_len; else if (bld->w_beta == 0.0) hmm->max_length = hmm->M *4; else if ( (status = p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR; } hmm->checksum = checksum; hmm->flags |= p7H_CHKSUM; if (opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm); if (opt_trarr != NULL) *opt_trarr = tr; else p7_trace_DestroyArray(tr, msa->nseq); return eslOK; ERROR: p7_hmm_Destroy(hmm); p7_trace_DestroyArray(tr, msa->nseq); if (opt_gm != NULL) p7_profile_Destroy(*opt_gm); if (opt_om != NULL) p7_oprofile_Destroy(*opt_om); return status; }
/* map_alignment()
 *
 * Purpose:   Read the alignment in <msafile> (the --mapali alignment) and
 *            convert it into per-sequence digital sequences and faux
 *            traces relative to <hmm>, using the HMM's stored column map.
 *
 *            The HMM must carry both a checksum (p7H_CHKSUM) and a map
 *            (p7H_MAP), and the checksum of the MSA that was read must
 *            equal <hmm->checksum>. Any violation is reported through
 *            esl_fatal(), as are failures to checksum the MSA, to build
 *            the traces, or to extract a sequence.
 *
 * Args:      msafile  - name of the alignment file to open
 *            hmm      - model the alignment is mapped onto
 *            ret_sq   - RETURN: array of <*ret_ntot> sequences
 *            ret_tr   - RETURN: array of <*ret_ntot> traces
 *            ret_ntot - RETURN: number of sequences in the alignment
 *
 * Returns:   <eslOK> on success. The <*ret_sq> and <*ret_tr> arrays are
 *            not freed here on success; they are handed to the caller.
 *
 *            On allocation failure, returns the failing status with
 *            <*ret_sq> and <*ret_tr> set to NULL and <*ret_ntot> to 0;
 *            nothing allocated by this function is left behind.
 */
static int
map_alignment(const char *msafile, const P7_HMM *hmm, ESL_SQ ***ret_sq, P7_TRACE ***ret_tr, int *ret_ntot)
{
  ESL_SQ       **sq        = NULL;
  P7_TRACE     **tr        = NULL;
  ESLX_MSAFILE  *afp       = NULL;
  ESL_MSA       *msa       = NULL;
  ESL_ALPHABET  *abc       = (ESL_ALPHABET *) hmm->abc; /* removing const'ness to make compiler happy. Safe. */
  int           *matassign = NULL;
  uint32_t       chksum    = 0;
  int            i, k;
  int            status;

  status = eslx_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  status = eslx_msafile_Read(afp, &msa);
  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

  if (! (hmm->flags & p7H_CHKSUM) ) esl_fatal("HMM has no checksum. --mapali unreliable without it.");
  if (! (hmm->flags & p7H_MAP)    ) esl_fatal("HMM has no map. --mapali can't work without it.");

  /* A failed checksum computation is reported as such, rather than being
   * misreported as a checksum mismatch.
   */
  if (esl_msa_Checksum(msa, &chksum) != eslOK)
    esl_fatal("Failed to calculate checksum for --mapali MSA %s", msafile);
  if (hmm->checksum != chksum)
    esl_fatal("--mapali MSA %s isn't same as the one HMM came from (checksum mismatch)", msafile);

  ESL_ALLOC(sq,        sizeof(ESL_SQ *)   * msa->nseq);
  ESL_ALLOC(tr,        sizeof(P7_TRACE *) * msa->nseq);
  ESL_ALLOC(matassign, sizeof(int)        * (msa->alen + 1));

  /* matassign[] flags the alignment columns listed in hmm->map[1..M];
   * every other entry stays 0.
   */
  esl_vec_ISet(matassign, msa->alen+1, 0);
  for (k = 1; k <= hmm->M; k++)
    matassign[hmm->map[k]] = 1;

  /* The 'faux' core traces constructed by FauxFromMSA() may contain
   * D->I and I->D transitions. They may *only* now be passed to
   * p7_tracealign_Seqs(), which can deal with these 'illegal'
   * transitions, in order to exactly reproduce the input --mapali
   * alignment.
   */
  if (p7_trace_FauxFromMSA(msa, matassign, p7_DEFAULT, tr) != eslOK)
    esl_fatal("Failed to construct traces from --mapali MSA %s", msafile);

  for (i = 0; i < msa->nseq; i++)
    {
      if (esl_sq_FetchFromMSA(msa, i, &(sq[i])) != eslOK)
        esl_fatal("Failed to extract sequence %d from --mapali MSA %s", i, msafile);
    }

  *ret_ntot = msa->nseq;
  *ret_tr   = tr;
  *ret_sq   = sq;

  eslx_msafile_Close(afp);
  esl_msa_Destroy(msa);
  free(matassign);
  return eslOK;

 ERROR:
  /* Only the ESL_ALLOC() calls jump here, so <sq> and <tr> hold no
   * constructed elements yet: freeing the arrays themselves is enough.
   */
  *ret_ntot = 0;
  *ret_tr   = NULL;
  *ret_sq   = NULL;
  free(sq);
  free(tr);
  free(matassign);
  if (afp != NULL) eslx_msafile_Close(afp);
  if (msa != NULL) esl_msa_Destroy(msa);
  return status;
}