/* Function:  p7_tracealign_MSA()
 * Synopsis:  Convert array of traces (for a previous MSA) to a new MSA.
 * Incept:    SRE, Mon Mar  2 18:18:22 2009 [Casa de Gatos]
 *
 * Purpose:   Identical to <p7_tracealign_Seqs()> except that the trace
 *            array <tr> accompanies a digital multiple alignment <premsa>,
 *            rather than an array of digital sequences.
 *
 *            This gets used in <p7_Builder()>, where we've
 *            constructed an array of faux traces directly from an
 *            input alignment, and we want to reconstruct the
 *            MSA that corresponds to what HMMER actually used
 *            to build its model (after trace doctoring to be
 *            compatible with Plan 7, and with <#=RF> annotation
 *            on assigned consensus columns).
 *
 *            The alignment name, description, and accession, plus each
 *            sequence's name, accession, description, and weight, are
 *            copied from <premsa> to the new MSA. The <eslMSA_HASWGTS>
 *            flag is propagated if <premsa> has it set.
 *
 * Args:      premsa      - original alignment; supplies alphabet, <nseq>,
 *                          and the annotation copied to the new MSA
 *            tr          - array of tracebacks, 0..premsa->nseq-1
 *            M           - length of model the traces refer to
 *            optflags    - flags controlling optional behaviours
 *                          (see <p7_tracealign_Seqs()>)
 *            ret_postmsa - RETURN: new multiple sequence alignment
 *
 * Returns:   <eslOK> on success, and <*ret_postmsa> points to a new
 *            <ESL_MSA>. Caller frees it with <esl_msa_Destroy()>.
 *
 * Throws:    Any non-<eslOK> status returned by the alignment
 *            construction or annotation helpers is passed back
 *            unchanged; <*ret_postmsa> is <NULL>.
 *
 * Xref:      J4/102.
 */
int
p7_tracealign_MSA(const ESL_MSA *premsa, P7_TRACE **tr, int M, int optflags, ESL_MSA **ret_postmsa)
{
  const ESL_ALPHABET *abc      = premsa->abc;
  ESL_MSA            *msa      = NULL;	/* RETURN: new MSA */
  int                *inscount = NULL;	/* array of max gaps between aligned columns */
  int                *matmap   = NULL;	/* matmap[k] = apos of match k; matmap[1..M] = [1..alen] */
  int                *matuse   = NULL;	/* TRUE if an alignment column is associated with match state k [1..M] */
  int                 idx;		/* counter over sequences */
  int                 alen;		/* width of alignment */
  int                 status;

  /* As in the original code, nothing is freed here on failure: the arrays
   * are owned by map_new_msa() until it succeeds (NOTE(review): confirm it
   * releases them itself on error). We only make sure the caller never
   * sees a stale result pointer.
   */
  if ((status = map_new_msa(tr, premsa->nseq, M, optflags, &inscount, &matuse, &matmap, &alen)) != eslOK)
    {
      *ret_postmsa = NULL;
      return status;
    }

  if (optflags & p7_DIGITIZE) { if ((status = make_digital_msa(NULL, premsa, tr, premsa->nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }
  else                        { if ((status = make_text_msa   (NULL, premsa, tr, premsa->nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }

  if ((status = annotate_rf(msa, M, matuse, matmap))                          != eslOK) goto ERROR;
  if ((status = annotate_posterior_probability(msa, tr, matmap, M, optflags)) != eslOK) goto ERROR;

  if (optflags & p7_DIGITIZE) rejustify_insertions_digital(     msa, inscount, matmap, matuse, M);
  else                        rejustify_insertions_text   (abc, msa, inscount, matmap, matuse, M);

  /* Transfer information from old MSA to new */
  esl_msa_SetName     (msa, premsa->name);
  esl_msa_SetDesc     (msa, premsa->desc);
  esl_msa_SetAccession(msa, premsa->acc);

  for (idx = 0; idx < premsa->nseq; idx++)
    {
      esl_msa_SetSeqName(msa, idx, premsa->sqname[idx]);

      /* The optional per-sequence arrays must be tested on the SOURCE
       * alignment: testing the new <msa> either never copies them or
       * indexes a NULL <premsa> array.
       */
      if (premsa->sqacc  != NULL && premsa->sqacc[idx]  != NULL) esl_msa_SetSeqAccession  (msa, idx, premsa->sqacc[idx]);
      if (premsa->sqdesc != NULL && premsa->sqdesc[idx] != NULL) esl_msa_SetSeqDescription(msa, idx, premsa->sqdesc[idx]);

      msa->wgt[idx] = premsa->wgt[idx];
    }

  if (premsa->flags & eslMSA_HASWGTS)
    msa->flags |= eslMSA_HASWGTS;

  free(inscount);
  free(matmap);
  free(matuse);
  *ret_postmsa = msa;
  return eslOK;

 ERROR:
  if (msa != NULL) esl_msa_Destroy(msa);
  free(inscount);	/* free(NULL) is a no-op */
  free(matmap);
  free(matuse);
  *ret_postmsa = NULL;
  return status;
}
/* Function:  p7_tracealign_Seqs()
 * Synopsis:  Convert array of traces (for a sequence array) to a new MSA.
 * Incept:    SRE, Tue Oct 21 19:40:33 2008 [Janelia]
 *
 * Purpose:   Convert an array of <nseq> traces <tr[0..nseq-1]>,
 *            corresponding to an array of digital sequences
 *            <sq[0..nseq-1]> aligned to a model of
 *            length <M>, to a new multiple sequence alignment.
 *            The new alignment structure is allocated here, and returned
 *            in <*ret_msa>.
 *
 *            As a special case, the traces may contain I->D and D->I
 *            transitions. This feature is used by <hmmalign --mapali>
 *            to reconstruct an input alignment without modification
 *            from trace doctoring.
 *
 *            <optflags> controls some optional behaviors in producing
 *            the alignment, as follows:
 *
 *            <p7_DIGITIZE>: creates the MSA in digital mode, as
 *            opposed to a default text mode.
 *
 *            <p7_ALL_CONSENSUS_COLS>: create a column for every
 *            consensus column in the model, even if it means having
 *            all gap characters (deletions) in a column; this
 *            guarantees that the alignment will have at least <M>
 *            columns. The default is to only show columns that have
 *            at least one residue in them.
 *
 *            <p7_TRIM>: trim off any residues that get assigned to
 *            flanking N,C states (in profile traces) or I_0 and I_M
 *            (in core traces).
 *
 *            The <optflags> can be combined by logical OR; for
 *            example, <p7_DIGITIZE | p7_ALL_CONSENSUS_COLS>.
 *
 *            Every sequence in the new MSA gets weight 1.0. Names are
 *            always copied from <sq>; accessions and descriptions are
 *            copied only when non-empty.
 *
 * Args:      sq       - array of digital sequences, 0..nseq-1
 *                       (<sq[0]> is read for the alphabet, so it must exist)
 *            tr       - array of tracebacks, 0..nseq-1
 *            nseq     - number of sequences
 *            M        - length of model sequences were aligned to
 *            optflags - flags controlling optional behaviours.
 *            ret_msa  - RETURN: new multiple sequence alignment
 *
 * Returns:   <eslOK> on success, and <*ret_msa> points to a new
 *            <ESL_MSA> object. Caller is responsible for free'ing
 *            this new MSA with <esl_msa_Destroy()>.
 *
 * Throws:    <eslEMEM> on allocation failure; <*ret_msa> is <NULL>.
 *
 * Notes:     * why a text mode, when most of HMMER works in digital
 *              sequences and alignments? Text mode MSAs are created
 *              for output, whereas digital mode MSAs are created for
 *              internal use. Text mode allows HMMER's output
 *              conventions to be used for match vs. insert columns:
 *              lowercase/. for residues/gaps in inserts, uppercase/-
 *              for residues/gaps in match columns.
 *
 *            * why not pass HMM as an argument, so we can transfer
 *              column annotation? In <p7_tophits_Alignment()>, the
 *              HMM is unavailable -- because of constraints of what's
 *              made available to the master process in an MPI
 *              implementation. (We could make the HMM an optional
 *              argument.)
 */
int
p7_tracealign_Seqs(ESL_SQ **sq, P7_TRACE **tr, int nseq, int M, int optflags, ESL_MSA **ret_msa)
{
  ESL_MSA            *msa      = NULL;	/* RETURN: new MSA */
  const ESL_ALPHABET *abc      = sq[0]->abc;
  int                *inscount = NULL;	/* array of max gaps between aligned columns */
  int                *matmap   = NULL;	/* matmap[k] = apos of match k; matmap[1..M] = [1..alen] */
  int                *matuse   = NULL;	/* TRUE if an alignment column is associated with match state k [1..M] */
  int                 idx;		/* counter over sequences */
  int                 alen;		/* width of alignment */
  int                 status;

  /* Honor the documented contract (<*ret_msa> is NULL on any throw) on this
   * early exit too. As in the original code, the work arrays are not freed
   * here (NOTE(review): confirm map_new_msa() releases them on error).
   */
  if ((status = map_new_msa(tr, nseq, M, optflags, &inscount, &matuse, &matmap, &alen)) != eslOK)
    {
      *ret_msa = NULL;
      return status;
    }

  if (optflags & p7_DIGITIZE) { if ((status = make_digital_msa(sq, NULL, tr, nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }
  else                        { if ((status = make_text_msa   (sq, NULL, tr, nseq, matuse, matmap, M, alen, optflags, &msa)) != eslOK) goto ERROR; }

  if ((status = annotate_rf(msa, M, matuse, matmap))                          != eslOK) goto ERROR;
  if ((status = annotate_posterior_probability(msa, tr, matmap, M, optflags)) != eslOK) goto ERROR;

  if (optflags & p7_DIGITIZE) rejustify_insertions_digital(     msa, inscount, matmap, matuse, M);
  else                        rejustify_insertions_text   (abc, msa, inscount, matmap, matuse, M);

  /* Per-sequence annotation: name always; accession/description only if non-empty */
  for (idx = 0; idx < nseq; idx++)
    {
      esl_msa_SetSeqName(msa, idx, sq[idx]->name);
      if (sq[idx]->acc[0]  != '\0') esl_msa_SetSeqAccession  (msa, idx, sq[idx]->acc);
      if (sq[idx]->desc[0] != '\0') esl_msa_SetSeqDescription(msa, idx, sq[idx]->desc);
      msa->wgt[idx] = 1.0;
      if (msa->sqlen != NULL) msa->sqlen[idx] = sq[idx]->n;
    }

  free(inscount);
  free(matmap);
  free(matuse);
  *ret_msa = msa;
  return eslOK;

 ERROR:
  if (msa != NULL) esl_msa_Destroy(msa);
  free(inscount);	/* free(NULL) is a no-op */
  free(matmap);
  free(matuse);
  *ret_msa = NULL;
  return status;
}
/* Function:  esl_msafile_a2m_Read()
 * Synopsis:  Read a UCSC A2M format alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for UCSC A2M (SAM) format. Create a new MSA,
 *            and return a ptr to it in <*ret_msa>. Caller is responsible
 *            for freeing this <ESL_MSA>.
 *
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'X',
 *            and insert (lowercase) columns are marked '.' in the RF
 *            annotation line.
 *
 *            This input parser can deal both with "dotless" A2M, and
 *            full A2M format with dots.
 *
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
 *            allocated MSA, and <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. <*ret_msa> is
 *            set to <NULL>.
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. Parse errors include a record that has a name
 *            line but no sequence lines, and a record whose number
 *            of consensus columns differs from the first record.
 *            <afp> contains information sufficient for
 *            constructing useful diagnostic output:
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa        = NULL;
  char    **csflag     = NULL;	/* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
  int       csalloc    = 0;	/* # of slots in csflag[] that are allocated and initialized       */
  int      *nins       = NULL;	/* # of inserted residues before each consensus col [0..ncons-1]    */
  int      *this_nins  = NULL;	/* # of inserted residues before each consensus residue in this seq */
  int       nseq       = 0;
  int       ncons      = 0;
  int       idx;
  int64_t   thislen;
  int64_t   spos;
  int       this_ncons;
  int       cpos, bpos;
  char     *p, *tok;
  esl_pos_t n,  toklen;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );

  afp->errmsg[0] = '\0';

#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }

  ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
  csalloc = msa->sqalloc;
  for (idx = 0; idx < csalloc; idx++) csflag[idx] = NULL;

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if (status != eslOK) goto ERROR; /* includes normal EOF */

  /* tolerate sloppy space at start of name/desc line */
  while (n && isspace((unsigned char) *p)) { p++; n--; }

  /* <p> is not NUL-terminated: don't look at *p once the line is exhausted */
  if (n == 0 || *p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");

  do {   /* for each record starting in '>': */
    p++; n--; 			/* advance past > */

    if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");

    if (nseq >= msa->sqalloc) {
      if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
      ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
      for (idx = csalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
      csalloc = msa->sqalloc;	/* only after the new slots are valid, so ERROR frees exactly what exists */
    }

    if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
    if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;

    /* now for each sequence line... */
    thislen    = 0;		/* count of lowercase, uppercase, and '-': w/o dots, on first pass */
    this_ncons = 0;		/* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
    if (nseq) {
      /* <this_nins> was allocated while reading the first record (nseq==0),
       * which is required below to contain at least one sequence line.
       */
      for (cpos = 0; cpos <= ncons; cpos++)
	this_nins[cpos] = 0;
    }

    while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
      {
	while (n && isspace((unsigned char) *p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
	if (n  == 0)   continue;	                       /* tolerate and skip blank lines */
	if (*p == '>') break;

	ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
	if (nseq == 0) {
	  /* On a continuation line, this_nins[this_ncons] already holds the
	   * inserts that trailed the previous line; keep that count and only
	   * zero the newly reachable slots.
	   */
	  int first_new = (this_nins == NULL) ? 0 : this_ncons + 1;

	  ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
	  for (cpos = first_new; cpos <= this_ncons+n; cpos++)
	    this_nins[cpos] = 0;
	}

	for (spos = thislen, bpos = 0; bpos < n; bpos++)
	  {
	    unsigned char c = (unsigned char) p[bpos];

	    if      (c == 'O')   continue;
	    else if (isupper(c)) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
	    else if (islower(c)) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
	    else if (c == '-')   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }

	    /* Test on <nseq>, not <ncons>: when the first record had zero
	     * consensus columns, <ncons> is 0 yet <this_nins> is still only
	     * as large as the first record made it, so the bound must hold.
	     */
	    if (nseq && this_ncons > ncons)  ESL_XFAIL(eslEFORMAT, afp->errmsg,  "unexpected # of consensus residues, didn't match previous seq(s)");
	  }
	csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */

#ifdef eslAUGMENT_ALPHABET
	if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); }
#endif
	if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
	if (status == eslEINVAL)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
	else if (status != eslOK)  goto ERROR;
	ESL_DASSERT1( (spos == thislen) );
      }
    if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
    /* status == OK: then *p == '>'. status == eslEOF: we're eof.  status == anything else: error */

    /* A record with a name but no sequence lines leaves csflag[nseq] (and,
     * for the first record, this_nins) unallocated; reject it rather than
     * dereference NULL below.
     */
    if (csflag[nseq] == NULL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no sequence data found for A2M record");

    /* Finished reading a sequence record. */
    if (nseq == 0)
      {
	ncons = this_ncons;
	ESL_ALLOC(nins, sizeof(int) * (ncons+1));
	for (cpos = 0; cpos <= ncons; cpos++)
	  nins[cpos] = this_nins[cpos];
      }
    else
      {
	if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
	for (cpos = 0; cpos <= ncons; cpos++)
	  nins[cpos]      = ESL_MAX(nins[cpos], this_nins[cpos]);
      }
    nseq++;
  } while (status == eslOK);

  /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
   * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
   * nins[0..ncons] tells us the max number of inserted residues before each consensus column
   * This is sufficient information to reconstruct each aligned sequence.
   */
  msa->nseq = nseq;
#ifdef eslAUGMENT_ALPHABET
  if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
#endif
  if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  *ret_msa  = msa;
  free(nins);
  free(this_nins);
  for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);	/* unused slots are NULL */
  free(csflag);
  return eslOK;

 ERROR:
  free(nins);
  free(this_nins);
  if (csflag) {
    /* msa->nseq is not set until parsing completes, so walk every slot */
    for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);
    free(csflag);
  }
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}