/* squid_xdistance()
 *
 * Fractional difference between two digital sequences <x1> and <x2>,
 * compared position by position using alphabet <a>.
 *
 * The scan starts at the symbols <x1> and <x2> point to and ends when
 * <x1> reaches eslDSQ_SENTINEL. <x2> is advanced in lockstep and is not
 * itself tested for the sentinel. Positions where either symbol is a gap
 * are skipped and do not count as compared.
 *
 * Returns: (# of differing positions) / (# of compared positions),
 *          or 0.0 when no position was compared.
 */
static double
squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2)
{
  int ncompared = 0;   /* positions where neither sequence has a gap */
  int nmismatch = 0;   /* compared positions where the symbols differ */

  while (*x1 != eslDSQ_SENTINEL)
    {
      if (! esl_abc_XIsGap(a, *x1) && ! esl_abc_XIsGap(a, *x2))
        {
          ncompared++;
          if (*x1 != *x2) nmismatch++;
        }
      x1++;
      x2++;
    }

  if (ncompared > 0) return (double) nmismatch / (double) ncompared;
  return 0.0;
}
/* Function: p7_Fastmodelmaker()
 *
 * Purpose:  Heuristic model construction.
 *           Construct an HMM from an alignment by a simple rule based
 *           on the fractional occupancy of each column (residues vs.
 *           gaps). A column whose weighted fractional occupancy is
 *           $\geq$ <symfrac> is assigned as a MATCH column; for
 *           instance, with <symfrac> = 0.5, columns with $\geq$ 50\%
 *           residues become match columns... roughly speaking.
 *
 *           "Roughly speaking" because sequences may be weighted in
 *           the input <msa>, and because missing data symbols are
 *           ignored (they count neither as residue nor as gap), in
 *           order to deal with sequence fragments.
 *
 *           The <msa> must be in digital mode.
 *
 *           If the caller wants to designate any sequences as
 *           fragments, it does so by converting all N-terminal and
 *           C-terminal flanking gap symbols to missing data symbols.
 *
 *           NOTE: p7_Fastmodelmaker() will slightly revise the
 *           alignment if the assignment of columns implies DI and ID
 *           transitions.
 *
 *           The HMM is returned in counts form (ready for applying
 *           Dirichlet priors as the next step), together with a fake
 *           traceback for each training sequence. Both are produced by
 *           matassign2hmm() from the match assignment computed here.
 *
 *           Models must have at least one node, so if the <msa>
 *           defined no consensus columns, an <eslENORESULT> error is
 *           returned.
 *
 * Args:     msa     - multiple sequence alignment (digital mode)
 *           symfrac - threshold for residue occupancy; >= assigns MATCH
 *           bld     - builder holding information on regions requiring
 *                     masking, optionally NULL; not referenced in this
 *                     function's body
 *           ret_hmm - RETURN: counts-form HMM
 *           opt_tr  - optRETURN: array of tracebacks for aseq's
 *
 * Return:   <eslOK> on success. <ret_hmm> and <opt_tr> are allocated
 *           here and must be free'd by the caller (FreeTrace(tr[i]),
 *           free(tr), FreeHMM(hmm)).
 *
 *           Returns <eslENORESULT> if no consensus columns were
 *           annotated; in this case, <ret_hmm> and <opt_tr> are
 *           returned NULL.
 *
 * Throws:   <eslEMEM> on allocation failure; <eslEINVAL> if the
 *           <msa> isn't in digital mode.
 */
int
p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int   *matassign = NULL;   /* 1..alen; TRUE where the column is a match column */
  int    col;                /* aligned column, 1..alen                          */
  int    seq;                /* sequence index, 0..nseq-1                        */
  float  occ;                /* weighted residue count for the current column    */
  float  denom;              /* weighted residue+gap count for the column        */
  int    status;

  if ((msa->flags & eslMSA_DIGITAL) == 0) ESL_XEXCEPTION(eslEINVAL, "need digital MSA");

  /* One flag per column; index 0 is unused so that flags share <ax> coords. */
  ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));

  /* Weighted occupancy of every column decides match vs. insert. */
  for (col = 1; col <= msa->alen; col++)
    {
      occ   = 0.;
      denom = 0.;
      for (seq = 0; seq < msa->nseq; seq++)
        {
          if (esl_abc_XIsResidue(msa->abc, msa->ax[seq][col]))
            {
              occ   += msa->wgt[seq];
              denom += msa->wgt[seq];
            }
          else if (esl_abc_XIsGap(msa->abc, msa->ax[seq][col]))
            {
              denom += msa->wgt[seq];
            }
          else if (esl_abc_XIsMissing(msa->abc, msa->ax[seq][col]))
            {
              continue;   /* missing data: counted in neither tally */
            }
        }

      /* occ > 0 also guards the division against an all-missing column */
      matassign[col] = (occ > 0. && occ / denom >= symfrac) ? TRUE : FALSE;
    }

  /* From here on all model makers share matassign2hmm(): it builds the
   * tracebacks, counts them, and fills in <ret_hmm> and <opt_tr>.
   */
  status = matassign2hmm(msa, matassign, ret_hmm, opt_tr);
  if (status != eslOK)
    {
      fprintf (stderr, "hmm construction error during trace counting\n");
      goto ERROR;
    }

  free(matassign);
  return eslOK;

 ERROR:
  free(matassign);
  return status;
}
/* get_gaps_per_column
 *
 * Given a digital-mode MSA, count the gap symbols in each aligned
 * column and hand back a newly allocated array with this info in
 * <*ret_ngaps>. The array has msa->alen+1 entries and is indexed by
 * column 1..alen (entry 0 stays 0). The caller frees it.
 *
 * Returns <eslOK> on success; <eslEINVAL> if <msa> is not digital
 * (in which case <*ret_ngaps> is left untouched). An allocation
 * failure is reported through ESL_ALLOC's status.
 */
static int
get_gaps_per_column(ESL_MSA *msa, int **ret_ngaps)
{
  int *counts = NULL;
  int  col;
  int  seq;
  int  status;

  /* contract check: nothing has been allocated yet, so just bail out */
  if ((msa->flags & eslMSA_DIGITAL) == 0) return eslEINVAL;

  ESL_ALLOC(counts, sizeof(int) * (msa->alen+1));
  esl_vec_ISet(counts, msa->alen+1, 0);

  for (col = 1; col <= msa->alen; col++)
    for (seq = 0; seq < msa->nseq; seq++)
      counts[col] += esl_abc_XIsGap(msa->abc, msa->ax[seq][col]);

  *ret_ngaps = counts;
  return eslOK;

 ERROR:
  free(counts);
  return status;
}
/* Function:  p7_Alimask_MakeModel2AliMap()
 * Synopsis:  Compute map of coordinate in the alignment corresponding to each model position.
 *
 * Purpose:   Decide which alignment columns are model (match) columns
 *            and record their 1-based column indices, in increasing
 *            order, in <map[0..n-1]>.
 *
 * Args:      msa     - The alignment for which the mapped model is to be
 *                      computed. We assume the MSA has already been
 *                      manipulated to account for model building flags
 *                      (e.g. weighting).
 *            do_hand - TRUE when the model is to follow a hand-built RF
 *                      line (which must be part of the file): every
 *                      column whose RF character is not a gap is mapped.
 *            symfrac - otherwise, a column is included in the model when
 *                      it holds at least one residue and its weighted
 *                      occupancy (residues / (residues + gaps)) is
 *                      >= this value.
 *            map     - int array into which the map values will be
 *                      stored. Calling function must allocate
 *                      (msa->alen+1) ints.
 *
 * Returns:   The number of mapped model positions.
 */
int
p7_Alimask_MakeModel2AliMap(ESL_MSA *msa, int do_hand, float symfrac, int *map )
{
  int   nmapped = 0;   /* number of entries written to <map> so far   */
  int   col;           /* aligned column, 1..alen                      */
  int   seq;           /* sequence index, 0..nseq-1                    */
  float occ;           /* weighted residue count for the column        */
  float denom;         /* weighted residue+gap count for the column    */

  if (do_hand)
    {
      if (msa->rf == NULL) p7_Fail("Model file does not contain an RF line, required for --hand.\n");

      /* <rf> is text, 0..alen-1, while <col> is 1..alen: column col is rf[col-1] */
      for (col = 1; col <= msa->alen; col++)
        if (! esl_abc_CIsGap(msa->abc, msa->rf[col-1]))
          map[nmapped++] = col;

      return nmapped;
    }

  for (col = 1; col <= msa->alen; col++)
    {
      occ   = 0.;
      denom = 0.;
      for (seq = 0; seq < msa->nseq; seq++)
        {
          if (esl_abc_XIsResidue(msa->abc, msa->ax[seq][col]))
            {
              occ   += msa->wgt[seq];
              denom += msa->wgt[seq];
            }
          else if (esl_abc_XIsGap(msa->abc, msa->ax[seq][col]))
            {
              denom += msa->wgt[seq];
            }
          else if (esl_abc_XIsMissing(msa->abc, msa->ax[seq][col]))
            {
              continue;   /* missing data: counted in neither tally */
            }
        }

      /* occ > 0 also guards the division against an all-missing column */
      if (occ > 0. && occ / denom >= symfrac)
        map[nmapped++] = col;
    }

  return nmapped;
}
/* Function:  rejustify_insertions_digital()
 * Synopsis:  Split each multi-column insertion into a left- and a right-justified half.
 * Incept:    SRE, Thu Oct 23 13:06:12 2008 [Janelia]
 *
 * Purpose:   For every sequence in <msa> and every node k = 0..M-1 whose
 *            insertion spans more than one column, slide residues inside
 *            the insertion to the right so that gaps collect in the
 *            middle. The leftmost nins/2 residue-holding columns of the
 *            insertion are left where they are (nins = number of
 *            residues the sequence has in the insertion); for k = 0 the
 *            whole insertion is right-justified. Posterior probability
 *            annotation <msa->pp[idx]>, when present, is moved along
 *            with the residues and filled with '.' where gaps are
 *            written.
 *
 * Args:      msa     - alignment to rejustify
 *                      digital mode: ax[0..nseq-1][1..alen] and abc is valid
 *                      text mode:    aseq[0..nseq-1][0..alen-1]
 *            inserts - # of inserted columns following node k, for k=0,1..M
 *                      inserts[0] is for N state; inserts[M] is for C state
 *            matmap  - index of column associated with node k [k=0,1..M; matmap[0] = 0]
 *                      this is an alignment column index 1..alen, same offset as <ax>
 *                      if applied to text mode aseq or annotation, remember to -1
 *                      if no residues use match state k, matmap[k] is the
 *                      index of the last column used before node k's columns
 *                      start: thus matmap[k]+1 is always the start of
 *                      node k's insertion (if any).
 *            matuse  - TRUE if an alignment column is associated with node k: [k=0,1..M; matuse[0] = 0].
 *                      if matuse[k] == 0, every sequence deleted at node k,
 *                      and we're collapsing the column rather than showing all
 *                      gaps.
 *            M       - number of nodes; nodes 0..M-1 are processed here
 *
 * Note:      The insertion for node k is of length <inserts[k]> columns,
 *            and in 1..alen coords it runs from
 *            matmap[k]+1 .. matmap[k+1]-matuse[k+1].
 *
 * Returns:   <eslOK>.
 *
 * Throws:    (no abnormal error conditions)
 */
static int
rejustify_insertions_digital(ESL_MSA *msa, const int *inserts, const int *matmap, const int *matuse, int M)
{
  int idx;      /* sequence index                                         */
  int k;        /* node index                                             */
  int first;    /* first column of node k's insertion                     */
  int last;     /* last column of node k's insertion                      */
  int stop;     /* leftmost column that the right-justification may touch */
  int nres;     /* residues this sequence has inside the insertion        */
  int apos;     /* column counter                                         */
  int src;      /* column being read, moving right to left                */
  int dst;      /* column being written, moving right to left             */

  for (idx = 0; idx < msa->nseq; idx++)
    for (k = 0; k < M; k++)
      {
        if (inserts[k] <= 1) continue;   /* nothing to rearrange in 0- or 1-column inserts */

        first = matmap[k] + 1;
        last  = matmap[k+1] - matuse[k+1];

        nres = 0;
        for (apos = first; apos <= last; apos++)
          if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) nres++;

        /* N-terminal insert (k == 0) is right-justified in full; any other
         * insert leaves its first nres/2 columns alone.
         */
        stop = first + ((k == 0) ? 0 : nres / 2);

        /* Compact every non-gap symbol in stop..last against the right edge. */
        src = last;
        dst = last;
        while (src >= stop)
          {
            if (! esl_abc_XIsGap(msa->abc, msa->ax[idx][src]))
              {
                msa->ax[idx][dst] = msa->ax[idx][src];
                if (msa->pp != NULL && msa->pp[idx] != NULL) msa->pp[idx][dst-1] = msa->pp[idx][src-1];
                dst--;
              }
            src--;
          }

        /* Whatever is left between stop and dst becomes gap. */
        for (; dst >= stop; dst--)
          {
            msa->ax[idx][dst] = esl_abc_XGetGap(msa->abc);
            if (msa->pp != NULL && msa->pp[idx] != NULL) msa->pp[idx][dst-1] = '.';
          }
      }

  return eslOK;
}
/* xqrna_shuffle_columns()
 *
 * Shuffle, in place, the symbols that <xs> and <ys> hold at the <n>
 * column positions listed in <col[0..n-1]>, keeping each x/y pair
 * together. One esl_rnd_Roll(r, m) call is made for m = n, n-1, .., 2.
 *
 * At each step a random entry <pos> of the still-active list is drawn,
 * the symbols at its column are swapped with those at the last active
 * column, and the last active column takes over slot <pos> before the
 * list shrinks by one. <col> is scratch space and is modified.
 */
static void
xqrna_shuffle_columns(ESL_RANDOMNESS *r, int *col, int n, ESL_DSQ *xs, ESL_DSQ *ys)
{
  ESL_DSQ xsym, ysym;
  int     pos;
  int     p, q;

  for (; n > 1; n--)
    {
      pos = esl_rnd_Roll(r, n);
      p   = col[pos];     /* randomly chosen column      */
      q   = col[n-1];     /* last column still in play   */

      xsym  = xs[p];  ysym  = ys[p];
      xs[p] = xs[q];  ys[p] = ys[q];
      xs[q] = xsym;   ys[q] = ysym;

      col[pos] = q;       /* slot n-1 is never read again once n shrinks */
    }
}

/* Function:  esl_msashuffle_XQRNA()
 * Synopsis:  Gap-preserving column shuffle of a digital pairwise alignment.
 * Incept:    SRE, Tue Jan 22 09:09:52 2008 [Market Street Cafe, Leesburg]
 *
 * Purpose:   Shuffle a digital pairwise alignment <x>,<y> while
 *            preserving the position of gaps, where both sequences are
 *            in digital alphabet <abc>, using the random number
 *            generator <r>. Return the shuffled alignment in <xs>,
 *            <ys>. Caller provides allocated space for <xs> and <ys>
 *            for at least the same length of <x>,<y>.
 *
 *            Works by doing three separate
 *            shuffles, of (1) columns with residues in both
 *            <x> and <y>, (2) columns with residue in <x> and gap in <y>,
 *            and (3) columns with gap in <x> and residue in <y>.
 *            Columns that are gaps in both sequences stay where they are.
 *
 *            <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
 *            an alignment "in place", destroying the original
 *            alignment, just call <esl_msashuffle_XQRNA(r, abc, x,y,x,y)>.
 *
 *            A zero-length alignment is copied and returned as is.
 *
 * Returns:   <eslOK> on success, and the shuffled alignment is
 *            returned in <xs>, <ys>.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if <x> and <y> differ in length.
 */
int
esl_msashuffle_XQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, ESL_DSQ *x, ESL_DSQ *y, ESL_DSQ *xs, ESL_DSQ *ys)
{
  int  L;
  int *xycol = NULL;    /* columns with a residue in both x and y */
  int *xcol  = NULL;    /* columns with residue in x, gap in y    */
  int *ycol  = NULL;    /* columns with gap in x, residue in y    */
  int  nxy, nx, ny;
  int  i;
  int  status;

  L = esl_abc_dsqlen(x);
  if (esl_abc_dsqlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");

  if (xs != x) esl_abc_dsqcpy(x, L, xs);
  if (ys != y) esl_abc_dsqcpy(y, L, ys);

  /* Nothing to shuffle in an empty alignment; returning here also avoids
   * asking the allocator for zero bytes below.
   */
  if (L == 0) return eslOK;

  /* First, construct three arrays containing lists of the column positions
   * of the three types of columns. (If a column contains gaps in both x and y,
   * we've already simply copied it to the shuffled sequence.)
   */
  ESL_ALLOC(xycol, sizeof(int) * L);
  ESL_ALLOC(xcol,  sizeof(int) * L);
  ESL_ALLOC(ycol,  sizeof(int) * L);
  nxy = nx = ny = 0;

  for (i = 1; i <= L; i++)
    {
      if      (  esl_abc_XIsGap(abc, x[i]) &&   esl_abc_XIsGap(abc, y[i])) { continue; }
      else if (! esl_abc_XIsGap(abc, x[i]) && ! esl_abc_XIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
      else if (  esl_abc_XIsGap(abc, x[i]))                                { ycol[ny]   = i; ny++;  }
      else if (  esl_abc_XIsGap(abc, y[i]))                                { xcol[nx]   = i; nx++;  }
    }

  /* Second, shuffle the sequences indirectly, one column class at a time.
   * The order (xy, then x, then y) fixes the order of calls on <r>.
   */
  xqrna_shuffle_columns(r, xycol, nxy, xs, ys);
  xqrna_shuffle_columns(r, xcol,  nx,  xs, ys);
  xqrna_shuffle_columns(r, ycol,  ny,  xs, ys);

  free(xycol);
  free(xcol);
  free(ycol);
  return eslOK;

 ERROR:
  free(xycol);
  free(xcol);
  free(ycol);
  return status;
}
/* Function: p7_null3_score() * * Purpose: Calculate a correction (in log_2 odds) to be applied * to a sequence, using a null model based on the * composition of the target sequence. * The null model is constructed /post hoc/ as the * distribution of the target sequence; if the target * sequence is 40% A, 5% C, 5% G, 40% T, then the null * model is (0.4, 0.05, 0.05, 0.4). This function is * based heavily on Infernal's ScoreCorrectionNull3(), * with two important changes: * - it leaves the log2 conversion from NATS to BITS * for the calling function. * - it doesn't include the omega score modifier * (based on prior probability of using the null3 * model), again leaving this to the calling function. * * Args: abc - alphabet for hit (only used to get alphabet size) * dsq - the sequence the hit resides in * tr - trace of the alignment, used to find the match states * (non-match chars are ignored in computing freq, not used if NULL) * start - start position of hit in dsq * stop - end position of hit in dsq * bg - background, used for the default null model's emission freq * ret_sc - RETURN: the correction to the score (in NATS); * caller subtracts this from hit score to get * corrected score. * Return: void, ret_sc: the log-odds score correction (in NATS). */ void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc) { float score = 0.; int status; int i; float *freq; int dir; int tr_pos; ESL_ALLOC(freq, sizeof(float) * abc->K); esl_vec_FSet(freq, abc->K, 0.0); /* contract check */ if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", ""); if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", ""); if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", ""); dir = start < stop ? 
1 : -1; if (tr != NULL) { /* skip the parts of the trace that precede the first match state */ tr_pos = 2; i = start; while (tr->st[tr_pos] != p7T_M) { if (tr->st[tr_pos] == p7T_N) i += dir; tr_pos++; } /* tally frequencies from characters hitting match state*/ while (tr->st[tr_pos] != p7T_E) { if (tr->st[tr_pos] == p7T_M) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } if (tr->st[tr_pos] != p7T_D ) i += dir; tr_pos++; } } else { /* tally frequencies from the full envelope */ for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } } esl_vec_FNorm(freq, abc->K); /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/ for (i = 0; i < abc->K; i++) score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ; /* Return the correction to the bit score. */ score = p7_FLogsum(0., score); *ret_sc = score; return; ERROR: esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", ""); return; /* never reached */ }
/* map_msas
 *
 * Align msa1 and msa2.
 * For each column in msa1, determine the corresponding column
 * in msa2. This implementation requires:
 *  - msa1 and msa2 contain exactly the same sequences in the same order
 * Note: the seqs in msa1 and msa2 do not have to have the same names.
 *
 * Uses a DP algorithm similar to Needleman-Wunsch, but that's aligning
 * two alignment columns at a time instead of two residues.
 *
 * Both alignments must be digital. Unless "-q" is set in <go>, a table
 * of the mapped columns is printed to stdout; the coverage summary is
 * always printed. After the mapping is computed it is handed to
 * map2masks().
 *
 * On success returns <eslOK> and hands the newly allocated map to the
 * caller in <*ret_msa1_to_msa2_map>: entry [apos1], apos1 = 1..alen1,
 * is the msa2 column that msa1 column apos1 maps to, or 0 if it maps
 * to none; entry [0] is -1. The caller frees it.
 *
 * On failure returns an error code; where ESL_FAIL is used, <errbuf>
 * holds an explanation. Fails with <eslEINVAL> if either alignment is
 * not digital, if the dealigned sequences differ between the two
 * alignments, or if the traceback is inconsistent with the DP matrix.
 *
 * NOTE(review): the failure paths return without releasing the arrays
 * allocated so far.
 *
 * DIAG, VERT, HORZ and NCHOICES are defined outside this function.
 */
static int
map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map)
{
  int status;
  int **one2two;              /* [0..apos1..alen1][0..apos2..alen2] number of residues from column apos1 of msa1
                               * aligned in column apos2 of msa 2 */
  int *rf2a_map1 = NULL;      /* msa1 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa1->rf == NULL */
  int *rf2a_map2 = NULL;      /* msa2 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa2->rf == NULL */
  int *a2rf_map1 = NULL;      /* msa1 map of alignment columns to reference columns, NULL if msa1->rf == NULL */
  int *a2rf_map2 = NULL;      /* msa2 map of alignment columns to reference columns, NULL if msa2->rf == NULL */
  int apos1, apos2;           /* counters over alignment position in msa1, msa2 respectively */
  int alen1, alen2;           /* alignment lengths */
  int rfpos1, rfpos2;         /* counters over reference positions */
  int rflen1, rflen2;         /* reference (non-gap RF) lengths */
  int **mx;                   /* [0..apos1..alen1][0..apos2..alen2] dp matrix, score of max scoring aln
                               * from 1..apos1 in msa1 and 1..apos2 in msa 2 */
  int **tb;                   /* [0..apos1..alen1][0..apos2..alen2] traceback ptrs: DIAG, VERT or HORZ */
  char *seq1, *seq2;          /* temporary strings for ensuring dealigned sequences in msa1 and msa2 are identical */
  int64_t len1, len2;         /* length of seq1, seq2 */
  int isgap1, isgap2;         /* is this residue a gap in msa1, msa2? */
  int i;                      /* counter over sequences */
  int *res1_per_apos;         /* [0..apos..alen1] number of residues in column apos of msa1 */
  int sc;                     /* max score of full path (alignment) through dp mx */
  int tb_sc;                  /* score of traceback, should equal sc */
  int *one2two_map;           /* [0..a..alen1] the alignment, msa2 column that column apos1 in msa1 maps to */
  int total_res = 0;          /* total number of residues in msa1 */
  float coverage;             /* fraction of total_res that are within mapped msa2 columns from one2two_map,
                               * this is tb_sc / total_res */
  int total_cres1=0;          /* total number of residues in reference positions in msa1 */
  int covered_cres1 = 0;      /* number of residues in reference positions in msa1 that also appear in the corresponding
                               * mapped column of msa2 */
  int be_quiet = esl_opt_GetBoolean(go, "-q");
  int *choices;               /* [0..NCHOICES-1] candidate scores for the current dp cell */
  int i_choice;               /* index of the best candidate in choices[] */

  /* contract check */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2));
  alen1 = msa1->alen;
  alen2 = msa2->alen;

  /* Map msa1 (reference) columns to alignment positions */
  rflen1 = rflen2 = 0;
  if(msa1->rf != NULL) if((status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) goto ERROR;
  if(msa2->rf != NULL) if((status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) goto ERROR;
  if(! be_quiet) {
    printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 1), alen1);
    printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 2), alen2);
  }

  /* collect counts in one2two[i][j]: number of sequences for which residue aligned in msa1 column i
   * is aligned in msa2 alignment column j.
   */
  ESL_ALLOC(seq1, sizeof(char) * (alen1+1));
  ESL_ALLOC(seq2, sizeof(char) * (alen2+1));
  ESL_ALLOC(one2two, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(one2two[apos1], (alen2+1), 0);
  }

  total_res = 0;
  for(i = 0; i < msa1->nseq; i++) {
    /* ensure raw (unaligned) seq i in the 2 msas is the same */
    esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1);
    esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */
    esl_strdealign(seq1, seq1, "-_.~", &len1);
    esl_strdealign(seq2, seq2, "-_.~", &len2);

    if(len1 != len2) {
      ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n",
               i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2);
    }
    if(strncmp(seq1, seq2, len1) != 0) ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
    total_res += len1;

    /* Walk both aligned copies of seq i in parallel, pairing up its residues. */
    apos1 = apos2 = 1;
    while((apos1 <= alen1) || (apos2 <= alen2)) {
      isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]);
      isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]);
      if      ( isgap1 &&  isgap2) { apos1++; apos2++; }
      else if ( isgap1 && !isgap2) { apos1++;          }
      else if (!isgap1 &&  isgap2) {          apos2++; }
      else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) {
        one2two[apos1++][apos2++]++;
        /* two2one[apos2][apos1]++; */
      }
      else {
        /* Two non-gap symbols that differ: neither index would advance, so
         * without this branch the loop would never terminate.
         */
        ESL_FAIL(eslEINVAL, errbuf, "in map_msas(), seq number %d: column %d of %s and column %d of %s hold different residues\n",
                 i, apos1, esl_opt_GetArg(go, 1), apos2, esl_opt_GetArg(go, 2));
      }
    }
  }

  /******************************************************************
   * DP alignment of msa1 to msa2
   * dp matrix: mx[apos1][apos2] apos1=1..alen1, apos2=1..alen2 (apos1=0 || apos2=0 is invalid)
   * mx[apos1][apos2] = score of maximal alignment for apos1=1..apos1, apos2'=1..apos2 INCLUDING
   *                    apos1 and apos2. Score is number of residues from msa1 columns
   *                    1..apos1 that exist in their respective aligned columns in msa2 (the growing
   *                    maximally scoring alignment).
   */

  /******************************************************************
   * initialization
   */
  ESL_ALLOC(mx, sizeof(int *) * (alen1+1));
  ESL_ALLOC(tb, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1));
    ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(mx[apos1], (alen2+1), 0);
    esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */
    tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==1 */
  }
  esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==1 */
  tb[0][0] = -2; /* all alignments must end here */

  ESL_ALLOC(res1_per_apos, sizeof(int) * (alen1+1));
  esl_vec_ISet(res1_per_apos, (alen1+1), 0);
  mx[0][0] = 0;
  tb[0][0] = -1; /* last cell, special value */

  /*****************************************************************
   * recursion
   */
  ESL_ALLOC(choices, sizeof(int) * NCHOICES);
  for(apos1 = 1; apos1 <= alen1; apos1++) {
    for(apos2 = 1; apos2 <= alen2; apos2++) {
      choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2];
      choices[VERT] = mx[ apos1   ][(apos2-1)];
      choices[HORZ] = mx[(apos1-1)][ apos2   ];
      i_choice = esl_vec_IArgMax(choices, NCHOICES);
      mx[apos1][apos2] = choices[i_choice];
      tb[apos1][apos2] = i_choice;
      res1_per_apos[apos1] += one2two[apos1][apos2];
      /*printf("mx[%3d][%3d]: %5d (%d)\n", apos1, apos2, mx[apos1][apos2], tb[apos1][apos2]);*/
    }
  }
  free(choices);

  total_cres1 = 0;
  if(rf2a_map1 != NULL) {
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]];
  }

  /*****************************************************************
   * traceback
   */

  sc = mx[alen1][alen2];
  if(!be_quiet) {
    /* printf("score %d\n", sc);*/
    /* column headers; the layout depends on which alignments have RF annotation */
    if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
      printf("# %12s %12s %22s\n", " msa 1 ", " msa 2 ", "");
      printf("# %12s %12s %22s\n", "------------", "------------", "");
      printf("# %5s %5s %5s %5s %22s\n", "rfpos", "apos", "rfpos", "apos", " num common residues");
      printf("# %5s %5s %5s %5s %22s\n", "-----", "-----", "-----", "-----", "---------------------");
    }
    else if(a2rf_map1 != NULL) {
      printf("# %12s %5s %22s\n", " msa 1 ", "msa 2", "");
      printf("# %12s %5s %22s\n", "------------", "-----", "");
      printf("# %5s %5s %5s %22s\n", "rfpos", "apos", "apos", " num common residues");
      printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else if (a2rf_map2 != NULL) {
      printf("# %5s %12s %22s\n", "msa 1", " msa 2 ", "");
      printf("# %5s %12s %22s\n", "-----", "------------", "");
      printf("# %5s %5s %5s %22s\n", "apos", "rfpos", "apos", " num common residues");
      printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else {
      printf("# %5s %5s %22s\n", "msa 1", "msa 2", "");
      printf("# %5s %5s %22s\n", "-----", "-----", "");
      printf("# %5s %5s %22s\n", "apos", "apos", " num common residues");
      printf("# %5s %5s %22s\n", "-----", "-----", "---------------------");
    }
  }

  /* traceback, and build one2two_map[] */
  apos1 = alen1;
  apos2 = alen2;
  tb_sc = 0;
  covered_cres1 = 0;
  ESL_ALLOC(one2two_map, sizeof(int) * (alen1+1));
  esl_vec_ISet(one2two_map, (alen1+1), 0);
  one2two_map[0] = -1; /* invalid */

  while(tb[apos1][apos2] != -1) {
    if(tb[apos1][apos2] == DIAG) { /* diagonal move */
      rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1];
      rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2];
      if(!be_quiet) {
        if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
          if(rfpos1 == -1 && rfpos2 == -1) {
            printf(" %5s %5d --> %5s %5d %5d / %5d (%.4f)\n", "-", apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else if (rfpos1 == -1) {
            printf(" %5s %5d --> %5d %5d %5d / %5d (%.4f)\n", "-", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else if (rfpos2 == -1) {
            printf(" %5d %5d --> %5s %5d %5d / %5d (%.4f)\n", rfpos1, apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else {
            printf(" %5d %5d --> %5d %5d %5d / %5d (%.4f)\n", rfpos1, apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
        }
        else if(a2rf_map1 != NULL) {
          if (rfpos1 == -1) {
            printf(" %5s %5d --> %5d %5d / %5d (%.4f)\n", "-", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else {
            printf(" %5d %5d --> %5d %5d / %5d (%.4f)\n", rfpos1, apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
        }
        else if (a2rf_map2 != NULL) {
          if (rfpos2 == -1) {
            printf(" %5d --> %5s %5d %5d / %5d (%.4f)\n", apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
          else {
            printf(" %5d --> %5d %5d %5d / %5d (%.4f)\n", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
          }
        }
        else {
          printf(" %5d --> %5d %5d / %5d (%.4f)\n", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1]));
        }
      }
      tb_sc += one2two[apos1][apos2];
      one2two_map[apos1] = apos2;
      if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */
      apos1--; apos2--;
    }
    else if(tb[apos1][apos2] == VERT) {
      apos2--; /* vertical move */
    }
    else if(tb[apos1][apos2] == HORZ) {
      apos1--; /* horizontal move */
    }
    else if(tb[apos1][apos2] != -1) /* shouldn't happen */
      ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]);
  }
  /* done DP code **********************************/

  if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc);
  if(tb_sc != sc) ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc);
  coverage = (float) tb_sc / (float) total_res;
  printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));

  /* print masks if nec */
  if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) return status;

  /* clean up and return */
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    free(mx[apos1]);
    free(tb[apos1]);
  }
  free(mx);
  free(tb);

  for(apos1 = 0; apos1 <= alen1; apos1++) free(one2two[apos1]);
  free(one2two);
  free(res1_per_apos);
  if(rf2a_map1 != NULL) free(rf2a_map1);
  if(rf2a_map2 != NULL) free(rf2a_map2);
  if(a2rf_map1 != NULL) free(a2rf_map1);
  if(a2rf_map2 != NULL) free(a2rf_map2);

  free(seq1);
  free(seq2);
  *ret_msa1_to_msa2_map = one2two_map;
  return eslOK;

 ERROR:
  return status;
}