/* Function: esl_dst_XPairId() * Synopsis: Pairwise identity of two aligned digital seqs. * Incept: SRE, Tue Apr 18 09:24:05 2006 [St. Louis] * * Purpose: Digital version of <esl_dst_PairId()>: <adsq1> and * <adsq2> are digitized aligned sequences, in alphabet * <abc>. Otherwise, same as <esl_dst_PairId()>. * * Args: abc - digital alphabet in use * ax1 - aligned digital seq 1 * ax2 - aligned digital seq 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: <eslOK> on success. <opt_distance>, <opt_nid>, <opt_n> * contain the answers, for any of these that were passed * non-<NULL> pointers. * * Throws: <eslEINVAL> if the strings are different lengths (not aligned). */ int esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_distance, int *opt_nid, int *opt_n) { int status; int idents; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ idents = len1 = len2 = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax1[i])) len1++; if (esl_abc_XIsCanonical(abc, ax2[i])) len2++; if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i]) && ax1[i] == ax2[i]) idents++; } if (len2 < len1) len1 = len2; if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_distance != NULL) *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 ); if (opt_nid != NULL) *opt_nid = idents; if (opt_n != NULL) *opt_n = len1; return eslOK; ERROR: if (opt_distance != NULL) *opt_distance = 0.; if (opt_nid != NULL) *opt_nid = 0; if (opt_n != NULL) *opt_n = 0; return status; }
/* Function: esl_dst_XJukesCantor() * Synopsis: Jukes-Cantor distance for two aligned digitized seqs. * Incept: SRE, Tue Apr 18 15:26:51 2006 [St. Louis] * * Purpose: Calculate the generalized Jukes-Cantor distance between two * aligned digital strings <ax> and <ay>, in substitutions/site, * using alphabet <abc> to evaluate identities and differences. * The maximum likelihood estimate for the distance is optionally returned in * <opt_distance>. The large-sample variance for the distance * estimate is optionally returned in <opt_variance>. * * Identical to <esl_dst_CJukesCantor()>, except that it takes * digital sequences instead of character strings. * * Args: abc - bioalphabet to use for comparisons * ax - 1st digital aligned seq * ay - 2nd digital aligned seq * opt_distance - optRETURN: ML estimate of distance d * opt_variance - optRETURN: large-sample variance of d * * Returns: <eslOK> on success. As in <esl_dst_CJukesCantor()>, the * distance and variance may be infinite, in which case they * are returned as <HUGE_VAL>. * * Throws: <eslEINVAL> if the two strings aren't the same length (and * thus can't have been properly aligned). * <eslEDIVZERO> if no aligned residues were counted. * On either failure, the distance and variance are set * to <HUGE_VAL>. 
*/ int esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay, double *opt_distance, double *opt_variance) { int status; int n1, n2; /* number of observed identities, substitutions */ int i; /* position in aligned seqs */ n1 = n2 = 0; for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i])) { if (ax[i] == ay[i]) n1++; else n2++; } } if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); return jukescantor(n1, n2, abc->K, opt_distance, opt_variance); ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; }
/* Function:  esl_msaweight_PB()
 * Synopsis:  PB (position-based) weights.
 * Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
 *
 * Purpose:   Given a multiple alignment <msa>, calculate sequence
 *            weights according to the position-based weighting
 *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
 *            1994). These weights are stored internally in the <msa>
 *            object, replacing any weights that may have already been
 *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the algorithm
 *            deals with degenerate residue symbols properly.
 *
 *            The Henikoffs' algorithm does not give rules for dealing
 *            with gaps or degenerate residue symbols. The rule here
 *            is to ignore them. This means that longer sequences
 *            initially get more weight; hence a "double
 *            normalization" in which the weights are first divided by
 *            sequence length in counted residues (to compensate for
 *            that effect), then normalized to sum to nseq. A sequence
 *            with no counted residues keeps a pre-normalization
 *            weight of 0.0 in both text and digital mode.
 *
 *            An advantage of the PB method is efficiency. The only
 *            scratch memory is one count array of alphabet size, and
 *            the work is a few passes over the N x L alignment. This
 *            makes it a good method for ad hoc weighting of very deep
 *            alignments.
 *
 *            When the alignment is in simple text mode, IUPAC
 *            degenerate symbols are not dealt with correctly; instead,
 *            the algorithm simply uses the 26 letters as "residues"
 *            (case-insensitively), and treats all other characters as
 *            gaps.
 *
 *            A single-sequence alignment is a special case: its one
 *            weight is set to 1.0 and the function returns at once.
 *
 * Args:      msa - alignment to weight; <msa->wgt> is overwritten
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEMEM> on allocation error, in which case <msa> is
 *            returned unmodified (allocation happens before any
 *            weight is touched).
 *
 * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
 */
int
esl_msaweight_PB(ESL_MSA *msa)
{
  int    *nres = NULL;          /* counts of each residue observed in a column      */
  int     ntotal;               /* number of different symbols observed in a column */
  int     rlen;                 /* number of residues in a sequence                 */
  int     idx, pos, i;
  int     c;                    /* one text-mode character, as an unsigned char value */
  int     K;                    /* alphabet size                                    */
  int     status;

  /* Contract checks
   */
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* Initialize: one counter per symbol; 26 letters in text mode,
   * abc->K canonical residues in digital mode.
   */
  if (! (msa->flags & eslMSA_DIGITAL))
    { ESL_ALLOC(nres, sizeof(int) * 26);          K = 26;          }
#ifdef eslAUGMENT_ALPHABET
  else
    { ESL_ALLOC(nres, sizeof(int) * msa->abc->K); K = msa->abc->K; }
#endif

  esl_vec_DSet(msa->wgt, msa->nseq, 0.);

  /* This section handles text alignments */
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      for (pos = 0; pos < msa->alen; pos++)
        {
          /* Collect # of letters A..Z in this column, and total.
           * <ctype.h> functions require an unsigned char value (or EOF),
           * so convert before classifying.
           */
          esl_vec_ISet(nres, K, 0);
          for (idx = 0; idx < msa->nseq; idx++)
            {
              c = (unsigned char) msa->aseq[idx][pos];
              if (isalpha(c)) nres[toupper(c) - 'A']++;
            }
          for (ntotal = 0, i = 0; i < K; i++)
            if (nres[i] > 0) ntotal++;

          /* Bump weight on each seq by PB rule */
          if (ntotal > 0)
            {
              for (idx = 0; idx < msa->nseq; idx++)
                {
                  c = (unsigned char) msa->aseq[idx][pos];
                  if (isalpha(c))
                    msa->wgt[idx] += 1. / (double) (ntotal * nres[toupper(c) - 'A']);
                }
            }
        }

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
        {
          for (rlen = 0, pos = 0; pos < msa->alen; pos++)
            if (isalpha((unsigned char) msa->aseq[idx][pos])) rlen++;

          /* Guard on this sequence's own residue count, exactly as the
           * digital branch does, so an all-gap sequence is never divided
           * by zero. If rlen == 0 for this seq, its weight is still 0.0,
           * as initialized.
           */
          if (rlen > 0) msa->wgt[idx] /= (double) rlen;
        }
    }

  /* This section handles digital alignments. */
#ifdef eslAUGMENT_ALPHABET
  else
    {
      for (pos = 1; pos <= msa->alen; pos++)
        {
          /* Collect # of residues 0..K-1 in this column, and total # */
          esl_vec_ISet(nres, K, 0);
          for (idx = 0; idx < msa->nseq; idx++)
            if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
              nres[(int) msa->ax[idx][pos]]++;
          for (ntotal = 0, i = 0; i < K; i++)
            if (nres[i] > 0) ntotal++;

          /* Bump weight on each sequence by PB rule */
          if (ntotal > 0)
            {
              for (idx = 0; idx < msa->nseq; idx++)
                {
                  if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
                    msa->wgt[idx] += 1. / (double) (ntotal * nres[msa->ax[idx][pos]]);
                }
            }
        }

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
        {
          for (rlen = 0, pos = 1; pos <= msa->alen; pos++)
            if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos])) rlen++;
          if (rlen > 0) msa->wgt[idx] /= (double) rlen;
          /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
        }
    }
#endif

  /* Make weights normalize up to nseq, and return.  In pathological
   * case where all wgts were 0 (no seqs contain any unambiguous
   * residues), weights become 1.0.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(nres);
  return eslOK;

 ERROR:
  free(nres);                   /* free(NULL) is a no-op */
  return status;
}