/* Each unit test is given an alignment with certain known
 * properties:
 *    seqs 0,1 are identical
 *    seqs 0,2 are completely different
 *    seqs 3..N are random
 * The alignment may contain gaps, so don't assume that the
 * # of compared residues == alignment length. The alignment
 * contains only canonical residues, because one of our tests
 * is that C and X functions give the same results.
 */

/* utest_CPairId()
 *
 * Exercises esl_dst_CPairId() on the text-mode test alignment <as>
 * of <N> sequences. Any unexpected status or value calls abort();
 * otherwise returns <eslOK>.
 */
static int
utest_CPairId(char **as, int N)
{
  double frac;            /* fractional identity returned by the call   */
  int    n_ident;         /* number of identities returned by the call  */
  int    n_cmp;           /* number of compared residues returned       */
  int    alen;            /* length of as[0]                            */
  int    a, b, k;

  alen = strlen(as[0]);

  /* Seq 0 against seqs 0, 1, 2 in that order:
   *   0,0 (self) and 0,1 (identical pair) must give identity 1.0 with <alen> identities;
   *   0,2 (completely different pair) must give identity 0.0 with 0 identities.
   */
  for (k = 0; k <= 2; k++)
    {
      double want_frac  = (k == 2) ? 0.0 : 1.0;
      int    want_ident = (k == 2) ? 0   : alen;

      if (esl_dst_CPairId(as[0], as[k], &frac, &n_ident, &n_cmp) != eslOK) abort();
      if (frac != want_frac || n_ident != want_ident || n_cmp > alen)      abort();
    }

  /* Remaining comparisons shouldn't fail, and must give values in range. */
  for (a = 3; a < N; a++)
    {
      for (b = a; b < N; b++)
        {
          if (esl_dst_CPairId(as[a], as[b], &frac, &n_ident, &n_cmp) != eslOK) abort();
          if (frac < 0. || frac > 1.)                                          abort();
          if (n_ident < 0 || n_ident > alen || n_cmp > alen)                   abort();
        }
    }

  /* API should accept NULL for return values */
  if (esl_dst_CPairId(as[0], as[0], NULL, NULL, NULL) != eslOK) abort();

  return eslOK;
}
/* Function:  esl_dst_CPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned text sequences.
 * Incept:    SRE, Thu Apr 27 08:46:08 2006 [New York]
 *
 * Purpose:   Given a multiple sequence alignment <as>, consisting
 *            of <N> aligned character strings; calculate
 *            a symmetric fractional pairwise identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_CPairId()>, and return it in
 *            <ret_S>. Diagonal elements are set to 1.
 *
 * Args:      as     - aligned seqs (all same length), [0..N-1]
 *            N      - # of aligned sequences
 *            ret_S  - RETURN: symmetric fractional identity matrix.
 *                     May be passed as <NULL>, in which case the
 *                     matrix is computed and then destroyed.
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the fractional
 *            identity matrix. Caller free's <S> with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if the matrix cannot be created.
 *            <eslEINVAL> if a seq has a different
 *            length than others (more precisely: any non-<eslOK>
 *            status from <esl_dst_CPairId()> is rethrown as is).
 *            On failure, <ret_S> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **ret_S)
{
  ESL_DMATRIX *S = NULL;
  int          status;
  int          i,j;

  /* <status> must be assigned before jumping to ERROR, because the
   * ERROR path returns it; a bare goto here would return an
   * indeterminate value.
   */
  if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
        {
          /* Compute the upper triangle, then mirror it into the lower one. */
          status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL);
          if (status != eslOK)
            ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
          S->mx[j][i] = S->mx[i][j];
        }
    }

  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL) esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Builds a new alignment from <msa> in which near-identical
 *            sequences have been dropped. Identity is computed from
 *            the alignment itself. Sequences are visited in order;
 *            a sequence is dropped when its pairwise identity to any
 *            already-retained sequence is greater than <maxid>, so the
 *            earlier member of a redundant pair is the one that stays.
 *            The input <msa> is not modified.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Args:      msa        - alignment to filter (text or digital mode)
 *            maxid      - identity threshold; a seq whose identity to a
 *                         retained seq is > maxid is removed
 *            ret_newmsa - RETURN: the filtered alignment, as produced by
 *                         <esl_msa_SequenceSubset()>
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Return:    <eslOK> on success, and the <newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int    *kept   = NULL;    /* indices of the seqs retained so far, [0..nkept-1] */
  int    *keepit = NULL;    /* keepit[i] is TRUE if seq i goes into the new msa  */
  int     nkept  = 0;       /* number of seqs retained so far                    */
  double  pid;              /* pairwise identity of the current pair             */
  int     cand;             /* index of the seq being considered                 */
  int     k;                /* counter over retained seqs                        */
  int     is_dup;           /* TRUE if <cand> is too similar to a retained seq   */
  int     status;           /* assigned by ESL_ALLOC() and the calls below       */

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  ESL_ALLOC(kept,   sizeof(int) * msa->nseq);
  ESL_ALLOC(keepit, sizeof(int) * msa->nseq);
  esl_vec_ISet(keepit, msa->nseq, 0);

  /* Greedy pass: compare each candidate only against what has been retained. */
  for (cand = 0; cand < msa->nseq; cand++)
    {
      is_dup = FALSE;
      for (k = 0; k < nkept; k++)
        {
          if (msa->flags & eslMSA_DIGITAL)
            {
              /* Digital mode is only compared when the alphabet augmentation is
               * compiled in; presumably digital alignments cannot exist otherwise
               * (TODO confirm), since <pid> is not assigned on this path without it.
               */
#ifdef eslAUGMENT_ALPHABET
              status = esl_dst_XPairId(msa->abc, msa->ax[cand], msa->ax[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
#endif
            }
          else
            {
              status = esl_dst_CPairId(msa->aseq[cand], msa->aseq[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }

          if (pid > maxid) { is_dup = TRUE; break; }
        }

      if (is_dup != FALSE) continue;

      kept[nkept]  = cand;
      nkept++;
      keepit[cand] = TRUE;
    }

  /* On success <status> is eslOK here, so the success and failure paths
   * can share the cleanup below and both return <status>.
   */
  status = esl_msa_SequenceSubset(msa, keepit, ret_newmsa);

 ERROR:
  free(kept);     /* free(NULL) is a no-op, so no guards are needed */
  free(keepit);
  return status;
}
/* Function: esl_dst_CAverageId() * Synopsis: Calculate avg identity for multiple alignment * Incept: SRE, Fri May 18 15:02:38 2007 [Janelia] * * Purpose: Calculates the average pairwise fractional identity in * a multiple sequence alignment <as>, consisting of <N> * aligned character sequences of identical length. * * If an exhaustive calculation would require more than * <max_comparisons> pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of <max_comparisons> random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using <esl_dsq_CPairId()>. * * Returns: <eslOK> on success, and <*ret_id> contains the average * fractional identity. * * Throws: <eslEMEM> on allocation failure. * <eslEINVAL> if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_id> is set to 0. */ int esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id) { int status; double id; double sum; int i,j,n; if (N <= 1) { *ret_id = 1.; return eslOK; } *ret_id = 0.; /* Is nseq small enough that we can average over all pairwise comparisons? */ if ((N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } id /= (double) (N * (N-1) / 2); } /* If nseq is large, calculate average over a stochastic sample. */ else { ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded(); for (n = 0; n < max_comparisons; n++) { do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } id /= (double) max_comparisons; esl_randomness_Destroy(r); } *ret_id = id; return eslOK; }
/* Definition of %id linkage in text-mode aligned seqs (>= maxid): */ static int msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link) { char *as1 = *(char **) v1; char *as2 = *(char **) v2; double maxid = *(double *) p; double pid; int status = eslOK; #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION) pid = 1. - squid_distance(as1, as2); #else if ((status = esl_dst_CPairId(as1, as2, &pid, NULL, NULL)) != eslOK) return status; #endif *ret_link = (pid >= maxid ? TRUE : FALSE); return status; }
/* utest_XPairId()
 *
 * Exercises esl_dst_XPairId() on the digital test alignment <ax>, and
 * checks it against esl_dst_CPairId() on the matching text alignment
 * <as>; both hold <N> sequences. Any unexpected status or value calls
 * abort(); otherwise returns <eslOK>.
 */
static int
utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  double xfrac,  cfrac;     /* fractional identity: digital, text      */
  int    xident, cident;    /* number of identities: digital, text     */
  int    xcmp,   ccmp;      /* number of compared residues: dig., text */
  int    dlen;              /* length of ax[0]                         */
  int    tlen;              /* length of as[0]                         */
  int    a, b, k;

  /* Digital and text versions of seq 0 must have the same length. */
  dlen = esl_abc_dsqlen(ax[0]);
  tlen = strlen(as[0]);
  if (dlen != tlen) abort();

  /* Seq 0 against seqs 0, 1, 2 in that order:
   *   0,0 (self) and 0,1 (identical pair) must give identity 1.0 with <tlen> identities;
   *   0,2 (completely different pair) must give identity 0.0 with 0 identities.
   */
  for (k = 0; k <= 2; k++)
    {
      double want_frac  = (k == 2) ? 0.0 : 1.0;
      int    want_ident = (k == 2) ? 0   : tlen;

      if (esl_dst_XPairId(abc, ax[0], ax[k], &xfrac, &xident, &xcmp) != eslOK) abort();
      if (xfrac != want_frac || xident != want_ident || xcmp > tlen)           abort();
    }

  /* Remaining comparisons shouldn't fail, and should be identical to text mode. */
  for (a = 3; a < N; a++)
    {
      for (b = a; b < N; b++)
        {
          if (esl_dst_XPairId(abc, ax[a], ax[b], &xfrac, &xident, &xcmp) != eslOK) abort();
          if (esl_dst_CPairId(as[a], as[b], &cfrac, &cident, &ccmp)      != eslOK) abort();

          if (xfrac < 0. || xfrac > 1.)                            abort();
          if (xident < 0 || xident > tlen || xcmp > tlen)          abort();
          if (xfrac != cfrac || xident != cident || xcmp != ccmp)  abort();
        }
    }

  /* API should accept NULL for return values */
  if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();

  return eslOK;
}