/* Function:  esl_dst_XPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
 *            of <N> aligned digital sequences in alphabet <abc>; calculate
 *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
 *            Diagonal elements are set to 1.0. If <ret_S> is <NULL>, the
 *            matrix is still computed, then destroyed.
 *
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]
 *            N     - number of aligned sequences
 *            ret_S - RETURN: NxN matrix of fractional identities
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the identity
 *            matrix. Caller is obligated to free <S> with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if the matrix cannot be created.
 *            Any non-<eslOK> status from <esl_dst_XPairId()> is
 *            propagated (e.g. <eslEINVAL> if a seq has a different
 *            length than others). On failure, <ret_S> is returned
 *            <NULL> and state of inputs is unchanged.
 */
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
  ESL_DMATRIX *S = NULL;
  int          status;
  int          i,j;

  /* esl_dmatrix_Create() signals failure only by returning NULL, so the
   * status code has to be set here before jumping to the cleanup code.
   */
  if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
        {
          status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
          if (status != eslOK)
            {
              ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
            }
          S->mx[j][i] = S->mx[i][j];   /* mirror into the lower triangle */
        }
    }

  if (ret_S != NULL) *ret_S = S;
  else               esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL) esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Builds a new alignment from <msa> in which near-identical
 *            sequences have been removed. Identity is computed from the
 *            alignment as given. Sequences are visited in order; a
 *            sequence is dropped if its pairwise identity to any
 *            already-accepted sequence is strictly greater than <maxid>,
 *            so the earlier member of a redundant pair is the one kept.
 *            The input <msa> is not altered.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Args:      msa        - alignment to filter
 *            maxid      - identity threshold; pairs above it are redundant
 *            ret_newmsa - RETURN: the filtered alignment, as produced by
 *                         <esl_msa_SequenceSubset()>
 *
 * Return:    <eslOK> on success, and the <newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. A failing status from the
 *            pairwise identity calculation (<eslEINVAL> for corrupted
 *            sequence data) or from <esl_msa_SequenceSubset()> is
 *            passed back. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int    *kept  = NULL;   /* indices of the sequences accepted so far     */
  int    *useme = NULL;   /* useme[i] is TRUE if seq i goes to new msa    */
  int     nkept = 0;      /* number of entries in kept[]                  */
  double  pid;            /* pairwise fractional identity                 */
  int     idx;            /* candidate sequence                           */
  int     k;              /* position in kept[]                           */
  int     status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  ESL_ALLOC(kept,  sizeof(int) * msa->nseq);
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);
  esl_vec_ISet(useme, msa->nseq, 0);

  for (idx = 0; idx < msa->nseq; idx++)
    {
      /* Compare the candidate to each accepted seq; stop at the first
       * one that is too similar.
       */
      for (k = 0; k < nkept; k++)
        {
          if (msa->flags & eslMSA_DIGITAL)
            {
#ifdef eslAUGMENT_ALPHABET
              status = esl_dst_XPairId(msa->abc, msa->ax[idx], msa->ax[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
#endif
            }
          else
            {
              status = esl_dst_CPairId(msa->aseq[idx], msa->aseq[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }

          if (pid > maxid) break;
        }

      /* The inner loop ran to completion only if nothing was too similar. */
      if (k == nkept)
        {
          kept[nkept] = idx;
          nkept++;
          useme[idx] = TRUE;
        }
    }

  status = esl_msa_SequenceSubset(msa, useme, ret_newmsa);
  if (status != eslOK) goto ERROR;

  free(kept);
  free(useme);
  return eslOK;

 ERROR:
  free(kept);     /* free(NULL) is a no-op */
  free(useme);
  return status;
}
/* Function:  esl_dst_XAverageId()
 * Synopsis:  Calculate avg identity for digital MSA
 * Incept:    SRE, Fri May 18 15:19:14 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a digital multiple sequence alignment <ax>, consisting of <N>
 *            aligned digital sequences of identical length.
 *
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs. The sample is drawn from a time-seeded
 *            generator, so results in that regime are not reproducible
 *            from run to run.
 *
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$)
 *            is calculated using <esl_dst_XPairId()>.
 *
 *            If <N> is 0 or 1, the average is defined as 1.0.
 *
 * Args:      abc             - digital alphabet in use
 *            ax              - aligned digital sequences, [0..N-1]
 *            N               - number of aligned sequences
 *            max_comparisons - upper bound on pairwise comparisons;
 *                              should be >= 1 (with <N> > 1 and a value
 *                              < 1, no pair is compared and the average
 *                              is 0/0)
 *            ret_id          - RETURN: average fractional identity
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
{
  int    status;
  double id;
  double sum = 0.;     /* running total of pairwise identities; must start at zero */
  int    i,j,n;

  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is N small enough that we can average over all pairwise comparisons?
   * Watch out for numerical overflow in this: Pfam N's easily overflow
   * when squared. The first two tests bound N before N*(N-1) is formed.
   */
  if (N <= max_comparisons &&
      N <= sqrt(2. * max_comparisons) &&
      (N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
        for (j = i+1; j < N; j++)
          {
            if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
            sum += id;
          }
      sum /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else
    {
      ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded();

      /* A NULL generator is treated as an allocation failure. */
      if (r == NULL) return eslEMEM;

      for (n = 0; n < max_comparisons; n++)
        {
          do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */

          if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK)
            {
              esl_randomness_Destroy(r);   /* don't leak the generator on failure */
              return status;
            }
          sum += id;
        }
      sum /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = sum;
  return eslOK;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *msafile = esl_opt_GetArg(go, 1); ESL_ALPHABET *abc = NULL; int infmt = eslMSAFILE_UNKNOWN; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; FILE *ofp = stdout; int nali = 0; int namewidth; double pid; int nid, n; int i,j; int status; /* allow user to assert the input MSA alphabet */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* allow user to assert the input MSA format */ if (esl_opt_IsOn(go, "--informat") && (infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat")); /* digital open */ if ( ( status = eslx_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ((status = eslx_msafile_Read(afp, &msa)) == eslOK) { nali++; namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq); for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { esl_dst_XPairId(abc, msa->ax[i], msa->ax[j], &pid, &nid, &n); fprintf(ofp, "%-*s %-*s %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n); } esl_msa_Destroy(msa); } if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); eslx_msafile_Close(afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* utest_XPairId()
 * Unit test for esl_dst_XPairId(). Takes the same alignment in text form
 * <as> and digital form <ax>, <N> sequences each, and abort()s at the
 * first check that fails:
 *   - seq 0 against itself, and against seq 1, must be 100% identical;
 *   - seq 0 against seq 2 must have no identities;
 *   - for all pairs among seqs 3..N-1 (self pairs included), the digital
 *     result must be in range and equal to the esl_dst_CPairId() result;
 *   - NULL output arguments must be accepted.
 * Returns eslOK if every check passes.
 */
static int
utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  double xpid, cpid;       /* fractional identity: digital, text     */
  int    xnid, cnid;       /* identity counts:     digital, text     */
  int    xnres, cnres;     /* residue counts:      digital, text     */
  int    dsqlen, txtlen;   /* length of seq 0 in each representation */
  int    a, b;

  dsqlen = esl_abc_dsqlen(ax[0]);
  txtlen = strlen(as[0]);
  if (dsqlen != txtlen) abort();

  /* Self comparison gives identity = 1. */
  if (esl_dst_XPairId(abc, ax[0], ax[0], &xpid, &xnid, &xnres) != eslOK) abort();
  if (xpid  != 1.0)    abort();
  if (xnid  != txtlen) abort();
  if (xnres >  dsqlen) abort();

  /* So does 0,1 comparison */
  if (esl_dst_XPairId(abc, ax[0], ax[1], &xpid, &xnid, &xnres) != eslOK) abort();
  if (xpid  != 1.0)    abort();
  if (xnid  != txtlen) abort();
  if (xnres >  txtlen) abort();

  /* 0,2 comparison gives 0.0, 0 */
  if (esl_dst_XPairId(abc, ax[0], ax[2], &xpid, &xnid, &xnres) != eslOK) abort();
  if (xpid  != 0.0)    abort();
  if (xnid  != 0)      abort();
  if (xnres >  txtlen) abort();

  /* remaining comparisons shouldn't fail, and should be identical to text mode */
  for (a = 3; a < N; a++)
    {
      for (b = a; b < N; b++)
        {
          if (esl_dst_XPairId(abc, ax[a], ax[b], &xpid, &xnid, &xnres) != eslOK) abort();
          if (esl_dst_CPairId(as[a],  as[b],     &cpid, &cnid, &cnres) != eslOK) abort();

          /* range checks on the digital result */
          if (xpid < 0. || xpid > 1.)     abort();
          if (xnid < 0  || xnid > txtlen) abort();
          if (xnres > txtlen)             abort();

          /* agreement with text mode */
          if (xpid  != cpid)  abort();
          if (xnid  != cnid)  abort();
          if (xnres != cnres) abort();
        }
    }

  /* API should accept NULL for return values */
  if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();

  return eslOK;
}
static int msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link) { ESL_DSQ *ax1 = *(ESL_DSQ **) v1; ESL_DSQ *ax2 = *(ESL_DSQ **) v2; struct msa_param_s *param = (struct msa_param_s *) p; double pid; int status = eslOK; #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION) pid = 1. - squid_xdistance(param->abc, ax1, ax2); #else if ( (status = esl_dst_XPairId(param->abc, ax1, ax2, &pid, NULL, NULL)) != eslOK) return status; #endif *ret_link = (pid >= param->maxid ? TRUE : FALSE); return status; }