/* utest_XDiffMx()
 *
 * Unit test: build one pairwise difference matrix from the digital
 * sequences <ax> (with alphabet <abc>) using esl_dst_XDiffMx(), and a
 * second one from the text sequences <as> using esl_dst_CDiffMx(),
 * for the same <N> sequences. Every upper-triangle element (diagonal
 * included) of the digital matrix is compared against the transposed
 * element of the text matrix; they must agree to within 0.01.
 *
 * Returns <eslOK> on success. Any failure calls abort().
 */
static int
utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *dig_mx = NULL;   /* matrix computed from digital seqs */
  ESL_DMATRIX *txt_mx = NULL;   /* matrix computed from text seqs    */
  int          row;
  int          col;
  double       delta;

  if (esl_dst_XDiffMx(abc, ax, N, &dig_mx) != eslOK) abort();
  if (esl_dst_CDiffMx(as, N, &txt_mx)      != eslOK) abort();

  for (row = 0; row < N; row++)
    {
      for (col = row; col < N; col++)
        {
          /* Comparing [row][col] against [col][row] exercises both halves. */
          delta = fabs(dig_mx->mx[row][col] - txt_mx->mx[col][row]);
          if (delta > 0.01) abort();
        }
    }

  esl_dmatrix_Destroy(dig_mx);
  esl_dmatrix_Destroy(txt_mx);
  return eslOK;
}
/* utest_CDiffMx()
 *
 * Unit test for esl_dst_CDiffMx() on the <N> text sequences in <as>.
 * Two properties of the resulting matrix are checked:
 *   1. every diagonal element is exactly 0.0;
 *   2. the mean of the upper-triangle elements among sequences 3..N-1
 *      lies in [0.65, 0.85]. The first three sequences are left out of
 *      the average ("first 3 don't count"); the expected mean is 0.75.
 *
 * Returns <eslOK> on success. Any failure calls abort().
 */
static int
utest_CDiffMx(char **as, int N)
{
  ESL_DMATRIX *dmx = NULL;
  double       mean;
  int          npairs;
  int          a;
  int          b;

  if (esl_dst_CDiffMx(as, N, &dmx) != eslOK) abort();

  /* Self-comparisons must have zero difference. */
  for (a = 0; a < N; a++)
    {
      if (dmx->mx[a][a] != 0.0) abort();
    }

  /* Accumulate the off-diagonal upper triangle, skipping the first three seqs. */
  mean = 0.;
  for (a = 3; a < N; a++)
    {
      for (b = a + 1; b < N; b++)
        {
          mean += dmx->mx[a][b];
        }
    }

  npairs = (N-3) * (N-4) / 2;        /* number of pairs among the N-3 counted seqs */
  mean  /= (double) npairs;

  if (mean < 0.65 || mean > 0.85) abort();   /* should be 0.75 */

  esl_dmatrix_Destroy(dmx);
  return eslOK;
}
/* Function:  esl_msaweight_GSC()
 * Synopsis:  GSC weights.
 * Incept:    SRE, Fri Nov  3 13:31:14 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa>, calculate
 *            sequence weights according to the
 *            Gerstein/Sonnhammer/Chothia algorithm. These weights
 *            are stored internally in the <msa> object, replacing
 *            any weights that may have already been there. Weights
 *            are $\geq 0$ and they sum to <msa->nseq>.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that distance calculations
 *            used by the GSC algorithm are robust against degenerate
 *            residue symbols.
 *
 *            This is an implementation of Gerstein et al., "A method to
 *            weight protein sequences to correct for unequal
 *            representation", JMB 236:1067-1078, 1994.
 *
 *            The algorithm is $O(N^2)$ memory (it requires a pairwise
 *            distance matrix) and $O(N^3 + LN^2)$ time ($N^3$ for a UPGMA
 *            tree building step, $LN^2$ for distance matrix construction)
 *            for an alignment of N sequences and L columns.
 *
 *            In the current implementation, the actual memory
 *            requirement is dominated by two full NxN distance
 *            matrices (one tmp copy in UPGMA, and one here): for
 *            8-byte doubles, that's $16N^2$ bytes. To keep the
 *            calculation under memory limits, don't process large
 *            alignments: max 1400 sequences for 32 MB, max 4000
 *            sequences for 256 MB, max 8000 seqs for 1 GB. Watch
 *            out, because Pfam alignments can easily blow this up.
 *
 *            A single-sequence alignment is a special case: its one
 *            weight is set to 1.0 and the function returns at once.
 *
 * Note:      Memory usage could be improved. UPGMA consumes a distance
 *            matrix, but that can be D itself, not a copy, if the
 *            caller doesn't mind the destruction of D. Also, D is
 *            symmetrical, so we could use upper or lower triangular
 *            matrices if we rewrote dmatrix to allow them.
 *
 *            I also think UPGMA can be reduced to O(N^2) time, by
 *            being more tricky about rapidly identifying the minimum
 *            element: could keep min of each row, and update that,
 *            I think.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEINVAL> if the alignment data are somehow invalid and
 *            distance matrices can't be calculated. <eslEMEM> on an
 *            allocation error. Any non-OK status from the distance
 *            matrix, tree building, or clade size steps is passed
 *            back to the caller. In every such case, the original
 *            <msa> is left unmodified.
 *
 * Xref:      [Gerstein94]; squid::weight.c::GSCWeights(); STL11/81.
 */
int
esl_msaweight_GSC(ESL_MSA *msa)
{
  ESL_DMATRIX *D = NULL;     /* distance matrix                               */
  ESL_TREE    *T = NULL;     /* UPGMA tree                                    */
  double      *x = NULL;     /* storage per node, 0..N-2                      */
  double       lw, rw;       /* total branchlen on left, right subtrees       */
  double       lx, rx;       /* distribution of weight to left, right side    */
  int          i;            /* counter over nodes                            */
  int          status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );
  ESL_DASSERT1( (msa->wgt  != NULL) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* GSC weights use a rooted tree with "branch lengths" calculated by
   * UPGMA on a fractional difference matrix - pretty crude.
   *
   * NOTE(review): if <msa> is flagged digital but eslAUGMENT_ALPHABET is
   * not defined, neither branch runs and D stays NULL going into tree
   * construction. Presumably a digital msa cannot exist in such a build;
   * confirm.
   */
  if (! (msa->flags & eslMSA_DIGITAL)) {
    if ((status = esl_dst_CDiffMx(msa->aseq, msa->nseq, &D)) != eslOK) goto ERROR;
  }
#ifdef eslAUGMENT_ALPHABET
  else {
    if ((status = esl_dst_XDiffMx(msa->abc, msa->ax, msa->nseq, &D)) != eslOK) goto ERROR;
  }
#endif

  /* oi, look out here.  UPGMA is correct, but old squid library uses
   * single linkage, so for regression tests ONLY, we use single link.
   */
#ifdef  eslMSAWEIGHT_REGRESSION
  if ((status = esl_tree_SingleLinkage(D, &T)) != eslOK) goto ERROR;
#else
  if ((status = esl_tree_UPGMA(D, &T))         != eslOK) goto ERROR;
#endif

  /* The zero-branch-length special case below reads T->cladesize[], so a
   * failure here must not be ignored. msa->wgt has not been touched yet,
   * so bailing out still leaves <msa> unmodified.
   */
  if ((status = esl_tree_SetCladesizes(T))     != eslOK) goto ERROR;

  ESL_ALLOC(x, sizeof(double) * (T->N-1));

  /* Postorder traverse (leaves to root) to calculate the total branch
   * length under each internal node; store this in x[]. When the loop
   * finishes, x[0] is the total branch length of the whole tree (it is
   * not saved anywhere; x[0] is reset below).
   */
  for (i = T->N-2; i >= 0; i--)
    {
      x[i] = T->ld[i] + T->rd[i];
      if (T->left[i]  > 0) x[i] += x[T->left[i]];
      if (T->right[i] > 0) x[i] += x[T->right[i]];
    }

  /* Preorder traverse (root to leaves) to calculate the weights.  Now
   * we use x[] to mean, the total weight *above* this node that we will
   * apportion to the node's left and right children. The two
   * meanings of x[] never cross: every x[] beneath x[i] is still a
   * total branch length.
   *
   * Because the API guarantees that msa is returned unmodified in case
   * of an exception, and we're touching msa->wgt here, no exceptions
   * may be thrown from now on in this function.
   */
  x[0] = 0;                     /* initialize: no branch to the root. */
  for (i = 0; i <= T->N-2; i++)
    {
      lw = T->ld[i];   if (T->left[i]  > 0) lw += x[T->left[i]];
      rw = T->rd[i];   if (T->right[i] > 0) rw += x[T->right[i]];

      if (lw+rw == 0.)
        {
          /* A special case arises in GSC weights when all branch lengths in a subtree are 0.
           * In this case, all seqs in this clade should get equal weights, sharing x[i] equally.
           * So, split x[i] in proportion to cladesize, not to branch weight.
           */
          if (T->left[i] > 0)  lx = x[i] * ((double) T->cladesize[T->left[i]]  / (double) T->cladesize[i]);
          else                 lx = x[i] / (double) T->cladesize[i];

          if (T->right[i] > 0) rx = x[i] * ((double) T->cladesize[T->right[i]] / (double) T->cladesize[i]);
          else                 rx = x[i] / (double) T->cladesize[i];
        }
      else /* normal case: x[i] split in proportion to branch weight. */
        {
          lx = x[i] * lw/(lw+rw);
          rx = x[i] * rw/(lw+rw);
        }

      /* A child index <= 0 is a leaf: its negation indexes msa->wgt[].
       * A child index > 0 is an internal node: hand the weight down via x[].
       */
      if (T->left[i]  <= 0) msa->wgt[-(T->left[i])] = lx + T->ld[i];
      else                  x[T->left[i]]           = lx + T->ld[i];

      if (T->right[i] <= 0) msa->wgt[-(T->right[i])] = rx + T->rd[i];
      else                  x[T->right[i]]           = rx + T->rd[i];
    }

  /* Renormalize weights to sum to N. */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(x);
  esl_tree_Destroy(T);
  esl_dmatrix_Destroy(D);
  return eslOK;

 ERROR:
  free(x);                              /* free(NULL) is a no-op */
  if (T != NULL) esl_tree_Destroy(T);
  if (D != NULL) esl_dmatrix_Destroy(D);
  return status;
}