/* Each unit test is given an alignment with certain known
 * properties:
 *    seqs 0,1 are identical
 *    seqs 0,2 are completely different
 *    seqs 3..N are random
 * The alignment may contain gaps, so don't assume that the 
 * # of compared residues == alignment length. The alignment
 * contains only canonical residues, because one of our tests
 * is that C and X functions give the same results.
 */
/* utest_CPairId()
 *
 * Exercises esl_dst_CPairId() on the text-mode alignment <as> of <N>
 * sequences. The self pair, the identical pair (0,1) and the
 * all-different pair (0,2) must report the expected identity; every
 * pair drawn from seqs 3..N-1 must give in-range results; and NULL
 * result pointers must be accepted.
 *
 * Calls abort() on the first failed check; returns <eslOK> otherwise.
 */
static int 
utest_CPairId(char **as, int N)
{
  double frac;                  /* fractional identity of current pair  */
  int    nident;                /* identity count reported for the pair */
  int    ncmp;                  /* residue count reported for the pair  */
  int    alen = strlen(as[0]);  /* length of the first aligned string   */
  int    a, b;

  /* A sequence compared against itself is 100% identical. */
  if (esl_dst_CPairId(as[0], as[0], &frac, &nident, &ncmp) != eslOK) abort();
  if (frac   != 1.0)  abort();
  if (nident != alen) abort();
  if (ncmp   >  alen) abort();

  /* Seqs 0 and 1 are identical, so the same expectations hold. */
  if (esl_dst_CPairId(as[0], as[1], &frac, &nident, &ncmp) != eslOK) abort();
  if (frac   != 1.0)  abort();
  if (nident != alen) abort();
  if (ncmp   >  alen) abort();

  /* Seqs 0 and 2 share nothing: identity 0.0, zero identical positions. */
  if (esl_dst_CPairId(as[0], as[2], &frac, &nident, &ncmp) != eslOK) abort();
  if (frac   != 0.0)  abort();
  if (nident != 0)    abort();
  if (ncmp   >  alen) abort();

  /* Every pair among the random seqs (self pairs included) must
   * succeed and stay within range.
   */
  for (a = 3; a < N; a++)
    {
      for (b = a; b < N; b++)
        {
          if (esl_dst_CPairId(as[a], as[b], &frac, &nident, &ncmp) != eslOK) abort();
          if (frac < 0. || frac > 1.)       abort();
          if (nident < 0 || nident > alen)  abort();
          if (ncmp > alen)                  abort();
        }
    }

  /* All three result pointers are optional. */
  if (esl_dst_CPairId(as[0], as[0], NULL, NULL, NULL) != eslOK) abort();

  return eslOK;
}
/* Function:  esl_dst_CPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned text sequences.
 * Incept:    SRE, Thu Apr 27 08:46:08 2006 [New York]
 *
 * Purpose:   Given a multiple sequence alignment <as>, consisting
 *            of <N> aligned character strings; calculate
 *            a symmetric fractional pairwise identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_CPairId()>, and return it in 
 *            <ret_S>. Diagonal elements are set to 1.0.
 *
 * Args:      as      - aligned seqs (all same length), [0..N-1]
 *            N       - # of aligned sequences
 *            ret_S   - optRETURN: symmetric fractional identity matrix;
 *                      if <NULL>, the matrix is computed and then freed.
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the fractional
 *            identity matrix. Caller frees <S> with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if the matrix cannot be allocated.
 *            <eslEINVAL> if a seq has a different
 *            length than others (or whatever other status the failing
 *            <esl_dst_CPairId()> call reported). On failure, <ret_S> is
 *            returned <NULL> and state of inputs is unchanged.
 */
int
esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **ret_S)
{
  ESL_DMATRIX *S = NULL;
  int status;
  int i,j;

  /* <status> must be set before jumping to ERROR; otherwise an
   * allocation failure would return an indeterminate value.
   */
  if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
  
  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
	{
	  /* Compute the upper triangle, then mirror it to keep S symmetric. */
	  status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL);
	  if (status != eslOK)
	    ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
	  S->mx[j][i] =  S->mx[i][j];
	}
    }
  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL)  esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
/* Example #3 (0) */
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 * 
 * Purpose:   Builds a new alignment from <msa> in which no retained
 *            sequence has pairwise identity greater than <maxid> to
 *            any earlier retained sequence. Identity is computed on
 *            the aligned sequences as given. Sequences are examined
 *            in order, so of two near-identical sequences the earlier
 *            one is kept and the later one is dropped. The input
 *            <msa> is not changed.
 *           
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *           
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *           
 * Return:    <eslOK> on success, and the new alignment in <ret_newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence 
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int     *kept      = NULL;    /* indices of seqs accepted so far, [0..nkept-1] */
  int     *keep_flag = NULL;    /* per-seq mask: TRUE if seq goes into new msa   */
  int      nkept     = 0;       /* number of seqs accepted so far                */
  double   pid;                 /* pairwise identity of candidate vs. kept seq   */
  int      cand;                /* index of the candidate seq                    */
  int      k;                   /* index into kept[]                             */
  int      redundant;           /* TRUE if candidate is too similar to a kept seq */
  int      status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  /* Working arrays; the mask starts out all zero. */
  ESL_ALLOC(kept,      sizeof(int) * msa->nseq);
  ESL_ALLOC(keep_flag, sizeof(int) * msa->nseq);
  esl_vec_ISet(keep_flag, msa->nseq, 0);

  /* Greedy pass: compare each candidate against the seqs already
   * accepted, and stop at the first one that exceeds <maxid>.
   */
  for (cand = 0; cand < msa->nseq; cand++)
    {
      redundant = FALSE;

      for (k = 0; k < nkept; k++)
        {
          if (msa->flags & eslMSA_DIGITAL)
            {
#ifdef eslAUGMENT_ALPHABET
              status = esl_dst_XPairId(msa->abc, msa->ax[cand], msa->ax[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
#endif
            }
          else
            {
              status = esl_dst_CPairId(msa->aseq[cand], msa->aseq[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }

          if (pid > maxid)
            {
              redundant = TRUE;
              break;
            }
        }

      if (redundant) continue;

      kept[nkept]     = cand;
      keep_flag[cand] = TRUE;
      nkept++;
    }

  /* Extract the flagged seqs. Success and failure share the cleanup
   * below; <status> carries the outcome either way.
   */
  status = esl_msa_SequenceSubset(msa, keep_flag, ret_newmsa);

 ERROR:
  free(kept);
  free(keep_flag);
  return status;
}
/* Function:  esl_dst_CAverageId()
 * Synopsis:  Calculate avg identity for multiple alignment
 * Incept:    SRE, Fri May 18 15:02:38 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a multiple sequence alignment <as>, consisting of <N>
 *            aligned character sequences of identical length.
 *            
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs.
 *            
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$
 *            is calculated using <esl_dsq_CPairId()>.
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't 
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id)
{
  int    status;
  double id;
  double sum;
  int    i,j,n;
  
  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is nseq small enough that we can average over all pairwise comparisons? */
  if ((N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
	for (j = i+1; j < N; j++)
	  {
	    if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
	    sum += id;
	  }
      id /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else				
    {
      ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded();

      for (n = 0; n < max_comparisons; n++)
	{
	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
	  if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
	  sum += id;
	}
      id /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = id;
  return eslOK;
}
/* Example #5 (0) */
/* Linkage rule for text-mode aligned sequences.
 *
 * <v1> and <v2> each point to a <char *> aligned sequence, and <p>
 * points to a <double> identity threshold. <*ret_link> is set to TRUE
 * if the pair's fractional identity is >= that threshold, else FALSE.
 *
 * Returns <eslOK> on success. If <esl_dst_CPairId()> fails, its status
 * is returned and <*ret_link> is left untouched.
 */
static int
msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link)
{
  char  *seq_a     = *(char **) v1;
  char  *seq_b     = *(char **) v2;
  double threshold = *(double *) p;
  double frac_id;
  int    status    = eslOK;

#if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
  /* Regression builds derive identity from squid_distance() instead. */
  frac_id = 1. - squid_distance(seq_a, seq_b);
#else  
  status = esl_dst_CPairId(seq_a, seq_b, &frac_id, NULL, NULL);
  if (status != eslOK) return status;
#endif

  if (frac_id >= threshold) *ret_link = TRUE;
  else                      *ret_link = FALSE;

  return status;
}
/* utest_XPairId()
 *
 * Exercises esl_dst_XPairId() on the digital alignment <ax> and checks
 * it against the text-mode alignment <as> of the same <N> sequences.
 * The self pair, the identical pair (0,1) and the all-different pair
 * (0,2) must report the expected identity. Every pair drawn from seqs
 * 3..N-1 must give in-range results that match esl_dst_CPairId()
 * exactly. NULL result pointers must be accepted.
 *
 * Calls abort() on the first failed check; returns <eslOK> otherwise.
 */
static int 
utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  double xfrac, cfrac;     /* fractional identity: digital, text      */
  int    xident, cident;   /* identity counts:     digital, text      */
  int    xcmp, ccmp;       /* residue counts:      digital, text      */
  int    dlen;             /* length of first digital sequence        */
  int    alen;             /* length of first text sequence           */
  int    a, b;

  /* The two representations of seq 0 must agree on length. */
  dlen = esl_abc_dsqlen(ax[0]);
  alen = strlen(as[0]); 
  if (dlen != alen) abort();

  /* A sequence compared against itself is 100% identical. */
  if (esl_dst_XPairId(abc, ax[0], ax[0], &xfrac, &xident, &xcmp) != eslOK) abort();
  if (xfrac  != 1.0)  abort();
  if (xident != alen) abort();
  if (xcmp   >  dlen) abort();

  /* Seqs 0 and 1 are identical, so the same expectations hold. */
  if (esl_dst_XPairId(abc, ax[0], ax[1], &xfrac, &xident, &xcmp) != eslOK) abort();
  if (xfrac  != 1.0)  abort();
  if (xident != alen) abort();
  if (xcmp   >  alen) abort();

  /* Seqs 0 and 2 share nothing: identity 0.0, zero identical positions. */
  if (esl_dst_XPairId(abc, ax[0], ax[2], &xfrac, &xident, &xcmp) != eslOK) abort();
  if (xfrac  != 0.0)  abort();
  if (xident != 0)    abort();
  if (xcmp   >  alen) abort();
  
  /* Random seqs: digital results must be in range and must equal
   * the text-mode results.
   */
  for (a = 3; a < N; a++)
    {
      for (b = a; b < N; b++)
        {
          if (esl_dst_XPairId(abc, ax[a], ax[b], &xfrac, &xident, &xcmp) != eslOK) abort();
          if (esl_dst_CPairId(as[a], as[b], &cfrac, &cident, &ccmp)      != eslOK) abort();

          if (xfrac < 0. || xfrac > 1.)      abort();
          if (xident < 0 || xident > alen)   abort();
          if (xcmp > alen)                   abort();

          if (xfrac  != cfrac)  abort();
          if (xident != cident) abort();
          if (xcmp   != ccmp)   abort();
        }
    }

  /* All three result pointers are optional. */
  if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();  

  return eslOK;
}