Ejemplo n.º 1
0
/* Function:  esl_dst_XPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
 *            of <N> aligned digital sequences in alphabet <abc>; calculate
 *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
 *            Diagonal elements are set to 1.0 without calling
 *            <esl_dst_XPairId()>.
 *
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]
 *            N     - number of aligned sequences
 *            ret_S - RETURN: NxN matrix of fractional identities;
 *                    may be <NULL>, in which case the matrix is
 *                    computed and then freed.
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the identity
 *            matrix. Caller is obligated to free <S> with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if the matrix cannot be allocated.
 *            The failure status of <esl_dst_XPairId()> if a pairwise
 *            comparison fails (<eslEINVAL> if a seq has a different
 *            length than others). On failure, <ret_S> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc,  ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
  ESL_DMATRIX *S      = NULL;
  int          status = eslOK;
  int          i,j;

  /* An allocation failure must set <status> before jumping to ERROR;
   * otherwise the function would return an indeterminate value.
   */
  if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
        {
          /* Fill the upper triangle, then mirror it to keep S symmetric. */
          status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
          if (status != eslOK)
            ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
          S->mx[j][i] =  S->mx[i][j];
        }
    }
  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL)  esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
Ejemplo n.º 2
0
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Builds a new alignment from <msa> in which no retained
 *            sequence has pairwise identity greater than <maxid> to any
 *            previously retained sequence. Identity is calculated
 *            *based on the alignment*. Sequences are visited in order,
 *            so the earlier member of a near-identical pair is kept and
 *            the later one is discarded. The input <msa> is not changed.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Return:    <eslOK> on success, and the new alignment in <ret_newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int     *kept   = NULL;   /* indices of the sequences retained so far     */
  int     *useme  = NULL;   /* useme[i] is TRUE if seq i goes in new msa    */
  int      nkept  = 0;      /* number of entries currently in <kept>        */
  double   pid;             /* pairwise fractional identity of current pair */
  int      idx;             /* index of the candidate sequence              */
  int      k;               /* index into <kept>                            */
  int      is_dup;          /* TRUE once candidate exceeds <maxid> vs. kept */
  int      status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  /* Working arrays; <useme> starts out all zero. */
  ESL_ALLOC(kept,  sizeof(int) * msa->nseq);
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);
  esl_vec_ISet(useme, msa->nseq, 0);

  /* Compare each candidate against the kept set, stopping at the
   * first kept sequence it is too similar to.
   */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      is_dup = FALSE;
      k      = 0;
      while (k < nkept && !is_dup)
        {
          if (msa->flags & eslMSA_DIGITAL)
            {
              /* NOTE(review): without eslAUGMENT_ALPHABET this branch is
               * empty, so <pid> is not computed for this pair.
               */
#ifdef eslAUGMENT_ALPHABET
              status = esl_dst_XPairId(msa->abc, msa->ax[idx], msa->ax[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
#endif
            }
          else
            {
              status = esl_dst_CPairId(msa->aseq[idx], msa->aseq[kept[k]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }

          if (pid > maxid) is_dup = TRUE;
          k++;
        }

      if (!is_dup)
        {
          useme[idx]    = TRUE;
          kept[nkept++] = idx;
        }
    }

  /* Extract the flagged sequences into the new alignment. */
  status = esl_msa_SequenceSubset(msa, useme, ret_newmsa);
  if (status != eslOK) goto ERROR;

  free(kept);
  free(useme);
  return eslOK;

 ERROR:
  free(kept);    /* free(NULL) is a no-op */
  free(useme);
  return status;
}
Ejemplo n.º 3
0
/* Function:  esl_dst_XAverageId()
 * Synopsis:  Calculate avg identity for digital MSA 
 * Incept:    SRE, Fri May 18 15:19:14 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a digital multiple sequence alignment <ax>, consisting of <N>
 *            aligned digital sequences of identical length.
 *            
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs. The sample is drawn from a generator made by
 *            <esl_randomness_CreateTimeseeded()>, so presumably the
 *            sampled result differs from run to run.
 *            
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$)
 *            is calculated using <esl_dst_XPairId()>.
 *
 *            If <N> is 0 or 1, the average is defined as 1.0.
 *
 * Args:      abc             - digital alphabet in use
 *            ax              - aligned digital sequences, [0..N-1]
 *            N               - number of aligned sequences
 *            max_comparisons - max number of pairwise comparisons to do;
 *                              should be >= 1 (for N >= 2 and a value
 *                              <= 0, no pairs are sampled and the
 *                              average is 0/0)
 *            ret_id          - RETURN: average fractional identity
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't 
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
{
  ESL_RANDOMNESS *r   = NULL;   /* only created for the sampling branch         */
  double          id;
  double          sum = 0.;     /* must start at zero: it accumulates all <id>s */
  int             i,j,n;
  int             status;
  
  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is N small enough that we can average over all pairwise comparisons? 
     watch out for numerical overflow in this: Pfam N's easily overflow when squared.
     The first two tests short-circuit before N*(N-1) is ever evaluated for large N.
   */
  if (N <= max_comparisons &&
      N <= sqrt(2. * max_comparisons) &&
      (N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
        for (j = i+1; j < N; j++)
          {
            if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) goto ERROR;
            sum += id;
          }
      sum /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else				
    {
      if ((r = esl_randomness_CreateTimeseeded()) == NULL) { status = eslEMEM; goto ERROR; }

      for (n = 0; n < max_comparisons; n++)
        {
          do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
          if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) goto ERROR;
          sum += id;
        }
      sum /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = sum;
  return eslOK;

 ERROR:
  /* Release the generator on any failure path so it does not leak. */
  if (r != NULL) esl_randomness_Destroy(r);
  *ret_id = 0.;
  return status;
}
Ejemplo n.º 4
0
/* main()
 *
 * Opens the MSA file named by the first command line argument in digital
 * mode and, for every alignment read from it, prints one line to stdout per
 * pair of sequences (i < j): the two sequence names, the pairwise identity
 * from esl_dst_XPairId() as a percentage, and the two integer counts that
 * esl_dst_XPairId() reports alongside it (presumably the number of
 * identities and the denominator used; confirm against esl_dst_XPairId()).
 *
 * The alphabet may be asserted with --rna, --dna, or --amino, and the input
 * format with --informat; otherwise both are left for the open call to
 * determine.
 *
 * Returns 0 after all alignments were processed. Fatal conditions are
 * reported via esl_fatal(), eslx_msafile_OpenFailure(), or
 * eslx_msafile_ReadFailure().
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char         *msafile = esl_opt_GetArg(go, 1);
  ESL_ALPHABET *abc     = NULL;
  int           infmt   = eslMSAFILE_UNKNOWN;
  ESLX_MSAFILE *afp     = NULL;
  ESL_MSA      *msa     = NULL;
  FILE         *ofp     = stdout;
  int           nali    = 0;
  int           namewidth;
  double        pid;
  int           nid, n;
  int           i,j;
  int           status;

  /* allow user to assert the input MSA alphabet */
  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); 

  /* allow user to assert the input MSA format */
  if (esl_opt_IsOn(go, "--informat") &&
      (infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));

  /* digital open */
  if ( ( status = eslx_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ((status = eslx_msafile_Read(afp, &msa)) == eslOK)
    {	
      nali++;

      /* column width for names, so the output lines up */
      namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq);

      for (i = 0; i < msa->nseq; i++)
        for (j = i+1; j < msa->nseq; j++)
          {
            /* A failed comparison may leave pid/nid/n unset, so never print them unchecked. */
            if ((status = esl_dst_XPairId(abc, msa->ax[i], msa->ax[j], &pid, &nid, &n)) != eslOK)
              esl_fatal("Pairwise identity calculation failed for seqs %d,%d of alignment %d (status %d)", i, j, nali, status);
            fprintf(ofp, "%-*s %-*s %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n);
          }

      esl_msa_Destroy(msa);
    }
  /* The read loop must end on EOF after at least one alignment; anything else is a read failure. */
  if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); 

  eslx_msafile_Close(afp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Ejemplo n.º 5
0
/* utest_XPairId()
 *
 * Unit test for esl_dst_XPairId() on <N> aligned sequences given both in
 * text form <as> and digital form <ax>. Expects sequences 0 and 1 to be
 * identical and sequences 0 and 2 to share no identities; sequences 3..N-1
 * are compared pairwise (including self comparisons) and the digital result
 * must match what esl_dst_CPairId() gives on the text sequences.
 *
 * Calls abort() on any failed expectation; returns <eslOK> otherwise.
 */
static int 
utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  double xpid, cpid;    /* fractional identity: digital mode, text mode */
  int    xnid, cnid;    /* second result of each call (nid)             */
  int    xn,   cn;      /* third result of each call (nres)             */
  int    dlen, tlen;    /* length of seq 0: digital, text               */
  int    a, b;

  /* Digital and text versions of seq 0 must agree on length. */
  dlen = esl_abc_dsqlen(ax[0]);
  tlen = strlen(as[0]); 
  if (dlen != tlen) abort();

  /* Seq 0 against itself: identity is exactly 1. */
  if (esl_dst_XPairId(abc, ax[0], ax[0], &xpid, &xnid, &xn) != eslOK) abort();
  if (xpid != 1.0)  abort();
  if (xnid != tlen) abort();
  if (xn   >  dlen) abort();

  /* Seq 0 against seq 1: also exactly 1. */
  if (esl_dst_XPairId(abc, ax[0], ax[1], &xpid, &xnid, &xn) != eslOK) abort();
  if (xpid != 1.0)  abort();
  if (xnid != tlen) abort();
  if (xn   >  tlen) abort();

  /* Seq 0 against seq 2: no identities at all. */
  if (esl_dst_XPairId(abc, ax[0], ax[2], &xpid, &xnid, &xn) != eslOK) abort();
  if (xpid != 0.0)  abort();
  if (xnid != 0)    abort();
  if (xn   >  tlen) abort();
  
  /* Everything from seq 3 on: must succeed, stay in range, and agree with text mode. */
  for (a = 3; a < N; a++)
    {
      for (b = a; b < N; b++)
        {
          if (esl_dst_XPairId(abc, ax[a], ax[b], &xpid, &xnid, &xn) != eslOK) abort();
          if (esl_dst_CPairId(as[a], as[b], &cpid, &cnid, &cn)      != eslOK) abort();

          if (xpid < 0. || xpid > 1.)    abort();
          if (xnid < 0  || xnid > tlen)  abort();
          if (xn > tlen)                 abort();

          if (xpid != cpid) abort();
          if (xnid != cnid) abort();
          if (xn   != cn)   abort();
        }
    }

  /* All three result pointers are optional. */
  if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();  
  return eslOK;

}
Ejemplo n.º 6
0
static int
msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link)
{
  ESL_DSQ *ax1              = *(ESL_DSQ **) v1;
  ESL_DSQ *ax2              = *(ESL_DSQ **) v2;
  struct msa_param_s *param = (struct msa_param_s *) p;
  double   pid;
  int      status = eslOK;

#if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
  pid = 1. - squid_xdistance(param->abc, ax1, ax2);
#else  
  if ( (status = esl_dst_XPairId(param->abc, ax1, ax2, &pid, NULL, NULL)) != eslOK) return status;
#endif

  *ret_link = (pid >= param->maxid ? TRUE : FALSE); 
  return status;
}