Esempio n. 1
0
/* Function:  esl_wuss2ct()
 * Incept:    SRE, Tue Feb 15 08:44:54 2005 [St. Louis]
 *
 * Purpose:   Given a secondary structure string <ss>, <0..len-1>,
 *            in WUSS notation, convert it to a CT array, <1..len>,
 *            in <ct>. Caller provides a <ct> allocated for at least
 *            <len+1> ints. <ct[i]> is the position that residue i
 *            base pairs to, or 0 if i is unpaired. <ct[0]> is undefined
 *            (but if you care: it is set to 0).
 *
 *            WUSS notation is interpreted loosely here, as input
 *            WUSS.  Any matching bracket pair or upper/lower case
 *            alphabetic pair is interpreted as a base pair; any other
 *            WUSS annotation is interpreted as unpaired.
 *
 *            <ct[0..len]> is zeroed before parsing starts, so after a
 *            failure <ct> holds only the pairs that were resolved
 *            before the failure was detected.
 *
 * Returns:   <eslOK> on success. Returns <eslESYNTAX> if the WUSS
 *            string isn't valid: an unprintable or unrecognized
 *            character, a closing symbol with no open partner, a
 *            mismatched bracket pair, or an opening symbol that is
 *            never closed.
 *
 * Throws:    <eslEMEM> if a stack cannot be created. A failure code
 *            returned by esl_stack_IPush() is passed through unchanged.
 */
int 
esl_wuss2ct(char *ss, int len, int *ct)
{
    ESL_STACK *pda[27];          /* 1 secondary structure + up to 26 levels of pk's */
    int        i;
    int        pos, pair;
    int        c;                /* current annotation character, as an unsigned char value */
    int        status = eslEMEM; /* failure 'til proven otherwise */

    /* Null every stack slot before anything can jump to FINISH, so the
     * cleanup loop never looks at an indeterminate pointer. The main
     * pda (0) is created right away; pk pda's are created on demand.
     */
    for (i = 0; i <= 26; i++) pda[i] = NULL;
    for (pos = 0; pos <= len; pos++) ct[pos] = 0;

    if ((pda[0] = esl_stack_ICreate()) == NULL) goto FINISH;

    for (pos = 1; pos <= len; pos++)
    {
        /* <ctype.h> functions need an unsigned char value (or EOF);
         * a plain char may be negative.
         */
        c = (unsigned char) ss[pos-1];

        if (!isprint(c))  /* armor against garbage */
        { status = eslESYNTAX; goto FINISH; }

        /* left side of a pair: push position onto stack 0 (pos = 1..L) */
        else if (c == '<' || c == '(' || c == '[' || c == '{')
        {
            if ((status = esl_stack_IPush(pda[0], pos)) != eslOK) goto FINISH;
        }

        /* right side of a pair; resolve pair; check for agreement */
        else if (c == '>' || c == ')' || c == ']' || c == '}')
        {
            if (esl_stack_IPop(pda[0], &pair) == eslEOD)
            { status = eslESYNTAX; goto FINISH; } /* nothing open to close */
            else if ((ss[pair-1] == '<' && c != '>') ||
                     (ss[pair-1] == '(' && c != ')') ||
                     (ss[pair-1] == '[' && c != ']') ||
                     (ss[pair-1] == '{' && c != '}'))
            { status = eslESYNTAX; goto FINISH; }  /* brackets don't match */
            else
            {
                ct[pos]  = pair;
                ct[pair] = pos;
            }
        }

        /* same stuff for pseudoknots: upper case opens, lower case closes */
        else if (isupper(c)) 
        {
            i = c - 'A' + 1;
            /* isupper() is locale dependent; only A..Z map onto pda[1..26] */
            if (i < 1 || i > 26)
            { status = eslESYNTAX; goto FINISH; }

            /* Create the PK stacks on demand. */
            if (pda[i] == NULL)
            {
                if ((pda[i] = esl_stack_ICreate()) == NULL) 
                { status = eslEMEM; goto FINISH; }
            }

            if ((status = esl_stack_IPush(pda[i], pos)) != eslOK) goto FINISH;
        }
        else if (islower(c)) 
        {
            i = c - 'a' + 1;
            if (i < 1 || i > 26      ||   /* not a..z (locale dependent islower()) */
                pda[i] == NULL       ||   /* this pk letter was never opened        */
                esl_stack_IPop(pda[i], &pair) == eslEOD)
            { status = eslESYNTAX; goto FINISH; }
            else
            {
                ct[pos]  = pair;
                ct[pair] = pos;
            }
        }

        /* anything else must be one of the unpaired WUSS symbols */
        else if (strchr(":,_-.~", c) == NULL)
        { status = eslESYNTAX; goto FINISH; } /* bogus character */
    }
    status = eslOK;

FINISH:
    for (i = 0; i <= 26; i++)
    {
        if (pda[i] != NULL) 
        {
            /* Nothing should be left on the stacks after a clean parse.
             * Only downgrade a success; an earlier error code is kept.
             */
            if (status == eslOK && esl_stack_ObjectCount(pda[i]) != 0)
                status = eslESYNTAX;
            esl_stack_Destroy(pda[i]);
        }
    }
    return status;
}
Esempio n. 2
0
/* Function:  esl_ct2simplewuss()
 * Incept:    ER, Wed Aug 22 13:31:54 EDT 2012 [Janelia]
 *
 * Purpose:   Convert a CT array <ct> for <n> residues (1..n) to a simple WUSS
 *            format string <ss>. <ss> must be allocated for at least
 *            n+1 chars (+1 for the terminal NUL). 
 *
 *            This function can be used with the <ct> of a secondary
 *            structure including arbitrary pseudoknots, or for the 
 *            <ct> of a tertiary structure (say cWH, tWH, cSS,... H bonds). 
 *
 *            The string <ss> has basepairs annotated as <>, Aa, Bb, ..., Zz;
 *            unpaired bases are annotated as '.'.
 *
 *            Attempting to convert a <ct> that requires more letters
 *            than [A-Z] will return an <eslEINVAL> error.
 *
 *            Attempting to convert a <ct> that involves triplet interactions
 *            will return an <eslEINVAL> error.
 *
 * Returns:   <eslOK> on success.
 * 
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if a left partner cannot be found, or if more
 *            than 26 pseudoknot letters would be needed.
 *            <eslFAIL> if the number of pairs annotated does not match
 *            the number of pairs in <ct>.
 *            All working memory is released on every return path.
 */
int
esl_ct2simplewuss(int *ct, int n, char *ss)
{
  int        rb[26];                /* array that delimits the right bound of a pseudoknot character */
  ESL_STACK *pda    = NULL;         /* stack for "main" secondary structure */
  ESL_STACK *auxpk  = NULL;         /* aux stack for pseudoknot */
  int       *cct    = NULL;         /* copy of ct vector */
  int        leftbound, rightbound; /* left and right bound to find basepairs belonging to a given pseudoknot */
  int        xpk = 0;               /* index of the pseudoknot character in use (0 = 'A') */
  int        npk = 0;               /* number of pseudoknots */
  int        npairs = 0;            /* total number of basepairs */
  int        npairs_reached = 0;    /* number of basepairs found so far */
  int        found_partner;         /* true if we've found left partner of a given base in stack pda */
  int        i,j,k;                 /* sequence indices */
  int        x;                     /* index for pseudoknot characters */
  int        status = eslEMEM;      /* exit status 'til proven otherwise */

  /* total number of basepairs */
  for (j = 1; j <= n; j ++) { if (ct[j] > 0 && j < ct[j]) npairs ++; }
  
  /* Copy of ct; if a pseudoknotted structure, cct will be modified later.
   */
  ESL_ALLOC(cct, sizeof(int)*(n+1));
  esl_vec_ICopy(ct, (n+1), cct);
  
  /* Initialize rightbounds for all 26 pseudoknot indices */
  for (x = 0; x < 26; x ++) rb[x] = -1;

  /* init ss[] to single stranded */
  for (j = 0; j < n; j ++) { ss[j] = '.'; }  
  ss[n] = '\0'; 
 
  /* Initialization*/
  if ((pda   = esl_stack_ICreate()) == NULL) goto FINISH;
  if ((auxpk = esl_stack_ICreate()) == NULL) goto FINISH;
  
  for (j = 1; j <= n; j++)
    {
      if (cct[j] == 0)  /* unpaired: push j. */
        {
          if (esl_stack_IPush(pda, j) != eslOK) goto FINISH;
        }
      else if (cct[j] > j) /* left side of a bp: push j. */
        {
          if (esl_stack_IPush(pda, j) != eslOK) goto FINISH;
        }
      else   /* right side of a bp; main routine: find the left partner */
        {
          found_partner = FALSE;

          /* Pop back until we find the left partner of j;
           * In case this is not a nested structure, finding
           * the left partner of j will require to put bases 
           * aside into stack auxpk.
           */  
          while (esl_stack_ObjectCount(pda)) 
            {
              if (esl_stack_IPop(pda, &i) != eslOK) goto FINISH;
              
              if (cct[i] == j)  /* we found the i,j pair. */
                {
                  found_partner = TRUE;
                  npairs_reached ++;    

                  ss[i-1] = '<';
                  ss[j-1] = '>';
                  break;
                }
              
              else if (cct[i] == 0) 
                {
                  /* only originally unpaired residues are (re)marked '.';
                   * a residue whose pk pair was already resolved keeps its letter */
                  if (ct[i] == 0) ss[i-1] = '.';
                }

              else /* cct[i]>0, != j: i is paired, but not to j: pseudoknot! */
                {
                  /* i is in the way to find j's left partner. 
                   * Move i to stack auxpk; resolve pseudoknot(s) after we've found partner for j.
                   */ 
                  if (esl_stack_IPush(auxpk, i) != eslOK) goto FINISH;
                }
            } 
          
          if (!found_partner) {
            /* ESL_EXCEPTION returns immediately, so release everything first */
            esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); free(cct); 
            ESL_EXCEPTION(eslEINVAL, "Cannot find left partner (%d) of base %d. Likely a triplet", ct[j], j);
          }
        } /* finished finding the left partner of j */
      
      /* After we've found the left partner of j, resolve pks found along the way.
       * Then, remove the pseudoknotted bases from cct so we can find the rest of the structure.
       */
      if (esl_stack_ObjectCount(auxpk)) {

        /* init for first pseudoknot */
        leftbound  = cct[j];
        rightbound = leftbound + 1;
        xpk        = -1;            /* start with 'A' if possible again */

        while (esl_stack_IPop(auxpk, &i) == eslOK) {

          for (k = rightbound-1; k > leftbound; k --) 
            {
              if      (cct[k] == 0)          { continue; } 
              else if (cct[k] >  rightbound) { continue; } 
              else if (cct[k] == i)          { break; }                  /* i continues the given pseudoknot */
              else                           { k = leftbound; break; }   /* a new pseudoknot */                         
            }
          
          if (k == leftbound) /* a new pseudoknot */
            {
              npk ++;
              xpk ++;
              /* figure out if we can use this alphabet index, or bump it up if necessary;
               * stop at 26 so rb[] is never indexed out of range */
              while (xpk < 26 && i < rb[xpk]) { xpk ++; }
              
              leftbound  = (rightbound < cct[i])? rightbound : cct[j];
              rightbound = cct[i];
            }

          /* ran past 'Z': this structure cannot be annotated */
          if (xpk+(int)('a') > (int)('z')) {
            esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); free(cct); 
            ESL_EXCEPTION(eslEINVAL, "Don't have enough letters to describe all different pseudoknots.");
          }

          npairs_reached ++;

          /* update the rightbound of this pk index if necessary */
          if (cct[i] > rb[xpk]) rb[xpk] = cct[i];
          
          /* Add pk indices for this basepair */
          ss[i-1]      = (char)(xpk+(int)('A'));
          ss[cct[i]-1] = (char)(xpk+(int)('a'));
          
          /* remove pseudoknotted pair from cct */
          cct[i]     = 0;
          cct[ct[i]] = 0;

        } /* while there is something in auxpk stack */
      } 

    } /* finished loop over j: end position on seq, 1..n*/ 

  /* Consistency check, done only after a complete pass over the
   * sequence so that an allocation failure above is still reported
   * as eslEMEM.
   */
  if (npairs != npairs_reached) {
    esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); free(cct); 
    ESL_EXCEPTION(eslFAIL, "found %d out of %d pairs.", npairs_reached, npairs);
  }
  
  status = eslOK;

 ERROR:
 FINISH:
  if (pda   != NULL) esl_stack_Destroy(pda);
  if (auxpk != NULL) esl_stack_Destroy(auxpk);
  if (cct   != NULL) free(cct);
  return status;
}
Esempio n. 3
0
/* Function:  esl_ct2wuss()
 * Incept:    SRE, Wed Feb 16 11:22:53 2005 [St. Louis]
 *
 * Purpose:   Convert a CT array <ct> for <n> residues (1..n) to a WUSS
 *            format string <ss>. <ss> must be allocated for at least
 *            n+1 chars (+1 for the terminal NUL). 
 *
 *            ER, Sat Aug 18 13:22:03 EDT 2012 
 *            esl\_ct2wuss() extended to deal with pseudoknots structures.
 *            Pseudoknots are annotated as AA...aa, BB...bb,..., ZZ..zz.
 *            Attempting to convert a <ct> that requires more letters
 *            than [A-Z] will return an <eslEINVAL> error.
 *
 *            Attempting to convert a <ct> that involves triplet interactions
 *            will return an <eslEINVAL> error.
 *
 * Returns:   <eslOK> on success.
 * 
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if a left partner cannot be found, or if more
 *            than 26 pseudoknot letters would be needed.
 *            <eslEINCONCEIVABLE> on internal failure (unknown face code).
 *            <eslFAIL> if the number of pairs annotated does not match
 *            the number of pairs in <ct>.
 *            All working memory is released on every return path.
 */
int
esl_ct2wuss(int *ct, int n, char *ss)
{
  int        rb[26];                /* array that delimits the right bound of a pseudoknot character */
  ESL_STACK *pda    = NULL;         /* stack for "main" secondary structure */
  ESL_STACK *auxpk  = NULL;         /* aux stack for pseudoknot */
  ESL_STACK *auxss  = NULL;         /* aux stack for single stranded */
  int       *cct    = NULL;         /* copy of ct vector */
  int        nfaces;                /* number of faces in a cWW structure */
  int        minface;               /* max depth of faces in a cWW structure */
  int        leftbound, rightbound; /* left and right bound to find basepairs belonging to a given pseudoknot */
  int        xpk = 0;               /* index of the pseudoknot character in use (0 = 'A') */
  int        npk = 0;               /* number of pseudoknots */
  int        npairs = 0;            /* total number of basepairs */
  int        npairs_reached = 0;    /* number of basepairs found so far */
  int        found_partner;         /* true if we've found left partner of a given base in stack pda */
  int        i,j,k;                 /* sequence indices */
  int        x;                     /* index for pseudoknot characters */
  int        status = eslEMEM;      /* exit status 'til proven otherwise */

  /* total number of basepairs */
  for (j = 1; j <= n; j ++) { if (ct[j] > 0 && j < ct[j]) npairs ++; }
  
  /* Copy of ct; if a pseudoknotted structure, cct will be modified later.
   */
  ESL_ALLOC(cct, sizeof(int)*(n+1));
  esl_vec_ICopy(ct, (n+1), cct);
  
  /* Initialize rightbounds for all 26 pseudoknot indices */
  for (x = 0; x < 26; x ++) rb[x] = -1;

  /* init ss[] to single stranded */
  for (j = 0; j < n; j ++) { ss[j] = ':'; }  
  ss[n] = '\0'; 
 
  /* Initialization*/
  if ((pda   = esl_stack_ICreate()) == NULL) goto FINISH;
  if ((auxpk = esl_stack_ICreate()) == NULL) goto FINISH;
  if ((auxss = esl_stack_ICreate()) == NULL) goto FINISH;
  
  for (j = 1; j <= n; j++)
    {
      if (cct[j] == 0)  /* unpaired: push j. */
        {
          if (esl_stack_IPush(pda, j) != eslOK) goto FINISH;
        }
      else if (cct[j] > j) /* left side of a bp: push j. */
        {
          if (esl_stack_IPush(pda, j) != eslOK) goto FINISH;
        }
      else   /* right side of a bp; main routine: find the left partner */
        {
          found_partner = FALSE;
          /* Pop back until we find the left partner of j;
           * In case this is not a nested structure, finding
           * the left partner of j will require to put bases 
           * aside into stack auxpk.
           *
           * After we find the left partner of j,
           * store single stranded residues in auxss;
           * keep track of #faces and the maximum face depth.
           */
          nfaces  = 0;
          minface = -1;
         
          while (esl_stack_ObjectCount(pda)) 
            {
              if (esl_stack_IPop(pda, &i) != eslOK) goto FINISH;
              
              if (i < 0)                /* a face counter */
                {
                  nfaces++;
                  if (i < minface) minface = i;
                }

              else if (cct[i] == j)  /* we found the i,j pair. */
                {
                  found_partner = TRUE;
                  npairs_reached ++;    
                  /* Now we know i,j pair; and we know how many faces are
                   * above them; and we know the max depth of those faces.
                   * That's enough to label the pair in WUSS notation.
                   * if nfaces == 0, minface is -1; <> a closing bp of a hairpin.
                   * if nfaces == 1, inherit minface, we're continuing a stem.
                   * if nfaces > 1, bump minface in depth; we're closing a bifurc.
                   */
                  if (nfaces > 1 && minface > -4) minface--;
                  switch (minface) {
                  case -1: ss[i-1] = '<'; ss[j-1] = '>'; break;
                  case -2: ss[i-1] = '('; ss[j-1] = ')'; break;
                  case -3: ss[i-1] = '['; ss[j-1] = ']'; break;
                  case -4: ss[i-1] = '{'; ss[j-1] = '}'; break;
                  default:
                    /* ESL_EXCEPTION returns immediately, so release everything first */
                    esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); esl_stack_Destroy(auxss); free(cct); 
                    ESL_EXCEPTION(eslEINCONCEIVABLE, "no such face code");
                  }
                  /* leave a (negative) face marker on pda in place of the resolved pair */
                  if (esl_stack_IPush(pda, minface) != eslOK) goto FINISH;
                  
                  /* Now, aux contains all the unpaired residues we need to label,
                   * according to the # of faces "above" them:
                   *  nfaces = 0: hairpin loop
                   *  nfaces = 1: bulge or interior loop
                   *  nfaces > 1: multifurc
                   */
                  while (esl_stack_IPop(auxss, &i) == eslOK)
                    {
                      switch (nfaces) {
                        
                      case 0:  ss[i-1] = '_'; break;
                      case 1:  ss[i-1] = '-'; break;
                      default: ss[i-1] = ','; break; /* nfaces > 1 */
                      }
                    }
                  break;
                }
              
              else if (cct[i] == 0) 
                {
                  /* add to auxss only if originally single stranded */
                  if (ct[i] == 0) { if (esl_stack_IPush(auxss, i) != eslOK) goto FINISH; }
                }

              else /* cct[i]>0, != j: i is paired, but not to j: pseudoknot! */
                {
                  /* i is in the way to find j's left partner. 
                   * Move i to stack auxpk; resolve pseudoknot(s) after we've found partner for j.
                   */ 
                  if (esl_stack_IPush(auxpk, i) != eslOK) goto FINISH;
                }
            } 
          
          if (!found_partner) {
            esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); esl_stack_Destroy(auxss); free(cct); 
            ESL_EXCEPTION(eslEINVAL, "Cannot find left partner (%d) of base %d. Likely a triplet", ct[j], j);
          }
        } /* finished finding the left partner of j */
      
      /* After we've found the left partner of j, resolve pks found along the way.
       * Then, remove the pseudoknotted bases from cct so we can find the rest of the structure.
       */
      if (esl_stack_ObjectCount(auxpk)) {

        /* init for first pseudoknot */
        leftbound  = cct[j];
        rightbound = leftbound + 1;
        xpk        = -1;            /* start with 'A' if possible again */

        while (esl_stack_IPop(auxpk, &i) == eslOK) {

          for (k = rightbound-1; k > leftbound; k --) 
            {
              if      (cct[k] == 0)          { continue; } 
              else if (cct[k] >  rightbound) { continue; } 
              else if (cct[k] == i)          { break; }                  /* i continues the given pseudoknot */
              else                           { k = leftbound; break; }   /* a new pseudoknot */                         
            }
          
          if (k == leftbound) /* a new pseudoknot */
            {
              npk ++;
              xpk ++;
              /* figure out if we can use this alphabet index, or bump it up if necessary;
               * stop at 26 so rb[] is never indexed out of range */
              while (xpk < 26 && i < rb[xpk]) { xpk ++; }
              
              leftbound  = (rightbound < cct[i])? rightbound : cct[j];
              rightbound = cct[i];
            }

          /* ran past 'Z': this structure cannot be annotated */
          if (xpk+(int)('a') > (int)('z')) {
            esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); esl_stack_Destroy(auxss); free(cct); 
            ESL_EXCEPTION(eslEINVAL, "Don't have enough letters to describe all different pseudoknots.");
          }
              
          npairs_reached ++;

          /* update the rightbound of this pk index if necessary */
          if (cct[i] > rb[xpk]) rb[xpk] = cct[i];
          
          /* Add pk indices for this basepair */
          ss[i-1]      = (char)(xpk+(int)('A'));
          ss[cct[i]-1] = (char)(xpk+(int)('a'));
          
          /* remove pseudoknotted pair from cct */
          cct[i]     = 0;
          cct[ct[i]] = 0;

        } /* while there is something in auxpk stack */
      } 

    } /* finished loop over j: end position on seq, 1..n*/ 

  /* Consistency check, done only after a complete pass over the
   * sequence so that an allocation failure above is still reported
   * as eslEMEM.
   */
  if (npairs != npairs_reached) {
    esl_stack_Destroy(pda); esl_stack_Destroy(auxpk); esl_stack_Destroy(auxss); free(cct); 
    ESL_EXCEPTION(eslFAIL, "found %d out of %d pairs.", npairs_reached, npairs);
  }
  
  status = eslOK;

 ERROR:
 FINISH:
  if (pda   != NULL) esl_stack_Destroy(pda);
  if (auxpk != NULL) esl_stack_Destroy(auxpk);
  if (auxss != NULL) esl_stack_Destroy(auxss);
  if (cct   != NULL) free(cct);
  return status;
}
Esempio n. 4
0
/* synthesize_positives()
 *
 * Build positive test sequences from the domains held on <teststack>.
 * Each test sequence will contain one or two domains, depending on
 * whether --single is set.
 *
 * For as long as <teststack> holds enough domains for one more test
 * sequence, this:
 *   - grows <cfg->test_lens> by one record;
 *   - pops the domain(s) off the stack;
 *   - picks a total length <L> by looking up the length of a randomly
 *     chosen database sequence, retrying until it can hold the domains;
 *   - picks random insertion point(s) and derives the lengths of the
 *     flanking regions;
 *   - fills the flanks with set_random_segment() and copies the domains
 *     in between;
 *   - writes one summary line to <cfg->possummfp>, the sequence itself
 *     (FASTA) to <cfg->out_seqfp>, and records the segment lengths in
 *     <cfg->test_lens>.
 * Popped domains and the synthesized sequence are destroyed before the
 * next iteration.
 *
 * On return <*ret_ntest> is the number of test sequences made by this
 * call and the function returns <eslOK>. Failure to grow
 * <cfg->test_lens> ends in esl_fatal().
 */
static int
synthesize_positives(ESL_GETOPTS *go, struct cfg_s *cfg, char *testname, ESL_STACK *teststack, int *ret_ntest)
{
  int     ndomains = ( (esl_opt_GetBoolean(go, "--single") == TRUE) ? 1 : 2);
  int     paired   = (ndomains == 2);   /* TRUE when two domains go into each test seq */
  ESL_SQ *dom1, *dom2;                  /* the domain(s) being embedded                */
  ESL_SQ *sq;                           /* the synthetic test sequence                 */
  void   *vp;                           /* scratch pointer for realloc and stack pops  */
  struct testseq_s *rec;                /* length record for the current test sequence */
  int64_t L;                            /* total length of synthetic test seq          */
  int64_t nonhom;                       /* L minus domain lengths: nonhomologous part  */
  int     len1, len2;                   /* lengths of the two domains                  */
  int     domlen;                       /* len1 + len2                                 */
  int     flank1, flank2, flank3;       /* lengths of the three random regions         */
  int     ins1, ins2;                   /* "insert after" points for domains 1 and 2   */
  int     nmade = 0;                    /* test sequences made by this call            */
  int     status;

  while (esl_stack_ObjectCount(teststack) >= ndomains)
    {
      ESL_RALLOC(cfg->test_lens, vp, (cfg->ntest+1) * sizeof(struct testseq_s));

      /* Take our one or two test domains off the stack */
      esl_stack_PPop(teststack, &vp);
      dom1 = vp;
      len1 = dom1->n;

      if (paired)
        {
          esl_stack_PPop(teststack, &vp);
          dom2 = vp;
          len2 = dom2->n;
        }
      else
        {
          dom2 = NULL;
          len2 = 0;
        }
      domlen = len1 + len2;

      /* Draw a random total length from the database's length distribution,
       * rejecting sequences too short to hold the domain(s).
       */
      if (domlen > cfg->db_maxL) esl_fatal("can't construct test seq; no db seq >= %d residues\n", domlen);
      do {
        if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &L, NULL) != eslOK)
          esl_fatal("failed to look up a random seq");
      } while (L < domlen);

      /* Decide where the domain(s) go.
       * nonhom is L', the total length of nonhomologous test sequence.
       */
      nonhom = L - domlen;
      if (paired)
        {
          /* Two "insert after" points ins1 <= ins2 in 0..L' give the layout:
           *     1                 .. ins1              random region 1 (none if ins1 == 0)
           *     ins1+1            .. ins1+len1         domain 1
           *     ins1+len1+1       .. ins2+len1         random region 2 (none if ins1 == ins2)
           *     ins2+len1+1       .. ins2+len1+len2    domain 2
           *     ins2+len1+len2+1  .. L                 random region 3 (none if ins2 == L')
           */
          do {
            ins1 = esl_rnd_Roll(cfg->r, nonhom + 1); /* 0..L' */
            ins2 = esl_rnd_Roll(cfg->r, nonhom + 1); /* 0..L' */
          } while (ins1 > ins2);

          flank1 = ins1;
          flank2 = ins2 - ins1;
          flank3 = nonhom - ins2;
        }
      else
        {
          /* One "insert after" point ins1 in 0..L' gives the layout:
           *     1            .. ins1         random region 1 (none if ins1 == 0)
           *     ins1+1       .. ins1+len1    domain 1
           *     ins1+len1+1  .. L            random region 2
           */
          ins1   = esl_rnd_Roll(cfg->r, nonhom + 1); /* 0..L' */
          flank1 = ins1;
          flank2 = nonhom - flank1;
          flank3 = 0;
        }

      /* Allocate the test sequence and label it with the domain coordinates */
      sq = esl_sq_CreateDigital(cfg->abc);
      esl_sq_GrowTo(sq, L);
      sq->n = L;
      if (paired)
        {
          esl_sq_FormatName(sq, "%s/%d/%d-%d/%d-%d", testname, cfg->ntest, ins1+1, ins1+len1, ins2+len1+1, ins2+len1+len2);
          esl_sq_FormatDesc(sq, "domains: %s %s", dom1->name, dom2->name);
        }
      else
        {
          esl_sq_FormatName(sq, "%s/%d/%d-%d",   testname, cfg->ntest, ins1+1, ins1+len1);
          esl_sq_FormatDesc(sq, "domain: %s", dom1->name);
        }

      /* Summary line: starts with the overall layout; the segment writers
       * below receive the same stream, so statement order matters here.
       */
      fprintf(cfg->possummfp, "%-35s %5d %5d %5d %5d %5d %5d", sq->name, (int) sq->n, flank1, len1, flank2, len2, flank3);

      sq->dsq[L+1] = eslDSQ_SENTINEL;
      sq->dsq[0]   = sq->dsq[L+1];

      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+1, flank1);

      memcpy(sq->dsq+ins1+1, dom1->dsq+1, sizeof(ESL_DSQ) * len1);
      fprintf(cfg->possummfp, " %-24s %5d %5d", dom1->name, 1, len1);

      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+ins1+len1+1, flank2);

      if (paired)
        {
          memcpy(sq->dsq+ins2+len1+1, dom2->dsq+1, sizeof(ESL_DSQ) * len2);
          fprintf(cfg->possummfp, " %-24s %5d %5d", dom2->name, 1, len2);

          set_random_segment(go, cfg, cfg->possummfp, sq->dsq+ins2+len1+len2+1, flank3);
        }
      fprintf(cfg->possummfp, "\n");

      /* Record the segment lengths for this test sequence */
      rec      = &cfg->test_lens[cfg->ntest];
      rec->L   = L;
      rec->L1  = flank1;
      rec->d1n = len1;
      rec->L2  = flank2;
      rec->d2n = len2;
      rec->L3  = flank3;
      cfg->ntest++;
      nmade++;

      esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE);

      esl_sq_Destroy(dom1);
      if (paired) esl_sq_Destroy(dom2);
      esl_sq_Destroy(sq);
    }

  *ret_ntest = nmade;
  return eslOK;

 ERROR:
  esl_fatal("Failure in synthesize_positives");
  return status;
}
Esempio n. 5
0
/* main()
 *
 * Driver for the benchmark-construction program. Takes three command
 * line arguments: an output basename, an alignment file (read as
 * Stockholm), and a sequence database file (handled as FASTA).
 *
 * Steps, in order:
 *   1. parse and validate the command line;
 *   2. fill in the shared configuration <cfg> (RNG, thresholds);
 *   3. open the five output files <basename>.msa/.fa/.pos/.neg/.tbl;
 *   4. open the alignment file and settle on an alphabet, either from
 *      --amino/--dna/--rna or by guessing from the file;
 *   5. set the background frequencies <cfg.fq> and process the database;
 *   6. for each alignment: remove fragments, split into training and
 *      test sets, and if at least two test domains result, synthesize
 *      positive test sequences, write the training alignment, and add
 *      a row to the benchmark table;
 *   7. synthesize negatives, close the files, release resources.
 *
 * Every failure is reported through esl_fatal() or cmdline_failure().
 * Returns 0 when the run completes.
 */
int
main(int argc, char **argv)
{
  enum { OUTNAME_MAX = 256 };           /* capacity of the output file name buffer */

  ESL_GETOPTS  *go        = NULL;       /* command line configuration      */
  struct cfg_s  cfg;                    /* application configuration       */
  char         *outbase   = NULL;       /* base of the output file names   */
  char         *alifile   = NULL;       /* alignment file name             */
  char         *dbfile    = NULL;       /* name of seq db file             */
  char          outname[OUTNAME_MAX];   /* name of an output file          */
  int           alifmt;                 /* format code for alifile         */
  int           dbfmt;                  /* format code for dbfile          */
  int           abctype;                /* alphabet type code              */
  ESL_MSAFILE  *afp       = NULL;       /* open alignment file             */
  ESL_MSA      *origmsa   = NULL;       /* one multiple sequence alignment */
  ESL_MSA      *msa       = NULL;       /* MSA after frags are removed     */
  ESL_MSA      *trainmsa  = NULL;       /* training set, aligned           */
  ESL_STACK    *teststack = NULL;       /* test set: stack of ESL_SQ ptrs  */
  int           status;                 /* easel return code               */
  int           nfrags;                 /* # of fragments removed          */
  int           ntestdom;               /* # of test domains               */
  int           ntest;                  /* # of test sequences created     */
  int           nali      = 0;          /* number of alignments used       */
  double        avgid;                  /* average identity of training set */

  /* ---- Command line ---- */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK)
    cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go) != eslOK)
    cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))
    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 3)
    cmdline_failure(argv[0], "Incorrect number of command line arguments\n");

  outbase = esl_opt_GetArg(go, 1);
  alifile = esl_opt_GetArg(go, 2);
  dbfile  = esl_opt_GetArg(go, 3);
  alifmt  = eslMSAFILE_STOCKHOLM;
  dbfmt   = eslSQFILE_FASTA;

  /* ---- Configuration shared amongst the functions of this program ---- */
  if (esl_opt_IsDefault(go, "--seed"))
    {
      cfg.r = esl_randomness_CreateTimeseeded();
    }
  else
    {
      cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
    }
  cfg.abc       = NULL;         /* not known until the MSA file is open, below */
  cfg.fragfrac  = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest     = 0;

  /* ---- Output files: <outbase>.{msa,fa,pos,neg,tbl} ---- */
  if (snprintf(outname, OUTNAME_MAX, "%s.msa", outbase) >= OUTNAME_MAX)
    esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outname, "w")) == NULL)
    esl_fatal("Failed to open MSA output file %s\n", outname);

  if (snprintf(outname, OUTNAME_MAX, "%s.fa", outbase) >= OUTNAME_MAX)
    esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outname, "w")) == NULL)
    esl_fatal("Failed to open FASTA output file %s\n", outname);

  if (snprintf(outname, OUTNAME_MAX, "%s.pos", outbase) >= OUTNAME_MAX)
    esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outname, "w")) == NULL)
    esl_fatal("Failed to open pos test set summary file %s\n", outname);

  if (snprintf(outname, OUTNAME_MAX, "%s.neg", outbase) >= OUTNAME_MAX)
    esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outname, "w")) == NULL)
    esl_fatal("Failed to open neg test set summary file %s\n", outname);

  if (snprintf(outname, OUTNAME_MAX, "%s.tbl", outbase) >= OUTNAME_MAX)
    esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp = fopen(outname, "w")) == NULL)
    esl_fatal("Failed to open benchmark table file %s\n", outname);

  /* ---- Alignment file and alphabet ---- */
  status = esl_msafile_Open(alifile, alifmt, NULL, &afp);
  switch (status) {
  case eslOK:
    break;
  case eslENOTFOUND:
    esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
    break;
  case eslEFORMAT:
    esl_fatal("Couldn't determine format of alignment %s\n", alifile);
    break;
  default:
    esl_fatal("Alignment file open failed with error %d\n", status);
    break;
  }

  /* An explicit option wins; otherwise guess from the alignment file */
  if      (esl_opt_GetBoolean(go, "--amino")) abctype = eslAMINO;
  else if (esl_opt_GetBoolean(go, "--dna"))   abctype = eslDNA;
  else if (esl_opt_GetBoolean(go, "--rna"))   abctype = eslRNA;
  else
    {
      status = esl_msafile_GuessAlphabet(afp, &abctype);
      switch (status) {
      case eslOK:
        break;
      case eslEAMBIGUOUS:
        esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile);
        break;
      case eslEFORMAT:
        esl_fatal("Alignment file parse failed: %s\n", afp->errbuf);
        break;
      case eslENODATA:
        esl_fatal("Alignment file %s is empty\n", alifile);
        break;
      default:
        esl_fatal("Failed to read alignment file %s\n", alifile);
        break;
      }
    }
  cfg.abc = esl_alphabet_Create(abctype);
  esl_msafile_SetDigital(afp, cfg.abc);

  /* Background composition: SW34 for amino acids, uniform otherwise */
  if (cfg.abc->type == eslAMINO)
    {
      esl_composition_SW34(cfg.fq);
    }
  else
    {
      esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);
    }

  /* ---- Sequence database (must be in the same alphabet) ---- */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* ---- One alignment at a time ---- */
  for (;;)
    {
      status = esl_msa_Read(afp, &origmsa);
      if (status != eslOK) break;

      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);
      ntestdom = esl_stack_ObjectCount(teststack);

      /* alignments yielding fewer than two test domains are not used */
      if (ntestdom >= 2)
        {
          esl_stack_Shuffle(cfg.r, teststack);
          synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

          esl_msa_MinimGaps(trainmsa, NULL, NULL);
          esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

          /* 10000 is max_comparisons, before sampling kicks in */
          esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid);
          fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
          nali++;
        }

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }

  /* The read loop should have ended at end-of-file, with at least one alignment used */
  switch (status) {
  case eslEFORMAT:
    esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", 
              afp->linenumber, afp->fname, afp->errbuf, afp->buf);
    break;
  case eslEOF:
    if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);
    break;
  default:
    esl_fatal("Alignment file read failed with error code %d\n", status);
    break;
  }

  if (nali > 0)
    synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  /* ---- Cleanup ---- */
  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
Esempio n. 6
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration      */
  struct cfg_s  cfg;     	/* application configuration       */
  char         *basename= NULL;	/* base of the output file names   */
  char         *alifile = NULL;	/* alignment file name             */
  char         *dbfile  = NULL;	/* name of seq db file             */
  char          outfile[256];	/* name of an output file          */
  int           alifmt;		/* format code for alifile         */
  int           dbfmt;		/* format code for dbfile          */
  ESLX_MSAFILE  *afp    = NULL;	/* open alignment file             */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed     */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned           */
  ESL_STACK    *teststack=NULL; /* test set: stack of ESL_SQ ptrs  */
  int           status;		/* easel return code               */
  int           nfrags;		/* # of fragments removed          */
  int           ntestdom;       /* # of test domains               */
  int           ntest;		/* # of test sequences created     */
  int           nali;		/* number of alignments read       */
  double        avgid;
  
  
  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 3)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1); 
  alifile  = esl_opt_GetArg(go, 2);
  dbfile   = esl_opt_GetArg(go, 3);
  alifmt   = eslMSAFILE_STOCKHOLM;
  dbfmt    = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed"))   cfg.r = esl_randomness_CreateTimeseeded();
  else                                   cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc        = NULL;		          /* until we open the MSA file, below */
  cfg.fragfrac   = esl_opt_GetReal(go, "-F");
  cfg.idthresh1  = esl_opt_GetReal(go, "-1");
  cfg.idthresh2  = esl_opt_GetReal(go, "-2");
  cfg.test_lens  = NULL;
  cfg.ntest      = 0;
  cfg.max_ntest  = (esl_opt_IsOn(go, "--maxtest")  ? esl_opt_GetInteger(go, "--maxtest")  : 0); 
  cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); 

  /* Open the output files */ 
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256)  esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa",  basename) >= 256)  esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256)  esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256)  esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256)  esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp     = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);
  if (esl_opt_GetBoolean(go, "--pid")) {
    if (snprintf(outfile, 256, "%s.pid", basename) >= 256)  esl_fatal("Failed to construct %%id table file name");
    if ((cfg.pidfp   = fopen(outfile, "w"))        == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile);
  } else cfg.pidfp   = NULL;

  /* Open the MSA file, digital mode; determine alphabet */
  if      (esl_opt_GetBoolean(go, "--amino"))   cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg.abc = esl_alphabet_Create(eslRNA);

  status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time  */
  nali = 0;
  while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      esl_msa_ConvertDegen2X(origmsa); 
      esl_msa_Hash(origmsa);

      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);

      if ( esl_stack_ObjectCount(teststack) >= 2) 
	{
	  /* randomize test domain order, and apply size limit if any */
	  esl_stack_Shuffle(cfg.r, teststack);
	  if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest);
	  ntestdom =  esl_stack_ObjectCount(teststack);

	  /* randomize training set alignment order, and apply size limit if any */
	  esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa);
	  if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain);
	  esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE);
	  
	  if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack);

	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if  (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);
  
  synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  if (cfg.pidfp) fclose(cfg.pidfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  eslx_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}