Example #1
0
/* Function:  esl_wuss_full()
* Incept:    SRE, Mon Feb 28 09:44:40 2005 [St. Louis]
*
* Purpose:   Given a simple ("input") WUSS format annotation string <oldss>,
*            convert it to full ("output") WUSS format in <newss>.
*            <newss> must be allocated by the caller to be at least as 
*            long as <oldss>. <oldss> and <newss> can be the same,
*            to convert a secondary structure string in place.
*            
*            Pseudoknot annotation is preserved, if <oldss> had it.
*
* Returns:   <eslSYNTAX> if <oldss> isn't in valid WUSS format.
*
* Throws:    <eslEMEM> on allocation failure.
*            <eslEINCONCEIVABLE> on internal error that can't happen.
*/
int
esl_wuss_full(char *oldss, char *newss)
{
    char *tmp = NULL;
    int  *ct  = NULL;
    int   n;
    int   i;
    int   status;

    /* We can use the ct2wuss algorithm to generate a full WUSS string -
    * convert to ct, then back to WUSS.  ct2wuss doesn't deal with pk's
    * though, and we want to propagate pk annotation if it's there.  So
    * we need two workspaces: ct array, and a temporary ss string that
    * we use to hold non-pk annotation.  As a final step, we overlay
    * the pk annotation from the original oldss annotation.
    */
    n = strlen(oldss);
    ESL_ALLOC_WITH_TYPE(ct, int*,  sizeof(int)  * (n+1));
    ESL_ALLOC_WITH_TYPE(tmp, char*, sizeof(char) * (n+1));

    esl_wuss_nopseudo(oldss, tmp);/* tmp = nonpseudoknotted oldss */

    status = esl_wuss2ct(tmp, n, ct);   /* ct  = oldss in ct format, no pks */
    if (status != eslOK) goto ERROR;

    status = esl_ct2wuss(ct, n, tmp);   /* now tmp is a full WUSS string */
    if (status == eslEINVAL) { status = eslEINCONCEIVABLE; goto ERROR; }/* we're sure, no pk's */
    else if (status != eslOK) goto ERROR; /* EMEM, EINCONCEIVABLE  */

    for (i = 0; i < n; i++)
        if (isalpha(oldss[i])) newss[i] = oldss[i];	/* transfer pk annotation */
        else newss[i] = tmp[i];                     /* transfer new WUSS      */

        free(ct);
        free(tmp);
        return eslOK;

ERROR:
        free(ct);
        free(tmp);
        return status;
}
/* count_msa_free_workspace()
 *
 * Release every array count_msa() may have allocated so far. Any of the
 * pointers may be NULL. Rows of the 2D/3D arrays that were never
 * allocated must be NULL (count_msa() pre-clears them); this relies on
 * esl_Free2D()/esl_Free3D() skipping NULL rows, which the NULL rows
 * count_msa() deliberately stores in <bp_ct> already depend on.
 */
static void count_msa_free_workspace(ESL_MSA *msa, double **abc_ct, double ***bp_ct, double **pp_ct, int *ct, char *ss_nopseudo)
{
  if(abc_ct != NULL)  esl_Free2D((void **) abc_ct, msa->alen);
  if(bp_ct != NULL)   esl_Free3D((void ***) bp_ct, msa->alen, msa->abc->Kp);
  if(pp_ct != NULL)   esl_Free2D((void **) pp_ct, msa->alen);
  free(ss_nopseudo);
  free(ct);
}

/* count_msa()
 *
 * Given a digitized msa, count residues, and optionally consensus base
 * pairs and posterior probability codes, per alignment column. Results
 * are returned in <ret_abc_ct>, <ret_bp_ct> and <ret_pp_ct>.
 *
 * <ret_abc_ct> [0..apos..alen-1][0..abc->K]:
 * - per position count of each symbol in alphabet over all seqs.
 *   Required; it is always written on success.
 *
 * <ret_bp_ct>  [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1]
 * - optional (pass NULL to skip). Per (non-pknotted) consensus basepair
 *   count of each possible basepair over all seqs. Basepairs are indexed
 *   by 'i', the minimum of 'i:j' for a pair between i and j, where i < j;
 *   every other row is NULL. Note that non-canonicals and gaps and the
 *   like are all stored independently. Requires msa->ss_cons.
 *
 * <ret_pp_ct> [0..apos..alen-1][0..11]
 * - optional (pass NULL to skip). Per position count of each posterior
 *   probability code over all seqs. Requires msa->pp; individual
 *   sequences whose msa->pp[i] is NULL are skipped.
 *
 * A 'gap' has a looser definition than in esl_abc here, esl_abc's gap,
 * missing residues and nonresidues are all considered 'gaps' here.
 *
 * Other arguments:
 *   no_ambig    - if TRUE, degenerate residues are skipped when counting
 *                 residues and PP codes (basepair counts are not filtered).
 *   use_weights - if TRUE, each sequence contributes msa->wgt[i] instead
 *                 of 1.0 to every count.
 *   nali        - not used by this function.
 *
 * If we encounter an error, we return non-eslOK status and fill
 * errbuf with error message. Nothing is returned through the <ret_*>
 * pointers in that case and everything allocated here is freed.
 *
 * Returns eslOK upon success; the caller then owns the returned arrays.
 */
static int count_msa(ESL_MSA *msa, char *errbuf, int nali, int no_ambig, int use_weights, double ***ret_abc_ct, double ****ret_bp_ct, double ***ret_pp_ct)
{
  int status;
  double  **abc_ct = NULL;
  double ***bp_ct = NULL;
  int       apos, rpos, i, x;
  int       nppvals = 12;         /* '0'-'9' = 0-9, '*' = 10, gap = '11' */
  double  **pp_ct = NULL;         /* [0..alen-1][0..nppvals-1] per position count of each possible PP char over all seqs */
  int       ppidx;
  /* variables related to getting bp counts */
  int      *ct = NULL;            /* 1..alen base pair partners array for the consensus structure */
  char     *ss_nopseudo = NULL;   /* no-pseudoknot version of structure */
  double    seqwt;                /* weight of current sequence, always 1.0 if !use_weights */

  /* Contract checks. Nothing is allocated yet, so returning directly is safe. */
  if(! (msa->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "count_msa() contract violation, MSA is not digitized");
  if(use_weights && msa->wgt == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "count_msa(): use_weights==TRUE but msa->wgt == NULL");
  if(ret_pp_ct != NULL && msa->pp == NULL) ESL_FAIL(eslEINVAL, errbuf, "count_msa() ret_pp_ct != NULL, but msa->pp is NULL");
  if(ret_bp_ct != NULL && msa->ss_cons == NULL) ESL_FAIL(eslEINVAL, errbuf, "count_msa() ret_bp_ct != NULL, but msa->ss_cons is NULL");

  /* allocate pp_ct array, if nec */
  if(ret_pp_ct != NULL) {
    ESL_ALLOC(pp_ct, sizeof(double *) * msa->alen);
    /* clear the rows first so a later allocation failure frees only what exists */
    for(apos = 0; apos < msa->alen; apos++) pp_ct[apos] = NULL;
    for(apos = 0; apos < msa->alen; apos++) {
      ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals);
      esl_vec_DSet(pp_ct[apos], nppvals, 0.);
    }
  }

  /* allocate and initialize bp_ct, if nec */
  if(ret_bp_ct != NULL) {
    ESL_ALLOC(bp_ct,  sizeof(double **) * msa->alen);
    for(apos = 0; apos < msa->alen; apos++) bp_ct[apos] = NULL;

    /* get ct array which defines the consensus base pairs */
    ESL_ALLOC(ct,  sizeof(int)  * (msa->alen+1));
    ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1));
    esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo);
    if ((status = esl_wuss2ct(ss_nopseudo, msa->alen, ct)) != eslOK) {
      count_msa_free_workspace(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
      ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent.");
    }

    for(apos = 0; apos < msa->alen; apos++) {
      /* careful ct is indexed 1..alen, not 0..alen-1 */
      if(ct[(apos+1)] > (apos+1)) { /* apos+1 is an 'i' in an i:j pair, where i < j */
        ESL_ALLOC(bp_ct[apos], sizeof(double *) * (msa->abc->Kp));
        for(x = 0; x < msa->abc->Kp; x++) bp_ct[apos][x] = NULL;
        for(x = 0; x < msa->abc->Kp; x++) {
          ESL_ALLOC(bp_ct[apos][x], sizeof(double) * (msa->abc->Kp));
          esl_vec_DSet(bp_ct[apos][x], msa->abc->Kp, 0.);
        }
      }
      /* else: apos+1 is not an 'i' in an i:j pair, where i < j; row stays NULL */
    }
  }

  ESL_ALLOC(abc_ct, sizeof(double *) * msa->alen);
  for(apos = 0; apos < msa->alen; apos++) abc_ct[apos] = NULL;
  for(apos = 0; apos < msa->alen; apos++) {
    ESL_ALLOC(abc_ct[apos], sizeof(double) * (msa->abc->K+1));
    esl_vec_DSet(abc_ct[apos], (msa->abc->K+1), 0.);
  }

  for(i = 0; i < msa->nseq; i++) {
    seqwt = use_weights ? msa->wgt[i] : 1.0;

    for(apos = 0; apos < msa->alen; apos++) { /* update appropriate abc count, careful, ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */
      if((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1]))) { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
        if((status = esl_abc_DCount(msa->abc, abc_ct[apos], msa->ax[i][apos+1], seqwt)) != eslOK) {
          count_msa_free_workspace(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
          ESL_FAIL(status, errbuf, "problem counting residue %d of seq %d", apos, i);
        }
      }
    }

    /* get bp counts, if nec */
    if(bp_ct != NULL) {
      for(apos = 0; apos < msa->alen; apos++) { /* update appropriate bp count, careful, ax ranges from 1..msa->alen (but bp_ct is 0..msa->alen-1) */
        if(bp_ct[apos] != NULL) { /* our flag for whether position (apos+1) is an 'i' in an i:j pair where i < j */
          rpos = ct[apos+1] - 1; /* ct is indexed 1..alen */
          bp_ct[apos][msa->ax[i][apos+1]][msa->ax[i][rpos+1]] += seqwt;
        }
      }
    }

    /* get PP counts, if nec  */
    if(pp_ct != NULL) {
      if(msa->pp[i] != NULL) {
        for(apos = 0; apos < msa->alen; apos++) {
          if((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1]))) { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
            if((ppidx = get_pp_idx(msa->abc, msa->pp[i][apos])) == -1) {
              /* msa->pp is not owned by the workspace, so it is still valid for the message below */
              count_msa_free_workspace(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
              ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]);
            }
            pp_ct[apos][ppidx] += seqwt;
          }
        }
      }
    }
  }

  *ret_abc_ct  = abc_ct;
  if(ret_bp_ct != NULL) *ret_bp_ct = bp_ct; /* we only allocated bp_ct if ret_bp_ct != NULL */
  if(ret_pp_ct != NULL) *ret_pp_ct = pp_ct; /* we only allocated pp_ct if ret_pp_ct != NULL */

  /* ct and ss_nopseudo were only scratch space for building bp_ct */
  free(ss_nopseudo);
  free(ct);

  return eslOK;

 ERROR: /* reached from ESL_ALLOC failures */
  count_msa_free_workspace(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
  ESL_FAIL(status, errbuf, "Error, out of memory while counting important values in the msa.");
  return status; /* NEVERREACHED */
}
/* dump_basepair_counts
 *
 * Dump per-basepaired-column basepair counts from bp_ct[][][] to
 * an open output file. Only pairs involving canonical residues
 * are printed. (i.e. for RNA: AA,AC,AG,AU, CA,CC,CG,CU, GA,GC,GG,GU,
 * UA,UC,UG,UU).
 *
 * <bp_ct>  [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1]
 * - per (non-pknotted) consensus basepair count of each possible basepair
 *   over all seqs. Basepairs are indexed by 'i', the minimum of 'i:j' for
 *   a pair between i and j, where i < j; rows for all other columns are
 *   NULL and are skipped. Note that non-canonicals and gaps and the like
 *   are all stored independently.
 *
 * The right-hand partner of each printed column is recomputed here from
 * msa->ss_cons (pseudoknots removed), which therefore must be non-NULL.
 *
 * <use_weights>, <nali>, <nseq>, <msa_name> (may be NULL) and <alifile>
 * only affect the header lines that are printed.
 *
 * NOTE(review): counts are printed through an (int) cast, so when
 * <use_weights> is TRUE fractional weighted counts lose their fractional
 * part in the output - confirm this is intended.
 *
 * Returns eslOK on success. On failure returns a non-eslOK status with
 * an error message in <errbuf>; an inconsistent consensus structure is
 * detected before anything is written to <fp>.
 */
static int dump_basepair_counts(FILE *fp, ESL_MSA *msa, ESL_ALPHABET *abc, double ***bp_ct, int use_weights, int nali, int nseq, char *msa_name, char *alifile, char *errbuf)
{
  int status;
  int apos, rpos;
  int i, j;

  int      *ct = NULL;            /* 1..msa->alen base pair partners array for the consensus structure */
  char     *ss_nopseudo = NULL;   /* no-pseudoknot version of structure */

  /* get ct array which defines the consensus base pairs */
  ESL_ALLOC(ct,  sizeof(int) * (msa->alen+1));
  ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1));
  esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo);
  status = esl_wuss2ct(ss_nopseudo, msa->alen, ct);
  /* ss_nopseudo was only needed to build ct */
  free(ss_nopseudo);
  ss_nopseudo = NULL;
  if (status != eslOK) {
    free(ct);
    ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent.");
  }

  fprintf(fp, "# Per-column basepair counts:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  fprintf(fp, "# Only basepairs involving two canonical (non-degenerate) residues were counted.\n");
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  /* column headings: one column per ordered pair of canonical symbols */
  fprintf(fp, "# %7s  %7s", "lpos",    "rpos");
  for(i = 0; i < abc->K; i++) {
    for(j = 0; j < abc->K; j++) {
      fprintf(fp, "    %c%c  ", abc->sym[i], abc->sym[j]);
    }
  }
  fprintf(fp, "\n");

  fprintf(fp, "# %7s  %7s", "-------",    "-------");
  for(i = 0; i < abc->K; i++) {
    for(j = 0; j < abc->K; j++) {
      fprintf(fp, "  %6s", "------");
    }
  }
  fprintf(fp, "\n");

  /* one line per column that is the left half of a consensus basepair */
  for(apos = 0; apos < msa->alen; apos++) {
    if(bp_ct[apos] != NULL) {
      rpos = ct[(apos+1)]; /* ct is indexed 1..alen, so both printed positions are 1-based */
      fprintf(fp, "  %7d  %7d", apos+1, rpos);
      for(i = 0; i < abc->K; i++) {
        for(j = 0; j < abc->K; j++) {
          fprintf(fp, "  %6d", (int) bp_ct[apos][i][j]);
        }
      }
      fprintf(fp, "\n");
    }
  }
  fprintf(fp, "//\n");

  free(ct);
  return eslOK;

 ERROR: /* reached from ESL_ALLOC failures; either pointer may be NULL */
  free(ss_nopseudo);
  free(ct);
  ESL_FAIL(status, errbuf, "Error, out of memory while dumping basepair info");
}