/* Function: esl_wuss_full() * Incept: SRE, Mon Feb 28 09:44:40 2005 [St. Louis] * * Purpose: Given a simple ("input") WUSS format annotation string <oldss>, * convert it to full ("output") WUSS format in <newss>. * <newss> must be allocated by the caller to be at least as * long as <oldss>. <oldss> and <newss> can be the same, * to convert a secondary structure string in place. * * Pseudoknot annotation is preserved, if <oldss> had it. * * Returns: <eslSYNTAX> if <oldss> isn't in valid WUSS format. * * Throws: <eslEMEM> on allocation failure. * <eslEINCONCEIVABLE> on internal error that can't happen. */ int esl_wuss_full(char *oldss, char *newss) { char *tmp = NULL; int *ct = NULL; int n; int i; int status; /* We can use the ct2wuss algorithm to generate a full WUSS string - * convert to ct, then back to WUSS. ct2wuss doesn't deal with pk's * though, and we want to propagate pk annotation if it's there. So * we need two workspaces: ct array, and a temporary ss string that * we use to hold non-pk annotation. As a final step, we overlay * the pk annotation from the original oldss annotation. */ n = strlen(oldss); ESL_ALLOC_WITH_TYPE(ct, int*, sizeof(int) * (n+1)); ESL_ALLOC_WITH_TYPE(tmp, char*, sizeof(char) * (n+1)); esl_wuss_nopseudo(oldss, tmp);/* tmp = nonpseudoknotted oldss */ status = esl_wuss2ct(tmp, n, ct); /* ct = oldss in ct format, no pks */ if (status != eslOK) goto ERROR; status = esl_ct2wuss(ct, n, tmp); /* now tmp is a full WUSS string */ if (status == eslEINVAL) { status = eslEINCONCEIVABLE; goto ERROR; }/* we're sure, no pk's */ else if (status != eslOK) goto ERROR; /* EMEM, EINCONCEIVABLE */ for (i = 0; i < n; i++) if (isalpha(oldss[i])) newss[i] = oldss[i]; /* transfer pk annotation */ else newss[i] = tmp[i]; /* transfer new WUSS */ free(ct); free(tmp); return eslOK; ERROR: free(ct); free(tmp); return status; }
/* count_msa_release()
 *
 * Free every array that count_msa() may have allocated. Any of the
 * pointers may be NULL, and any row pointer inside a non-NULL table
 * may be NULL (count_msa() NULL-initializes its pointer tables before
 * filling them, so a partially built table is safe to pass here).
 */
static void
count_msa_release(ESL_MSA *msa, double **abc_ct, double ***bp_ct, double **pp_ct, int *ct, char *ss_nopseudo)
{
  int apos, x;

  if (abc_ct != NULL)
    {
      for (apos = 0; apos < msa->alen; apos++) free(abc_ct[apos]);
      free(abc_ct);
    }
  if (bp_ct != NULL)
    {
      for (apos = 0; apos < msa->alen; apos++)
        {
          if (bp_ct[apos] != NULL)
            {
              for (x = 0; x < msa->abc->Kp; x++) free(bp_ct[apos][x]);
              free(bp_ct[apos]);
            }
        }
      free(bp_ct);
    }
  if (pp_ct != NULL)
    {
      for (apos = 0; apos < msa->alen; apos++) free(pp_ct[apos]);
      free(pp_ct);
    }
  free(ct);
  free(ss_nopseudo);
}

/* count_msa()
 *
 * Given an msa, count residues, and optionally base pairs and
 * posterior probabilities per column and store them in <ret_abc_ct>,
 * <ret_bp_ct> and <ret_pp_ct>.
 *
 * <ret_abc_ct> [0..apos..alen-1][0..abc->K]:
 *      - per position count of each symbol in alphabet over all seqs.
 *        Always filled in.
 *
 * <ret_bp_ct>  [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1]
 *      - per (non-pknotted) consensus basepair count of each possible basepair
 *        over all seqs. Basepairs are indexed by 'i', the minimum of 'i:j' for a
 *        pair between i and j, where i < j; rows for all other columns are NULL.
 *        Note that non-canonicals and gaps and the like are all stored
 *        independently. Pass NULL to skip basepair counting.
 *
 * <ret_pp_ct>  [0..apos..alen-1][0..11]
 *      - per position count of each posterior probability code over all seqs.
 *        Pass NULL to skip PP counting.
 *
 * A 'gap' has a looser definition than in esl_abc here, esl_abc's gap,
 * missing residues and nonresidues are all considered 'gaps' here.
 *
 * If <no_ambig> is TRUE, degenerate residues are skipped in the residue
 * and PP counts. If <use_weights> is TRUE each sequence contributes
 * msa->wgt[i] instead of 1.0. <nali> is not used by this function.
 *
 * If we encounter an error, we return non-eslOK status and fill
 * errbuf with error message. On every failure path all arrays allocated
 * here are freed and the <ret_*> pointers are left untouched.
 *
 * Returns eslOK upon success; the returned arrays are not freed here.
 */
static int
count_msa(ESL_MSA *msa, char *errbuf, int nali, int no_ambig, int use_weights, double ***ret_abc_ct, double ****ret_bp_ct, double ***ret_pp_ct)
{
  int       status;
  double  **abc_ct = NULL;
  double ***bp_ct  = NULL;
  int       apos, rpos, i, x;
  int       nppvals = 12;        /* '0'-'9' = 0-9, '*' = 10, gap = '11' */
  double  **pp_ct  = NULL;       /* [0..alen-1][0..nppvals-1] per position count of each possible PP char over all seqs */
  int       ppidx;
  /* variables related to getting bp counts */
  int      *ct = NULL;           /* 1..alen base pair partners array for the consensus structure */
  char     *ss_nopseudo = NULL;  /* no-pseudoknot version of structure */
  double    seqwt;               /* weight of current sequence, always 1.0 if !use_weights */

  /* Contract checks. Nothing is allocated yet, so ESL_FAIL may return directly. */
  if (! (msa->flags & eslMSA_DIGITAL))    ESL_FAIL(eslEINVAL,    errbuf, "count_msa() contract violation, MSA is not digitized");
  if (use_weights && msa->wgt == NULL)    ESL_FAIL(eslEINCOMPAT, errbuf, "count_msa(): use_weights==TRUE but msa->wgt == NULL");
  if (ret_pp_ct != NULL && msa->pp == NULL)      ESL_FAIL(eslEINVAL, errbuf, "count_msa() ret_pp_ct != NULL, but msa->pp is NULL");
  if (ret_bp_ct != NULL && msa->ss_cons == NULL) ESL_FAIL(eslEINVAL, errbuf, "count_msa() ret_bp_ct != NULL, but msa->ss_cons is NULL");

  /* allocate pp_ct array, if nec */
  if (ret_pp_ct != NULL)
    {
      ESL_ALLOC(pp_ct, sizeof(double *) * msa->alen);
      /* NULL the row table first so cleanup never sees an uninitialized pointer */
      for (apos = 0; apos < msa->alen; apos++) pp_ct[apos] = NULL;
      for (apos = 0; apos < msa->alen; apos++)
        {
          ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals);
          esl_vec_DSet(pp_ct[apos], nppvals, 0.);
        }
    }

  /* allocate and initialize bp_ct, if nec */
  if (ret_bp_ct != NULL)
    {
      ESL_ALLOC(bp_ct, sizeof(double **) * msa->alen);
      /* rows stay NULL unless apos+1 is an 'i' in an i:j pair, where i < j */
      for (apos = 0; apos < msa->alen; apos++) bp_ct[apos] = NULL;

      /* get ct array which defines the consensus base pairs */
      ESL_ALLOC(ct,          sizeof(int)  * (msa->alen+1));
      ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1));
      esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo);
      if ((status = esl_wuss2ct(ss_nopseudo, msa->alen, ct)) != eslOK)
        {
          /* ESL_FAIL returns immediately, so release everything first */
          count_msa_release(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
          ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent.");
        }

      for (apos = 0; apos < msa->alen; apos++)
        {
          /* careful ct is indexed 1..alen, not 0..alen-1 */
          if (ct[(apos+1)] > (apos+1))
            { /* apos+1 is an 'i' in an i:j pair, where i < j */
              ESL_ALLOC(bp_ct[apos], sizeof(double *) * (msa->abc->Kp));
              for (x = 0; x < msa->abc->Kp; x++) bp_ct[apos][x] = NULL;
              for (x = 0; x < msa->abc->Kp; x++)
                {
                  ESL_ALLOC(bp_ct[apos][x], sizeof(double) * (msa->abc->Kp));
                  esl_vec_DSet(bp_ct[apos][x], msa->abc->Kp, 0.);
                }
            }
        }
    }

  ESL_ALLOC(abc_ct, sizeof(double *) * msa->alen);
  for (apos = 0; apos < msa->alen; apos++) abc_ct[apos] = NULL;
  for (apos = 0; apos < msa->alen; apos++)
    {
      ESL_ALLOC(abc_ct[apos], sizeof(double) * (msa->abc->K+1));
      esl_vec_DSet(abc_ct[apos], (msa->abc->K+1), 0.);
    }

  for (i = 0; i < msa->nseq; i++)
    {
      seqwt = use_weights ? msa->wgt[i] : 1.0;

      for (apos = 0; apos < msa->alen; apos++)
        {
          /* update appropriate abc count, careful, ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */
          if ((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1])))
            { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
              if ((status = esl_abc_DCount(msa->abc, abc_ct[apos], msa->ax[i][apos+1], seqwt)) != eslOK)
                {
                  count_msa_release(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
                  ESL_FAIL(status, errbuf, "problem counting residue %d of seq %d", apos, i);
                }
            }
        }

      /* get bp counts, if nec */
      if (bp_ct != NULL)
        {
          for (apos = 0; apos < msa->alen; apos++)
            {
              if (bp_ct[apos] != NULL)
                { /* our flag for whether position (apos+1) is an 'i' in an i:j pair where i < j */
                  rpos = ct[apos+1] - 1; /* ct is indexed 1..alen */
                  bp_ct[apos][msa->ax[i][apos+1]][msa->ax[i][rpos+1]] += seqwt;
                }
            }
        }

      /* get PP counts, if nec; sequences without a PP line are skipped */
      if (pp_ct != NULL && msa->pp[i] != NULL)
        {
          for (apos = 0; apos < msa->alen; apos++)
            {
              if ((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1])))
                { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
                  if ((ppidx = get_pp_idx(msa->abc, msa->pp[i][apos])) == -1)
                    {
                      /* msa itself is not freed, so msa->pp is still valid for the message */
                      count_msa_release(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
                      ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]);
                    }
                  pp_ct[apos][ppidx] += seqwt;
                }
            }
        }
    }

  *ret_abc_ct = abc_ct;
  if (ret_bp_ct != NULL) *ret_bp_ct = bp_ct; /* we only allocated bp_ct if ret_bp_ct != NULL */
  if (ret_pp_ct != NULL) *ret_pp_ct = pp_ct; /* we only allocated pp_ct if ret_pp_ct != NULL */

  /* workspace only; the count arrays now belong to the ret_* pointers */
  free(ss_nopseudo);
  free(ct);

  return eslOK;

 ERROR:
  /* reached only from a failed ESL_ALLOC */
  count_msa_release(msa, abc_ct, bp_ct, pp_ct, ct, ss_nopseudo);
  ESL_FAIL(status, errbuf, "Error, out of memory while counting important values in the msa.");
  return status; /* NEVERREACHED */
}
/* dump_basepair_counts
 *
 * Dump per-basepaired-column basepair counts from bp_ct[][][] to
 * an open output file. Only pairs involving canonical residues
 * are printed. (i.e. for RNA: AA,AC,AG,AU, CA,CC,CG,CU, GA,GC,GG,GU,
 * UA,UC,UG,UU).
 *
 * <bp_ct>  [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1]
 *      - per (non-pknotted) consensus basepair count of each possible basepair
 *        over all seqs. Basepairs are indexed by 'i', the minimum of 'i:j' for a
 *        pair between i and j, where i < j; a NULL row means the column is
 *        not printed. Note that non-canonicals and gaps and the like are
 *        all stored independently.
 *
 * The pairing partner of each column is recomputed here from
 * msa->ss_cons (pseudoknots removed). Counts are printed after a cast
 * to int, so fractional (weighted) counts are truncated in the output.
 *
 * Returns eslOK on success. On failure returns a non-eslOK status with
 * a message in <errbuf>; the temporary arrays are freed on every path.
 */
static int
dump_basepair_counts(FILE *fp, ESL_MSA *msa, ESL_ALPHABET *abc, double ***bp_ct, int use_weights, int nali, int nseq, char *msa_name, char *alifile, char *errbuf)
{
  int   status;
  int   apos, rpos;
  int   i, j;
  int  *ct = NULL;            /* 1..msa->alen base pair partners array for the consensus structure */
  char *ss_nopseudo = NULL;   /* no-pseudoknot version of structure */

  /* get ct array which defines the consensus base pairs */
  ESL_ALLOC(ct,          sizeof(int)  * (msa->alen+1));
  ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1));
  esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo);
  if ((status = esl_wuss2ct(ss_nopseudo, msa->alen, ct)) != eslOK)
    {
      /* ESL_FAIL returns immediately, so release the workspace first */
      free(ss_nopseudo);
      free(ct);
      ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent.");
    }

  /* header */
  fprintf(fp, "# Per-column basepair counts:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if (msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  fprintf(fp, "# Only basepairs involving two canonical (non-degenerate) residues were counted.\n");
  if (use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else             { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  /* column titles: one per ordered pair of canonical symbols */
  fprintf(fp, "# %7s %7s", "lpos", "rpos");
  for (i = 0; i < abc->K; i++)
    {
      for (j = 0; j < abc->K; j++)
        {
          fprintf(fp, " %c%c ", abc->sym[i], abc->sym[j]);
        }
    }
  fprintf(fp, "\n");

  /* underline row */
  fprintf(fp, "# %7s %7s", "-------", "-------");
  for (i = 0; i < abc->K; i++)
    {
      for (j = 0; j < abc->K; j++)
        {
          fprintf(fp, " %6s", "------");
        }
    }
  fprintf(fp, "\n");

  /* one line per column that is the left half of a consensus basepair */
  for (apos = 0; apos < msa->alen; apos++)
    {
      if (bp_ct[apos] != NULL)
        {
          rpos = ct[(apos+1)];   /* ct is indexed 1..alen; rpos is printed 1-based */
          fprintf(fp, " %7d %7d", apos+1, rpos);
          for (i = 0; i < abc->K; i++)
            {
              for (j = 0; j < abc->K; j++)
                {
                  fprintf(fp, " %6d", (int) bp_ct[apos][i][j]);
                }
            }
          fprintf(fp, "\n");
        }
    }
  fprintf(fp, "//\n");

  free(ss_nopseudo);
  free(ct);
  return eslOK;

 ERROR:
  /* reached only from a failed ESL_ALLOC; free(NULL) is a no-op */
  free(ss_nopseudo);
  free(ct);
  ESL_FAIL(status, errbuf, "Error, out of memory while dumping basepair info");
}