int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; char **as = NULL; /* aligned character seqs (random, iid) */ int N,L; /* # of seqs, and their aligned lengths */ int seed; int i,j; int status; double p[4]; /* ACGT probabilities */ #ifdef eslAUGMENT_ALPHABET ESL_DSQ **ax = NULL; /* digitized alignment */ ESL_ALPHABET *abc = NULL; #endif /* Process command line */ go = esl_getopts_Create(options); esl_opt_ProcessCmdline(go, argc, argv); esl_opt_VerifyConfig(go); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } L = esl_opt_GetInteger(go, "-L"); N = esl_opt_GetInteger(go, "-N"); seed = esl_opt_GetInteger(go, "--seed"); if (esl_opt_ArgNumber(go) != 0) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } esl_getopts_Destroy(go); /* Create a random DNA alignment; * force it to obey the conventions of the unit tests: * 0,1 are identical * 0,2 are completely dissimilar */ r = esl_randomness_Create(seed); for (i = 0; i < 4; i++) p[i] = 0.25; ESL_ALLOC(as, sizeof(char *) * N); for (i = 0; i < N; i++) ESL_ALLOC(as[i], sizeof(char) * (L+1)); esl_rsq_IID(r, "ACGT", p, 4, L, as[0]); strcpy(as[1], as[0]); esl_rsq_IID(r, "ACGT", p, 4, L, as[2]); for (j = 0; j < L; j++) while (as[2][j] == as[0][j]) as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)]; for (i = 3; i < N; i++) esl_rsq_IID(r, "ACGT", p, 4, L, as[i]); #ifdef eslAUGMENT_ALPHABET abc = esl_alphabet_Create(eslDNA); ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N); for (i = 0; i < N; i++) esl_abc_CreateDsq(abc, as[i], &(ax[i])); #endif /*eslAUGMENT_ALPHABET*/ /* Unit tests */ if (utest_CPairId(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantor(4, as, N) != eslOK) return eslFAIL; #ifdef eslAUGMENT_ALPHABET if (utest_XPairId(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL; #endif /*eslAUGMENT_ALPHABET*/ #ifdef eslAUGMENT_DMATRIX if (utest_CPairIdMx(as, N) != eslOK) return eslFAIL; if (utest_CDiffMx(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantorMx(4, as, N) != eslOK) return eslFAIL; #endif /* eslAUGMENT_DMATRIX*/ #if defined (eslAUGMENT_ALPHABET) && defined (eslAUGMENT_DMATRIX) if (utest_XPairIdMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XDiffMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantorMx(abc, as, ax, N) != eslOK) return eslFAIL; #endif esl_randomness_Destroy(r); esl_Free2D((void **) as, N); #ifdef eslAUGMENT_ALPHABET esl_alphabet_Destroy(abc); esl_Free2D((void **) ax, N); #endif return eslOK; ERROR: return eslFAIL; }
/* count_msa() * * Given an msa, count residues, and optionally base pairs and * posterior probabilities per column and store them in <ret_abc_ct> * and <ret_pp_ct>. * * <ret_abc_ct> [0..apos..alen-1][0..abc->K]: * - per position count of each symbol in alphabet over all seqs. * * <ret_bp_ct> [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1] * - per (non-pknotted) consensus basepair count of each possible basepair * over all seqs basepairs are indexed by 'i' the minimum of 'i:j' for a * pair between i and j, where i < j. Note that non-canonicals and * gaps and the like are all stored independently. * * <ret_pp_ct> [0..apos..alen-1][0..11] * - per position count of each posterior probability code over all seqs. * * A 'gap' has a looser definition than in esl_abc here, esl_abc's gap, * missing residues and nonresidues are all considered 'gaps' here. * * If we encounter an error, we return non-eslOK status and fill * errbuf with error message. * * Returns eslOK upon success. */ static int count_msa(ESL_MSA *msa, char *errbuf, int nali, int no_ambig, int use_weights, double ***ret_abc_ct, double ****ret_bp_ct, double ***ret_pp_ct) { int status; double **abc_ct = NULL; double ***bp_ct = NULL; int apos, rpos, i, x; int nppvals = 12; /* '0'-'9' = 0-9, '*' = 10, gap = '11' */ double **pp_ct = NULL; /* [0..alen-1][0..nppvals-1] per position count of each possible PP char over all seqs */ int ppidx; /* variables related to getting bp counts */ int *ct = NULL; /* 0..alen-1 base pair partners array for current sequence */ char *ss_nopseudo = NULL; /* no-pseudoknot version of structure */ double seqwt; /* weight of current sequence, always 1.0 if !use_weights */ if(! (msa->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "count_msa() contract violation, MSA is not digitized"); if(use_weights && msa->wgt == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "count_msa(): use_weights==TRUE but msa->wgt == NULL"); /* allocate pp_ct array, if nec */ if(ret_pp_ct != NULL) { if(msa->pp == NULL) ESL_FAIL(eslEINVAL, errbuf, "count_msa() ret_pp_ct != NULL, but msa->pp is NULL"); ESL_ALLOC(pp_ct, sizeof(double *) * msa->alen); for(apos = 0; apos < msa->alen; apos++) { ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals); esl_vec_DSet(pp_ct[apos], nppvals, 0.); } } /* allocate and initialize bp_ct, if nec */ if(ret_bp_ct != NULL) { ESL_ALLOC(bp_ct, sizeof(double **) * msa->alen); /* get ct array which defines the consensus base pairs */ ESL_ALLOC(ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1)); esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo); if ((status = esl_wuss2ct(ss_nopseudo, msa->alen, ct)) != eslOK) ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent."); for(apos = 0; apos < msa->alen; apos++) { /* careful ct is indexed 1..alen, not 0..alen-1 */ if(ct[(apos+1)] > (apos+1)) { /* apos+1 is an 'i' in an i:j pair, where i < j */ ESL_ALLOC(bp_ct[apos], sizeof(double *) * (msa->abc->Kp)); for(x = 0; x < msa->abc->Kp; x++) { ESL_ALLOC(bp_ct[apos][x], sizeof(double) * (msa->abc->Kp)); esl_vec_DSet(bp_ct[apos][x], msa->abc->Kp, 0.); } } else { /* apos+1 is not an 'i' in an i:j pair, where i < j, set to NULL */ bp_ct[apos] = NULL; } } } ESL_ALLOC(abc_ct, sizeof(double *) * msa->alen); for(apos = 0; apos < msa->alen; apos++) { ESL_ALLOC(abc_ct[apos], sizeof(double) * (msa->abc->K+1)); esl_vec_DSet(abc_ct[apos], (msa->abc->K+1), 0.); } for(i = 0; i < msa->nseq; i++) { seqwt = use_weights ? msa->wgt[i] : 1.0; for(apos = 0; apos < msa->alen; apos++) { /* update appropriate abc count, careful, ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */ if((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1]))) { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */ if((status = esl_abc_DCount(msa->abc, abc_ct[apos], msa->ax[i][apos+1], seqwt)) != eslOK) ESL_FAIL(status, errbuf, "problem counting residue %d of seq %d", apos, i); } } /* get bp counts, if nec */ if(bp_ct != NULL) { for(apos = 0; apos < msa->alen; apos++) { /* update appropriate abc count, careful, ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */ if(bp_ct[apos] != NULL) { /* our flag for whether position (apos+1) is an 'i' in an i:j pair where i < j */ rpos = ct[apos+1] - 1; /* ct is indexed 1..alen */ bp_ct[apos][msa->ax[i][apos+1]][msa->ax[i][rpos+1]] += seqwt; } } } /* get PP counts, if nec */ if(pp_ct != NULL) { if(msa->pp[i] != NULL) { for(apos = 0; apos < msa->alen; apos++) { if((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1]))) { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */ if((ppidx = get_pp_idx(msa->abc, msa->pp[i][apos])) == -1) ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]); pp_ct[apos][ppidx] += seqwt; } } } } } *ret_abc_ct = abc_ct; if(ret_bp_ct != NULL) *ret_bp_ct = bp_ct; /* we only allocated bp_ct if ret_bp_ct != NULL */ if(ret_pp_ct != NULL) *ret_pp_ct = pp_ct; /* we only allocated pp_ct if ret_pp_ct != NULL */ if(ss_nopseudo != NULL) free(ss_nopseudo); if(ct != NULL) free(ct); return eslOK; ERROR: if(abc_ct != NULL) esl_Free2D((void **) abc_ct, msa->alen); if(bp_ct != NULL) esl_Free3D((void ***) bp_ct, msa->alen, msa->abc->Kp); if(pp_ct != NULL) esl_Free2D((void **) pp_ct, msa->alen); ESL_FAIL(status, errbuf, "Error, out of memory while counting important values in the msa."); return status; /* NEVERREACHED */ }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ ESLX_MSAFILE *afp = NULL; /* open msa file */ ESL_MSAFILE2 *old_afp = NULL; /* open msa file, legacy (--small) */ ESL_MSA *msa = NULL; /* one multiple sequence alignment */ int nali; /* number of alignments read */ int i; /* counter over seqs */ int64_t alen; /* alignment length */ int nseq; /* number of sequences in the msa */ int64_t rlen; /* a raw (unaligned) seq length */ int64_t small, large; /* smallest, largest sequence */ int64_t nres; /* total # of residues in msa */ double avgid; /* average fractional pair id */ int max_comparisons; /* maximum # comparisons for avg id */ int do_stall; /* used to stall when debugging */ double **abc_ct = NULL; /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */ double ***bp_ct = NULL; /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair * * count of each possible basepair over all seqs basepairs are indexed by 'i' the minimum * * of 'i:j' for a pair between i and j, where i < j. */ double **pp_ct = NULL; /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */ int *i_am_rf = NULL; /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */ int *rf2a_map = NULL; /* [0..rfpos..rflen-1] = apos, * apos is the alignment position (0..msa->alen-1) that * is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */ int rflen = -1; /* nongap RF length */ char errbuf[eslERRBUFSIZE]; int status; /* easel return code */ /* optional output files */ FILE *iinfofp = NULL; /* output file for --iinfo */ FILE *pcinfofp = NULL; /* output file for --pcinfo */ FILE *psinfofp = NULL; /* output file for --psinfo */ FILE *rinfofp = NULL; /* output file for --rinfo */ FILE *icinfofp = NULL; /* output file for --icinfo */ FILE *listfp = NULL; /* output file for --list */ FILE *cinfofp = NULL; /* output file for --cinfo */ FILE *bpinfofp = NULL; /* output file for --bpinfo */ int use_weights; /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */ int weights_exist; /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\n optional output files:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--informat") && (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM) esl_fatal("--small requires --informat pfam\n"); max_comparisons = 1000; do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */ while (do_stall); /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* We'd like to get rid of the legacy msafile interface, but it * includes small memory functionality for Pfam format which we have * to replace first. For now, use both interfaces, new and legacy */ if ( esl_opt_GetBoolean(go, "--small") ) { if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified."); status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp); if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile); else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of alignment %s\n", alifile); else if (status != eslOK) esl_fatal("Alignment file open failed with error %d\n", status); } else { if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); } /************************************** * Open optional output files, as nec * **************************************/ /* determine name for first list file, if nec */ if( esl_opt_IsOn(go, "--list")) { if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL) esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list")); } if( esl_opt_IsOn(go, "--icinfo")) { if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL) esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo")); } if( esl_opt_IsOn(go, "--rinfo")) { if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL) esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo")); } if( esl_opt_IsOn(go, "--pcinfo")) { if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL) esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo")); } if( esl_opt_IsOn(go, "--psinfo")) { if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL) esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo")); } if( esl_opt_IsOn(go, "--iinfo")) { if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL) esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo")); } if( esl_opt_IsOn(go, "--cinfo")) { if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL) esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo")); } if( esl_opt_IsOn(go, "--bpinfo")) { if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL) esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo")); } /*********************************************** * Read MSAs one at a time. ***********************************************/ if (esl_opt_GetBoolean(go, "-1")) { puts("#"); if(! esl_opt_GetBoolean(go, "--small")) { printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id"); printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---"); } else { printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen"); printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------"); } } nali = 0; fmt = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format); while ( (status = ( esl_opt_GetBoolean(go, "--small") ? esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL) : eslx_msafile_Read (afp, &msa))) == eslOK) { nali++; nres = 0; if (! esl_opt_GetBoolean(go, "--small")) { nseq = msa->nseq; alen = msa->alen; small = large = -1; for (i = 0; i < msa->nseq; i++) { rlen = esl_abc_dsqrlen(msa->abc, msa->ax[i]); nres += rlen; if (small == -1 || rlen < small) small = rlen; if (large == -1 || rlen > large) large = rlen; } esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid); } else { /* --small invoked */ for(i = 0; i < alen; i++) nres += (int) esl_vec_DSum(abc_ct[i], abc->K); } if (esl_opt_GetBoolean(go, "-1")) { printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64, nali, msa->name, eslx_msafile_DecodeFormat(fmt), nseq, alen, nres); if (! esl_opt_GetBoolean(go, "--small")) { printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n", small, large, (double) nres / (double) msa->nseq, 100.*avgid); } else { printf(" %10.1f\n", (double) nres / (double) nseq); } } else { printf("Alignment number: %d\n", nali); if (msa->name != NULL) printf("Alignment name: %s\n", msa->name); printf("Format: %s\n", eslx_msafile_DecodeFormat(fmt)); printf("Number of sequences: %d\n", nseq); printf("Alignment length: %" PRId64 "\n", alen); printf("Total # residues: %" PRId64 "\n", nres); if(! esl_opt_GetBoolean(go, "--small")) { printf("Smallest: %" PRId64 "\n", small); printf("Largest: %" PRId64 "\n", large); } printf("Average length: %.1f\n", (double) nres / (double) nseq); if(! esl_opt_GetBoolean(go, "--small")) { printf("Average identity: %.0f%%\n", 100.*avgid); } printf("//\n"); } /* Dump data to optional output files, if nec */ if(esl_opt_IsOn(go, "--list")) { if(! esl_opt_GetBoolean(go, "--small")) { /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */ for(i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]); } } /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions * and rf2a_map, a map of non-gap RF positions to overall alignment positions */ if(msa->rf != NULL) { if((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK) esl_fatal(errbuf); } else i_am_rf = NULL; weights_exist = check_msa_weights(msa); use_weights = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE; if( (! esl_opt_GetBoolean(go, "--small")) && (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo") || esl_opt_IsOn(go, "--pcinfo") || esl_opt_IsOn(go, "--cinfo") || esl_opt_IsOn(go, "--bpinfo"))) { /* collect counts of each residue and PPs (if they exist) from the msa */ if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali); if((status = count_msa(msa, errbuf, nali, esl_opt_GetBoolean(go, "--noambig"), /* ignore ambiguous residues? */ esl_opt_GetBoolean(go, "--weight"), /* use msa->wgt sequence weights? */ &abc_ct, ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL), /* get basepair counts? */ (msa->pp != NULL ? &pp_ct : NULL))) /* get PP counts? */ != eslOK) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--icinfo")) { if((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--rinfo")) { if((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if(esl_opt_IsOn(go, "--pcinfo")) { if(pp_ct == NULL) esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali); if((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if(esl_opt_IsOn(go, "--psinfo")) { if(msa->pp == NULL) esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali); if((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--iinfo")) { if(msa->rf == NULL) esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali); if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali); if((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--cinfo")) { if((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--bpinfo")) { if(msa->ss_cons == NULL) esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali); if((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } esl_msa_Destroy(msa); if(abc_ct != NULL) { esl_Free2D((void **) abc_ct, alen); abc_ct = NULL; } if(bp_ct != NULL) { esl_Free3D((void ***) bp_ct, alen, abc->Kp); bp_ct = NULL; } if(pp_ct != NULL) { esl_Free2D((void **) pp_ct, alen); pp_ct = NULL; } if(i_am_rf != NULL) { free(i_am_rf); i_am_rf = NULL; } if(rf2a_map != NULL) { free(rf2a_map); rf2a_map = NULL; } } /* If an msa read failed, we've dropped out to here with an informative status code. * we have to handle failures from new vs. legacy msa parsing differently */ if (esl_opt_GetBoolean(go, "--small")) { if (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf); else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status); else if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); } else { if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); } /* Cleanup, normal return */ if(listfp != NULL) { fclose(listfp); printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list")); } if(icinfofp != NULL) { fclose(icinfofp); printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo")); } if(rinfofp != NULL) { fclose(rinfofp); printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo")); } if(pcinfofp != NULL) { fclose(pcinfofp); printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo")); } if(psinfofp != NULL) { fclose(psinfofp); printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo")); } if(iinfofp != NULL) { printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo")); fclose(iinfofp); } if(cinfofp != NULL) { printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo")); fclose(cinfofp); } if(bpinfofp != NULL) { printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo")); fclose(bpinfofp); } if (afp) eslx_msafile_Close(afp); if (old_afp) esl_msafile2_Close(old_afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* annotate_posterior_probability() * Synopsis: Add posterior probability annotation lines to new MSA. */ static int annotate_posterior_probability(ESL_MSA *msa, P7_TRACE **tr, const int *matmap, int M, int optflags) { double *totp = NULL; /* total posterior probability in column <apos>: [0..alen-1] */ int *matuse = NULL; /* #seqs with pp annotation in column <apos>: [0..alen-1] */ int idx; /* counter over sequences [0..nseq-1] */ int apos; /* counter for alignment columns: pp's are [0..alen-1] (unlike ax) */ int z; /* counter over trace positions [0..tr->N-1] */ int status; /* Determine if any of the traces have posterior probability annotation. */ for (idx = 0; idx < msa->nseq; idx++) if (tr[idx]->pp != NULL) break; if (idx == msa->nseq) return eslOK; ESL_ALLOC(matuse, sizeof(double) * (msa->alen)); esl_vec_ISet(matuse, msa->alen, 0); ESL_ALLOC(totp, sizeof(double) * (msa->alen)); esl_vec_DSet(totp, msa->alen, 0.0); ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc); for (idx = 0; idx < msa->nseq; idx++) { if (tr[idx]->pp == NULL) { msa->pp[idx] = NULL; continue; } ESL_ALLOC(msa->pp[idx], sizeof(char) * (msa->alen+1)); for (apos = 0; apos < msa->alen; apos++) msa->pp[idx][apos] = '.'; msa->pp[idx][msa->alen] = '\0'; apos = 0; for (z = 0; z < tr[idx]->N; z++) { switch (tr[idx]->st[z]) { case p7T_M: msa->pp[idx][matmap[tr[idx]->k[z]]-1] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]); totp [matmap[tr[idx]->k[z]]-1]+= tr[idx]->pp[z]; matuse[matmap[tr[idx]->k[z]]-1]++; case p7T_D: apos = matmap[tr[idx]->k[z]]; break; case p7T_I: if ( !(optflags & p7_TRIM) || (tr[idx]->k[z] != 0 && tr[idx]->k[z] != M)) { msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]); apos++; } break; case p7T_N: case p7T_C: if (! (optflags & p7_TRIM) && tr[idx]->i[z] > 0) { msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]); apos++; } break; case p7T_E: apos = matmap[M]; /* set position for C-terminal tail */ break; default: break; } } } for (; idx < msa->sqalloc; idx++) msa->pp[idx] = NULL; /* for completeness, following easel MSA conventions, but should be a no-op: nseq==sqalloc */ /* Consensus posterior probability annotation: only on match columns */ ESL_ALLOC(msa->pp_cons, sizeof(char) * (msa->alen+1)); for (apos = 0; apos < msa->alen; apos++) msa->pp_cons[apos] = '.'; msa->pp_cons[msa->alen] = '\0'; for (apos = 0; apos < msa->alen; apos++) if (matuse[apos]) msa->pp_cons[apos] = p7_alidisplay_EncodePostProb( totp[apos] / (double) matuse[apos]); free(matuse); free(totp); return eslOK; ERROR: if (matuse != NULL) free(matuse); if (totp != NULL) free(totp); if (msa->pp != NULL) esl_Free2D((void **) msa->pp, msa->sqalloc); return status; }