/* Function: InitializeCP9Matrix()
 *
 * Purpose:  Reset a CP9 matrix to -INFTY. The first <mx->ncells_valid>
 *           entries of each of the four per-state cell arrays
 *           (<mmx_mem>, <imx_mem>, <dmx_mem>, <elmx_mem>) and the first
 *           <mx->rows> entries of <erow> are overwritten.
 *
 * Args:     mx - the matrix to reset; its arrays must already be allocated.
 *
 * Return:   (void)
 */
void
InitializeCP9Matrix(CP9_MX *mx)
{
  int *state_cells[4];
  int  s;

  /* same order as the fields are filled one by one: M, I, D, EL */
  state_cells[0] = mx->mmx_mem;
  state_cells[1] = mx->imx_mem;
  state_cells[2] = mx->dmx_mem;
  state_cells[3] = mx->elmx_mem;

  for (s = 0; s < 4; s++)
    esl_vec_ISet(state_cells[s], mx->ncells_valid, -INFTY);

  /* erow is sized by rows, not by cells */
  esl_vec_ISet(mx->erow, mx->rows, -INFTY);
}
/* Unit test for esl_rnd_DChoose() and esl_rnd_FChoose().
 *
 * A multinomial probability vector over <nbins> bins is drawn with
 * esl_dirichlet_DSampleUniform() and copied to single precision.
 * <n> samples are then drawn with DChoose(), binned, and compared to the
 * expected counts <n * p[i]> with a chi-squared statistic; the same is
 * repeated with FChoose() on the float copy. Either test is fatal if
 * the reported probability falls below 0.01.
 *
 *   r          - random number source
 *   n          - number of samples per test
 *   nbins      - number of bins in the probability vector
 *   be_verbose - nonzero to print the two probabilities
 */
static void
utest_choose(ESL_RANDOMNESS *r, int n, int nbins, int be_verbose)
{
  double *dprob = NULL;    /* double-precision probabilities */
  float  *fprob = NULL;    /* the same vector in single precision */
  int    *tally = NULL;    /* observed counts per bin */
  double  chisq;           /* chi-squared statistic */
  double  pvalue;          /* probability returned by the chi-squared test */
  double  expect;          /* expected count for one bin */
  double  delta;           /* observed - expected for one bin */
  int     b;               /* bin index */
  int     draw;            /* sample counter */

  dprob = malloc(sizeof(double) * nbins);
  if (dprob == NULL) esl_fatal("malloc failed");
  fprob = malloc(sizeof(float) * nbins);
  if (fprob == NULL) esl_fatal("malloc failed");
  tally = malloc(sizeof(int) * nbins);
  if (tally == NULL) esl_fatal("malloc failed");

  /* Random multinomial probability vector, in both precisions. */
  if (esl_dirichlet_DSampleUniform(r, nbins, dprob) != eslOK)
    esl_fatal("dirichlet sample failed");
  esl_vec_D2F(dprob, nbins, fprob);

  /* --- DChoose(): sample, then chi-squared against n*p --- */
  esl_vec_ISet(tally, nbins, 0);
  draw = 0;
  while (draw < n)
    {
      tally[esl_rnd_DChoose(r, dprob, nbins)]++;
      draw++;
    }

  chisq = 0.;
  for (b = 0; b < nbins; b++)
    {
      expect = (double) n * dprob[b];
      delta  = (double) tally[b] - expect;
      chisq += delta*delta/expect;
    }
  if (esl_stats_ChiSquaredTest(nbins, chisq, &pvalue) != eslOK)
    esl_fatal("chi square eval failed");
  if (be_verbose) printf("DChoose():  \t%g\n", pvalue);
  if (pvalue < 0.01) esl_fatal("chi squared test failed");

  /* --- FChoose(): same procedure; expectations still use the doubles --- */
  esl_vec_ISet(tally, nbins, 0);
  draw = 0;
  while (draw < n)
    {
      tally[esl_rnd_FChoose(r, fprob, nbins)]++;
      draw++;
    }

  chisq = 0.;
  for (b = 0; b < nbins; b++)
    {
      expect = (double) n * dprob[b];
      delta  = (double) tally[b] - expect;
      chisq += delta*delta/expect;
    }
  if (esl_stats_ChiSquaredTest(nbins, chisq, &pvalue) != eslOK)
    esl_fatal("chi square eval failed");
  if (be_verbose) printf("FChoose():  \t%g\n", pvalue);
  if (pvalue < 0.01) esl_fatal("chi squared test failed");

  free(dprob);
  free(fprob);
  free(tally);
}
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Constructs a new alignment by removing near-identical
 *            sequences from a given alignment (where identity is
 *            calculated *based on the alignment*).
 *            Does not affect the given alignment.
 *            Keeps earlier sequence, discards later one.
 *
 *            A sequence is discarded when its pairwise identity to any
 *            already-kept sequence is strictly greater than <maxid>.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Args:      msa        - alignment to filter (text or digital mode)
 *            maxid      - identity threshold
 *            ret_newmsa - RETURN: the filtered alignment, built by
 *                         esl_msa_SequenceSubset()
 *
 * Return:    <eslOK> on success, and the <newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data, or if <msa> is flagged digital in a build that was
 *            compiled without <eslAUGMENT_ALPHABET> (no digital
 *            identity routine is available then).
 *            In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int     *list   = NULL;   /* indices of the seqs kept so far          */
  int     *useme  = NULL;   /* useme[i] is TRUE if seq i is kept        */
  int      nnew;            /* number of seqs in new alignment          */
  double   ident  = 0.;     /* pairwise identity                        */
  int      i,j;             /* seq counters                             */
  int      is_redundant;    /* TRUE if seq i is to be removed           */
  int      status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  /* allocate */
  ESL_ALLOC(list,  sizeof(int) * msa->nseq);
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);
  esl_vec_ISet(useme, msa->nseq, 0); /* initialize array */

  /* find which seqs to keep (list): compare each seq to those already kept */
  nnew = 0;
  for (i = 0; i < msa->nseq; i++)
    {
      is_redundant = FALSE;
      for (j = 0; j < nnew; j++)
	{
	  if (! (msa->flags & eslMSA_DIGITAL)) {
	    if ((status = esl_dst_CPairId(msa->aseq[i], msa->aseq[list[j]], &ident, NULL, NULL)) != eslOK) goto ERROR;
	  }
#ifdef eslAUGMENT_ALPHABET
	  else {
	    if ((status = esl_dst_XPairId(msa->abc, msa->ax[i], msa->ax[list[j]], &ident, NULL, NULL)) != eslOK) goto ERROR;
	  }
#else
	  else {
	    /* No digital identity routine in this build: fail rather than
	     * compare against an <ident> that was never computed.
	     */
	    status = eslEINVAL;
	    goto ERROR;
	  }
#endif

	  if (ident > maxid)
	    {
	      is_redundant = TRUE;
	      break;
	    }
	}
      if (is_redundant == FALSE) {
	list[nnew++] = i;
	useme[i]     = TRUE;
      }
    }
  if ((status = esl_msa_SequenceSubset(msa, useme, ret_newmsa)) != eslOK) goto ERROR;

  free(list);
  free(useme);
  return eslOK;

 ERROR:
  if (list  != NULL) free(list);
  if (useme != NULL) free(useme);
  return status;
}
/* map_sub_msas
 *
 * msa1 and msa2 contain the same named sequences, msa1 contains a superset
 * of the columns in msa2. Determine which of the msa1 columns the msa2
 * columns correspond to.
 *
 * Columns are matched greedily, left to right: msa1 column <apos1> is
 * paired with msa2 column <apos2> when every sequence carries the same
 * digital code in both; otherwise the msa1 column is skipped.
 *
 * Args:     go      - options; only used to name the input files in
 *                     error messages (argument 1 and --submap)
 *           errbuf  - buffer that receives an error message on failure
 *           msa1    - digital alignment with the superset of columns
 *           msa2    - digital alignment with the subset of columns
 *           ret_msa1_to_msa2_mask - RETURN: newly allocated, NUL-terminated
 *                     string of length msa1->alen; character <apos1-1> is
 *                     '1' if msa1 column <apos1> was matched to an msa2
 *                     column and '0' otherwise. Caller frees.
 *
 * Returns:  <eslOK> on success.
 *           <eslEINVAL> (with <errbuf> filled) if either alignment is not
 *           digital, if msa1 is not longer than msa2, if the sequence
 *           counts or names differ, or if not every msa2 column could be
 *           matched. On failure <*ret_msa1_to_msa2_mask> is not written
 *           and nothing is leaked.
 *           ESL_ALLOC() failure jumps to ERROR and returns its status.
 */
static int
map_sub_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, char **ret_msa1_to_msa2_mask)
{
  int   status;
  int   apos1, apos2;  /* counters over alignment position in msa1, msa2 respectively */
  int   i;             /* counter over sequences */
  char *mask = NULL;   /* the mask being built */

  /* contract check; everything that can be validated is validated before allocating */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa2 (%s) not digitized.\n", esl_opt_GetString(go, "--submap"));
  if(msa1->alen <= msa2->alen) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() alignment length for msa1 (%" PRId64 ") <= length for msa2 (%" PRId64 ")\n", msa1->alen, msa2->alen);

  /* both alignments must have same 'named' sequences in same order */
  if(msa1->nseq != msa2->nseq) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 has %d sequences, msa2 has %d sequences\n", msa1->nseq, msa2->nseq);
  for(i = 0; i < msa1->nseq; i++) {
    if(strcmp(msa1->sqname[i], msa2->sqname[i]) != 0) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 seq %d is named %s, msa2 seq %d is named %s\n", i, msa1->sqname[i], i, msa2->sqname[i]);
  }

  ESL_ALLOC(mask, sizeof(char) * (msa1->alen+1));
  for(apos1 = 0; apos1 < msa1->alen; apos1++) mask[apos1] = '0';
  mask[msa1->alen] = '\0';

  /* Walk msa1's columns. The loop is bounded by msa1 alone so that <apos1>
   * can never index past msa1's last column, even when msa2 is not really
   * a column subset of msa1. Once msa2 is exhausted, <apos2> sits at
   * msa2->alen+1; the comparison then reads that one extra position of
   * msa2's digital sequences (presumably the end sentinel, which never
   * equals a residue code -- TODO confirm) and the remaining msa1 columns
   * are simply skipped.
   */
  apos1 = 1;
  apos2 = 1;
  while(apos1 <= msa1->alen) {
    for(i = 0; i < msa1->nseq; i++) {
      if(msa1->ax[i][apos1] != msa2->ax[i][apos2]) {
	apos1++;
	break; /* try next apos1 */
      }
    }
    if(i == msa1->nseq) { /* found a match */
      mask[(apos1-1)] = '1';
      apos1++;
      apos2++;
    }
  }

  /* success requires that both alignments were consumed completely */
  if((apos1 != (msa1->alen+1)) || (apos2 != (msa2->alen+1))) {
    free(mask);
    ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas(), failure mapping alignments, end of loop apos1-1 = %d (msa1->alen: %" PRId64 ") and apos2-1 = %d (msa2->alen: %" PRId64 ")\n", apos1-1, msa1->alen, apos2-1, msa2->alen);
  }

  *ret_msa1_to_msa2_mask = mask;
  return eslOK;

 ERROR:
  if(mask != NULL) free(mask);
  return status;
}
/* map_rfpos_to_apos
 *
 * Build the two-way correspondence between alignment columns and
 * reference positions of <msa>. An alignment column counts as a
 * reference position when its RF character is not a gap, not a
 * missing-data symbol and not a nonresidue in <msa->abc>.
 *
 * Args:    msa          - alignment; must have RF annotation
 *          ret_rf2a_map - optRETURN: [0..rflen]; entry 0 is -1, entry
 *                         rfpos (1..rflen) is the alignment column
 *                         (1..alen) of that reference position
 *          ret_a2rf_map - optRETURN: [0..alen]; entry apos is the
 *                         reference position of column apos, or -1 if
 *                         the column is not a reference position
 *          ret_rflen    - optRETURN: number of reference positions
 *
 * Any of the three return arguments may be NULL; an array that is not
 * wanted is freed here. Arrays that are returned belong to the caller.
 *
 * Returns: <eslOK> on success; <eslEINVAL> if <msa->rf> is NULL.
 *          ESL_ALLOC() failure jumps to ERROR and returns its status.
 */
static int
map_rfpos_to_apos(ESL_MSA *msa, int **ret_rf2a_map, int **ret_a2rf_map, int *ret_rflen)
{
  int   status;
  int   ncons    = 0;     /* number of reference positions */
  int  *cons2col = NULL;  /* reference position -> alignment column */
  int  *col2cons = NULL;  /* alignment column -> reference position */
  int   cpos     = 0;     /* reference positions assigned so far */
  int   col;              /* alignment column counter */
  char  rfchar;           /* RF character of the current column */

  /* contract check */
  if(msa->rf == NULL) { status = eslEINVAL; goto ERROR; }

  /* first pass: count the reference columns so the map can be sized */
  for(col = 0; col < msa->alen; col++)
    {
      rfchar = msa->rf[col];
      if(esl_abc_CIsGap(msa->abc, rfchar) ||
	 esl_abc_CIsMissing(msa->abc, rfchar) ||
	 esl_abc_CIsNonresidue(msa->abc, rfchar))
	continue;
      ncons++;
    }

  ESL_ALLOC(cons2col, sizeof(int) * (ncons+1));
  ESL_ALLOC(col2cons, sizeof(int) * (msa->alen+1));
  esl_vec_ISet(col2cons, msa->alen+1, -1);
  cons2col[0] = -1;

  /* second pass: fill both maps; non-reference columns keep their -1 */
  for(col = 1; col <= msa->alen; col++)
    {
      rfchar = msa->rf[col-1];
      if(esl_abc_CIsGap(msa->abc, rfchar) ||
	 esl_abc_CIsMissing(msa->abc, rfchar) ||
	 esl_abc_CIsNonresidue(msa->abc, rfchar))
	continue;
      cpos++;
      cons2col[cpos] = col;
      col2cons[col]  = cpos;
    }

  /* hand over what the caller asked for, release the rest */
  if(ret_rf2a_map == NULL) free(cons2col);
  else                     *ret_rf2a_map = cons2col;

  if(ret_a2rf_map == NULL) free(col2cons);
  else                     *ret_a2rf_map = col2cons;

  if(ret_rflen != NULL)    *ret_rflen = ncons;
  return eslOK;

 ERROR:
  if(cons2col != NULL) free(cons2col);
  if(col2cons != NULL) free(col2cons);
  return status;
}
/* get_gaps_per_column
 *
 * Count, for every column of a digital alignment, how many sequences
 * esl_abc_XIsGap() reports as a gap there.
 *
 * Args:    msa       - alignment; must be in digital mode
 *          ret_ngaps - RETURN: newly allocated array [0..alen];
 *                      entry apos (1..alen) is the gap count of column
 *                      apos, entry 0 is 0. Caller frees.
 *
 * Returns: <eslOK> on success; <eslEINVAL> if <msa> is not digital.
 *          ESL_ALLOC() failure jumps to ERROR and returns its status.
 */
static int
get_gaps_per_column(ESL_MSA *msa, int **ret_ngaps)
{
  int      *gapct = NULL;   /* per-column gap counts */
  ESL_DSQ  *dsq;            /* digital sequence of the current row */
  int       seqidx;         /* row counter */
  int       col;            /* column counter, 1..alen */
  int       status;

  /* contract check */
  if((msa->flags & eslMSA_DIGITAL) == 0) { status = eslEINVAL; goto ERROR; }

  ESL_ALLOC(gapct, sizeof(int) * (msa->alen+1));
  esl_vec_ISet(gapct, msa->alen+1, 0);

  for(seqidx = 0; seqidx < msa->nseq; seqidx++)
    {
      dsq = msa->ax[seqidx];
      col = 1;
      while(col <= msa->alen)
	{
	  gapct[col] += esl_abc_XIsGap(msa->abc, dsq[col]);
	  col++;
	}
    }

  *ret_ngaps = gapct;
  return eslOK;

 ERROR:
  if(gapct != NULL) free(gapct);
  return status;
}
/* The esl_random() unit test: * a binned frequency test. */ static void utest_random(long seed, int n, int nbins, int be_verbose) { ESL_RANDOMNESS *r = NULL; int *counts = NULL; double X2p = 0.; int i; double X2, exp, diff; if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed"); esl_vec_ISet(counts, nbins, 0); /* This contrived call sequence exercises CreateTimeseeded() and * Init(), while leaving us a reproducible chain. Because it's * reproducible, we know this test succeeds, despite being * statistical in nature. */ if ((r = esl_randomness_CreateTimeseeded()) == NULL) esl_fatal("randomness create failed"); if (esl_randomness_Init(r, seed) != eslOK) esl_fatal("randomness init failed"); for (i = 0; i < n; i++) counts[esl_rnd_Roll(r, nbins)]++; /* X^2 value: \sum (o_i - e_i)^2 / e_i */ for (X2 = 0., i = 0; i < nbins; i++) { exp = (double) n / (double) nbins; diff = (double) counts[i] - exp; X2 += diff*diff/exp; } if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed"); if (be_verbose) printf("random(): \t%g\n", X2p); if (X2p < 0.01) esl_fatal("chi squared test failed"); esl_randomness_Destroy(r); free(counts); return; }
/* Function:  esl_msaweight_BLOSUM()
 * Synopsis:  BLOSUM weights.
 * Incept:    SRE, Sun Nov  5 09:52:41 2006 [Janelia]
 *
 * Purpose:   Compute BLOSUM-style sequence weights (Henikoff and
 *            Henikoff, PNAS 89:10915-10919, 1992) for <msa> at
 *            fractional identity threshold <maxid> (range 0..1), and
 *            store them in <msa->wgt>, replacing whatever was there.
 *
 *            Sequences are grouped by single linkage clustering
 *            (esl_msacluster_SingleLinkage()). Each sequence in a
 *            cluster of M members first receives weight 1/M; the
 *            weights are then normalized and rescaled so that they
 *            sum to <msa->nseq>.
 *
 *            An alignment with a single sequence gets weight 1.0 and
 *            returns immediately.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the pairwise identity
 *            calculations deal with degenerate residue symbols
 *            properly.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      [Henikoff92]; squid::weight.c::BlosumWeights().
 */
int
esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
{
  int *assignment  = NULL;  /* cluster index of each sequence   */
  int *clustersize = NULL;  /* number of members per cluster    */
  int  nclusters;           /* number of clusters               */
  int  idx;                 /* sequence counter                 */
  int  status;

  /* Contract checks */
  ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );

  /* trivial case: nothing to cluster */
  if (msa->nseq == 1)
    {
      msa->wgt[0] = 1.0;
      return eslOK;
    }

  status = esl_msacluster_SingleLinkage(msa, maxid, &assignment, NULL, &nclusters);
  if (status != eslOK) goto ERROR;

  /* tally cluster sizes */
  ESL_ALLOC(clustersize, sizeof(int) * nclusters);
  esl_vec_ISet(clustersize, nclusters, 0);
  for (idx = 0; idx < msa->nseq; idx++)
    clustersize[assignment[idx]] += 1;

  /* every member of a cluster shares that cluster's unit weight */
  for (idx = 0; idx < msa->nseq; idx++)
    msa->wgt[idx] = 1. / (double) clustersize[assignment[idx]];

  /* Make weights normalize up to nseq, and return. */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(clustersize);
  free(assignment);
  return eslOK;

 ERROR:
  if (assignment  != NULL) free(assignment);
  if (clustersize != NULL) free(clustersize);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt; /* format code for alifiles */ ESL_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *msa = NULL; /* multiple sequence alignment */ int status; /* easel return code */ int do_info = TRUE; /* TRUE if -i */ int do_max = FALSE; /* TRUE if -x */ int do_ffreq = FALSE; /* TRUE if --ffreq */ int do_fmin = FALSE; /* TRUE if --fmin */ float fthresh = 0.; /* <x> from -f <x> */ int do_remove_bps = FALSE; /* TRUE if -r */ int do_consistent = FALSE; /* TRUE if -c */ int do_indi2cons = FALSE; /* TRUE if --indi <x> */ int have_cons; /* TRUE if first alignment has consensus sequence */ int do_newcons = FALSE; /* TRUE if we're creating a new consensus structure * and outputing a new alignment (if -x -f -c or --indi) */ int do_a = FALSE; /* TRUE if -a */ char *indi; /* for <x> from --indi <x> */ int nindi_read; /* number of individual sequence SS lines we've read for current alignment */ int a; /* counter over seqs */ int i, i2; /* counter over residues */ int j, j2; /* counter over residues */ int nali; /* counter over alignments */ int **bp = NULL; /* bp[i][j] is number of individual bps exist between aln cols i and j */ int *cur_ct = NULL; /* ct array of basepairs for current sequence */ int *cons_ct = NULL; /* ct array of basepairs for SS_cons being created */ int *xcons_ct = NULL; /* ct array of basepairs for existing SS_cons */ int *ngaps = NULL; /* number of gaps in each alignment position */ FILE *ofp; /* output file (default is stdout) */ int be_verbose = FALSE; /* TRUE to print extra info */ int seqthresh; /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/ char *sscons = NULL; /* the new SS_cons line */ FILE *lfp = NULL; /* file to list sequences with conflicting bps to */ int nlist = 0; /* number of sequences listed to list file 
*/ int *nconflictsA; /* number of conflicting bps in seq a's individual structure annotation */ int nconflicts_total = 0; /* total number of conflicts */ int nconflicts_list = 0; /* total number of conflicts in sequences listed to file <x> from -l <x> */ int noverlaps_total = 0; /* total number of overlaps */ int nconsistent_total = 0; /* total number of consistent bps */ int nbps_total = 0; /* total number of bps */ int *nconsistentA; /* number of consistent bps in seq a's individual structure annotation */ int *noverlapsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int *nbpsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int ncons_bps = 0; /* number of bps in consensus structure */ int max_noverlaps_aidx; int max_nconsistent_aidx; int max_nbps_aidx; int *removebp; /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */ int *has_conflict; int *nmates_l2r; /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */ int *nmates_r2l; /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left mates i for j */ int lmax; /* with -l, maximum number of conflicts to allow */ int namewidth = 18; /* length of 'SS_cons(consensus)' */ char *namedashes = NULL; /* to store underline for seq name */ /* --fmin related variables */ int nbps = 0; int prev_nbps = -1; float fmin; int inconsistent_flag; int pknot_flag; int k,l; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, 
argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\noptions for defining a new consensus structure (all of these require -o):"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\noptions for listing sequences based on structure:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (status = esl_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); /* open output file */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = NULL; if (esl_opt_GetString(go, "-l") != NULL) { if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l")); } /* determine if we're creating a structure */ do_max = esl_opt_GetBoolean(go, "-x"); if(!(esl_opt_IsDefault(go, "--ffreq"))) { do_ffreq = TRUE; fthresh = esl_opt_GetReal(go, "--ffreq"); } if(!(esl_opt_IsDefault(go, "--fmin"))) { do_fmin = TRUE; } do_remove_bps = esl_opt_GetBoolean(go, "-r"); do_consistent = esl_opt_GetBoolean(go, "-c"); if(!(esl_opt_IsDefault(go, "--indi"))) { do_indi2cons = TRUE; } if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_newcons = TRUE; } do_a = esl_opt_GetBoolean(go, "-a"); 
if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_info = FALSE; } /*********************************************** * Read MSAs one at a time. ***********************************************/ nali = 0; have_cons = FALSE; lmax = esl_opt_GetInteger(go, "--lmax"); if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; while ((status = esl_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) esl_msafile_ReadFailure(afp, status); nali++; /* determine max length name */ namewidth = 18; /* length of 'SS_cons(consensus)' */ for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i])); if(namedashes != NULL) { free(namedashes); } ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(i = 0; i < namewidth; i++) namedashes[i] = '-'; ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1)); ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1)); ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1)); ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1)); esl_vec_ISet(cur_ct, (msa->alen+1), 0); esl_vec_ISet(cons_ct, (msa->alen+1), 0); esl_vec_ISet(xcons_ct, (msa->alen+1), 0); esl_vec_ISet(removebp, (msa->alen+1), FALSE); esl_vec_ISet(has_conflict, (msa->alen+1), FALSE); esl_vec_ISet(nmates_l2r, (msa->alen+1), 0); esl_vec_ISet(nmates_r2l, (msa->alen+1), 0); ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq); ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq); ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq); ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq); esl_vec_ISet(nconflictsA, msa->nseq, 0); esl_vec_ISet(noverlapsA, msa->nseq, 0); esl_vec_ISet(nconsistentA, msa->nseq, 0); esl_vec_ISet(nbpsA, msa->nseq, 0); max_noverlaps_aidx = max_nconsistent_aidx = max_nbps_aidx 
= 0; nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0; for(i = 1; i <= msa->alen; i++) { ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1)); esl_vec_ISet(bp[i], (msa->alen+1), 0); } /* make sure we have ss_cons and indi ss if we need it */ if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_max) esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_consistent) esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_indi2cons) esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_ffreq) esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_fmin) esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss_cons != NULL) { if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { esl_fatal("Existing SS_cons for alignment %d is invalid.", nali); } ncons_bps = 0; for(i = 1; i <= msa->alen; i++) if(xcons_ct[i] != 0 && i < xcons_ct[i]) ncons_bps++; if(nali > 1 && !have_cons) esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); if(nali == 1) have_cons = TRUE; } else if (lfp != NULL) { esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_remove_bps) { esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_consistent) { esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); } else { if(nali > 1 && have_cons) esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, they all 
must.", nali); } if(do_info) { printf("# Per-sequence basepair information:\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); } if(have_cons) { printf("#\n"); printf("# indibp: number of basepairs in the individual sequence SS annotation\n"); printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n"); printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n"); printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n"); printf("#\n"); printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n"); printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n"); printf("#\n"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, namedashes, "------", "------", "-----", "------"); } else { printf("# %-*s %6s\n", namewidth, "seqname", "nbp"); printf("# %-*s %6s\n", namewidth, namedashes, "------"); } } nindi_read = 0; for (a = 0; a < msa->nseq; a++) { if(msa->ss != NULL && msa->ss[a] != NULL) { if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { esl_fatal("SS annotation for sequence %d, aln %d is invalid.\n", (a+1), nali); } nindi_read++; for(i = 1; i <= msa->alen; i++) { if(i < cur_ct[i]) { bp[i][cur_ct[i]]++; if(bp[i][cur_ct[i]] == 1) { nmates_l2r[i]++; nmates_r2l[cur_ct[i]]++; } } } for(i = 1; i <= msa->alen; i++) { if(cur_ct[i] != 0 && i < cur_ct[i]) { if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++; if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with i:xcons_ct[i] */ 
removebp[i] = TRUE; removebp[xcons_ct[i]] = TRUE; } else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */ removebp[cur_ct[i]] = TRUE; removebp[xcons_ct[cur_ct[i]]] = TRUE; } else nconsistentA[a]++; } } if(nconflictsA[a] > lmax) { if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); nconflicts_list += nconflictsA[a]; nlist++; } nbpsA[a] = nconflictsA[a] + nconsistentA[a]; nconflicts_total += nconflictsA[a]; nconsistent_total += nconsistentA[a]; noverlaps_total += noverlapsA[a]; nbps_total += nbpsA[a]; if(do_info && have_cons) printf(" %-*s %6d %6d %6d %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); if(do_info && !have_cons) printf(" %-*s %6d\n", namewidth, msa->sqname[a], nbpsA[a]); if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a; if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a; if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a; } else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); } } if(do_info && have_cons) { if(nindi_read > 0) printf("\n"); printf(" %-*s %6d %6d %6d %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); if(nindi_read > 0) { printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? 
(float) noverlaps_total / (float) nbps_total : 0.); printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.); printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.); } else { printf("# No sequences in the alignment have GR SS annotation.\n"); } } if(lfp != NULL) { printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); } /* determine number of gaps per alignment column */ if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR; /* -x: determine max bp structure OR * -a: list all conflicts in individual structures */ if(do_max || do_a) { for(i = 1; i <= msa->alen; i++) { if(nmates_l2r[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[i][j] > 0) { if(do_a) printf("More than 1 right mates for left mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { if(nmates_r2l[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[j][i] > 0) { if(do_a) printf("More than 1 left mates for right mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/ if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { j = i+1; while(bp[i][j] == 0) j++; cons_ct[i] = j; cons_ct[j] = i; } } /* remove pseudoknotted bps greedily */ for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { for(i2 = i+1; i2 <= msa->alen; i2++) { j2 = 
cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/ /* note: remove both if they have equal number of sequences */ if(bp[i][j] <= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i, j);*/ cons_ct[cons_ct[i]] = 0; cons_ct[i] = 0; } if(bp[i][j] >= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i2, j2);*/ cons_ct[cons_ct[i2]] = 0; cons_ct[i2] = 0; } } } } } } } /***************************************/ /*PARANOID, second check for knots for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { printf("BP: %4d:%4d\n", i, j); for(i2 = 1; i2 <= msa->alen; i2++) { j2 = cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { if((i < i2)) { printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]); } } } } } } ******************************************/ /***************************************/ /*PARANOID, check cons_ct for consistency for(i = 1; i <= msa->alen; i++) { if(cons_ct[i] != 0) { if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); } } } */ /*PARANOID, write out SS_cons for(i = 1; i <= msa->alen; i++) { if(i < cons_ct[i]) printf("<"); else if(cons_ct[i] != 0) { printf(">"); } else printf("."); } printf("\n"); */ /***************************************/ /* textize alignment */ if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); /* --fmin */ if(do_fmin) { /* define ss_cons */ prev_nbps = -1; fthresh = 0.99; inconsistent_flag = pknot_flag = FALSE; printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > <x> individual SS contain i:j as a pair\n"); printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n"); printf("# A consistent structure has each position involved in 0 or 1 
basepairs.\n"); printf("#\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); printf("# Number of seqs: %d\n", msa->nseq); printf("#\n"); printf("# %5s %23s %6s\n", "<x>", "nseq-required-with-bp", "numbps"); printf("# %5s %23s %6s\n", "-----", "-----------------------", "------"); while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { nbps = 0; seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ esl_vec_ISet(cons_ct, msa->alen+1, 0); for(i = 1; i <= msa->alen; i++) { for(j = i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0 || cons_ct[j] != 0) { inconsistent_flag = TRUE; } /* check for pseudoknots */ for(k = i+1; k < j; k++) { l = cons_ct[k]; if((k < l) && (l > j)) { pknot_flag = TRUE; } if((k > l) && (l != 0) && (l < i)) { pknot_flag = TRUE; } } cons_ct[i] = j; cons_ct[j] = i; nbps++; } } } if(inconsistent_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "inconsistent"); else if(pknot_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "pseudoknotted"); else { if(nbps != prev_nbps) { printf(" %.3f %23d %6d\n", fthresh, seqthresh+1, nbps); } fmin = fthresh; } fthresh -= 0.01; prev_nbps = nbps; } fthresh = fmin; esl_vec_ISet(cons_ct, msa->alen+1, 0); } /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */ if(do_ffreq || do_fmin) { if(do_fmin) { printf("#\n# <x> determined to be %.3f\n", fthresh); } if(do_ffreq) { printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > %f individual SS contain i:j as a pair\n", fthresh); } esl_vec_ISet(cons_ct, msa->alen+1, 0); /* define ss_cons */ seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ for(i = 1; i <= msa->alen; i++) { for(j = 
i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j); } if(cons_ct[j] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j); } cons_ct[i] = j; cons_ct[j] = i; } } } } /* -r: redefine consensus struct by removing any bps that conflict with individual structures */ if(do_remove_bps) { for(i = 1; i <= msa->alen; i++) { if(!(removebp[i])) { cons_ct[i] = xcons_ct[i]; cons_ct[cons_ct[i]] = i; } else { printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]); cons_ct[xcons_ct[i]] = 0; cons_ct[i] = 0; } } } /* -c: define consensus structure as indi sequence with highest number of consistent bps with structure OR */ /* --indi: define consensus structure as indi sequence <x> from --indi <x> */ if(do_consistent || do_indi2cons) { if(do_indi2cons) { indi = esl_opt_GetString(go, "--indi"); for(a = 0; a < msa->nseq; a++) { if(strcmp(indi, msa->sqname[a]) == 0) break; } if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi); } else { /* do_consistent */ a = max_nconsistent_aidx; } if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]); if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { esl_fatal("Second pass... 
SS annotation for sequence %d, aln %d is invalid.\n", (a), nali); } printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]); if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) { if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; } if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR; printf("# Defined new RF as %s sequence\n", msa->sqname[a]); } } /* write out alignment with new SS_cons */ if(do_newcons) { if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR; if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; } if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR; status = esl_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM)); if (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n"); else if (status != eslOK) esl_fatal("Writing alignment file failed with error %d\n", status); } free(sscons); free(cur_ct); free(cons_ct); free(xcons_ct); for(i = 1; i <= msa->alen; i++) free(bp[i]); free(bp); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); /* Cleanup, normal return */ if(lfp != NULL) fclose(lfp); if(ofp != NULL) { printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o")); fclose(ofp); } esl_msafile_Close(afp); esl_getopts_Destroy(go); return 0; ERROR: if(afp) esl_msafile_Close(afp); if(go) esl_getopts_Destroy(go); if(msa) esl_msa_Destroy(msa); if(lfp) fclose(lfp); if(ofp) fclose(ofp); esl_fatal("ERROR\n"); return 1; }
int main(int argc, char **argv) { ESL_GETOPTS *go; /* application configuration */ int kstatus, tstatus;/* return code from Easel routine */ int fmt; /* expected format of kfile, tfile */ char *kfile, *tfile; /* known, test structure file */ ESL_MSAFILE *kfp, *tfp; /* open kfile, tfile */ ESL_MSA *ka, *ta; /* known, trusted alignment */ int64_t klen, tlen; /* lengths of dealigned seqs */ int i; /* counter over sequences */ int apos; /* counter over alignment columns */ int rfpos; /* counter over consensus (non-gap RF) columns */ int is_rfpos; /* TRUE if current apos is a consensus pos, FALSE if not */ int uapos; /* counter over unaligned residue positions */ int nali; /* number of alignment we're on in each file */ int **kp; /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */ int **tp; /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */ /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert' * after non-gap RF position (x * -1) */ int *km_pos; /* [0..rflen] = x, in known aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ki_pos; /* [0..rflen] = x, in known aln, number of residues inserted after non-gap RF column x */ int *tm_pos; /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ti_pos; /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */ int *cor_tm_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *cor_ti_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */ int *km_seq; /* [0..i..nseq-1] = x, in known aln, number of residues aligned to non-gap RF columns in seq i; */ int *ki_seq; /* [0..i..nseq-1] = x, in known 
aln, number of residues inserted in seq i */ int *tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */ int *ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */ int *cor_tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */ int *cor_ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */ int *seqlen; /* [0..i..nseq-1] = x, unaligned seq i has length x */ ESL_ALPHABET *abc = NULL; /* alphabet for all alignments */ int rflen, t_rflen; /* non-gap RF length (consensus lengths) */ int status; char *namedashes; int ni; int namewidth = 8; /* length of 'seq name' */ int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */ char *mask = NULL; int masklen; ESL_DSQ *ks; ESL_DSQ *ts; FILE *dfp = NULL; /* for --c2dfile */ /* variables needed for -p and related options */ int do_post = FALSE; /* TRUE if -p enabled */ int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */ int p; /* counter over integerized posteriors */ int *ptm = NULL; /* [0..p..10] number of total matches with posterior value p (10="*")*/ int *pti = NULL; /* [0..p..10] number of total inserts with posterior value p */ int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */ int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */ int npostvals = 11; /* number of posterior values 0-9, * */ int ppidx; /* index of PP */ char ppchars[11] = "0123456789*"; int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */ // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti; /* total counts of posteriors */ // int tot_incor_ptm,tot_incor_pti; // SRE: commented out; don't seem to be used; need to 
silence compiler warning char errbuf[eslERRBUFSIZE]; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); exit(EXIT_SUCCESS); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } kfile = esl_opt_GetArg(go, 1); tfile = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the two Stockholm files. ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus); do_post = esl_opt_GetBoolean(go, "-p"); /* read the mask file if --p-mask is enabled */ if(! 
esl_opt_IsDefault(go, "--p-mask")) { if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf); } /* open the c2dfile for output, if nec */ if (esl_opt_IsOn(go, "--c2dfile")) { if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile")); } /*********************************************** * Do alignment comparisons, one seq at a time; * this means looping over all seqs in all alignments. ***********************************************/ nali = 0; while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF) { if ( kstatus != eslOK) esl_msafile_ReadFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus); nali++; if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); /* Sanity check on alignment */ if (ka->nseq != ta->nseq) esl_fatal("trusted, test alignments don't have same seq #\n"); if (ka->rf == NULL) esl_fatal("trusted alignment has no reference annotation\n"); if (ta->rf == NULL) esl_fatal("test alignment has no reference annotation\n"); /* make sure the sequences are all identical */ ESL_ALLOC(seqlen, sizeof(int) * ka->nseq); for(i = 0; i < ka->nseq; i++) { if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2)); memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen); ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2)); memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen); if (tlen != klen) esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in 
the trusted alignment.", (i+1), klen, tlen); if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]); seqlen[i] = tlen; free(ks); free(ts); } /* determine non-gap RF length */ rflen = 0; for(apos = 1; apos <= ka->alen; apos++) { if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++; } t_rflen = 0; for(apos = 1; apos <= ta->alen; apos++) { if((! esl_abc_CIsGap (ta->abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (ta->abc, ta->rf[apos-1]))) t_rflen++; } if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen); /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */ if(do_post) { if(! esl_opt_IsDefault(go, "--p-mask")) { if(masklen != rflen) { esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen); } } if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists"); ESL_ALLOC(ptm, sizeof(int) * npostvals); ESL_ALLOC(pti, sizeof(int) * npostvals); ESL_ALLOC(cor_ptm, sizeof(int) * npostvals); ESL_ALLOC(cor_pti, sizeof(int) * npostvals); esl_vec_ISet(ptm, npostvals, 0); esl_vec_ISet(pti, npostvals, 0); esl_vec_ISet(cor_ptm, npostvals, 0); esl_vec_ISet(cor_pti, npostvals, 0); } /* allocate and initialize our counters */ ESL_ALLOC(kp, sizeof(int *) * ka->nseq); ESL_ALLOC(tp, sizeof(int *) * ta->nseq); for(i = 0; i < ka->nseq; i++) { ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1)); ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1)); esl_vec_ISet(kp[i], seqlen[i]+1, -987654321); esl_vec_ISet(tp[i], seqlen[i]+1, -987654321); } ESL_ALLOC(km_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(tm_pos, sizeof(int) * 
(rflen+1)); ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1)); esl_vec_ISet(km_pos, rflen+1, 0); esl_vec_ISet(ki_pos, rflen+1, 0); esl_vec_ISet(tm_pos, rflen+1, 0); esl_vec_ISet(ti_pos, rflen+1, 0); esl_vec_ISet(cor_tm_pos, rflen+1, 0); esl_vec_ISet(cor_ti_pos, rflen+1, 0); ESL_ALLOC(km_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq); ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq); esl_vec_ISet(km_seq, ka->nseq, 0); esl_vec_ISet(ki_seq, ka->nseq, 0); esl_vec_ISet(tm_seq, ka->nseq, 0); esl_vec_ISet(ti_seq, ka->nseq, 0); esl_vec_ISet(cor_tm_seq, ka->nseq, 0); esl_vec_ISet(cor_ti_seq, ka->nseq, 0); /* determine non-gap RF location of each residue in known alignment */ for(i = 0; i < ka->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ka->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing (ka->abc, ka->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; } if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { uapos++; kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; } else { ki_pos[rfpos]++; ki_seq[i]++; } } } } /* determine non-gap RF location of each residue in predicted alignment */ for(i = 0; i < ta->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ta->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (abc, ta->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; if(do_post) { do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE; } } if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { uapos++; tp[i][uapos] = (is_rfpos) ? 
rfpos : (-1 * rfpos); if(do_post) { if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos); ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]); if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos); } if(is_rfpos) { tm_pos[rfpos]++; tm_seq[i]++; if(do_post_for_this_rfpos) ptm[ppidx]++; } else { ti_pos[rfpos]++; ti_seq[i]++; if(do_post) pti[ppidx]++; } if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */ if(is_rfpos) { cor_tm_seq[i]++; cor_tm_pos[rfpos]++; if(do_post_for_this_rfpos) cor_ptm[ppidx]++; } else { cor_ti_seq[i]++; cor_ti_pos[rfpos]++; if(do_post) cor_pti[ppidx]++; } } } } } if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { /* print per sequence statistics */ /* determine the longest name in msa */ for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni])); ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-'; printf("# %-*s %6s %28s %28s %28s\n", namewidth, "seq name", "len", "match columns", "insert columns", "all columns"); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------"); for(i = 0; i < ta->nseq; i++) { printf(" %-*s %6d %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, ka->sqname[i], seqlen[i], cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. 
: ((float) cor_ti_seq[i] / (float) ki_seq[i]), (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); } cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq); cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq); km = esl_vec_ISum(km_seq, ka->nseq); ki = esl_vec_ISum(ki_seq, ka->nseq); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------"); printf("# %-*s %6s %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, "*all*", "-", cor_tm, km, ((float) cor_tm / (float) km), cor_ti, ki, ((float) cor_ti / (float) ki), (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); free(namedashes); for(i = 0; i < ka->nseq; i++) { free(kp[i]); free(tp[i]); } } else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */ printf("# %5s %20s %20s %20s\n", "rfpos", "match", "insert", "both"); printf("# %5s %20s %20s %20s\n", "-----", "--------------------", "--------------------", "--------------------"); for(rfpos = 0; rfpos <= rflen; rfpos++) { printf(" %5d %4d / %4d (%.3f) %4d / %4d (%.3f) %4d / %4d (%.3f)\n", rfpos, cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. 
: ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); } } else if(do_post) { /* do posterior output */ if(mask == NULL) { printf("# %2s %29s %29s\n", "", " match columns ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } else { printf("# %2s %29s %29s\n", "", " match columns within mask ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0; //tot_ptm = esl_vec_ISum(ptm, npostvals); //tot_pti = esl_vec_ISum(pti, npostvals); //tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals); //tot_cor_pti = esl_vec_ISum(cor_pti, npostvals); //tot_incor_ptm = tot_ptm - tot_cor_ptm; //tot_incor_pti = tot_pti - tot_cor_pti; for(p = (npostvals-1); p >= 0; p--) { cm_cor_ptm += cor_ptm[p]; cm_cor_pti += cor_pti[p]; cm_ptm += ptm[p]; cm_pti += pti[p]; cm_incor_ptm += ptm[p] - cor_ptm[p]; cm_incor_pti += pti[p] - cor_pti[p]; printf(" %2c %8d / %8d (%.5f) %8d / %8d (%.5f)\n", ppchars[p], cor_ptm[p], ptm[p], (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], cor_pti[p], pti[p], (pti[p] == 0) ? 0. 
: (float) cor_pti[p] / (float) pti[p]); } } /* handle --c2dfile */ if (dfp != NULL) { /* match stats, 4 fields, CMYK color values */ for(rfpos = 1; rfpos <= rflen; rfpos++) { if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */ 1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */ 0.); } } fprintf(dfp, "//\n"); /* insert stats, 4 fields, CMYK color values */ rfpos = 0; /* special case, combine insert posn 0 and 1 together */ if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */ 0., 0.); } /* insert stats posn 2..rflen */ for(rfpos = 2; rfpos <= rflen; rfpos++) { if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. 
- ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */ 0., 0.); } } fprintf(dfp, "//\n"); } if(ptm != NULL) free(ptm); if(pti != NULL) free(pti); if(cor_ptm != NULL) free(cor_ptm); if(cor_ptm != NULL) free(cor_pti); free(kp); free(tp); free(km_seq); free(ki_seq); free(tm_seq); free(ti_seq); free(cor_tm_seq); free(cor_ti_seq); free(km_pos); free(ki_pos); free(tm_pos); free(ti_pos); free(cor_tm_pos); free(cor_ti_pos); free(seqlen); esl_msa_Destroy(ka); esl_msa_Destroy(ta); } if(mask != NULL) free(mask); if(dfp != NULL) { fclose(dfp); printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile")); } if(abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msafile_Close(tfp); esl_msafile_Close(kfp); return 0; ERROR: return status; }
/* map_new_msa()
 *
 * Build the three arrays needed to lay model consensus nodes <1..M>
 * onto the columns <1..alen> of a new MSA:
 *   <inscount[0..M]>, <matuse[1..M]>, and <matmap[1..M]>.
 *
 * Match states are aligned in columns, but individual sequences may
 * carry inserted residues, so we first need to know, over all traces,
 * where the inserts are and how long they get.
 *
 * inscount[k] (k=0..M) is the maximum number of times insert substate k
 * was used by any trace; i.e. the number of insert columns to place
 * between consensus columns k and k+1. inscount[0] is the N-terminal
 * tail and inscount[M] is the C-terminal tail.
 *
 * matuse[k] (k=1..M) says whether consensus position k gets an alignment
 * column. By default that is <TRUE> only when at least one trace uses
 * match state k; when the <p7_ALL_CONSENSUS_COLS> flag is set, all of
 * matuse[1..M] start out <TRUE>. matuse[0] is unused and set to 0.
 *
 * matmap[k] (k=1..M) is then derived from those two arrays, along with
 * alen. If match state k has a column, matmap[k] is that column,
 * <1..alen>. If it does not (<matuse[k] == FALSE>), matmap[k] is the
 * running column count at that point (the last column assigned so far,
 * including insert columns), which keeps later apos bookkeeping simple.
 * Consequently a nonzero matmap[k] does NOT
 * by itself mean node k is in the alignment: check matuse[k] first.
 * matmap[0] is unused and set to 0 by convention.
 *
 * N and C emit on transition, which is why an N or C state only counts
 * as an insert when the preceding trace state is the same N or C.
 *
 * On success returns <eslOK> and hands the three arrays (caller frees)
 * and the alignment length back through the <ret_*> arguments. On
 * allocation failure, frees anything allocated, sets the array outputs
 * to NULL and <*ret_alen> to 0, and returns the failure status.
 */
static int
map_new_msa(P7_TRACE **tr, int nseq, int M, int optflags, int **ret_inscount, int **ret_matuse, int **ret_matmap, int *ret_alen)
{
  int *ins_max  = NULL;   /* [0..M]: max # of inserts seen after node k         */
  int *col_used = NULL;   /* [1..M]: TRUE if node k gets an alignment column    */
  int *col_of   = NULL;   /* [1..M]: alignment column assigned to node k        */
  int  s;                 /* index over traces                                  */
  int  pos;               /* index over positions in the current trace          */
  int  node;              /* most recent model node seen / node counter         */
  int  run;               /* inserted residues accumulated since last M/D/B     */
  int  ncols;             /* running alignment length                           */
  int  status;

  ESL_ALLOC(ins_max,  sizeof(int) * (M+1));
  ESL_ALLOC(col_used, sizeof(int) * (M+1));
  col_used[0] = 0;
  ESL_ALLOC(col_of,   sizeof(int) * (M+1));
  col_of[0] = 0;

  esl_vec_ISet(ins_max, M+1, 0);
  esl_vec_ISet(col_used+1, M, (optflags & p7_ALL_CONSENSUS_COLS) ? TRUE : FALSE);

  for (s = 0; s < nseq; s++)
    {
      P7_TRACE *t = tr[s];

      run  = 0;
      node = 0;
      for (pos = 1; pos < t->N; pos++)
        {
          switch (t->st[pos]) {
          case p7T_I:
            run++;
            break;

          case p7T_N:
          case p7T_C:
            /* emit-on-transition: only N->N or C->C carries a residue */
            if (t->st[pos-1] == t->st[pos]) run++;
            break;

          case p7T_M:
          case p7T_D:
            /* Record the insert run that ended at node k-1, then reset.
             * Read k from the trace rather than incrementing: this may be
             * a B->X->Mk fragment entry. D is handled like M (so I->D
             * transitions are tolerated) except that only M marks a column.
             */
            node = t->k[pos];
            ins_max[node-1] = ESL_MAX(run, ins_max[node-1]);
            if (t->st[pos] == p7T_M) col_used[node] = TRUE;
            run = 0;
            break;

          case p7T_T:
            /* profile trace: record the C-tail maximum */
            ins_max[M] = ESL_MAX(run, ins_max[M]);
            break;

          case p7T_E:
            /* core traces have an I_M state; use the last node seen rather
             * than M because of {DMI}k->X->E fragment exits
             */
            ins_max[node] = ESL_MAX(run, ins_max[node]);
            break;

          case p7T_B:
            /* N-tail maximum for a profile trace; I0 for a core trace */
            ins_max[0] = ESL_MAX(run, ins_max[0]);
            run = 0;
            break;

          case p7T_S:
          case p7T_X:
            /* nothing to record on S, X */
            break;

          case p7T_J:
            p7_Die("J state unsupported");
            /* no break here, as in the original layout */
          default:
            p7_Die("Unrecognized statetype %d", t->st[pos]);
          }
        }
    }

  /* Trimming N and C off means no tail columns at either end. */
  if (optflags & p7_TRIM)
    {
      ins_max[0] = 0;
      ins_max[M] = 0;
    }

  /* Walk the nodes, handing out columns: one for each used match state,
   * followed by that node's insert columns.
   */
  ncols = ins_max[0];
  for (node = 1; node <= M; node++)
    {
      if (col_used[node]) ncols++;
      col_of[node] = ncols;
      ncols += ins_max[node];
    }

  *ret_inscount = ins_max;
  *ret_matuse   = col_used;
  *ret_matmap   = col_of;
  *ret_alen     = ncols;
  return eslOK;

 ERROR:
  free(ins_max);
  free(col_used);
  free(col_of);
  *ret_inscount = NULL;
  *ret_matuse   = NULL;
  *ret_matmap   = NULL;
  *ret_alen     = 0;
  return status;
}
/* Function:  esl_msaweight_PB()
 * Synopsis:  PB (position-based) weights.
 * Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
 *
 * Purpose:   Given a multiple alignment <msa>, calculate sequence
 *            weights according to the position-based weighting
 *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
 *            1994). These weights are stored internally in the <msa>
 *            object, replacing any weights that may have already been
 *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the algorithm
 *            deals with degenerate residue symbols properly.
 *
 *            The Henikoffs' algorithm does not give rules for dealing
 *            with gaps or degenerate residue symbols. The rule here
 *            is to ignore them. This means that longer sequences
 *            initially get more weight; hence a "double
 *            normalization" in which the weights are first divided by
 *            sequence length in canonical residues (to compensate for
 *            that effect), then normalized to sum to nseq.
 *
 *            An advantage of the PB method is efficiency.
 *            It is $O(1)$ in memory and $O(NL)$ time, for an alignment of
 *            N sequences and L columns. This makes it a good method
 *            for ad hoc weighting of very deep alignments.
 *
 *            When the alignment is in simple text mode, IUPAC
 *            degenerate symbols are not dealt with correctly; instead,
 *            the algorithm simply uses the 26 letters as "residues"
 *            (case-insensitively), and treats all other residues as
 *            gaps.
 *
 *            A single-sequence alignment is a special case: its one
 *            weight is set to 1.0 and the function returns at once.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEMEM> on allocation error, in which case <msa> is
 *            returned unmodified.
 *
 * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
 */
int
esl_msaweight_PB(ESL_MSA *msa)
{
  int    *nres = NULL;   /* counts of each residue observed in a column */
  int     ntotal;        /* number of different symbols observed in a column */
  int     rlen;          /* number of residues in a sequence */
  int     idx, pos, i;
  int     c;             /* text-mode symbol, as an unsigned char value */
  int     K;             /* alphabet size */
  int     status;

  /* Contract checks */
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* Initialize */
  if (! (msa->flags & eslMSA_DIGITAL))
    { ESL_ALLOC(nres, sizeof(int) * 26);          K = 26;          }
#ifdef eslAUGMENT_ALPHABET
  else
    { ESL_ALLOC(nres, sizeof(int) * msa->abc->K); K = msa->abc->K; }
#endif

  esl_vec_DSet(msa->wgt, msa->nseq, 0.);

  /* This section handles text alignments */
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      for (pos = 0; pos < msa->alen; pos++)
        {
          /* Collect # of letters A..Z in this column, and total.
           * <ctype.h> functions require an unsigned char value (or EOF),
           * so go through unsigned char rather than a possibly negative char.
           */
          esl_vec_ISet(nres, K, 0);
          for (idx = 0; idx < msa->nseq; idx++)
            {
              c = (unsigned char) msa->aseq[idx][pos];
              if (isalpha(c)) nres[toupper(c) - 'A']++;
            }
          for (ntotal = 0, i = 0; i < K; i++)
            if (nres[i] > 0) ntotal++;

          /* Bump weight on each seq by PB rule */
          if (ntotal > 0)
            {
              for (idx = 0; idx < msa->nseq; idx++)
                {
                  c = (unsigned char) msa->aseq[idx][pos];
                  if (isalpha(c))
                    msa->wgt[idx] += 1. / (double) (ntotal * nres[toupper(c) - 'A']);
                }
            }
        }

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
        {
          for (rlen = 0, pos = 0; pos < msa->alen; pos++)
            if (isalpha((unsigned char) msa->aseq[idx][pos])) rlen++;
          /* Guard on this sequence's own residue count. If rlen == 0 for
           * this seq, its weight is still 0.0, as initialized.
           */
          if (rlen > 0) msa->wgt[idx] /= (double) rlen;
        }
    }

  /* This section handles digital alignments. */
#ifdef eslAUGMENT_ALPHABET
  else
    {
      for (pos = 1; pos <= msa->alen; pos++)
        {
          /* Collect # of residues 0..K-1 in this column, and total # */
          esl_vec_ISet(nres, K, 0);
          for (idx = 0; idx < msa->nseq; idx++)
            if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
              nres[(int) msa->ax[idx][pos]]++;
          for (ntotal = 0, i = 0; i < K; i++)
            if (nres[i] > 0) ntotal++;

          /* Bump weight on each sequence by PB rule */
          if (ntotal > 0)
            {
              for (idx = 0; idx < msa->nseq; idx++)
                {
                  if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
                    msa->wgt[idx] += 1. / (double) (ntotal * nres[msa->ax[idx][pos]]);
                }
            }
        }

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
        {
          for (rlen = 0, pos = 1; pos <= msa->alen; pos++)
            if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos])) rlen++;
          /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
          if (rlen > 0) msa->wgt[idx] /= (double) rlen;
        }
    }
#endif

  /* Make weights normalize up to nseq, and return.  In pathological
   * case where all wgts were 0 (no seqs contain any unambiguous
   * residues), weights become 1.0.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(nres);
  return eslOK;

 ERROR:
  if (nres != NULL) free(nres);
  return status;
}
/* annotate_posterior_probability()
 * Synopsis:  Add posterior probability annotation lines to new MSA.
 *
 * For each sequence whose trace carries posterior probabilities
 * (<tr[idx]->pp != NULL>), allocates <msa->pp[idx]> as an <alen>-long
 * string initialized to '.', and fills in one encoded PP character per
 * emitted residue at the alignment column it occupies. Sequences whose
 * trace has no PP get <msa->pp[idx] = NULL>. A consensus line
 * <msa->pp_cons> is also built: for each column that some match state
 * mapped to, the encoded mean of the match-state PPs in that column;
 * '.' elsewhere.
 *
 * <matmap[1..M]> maps model nodes to alignment columns (1-based);
 * <optflags> is consulted for <p7_TRIM>, in which case N/C tail residues
 * and inserts at node 0 or M are not annotated.
 *
 * If no trace has PP annotation, nothing is allocated.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    the ESL_ALLOC() failure status on allocation error; in that
 *            case any per-sequence PP lines built so far are freed and
 *            <msa->pp> is reset to NULL.
 */
static int
annotate_posterior_probability(ESL_MSA *msa, P7_TRACE **tr, const int *matmap, int M, int optflags)
{
  double *totp   = NULL;   /* total posterior probability in column <apos>: [0..alen-1] */
  int    *matuse = NULL;   /* #seqs with pp annotation in column <apos>: [0..alen-1] */
  int     idx;             /* counter over sequences [0..nseq-1] */
  int     apos;            /* counter for alignment columns: pp's are [0..alen-1] (unlike ax) */
  int     z;               /* counter over trace positions [0..tr->N-1] */
  int     status;

  /* Determine if any of the traces have posterior probability annotation. */
  for (idx = 0; idx < msa->nseq; idx++)
    if (tr[idx]->pp != NULL) break;
  if (idx == msa->nseq) return eslOK;

  /* matuse holds ints, so size it by int */
  ESL_ALLOC(matuse, sizeof(int)    * (msa->alen)); esl_vec_ISet(matuse, msa->alen, 0);
  ESL_ALLOC(totp,   sizeof(double) * (msa->alen)); esl_vec_DSet(totp,   msa->alen, 0.0);

  ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc);
  /* NULL every slot up front (following easel MSA conventions), so the
   * ERROR path never hands an uninitialized pointer to esl_Free2D() if a
   * later per-sequence allocation fails.
   */
  for (idx = 0; idx < msa->sqalloc; idx++) msa->pp[idx] = NULL;

  for (idx = 0; idx < msa->nseq; idx++)
    {
      if (tr[idx]->pp == NULL) continue;   /* slot stays NULL */

      ESL_ALLOC(msa->pp[idx], sizeof(char) * (msa->alen+1));
      for (apos = 0; apos < msa->alen; apos++) msa->pp[idx][apos] = '.';
      msa->pp[idx][msa->alen] = '\0';

      apos = 0;
      for (z = 0; z < tr[idx]->N; z++)
        {
          switch (tr[idx]->st[z]) {
          case p7T_M:
            msa->pp[idx][matmap[tr[idx]->k[z]]-1] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
            totp  [matmap[tr[idx]->k[z]]-1] += tr[idx]->pp[z];
            matuse[matmap[tr[idx]->k[z]]-1]++;
            /* fallthrough: M also advances apos, exactly like D */
          case p7T_D:
            apos = matmap[tr[idx]->k[z]];
            break;

          case p7T_I:
            if ( !(optflags & p7_TRIM) || (tr[idx]->k[z] != 0 && tr[idx]->k[z] != M))
              {
                msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
                apos++;
              }
            break;

          case p7T_N:
          case p7T_C:
            /* only N/C positions that emitted a residue (i > 0) are annotated */
            if (! (optflags & p7_TRIM) && tr[idx]->i[z] > 0)
              {
                msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
                apos++;
              }
            break;

          case p7T_E:
            apos = matmap[M];   /* set position for C-terminal tail */
            break;

          default:
            break;
          }
        }
    }

  /* Consensus posterior probability annotation: only on match columns */
  ESL_ALLOC(msa->pp_cons, sizeof(char) * (msa->alen+1));
  for (apos = 0; apos < msa->alen; apos++) msa->pp_cons[apos] = '.';
  msa->pp_cons[msa->alen] = '\0';
  for (apos = 0; apos < msa->alen; apos++)
    if (matuse[apos]) msa->pp_cons[apos] = p7_alidisplay_EncodePostProb( totp[apos] / (double) matuse[apos]);

  free(matuse);
  free(totp);
  return eslOK;

 ERROR:
  if (matuse != NULL) free(matuse);
  if (totp   != NULL) free(totp);
  if (msa->pp != NULL)
    {
      esl_Free2D((void **) msa->pp, msa->sqalloc);
      msa->pp = NULL;   /* don't leave a dangling pointer in the MSA */
    }
  return status;
}
/* map_msas * * Align msa1 and msa2. * For each column in msa1, determine the corresponding column * in msa2. This implementation requires: * - msa1 and msa2 contain exactly the same sequences in the same order * Note: the seqs in msa1 and msa2 do not have to have the same names. * * Uses a DP algorithm similar to Needleman-Wunsch, but that's aligning * two alignment columns at a time instead of two residues. */ static int map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map) { int status; int **one2two; /* [0..c..rflen1][0..a..alen2] number of residues from non-gap RF column c of msa1 * aligned in column a of msa 2 */ int *rf2a_map1 = NULL; /* msa1 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa1->rf == NULL */ int *rf2a_map2 = NULL; /* msa2 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa2->rf == NULL */ int *a2rf_map1 = NULL; /* msa1 map of alignment columns to reference columns, NULL if msa1->rf == NULL */ int *a2rf_map2 = NULL; /* msa2 map of alignment columns to reference columns, NULL if msa2->rf == NULL */ int apos1, apos2; /* counters over alignment position in msa1, msa2 respectively */ int alen1, alen2; /* alignment lengths */ int rfpos1, rfpos2; /* counters over reference positions */ int rflen1, rflen2; /* reference (non-gap RF) lengths */ int **mx; /* [0..c..rflen1][0..a..alen2] dp matrix, score of max scoring aln * from 1..c in msa1 and 1..a in msa 2 */ int **tb; /* [0..c..rflen1][0..a..alen2] traceback ptrs, 0 for diagonal, 1 for vertical */ char *seq1, *seq2; /* temporary strings for ensuring dealigned sequences in msa1 and msa2 are identical */ int64_t len1, len2; /* length of seq1, seq2 */ int isgap1, isgap2; /* is this residue a gap in msa1, msa2? 
*/ int i; /* counter over sequences */ int *res1_per_apos; /* [0..apos..alen1] number of residues in column apos of msa1 */ int sc; /* max score of full path (alignment) through dp mx */ int tb_sc; /* score of traceback, should equal sc */ int *one2two_map; /* [0..a..alen1] the alignment, msa2 column that column apos1 in msa1 maps to */ int total_res = 0; /* total number of residues in msa1 */ float coverage; /* fraction of total_res that are within mapped msa2 columns from one2two_map, * this is tb_sc / total_res */ int total_cres1=0; /* total number of residues in reference positions in msa1 */ int covered_cres1 = 0; /* number of residues in reference positions in msa1 that also appear in the corresponding * mapped column of msa2 */ int be_quiet = esl_opt_GetBoolean(go, "-q"); int *choices; int i_choice; /* contract check */ if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1)); if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2)); alen1 = msa1->alen; alen2 = msa2->alen; /* Map msa1 (reference) columns to alignment positions */ rflen1 = rflen2 = 0; if(msa1->rf != NULL) if((status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) goto ERROR; if(msa2->rf != NULL) if((status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) goto ERROR; if(! be_quiet) { printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 1), alen1); printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 2), alen2); } /* collect counts in one2two[i][j]: number of sequences for which residue aligned in msa1 non-gap column i * is aligned in msa2 alignment column j. 
*/ ESL_ALLOC(seq1, sizeof(char) * (alen1+1)); ESL_ALLOC(seq2, sizeof(char) * (alen2+1)); ESL_ALLOC(one2two, sizeof(int *) * (alen1+1)); for(apos1 = 0; apos1 <= alen1; apos1++) { ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1)); esl_vec_ISet(one2two[apos1], (alen2+1), 0); } total_res = 0; for(i = 0; i < msa1->nseq; i++) { /* ensure raw (unaligned) seq i in the 2 msas is the same */ esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1); esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */ esl_strdealign(seq1, seq1, "-_.~", &len1); esl_strdealign(seq2, seq2, "-_.~", &len2); if(len1 != len2) { ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n", i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2); } if(strncmp(seq1, seq2, len1) != 0) ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2)); total_res += len1; apos1 = apos2 = 1; while((apos1 <= alen1) || (apos2 <= alen2)) { isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]); isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]); if ( isgap1 && isgap2) { apos1++; apos2++; } else if ( isgap1 && !isgap2) { apos1++; } else if (!isgap1 && isgap2) { apos2++; } else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) { one2two[apos1++][apos2++]++; /* two2one[apos2][apos1]++; */ } } } /****************************************************************** * DP alignment of msa1 to msa2 * dp matrix: mx[apos1][apos2] apos1=1..msa->alen1, apos2=1..alen2 (apos1=0 || apos2=0 is invalid) * mx[apos1][apos2] = score of maximal alignment for apos1=1..apos1, apos2'=1..apos2 INCLUDING * apos1 and apos2. 
Score is number of residues from msa1 columns * 1..apos1 that exist in their respective aligned columns in msa2 (the growing * maximally scoring alignment). */ /****************************************************************** * initialization */ ESL_ALLOC(mx, sizeof(int *) * (alen1+1)); ESL_ALLOC(tb, sizeof(int *) * (alen1+1)); for(apos1 = 0; apos1 <= alen1; apos1++) { ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1)); ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1)); esl_vec_ISet(mx[apos1], (alen2+1), 0); esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */ tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==1 */ } esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==1 */ tb[0][0] = -2; /* all alignments must end here */ ESL_ALLOC(res1_per_apos, sizeof(int) * (alen1+1)); esl_vec_ISet(res1_per_apos, (alen1+1), 0); mx[0][0] = 0; tb[0][0] = -1; /* last cell, special value */ /***************************************************************** * recursion */ ESL_ALLOC(choices, sizeof(int) * NCHOICES); for(apos1 = 1; apos1 <= alen1; apos1++) { for(apos2 = 1; apos2 <= alen2; apos2++) { choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2]; choices[VERT] = mx[ apos1 ][(apos2-1)]; choices[HORZ] = mx[(apos1-1)][ apos2 ]; i_choice = esl_vec_IArgMax(choices, NCHOICES); mx[apos1][apos2] = choices[i_choice]; tb[apos1][apos2] = i_choice; res1_per_apos[apos1] += one2two[apos1][apos2]; /*printf("mx[%3d][%3d]: %5d (%d)\n", apos1, apos2, mx[apos1][apos2], tb[apos1][apos2]);*/ } } free(choices); total_cres1 = 0; if(rf2a_map1 != NULL) { for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]]; } /***************************************************************** * traceback */ sc = mx[alen1][alen2]; if(!be_quiet) { /* printf("score %d\n", sc);*/ 
if(a2rf_map1 != NULL && a2rf_map2 != NULL) { printf("# %12s %12s %22s\n", " msa 1 ", " msa 2 ", ""); printf("# %12s %12s %22s\n", "------------", "------------", ""); printf("# %5s %5s %5s %5s %22s\n", "rfpos", "apos", "rfpos", "apos", " num common residues"); printf("# %5s %5s %5s %5s %22s\n", "-----", "-----", "-----", "-----", "---------------------"); } else if(a2rf_map1 != NULL) { printf("# %12s %5s %22s\n", " msa 1 ", "msa 2", ""); printf("# %12s %5s %22s\n", "------------", "-----", ""); printf("# %5s %5s %5s %22s\n", "rfpos", "apos", "apos", " num common residues"); printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------"); } else if (a2rf_map2 != NULL) { printf("# %5s %12s %22s\n", "msa 1", " msa 2 ", ""); printf("# %5s %12s %22s\n", "-----", "------------", ""); printf("# %5s %5s %5s %22s\n", "apos", "rfpos", "apos", " num common residues"); printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------"); } else { printf("# %5s %5s %22s\n", "msa 1", "msa 2", ""); printf("# %5s %5s %22s\n", "-----", "-----", ""); printf("# %5s %5s %22s\n", "apos", "apos", " num common residues"); printf("# %5s %5s %22s\n", "-----", "-----", "---------------------"); } } /* traceback, and build one2two_map[] */ apos1 = alen1; apos2 = alen2; tb_sc = 0; covered_cres1 = 0; ESL_ALLOC(one2two_map, sizeof(int) * (alen1+1)); esl_vec_ISet(one2two_map, (alen1+1), 0); one2two_map[0] = -1; /* invalid */ while(tb[apos1][apos2] != -1) { if(tb[apos1][apos2] == DIAG) { /* diagonal move */ rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1]; rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2]; if(!be_quiet) { if(a2rf_map1 != NULL && a2rf_map2 != NULL) { if(rfpos1 == -1 && rfpos2 == -1) { printf(" %5s %5d --> %5s %5d %5d / %5d (%.4f)\n", "-", apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 
0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } else if (rfpos1 == -1) { printf(" %5s %5d --> %5d %5d %5d / %5d (%.4f)\n", "-", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } else if (rfpos2 == -1) { printf(" %5d %5d --> %5s %5d %5d / %5d (%.4f)\n", rfpos1, apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } else { printf(" %5d %5d --> %5d %5d %5d / %5d (%.4f)\n", rfpos1, apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } } else if(a2rf_map1 != NULL) { if (rfpos1 == -1) { printf(" %5s %5d --> %5d %5d / %5d (%.4f)\n", "-", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } else { printf(" %5d %5d --> %5d %5d / %5d (%.4f)\n", rfpos1, apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } } else if (a2rf_map2 != NULL) { if (rfpos2 == -1) { printf(" %5d --> %5s %5d %5d / %5d (%.4f)\n", apos1, "-", apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } else { printf(" %5d --> %5d %5d %5d / %5d (%.4f)\n", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } } else { printf(" %5d --> %5d %5d / %5d (%.4f)\n", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 
0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); } } tb_sc += one2two[apos1][apos2]; one2two_map[apos1] = apos2; if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */ apos1--; apos2--; } else if(tb[apos1][apos2] == VERT) { apos2--; /* vertical move */ } else if(tb[apos1][apos2] == HORZ) { apos1--; /* horizontal move */ } else if(tb[apos1][apos2] != -1) /* shouldn't happen */ ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]); } /* done DP code **********************************/ if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc); if(tb_sc != sc) ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc); coverage = (float) tb_sc / (float) total_res; printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2)); if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2)); /* print masks if nec */ if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) return status; /* clean up and return */ for(apos1 = 0; apos1 <= alen1; apos1++) { free(mx[apos1]); free(tb[apos1]); } free(mx); free(tb); for(apos1 = 0; apos1 <= alen1; apos1++) free(one2two[apos1]); free(one2two); free(res1_per_apos); if(rf2a_map1 != NULL) free(rf2a_map1); if(rf2a_map2 != NULL) free(rf2a_map2); if(a2rf_map1 != NULL) free(a2rf_map1); if(a2rf_map2 != NULL) free(a2rf_map2); free(seq1); free(seq2); *ret_msa1_to_msa2_map = one2two_map; return eslOK; ERROR: return status; }
/* map_alignment
 *
 * Read the alignment in <msafile> (the alignment the HMM is expected to
 * have been built from, as checked by the MSA checksum) and convert it
 * to one sequence and one 'faux' trace per aligned sequence.
 *
 * Any problem with the file or the HMM (open/read failure, HMM without
 * checksum or map, checksum mismatch, trace or sequence extraction
 * failure) is reported through esl_fatal() or the msafile failure
 * routines.
 *
 * Returns: eslOK on success; <*ret_sq> and <*ret_tr> are newly allocated
 *          arrays of <*ret_ntot> elements.
 *          On an allocation failure, returns the status set by
 *          ESL_ALLOC with <*ret_ntot> = 0 and <*ret_sq>, <*ret_tr> NULL.
 */
static int
map_alignment(const char *msafile, const P7_HMM *hmm, ESL_SQ ***ret_sq, P7_TRACE ***ret_tr, int *ret_ntot)
{
  ESL_SQ       **sq        = NULL;
  P7_TRACE     **tr        = NULL;
  ESLX_MSAFILE  *afp       = NULL;
  ESL_MSA       *msa       = NULL;
  ESL_ALPHABET  *abc       = (ESL_ALPHABET *) hmm->abc; /* removing const'ness to make compiler happy. Safe. */
  int           *matassign = NULL;
  uint32_t       chksum    = 0;
  int            i,k;
  int            status;

  status = eslx_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  status = eslx_msafile_Read(afp, &msa);
  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

  if (! (hmm->flags & p7H_CHKSUM) ) esl_fatal("HMM has no checksum. --mapali unreliable without it.");
  if (! (hmm->flags & p7H_MAP) )    esl_fatal("HMM has no map. --mapali can't work without it.");
  esl_msa_Checksum(msa, &chksum);
  if (hmm->checksum != chksum)      esl_fatal("--mapali MSA %s isn't same as the one HMM came from (checksum mismatch)", msafile);

  ESL_ALLOC(sq,        sizeof(ESL_SQ *)   * msa->nseq);
  ESL_ALLOC(tr,        sizeof(P7_TRACE *) * msa->nseq);
  ESL_ALLOC(matassign, sizeof(int)        * (msa->alen + 1));

  /* mark the alignment columns that the HMM's match states map to */
  esl_vec_ISet(matassign, msa->alen+1, 0);
  for (k = 1; k <= hmm->M; k++) matassign[hmm->map[k]] = 1;

  /* The 'faux' core traces constructed by FauxFromMSA() may contain
   * D->I and I->D transitions.  They may *only* now be passed to
   * p7_tracealign_Seqs(), which can deal with these 'illegal'
   * transitions, in order to exactly reproduce the input --mapali
   * alignment.
   */
  if (p7_trace_FauxFromMSA(msa, matassign, p7_DEFAULT, tr) != eslOK)
    esl_fatal("--mapali: failed to convert MSA %s to traces", msafile);

  /* sq[] comes from malloc, so a failed fetch would leave a garbage pointer in it */
  for (i = 0; i < msa->nseq; i++)
    if (esl_sq_FetchFromMSA(msa, i, &(sq[i])) != eslOK)
      esl_fatal("--mapali: failed to extract sequence %d from MSA %s", i+1, msafile);

  *ret_ntot = msa->nseq;
  *ret_tr   = tr;
  *ret_sq   = sq;

  eslx_msafile_Close(afp);
  esl_msa_Destroy(msa);
  free(matassign);
  return eslOK;

 ERROR:
  /* Only reached from a failed ESL_ALLOC above, i.e. before any element of
   * sq[] or tr[] has been created, so freeing the bare arrays is enough.
   */
  *ret_ntot = 0;
  *ret_tr   = NULL;
  *ret_sq   = NULL;
  if (afp != NULL) eslx_msafile_Close(afp);
  if (msa != NULL) esl_msa_Destroy(msa);
  free(matassign);
  free(sq);
  free(tr);
  return status;
}
/* dump_posterior_sequence_info
 *
 * Dump per-sequence posterior probability data to a file.
 *
 * Writes a short header to <fp>, then one row for each sequence that has
 * posterior probability (PP) annotation: the sequence index (1-based),
 * its name, the number of non-gap PP characters, a count for each of the
 * 11 PP classes "0123456789*", and the average PP. The average weights
 * each class by a representative probability (0.025 for '0', 0.10 .. 0.90
 * for '1'..'9', 0.975 for '*'); it is reported as 0 for a sequence with
 * no non-gap PP characters. Sequences without PP annotation are skipped,
 * and an alignment without any PP annotation produces no rows. The
 * output ends with a "//" line.
 *
 * Returns: eslOK on success;
 *          eslEFORMAT, with a message in <errbuf>, if a PP character is
 *          not recognized by get_pp_idx().
 */
static int
dump_posterior_sequence_info(FILE *fp, ESL_MSA *msa, int nali, char *alifile, char *errbuf)
{
  int    i,p,apos;                    /* counters over sequences, PP classes, columns of MSA */
  int    ppidx;                       /* index of one PP character within ppstring */
  int    nppvals = 12;                /* number of PP classes, including the gap class */
  int    nnongap;                     /* number of non-gap PP characters in one sequence */
  double sum;                         /* summed representative probability of one sequence */
  float  ppavgA[11];                  /* representative probability of each non-gap PP class */
  char   ppstring[12] = "0123456789*."; /* PP characters; exactly 12 chars, deliberately no room for a NUL */
  int    seq_pp_ct[12];               /* per-class counts for one sequence */

  ppavgA[0]  = 0.025;
  ppavgA[1]  = 0.10;
  ppavgA[2]  = 0.20;
  ppavgA[3]  = 0.30;
  ppavgA[4]  = 0.40;
  ppavgA[5]  = 0.50;
  ppavgA[6]  = 0.60;
  ppavgA[7]  = 0.70;
  ppavgA[8]  = 0.80;
  ppavgA[9]  = 0.90;
  ppavgA[10] = 0.975;

  fprintf(fp, "# Posterior probability stats per sequence:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if (msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);

  fprintf(fp, "# %7s %-40s %7s", "seqidx", "seqname", "nnongap");
  for (p = 0; p < nppvals-1; p++) { /* don't include gaps in per-sequence output */
    fprintf(fp, " %7c", ppstring[p]);
  }
  fprintf(fp, " %7s\n", "avgPP");

  fprintf(fp, "# %7s %40s %7s", "-------", "----------------------------------------", "-------");
  for (p = 0; p < nppvals-1; p++) { /* don't include gaps in per-sequence output */
    fprintf(fp, " %7s", "-------");
  }
  fprintf(fp, " %7s\n", "-------");

  for (i = 0; i < msa->nseq; i++) {
    /* msa->pp itself is NULL when the alignment carries no PP annotation at all */
    if (msa->pp == NULL || msa->pp[i] == NULL) continue;

    fprintf(fp, " %7d %-40s", i+1, msa->sqname[i]);
    sum = 0.;
    esl_vec_ISet(seq_pp_ct, nppvals, 0);
    for (apos = 0; apos < msa->alen; apos++) {
      ppidx = get_pp_idx(msa->abc, msa->pp[i][apos]);
      /* also reject anything that would index outside seq_pp_ct[] */
      if (ppidx < 0 || ppidx >= nppvals) ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]);
      seq_pp_ct[ppidx]++;
    }
    nnongap = esl_vec_ISum(seq_pp_ct, 11);
    fprintf(fp, " %7d", nnongap);
    for (p = 0; p < nppvals-1; p++) { /* don't include gaps in per-sequence output */
      fprintf(fp, " %7d", seq_pp_ct[p]);
      sum += (float) seq_pp_ct[p] * ppavgA[p];
    }
    /* a sequence made only of gaps has no average; report 0 rather than 0/0 */
    fprintf(fp, " %.5f\n", (nnongap > 0) ? (sum / (float) nnongap) : 0.);
  }
  fprintf(fp, "//\n");

  return eslOK;
}
/* Function: AllocCPlan9Body()
 *
 * Purpose:  Allocate the model-size dependent arrays of a CP9 HMM with
 *           <M> nodes over alphabet <abc>, and store <abc> and <M> in
 *           <hmm>. The 2D arrays t, mat, ins, tsc, msc and isc are each
 *           backed by one contiguous block, with row pointers set up
 *           into it.
 *
 *           Values initialized here: tsc[x][0] for every transition x,
 *           begin[0], end[0], bsc[0] and esc[0] (all -INFTY); has_el[]
 *           (FALSE); el_from_ct[] (0); el_from_idx[] and el_from_cmnd[]
 *           (NULL). All other array contents are left uninitialized.
 *
 * Return:   (void). Calls cm_Fail() if an allocation fails.
 */
void
AllocCPlan9Body(CP9_t *hmm, int M, const ESL_ALPHABET *abc)
{
  int status;
  int k, x;

  hmm->abc = abc;
  hmm->M   = M;

  /* probability parameters: row pointers, then one block per array */
  ESL_ALLOC(hmm->t,   (M+1) * sizeof(float *));
  ESL_ALLOC(hmm->mat, (M+1) * sizeof(float *));
  ESL_ALLOC(hmm->ins, (M+1) * sizeof(float *));
  ESL_ALLOC(hmm->t[0],  (cp9_NTRANS*(M+1)) * sizeof(float));
  ESL_ALLOC(hmm->mat[0],(abc->K*(M+1))     * sizeof(float));
  ESL_ALLOC(hmm->ins[0],(abc->K*(M+1))     * sizeof(float));

  /* integer scores: row pointers, then one block per array */
  ESL_ALLOC(hmm->tsc, cp9_NTRANS   * sizeof(int *));
  ESL_ALLOC(hmm->msc, hmm->abc->Kp * sizeof(int *));
  ESL_ALLOC(hmm->isc, hmm->abc->Kp * sizeof(int *));
  ESL_ALLOC(hmm->tsc_mem,(cp9_NTRANS*(M+1))   * sizeof(int));
  ESL_ALLOC(hmm->msc_mem,(hmm->abc->Kp*(M+1)) * sizeof(int));
  ESL_ALLOC(hmm->isc_mem,(hmm->abc->Kp*(M+1)) * sizeof(int));

  hmm->tsc[0] = hmm->tsc_mem;
  hmm->msc[0] = hmm->msc_mem;
  hmm->isc[0] = hmm->isc_mem;

  /* transition scores reordered */
  ESL_ALLOC(hmm->otsc, sizeof(int) * (M+1) * cp9O_NTRANS);

  /* note allocation strategy for important 2D arrays -- trying
   * to keep locality as much as possible, cache efficiency etc.
   */
  for (k = 1; k <= M; k++) {
    hmm->mat[k] = hmm->mat[0] + k * abc->K;
    hmm->ins[k] = hmm->ins[0] + k * abc->K;
    hmm->t[k]   = hmm->t[0]   + k * cp9_NTRANS;
  }
  for (x = 1; x < hmm->abc->Kp; x++) {
    hmm->msc[x] = hmm->msc[0] + x * (M+1);
    hmm->isc[x] = hmm->isc[0] + x * (M+1);
  }
  for (x = 0; x < cp9_NTRANS; x++)
    hmm->tsc[x] = hmm->tsc[0] + x * (M+1);

  /* tsc[x][0] is used as a boundary condition sometimes [Viterbi()],
   * so set to -inf always.
   */
  for (x = 0; x < cp9_NTRANS; x++)
    hmm->tsc[x][0] = -INFTY;

  ESL_ALLOC(hmm->begin,   (M+1) * sizeof(float));
  ESL_ALLOC(hmm->end,     (M+1) * sizeof(float));
  ESL_ALLOC(hmm->bsc_mem, (M+1) * sizeof(int));
  ESL_ALLOC(hmm->esc_mem, (M+1) * sizeof(int));
  ESL_ALLOC(hmm->null,    (abc->K) * sizeof(float));

  hmm->bsc = hmm->bsc_mem;
  hmm->esc = hmm->esc_mem;

  /* end[0], begin[0], esc[0] and bsc[0] are never used; all four are set
   * to -INFTY here.
   * NOTE(review): an earlier version of this comment said the two
   * probabilities (end[0], begin[0]) are set to 0.; the code has them at
   * -INFTY. Left as is -- confirm which was intended.
   */
  hmm->end[0] = hmm->begin[0] = -INFTY;
  hmm->esc[0] = hmm->bsc[0]   = -INFTY;

  /* EL bookkeeping: has_el is [0..M], the three el_from_* arrays are [0..M+1] */
  ESL_ALLOC(hmm->has_el,      (M+1) * sizeof(int));
  ESL_ALLOC(hmm->el_from_ct,  (M+2) * sizeof(int));
  ESL_ALLOC(hmm->el_from_idx, (M+2) * sizeof(int *));
  ESL_ALLOC(hmm->el_from_cmnd,(M+2) * sizeof(int *));
  esl_vec_ISet(hmm->has_el,     M+1, FALSE);
  esl_vec_ISet(hmm->el_from_ct, M+2, 0);   /* all M+2 entries, so el_from_ct[M+1] is defined too */
  for (k = 0; k <= M+1; k++) {
    hmm->el_from_idx[k]  = NULL;
    hmm->el_from_cmnd[k] = NULL;
  }

  return;

 ERROR:
  cm_Fail("Memory allocation error.");
}
/* Function: cm_tr_penalties_Create() * Date: EPN, Sat Jan 21 12:03:52 2012 * * Purpose: Allocate and initialize a CM_TR_PENALTIES object. * A CM and its emit map are required to determine * truncation penalty scores. This is annoyingly * complex, see verbose notes within code below. * * Some of the code in this function, specifically * that which calculates the probability of a fragment * aligning at a given node, is checkable, but only * if we disallow truncated begins into insert states. * However, we want to allow truncated begins in reality. * I've left in a flag for ignoring inserts (<ignore_inserts>) * I used in testing this function. Set it to TRUE to * perform the test. * * Returns: Newly allocated CM_TR_PENALTIES object. NULL if out * of memory. */ CM_TR_PENALTIES * cm_tr_penalties_Create(CM_t *cm, int ignore_inserts, char *errbuf) { int status; int v, nd, m, i1, i2; int lpos, rpos; int i; /* variables used for determining ratio of inserts to match at each consensus position */ float *mexpocc = NULL; /* [0..c..clen] probability match state is used to emit at cons posn c */ float *iexpocc = NULL; /* [0..c..clen] probability insert state is used to emit after cons posn c */ double *psi = NULL; /* [0..v..M-1] expected occupancy of state v */ float m_psi, i1_psi, i2_psi; /* temp psi values */ float summed_psi; CM_TR_PENALTIES *trp = NULL; /* variables used for calculating global truncation penalties */ float g_5and3; /* fragment probability if 5' and 3' truncation are allowed */ float g_5or3; /* fragment probability if 5' or 3' truncation are allowed */ /* variables used for calculating local truncation penalties */ float *begin = NULL; /* local begin probabilities 0..v..M-1 */ int subtree_clen; /* consensus length of subtree under this node */ float prv53, prv5, prv3; /* previous node's fragment probability, 5'&3', 5' only, 3'only */ float cur53, cur5, cur3; /* current node's fragment probability, 5'&3', 5' only, 3'only */ int nfrag53, nfrag5, nfrag3; /* 
number of fragments, 5'&3', 5' only, 3'only */ if(cm == NULL || cm->emap == NULL) goto ERROR; ESL_ALLOC(trp, sizeof(CM_TR_PENALTIES)); trp->M = cm->M; trp->ignored_inserts = ignore_inserts; /* Define truncation penalties for each state v. This will be * the score for doing a truncated begin into state v. * * Important note: For this discussion we assume that sequences can * only be truncated at consensus positions, which means we don't * have to worry about truncated begins into inserts. This is an * approximation (also made by Diana and Sean in the 2009 trCYK * paper) that greatly simplifies the explanation of the calculation * of the truncation penalties. The examples in my ELN3 notebook * also use this simplification. However, I need to be able to do * truncated begins into insert states in some cases (some pass/mode * combinations see ELN bottom of p.47). I explain first the * rationale for calculating truncation penalties ignoring inserts * and then I describe how I adapt those penalties to allow * for inserts. * * This is a lengthy comment. I've divided it into 3 sections: * Section 1. Global mode truncation penalties, ignoring inserts. * Section 2. Local mode truncation penalties, ignoring inserts. * Section 3. Adapting truncation penalties to allow for inserts. * ************************************************************** * Section 1. Global mode truncation penalties, ignoring inserts. * * We want the truncation penalty to be the log of the probability * that the particular fragment we're aligning was generated from * the following generative process. The generative process differs * between global and local mode. * * In global mode: * o Sample global parsetree which spans consensus positions 1..clen. * o Randomly choose g and h in range 1..clen, where h >= g and * truncate sequence from g..h. The first residue will either be * an insert before position g, or a match at position g of the * model. 
The final residue will either be an insert after position * h or a match at position h of the model. * * All g,h fragments are equiprobable, so the probability of any * particular fragment is 2 / (clen * (clen+1)). So log_2 of this * value is the truncation penalty for all truncated alignments in * global mode where both 5' and 3' truncation are allowed. * * We store this penalty, per-state in the * g_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1]. The penalty is * identical for all emitting states. The penalty value for * non-emitters is IMPOSSIBLE because truncated begins are * not allowed into non-emitters. * * If only 5' OR 3' truncation is allowed, we only truncate at g or * h, which menas there's 1/clen possible fragments and log_2 * (1/clen) is our global truncation penalty. * * However, if 5' truncation is allowed we can only do a truncated * begin into states that with a consensus subtree that spans * position clen (since we don't allow a truncation at the 3' end). * Thus any state whose subtree that doesn't span clen gets * an IMPOSSIBLE value for its truncation score in: * g_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1]. * * Likewise, if 3' truncation is allowed we can only do a truncated * begin into states that with a consensus subtree that spans * position 1 (since we don't allow a truncation at the 5' end). * * There's an example of computing all three types of penalties for * a simple CM in ELN 3 p43. * ************************************************************ * Section 2. Local mode truncation penalties, ignoring inserts. * * Generative process that generates fragments in local mode: * o Sample local begin state b with consensus subtree from i..j from * local begin state distribution. * o Randomly choose g and h in range i..j, where h >= g and * truncate sequence from g..h. The first residue will either be * an insert before position g, or a match at position g of the * model. 
The final residue will either be an insert after position * h or a match at position h of the model. * * Unlike in global mode, in local mode all fragments are not * equiprobable since the local begin state distribution can be * anything, and each b allows different sets of fragments to be * generated (because they can only span from i to j). * * The truncation penalty should be the log of the probability of * aligning the current fragment to the model. So we need to know * the probability of generating each possible fragment. * We could calculate probability of any fragment g,h with the * following inefficient algorithm: * * For each start fragment point g, * For each start fragment point h, * For each state v, * If lpos[v] <= g && rpos[v] >= h, then * prob[g][h] += begin[v] * 2. / (st_clen[v] * (st_clen[v]+1)); * * Where lpos[v]/rpos[v] are the left/right consensus positions in * consensus subtree rooted at state v. And st_clen[v] is rpos[v] - * lpos[v] + 1, the consensus length of that subtree. * * This gives us prob[g][h], the probability of generating fragment * g,h. But we want to apply the penalty to a state, not to a * fragment, to avoid needing to know the fragment boundaries g,h * during the DP recursion when applying the penalty. * * To facilitate this, we need to find state t, the state with * smallest subtree that contains g,h. State t is relevant because * it is the state which will root the alignment of the fragment g,h * by using a truncated begin transition into t. This gives a new * algorithm: * * For each start fragment point g, * For each start fragment point h, * Identify state t, the max valued state for which * lpos[v] <= g && rpos[v] >= h, then { * prob[t] += prob[g][h] * fcount[t]++; * } * * prob[t] will be the probability of observing an alignment that * uses a truncated begin into t to align any fragment. 
Then we take * average over all fragments: prob[t] / fcount[t] (since we'll only * be aligning one of those fragments) and use the log of that * probability as the penalty for observing a truncated alignment * rooted at state t. Conveniently, it turns out that all fragments * that share t are equiprobable (have equal prob[g][h] values), so * the average probability is the actual probability for each * fragment, and thus the correct penalty to apply. * * Fortunately, we can compute the correct penalty much more * efficiently than the two algorithms shown above. The * efficient way is implemented below. A test that the penalties * are correctly computed is in cm_tr_penalties_Validate(). * * This discussion assumes we're truncating 5' and 3', but if we're * only truncating 5' or 3' The situation is a little different. * * There's an example of computing all three types of penalties for * a simple CM in ELN3 p44-45. * ************************************************************ * Section 3. Adapting truncation penalties to allow for inserts. * * We need to be able to do truncated begins into insert states * because we enforce that the first/final residue of a sequence be * included in 5'/3' truncated alignments and we want to be able * to properly align those residues if they're probably emitted * by insert states. * * The methods/logic explained in sections 1 and 2 above I believe * is correct IF we ignore inserts (assume truncated begins into * them are impossible). But we need to allow inserts, so I modify * the truncation penalties as described above to allow for inserts * as follows. We can calculate the appropriate truncated begin * penalty for all MATP_MP, MATL_ML, MATR_MR, BIF_B states as with * the methods described above by ignoring inserts. This gives us a * probability p of using that state as the root of the truncated * alignment, i.e. the truncated begin state. (The log_2 of this * probability is the penalty.) 
We then partition p amongst the * MATP_MP, MATL_ML, MATR_MR, BIF_B states and any parent insert * states, i.e. any insert state that can transition into the * match/bif state. For each match/bif state there's 0, 1 or 2 * parent inserts. We then partition p based on the relative * expected occupancy of these inserts versus the match/bif state. * * This is certainly 'incorrect' in that it doesn't reflect the * true probability of a fragment being aligned to each of the * states, but it should be a close approximation. I think doing * it correctly is basically impossible in the context of a single * state-specific penalty (i.e. the penalty would have to be per-fragment * which would be hard to deal with in the DP functions). */ /* allocate and initialize the penalty arrays */ ESL_ALLOC(trp->g_ptyAA, sizeof(float *) * NTRPENALTY); ESL_ALLOC(trp->l_ptyAA, sizeof(float *) * NTRPENALTY); ESL_ALLOC(trp->ig_ptyAA, sizeof(int *) * NTRPENALTY); ESL_ALLOC(trp->il_ptyAA, sizeof(int *) * NTRPENALTY); for(i = 0; i < NTRPENALTY; i++) { trp->g_ptyAA[i] = NULL; trp->l_ptyAA[i] = NULL; trp->il_ptyAA[i] = NULL; trp->ig_ptyAA[i] = NULL; ESL_ALLOC(trp->g_ptyAA[i], sizeof(float) * cm->M); ESL_ALLOC(trp->l_ptyAA[i], sizeof(float) * cm->M); ESL_ALLOC(trp->ig_ptyAA[i], sizeof(int) * cm->M); ESL_ALLOC(trp->il_ptyAA[i], sizeof(int) * cm->M); esl_vec_FSet(trp->g_ptyAA[i], cm->M, IMPOSSIBLE); esl_vec_FSet(trp->l_ptyAA[i], cm->M, IMPOSSIBLE); esl_vec_ISet(trp->ig_ptyAA[i], cm->M, -INFTY); esl_vec_ISet(trp->il_ptyAA[i], cm->M, -INFTY); } /* DumpEmitMap(stdout, cm->emap, cm); */ /* Calculate local begin probabilities and expected occupancy */ ESL_ALLOC(begin, sizeof(float) * cm->M); cm_CalculateLocalBeginProbs(cm, cm->pbegin, cm->t, begin); if((status = cm_ExpectedPositionOccupancy(cm, &mexpocc, &iexpocc, &psi, NULL, NULL, NULL)) != eslOK) goto ERROR; /* Fill global and local truncation penalties in a single loop. 
We * step through all nodes and set the truncation penalties for the * MATP_MP, MATL_ML, MATR_MR, and BIF_B states and any parent * inserts (i1, i2) of those states. */ g_5and3 = 2. / (cm->clen * (cm->clen+1)); /* for global mode: probability of all fragments if we're truncating 5' and 3' */ g_5or3 = 1. / cm->clen; /* for global mode: probability of all fragments if we're only truncating 5' or 3' */ prv5 = prv3 = prv53 = 0.; /* initialize 'previous' probability values used for calc'ing local truncation penalties */ for(nd = 0; nd < cm->nodes; nd++) { lpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) ? cm->emap->lpos[nd] : cm->emap->lpos[nd] + 1; rpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) ? cm->emap->rpos[nd] : cm->emap->rpos[nd] - 1; /* now set penalties for match and insert states m, i1 and maybe i2 (if we're a MATP_MP or BIF_B) */ if(cm->ndtype[nd] == END_nd) { prv5 = prv3 = prv53 = 0.; } else if(cm->ndtype[nd] == BEGL_nd || cm->ndtype[nd] == BEGR_nd) { prv5 = (cm->ndtype[nd] == BEGL_nd) ? 0. : trp->l_ptyAA[TRPENALTY_5P_ONLY][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */; prv3 = (cm->ndtype[nd] == BEGR_nd) ? 0. : trp->l_ptyAA[TRPENALTY_3P_ONLY][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */; prv53 = trp->l_ptyAA[TRPENALTY_5P_AND_3P][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */ } else if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd) { /* determine match states and insert states that pertain to this node */ m = cm->nodemap[nd]; /* MATP_MP, MATL_ML, MATR_MR, or BIF_B */ InsertsGivenNodeIndex(cm, nd-1, &i1, &i2); m_psi = psi[m]; if(cm->ndtype[nd] == MATP_MP) { m_psi += (psi[m+1] + psi[m+2]); } /* include MATP_ML and MATP_MR psi */ i1_psi = (i1 == -1) ? 0. : psi[i1]; i2_psi = (i2 == -1) ? 0. 
: psi[i2]; summed_psi = m_psi + i1_psi + i2_psi; if(ignore_inserts) { i1_psi = i2_psi = 0.; summed_psi = m_psi; } /* Global penalties */ /* sanity check, we should only set truncation penalty once per state */ if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][m])) goto ERROR; if((i1 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR; if((i2 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR; /* divide up the probability g_5and3 amongst relevant states m, i1, i2, weighted by psi */ trp->g_ptyAA[TRPENALTY_5P_AND_3P][m] = (m_psi / summed_psi) * g_5and3; if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * g_5and3; if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * g_5and3; /* same thing, for 5P only and 3P only */ if(rpos == cm->clen) { /* else it will remain IMPOSSIBLE */ trp->g_ptyAA[TRPENALTY_5P_ONLY][m] = (m_psi / summed_psi) * g_5or3; if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3; if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3; } if(lpos == 1) { /* else it will remain IMPOSSIBLE */ trp->g_ptyAA[TRPENALTY_3P_ONLY][m] = (m_psi / summed_psi) * g_5or3; if(i1 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3; if(i2 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3; } /* Local penalties */ subtree_clen = rpos - lpos + 1; nfrag5 = subtree_clen; nfrag3 = subtree_clen; nfrag53 = (subtree_clen * (subtree_clen+1)) / 2; /* determine probability of observing a fragment aligned at * state m (here, m is what I call t above and in notes) and * partition that probability between m and i1 and/or i2 by * relative occupancy of match versus inserts */ cur5 = begin[m] / (float) nfrag5 + prv5; cur3 = begin[m] / (float) nfrag3 + prv3; cur53 = begin[m] / (float) nfrag53 + prv53; /* sanity check, we should only set truncation penalty once per state */ 
if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][m])) goto ERROR; if((i1 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR; if((i2 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR; trp->l_ptyAA[TRPENALTY_5P_AND_3P][m] = (m_psi / summed_psi) * cur53; if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * cur53; if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * cur53; trp->l_ptyAA[TRPENALTY_5P_ONLY][m] = (m_psi / summed_psi) * cur5; if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * cur5; if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * cur5; trp->l_ptyAA[TRPENALTY_3P_ONLY][m] = (m_psi / summed_psi) * cur3; if(i1 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * cur3; if(i2 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * cur3; prv5 = (cm->ndtype[nd] == MATL_nd) ? cur5 : 0.; prv3 = (cm->ndtype[nd] == MATR_nd) ? cur3 : 0.; prv53 = cur53; } } /* all penalties are currently probabilities, convert them to log * probs and set integer penalties (careful, we have to check if * IMPOSSIBLE first) */ for(v = 0; v < cm->M; v++) { if((cm->stid[v] == MATP_MP || cm->stid[v] == MATL_ML || cm->stid[v] == MATR_MR || cm->stid[v] == BIF_B) || ((cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) && (! StateIsDetached(cm, v)))) { /* Check for rare special case: if we're a MATP_IL and next * two states are MATP_IR and END_E, then we won't have set * a trunction penalty. This state will keep an impossible * truncated begin score, if we did a truncated begin into * it we'd just emit from the MATP_IL and then go to the * END_E anyway (the MATP_IR will be detached. */ if(cm->stid[v] == MATP_IL && cm->ndtype[cm->ndidx[v]+1] == END_nd) continue; /* glocal 5P AND 3P: all of these should have been set to a non-IMPOSSIBLE value */ if(! 
NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR; trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0); trp->g_ptyAA[TRPENALTY_5P_AND_3P][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]); /* glocal 5P only: some may be IMPOSSIBLE */ if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) { trp->ig_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_ONLY][v], 1.0); trp->g_ptyAA[TRPENALTY_5P_ONLY][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_ONLY][v]); } /* glocal 5P only: some may be IMPOSSIBLE */ if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) { trp->ig_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_3P_ONLY][v], 1.0); trp->g_ptyAA[TRPENALTY_3P_ONLY][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_3P_ONLY][v]); } /* local penalties all of these should have been set to a non-IMPOSSIBLE value */ if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR; if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_ONLY][v])) goto ERROR; if(! 
NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_3P_ONLY][v])) goto ERROR; trp->il_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0); trp->il_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_ONLY][v], 1.0); trp->il_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_3P_ONLY][v], 1.0); trp->l_ptyAA[TRPENALTY_5P_AND_3P][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]); trp->l_ptyAA[TRPENALTY_5P_ONLY][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]); trp->l_ptyAA[TRPENALTY_3P_ONLY][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]); } } if(ignore_inserts) { if((status = cm_tr_penalties_Validate(trp, cm, 0.0001, errbuf)) != eslOK) { printf("%s", errbuf); goto ERROR; } } /* cm_tr_penalties_Dump(stdout, cm, trp); */ if(mexpocc != NULL) free(mexpocc); if(iexpocc != NULL) free(iexpocc); if(psi != NULL) free(psi); if(begin != NULL) free(begin); return trp; ERROR: if(mexpocc != NULL) free(mexpocc); if(iexpocc != NULL) free(iexpocc); if(psi != NULL) free(psi); if(begin != NULL) free(begin); if(trp != NULL) cm_tr_penalties_Destroy(trp); return NULL; }