/* MAP_MSAS_WORKSPACE
 *
 * Every heap buffer acquired while mapping msa1 onto msa2. The buffers
 * live in one struct so that map_msas() can release all of them on
 * every exit path, including early error returns from the worker.
 */
typedef struct {
  int  **one2two;        /* [0..alen1][0..alen2] shared-residue counts          */
  int  **mx;             /* [0..alen1][0..alen2] DP score matrix                */
  int  **tb;             /* [0..alen1][0..alen2] DP traceback pointers          */
  int   *res1_per_apos;  /* [0..alen1] per-column residue totals for msa1       */
  int   *one2two_map;    /* [0..alen1] the result; handed to caller on success  */
  int   *rf2a_map1;      /* msa1 RF position -> alignment column, or NULL       */
  int   *rf2a_map2;      /* msa2 RF position -> alignment column, or NULL       */
  int   *a2rf_map1;      /* msa1 alignment column -> RF position, or NULL       */
  int   *a2rf_map2;      /* msa2 alignment column -> RF position, or NULL       */
  char  *seq1;           /* text scratch for one sequence of msa1               */
  char  *seq2;           /* text scratch for one sequence of msa2               */
} MAP_MSAS_WORKSPACE;

/* map_msas_free_rows
 *
 * Free a row-pointer matrix of <nrows> rows. Safe on a NULL matrix and
 * on NULL rows, so it can be used on a partially built matrix.
 */
static void
map_msas_free_rows(int **rows, int nrows)
{
  int r;

  if (rows == NULL) return;
  for (r = 0; r < nrows; r++) free(rows[r]);
  free(rows);
}

/* map_msas_compute
 *
 * Does all the work of map_msas(). Every buffer it allocates is stored
 * in <w> as soon as it exists; this function never frees anything, so
 * it may bail out at any point and leave cleanup to the caller.
 *
 * Returns eslOK on success, in which case w->one2two_map holds the
 * finished map. On failure returns a non-OK status; for the failures
 * reported with ESL_FAIL, <errbuf> is filled in.
 */
static int
map_msas_compute(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, MAP_MSAS_WORKSPACE *w)
{
  int       status;
  int     **one2two;              /* [0..alen1][0..alen2] number of seqs whose residue in msa1 column
                                   * apos1 is the same raw residue that sits in msa2 column apos2 */
  int     **mx;                   /* [0..alen1][0..alen2] dp matrix, best score using msa1 columns
                                   * 1..apos1 and msa2 columns 1..apos2 */
  int     **tb;                   /* [0..alen1][0..alen2] traceback ptrs: DIAG, VERT or HORZ */
  int      *res1_per_apos;        /* [0..alen1] row sums of one2two, one per msa1 column */
  int      *one2two_map;          /* [0..alen1] msa2 column that each msa1 column maps to, 0 if none */
  int      *rf2a_map1 = NULL;     /* msa1 map of RF positions to alignment columns, NULL if msa1->rf == NULL */
  int      *rf2a_map2 = NULL;     /* same, for msa2 */
  int      *a2rf_map1 = NULL;     /* msa1 map of alignment columns to RF positions, NULL if msa1->rf == NULL */
  int      *a2rf_map2 = NULL;     /* same, for msa2 */
  char     *seq1, *seq2;          /* text copies of one sequence, dealigned in place for comparison */
  int64_t   len1, len2;           /* dealigned lengths of seq1, seq2 */
  int       apos1, apos2;         /* alignment column counters for msa1, msa2 */
  int       alen1, alen2;         /* alignment lengths */
  int       rfpos1, rfpos2;       /* reference position counters */
  int       rflen1, rflen2;       /* reference (non-gap RF) lengths */
  int       isgap1, isgap2;       /* is the current residue a gap in msa1, msa2? */
  int       i;                    /* counter over sequences */
  int       sc;                   /* score of the optimal path, read from the corner of mx */
  int       tb_sc;                /* score re-accumulated during traceback, must equal sc */
  int       total_res     = 0;    /* total number of residues in msa1 */
  int       total_cres1   = 0;    /* residues of msa1 that fall in RF columns */
  int       covered_cres1 = 0;    /* of those, how many also sit in the mapped msa2 column */
  float     coverage;             /* tb_sc / total_res */
  int       choices[NCHOICES];    /* candidate scores for one DP cell, indexed by DIAG/VERT/HORZ */
  int       i_choice;             /* index of the winning candidate */
  int       n_shared;             /* one2two[apos1][apos2] for the row being printed */
  int       n_col;                /* res1_per_apos[apos1] for the row being printed */
  double    frac;                 /* n_shared / n_col, or 0 when the column is empty */
  int       be_quiet = esl_opt_GetBoolean(go, "-q");

  /* contract check */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2));
  alen1 = msa1->alen;
  alen2 = msa2->alen;

  /* Map reference (non-gap RF) columns to alignment positions. The maps are
   * recorded in the workspace only after the callee reports success, so a
   * failed call never leaves a pointer behind for the caller to free.
   */
  rflen1 = rflen2 = 0;
  if(msa1->rf != NULL) {
    if((status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) goto ERROR;
    w->rf2a_map1 = rf2a_map1;
    w->a2rf_map1 = a2rf_map1;
  }
  if(msa2->rf != NULL) {
    if((status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) goto ERROR;
    w->rf2a_map2 = rf2a_map2;
    w->a2rf_map2 = a2rf_map2;
  }
  if(! be_quiet) {
    printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 1), alen1);
    printf("# %-25s alignment length: %d\n", esl_opt_GetArg(go, 2), alen2);
  }

  /* collect counts in one2two[apos1][apos2]: number of sequences whose residue in
   * msa1 column apos1 is aligned in msa2 column apos2.
   */
  ESL_ALLOC(w->seq1, sizeof(char) * (alen1+1));
  ESL_ALLOC(w->seq2, sizeof(char) * (alen2+1));
  seq1 = w->seq1;
  seq2 = w->seq2;

  ESL_ALLOC(w->one2two, sizeof(int *) * (alen1+1));
  one2two = w->one2two;
  /* NULL the rows first so a failed row allocation leaves a freeable matrix */
  for(apos1 = 0; apos1 <= alen1; apos1++) one2two[apos1] = NULL;
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(one2two[apos1], (alen2+1), 0);
  }

  total_res = 0;
  for(i = 0; i < msa1->nseq; i++) {
    /* ensure raw (unaligned) seq i in the 2 msas is the same */
    esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1);
    esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */
    esl_strdealign(seq1, seq1, "-_.~", &len1);
    esl_strdealign(seq2, seq2, "-_.~", &len2);

    if(len1 != len2) {
      ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n",
               i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2);
    }
    if(strncmp(seq1, seq2, len1) != 0) ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
    total_res += len1;

    /* Walk both aligned copies of sequence i in lockstep, skipping gaps, and
     * count each matching residue pair under its (msa1 column, msa2 column).
     * NOTE(review): once one cursor has passed its last column this reads
     * index alen+1; presumably digital sequences carry a non-gap sentinel
     * there - confirm.
     */
    apos1 = apos2 = 1;
    while((apos1 <= alen1) || (apos2 <= alen2)) {
      isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]);
      isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]);
      if      ( isgap1 &&  isgap2) { apos1++; apos2++; }
      else if ( isgap1 && !isgap2) { apos1++;          }
      else if (!isgap1 &&  isgap2) {          apos2++; }
      else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) {
        one2two[apos1++][apos2++]++;
      }
      else {
        /* Two non-gap symbols that disagree: neither cursor can advance, so
         * without this branch the loop would never terminate.
         */
        ESL_FAIL(eslEINVAL, errbuf, "in map_msas(), seq number %d: residue in column %d of %s does not match residue in column %d of %s\n",
                 i, apos1, esl_opt_GetArg(go, 1), apos2, esl_opt_GetArg(go, 2));
      }
    }
  }

  /******************************************************************
   * DP alignment of msa1 to msa2
   * dp matrix: mx[apos1][apos2] apos1=1..alen1, apos2=1..alen2
   * mx[apos1][apos2] = score of the best alignment of msa1 columns 1..apos1
   *                    to msa2 columns 1..apos2. Score is the number of msa1
   *                    residues that sit in the msa2 column their own column
   *                    is aligned to.
   */

  /******************************************************************
   * initialization
   */
  ESL_ALLOC(w->mx, sizeof(int *) * (alen1+1));
  mx = w->mx;
  for(apos1 = 0; apos1 <= alen1; apos1++) mx[apos1] = NULL;
  ESL_ALLOC(w->tb, sizeof(int *) * (alen1+1));
  tb = w->tb;
  for(apos1 = 0; apos1 <= alen1; apos1++) tb[apos1] = NULL;

  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1));
    ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(mx[apos1], (alen2+1), 0);
    esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */
    tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==1 */
  }
  esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==1 */
  tb[0][0] = -1; /* last cell, special value: all tracebacks end here */

  ESL_ALLOC(w->res1_per_apos, sizeof(int) * (alen1+1));
  res1_per_apos = w->res1_per_apos;
  esl_vec_ISet(res1_per_apos, (alen1+1), 0);

  /*****************************************************************
   * recursion
   */
  for(apos1 = 1; apos1 <= alen1; apos1++) {
    for(apos2 = 1; apos2 <= alen2; apos2++) {
      choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2];
      choices[VERT] = mx[ apos1   ][(apos2-1)];
      choices[HORZ] = mx[(apos1-1)][ apos2   ];
      i_choice = esl_vec_IArgMax(choices, NCHOICES);
      mx[apos1][apos2] = choices[i_choice];
      tb[apos1][apos2] = i_choice;
      res1_per_apos[apos1] += one2two[apos1][apos2];
    }
  }

  total_cres1 = 0;
  if(rf2a_map1 != NULL) {
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]];
  }

  /*****************************************************************
   * traceback
   */
  sc = mx[alen1][alen2];
  if(!be_quiet) {
    if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
      printf("# %12s %12s %22s\n", " msa 1 ", " msa 2 ", "");
      printf("# %12s %12s %22s\n", "------------", "------------", "");
      printf("# %5s %5s %5s %5s %22s\n", "rfpos", "apos", "rfpos", "apos", " num common residues");
      printf("# %5s %5s %5s %5s %22s\n", "-----", "-----", "-----", "-----", "---------------------");
    }
    else if(a2rf_map1 != NULL) {
      printf("# %12s %5s %22s\n", " msa 1 ", "msa 2", "");
      printf("# %12s %5s %22s\n", "------------", "-----", "");
      printf("# %5s %5s %5s %22s\n", "rfpos", "apos", "apos", " num common residues");
      printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else if (a2rf_map2 != NULL) {
      printf("# %5s %12s %22s\n", "msa 1", " msa 2 ", "");
      printf("# %5s %12s %22s\n", "-----", "------------", "");
      printf("# %5s %5s %5s %22s\n", "apos", "rfpos", "apos", " num common residues");
      printf("# %5s %5s %5s %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else {
      printf("# %5s %5s %22s\n", "msa 1", "msa 2", "");
      printf("# %5s %5s %22s\n", "-----", "-----", "");
      printf("# %5s %5s %22s\n", "apos", "apos", " num common residues");
      printf("# %5s %5s %22s\n", "-----", "-----", "---------------------");
    }
  }

  /* traceback, and build one2two_map[] */
  apos1 = alen1;
  apos2 = alen2;
  tb_sc = 0;
  covered_cres1 = 0;
  ESL_ALLOC(w->one2two_map, sizeof(int) * (alen1+1));
  one2two_map = w->one2two_map;
  esl_vec_ISet(one2two_map, (alen1+1), 0);
  one2two_map[0] = -1; /* invalid */

  while(tb[apos1][apos2] != -1) {
    if(tb[apos1][apos2] == DIAG) { /* diagonal move: msa1 column apos1 maps to msa2 column apos2 */
      rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1];
      rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2];
      if(!be_quiet) {
        n_shared = one2two[apos1][apos2];
        n_col    = res1_per_apos[apos1];
        frac     = (n_col == 0) ? 0.0000 : ((float) n_shared / (float) n_col);

        if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
          if(rfpos1 == -1 && rfpos2 == -1) {
            printf(" %5s %5d --> %5s %5d %5d / %5d (%.4f)\n", "-", apos1, "-", apos2, n_shared, n_col, frac);
          }
          else if (rfpos1 == -1) {
            printf(" %5s %5d --> %5d %5d %5d / %5d (%.4f)\n", "-", apos1, rfpos2, apos2, n_shared, n_col, frac);
          }
          else if (rfpos2 == -1) {
            printf(" %5d %5d --> %5s %5d %5d / %5d (%.4f)\n", rfpos1, apos1, "-", apos2, n_shared, n_col, frac);
          }
          else {
            printf(" %5d %5d --> %5d %5d %5d / %5d (%.4f)\n", rfpos1, apos1, rfpos2, apos2, n_shared, n_col, frac);
          }
        }
        else if(a2rf_map1 != NULL) {
          if (rfpos1 == -1) {
            printf(" %5s %5d --> %5d %5d / %5d (%.4f)\n", "-", apos1, apos2, n_shared, n_col, frac);
          }
          else {
            printf(" %5d %5d --> %5d %5d / %5d (%.4f)\n", rfpos1, apos1, apos2, n_shared, n_col, frac);
          }
        }
        else if (a2rf_map2 != NULL) {
          if (rfpos2 == -1) {
            printf(" %5d --> %5s %5d %5d / %5d (%.4f)\n", apos1, "-", apos2, n_shared, n_col, frac);
          }
          else {
            printf(" %5d --> %5d %5d %5d / %5d (%.4f)\n", apos1, rfpos2, apos2, n_shared, n_col, frac);
          }
        }
        else {
          printf(" %5d --> %5d %5d / %5d (%.4f)\n", apos1, apos2, n_shared, n_col, frac);
        }
      }
      tb_sc += one2two[apos1][apos2];
      one2two_map[apos1] = apos2;
      if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */
      apos1--;
      apos2--;
    }
    else if(tb[apos1][apos2] == VERT) {
      apos2--; /* vertical move */
    }
    else if(tb[apos1][apos2] == HORZ) {
      apos1--; /* horizontal move */
    }
    else { /* shouldn't happen: an unfilled (-2) traceback cell */
      ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]);
    }
  }
  /* done DP code
   **********************************/

  if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc);
  if(tb_sc != sc) ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc);

  /* an alignment with no residues at all has nothing to cover; report 0 rather than dividing by zero */
  coverage = (total_res > 0) ? ((float) tb_sc / (float) total_res) : 0.0f;
  printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));

  /* print masks if nec */
  if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) goto ERROR;

  return eslOK;

 ERROR:
  return status;
}

/* map_msas
 *
 * Align msa1 and msa2.
 * For each column in msa1, determine the corresponding column
 * in msa2. This implementation requires:
 *  - msa1 and msa2 are both digitized
 *  - msa1 and msa2 contain exactly the same sequences in the same order
 * Note: the seqs in msa1 and msa2 do not have to have the same names.
 *
 * Uses a DP algorithm similar to Needleman-Wunsch, but that's aligning
 * two alignment columns at a time instead of two residues.
 *
 * Args:    go      - command-line options; "-q" suppresses the per-column table,
 *                    and args 1 and 2 are echoed in messages as the two file names
 *          errbuf  - filled with a message for failures reported via ESL_FAIL
 *          msa1    - alignment whose columns are mapped
 *          msa2    - alignment the columns are mapped onto
 *          ret_msa1_to_msa2_map - on success, set to a newly allocated array
 *                    [0..msa1->alen]: element a is the msa2 column that msa1
 *                    column a maps to, 0 if it maps to none; element 0 is -1.
 *                    Ownership of the array passes to the caller.
 *
 * Returns: eslOK on success. On any failure a non-OK status is returned,
 *          <*ret_msa1_to_msa2_map> is left untouched, and every buffer
 *          allocated along the way has been freed.
 */
static int
map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map)
{
  MAP_MSAS_WORKSPACE w;
  int                status;
  int                nrows;

  w.one2two       = NULL;
  w.mx            = NULL;
  w.tb            = NULL;
  w.res1_per_apos = NULL;
  w.one2two_map   = NULL;
  w.rf2a_map1     = NULL;
  w.rf2a_map2     = NULL;
  w.a2rf_map1     = NULL;
  w.a2rf_map2     = NULL;
  w.seq1          = NULL;
  w.seq2          = NULL;

  status = map_msas_compute(go, errbuf, msa1, msa2, &w);

  /* clean up: all matrices have one row per msa1 column, plus row 0 */
  nrows = msa1->alen + 1;
  map_msas_free_rows(w.mx,      nrows);
  map_msas_free_rows(w.tb,      nrows);
  map_msas_free_rows(w.one2two, nrows);
  free(w.res1_per_apos);
  free(w.rf2a_map1);
  free(w.rf2a_map2);
  free(w.a2rf_map1);
  free(w.a2rf_map2);
  free(w.seq1);
  free(w.seq2);

  if(status != eslOK) {
    free(w.one2two_map);
    return status;
  }

  *ret_msa1_to_msa2_map = w.one2two_map;
  return eslOK;
}
/* The "enumeration" test samples a random enumerable HMM with
 * p7_hmm_SampleEnumerable() and configures a p7_UNILOCAL profile from it.
 *
 * It then enumerates every digital sequence of length 0..M over the
 * alphabet, scores each one with both p7_GViterbi() and p7_GForward(),
 * converts the Forward score back to a probability by adding the
 * background log likelihood, and sums those probabilities. The test fails
 * unless that sum lies within [0.999, 1.001].
 *
 * The Viterbi score is only reported (under --vv); it is not compared
 * against the Forward score, and no sequence of length M+1 is scored.
 *
 * Because this function works in unscaled probabilities, adding them up,
 * all P(seq) terms must be >> DBL_EPSILON. That means M must be small; on
 * the order of <= 10. The number of sequences enumerated also grows as
 * K^M.
 *
 * Args:  go   - options; "--vv" prints one line per sequence, "-v" prints the total
 *        r    - random number source used to sample the HMM
 *        abc  - alphabet the HMM and sequences are built over
 *        M    - length of the sampled HMM, and the longest sequence enumerated
 *
 * Any failure is fatal, via esl_fatal().
 */
static void
utest_enumeration(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, int M)
{
  char            errbuf[eslERRBUFSIZE];
  P7_HMM         *hmm  = NULL;
  P7_PROFILE     *gm   = NULL;
  P7_BG          *bg   = NULL;
  ESL_DSQ        *dsq  = NULL;
  P7_GMX         *gx   = NULL;
  float           vsc, fsc;     /* Viterbi and Forward scores            */
  float           bg_ll;        /* log P(seq | bg)                       */
  double          fp;           /* P(seq | model)                        */
  int             L;            /* current sequence length               */
  int             i;
  double          total_p;      /* running sum of fp over all sequences  */
  char           *seq;          /* text form of dsq, for --vv output     */

  /* Sample an enumerable HMM & profile of length M. */
  if (p7_hmm_SampleEnumerable(r, M, abc, &hmm)      != eslOK) esl_fatal("failed to sample an enumerable HMM");
  if ((bg = p7_bg_Create(abc))                      == NULL)  esl_fatal("failed to create null model");
  if ((gm = p7_profile_Create(hmm->M, abc))         == NULL)  esl_fatal("failed to create profile");
  if (p7_ProfileConfig(hmm, bg, gm, 0, p7_UNILOCAL) != eslOK) esl_fatal("failed to config profile");
  if (p7_hmm_Validate    (hmm, errbuf, 0.0001)      != eslOK) esl_fatal("whoops, HMM is bad!: %s", errbuf);
  if (p7_profile_Validate(gm, errbuf, 0.0001)       != eslOK) esl_fatal("whoops, profile is bad!: %s", errbuf);

  if ( (dsq = malloc(sizeof(ESL_DSQ) * (M+3)))      == NULL)  esl_fatal("allocation failed");
  if ( (seq = malloc(sizeof(char)    * (M+2)))      == NULL)  esl_fatal("allocation failed");
  if ((gx   = p7_gmx_Create(hmm->M, M+3))           == NULL)  esl_fatal("matrix creation failed");

  /* Enumerate all sequences of length L <= M */
  total_p = 0;
  for (L = 0; L <= M; L++)
    {
      /* Initialize dsq of length L at 0000..., with a sentinel at each end */
      dsq[0] = dsq[L+1] = eslDSQ_SENTINEL;
      for (i = 1; i <= L; i++) dsq[i] = 0;

      while (1) /* enumeration of seqs of length L */
        {
          if (p7_GViterbi(dsq, L, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed");
          if (p7_GForward(dsq, L, gm, gx, &fsc) != eslOK) esl_fatal("forward failed");

          /* calculate bg log likelihood component of the scores */
          for (bg_ll = 0., i = 1; i <= L; i++) bg_ll += log(bg->f[dsq[i]]);

          /* convert to a probability, adding the bg LL back to the LLR */
          fp = exp(fsc + bg_ll);

          if (esl_opt_GetBoolean(go, "--vv")) {
            esl_abc_Textize(abc, dsq, L, seq);
            printf("probability of sequence: %10s %16g (lod v=%8.4f f=%8.4f)\n", seq, fp, vsc, fsc);
          }
          total_p += fp;

          /* Increment dsq like a reversed odometer: bump the first position
           * that is not yet at its maximum, zeroing the ones before it.
           */
          for (i = 1; i <= L; i++)
            if (dsq[i] < abc->K-1) { dsq[i]++; break; }
            else                   { dsq[i] = 0;      }
          if (i > L) break; /* every position rolled over: we're done enumerating sequences */
        }
    }

  /* That sum is subject to significant numerical error because of
   * discretization error in FLogsum(); don't expect it to be too close.
   */
  if (total_p < 0.999 || total_p > 1.001)
    esl_fatal("Enumeration unit test failed: total Forward p isn't near 1.0 (%g)", total_p);
  if (esl_opt_GetBoolean(go, "-v")) {
    printf("enumeration test: total p is %g\n", total_p);
  }

  p7_gmx_Destroy(gx);
  p7_bg_Destroy(bg);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  free(dsq);
  free(seq);
}