Esempio n. 1
0
/* map_msas_free_rows
 *
 * Free a ragged 2D int matrix: each of the <nrows> rows, then the array of
 * row pointers itself. <rows> may be NULL, and so may any individual row,
 * which lets callers release a partially built matrix.
 */
static void
map_msas_free_rows(int **rows, int nrows)
{
  int r;

  if (rows == NULL) return;
  for (r = 0; r < nrows; r++) free(rows[r]);
  free(rows);
}

/* map_msas_print_pair
 *
 * Print one row of the column mapping table to stdout: msa1 column <apos1>
 * mapped to msa2 column <apos2>, followed by <ncommon> / <nres1> and their
 * ratio (0.0000 when <nres1> is 0).
 *
 * A reference position field is printed for an msa only if the matching
 * <has_rf1>/<has_rf2> flag is set; a reference position of -1 is shown as "-".
 */
static void
map_msas_print_pair(int has_rf1, int has_rf2, int rfpos1, int apos1, int rfpos2, int apos2, int ncommon, int nres1)
{
  double frac = (nres1 == 0) ? 0.0000 : ((float) ncommon / (float) nres1);

  printf("  ");
  if (has_rf1) {
    if (rfpos1 == -1) printf("%5s  ", "-");
    else              printf("%5d  ", rfpos1);
  }
  printf("%5d  -->  ", apos1);
  if (has_rf2) {
    if (rfpos2 == -1) printf("%5s  ", "-");
    else              printf("%5d  ", rfpos2);
  }
  printf("%5d  %5d / %5d (%.4f)\n", apos2, ncommon, nres1, frac);
}

/* map_msas
 *
 * Align msa1 and msa2.
 * For each column in msa1, determine the corresponding column
 * in msa2. This implementation requires:
 *  - msa1 and msa2 are both digitized
 *  - msa1 and msa2 contain exactly the same sequences in the same order
 * Note: the seqs in msa1 and msa2 do not have to have the same names.
 *
 * Uses a DP algorithm similar to Needleman-Wunsch, but that's aligning
 * two alignment columns at a time instead of two residues. The score of
 * pairing column a1 of msa1 with column a2 of msa2 is the number of
 * sequences that have the same residue in both columns.
 *
 * Args:     go      - command line options; "-q" suppresses the per-column
 *                     table, args 1 and 2 are used as file names in messages
 *           errbuf  - filled with an error message on failure
 *           msa1    - first alignment
 *           msa2    - second alignment
 *           ret_msa1_to_msa2_map - RETURN: [0..alen1] map; element a (a >= 1)
 *                     is the msa2 column that msa1 column a maps to, or 0 if
 *                     that column is unmapped. Element 0 is -1 (invalid).
 *                     Allocated here; the caller takes ownership.
 *
 * Returns:  eslOK on success.
 *           eslEINVAL if the contract is violated, if the unaligned sequences
 *           differ, or if the traceback is inconsistent; errbuf is filled.
 *           Any other non-OK status is passed through from a failed
 *           allocation, map_rfpos_to_apos() or map2masks().
 *           On failure, everything allocated here is freed and
 *           <*ret_msa1_to_msa2_map> is left untouched.
 */
static int
map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map)
{
  int status;
  int **one2two = NULL;       /* [0..a1..alen1][0..a2..alen2] number of sequences with the same residue
                               * in column a1 of msa1 and column a2 of msa2 */
  int *rf2a_map1 = NULL;      /* msa1 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa1->rf == NULL */
  int *rf2a_map2 = NULL;      /* msa2 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa2->rf == NULL */
  int *a2rf_map1 = NULL;      /* msa1 map of alignment columns to reference columns, NULL if msa1->rf == NULL */
  int *a2rf_map2 = NULL;      /* msa2 map of alignment columns to reference columns, NULL if msa2->rf == NULL */
  int apos1, apos2;           /* counters over alignment position in msa1, msa2 respectively */
  int alen1, alen2;           /* alignment lengths */
  int rfpos1, rfpos2;         /* counters over reference positions */
  int rflen1, rflen2;         /* reference (non-gap RF) lengths */
  int **mx = NULL;            /* [0..a1..alen1][0..a2..alen2] dp matrix, score of max scoring aln
                               * from 1..a1 in msa1 and 1..a2 in msa 2 */
  int **tb = NULL;            /* [0..a1..alen1][0..a2..alen2] traceback ptrs: DIAG, VERT or HORZ */
  char *seq1 = NULL;          /* temporary strings for ensuring dealigned sequences in msa1 and msa2 are identical */
  char *seq2 = NULL;
  int64_t len1, len2;         /* length of seq1, seq2 */
  int isgap1, isgap2;         /* is this residue a gap in msa1, msa2? */
  int i;                      /* counter over sequences */
  int *res1_per_apos = NULL;  /* [0..apos..alen1] sum of one2two[apos][1..alen2] for column apos of msa1 */
  int sc;                     /* max score of full path (alignment) through dp mx */
  int tb_sc;                  /* score of traceback, should equal sc */
  int *one2two_map = NULL;    /* [0..a..alen1] the alignment, msa2 column that column apos1 in msa1 maps to */
  int total_res = 0;          /* total number of residues in msa1 */
  float coverage;             /* fraction of total_res that are within mapped msa2 columns from one2two_map,
                               * this is tb_sc / total_res */
  int total_cres1 = 0;        /* total number of residues in reference positions in msa1 */
  int covered_cres1 = 0;      /* number of residues in reference positions in msa1 that also appear in the
                               * corresponding mapped column of msa2 */
  int be_quiet = esl_opt_GetBoolean(go, "-q");
  int *choices = NULL;        /* [0..NCHOICES-1] candidate scores for the current dp cell */
  int i_choice;

  /* contract check; nothing is allocated yet so returning directly is safe */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2));
  /* the loop over msa1's sequences below indexes msa2->ax[i] and msa2->sqname[i] */
  if(msa1->nseq != msa2->nseq) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() %s and %s contain different numbers of sequences (%d and %d).\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), msa1->nseq, msa2->nseq);
  alen1 = msa1->alen;
  alen2 = msa2->alen;

  /* Map msa1 (reference) columns to alignment positions.
   * After a failed call the output pointers are reset to NULL so the shared
   * cleanup never frees something the callee may already have released.
   */
  rflen1 = rflen2 = 0;
  if(msa1->rf != NULL && (status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) {
    rf2a_map1 = a2rf_map1 = NULL;
    goto ERROR;
  }
  if(msa2->rf != NULL && (status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) {
    rf2a_map2 = a2rf_map2 = NULL;
    goto ERROR;
  }
  if(! be_quiet) {
    printf("# %-25s alignment length:              %d\n", esl_opt_GetArg(go, 1), alen1);
    printf("# %-25s alignment length:              %d\n", esl_opt_GetArg(go, 2), alen2);
  }

  /* collect counts in one2two[i][j]: number of sequences for which the residue in msa1
   * alignment column i is aligned in msa2 alignment column j.
   */
  ESL_ALLOC(seq1, sizeof(char) * (alen1+1));
  ESL_ALLOC(seq2, sizeof(char) * (alen2+1));
  ESL_ALLOC(one2two, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) one2two[apos1] = NULL; /* so a partial build can be freed */
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(one2two[apos1], (alen2+1), 0);
  }

  total_res = 0;
  for(i = 0; i < msa1->nseq; i++) {
    /* ensure raw (unaligned) seq i in the 2 msas is the same */
    esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1);
    esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */
    esl_strdealign(seq1, seq1, "-_.~", &len1);
    esl_strdealign(seq2, seq2, "-_.~", &len2);

    if(len1 != len2) {
      ESL_XFAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n",
                i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2);
    }
    if(strncmp(seq1, seq2, len1) != 0) ESL_XFAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
    total_res += (int) len1;

    /* Walk both aligned copies of seq i in lockstep, skipping gaps, and count each
     * residue under the (msa1 column, msa2 column) pair it occupies. Once one
     * alignment is exhausted its index rests on position alen+1, which is read but
     * is neither a gap nor equal to any residue of the other alignment.
     */
    apos1 = apos2 = 1;
    while((apos1 <= alen1) || (apos2 <= alen2)) {
      isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]);
      isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]);
      if      ( isgap1 &&  isgap2) { apos1++; apos2++; }
      else if ( isgap1)            { apos1++;          }
      else if ( isgap2)            {          apos2++; }
      else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) {
        one2two[apos1][apos2]++;
        apos1++; apos2++;
      }
      else {
        /* Two non-gap symbols that differ: neither index can advance, so without
         * this branch the loop would never terminate. This is reachable when a
         * symbol is stripped by the text comparison above but is not a gap to
         * esl_abc_XIsGap().
         */
        ESL_XFAIL(eslEINVAL, errbuf, "seq number %d cannot be matched up residue by residue between %s (alignment position %d) and %s (alignment position %d)\n",
                  i, esl_opt_GetArg(go, 1), apos1, esl_opt_GetArg(go, 2), apos2);
      }
    }
  }

  /******************************************************************
   * DP alignment of msa1 to msa2
   * dp matrix: mx[apos1][apos2] apos1=1..alen1, apos2=1..alen2 (apos1=0 || apos2=0 is a boundary)
   * mx[apos1][apos2] = score of maximal alignment of columns 1..apos1 of msa1 to columns
   *                    1..apos2 of msa2. Score is number of residues from msa1 columns
   *                    1..apos1 that exist in their respective aligned columns in msa2 (the growing
   *                    maximally scoring alignment).
   */

  /******************************************************************
   * initialization
   */
  ESL_ALLOC(mx, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) mx[apos1] = NULL;
  ESL_ALLOC(tb, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) tb[apos1] = NULL;
  for(apos1 = 0; apos1 <= alen1; apos1++) {
    ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1));
    ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(mx[apos1], (alen2+1), 0);
    esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */
    tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==0 */
  }
  esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==0 */
  tb[0][0] = -1; /* all tracebacks must end here: special stop value */

  ESL_ALLOC(res1_per_apos, sizeof(int) * (alen1+1));
  esl_vec_ISet(res1_per_apos, (alen1+1), 0);

  /*****************************************************************
   * recursion
   */
  ESL_ALLOC(choices, sizeof(int) * NCHOICES);
  for(apos1 = 1; apos1 <= alen1; apos1++) {
    for(apos2 = 1; apos2 <= alen2; apos2++) {
      choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2];
      choices[VERT] = mx[ apos1   ][(apos2-1)];
      choices[HORZ] = mx[(apos1-1)][ apos2   ];
      i_choice  = esl_vec_IArgMax(choices, NCHOICES);
      mx[apos1][apos2] = choices[i_choice];
      tb[apos1][apos2] = i_choice;
      res1_per_apos[apos1] += one2two[apos1][apos2];
    }
  }
  free(choices);
  choices = NULL;

  total_cres1 = 0;
  if(rf2a_map1 != NULL) {
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]];
  }

  /*****************************************************************
   * traceback
   */

  sc = mx[alen1][alen2];
  if(!be_quiet) {
    /* table header; its layout depends on which alignments have RF annotation */
    if(a2rf_map1 != NULL && a2rf_map2 != NULL) {
      printf("# %12s       %12s  %22s\n", "   msa 1   ", "   msa 2   ", "");
      printf("# %12s       %12s  %22s\n", "------------", "------------", "");
      printf("# %5s  %5s       %5s  %5s  %22s\n", "rfpos",  "apos",  "rfpos",  "apos",  " num common residues");
      printf("# %5s  %5s       %5s  %5s  %22s\n", "-----", "-----", "-----", "-----", "---------------------");
    }
    else if(a2rf_map1 != NULL) {
      printf("# %12s        %5s  %22s\n", "   msa 1   ", "msa 2", "");
      printf("# %12s        %5s  %22s\n", "------------", "-----", "");
      printf("# %5s  %5s       %5s  %22s\n", "rfpos",  "apos",  "apos",  " num common residues");
      printf("# %5s  %5s       %5s  %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else if (a2rf_map2 != NULL) {
      printf("# %5s        %12s  %22s\n", "msa 1", "   msa 2   ", "");
      printf("# %5s        %12s  %22s\n", "-----", "------------", "");
      printf("# %5s        %5s  %5s  %22s\n", "apos",  "rfpos",  "apos",  " num common residues");
      printf("# %5s        %5s  %5s  %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else {
      printf("# %5s        %5s  %22s\n", "msa 1", "msa 2", "");
      printf("# %5s        %5s  %22s\n", "-----", "-----", "");
      printf("# %5s        %5s  %22s\n", "apos",  "apos",  " num common residues");
      printf("# %5s        %5s  %22s\n", "-----", "-----", "---------------------");
    }
  }

  /* traceback, and build one2two_map[] */
  apos1 = alen1;
  apos2 = alen2;
  tb_sc = 0;
  covered_cres1 = 0;
  ESL_ALLOC(one2two_map, sizeof(int) * (alen1+1));
  esl_vec_ISet(one2two_map, (alen1+1), 0);
  one2two_map[0] = -1; /* invalid */

  while(tb[apos1][apos2] != -1) {
    if(tb[apos1][apos2] == DIAG) { /* diagonal move: column apos1 of msa1 maps to column apos2 of msa2 */
      rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1];
      rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2];
      if(!be_quiet) {
        map_msas_print_pair((a2rf_map1 != NULL), (a2rf_map2 != NULL), rfpos1, apos1, rfpos2, apos2,
                            one2two[apos1][apos2], res1_per_apos[apos1]);
      }
      tb_sc += one2two[apos1][apos2];
      one2two_map[apos1] = apos2;
      if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */
      apos1--; apos2--;
    }
    else if(tb[apos1][apos2] == VERT) {
      apos2--; /* vertical move */
    }
    else if(tb[apos1][apos2] == HORZ) {
      apos1--; /* horizontal move */
    }
    else { /* shouldn't happen */
      ESL_XFAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]);
    }
  }
  /* done DP code
   **********************************/

  if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc);
  if(tb_sc != sc) ESL_XFAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc);
  /* guard the division the same way the RF coverage below is guarded */
  coverage = (total_res > 0) ? ((float) tb_sc / (float) total_res) : 0.0f;
  printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  /* print masks if nec */
  if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) goto ERROR;

  /* Hand the map to the caller, and drop our reference so the cleanup below
   * does not free it.
   */
  *ret_msa1_to_msa2_map = one2two_map;
  one2two_map = NULL;
  status = eslOK;
  /* deliberate fall through: success and failure share one cleanup path */

 ERROR:
  free(choices);
  map_msas_free_rows(mx, alen1+1);
  map_msas_free_rows(tb, alen1+1);
  map_msas_free_rows(one2two, alen1+1);
  free(res1_per_apos);
  free(rf2a_map1);
  free(rf2a_map2);
  free(a2rf_map1);
  free(a2rf_map2);
  free(seq1);
  free(seq2);
  free(one2two_map); /* NULL on success */
  return status;
}
Esempio n. 2
0
/* The "enumeration" test samples a random enumerable HMM (transitions to insert are 0,
 * so the generated seq space only includes seqs of L<=M).
 *
 * The test scores every sequence of length 0..M with both Viterbi and Forward,
 * converts each Forward score back to a probability P(seq | model), and fails
 * unless those probabilities sum to 1.0 +/- 0.001.
 *
 * The Viterbi score is only reported (with --vv); it is not compared against the
 * Forward score, and no sequence of length M+1 is scored.
 *
 * Because this function is going to work in unscaled probabilities, adding them up,
 * all P(seq) terms must be >> DBL_EPSILON.  That means M must be small; on the order
 * of <= 10.
 *
 * Args:   go  - options; "--vv" prints one line per enumerated sequence,
 *               "-v" prints the final probability sum
 *         r   - random number source used to sample the HMM
 *         abc - alphabet; abc->K symbols are enumerated at each position
 *         M   - length of the sampled HMM, and the longest sequence enumerated
 *
 * Any failure is fatal via esl_fatal().
 */
static void
utest_enumeration(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, int M)
{
  char            errbuf[eslERRBUFSIZE];
  P7_HMM         *hmm  = NULL;
  P7_PROFILE     *gm   = NULL;
  P7_BG          *bg   = NULL;
  ESL_DSQ        *dsq  = NULL;
  P7_GMX         *gx   = NULL;
  char           *seq  = NULL;
  float  vsc, fsc;              /* Viterbi and Forward scores */
  float  bg_ll;                 /* log P(seq | bg) */
  double fp;                    /* P(seq | model) */
  double total_p;               /* sum of fp over all enumerated sequences */
  int    L;                     /* length of the sequences currently being enumerated */
  int    i;                     /* position in dsq, 1..L */
  int    be_very_verbose;       /* TRUE to print each sequence and its scores */

  /* Sample an enumerable HMM & profile of length M.
   */
  if (p7_hmm_SampleEnumerable(r, M, abc, &hmm)      != eslOK) esl_fatal("failed to sample an enumerable HMM");
  if ((bg = p7_bg_Create(abc))                      == NULL)  esl_fatal("failed to create null model");
  if ((gm = p7_profile_Create(hmm->M, abc))         == NULL)  esl_fatal("failed to create profile");
  if (p7_ProfileConfig(hmm, bg, gm, 0, p7_UNILOCAL) != eslOK) esl_fatal("failed to config profile");
  if (p7_hmm_Validate    (hmm, errbuf, 0.0001)      != eslOK) esl_fatal("whoops, HMM is bad!: %s", errbuf);
  if (p7_profile_Validate(gm, errbuf, 0.0001)       != eslOK) esl_fatal("whoops, profile is bad!: %s", errbuf);

  if (  (dsq = malloc(sizeof(ESL_DSQ) * (M+3)))     == NULL)  esl_fatal("allocation failed");
  if (  (seq = malloc(sizeof(char)    * (M+2)))     == NULL)  esl_fatal("allocation failed");
  if ((gx     = p7_gmx_Create(hmm->M, M+3))         == NULL)  esl_fatal("matrix creation failed");

  /* The option does not change while we enumerate; look it up once. */
  be_very_verbose = esl_opt_GetBoolean(go, "--vv");

  /* Enumerate all sequences of length L <= M
   */
  total_p = 0;
  for (L = 0; L <= M; L++)
    {
      /* Initialize dsq of length L at 0000... */
      dsq[0] = dsq[L+1] = eslDSQ_SENTINEL;
      for (i = 1; i <= L; i++) dsq[i] = 0;

      while (1)                 /* enumeration of seqs of length L */
        {
          if (p7_GViterbi(dsq, L, gm, gx, &vsc)  != eslOK) esl_fatal("viterbi failed");
          if (p7_GForward(dsq, L, gm, gx, &fsc)  != eslOK) esl_fatal("forward failed");

          /* calculate bg log likelihood component of the scores */
          for (bg_ll = 0., i = 1; i <= L; i++)  bg_ll += log(bg->f[dsq[i]]);

          /* convert to a probability, adding the bg LL back to the LLR */
          fp = exp(fsc + bg_ll);

          if (be_very_verbose) {
            esl_abc_Textize(abc, dsq, L, seq);
            printf("probability of sequence: %10s   %16g  (lod v=%8.4f f=%8.4f)\n", seq, fp, vsc, fsc);
          }
          total_p += fp;

          /* Increment dsq like a reversed odometer: position 1 is the fastest digit.
           * A digit at its maximum rolls over to 0 and carries into the next position.
           */
          for (i = 1; i <= L; i++)
            if (dsq[i] < abc->K-1) { dsq[i]++; break; } else { dsq[i] = 0; }
          if (i > L) break;     /* every digit rolled over: we're done enumerating sequences */
        }
    }

  /* That sum is subject to significant numerical error because of
   * discretization error in FLogsum(); don't expect it to be too close.
   */
  if (total_p < 0.999 || total_p > 1.001) esl_fatal("Enumeration unit test failed: total Forward p isn't near 1.0 (%g)", total_p);
  if (esl_opt_GetBoolean(go, "-v")) {
    printf("enumeration test: total p is %g\n", total_p);
  }

  p7_gmx_Destroy(gx);
  p7_bg_Destroy(bg);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  free(dsq);
  free(seq);
}