Пример #1
0
/* Function: InitializeCP9Matrix()
 * Purpose:  Set all valid cells in a CP9 matrix to -INFTY
 * 
 * Return:   (void)
 */
void
InitializeCP9Matrix(CP9_MX *mx)
{
  esl_vec_ISet(mx->mmx_mem, mx->ncells_valid, -INFTY);
  esl_vec_ISet(mx->imx_mem, mx->ncells_valid, -INFTY);
  esl_vec_ISet(mx->dmx_mem, mx->ncells_valid, -INFTY);
  esl_vec_ISet(mx->elmx_mem, mx->ncells_valid, -INFTY);
  esl_vec_ISet(mx->erow, mx->rows, -INFTY);
  return;
}
Пример #2
0
/* The DChoose() and FChoose() unit tests.
 */
static void
utest_choose(ESL_RANDOMNESS *r, int n, int nbins, int be_verbose)
{
  double *pd = NULL;
  float  *pf = NULL;
  int    *ct = NULL;
  int     i;
  double  X2, diff, exp, X2p;

  if ((pd = malloc(sizeof(double) * nbins)) == NULL) esl_fatal("malloc failed"); 
  if ((pf = malloc(sizeof(float)  * nbins)) == NULL) esl_fatal("malloc failed");
  if ((ct = malloc(sizeof(int)    * nbins)) == NULL) esl_fatal("malloc failed");

  /* Sample a random multinomial probability vector.  */
  if (esl_dirichlet_DSampleUniform(r, nbins, pd) != eslOK) esl_fatal("dirichlet sample failed");
  esl_vec_D2F(pd, nbins, pf);

  /* Sample observed counts using DChoose(). */
  esl_vec_ISet(ct, nbins, 0);
  for (i = 0; i < n; i++)
    ct[esl_rnd_DChoose(r, pd, nbins)]++;

  /* X^2 test on those observed counts. */
  for (X2 = 0., i=0; i < nbins; i++) {
    exp = (double) n * pd[i];
    diff = (double) ct[i] - exp;
    X2 += diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed");
  if (be_verbose) printf("DChoose():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");

  /* Repeat above for FChoose(). */
  esl_vec_ISet(ct, nbins, 0);
  for (i = 0; i < n; i++)
    ct[esl_rnd_FChoose(r, pf, nbins)]++;
  for (X2 = 0., i=0; i < nbins; i++) {
    exp = (double) n * pd[i];
    diff = (double) ct[i] - exp;
    X2 += diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed");
  if (be_verbose) printf("FChoose():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");
  
  free(pd);
  free(pf);
  free(ct);
  return;
}
Пример #3
0
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 * 
 * Purpose:   Constructs a new alignment by removing near-identical 
 *            sequences from a given alignment (where identity is 
 *            calculated *based on the alignment*).
 *            Does not affect the given alignment.
 *            Keeps earlier sequence, discards later one. 
 *           
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *           
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *           
 * Return:    <eslOK> on success, and the <newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence 
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int     *list   = NULL;               /* array of seqs in new msa */
  int     *useme  = NULL;               /* TRUE if seq is kept in new msa */
  int      nnew;			/* number of seqs in new alignment */
  double   ident;                       /* pairwise percentage id */
  int      i,j;                         /* seqs counters*/
  int      remove;                      /* TRUE if sq is to be removed */
  int      status;
  
  /* Contract checks
   */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  /* allocate */
  ESL_ALLOC(list,  sizeof(int) * msa->nseq);
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);
  esl_vec_ISet(useme, msa->nseq, 0); /* initialize array */

  /* find which seqs to keep (list) */
  nnew = 0;
  for (i = 0; i < msa->nseq; i++)
    {
      remove = FALSE;
      for (j = 0; j < nnew; j++)
	{
	  if (! (msa->flags & eslMSA_DIGITAL)) {
	    if ((status = esl_dst_CPairId(msa->aseq[i], msa->aseq[list[j]], &ident, NULL, NULL))       != eslOK) goto ERROR;
	  } 
#ifdef eslAUGMENT_ALPHABET
	  else {
	    if ((status = esl_dst_XPairId(msa->abc, msa->ax[i], msa->ax[list[j]], &ident, NULL, NULL)) != eslOK) goto ERROR;
	  }
#endif
	  
	  if (ident > maxid)
	    { 
	      remove = TRUE; 
	      break; 
	    }
	}
      if (remove == FALSE) {
	list[nnew++] = i;
	useme[i]     = TRUE;
      }
    }
  if ((status = esl_msa_SequenceSubset(msa, useme, ret_newmsa)) != eslOK) goto ERROR;
 
  free(list);
  free(useme);
  return eslOK;

 ERROR:
  if (list  != NULL) free(list);
  if (useme != NULL) free(useme);
  return status;
}
Пример #4
0
/* map_sub_msas
 *                   
 * msa1 and msa2 contain the same named sequences, msa1 contains a superset 
 * of the columns in msa2. Determine which of the msa1 columns the msa2
 * columns correspond to.
 */
static int
map_sub_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, char **ret_msa1_to_msa2_mask)
{
  int status;
  int  apos1, apos2;          /* counters over alignment position in msa1, msa2 respectively */
  int i;
  int *msa1_to_msa2_map;    /* [0..apos1..msa1->alen] msa2 alignment position that apos1 corresponds to */
  char *mask;

  /* contract check */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa2 (%s) not digitized.\n", esl_opt_GetString(go, "--submap"));
  if(msa1->alen <= msa2->alen) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() alignment length for msa1 (%" PRId64 "d) <= length for msa2 (%" PRId64 ")\n", msa1->alen, msa2->alen);
  
  ESL_ALLOC(mask, sizeof(char) * (msa1->alen+1));
  for(apos1 = 0; apos1 < msa1->alen; apos1++) mask[apos1] = '0';
  mask[msa1->alen] = '\0';

  ESL_ALLOC(msa1_to_msa2_map, sizeof(int) * (msa1->alen+1));
  esl_vec_ISet(msa1_to_msa2_map, (msa1->alen+1), -1);

  /* both alignments must have same 'named' sequences in same order */
  if(msa1->nseq != msa2->nseq) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 has %d sequences, msa2 has %d sequences\n", msa1->nseq, msa2->nseq);
  for(i = 0; i < msa1->nseq; i++) { 
    if(strcmp(msa1->sqname[i], msa2->sqname[i]) != 0) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 seq %d is named %s, msa2 seq %d is named %s\n", i, msa1->sqname[i], i, msa2->sqname[i]);
  }

  apos1 = 1;
  apos2 = 1;
  while((apos2 <= msa2->alen) || (apos1 <= msa1->alen)) { /* determine which apos1 (alignment column in msa1), apos2 (alignment column in msa2) corresponds to */
    for(i = 0; i < msa1->nseq; i++) { 
      if(msa1->ax[i][apos1] != msa2->ax[i][apos2]) { 
	apos1++; 
	break; /* try next apos1 */ 
      }
    }	
    if(i == msa1->nseq) { /* found a match */
      msa1_to_msa2_map[apos1] = apos2;
      mask[(apos1-1)] = '1';
      apos1++;
      apos2++;
    }
  }
  if((apos1 != (msa1->alen+1)) || (apos2 != (msa2->alen+1))) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas(), failure mapping alignments, end of loop apos1-1 = %d (msa1->alen: %" PRId64 ") and apos2-1 = %d (msa2->alen: %" PRId64 ")\n", apos1-1, msa1->alen, apos2-1, msa2->alen);

  free(msa1_to_msa2_map);
  *ret_msa1_to_msa2_mask = mask;
  return eslOK;
  
 ERROR: 
  return status;
}
Пример #5
0
/* map_rfpos_to_apos
 *                   
 * Given an MSA, determine the alignment position each
 * reference position refers to. 
 */
static int map_rfpos_to_apos(ESL_MSA *msa, int **ret_rf2a_map, int **ret_a2rf_map, int *ret_rflen)
{
  int status;
  int rflen = 0;
  int *rf2a_map = NULL;
  int *a2rf_map = NULL;
  int rfpos = 0;
  int apos = 0;
  /* contract check */
  if(msa->rf == NULL) { status = eslEINVAL; goto ERROR; }

  /* count reference columns */
  for(apos = 1; apos <= msa->alen; apos++)
    if((! esl_abc_CIsGap(msa->abc, msa->rf[(apos-1)])) && 
       (! esl_abc_CIsMissing(msa->abc, msa->rf[(apos-1)])) && 
       (! esl_abc_CIsNonresidue(msa->abc, msa->rf[(apos-1)]))) rflen++;

  /* build maps */
  ESL_ALLOC(rf2a_map, sizeof(int) * (rflen+1));
  ESL_ALLOC(a2rf_map, sizeof(int) * (msa->alen+1));
  esl_vec_ISet(a2rf_map, msa->alen+1, -1);
  rf2a_map[0] = -1;
  for(apos = 1; apos <= msa->alen; apos++) { 
    if((! esl_abc_CIsGap(msa->abc, msa->rf[(apos-1)])) && 
       (! esl_abc_CIsMissing(msa->abc, msa->rf[(apos-1)])) && 
       (! esl_abc_CIsNonresidue(msa->abc, msa->rf[(apos-1)]))) { 
      rf2a_map[++rfpos] = apos;
      a2rf_map[apos]   = rfpos;
    }
    /* else a2rf_map[apos] remains -1 as it was initialized */
  }
  

  if(ret_rf2a_map != NULL) *ret_rf2a_map = rf2a_map;
  else                    free(rf2a_map);
  if(ret_a2rf_map != NULL) *ret_a2rf_map = a2rf_map;
  else                    free(a2rf_map);
  if(ret_rflen != NULL) *ret_rflen    = rflen;
  return eslOK;

 ERROR:
  if(rf2a_map != NULL) free(rf2a_map);
  if(a2rf_map != NULL) free(a2rf_map);
  return status;
}
Пример #6
0
/* get_gaps_per_column 
 *                   
 * Given an MSA, determine the number of gaps per
 * column, and return a newly allocated array with this
 * into in *ret_ngaps. 
 */
static int get_gaps_per_column(ESL_MSA *msa, int **ret_ngaps)
{
  int status;
  int i, apos;
  int *ngaps = NULL;
  /* contract check */
  if(! (msa->flags & eslMSA_DIGITAL)) { status = eslEINVAL; goto ERROR; }

  ESL_ALLOC(ngaps, sizeof(int) * (msa->alen+1));
  esl_vec_ISet(ngaps, msa->alen+1, 0);
  for(i = 0; i < msa->nseq; i++) {
    for(apos = 1; apos <= msa->alen; apos++)
      ngaps[apos] += esl_abc_XIsGap(msa->abc, msa->ax[i][apos]);
  }
  *ret_ngaps = ngaps;
  return eslOK;

 ERROR:
  if(ngaps != NULL) free(ngaps);
  return status;
}
Пример #7
0
/* The esl_random() unit test:
 * a binned frequency test.
 */
static void
utest_random(long seed, int n, int nbins, int be_verbose)
{
  ESL_RANDOMNESS *r      = NULL;
  int            *counts = NULL;
  double          X2p    = 0.;
  int             i;
  double          X2, exp, diff;

  if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed");
  esl_vec_ISet(counts, nbins, 0);

  /* This contrived call sequence exercises CreateTimeseeded() and
   * Init(), while leaving us a reproducible chain. Because it's
   * reproducible, we know this test succeeds, despite being
   * statistical in nature.
   */
  if ((r = esl_randomness_CreateTimeseeded()) == NULL)  esl_fatal("randomness create failed");
  if (esl_randomness_Init(r, seed)            != eslOK) esl_fatal("randomness init failed");

  for (i = 0; i < n; i++)
    counts[esl_rnd_Roll(r, nbins)]++;

  /* X^2 value: \sum (o_i - e_i)^2 / e_i */
  for (X2 = 0., i = 0; i < nbins; i++) {
    exp  = (double) n / (double) nbins;
    diff = (double) counts[i] - exp;
    X2 +=  diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed");
  if (be_verbose) printf("random():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");

  esl_randomness_Destroy(r);
  free(counts);
  return;
}
Пример #8
0
/* Function:  esl_msaweight_BLOSUM()
 * Synopsis:  BLOSUM weights.
 * Incept:    SRE, Sun Nov  5 09:52:41 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa> and an identity
 *            threshold <maxid>, calculate sequence weights using the
 *            BLOSUM algorithm (Henikoff and Henikoff, PNAS
 *            89:10915-10919, 1992). These weights are stored
 *            internally in the <msa> object, replacing any weights
 *            that may have already been there. Weights are $\geq 0$
 *            and they sum to <msa->nseq>.
 *            
 *            The algorithm does a single linkage clustering by
 *            fractional id, defines clusters such that no two clusters
 *            have a pairwise link $\geq$ <maxid>), and assigns
 *            weights of $\frac{1}{M_i}$ to each of the $M_i$
 *            sequences in each cluster $i$. The <maxid> threshold
 *            is a fractional pairwise identity, in the range
 *            $0..1$.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the pairwise identity
 *            calculations deal with degenerate residue symbols
 *            properly.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified. 
 *            
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence 
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      [Henikoff92]; squid::weight.c::BlosumWeights().
 */
int
esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
{
  int  *c    = NULL; /* cluster assignments for each sequence */
  int  *nmem = NULL; /* number of seqs in each cluster */
  int   nc;	     /* number of clusters  */
  int   i;           /* loop counter */
  int   status;

  /* Contract checks
   */
  ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  if ((status = esl_msacluster_SingleLinkage(msa, maxid, &c, NULL, &nc)) != eslOK) goto ERROR;
  ESL_ALLOC(nmem, sizeof(int) * nc);
  esl_vec_ISet(nmem, nc, 0);
  for (i = 0; i < msa->nseq; i++) nmem[c[i]]++;
  for (i = 0; i < msa->nseq; i++) msa->wgt[i] = 1. / (double) nmem[c[i]];

  /* Make weights normalize up to nseq, and return.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);	
  msa->flags |= eslMSA_HASWGTS;

  free(nmem);
  free(c);
  return eslOK;

 ERROR:
  if (c    != NULL) free(c);
  if (nmem != NULL) free(nmem);
  return status;
}
Пример #9
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  ESL_ALPHABET *abc     = NULL;	/* biological alphabet             */
  char         *alifile = NULL;	/* alignment file name             */
  int           fmt;		/* format code for alifiles        */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file             */
  ESL_MSA      *msa     = NULL;	/* multiple sequence alignment     */
  int           status;		/* easel return code               */

  int           do_info = TRUE;                /* TRUE if -i */
  int           do_max = FALSE;                /* TRUE if -x */
  int           do_ffreq = FALSE;              /* TRUE if --ffreq */
  int           do_fmin  = FALSE;              /* TRUE if --fmin */
  float         fthresh = 0.;                  /* <x> from -f <x> */
  int           do_remove_bps = FALSE;         /* TRUE if -r */
  int           do_consistent = FALSE;         /* TRUE if -c */
  int           do_indi2cons = FALSE;          /* TRUE if --indi <x> */
  int           have_cons;                     /* TRUE if first alignment has consensus sequence */
  int           do_newcons = FALSE;            /* TRUE if we're creating a new consensus structure
						* and outputing a new alignment (if -x -f -c or --indi)
						*/
  int           do_a = FALSE;                  /* TRUE if -a */
  char         *indi;                          /* for <x> from --indi <x> */
  int           nindi_read;                    /* number of individual sequence SS lines we've read for current alignment */

  int           a;		               /* counter over seqs               */
  int           i, i2;		               /* counter over residues */
  int           j, j2;		               /* counter over residues */
  int           nali;                          /* counter over alignments */
  int         **bp = NULL;                     /* bp[i][j] is number of individual bps exist between aln cols i and j */
  int          *cur_ct = NULL;                 /* ct array of basepairs for current sequence */
  int          *cons_ct = NULL;                /* ct array of basepairs for SS_cons being created */
  int          *xcons_ct = NULL;               /* ct array of basepairs for existing SS_cons */
  int          *ngaps = NULL;                  /* number of gaps in each alignment position */
  FILE         *ofp;		               /* output file (default is stdout) */
  int           be_verbose = FALSE;            /* TRUE to print extra info */
  int           seqthresh;                     /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/
  char         *sscons = NULL;                 /* the new SS_cons line */
  FILE         *lfp = NULL;                    /* file to list sequences with conflicting bps to */
  int           nlist = 0;                     /* number of sequences listed to list file */
  int          *nconflictsA;                   /* number of conflicting bps in seq a's individual structure annotation */
  int           nconflicts_total = 0;          /* total number of conflicts */
  int           nconflicts_list = 0;           /* total number of conflicts in sequences listed to file <x> from -l <x> */
  int           noverlaps_total = 0;           /* total number of overlaps */
  int           nconsistent_total = 0;         /* total number of consistent bps */
  int           nbps_total = 0;                /* total number of bps */
  int          *nconsistentA;                  /* number of consistent bps in seq a's individual structure annotation */
  int          *noverlapsA;                    /* number of bps in seq a's indi structure that overlap with consensus structure */
  int          *nbpsA;                         /* number of bps in seq a's indi structure that overlap with consensus structure */
  int           ncons_bps = 0;                 /* number of bps in consensus structure */
  int           max_noverlaps_aidx;
  int           max_nconsistent_aidx;
  int           max_nbps_aidx;
  int          *removebp;                      /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */
  int          *has_conflict;    
  int          *nmates_l2r;                    /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */
  int          *nmates_r2l;                    /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left  mates i for j */

  int           lmax;                          /* with -l, maximum number of conflicts to allow */
  int           namewidth = 18;                 /* length of 'SS_cons(consensus)' */
  char         *namedashes = NULL;             /* to store underline for seq name */

  /* --fmin related variables */
  int nbps = 0;
  int prev_nbps = -1;
  float fmin;
  int inconsistent_flag;
  int pknot_flag;
  int k,l;

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\nwhere basic options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\noptions for defining a new consensus structure (all of these require -o):");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\noptions for listing sequences based on structure:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile  = esl_opt_GetArg(go, 1);

  fmt = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--dna"))  abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))  abc = esl_alphabet_Create(eslRNA);

  if ( (status = esl_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
    esl_msafile_OpenFailure(afp, status);

  /* open output file */
  if (esl_opt_GetString(go, "-o") != NULL) {
    if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) 
	esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
  } else ofp = NULL;
  if (esl_opt_GetString(go, "-l") != NULL) { 
    if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) 
	esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l"));
  }

  /* determine if we're creating a structure */
  do_max = esl_opt_GetBoolean(go, "-x");
    
  if(!(esl_opt_IsDefault(go, "--ffreq"))) { 
    do_ffreq = TRUE; 
    fthresh = esl_opt_GetReal(go, "--ffreq"); 
  }
  if(!(esl_opt_IsDefault(go, "--fmin"))) { 
    do_fmin = TRUE; 
  }
  do_remove_bps = esl_opt_GetBoolean(go, "-r"); 
  do_consistent = esl_opt_GetBoolean(go, "-c");
  if(!(esl_opt_IsDefault(go, "--indi"))) { 
    do_indi2cons = TRUE; 
  }
  if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { 
    do_newcons = TRUE;
  }
  do_a = esl_opt_GetBoolean(go, "-a");
  if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { 
    do_info = FALSE;
  }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/
  nali = 0;
  have_cons = FALSE;
  lmax = esl_opt_GetInteger(go, "--lmax");
  if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; 

  while ((status = esl_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) esl_msafile_ReadFailure(afp, status);
      nali++;

      /* determine max length name */
      namewidth = 18; /* length of 'SS_cons(consensus)' */
      for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i]));
      if(namedashes != NULL) { free(namedashes); }
      ESL_ALLOC(namedashes, sizeof(char) * namewidth+1);
      namedashes[namewidth] = '\0';
      for(i = 0; i < namewidth; i++) namedashes[i] = '-';

      ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1));
      ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1));
      ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1));
      ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1));
      esl_vec_ISet(cur_ct, (msa->alen+1), 0);
      esl_vec_ISet(cons_ct, (msa->alen+1), 0);
      esl_vec_ISet(xcons_ct, (msa->alen+1), 0);
      esl_vec_ISet(removebp, (msa->alen+1), FALSE);
      esl_vec_ISet(has_conflict, (msa->alen+1), FALSE);
      esl_vec_ISet(nmates_l2r, (msa->alen+1), 0);
      esl_vec_ISet(nmates_r2l, (msa->alen+1), 0);

      ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq);
      ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq);
      ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq);
      ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq);
      esl_vec_ISet(nconflictsA, msa->nseq, 0);
      esl_vec_ISet(noverlapsA, msa->nseq, 0);
      esl_vec_ISet(nconsistentA, msa->nseq, 0);
      esl_vec_ISet(nbpsA, msa->nseq, 0);

      max_noverlaps_aidx = max_nconsistent_aidx = max_nbps_aidx = 0;
      nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0;
      for(i = 1; i <= msa->alen; i++) { 
	ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1));
	esl_vec_ISet(bp[i], (msa->alen+1), 0);
      }

      /* make sure we have ss_cons and indi ss if we need it */
      if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_max)             esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_consistent)      esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_indi2cons)       esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_ffreq)           esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali);
      if(msa->ss == NULL && do_fmin)            esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali);

      if(msa->ss_cons != NULL) { 
	if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { 
	  esl_fatal("Existing SS_cons for alignment %d is invalid.", nali);
	}
	ncons_bps = 0;
	for(i = 1; i <= msa->alen; i++) 
	  if(xcons_ct[i] != 0 && i < xcons_ct[i]) 
	    ncons_bps++;

	if(nali > 1 && !have_cons)
	  esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); 
	if(nali == 1) have_cons = TRUE;
      }
      else if (lfp != NULL) { 
	esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else if (do_remove_bps) { 
	esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else if (do_consistent) { 
	esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); 
      }
      else { 
	if(nali > 1 && have_cons)
	  esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, they all must.", nali); 
      }

      if(do_info) { 
	printf("# Per-sequence basepair information:\n"); 
	printf("# Alignment file: %s\n", alifile);
	printf("# Alignment idx:  %d\n", nali);
	if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); }
	if(have_cons) { 
	  printf("#\n");
	  printf("# indibp: number of basepairs in the individual sequence SS annotation\n");
	  printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n");
	  printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n");
	  printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n");
	  printf("#\n");
	  printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n");
	  printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n");
	  printf("#\n");
	  printf("# %-*s  %6s  %6s  %6s  %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct");
	  printf("# %-*s  %6s  %6s  %6s  %6s\n", namewidth, namedashes, "------", "------", "-----", "------");
	}
	else { 
	  printf("# %-*s  %6s\n", namewidth, "seqname", "nbp");
	  printf("# %-*s  %6s\n", namewidth, namedashes, "------");
	}
      }

      nindi_read = 0;
      for (a = 0; a < msa->nseq; a++) { 
	if(msa->ss != NULL && msa->ss[a] != NULL) { 
	  if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { 
	    esl_fatal("SS annotation for sequence %d, aln %d  is invalid.\n", (a+1), nali);
	  }
	  nindi_read++;
	  for(i = 1; i <= msa->alen; i++) { 
	    if(i < cur_ct[i]) { 
	      bp[i][cur_ct[i]]++;
	      if(bp[i][cur_ct[i]] == 1) { 
		nmates_l2r[i]++;
		nmates_r2l[cur_ct[i]]++;
	      }
	    }
	  }

	  for(i = 1; i <= msa->alen; i++) { 
	    if(cur_ct[i] != 0 && i < cur_ct[i]) { 
	      if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++;
	      if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { 
		if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); }
		nconflictsA[a]++;
		/* indi bp i:cur_ct[i] conflicts with i:xcons_ct[i] */
		removebp[i]           = TRUE;
		removebp[xcons_ct[i]] = TRUE;
	      }
	      else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { 
		if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); }
		nconflictsA[a]++;
		/* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */
		removebp[cur_ct[i]] = TRUE;
		removebp[xcons_ct[cur_ct[i]]] = TRUE;
	      }
	      else nconsistentA[a]++;
	    }		  
	  }
	  if(nconflictsA[a] > lmax) { 
	    if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); 
	    nconflicts_list += nconflictsA[a];
	    nlist++;
	  }
	  nbpsA[a] = nconflictsA[a] + nconsistentA[a];
	  nconflicts_total += nconflictsA[a];
	  nconsistent_total += nconsistentA[a];
	  noverlaps_total += noverlapsA[a];
	  nbps_total += nbpsA[a];

	  if(do_info && have_cons)  printf("  %-*s  %6d  %6d  %6d  %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); 
	  if(do_info && !have_cons) printf("  %-*s  %6d\n", namewidth, msa->sqname[a], nbpsA[a]);
	  if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a;
	  if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a;
	  if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a;
	}
	else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); }
      }

      if(do_info && have_cons) { 
	if(nindi_read > 0) printf("\n"); 
	printf("  %-*s  %6d  %6d  %6d  %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); 
	if(nindi_read > 0) { 
	  printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? (float) noverlaps_total / (float) nbps_total : 0.);
	  printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.);
	  printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.);
	}
	else { 
	  printf("# No sequences in the alignment have GR SS annotation.\n");
	}
      }

      if(lfp != NULL) { 
	printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); 
      }

      /* determine number of gaps per alignment column */
      if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR;

      /* -x: determine max bp structure OR
       * -a: list all conflicts in individual structures */
      if(do_max || do_a) { 
	for(i = 1; i <= msa->alen; i++) { 
	  if(nmates_l2r[i] > 1) {/* list the conflicts */
	    has_conflict[i] = TRUE;
	    for(j = 1; j <= msa->alen; j++) { 
	      if(bp[i][j] > 0) { 
		if(do_a) printf("More than 1 right mates for left  mate %4d   %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); 
		has_conflict[j] = TRUE;
	      }
	    }
	  }
	}
	for(i = 1; i <= msa->alen; i++) { 
	  if(nmates_r2l[i] > 1) {/* list the conflicts */
	    has_conflict[i] = TRUE;
	    for(j = 1; j <= msa->alen; j++) { 
	      if(bp[j][i] > 0) { 
		if(do_a) printf("More than 1 left  mates for right mate %4d   %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); 
		has_conflict[j] = TRUE;
	      }
	    }
	  }
	}
	for(i = 1; i <= msa->alen; i++) { 
	  /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/
	  if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { 
	    j = i+1; 
	    while(bp[i][j] == 0) j++;
	    cons_ct[i] = j;
	    cons_ct[j] = i;
	  }
	}

	/* remove pseudoknotted bps greedily */
	for(i = 1; i <= msa->alen; i++) { 
	  j = cons_ct[i]; 
	  if(j != 0 && i < j) { 
	    for(i2 = i+1; i2 <= msa->alen; i2++) { 
	      j2 = cons_ct[i2];
	      if(j2 != 0 && i2 < j2) { 
		if((i2 < j) && (j < j2)) { 
		  /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/
		  /* note: remove both if they have equal number of sequences */
		  if(bp[i][j] <= bp[i2][j2]) { 
		    /*printf("rm %4d:%4d\n", i, j);*/
		    cons_ct[cons_ct[i]] = 0;
		    cons_ct[i]          = 0;
		  }
		  if(bp[i][j] >= bp[i2][j2]) { 
		    /*printf("rm %4d:%4d\n", i2, j2);*/
		    cons_ct[cons_ct[i2]] = 0;
		    cons_ct[i2]          = 0;
		  }
		}
	      }
	    }
	  }
	}
      }
      
      /***************************************/
      /*PARANOID, second check for knots 
      for(i = 1; i <= msa->alen; i++) { 
	j = cons_ct[i]; 
	if(j != 0 && i < j) { 
	  printf("BP: %4d:%4d\n", i, j);
	  for(i2 = 1; i2 <= msa->alen; i2++) { 
	    j2 = cons_ct[i2];
	    if(j2 != 0 && i2 < j2) { 
	      if((i2 < j) && (j < j2)) { 
		if((i < i2)) { 
		  printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);
		}
	      }
	    }
	  }
	}
      }
      ******************************************/

      /***************************************/
      /*PARANOID, check cons_ct for consistency
      for(i = 1; i <= msa->alen; i++) { 
	if(cons_ct[i] != 0) { 
	  if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); }
	}
      }
      */
      /*PARANOID, write out SS_cons 
      for(i = 1; i <= msa->alen; i++) { 
	if(i < cons_ct[i]) printf("<"); 
	else if(cons_ct[i] != 0) { printf(">"); }
	else printf(".");
      }
      printf("\n");
      */
      /***************************************/

      /* textize alignment */
      if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); 

      /* --fmin */
      if(do_fmin) { 
	/* define ss_cons */
	prev_nbps = -1;
	fthresh = 0.99;
	inconsistent_flag = pknot_flag = FALSE;
	printf("# Defining consensus structure:\n");
	printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n");
	printf("# if > <x> individual SS contain i:j as a pair\n");
	printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n");
	printf("# A consistent structure has each position involved in 0 or 1 basepairs.\n");
	printf("#\n");
	printf("# Alignment file: %s\n", alifile);
	printf("# Alignment idx:  %d\n", nali);
	printf("# Number of seqs: %d\n", msa->nseq);
	printf("#\n");
	printf("# %5s  %23s  %6s\n", "<x>", "nseq-required-with-bp", "numbps");
	printf("# %5s  %23s  %6s\n", "-----", "-----------------------", "------");
	while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { 
	  nbps = 0;
	  seqthresh = (int) (fthresh * msa->nseq);
	  /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/
	  esl_vec_ISet(cons_ct, msa->alen+1, 0);
	  for(i = 1; i <= msa->alen; i++) { 
	    for(j = i+1; j <= msa->alen; j++) { 
	      if(bp[i][j] > seqthresh) { 
		if(cons_ct[i] != 0 || cons_ct[j] != 0) { 
		  inconsistent_flag = TRUE;
		}
		/* check for pseudoknots */
		for(k = i+1; k < j; k++) { 
		  l = cons_ct[k];
		  if((k < l) && (l > j)) { 
		    pknot_flag = TRUE;
		  }
		  if((k > l) && (l != 0) && (l < i)) { 
		    pknot_flag = TRUE;
		  }
		}
		cons_ct[i] = j;
		cons_ct[j] = i;
		nbps++;
	      }
	    }
	  }
	  if(inconsistent_flag) 
	    printf("  %.3f  %23d  %s\n", fthresh, seqthresh+1, "inconsistent");
	  else if(pknot_flag) 
	    printf("  %.3f  %23d  %s\n", fthresh, seqthresh+1, "pseudoknotted");
	  else { 
	    if(nbps != prev_nbps) { 
	      printf("  %.3f  %23d  %6d\n", fthresh, seqthresh+1, nbps);
	    }
	    fmin = fthresh;
	  }
	  fthresh -= 0.01;
	  prev_nbps = nbps;
	}
	fthresh = fmin;
	esl_vec_ISet(cons_ct, msa->alen+1, 0);
      }

      /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */
      if(do_ffreq || do_fmin) { 
	if(do_fmin)  { printf("#\n# <x> determined to be %.3f\n", fthresh); }
	if(do_ffreq) { 
	  printf("# Defining consensus structure:\n");
	  printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n");
	  printf("# if > %f individual SS contain i:j as a pair\n", fthresh);
	}
	esl_vec_ISet(cons_ct, msa->alen+1, 0);
	/* define ss_cons */
	  seqthresh = (int) (fthresh * msa->nseq);
	  /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/
	  for(i = 1; i <= msa->alen; i++) { 
	    for(j = i+1; j <= msa->alen; j++) { 
	      if(bp[i][j] > seqthresh) { 
		if(cons_ct[i] != 0) { 
		  esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j);
		}
		if(cons_ct[j] != 0) { 
		  esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j);
		}
		cons_ct[i] = j;
		cons_ct[j] = i;
	      }
	    }
	  }
	}

      /* -r: redefine consensus struct by removing any bps that conflict with individual structures */
      if(do_remove_bps) { 
	for(i = 1; i <= msa->alen; i++) { 
	  if(!(removebp[i])) {
	    cons_ct[i]          = xcons_ct[i];
	    cons_ct[cons_ct[i]] = i;
	  }
	  else {
	    printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]);
	    cons_ct[xcons_ct[i]] = 0; 
	    cons_ct[i]           = 0; 
	  }
	}
      }
      
      /* -c:     define consensus structure as indi sequence with highest number of consistent bps with structure  OR */
      /* --indi: define consensus structure as indi sequence <x> from --indi <x> */
      if(do_consistent || do_indi2cons) {
	if(do_indi2cons) { 
	  indi = esl_opt_GetString(go, "--indi");
	  for(a = 0; a < msa->nseq; a++) { 
	    if(strcmp(indi, msa->sqname[a]) == 0) break;
	  }
	  if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi);
	}
	else { /* do_consistent */
	  a = max_nconsistent_aidx;
	}
	if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]);
	if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { 
	  esl_fatal("Second pass... SS annotation for sequence %d, aln %d  is invalid.\n", (a), nali);
	}	
	printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]);
	if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) {
	  if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; }
	  if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR;
	  printf("# Defined new RF as %s sequence\n", msa->sqname[a]);
	}
      }
      
      /* write out alignment with new SS_cons */
      if(do_newcons) { 
	if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR;
	if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; }
	if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR;
	status = esl_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM));
	if      (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n");
	else if (status != eslOK)   esl_fatal("Writing alignment file failed with error %d\n", status);
      }
      
      free(sscons);
      free(cur_ct);
      free(cons_ct);
      free(xcons_ct);
      for(i = 1; i <= msa->alen; i++) free(bp[i]);
      free(bp);
      esl_msa_Destroy(msa);
    }
  if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);

  /* Cleanup, normal return
   */
  if(lfp != NULL) fclose(lfp);
  if(ofp != NULL) { 
    printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o"));
    fclose(ofp);
  }
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  if(afp) esl_msafile_Close(afp);
  if(go)  esl_getopts_Destroy(go);
  if(msa) esl_msa_Destroy(msa);
  if(lfp) fclose(lfp);
  if(ofp) fclose(ofp);
  esl_fatal("ERROR\n");
  return 1;
  
}
Пример #10
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS *go;		/* application configuration       */
  int          kstatus, tstatus;/* return code from Easel routine  */
  int          fmt;		/* expected format of kfile, tfile */
  char        *kfile, *tfile;   /* known, test structure file      */
  ESL_MSAFILE *kfp, *tfp;       /* open kfile, tfile               */
  ESL_MSA     *ka,  *ta; 	/* known, trusted alignment        */
  int64_t      klen, tlen;	/* lengths of dealigned seqs       */
  int          i;		/* counter over sequences          */
  int          apos;		/* counter over alignment columns  */
  int          rfpos;		/* counter over consensus (non-gap RF) columns  */
  int       is_rfpos;            /* TRUE if current apos is a consensus pos, FALSE if not */
  int          uapos;		/* counter over unaligned residue positions */
  int          nali;            /* number of alignment we're on in each file */

  int        **kp;              /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */
  int        **tp;              /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */
  /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert'
   * after non-gap RF position (x * -1) 
   */
  int        *km_pos;          /* [0..rflen] = x, in known aln,     number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int        *ki_pos;          /* [0..rflen] = x, in known aln,     number of residues inserted after non-gap RF column x */
  int        *tm_pos;          /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int        *ti_pos;          /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */
  int    *cor_tm_pos;          /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */
  int    *cor_ti_pos;          /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */

  int        *km_seq;          /* [0..i..nseq-1] = x, in known aln,     number of residues aligned to non-gap RF columns in seq i; */
  int        *ki_seq;          /* [0..i..nseq-1] = x, in known aln,     number of residues inserted in seq i */
  int        *tm_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */
  int        *ti_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */
  int    *cor_tm_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */
  int    *cor_ti_seq;          /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */

  int     *seqlen;             /* [0..i..nseq-1] = x, unaligned seq i has length x */
  ESL_ALPHABET *abc = NULL;    /* alphabet for all alignments */
  int      rflen, t_rflen;     /* non-gap RF length (consensus lengths) */
  int   status;
  char *namedashes;
  int ni;
  int namewidth = 8; /* length of 'seq name' */
  int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */
  char *mask = NULL;
  int masklen;
  ESL_DSQ *ks;
  ESL_DSQ *ts;
  FILE *dfp = NULL; /* for --c2dfile */

  /* variables needed for -p and related options */
  int do_post = FALSE; /* TRUE if -p enabled */
  int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */
  int p;               /* counter over integerized posteriors */
  int *ptm = NULL;     /* [0..p..10] number of total   matches with posterior value p (10="*")*/
  int *pti = NULL;     /* [0..p..10] number of total   inserts with posterior value p */
  int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */
  int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */
  int npostvals = 11;  /* number of posterior values 0-9, * */
  int ppidx;           /* index of PP */
  char ppchars[11] = "0123456789*";
  int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */
  // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti;       /* total counts of posteriors */
  // int tot_incor_ptm,tot_incor_pti;                      // SRE: commented out; don't seem to be used; need to silence compiler warning
  char errbuf[eslERRBUFSIZE];

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      exit(EXIT_SUCCESS);
    }

  if (esl_opt_ArgNumber(go) != 2) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  kfile = esl_opt_GetArg(go, 1);
  tfile = esl_opt_GetArg(go, 2);
  
  fmt = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the two Stockholm files.
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus);
  if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus);

  do_post = esl_opt_GetBoolean(go, "-p");

  /* read the mask file if --p-mask is enabled */
  if(! esl_opt_IsDefault(go, "--p-mask")) { 
    if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf);
  }
  /* open the c2dfile for output, if nec */
  if (esl_opt_IsOn(go, "--c2dfile")) { 
    if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile"));
  }

  /***********************************************
   * Do alignment comparisons, one seq at a time;
   * this means looping over all seqs in all alignments.
   ***********************************************/
  nali = 0;
  while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF)
    {
      if (  kstatus                               != eslOK) esl_msafile_ReadFailure(kfp, kstatus);
      if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus);

      nali++;
      if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); 

      /* Sanity check on alignment
       */
      if (ka->nseq != ta->nseq)
	esl_fatal("trusted, test alignments don't have same seq #\n");
      if (ka->rf == NULL)
	esl_fatal("trusted alignment has no reference annotation\n");
      if (ta->rf == NULL)
	esl_fatal("test alignment has no reference annotation\n");

      /* make sure the sequences are all identical */
      ESL_ALLOC(seqlen, sizeof(int) * ka->nseq);
      for(i = 0; i < ka->nseq; i++) { 
	if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); 
	ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2));
	memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ));
	esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen);

	ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2));
	memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ));
	esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen);

	if (tlen != klen)
	  esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in the trusted alignment.", (i+1), klen, tlen);

	if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) 
	  esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]);

	seqlen[i] = tlen;
	free(ks);
	free(ts);
      }

      /* determine non-gap RF length */
      rflen = 0;
      for(apos = 1; apos <= ka->alen; apos++) { 
	if((! esl_abc_CIsGap    (ka->abc, ka->rf[apos-1])) && 
	   (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++;
      }
      t_rflen = 0;
      for(apos = 1; apos <= ta->alen; apos++) { 
	if((! esl_abc_CIsGap       (ta->abc, ta->rf[apos-1])) && 
	   (! esl_abc_CIsMissing   (ta->abc, ta->rf[apos-1]))) t_rflen++;
      }
      if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen);

      /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */
      if(do_post) { 
	if(! esl_opt_IsDefault(go, "--p-mask")) {
	  if(masklen != rflen) { 
	    esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen);
	  }
	}
	if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists");
	ESL_ALLOC(ptm,     sizeof(int) * npostvals);
	ESL_ALLOC(pti,     sizeof(int) * npostvals);
	ESL_ALLOC(cor_ptm, sizeof(int) * npostvals);
	ESL_ALLOC(cor_pti, sizeof(int) * npostvals);
	esl_vec_ISet(ptm, npostvals, 0);
	esl_vec_ISet(pti, npostvals, 0);
	esl_vec_ISet(cor_ptm, npostvals, 0);
	esl_vec_ISet(cor_pti, npostvals, 0);
      }

      /* allocate and initialize our counters */
      ESL_ALLOC(kp, sizeof(int *) * ka->nseq);
      ESL_ALLOC(tp, sizeof(int *) * ta->nseq);
      for(i = 0; i < ka->nseq; i++) { 
	ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1));
	ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1));
	esl_vec_ISet(kp[i], seqlen[i]+1, -987654321);
	esl_vec_ISet(tp[i], seqlen[i]+1, -987654321);
      }

      ESL_ALLOC(km_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(tm_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1));
      ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1));
      esl_vec_ISet(km_pos, rflen+1, 0);
      esl_vec_ISet(ki_pos, rflen+1, 0);
      esl_vec_ISet(tm_pos, rflen+1, 0);
      esl_vec_ISet(ti_pos, rflen+1, 0);
      esl_vec_ISet(cor_tm_pos, rflen+1, 0);
      esl_vec_ISet(cor_ti_pos, rflen+1, 0);

      ESL_ALLOC(km_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq);
      ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq);
      esl_vec_ISet(km_seq, ka->nseq, 0);
      esl_vec_ISet(ki_seq, ka->nseq, 0);
      esl_vec_ISet(tm_seq, ka->nseq, 0);
      esl_vec_ISet(ti_seq, ka->nseq, 0);
      esl_vec_ISet(cor_tm_seq, ka->nseq, 0);
      esl_vec_ISet(cor_ti_seq, ka->nseq, 0);

      /* determine non-gap RF location of each residue in known alignment */
      for(i = 0; i < ka->nseq; i++) { 
	uapos = rfpos = 0;
	for(apos = 1; apos <= ka->alen; apos++) { 
	  is_rfpos = FALSE;
	  if((! esl_abc_CIsGap       (ka->abc, ka->rf[apos-1])) &&
	     (! esl_abc_CIsMissing   (ka->abc, ka->rf[apos-1]))) { 
	    rfpos++; is_rfpos = TRUE;
	  }
	  if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { 
	    uapos++;
	    kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos);
	    if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; }
	    else         { ki_pos[rfpos]++; ki_seq[i]++; }
	  }
	}
      }

      /* determine non-gap RF location of each residue in predicted alignment */
      for(i = 0; i < ta->nseq; i++) { 
	uapos = rfpos = 0;
	for(apos = 1; apos <= ta->alen; apos++) { 
	  is_rfpos = FALSE;
	  if((! esl_abc_CIsGap       (abc, ta->rf[apos-1])) && 
	     (! esl_abc_CIsMissing   (abc, ta->rf[apos-1]))) { 
	    rfpos++; is_rfpos = TRUE;
	    if(do_post) { 
	      do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE;
	    }
	  }
	  if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { 
	    uapos++;
	    tp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos);
	    if(do_post) { 
	      if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos);
	      ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]);
	      if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos);
	    }
	    if(is_rfpos) { 
	      tm_pos[rfpos]++; tm_seq[i]++; 
	      if(do_post_for_this_rfpos) ptm[ppidx]++;
	    }
	    else { 
	      ti_pos[rfpos]++; ti_seq[i]++; 
	      if(do_post) pti[ppidx]++;
	    }
	    if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */
	      if(is_rfpos) { 
		cor_tm_seq[i]++; cor_tm_pos[rfpos]++; 
		if(do_post_for_this_rfpos) cor_ptm[ppidx]++;
	      } 
	      else {
		cor_ti_seq[i]++; cor_ti_pos[rfpos]++; 
		if(do_post) cor_pti[ppidx]++;
	      } 
	    }
	  }
	}
      }
      if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { 
	/* print per sequence statistics */
	/* determine the longest name in msa */
	for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni]));
	ESL_ALLOC(namedashes, sizeof(char) * namewidth+1);
	namedashes[namewidth] = '\0';
	for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-';
	
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, "seq name", "len",    "match columns", "insert columns", "all columns");
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------");
	for(i = 0; i < ta->nseq; i++) { 
	  printf("  %-*s  %6d  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)\n", namewidth, ka->sqname[i], seqlen[i],
		 cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), 
		 cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. : ((float) cor_ti_seq[i] / (float) ki_seq[i]), 
		 (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); 
	}
	cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq);
	cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq);
	km = esl_vec_ISum(km_seq, ka->nseq);
	ki = esl_vec_ISum(ki_seq, ka->nseq);
	
	printf("# %-*s  %6s  %28s  %28s  %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------");
	printf("# %-*s  %6s  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)  %8d / %8d  (%.3f)\n",
	       namewidth, "*all*", "-", 
	       cor_tm, km, ((float) cor_tm / (float) km), 
	       cor_ti, ki, ((float) cor_ti / (float) ki), 
	       (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); 
	free(namedashes);
	for(i = 0; i < ka->nseq; i++) { 
	  free(kp[i]); 
	  free(tp[i]); 
	}
      }
      else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */
	printf("# %5s  %20s  %20s  %20s\n", "rfpos", "match", "insert", "both");
	printf("# %5s  %20s  %20s  %20s\n", "-----", "--------------------", "--------------------", "--------------------");
	for(rfpos = 0; rfpos <= rflen; rfpos++) { 
	  printf("  %5d  %4d / %4d  (%.3f)  %4d / %4d  (%.3f)  %4d / %4d  (%.3f)\n", rfpos, 
		 
		 cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), 
		 cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. : ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), 
		 (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); 
	}
      }
      else if(do_post) { /* do posterior output */
	if(mask == NULL) { 
	  printf("# %2s  %29s  %29s\n", "",   "      match columns          ", "      insert columns         ");
	  printf("# %2s  %29s  %29s\n", "",   "-----------------------------", "-----------------------------") ;
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "PP", "ncorrect", "ntotal",   "fractcor",  "ncorrect", "ntotal",   "fractcor");
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------");
	}
	else { 
	  printf("# %2s  %29s  %29s\n", "",   " match columns within mask   ", "      insert columns         ");
	  printf("# %2s  %29s  %29s\n", "",   "-----------------------------", "-----------------------------") ;
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "PP", "ncorrect", "ntotal",   "fractcor",  "ncorrect", "ntotal",   "fractcor");
	  printf("# %2s  %8s   %8s %9s  %8s   %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------");
	}
	cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0;
	//tot_ptm = esl_vec_ISum(ptm, npostvals);
	//tot_pti = esl_vec_ISum(pti, npostvals);
	//tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals);
	//tot_cor_pti = esl_vec_ISum(cor_pti, npostvals);
	//tot_incor_ptm = tot_ptm - tot_cor_ptm;
	//tot_incor_pti = tot_pti - tot_cor_pti;
	for(p = (npostvals-1); p >= 0; p--) { 
	  cm_cor_ptm += cor_ptm[p];
	  cm_cor_pti += cor_pti[p];
	  cm_ptm     += ptm[p];
	  cm_pti     += pti[p];
	  cm_incor_ptm += ptm[p] - cor_ptm[p];
	  cm_incor_pti += pti[p] - cor_pti[p];
	  printf("  %2c  %8d / %8d (%.5f)  %8d / %8d (%.5f)\n", 
		 ppchars[p], cor_ptm[p], ptm[p], 
		 (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], 
		 cor_pti[p], pti[p], 
		 (pti[p] == 0) ? 0. : (float) cor_pti[p] / (float) pti[p]);
	}
      }

      /* handle --c2dfile */
      if (dfp != NULL) { 
	/* match stats, 4 fields, CMYK color values */
	for(rfpos = 1; rfpos <= rflen; rfpos++) { 
	  if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	  }
	  else { 
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		    0., /* cyan */
		    1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */
		    1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */
		    0.);
	  }		 
	}	
	fprintf(dfp, "//\n");
	/* insert stats, 4 fields, CMYK color values */
	rfpos = 0; /* special case, combine insert posn 0 and 1 together */
	if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	  fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	}
	else { 
	  fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		  0., /* cyan */
		  1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */
		  0.,
		  0.);
	}
	/* insert stats posn 2..rflen */
	for(rfpos = 2; rfpos <= rflen; rfpos++) { 
	  if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.);
	  }
	  else { 
	    fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 
		    0., /* cyan */
		    1. - ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */
		    0.,
		    0.);
	  }
	} 
	fprintf(dfp, "//\n");
      }
      
      if(ptm != NULL) free(ptm);
      if(pti != NULL) free(pti);
      if(cor_ptm != NULL) free(cor_ptm);
      if(cor_ptm != NULL) free(cor_pti);
      free(kp);
      free(tp);
      free(km_seq);
      free(ki_seq);
      free(tm_seq);
      free(ti_seq);
      free(cor_tm_seq);
      free(cor_ti_seq);
      free(km_pos);
      free(ki_pos);
      free(tm_pos);
      free(ti_pos);
      free(cor_tm_pos);
      free(cor_ti_pos);
      free(seqlen);
      esl_msa_Destroy(ka);
      esl_msa_Destroy(ta);
    }

  if(mask != NULL) free(mask);
  if(dfp != NULL) { 
    fclose(dfp);
    printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile"));
  }
	   
  if(abc) esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  esl_msafile_Close(tfp);
  esl_msafile_Close(kfp);
  return 0;

 ERROR:
  return status;
}
Пример #11
0
/* map_new_msa()
 * 
 * Construct <inscount[0..M]>, <matuse[1..M]>, and <matmap[1..M]>
 * arrays for mapping model consensus nodes <1..M> onto columns
 * <1..alen> of a new MSA.
 * 
 * Here's the problem. We want to align the match states in columns,
 * but some sequences have inserted symbols in them; we need some
 * sort of overall knowledge of where the inserts are and how long
 * they are in order to create the alignment.
 * 
 * Here's our trick. inscount[] is a 0..M array; inserts[k] stores
 * the maximum number of times insert substate k was used. This
 * is the maximum number of gaps to insert between canonical 
 * column k and k+1.  inserts[0] is the N-term tail; inserts[M] is
 * the C-term tail.
 * 
 * Additionally, matuse[k=1..M] says whether we're going to make an
 * alignment column for consensus position k. By default this is
 * <TRUE> only if there is at least one residue in the column. If
 * the <p7_ALL_CONSENSUS_COLS> option flag is set, though, all
 * matuse[1..M] are set <TRUE>. (matuse[0] is unused, always <FALSE>.)
 * 
 * Then, using these arrays, we construct matmap[] and determine alen. 
 * If match state k is represented as an alignment column,
 * matmap[1..M] = that position, <1..alen>.
 * If match state k is not in the alignment (<matuse[k] == FALSE>),
 * matmap[k] = matmap[k-1] = the last alignment column that a match
 * state did map to; this is a trick to make some apos coordinate setting 
 * work cleanly. 
 * Because of this trick, you can't just assume because matmap[k] is 
 * nonzero that match state k maps somewhere in the alignment; 
 * you have to check matuse[k] == TRUE, then look at what matmap[k] says.
 * Remember that N and C emit on transition, hence the check for an
 * N->N or C->C transition before bumping nins. 
 * <matmap[0]> is unused; by convention, <matmap[0] = 0>.
 */
static int
map_new_msa(P7_TRACE **tr, int nseq, int M, int optflags, int **ret_inscount, 
	    int **ret_matuse, int **ret_matmap, int *ret_alen)
{
  int *inscount = NULL;	  /* inscount[k=0..M] == max # of inserts in node k */
  int *matuse   = NULL;	  /* matuse[k=1..M] == TRUE|FALSE: does node k map to an alignment column */
  int *matmap   = NULL;	  /* matmap[k=1..M]: if matuse[k] TRUE, what column 1..alen does node k map to */
  int  idx;		  /* counter over sequences */
  int  nins;		  /* counter for inserted residues observed */
  int  z;		  /* index into trace positions */
  int  alen;		  /* length of alignment */
  int  k;		  /* counter over nodes 1..M */
  int  status;
  
  ESL_ALLOC(inscount, sizeof(int) * (M+1));   
  ESL_ALLOC(matuse,   sizeof(int) * (M+1)); matuse[0] = 0;
  ESL_ALLOC(matmap,   sizeof(int) * (M+1)); matmap[0] = 0;
  esl_vec_ISet(inscount, M+1, 0);
  if (optflags & p7_ALL_CONSENSUS_COLS) esl_vec_ISet(matuse+1, M, TRUE); 
  else                                  esl_vec_ISet(matuse+1, M, FALSE);

  for (idx = 0; idx < nseq; idx++) 
    {
      nins = 0;
      k    = 0;
      for (z = 1; z < tr[idx]->N; z++) 
	{
	  switch (tr[idx]->st[z]) {
	  case p7T_I:                                nins++; break;
	  case p7T_N: if (tr[idx]->st[z-1] == p7T_N) nins++; break;
	  case p7T_C: if (tr[idx]->st[z-1] == p7T_C) nins++; break;
	    
	  case p7T_M: /* M,D: record max. reset ctr; M only: set matuse[] */
	    k             = tr[idx]->k[z];	/* k++ doesn't work. May be a B->X->Mk fragment entry */
	    inscount[k-1] = ESL_MAX(nins, inscount[k-1]); 
	    matuse[k]     = TRUE;
	    nins          = 0;
	    break;

	  case p7T_D: /* Can handle I->D transitions even though currently not in H3 models */
	    k             = tr[idx]->k[z];      /* k++ doesn't work; see above */
	    inscount[k-1] = ESL_MAX(nins, inscount[k-1]); 
	    nins          = 0;
	    break;

	  case p7T_T: /* T: record C-tail max, for a profile trace */
	    inscount[M] = ESL_MAX(nins, inscount[M]);
	    break;

	  case p7T_E: /* this handles case of core traces, which do have I_M state  */
	    inscount[k] = ESL_MAX(nins, inscount[k]); /* [M] doesn't work, because of {DMI}k->X->E frag exit */
	    break;

	  case p7T_B: /* B: record N-tail max for a profile trace; I0 for a core trace  */
	    inscount[0] = ESL_MAX(nins, inscount[0]); 
	    nins = 0;
	    break;

	  case p7T_S: break;	/* don't need to do anything on S,X states */
	  case p7T_X: break;

	  case p7T_J: p7_Die("J state unsupported");
	  default:    p7_Die("Unrecognized statetype %d", tr[idx]->st[z]);
	  }
	}
    }

  /* if we're trimming N and C off, reset inscount[0], inscount[M] to 0. */
  if (optflags & p7_TRIM) { inscount[0] = inscount[M] = 0; }
  
  /* Use inscount, matuse to set the matmap[] */
  alen      = inscount[0];
  for (k = 1; k <= M; k++) {
    if (matuse[k]) { matmap[k] = alen+1; alen += 1+inscount[k]; }
    else           { matmap[k] = alen;   alen +=   inscount[k]; }
  }

  *ret_inscount = inscount;
  *ret_matuse   = matuse;
  *ret_matmap   = matmap;
  *ret_alen     = alen;
  return eslOK;

 ERROR:
  if (inscount != NULL) free(inscount); 
  if (matuse   != NULL) free(matuse);
  if (matmap   != NULL) free(matmap);
  *ret_inscount = NULL;
  *ret_matuse   = NULL;
  *ret_matmap   = NULL;
  *ret_alen     = 0;
  return status;
}
Пример #12
0
/* Function:  esl_msaweight_PB()
 * Synopsis:  PB (position-based) weights.
 * Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
 *
 * Purpose:   Given a multiple alignment <msa>, calculate sequence
 *            weights according to the position-based weighting
 *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
 *            1994). These weights are stored internally in the <msa>
 *            object, replacing any weights that may have already been
 *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the algorithm
 *            deals with degenerate residue symbols properly.
 *            
 *            The Henikoffs' algorithm does not give rules for dealing
 *            with gaps or degenerate residue symbols. The rule here
 *            is to ignore them. This means that longer sequences
 *            initially get more weight; hence a "double
 *            normalization" in which the weights are first divided by
 *            sequence length in canonical residues (to compensate for
 *            that effect), then normalized to sum to nseq.
 *            
 *            An advantage of the PB method is efficiency.
 *            It is $O(1)$ in memory and $O(NL)$ time, for an alignment of
 *            N sequences and L columns. This makes it a good method 
 *            for ad hoc weighting of very deep alignments.
 *            
 *            When the alignment is in simple text mode, IUPAC
 *            degenerate symbols are not dealt with correctly; instead,
 *            the algorithm simply uses the 26 letters as "residues"
 *            (case-insensitively), and treats all other residues as
 *            gaps.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified. 
 *
 * Throws:    <eslEMEM> on allocation error, in which case <msa> is
 *            returned unmodified.
 *
 * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
 */
int
esl_msaweight_PB(ESL_MSA *msa)
{
  int    *nres = NULL;   	/* counts of each residue observed in a column */
  int     ntotal;		/* number of different symbols observed in a column */
  int     rlen;			/* number of residues in a sequence */
  int     idx, pos, i;
  int     K;			/* alphabet size */
  int     status;

  /* Contract checks
   */
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* Initialize
   */
  if (! (msa->flags & eslMSA_DIGITAL)) 
    { ESL_ALLOC(nres, sizeof(int) * 26);          K = 26;          }
#ifdef eslAUGMENT_ALPHABET
  else 
    { ESL_ALLOC(nres, sizeof(int) * msa->abc->K); K = msa->abc->K; }
#endif

  esl_vec_DSet(msa->wgt, msa->nseq, 0.);

  /* This section handles text alignments */
  if (! (msa->flags & eslMSA_DIGITAL)) 
    {
      for (pos = 0; pos < msa->alen; pos++)
	{
	  /* Collect # of letters A..Z in this column, and total */
	  esl_vec_ISet(nres, K, 0.);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (isalpha((int) msa->aseq[idx][pos]))
	      nres[toupper((int) msa->aseq[idx][pos]) - 'A'] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each seq by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (isalpha((int) msa->aseq[idx][pos]))
		msa->wgt[idx] += 1. / 
		  (double) (ntotal * nres[toupper((int) msa->aseq[idx][pos]) - 'A'] );
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++) {
	for (rlen = 0, pos = 0; pos < msa->alen; pos++) 
      	  if (isalpha((int) msa->aseq[idx][pos])) rlen++;
	if (ntotal > 0) msa->wgt[idx] /= (double) rlen;
	/* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
      }
    }

  /* This section handles digital alignments. */
#ifdef eslAUGMENT_ALPHABET
  else
    {
      for (pos = 1; pos <= msa->alen; pos++)
	{
	  /* Collect # of residues 0..K-1 in this column, and total # */
	  esl_vec_ISet(nres, K, 0.);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
	      nres[(int) msa->ax[idx][pos]] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each sequence by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
		msa->wgt[idx] += 1. / (double) (ntotal * nres[msa->ax[idx][pos]]);
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
	{
	  for (rlen = 0, pos = 1; pos <= msa->alen; pos++) 
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos])) rlen++;
	  if (rlen > 0) msa->wgt[idx] /= (double) rlen;
	  /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
	}
    }
#endif

  /* Make weights normalize up to nseq, and return.  In pathological
   * case where all wgts were 0 (no seqs contain any unambiguous
   * residues), weights become 1.0.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);	
  msa->flags |= eslMSA_HASWGTS;

  free(nres);
  return eslOK;

 ERROR:
  if (nres != NULL) free(nres);
  return status;
}
Пример #13
0
/* annotate_posterior_probability()
 * Synopsis:  Add posterior probability annotation lines to new MSA.
 */
static int
annotate_posterior_probability(ESL_MSA *msa, P7_TRACE **tr, const int *matmap, int M, int optflags)
{
  double *totp   = NULL;	/* total posterior probability in column <apos>: [0..alen-1] */
  int    *matuse = NULL;	/* #seqs with pp annotation in column <apos>: [0..alen-1] */
  int     idx;    		/* counter over sequences [0..nseq-1] */
  int     apos;			/* counter for alignment columns: pp's are [0..alen-1] (unlike ax) */
  int     z;			/* counter over trace positions [0..tr->N-1] */
  int     status;

  /* Determine if any of the traces have posterior probability annotation. */
  for (idx = 0; idx < msa->nseq; idx++)
    if (tr[idx]->pp != NULL) break;
  if (idx == msa->nseq) return eslOK;

  ESL_ALLOC(matuse, sizeof(double) * (msa->alen)); esl_vec_ISet(matuse, msa->alen, 0);
  ESL_ALLOC(totp,   sizeof(double) * (msa->alen)); esl_vec_DSet(totp,   msa->alen, 0.0);

  ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      if (tr[idx]->pp == NULL) { msa->pp[idx] = NULL; continue; }

      ESL_ALLOC(msa->pp[idx], sizeof(char) * (msa->alen+1));
      for (apos = 0; apos < msa->alen; apos++) msa->pp[idx][apos] = '.';
      msa->pp[idx][msa->alen] = '\0';

      apos = 0;
      for (z = 0; z < tr[idx]->N; z++)
	{
	  switch (tr[idx]->st[z]) {
	  case p7T_M: 
	    msa->pp[idx][matmap[tr[idx]->k[z]]-1] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);  
	    totp  [matmap[tr[idx]->k[z]]-1]+= tr[idx]->pp[z];
	    matuse[matmap[tr[idx]->k[z]]-1]++;
	  case p7T_D:
	    apos = matmap[tr[idx]->k[z]]; 
	    break;

	  case p7T_I:
	    if ( !(optflags & p7_TRIM) || (tr[idx]->k[z] != 0 && tr[idx]->k[z] != M)) {
	      msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);  
	      apos++;
	    }
	    break;

	  case p7T_N:
	  case p7T_C:
	    if (! (optflags & p7_TRIM) && tr[idx]->i[z] > 0) {
	      msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
	      apos++;
	    }
	    break;

	  case p7T_E:
	    apos = matmap[M];	/* set position for C-terminal tail */
	    break;
  
	  default:
	    break;
	  }
	}
    }
  for (; idx < msa->sqalloc; idx++) msa->pp[idx] = NULL; /* for completeness, following easel MSA conventions, but should be a no-op: nseq==sqalloc */

  /* Consensus posterior probability annotation: only on match columns */
  ESL_ALLOC(msa->pp_cons, sizeof(char) * (msa->alen+1));
  for (apos = 0; apos < msa->alen; apos++) msa->pp_cons[apos] = '.';
  msa->pp_cons[msa->alen] = '\0';
  for (apos = 0; apos < msa->alen; apos++)
    if (matuse[apos]) msa->pp_cons[apos] = p7_alidisplay_EncodePostProb( totp[apos] / (double) matuse[apos]);
  
  free(matuse);
  free(totp);
  return eslOK;

 ERROR:
  if (matuse  != NULL) free(matuse);
  if (totp    != NULL) free(totp);  
  if (msa->pp != NULL) esl_Free2D((void **) msa->pp, msa->sqalloc);
  return status;
}
Пример #14
0
/* map_msas
 *                   
 * Align msa1 and msa2.
 * For each column in msa1, determine the corresponding column
 * in msa2. This implementation requires:
 *  - msa1 and msa2 contain exactly the same sequences in the same order
 * Note: the seqs in msa1 and msa2 do not have to have the same names.
 *
 * Uses a DP algorithm similar to Needleman-Wunsch, but that's aligning
 * two alignment columns at a time instead of two residues. 
 */
static int
map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map)
{
  int status;
  int **one2two;              /* [0..c..rflen1][0..a..alen2] number of residues from non-gap RF column c of msa1
			       * aligned in column a of msa 2 */
  int *rf2a_map1 = NULL;       /* msa1 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa1->rf == NULL */
  int *rf2a_map2 = NULL;       /* msa2 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa2->rf == NULL */
  int *a2rf_map1 = NULL;       /* msa1 map of alignment columns to reference columns, NULL if msa1->rf == NULL */
  int *a2rf_map2 = NULL;       /* msa2 map of alignment columns to reference columns, NULL if msa2->rf == NULL */
  int apos1, apos2;           /* counters over alignment position in msa1, msa2 respectively */
  int alen1, alen2;           /* alignment lengths */
  int rfpos1, rfpos2;           /* counters over reference positions */
  int rflen1, rflen2;           /* reference (non-gap RF) lengths */
  int **mx;                   /* [0..c..rflen1][0..a..alen2] dp matrix, score of max scoring aln 
			       * from 1..c in msa1 and 1..a in msa 2 */
  int **tb;                   /* [0..c..rflen1][0..a..alen2] traceback ptrs, 0 for diagonal, 1 for vertical */
  char *seq1, *seq2;          /* temporary strings for ensuring dealigned sequences in msa1 and msa2 are identical */
  int64_t len1, len2;         /* length of seq1, seq2 */
  int isgap1, isgap2;         /* is this residue a gap in msa1, msa2? */
  int i;                      /* counter over sequences */
  int *res1_per_apos;         /* [0..apos..alen1] number of residues in column apos of msa1 */
  int sc;                     /* max score of full path (alignment) through dp mx */
  int tb_sc;                  /* score of traceback, should equal sc */
  int *one2two_map;           /* [0..a..alen1] the alignment, msa2 column that column apos1 in msa1 maps to */
  int total_res = 0;          /* total number of residues in msa1 */
  float coverage;             /* fraction of total_res that are within mapped msa2 columns from one2two_map, 
			       * this is tb_sc / total_res */
  int  total_cres1=0;         /* total number of residues in reference positions in msa1 */ 
  int  covered_cres1 = 0;     /* number of residues in reference positions in msa1 that also appear in the corresponding
			       * mapped column of msa2 
			       */
  int be_quiet = esl_opt_GetBoolean(go, "-q");
  int *choices;
  int i_choice;

  /* contract check */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2));
  alen1 = msa1->alen;
  alen2 = msa2->alen;
  
  /* Map msa1 (reference) columns to alignment positions */
  rflen1 = rflen2 = 0;
  if(msa1->rf != NULL) if((status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) goto ERROR;
  if(msa2->rf != NULL) if((status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) goto ERROR;
  if(! be_quiet) {
    printf("# %-25s alignment length:              %d\n", esl_opt_GetArg(go, 1), alen1);
    printf("# %-25s alignment length:              %d\n", esl_opt_GetArg(go, 2), alen2);
  }
  /* collect counts in one2two[i][j]: number of sequences for which residue aligned in msa1 non-gap column i
   * is aligned in msa2 alignment column j.
   */
  ESL_ALLOC(seq1, sizeof(char) * (alen1+1));
  ESL_ALLOC(seq2, sizeof(char) * (alen2+1));
  ESL_ALLOC(one2two, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) { 
    ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(one2two[apos1], (alen2+1), 0);
  }

  total_res = 0;
  for(i = 0; i < msa1->nseq; i++) { 
    /* ensure raw (unaligned) seq i in the 2 msas is the same */
    esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1); 
    esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */
    esl_strdealign(seq1, seq1, "-_.~", &len1);
    esl_strdealign(seq2, seq2, "-_.~", &len2);

    if(len1 != len2) { 
      ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n",
	       i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2);
    }
    if(strncmp(seq1, seq2, len1) != 0)  ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
    total_res += len1;
    
    apos1 = apos2 = 1;
    while((apos1 <= alen1) || (apos2 <= alen2)) {
      isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]);
      isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]);
      if      ( isgap1 &&  isgap2) { apos1++; apos2++; }
      else if ( isgap1 && !isgap2) { apos1++;          }
      else if (!isgap1 &&  isgap2) {          apos2++; }
      else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) { 
	one2two[apos1++][apos2++]++;
	/* two2one[apos2][apos1]++; */
      }
    }
  }

  /******************************************************************
   * DP alignment of msa1 to msa2
   * dp matrix: mx[apos1][apos2] apos1=1..msa->alen1, apos2=1..alen2 (apos1=0 || apos2=0 is invalid)
   * mx[apos1][apos2] = score of maximal alignment for apos1=1..apos1, apos2'=1..apos2 INCLUDING
   *                    apos1 and apos2. Score is number of residues from msa1 columns
   *                    1..apos1 that exist in their respective aligned columns in msa2 (the growing
   *                    maximally scoring alignment).
   */

  /******************************************************************
   * initialization 
   */
  ESL_ALLOC(mx, sizeof(int *) * (alen1+1));
  ESL_ALLOC(tb, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) { 
    ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1));
    ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(mx[apos1], (alen2+1), 0);
    esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */
    tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==1 */
  }
  esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==1 */
  tb[0][0] = -2; /* all alignments must end here */

  ESL_ALLOC(res1_per_apos, sizeof(int) * (alen1+1));
  esl_vec_ISet(res1_per_apos, (alen1+1), 0);
  mx[0][0] = 0;
  tb[0][0] = -1; /* last cell, special value */

  /*****************************************************************
   * recursion
   */
  ESL_ALLOC(choices, sizeof(int) * NCHOICES);
  for(apos1 = 1; apos1 <= alen1; apos1++) {
    for(apos2 = 1; apos2 <= alen2; apos2++) {
      choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2];
      choices[VERT] = mx[ apos1   ][(apos2-1)];
      choices[HORZ] = mx[(apos1-1)][ apos2   ];
      i_choice  = esl_vec_IArgMax(choices, NCHOICES);
      mx[apos1][apos2] = choices[i_choice];
      tb[apos1][apos2] = i_choice; 
      res1_per_apos[apos1] += one2two[apos1][apos2];
      /*printf("mx[%3d][%3d]: %5d (%d)\n", apos1, apos2, mx[apos1][apos2], tb[apos1][apos2]);*/
    }
  }
  free(choices);

  total_cres1 = 0;
  if(rf2a_map1 != NULL) { 
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]];
  }

  /*****************************************************************
   * traceback 
   */
  
  sc = mx[alen1][alen2];
  if(!be_quiet) {
    /* printf("score %d\n", sc);*/
    if(a2rf_map1 != NULL && a2rf_map2 != NULL) { 
      printf("# %12s       %12s  %22s\n", "   msa 1   ", "   msa 2   ", "");
      printf("# %12s       %12s  %22s\n", "------------", "------------", "");
      printf("# %5s  %5s       %5s  %5s  %22s\n", "rfpos",  "apos",  "rfpos",  "apos",  " num common residues");
      printf("# %5s  %5s       %5s  %5s  %22s\n", "-----", "-----", "-----", "-----", "---------------------");
    }
    else if(a2rf_map1 != NULL) { 
      printf("# %12s        %5s  %22s\n", "   msa 1   ", "msa 2", "");
      printf("# %12s        %5s  %22s\n", "------------", "-----", "");
      printf("# %5s  %5s       %5s  %22s\n", "rfpos",  "apos",  "apos",  " num common residues");
      printf("# %5s  %5s       %5s  %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else if (a2rf_map2 != NULL) { 
      printf("# %5s        %12s  %22s\n", "msa 1", "   msa 2   ", "");
      printf("# %5s        %12s  %22s\n", "-----", "------------", "");
      printf("# %5s        %5s  %5s  %22s\n", "apos",  "rfpos",  "apos",  " num common residues");
      printf("# %5s        %5s  %5s  %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else {
      printf("# %5s        %5s  %22s\n", "msa 1", "msa 2", "");
      printf("# %5s        %5s  %22s\n", "-----", "-----", "");
      printf("# %5s        %5s  %22s\n", "apos",  "apos",  " num common residues");
      printf("# %5s        %5s  %22s\n", "-----", "-----", "---------------------");
    }
  }

  /* traceback, and build one2two_map[] */
  apos1 = alen1;
  apos2 = alen2;
  tb_sc = 0;
  covered_cres1 = 0;
  ESL_ALLOC(one2two_map, sizeof(int) * (alen1+1));
  esl_vec_ISet(one2two_map, (alen1+1), 0);
  one2two_map[0] = -1; /* invalid */

  while(tb[apos1][apos2] != -1) {
    if(tb[apos1][apos2] == DIAG) { /* diagonal move */
      rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1];
      rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2];
      if(!be_quiet) { 
	if(a2rf_map1 != NULL && a2rf_map2 != NULL) { 
	  if(rfpos1 == -1 && rfpos2 == -1) { 
	    printf("  %5s  %5d  -->  %5s  %5d  %5d / %5d (%.4f)\n", "-",    apos1, "-",    apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else if (rfpos1 == -1) { 
	    printf("  %5s  %5d  -->  %5d  %5d  %5d / %5d (%.4f)\n", "-",    apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else if (rfpos2 == -1) { 
	    printf("  %5d  %5d  -->  %5s  %5d  %5d / %5d (%.4f)\n", rfpos1, apos1, "-",    apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else { 
	    printf("  %5d  %5d  -->  %5d  %5d  %5d / %5d (%.4f)\n", rfpos1, apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	}	
	else if(a2rf_map1 != NULL) { 
	  if (rfpos1 == -1) { 
	    printf("  %5s  %5d  -->  %5d  %5d / %5d (%.4f)\n", "-",   apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else { 
	    printf("  %5d  %5d  -->  %5d  %5d / %5d (%.4f)\n", rfpos1, apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	}
	else if (a2rf_map2 != NULL) { 
	  if (rfpos2 == -1) { 
	    printf("  %5d  -->  %5s  %5d  %5d / %5d (%.4f)\n", apos1, "-",    apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else { 
	    printf("  %5d  -->  %5d  %5d  %5d / %5d (%.4f)\n", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	}
	else {
	  printf("  %5d  -->  %5d  %5d / %5d (%.4f)\n", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	}
      }
      tb_sc += one2two[apos1][apos2];
      one2two_map[apos1] = apos2;
      if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */
      apos1--; apos2--;
    }
    else if(tb[apos1][apos2] == VERT) { 
      apos2--; /* vertical move */
    }
    else if(tb[apos1][apos2] == HORZ) { 
      apos1--; /* horizontal move */
    }
    else if(tb[apos1][apos2] != -1) /* shouldn't happen */
      ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]);
  }
  /* done DP code 
   **********************************/

  if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc);
  if(tb_sc != sc) ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc);
  coverage = (float) tb_sc / (float) total_res;
  printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  /* print masks if nec */
  if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) return status;

  /* clean up and return */
  for(apos1 = 0; apos1 <= alen1; apos1++) { 
    free(mx[apos1]);
    free(tb[apos1]);
  }
  free(mx);
  free(tb);

  for(apos1 = 0; apos1 <= alen1; apos1++) free(one2two[apos1]);
  free(one2two);
  free(res1_per_apos);
  if(rf2a_map1 != NULL) free(rf2a_map1);
  if(rf2a_map2 != NULL) free(rf2a_map2);
  if(a2rf_map1 != NULL) free(a2rf_map1);
  if(a2rf_map2 != NULL) free(a2rf_map2);

  free(seq1);
  free(seq2);
  *ret_msa1_to_msa2_map = one2two_map;
  return eslOK;
  
 ERROR: 
  return status;
}
Пример #15
0
static int
map_alignment(const char *msafile, const P7_HMM *hmm, ESL_SQ ***ret_sq, P7_TRACE ***ret_tr, int *ret_ntot)
{
  ESL_SQ       **sq        = NULL;
  P7_TRACE     **tr        = NULL;
  ESLX_MSAFILE  *afp       = NULL;
  ESL_MSA       *msa       = NULL;
  ESL_ALPHABET  *abc       = (ESL_ALPHABET *) hmm->abc; /* removing const'ness to make compiler happy. Safe. */
  int           *matassign = NULL;
  uint32_t       chksum    = 0;
  int            i,k;
  int            status;

  status = eslx_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  status = eslx_msafile_Read(afp, &msa);
  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

  if (! (hmm->flags & p7H_CHKSUM)  )  esl_fatal("HMM has no checksum. --mapali unreliable without it.");
  if (! (hmm->flags & p7H_MAP)  )     esl_fatal("HMM has no map. --mapali can't work without it.");
  esl_msa_Checksum(msa, &chksum);
  if (hmm->checksum != chksum)        esl_fatal("--mapali MSA %s isn't same as the one HMM came from (checksum mismatch)", msafile);

  ESL_ALLOC(sq, sizeof(ESL_SQ *)   * msa->nseq);
  ESL_ALLOC(tr, sizeof(P7_TRACE *) * msa->nseq);
  ESL_ALLOC(matassign, sizeof(int) * (msa->alen + 1));

  esl_vec_ISet(matassign, msa->alen+1, 0);
  for (k = 1; k <= hmm->M; k++) matassign[hmm->map[k]] = 1;

  p7_trace_FauxFromMSA(msa, matassign, p7_DEFAULT, tr);

  /* The 'faux' core traces constructed by FauxFromMSA() may contain
   * D->I and I->D transitions.  They may *only* now be passed to
   * p7_tracealign_Seqs(), which can deal with these 'illegal'
   * transitions, in order to exactly reproduce the input --mapali
   * alignment.
   */

  for (i = 0; i < msa->nseq; i++)
    esl_sq_FetchFromMSA(msa, i, &(sq[i]));
      
  *ret_ntot = msa->nseq;
  *ret_tr   = tr;
  *ret_sq   = sq;

  eslx_msafile_Close(afp);
  esl_msa_Destroy(msa);
  free(matassign);
  return eslOK;

 ERROR:
  *ret_ntot = 0;
  *ret_tr   = NULL;
  *ret_sq   = NULL;
  if (afp       != NULL) eslx_msafile_Close(afp);
  if (msa       != NULL) esl_msa_Destroy(msa);
  if (matassign != NULL) free(matassign);
  return status;
}
Пример #16
0
/* dump_posterior_sequence_info
 *                   
 * Dump per-sequence posterior probability data to a file.
 *
 */      
static int dump_posterior_sequence_info(FILE *fp, ESL_MSA *msa, int nali, char *alifile, char *errbuf)
{
  int    i,p,apos;     /* counters over sequences, columns of MSA */
  int    ppidx;
  int    nppvals = 12;
  int    nnongap;
  double sum;
  float ppavgA[11];
  char ppstring[12] = "0123456789*.";
  int seq_pp_ct[12];

  ppavgA[0]  = 0.025;  
  ppavgA[1]  = 0.10;
  ppavgA[2]  = 0.20;
  ppavgA[3]  = 0.30;
  ppavgA[4]  = 0.40;
  ppavgA[5]  = 0.50;
  ppavgA[6]  = 0.60;
  ppavgA[7]  = 0.70;
  ppavgA[8]  = 0.80;
  ppavgA[9]  = 0.90;
  ppavgA[10] = 0.975;

  fprintf(fp, "# Posterior probability stats per sequence:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);
  fprintf(fp, "# %7s  %-40s  %7s", "seqidx", "seqname", "nnongap");
  for(p = 0; p < nppvals-1; p++) {  /* don't include gaps in per-sequence output */
    fprintf(fp, "  %7c", ppstring[p]);
  }
  fprintf(fp, "  %7s\n", "avgPP");

  fprintf(fp, "# %7s  %40s  %7s", "-------", "----------------------------------------", "-------");
  for(p = 0; p < nppvals-1; p++) { /* don't include gaps in per-sequence output */
    fprintf(fp, "  %7s", "-------");
  }
  fprintf(fp, "  %7s\n", "-------");

  for(i = 0; i < msa->nseq; i++) { 
    if(msa->pp[i] != NULL) { 
      fprintf(fp, "  %7d  %-40s", i+1, msa->sqname[i]);
      sum = 0.;
      esl_vec_ISet(seq_pp_ct, nppvals, 0);
      for(apos = 0; apos < msa->alen; apos++) { 
	if((ppidx = get_pp_idx(msa->abc, msa->pp[i][apos])) == -1) ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]);
	seq_pp_ct[ppidx]++;
      }
      nnongap = esl_vec_ISum(seq_pp_ct, 11);
      fprintf(fp, "  %7d", nnongap);
      for(p = 0; p < nppvals-1; p++) { /* don't include gaps in per-sequence output */
	fprintf(fp, "  %7d", seq_pp_ct[p]);
	if(p <= 10) sum += (float) seq_pp_ct[p] * ppavgA[p];
      }
      fprintf(fp, "  %.5f\n", sum / (float) nnongap);
    }
  }
  fprintf(fp, "//\n");

  return eslOK;
}
Пример #17
0
void
AllocCPlan9Body(CP9_t *hmm, int M, const ESL_ALPHABET *abc)
{
    int status;
    int k, x;

    hmm->abc = abc;

    hmm->M = M;

    ESL_ALLOC(hmm->t,   (M+1) *           sizeof(float *));
    ESL_ALLOC(hmm->mat, (M+1) *           sizeof(float *));
    ESL_ALLOC(hmm->ins, (M+1) *           sizeof(float *));
    ESL_ALLOC(hmm->t[0],(cp9_NTRANS*(M+1)) *  sizeof(float));
    ESL_ALLOC(hmm->mat[0],(abc->K*(M+1))   * sizeof(float));
    ESL_ALLOC(hmm->ins[0],(abc->K*(M+1))   * sizeof(float));

    ESL_ALLOC(hmm->tsc, cp9_NTRANS     *   sizeof(int *));
    ESL_ALLOC(hmm->msc, hmm->abc->Kp   *   sizeof(int *));
    ESL_ALLOC(hmm->isc, hmm->abc->Kp   *   sizeof(int *));
    ESL_ALLOC(hmm->tsc_mem,(cp9_NTRANS*(M+1))     *       sizeof(int));
    ESL_ALLOC(hmm->msc_mem,(hmm->abc->Kp*(M+1)) * sizeof(int));
    ESL_ALLOC(hmm->isc_mem,(hmm->abc->Kp*(M+1)) * sizeof(int));

    hmm->tsc[0] = hmm->tsc_mem;
    hmm->msc[0] = hmm->msc_mem;
    hmm->isc[0] = hmm->isc_mem;

    /* transition scores reordered */
    ESL_ALLOC(hmm->otsc, sizeof(int)   * (M+1)  * cp9O_NTRANS);

    /* note allocation strategy for important 2D arrays -- trying
     * to keep locality as much as possible, cache efficiency etc.
     */
    for (k = 1; k <= M; k++) {
        hmm->mat[k] = hmm->mat[0] + k * abc->K;
        hmm->ins[k] = hmm->ins[0] + k * abc->K;
        hmm->t[k]   = hmm->t[0]   + k * cp9_NTRANS;
    }
    for (x = 1; x < hmm->abc->Kp; x++) {
        hmm->msc[x] = hmm->msc[0] + x * (M+1);
        hmm->isc[x] = hmm->isc[0] + x * (M+1);
    }
    for (x = 0; x < cp9_NTRANS; x++)
        hmm->tsc[x] = hmm->tsc[0] + x * (M+1);

    /* tsc[x][0] is used as a boundary condition sometimes [Viterbi()],
     * so set to -inf always.
     */
    for (x = 0; x < cp9_NTRANS; x++)
        hmm->tsc[x][0] = -INFTY;

    ESL_ALLOC(hmm->begin, (M+1) * sizeof(float));
    ESL_ALLOC(hmm->end,   (M+1) * sizeof(float));

    ESL_ALLOC(hmm->bsc_mem, (M+1) * sizeof(int));
    ESL_ALLOC(hmm->esc_mem, (M+1) * sizeof(int));

    ESL_ALLOC(hmm->null, (abc->K) * sizeof(float));

    hmm->bsc = hmm->bsc_mem;
    hmm->esc = hmm->esc_mem;

    /* end[0], begin[0], esc[0] and bsc[0] are never
     * used, set them to 0. and -INFTY */
    hmm->end[0] = hmm->begin[0] = -INFTY;
    hmm->esc[0] = hmm->bsc[0] = -INFTY;

    ESL_ALLOC(hmm->has_el,     (M+1) * sizeof(int));
    ESL_ALLOC(hmm->el_from_ct, (M+2) * sizeof(int));
    ESL_ALLOC(hmm->el_from_idx,(M+2) * sizeof(int *));
    ESL_ALLOC(hmm->el_from_cmnd,(M+2) * sizeof(int *));
    esl_vec_ISet(hmm->has_el,     M+1, FALSE);
    esl_vec_ISet(hmm->el_from_ct, M+1, 0);
    for(k = 0; k <= M+1; k++) {
        hmm->el_from_idx[k] = NULL;
        hmm->el_from_cmnd[k] = NULL;
    }

    return;
ERROR:
    cm_Fail("Memory allocation error.");
}
Пример #18
0
/* Function: cm_tr_penalties_Create()
 * Date:     EPN, Sat Jan 21 12:03:52 2012
 *
 * Purpose:  Allocate and initialize a CM_TR_PENALTIES object.
 *           A CM and its emit map are required to determine
 *           truncation penalty scores. This is annoyingly
 *           complex, see verbose notes within code below.
 * 
 *           Some of the code in this function, specifically
 *           that which calculates the probability of a fragment
 *           aligning at a given node, is checkable, but only
 *           if we disallow truncated begins into insert states.
 *           However, we want to allow truncated begins in reality.
 *           I've left in a flag for ignoring inserts (<ignore_inserts>)
 *           I used in testing this function. Set it to TRUE to 
 *           perform the test. 
 *
 * Returns:  Newly allocated CM_TR_PENALTIES object. NULL if out
 *           of memory.
 */
CM_TR_PENALTIES *
cm_tr_penalties_Create(CM_t *cm, int ignore_inserts, char *errbuf)
{
  int status;
  int v, nd, m, i1, i2;
  int lpos, rpos;
  int i;

  /* variables used for determining ratio of inserts to match at each consensus position */
  float   *mexpocc  = NULL;  /* [0..c..clen] probability match  state is used to emit at cons posn c */     
  float   *iexpocc  = NULL;  /* [0..c..clen] probability insert state is used to emit after cons posn c */     
  double  *psi      = NULL;  /* [0..v..M-1]  expected occupancy of state v */
  float    m_psi, i1_psi, i2_psi; /* temp psi values */
  float    summed_psi; 
  CM_TR_PENALTIES *trp = NULL;

  /* variables used for calculating global truncation penalties */
  float g_5and3; /* fragment probability if 5' and 3' truncation are allowed */
  float g_5or3;  /* fragment probability if 5' or  3' truncation are allowed */

  /* variables used for calculating local truncation penalties */
  float *begin = NULL;           /* local begin probabilities 0..v..M-1 */
  int   subtree_clen;            /* consensus length of subtree under this node */
  float prv53, prv5, prv3;       /* previous node's fragment probability, 5'&3', 5' only, 3'only */
  float cur53, cur5, cur3;       /* current node's fragment probability, 5'&3', 5' only, 3'only */
  int   nfrag53, nfrag5, nfrag3; /* number of fragments, 5'&3', 5' only, 3'only */

  if(cm == NULL || cm->emap == NULL) goto ERROR;

  ESL_ALLOC(trp, sizeof(CM_TR_PENALTIES));

  trp->M = cm->M;
  trp->ignored_inserts = ignore_inserts;

  /* Define truncation penalties for each state v. This will be 
   * the score for doing a truncated begin into state v.
   * 
   * Important note: For this discussion we assume that sequences can
   * only be truncated at consensus positions, which means we don't
   * have to worry about truncated begins into inserts. This is an
   * approximation (also made by Diana and Sean in the 2009 trCYK
   * paper) that greatly simplifies the explanation of the calculation
   * of the truncation penalties.  The examples in my ELN3 notebook
   * also use this simplification. However, I need to be able to do
   * truncated begins into insert states in some cases (some pass/mode
   * combinations see ELN bottom of p.47). I explain first the
   * rationale for calculating truncation penalties ignoring inserts
   * and then I describe how I adapt those penalties to allow
   * for inserts. 
   * 
   * This is a lengthy comment. I've divided it into 3 sections:
   * Section 1. Global mode truncation penalties, ignoring inserts.
   * Section 2. Local mode truncation penalties, ignoring inserts.
   * Section 3. Adapting truncation penalties to allow for inserts.
   *
   **************************************************************
   * Section 1. Global mode truncation penalties, ignoring inserts.
   *
   * We want the truncation penalty to be the log of the probability
   * that the particular fragment we're aligning was generated from
   * the following generative process. The generative process differs
   * between global and local mode. 
   * 
   * In global mode: 
   * o Sample global parsetree which spans consensus positions 1..clen.
   * o Randomly choose g and h in range 1..clen, where h >= g and
   *   truncate sequence from g..h. The first residue will either be
   *   an insert before position g, or a match at position g of the 
   *   model. The final residue will either be an insert after position
   *   h or a match at position h of the model.
   * 
   * All g,h fragments are equiprobable, so the probability of any
   * particular fragment is 2 / (clen * (clen+1)). So log_2 of this
   * value is the truncation penalty for all truncated alignments in
   * global mode where both 5' and 3' truncation are allowed. 
   * 
   * We store this penalty, per-state in the
   * g_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1].  The penalty is
   * identical for all emitting states. The penalty value for
   * non-emitters is IMPOSSIBLE because truncated begins are 
   * not allowed into non-emitters. 
   * 
   * If only 5' OR 3' truncation is allowed, we only truncate at g or
   * h, which menas there's 1/clen possible fragments and log_2
   * (1/clen) is our global truncation penalty. 
   * 
   * However, if 5' truncation is allowed we can only do a truncated
   * begin into states that with a consensus subtree that spans
   * position clen (since we don't allow a truncation at the 3' end).
   * Thus any state whose subtree that doesn't span clen gets
   * an IMPOSSIBLE value for its truncation score in:
   * g_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1].
   * 
   * Likewise, if 3' truncation is allowed we can only do a truncated
   * begin into states that with a consensus subtree that spans
   * position 1 (since we don't allow a truncation at the 5' end).
   *
   * There's an example of computing all three types of penalties for
   * a simple CM in ELN 3 p43.
   * 
   ************************************************************
   * Section 2. Local mode truncation penalties, ignoring inserts.
   * 
   * Generative process that generates fragments in local mode:
   * o Sample local begin state b with consensus subtree from i..j from
   *   local begin state distribution.
   * o Randomly choose g and h in range i..j, where h >= g and
   *   truncate sequence from g..h. The first residue will either be
   *   an insert before position g, or a match at position g of the 
   *   model. The final residue will either be an insert after position
   *   h or a match at position h of the model.
   * 
   * Unlike in global mode, in local mode all fragments are not
   * equiprobable since the local begin state distribution can be
   * anything, and each b allows different sets of fragments to be
   * generated (because they can only span from i to j).
   * 
   * The truncation penalty should be the log of the probability of
   * aligning the current fragment to the model. So we need to know 
   * the probability of generating each possible fragment. 
   * We could calculate probability of any fragment g,h with the 
   * following inefficient algorithm:
   *
   * For each start fragment point g,
   *   For each start fragment point h,
   *     For each state v,
   *       If lpos[v] <= g && rpos[v] >= h, then
   *       prob[g][h] += begin[v] * 2. / (st_clen[v] * (st_clen[v]+1));
   * 
   * Where lpos[v]/rpos[v] are the left/right consensus positions in
   * consensus subtree rooted at state v. And st_clen[v] is rpos[v] -
   * lpos[v] + 1, the consensus length of that subtree. 
   *  
   * This gives us prob[g][h], the probability of generating fragment
   * g,h. But we want to apply the penalty to a state, not to a
   * fragment, to avoid needing to know the fragment boundaries g,h
   * during the DP recursion when applying the penalty.  
   * 
   * To facilitate this, we need to find state t, the state with
   * smallest subtree that contains g,h. State t is relevant because
   * it is the state which will root the alignment of the fragment g,h
   * by using a truncated begin transition into t. This gives a new
   * algorithm:
   *
   * For each start fragment point g,
   *   For each start fragment point h,
   *     Identify state t, the max valued state for which 
   *       lpos[v] <= g && rpos[v] >= h, then {
   *         prob[t] += prob[g][h]
   *         fcount[t]++;
   *       }
   * 
   * prob[t] will be the probability of observing an alignment that
   * uses a truncated begin into t to align any fragment. Then we take
   * average over all fragments: prob[t] / fcount[t] (since we'll only
   * be aligning one of those fragments) and use the log of that
   * probability as the penalty for observing a truncated alignment
   * rooted at state t. Conveniently, it turns out that all fragments
   * that share t are equiprobable (have equal prob[g][h] values), so
   * the average probability is the actual probability for each
   * fragment, and thus the correct penalty to apply.
   * 
   * Fortunately, we can compute the correct penalty much more
   * efficiently than the two algorithms shown above. The
   * efficient way is implemented below. A test that the penalties
   * are correctly computed is in cm_tr_penalties_Validate().
   * 
   * This discussion assumes we're truncating 5' and 3', but if we're
   * only truncating 5' or 3' The situation is a little different.
   * 
   * There's an example of computing all three types of penalties for
   * a simple CM in ELN3 p44-45.
   *
   ************************************************************
   * Section 3. Adapting truncation penalties to allow for inserts.
   * 
   * We need to be able to do truncated begins into insert states
   * because we enforce that the first/final residue of a sequence be
   * included in 5'/3' truncated alignments and we want to be able
   * to properly align those residues if they're probably emitted
   * by insert states. 
   * 
   * The methods/logic explained in sections 1 and 2 above I believe
   * is correct IF we ignore inserts (assume truncated begins into
   * them are impossible). But we need to allow inserts, so I modify
   * the truncation penalties as described above to allow for inserts
   * as follows. We can calculate the appropriate truncated begin
   * penalty for all MATP_MP, MATL_ML, MATR_MR, BIF_B states as with
   * the methods described above by ignoring inserts. This gives us a
   * probability p of using that state as the root of the truncated
   * alignment, i.e. the truncated begin state. (The log_2 of this
   * probability is the penalty.) We then partition p amongst the
   * MATP_MP, MATL_ML, MATR_MR, BIF_B states and any parent insert
   * states, i.e. any insert state that can transition into the
   * match/bif state. For each match/bif state there's 0, 1 or 2
   * parent inserts. We then partition p based on the relative
   * expected occupancy of these inserts versus the match/bif state.
   * 
   * This is certainly 'incorrect' in that it doesn't reflect the
   * true probability of a fragment being aligned to each of the
   * states, but it should be a close approximation. I think doing
   * it correctly is basically impossible in the context of a single
   * state-specific penalty (i.e. the penalty would have to be per-fragment
   * which would be hard to deal with in the DP functions).
   */ 

  /* allocate and initialize the penalty arrays */
  ESL_ALLOC(trp->g_ptyAA,  sizeof(float *) * NTRPENALTY); 
  ESL_ALLOC(trp->l_ptyAA,  sizeof(float *) * NTRPENALTY); 
  ESL_ALLOC(trp->ig_ptyAA, sizeof(int *)   * NTRPENALTY); 
  ESL_ALLOC(trp->il_ptyAA, sizeof(int *)   * NTRPENALTY); 

  for(i = 0; i < NTRPENALTY; i++) { 
    trp->g_ptyAA[i]  = NULL;
    trp->l_ptyAA[i]  = NULL;
    trp->il_ptyAA[i] = NULL;
    trp->ig_ptyAA[i] = NULL;
    ESL_ALLOC(trp->g_ptyAA[i],  sizeof(float) * cm->M);
    ESL_ALLOC(trp->l_ptyAA[i],  sizeof(float) * cm->M);
    ESL_ALLOC(trp->ig_ptyAA[i], sizeof(int)   * cm->M);
    ESL_ALLOC(trp->il_ptyAA[i], sizeof(int)   * cm->M);
    esl_vec_FSet(trp->g_ptyAA[i],   cm->M, IMPOSSIBLE);
    esl_vec_FSet(trp->l_ptyAA[i],   cm->M, IMPOSSIBLE);
    esl_vec_ISet(trp->ig_ptyAA[i],  cm->M, -INFTY);
    esl_vec_ISet(trp->il_ptyAA[i],  cm->M, -INFTY);
  }

  /* DumpEmitMap(stdout, cm->emap, cm); */

  /* Calculate local begin probabilities and expected occupancy */
  ESL_ALLOC(begin, sizeof(float) * cm->M);
  cm_CalculateLocalBeginProbs(cm, cm->pbegin, cm->t, begin);
  if((status = cm_ExpectedPositionOccupancy(cm, &mexpocc, &iexpocc, &psi, NULL, NULL, NULL)) != eslOK) goto ERROR;

  /* Fill global and local truncation penalties in a single loop. We
   * step through all nodes and set the truncation penalties for the
   * MATP_MP, MATL_ML, MATR_MR, and BIF_B states and any parent
   * inserts (i1, i2) of those states.
   */
  g_5and3 = 2. / (cm->clen * (cm->clen+1)); /* for global mode: probability of all fragments if we're truncating 5' and 3' */
  g_5or3  = 1. / cm->clen;                  /* for global mode: probability of all fragments if we're only truncating 5' or  3' */

  prv5 = prv3 = prv53 = 0.; /* initialize 'previous' probability values used for calc'ing local truncation penalties */
  for(nd = 0; nd < cm->nodes; nd++) { 
    lpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) ? cm->emap->lpos[nd] : cm->emap->lpos[nd] + 1;
    rpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) ? cm->emap->rpos[nd] : cm->emap->rpos[nd] - 1;

    /* now set penalties for match and insert states m, i1 and maybe i2 (if we're a MATP_MP or BIF_B) */
    if(cm->ndtype[nd] == END_nd) { 
      prv5 = prv3 = prv53 = 0.;
    }
    else if(cm->ndtype[nd] == BEGL_nd || cm->ndtype[nd] == BEGR_nd) {
      prv5  = (cm->ndtype[nd] == BEGL_nd) ? 0. : trp->l_ptyAA[TRPENALTY_5P_ONLY][cm->plast[cm->nodemap[nd]]];  /* parent BIF_B's probability */;
      prv3  = (cm->ndtype[nd] == BEGR_nd) ? 0. : trp->l_ptyAA[TRPENALTY_3P_ONLY][cm->plast[cm->nodemap[nd]]];  /* parent BIF_B's probability */;
      prv53 = trp->l_ptyAA[TRPENALTY_5P_AND_3P][cm->plast[cm->nodemap[nd]]];  /* parent BIF_B's probability */
    }
    else if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd) { 
      /* determine match states and insert states that pertain to this node */
      m = cm->nodemap[nd]; /* MATP_MP, MATL_ML, MATR_MR, or BIF_B */
      InsertsGivenNodeIndex(cm, nd-1, &i1, &i2);

      m_psi = psi[m];
      if(cm->ndtype[nd] == MATP_MP) { m_psi += (psi[m+1] + psi[m+2]); } /* include MATP_ML and MATP_MR psi */
      i1_psi = (i1 == -1) ? 0. : psi[i1];
      i2_psi = (i2 == -1) ? 0. : psi[i2]; 
      summed_psi = m_psi + i1_psi + i2_psi; 
      if(ignore_inserts) { 
	i1_psi = i2_psi = 0.;
	summed_psi = m_psi;
      }

      /* Global penalties */
      /* sanity check, we should only set truncation penalty once per state */
      if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][m]))  goto ERROR;
      if((i1 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
      if((i2 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;
      /* divide up the probability g_5and3 amongst relevant states m, i1, i2, weighted by psi */
      trp->g_ptyAA[TRPENALTY_5P_AND_3P][m]  = (m_psi  / summed_psi) * g_5and3;
      if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * g_5and3;
      if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * g_5and3;

      /* same thing, for 5P only and 3P only */
      if(rpos == cm->clen) { /* else it will remain IMPOSSIBLE */
	trp->g_ptyAA[TRPENALTY_5P_ONLY][m]  = (m_psi / summed_psi) * g_5or3;
	if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
	if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
      }
      if(lpos == 1) { /* else it will remain IMPOSSIBLE */
	trp->g_ptyAA[TRPENALTY_3P_ONLY][m]  = (m_psi  / summed_psi) * g_5or3;
	if(i1 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
	if(i2 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
      }

      /* Local penalties */
      subtree_clen = rpos - lpos + 1;
      nfrag5  = subtree_clen;
      nfrag3  = subtree_clen;
      nfrag53 = (subtree_clen * (subtree_clen+1)) / 2;

      /* determine probability of observing a fragment aligned at
       * state m (here, m is what I call t above and in notes) and
       * partition that probability between m and i1 and/or i2 by
       * relative occupancy of match versus inserts
       */
      cur5  = begin[m] / (float) nfrag5  + prv5;
      cur3  = begin[m] / (float) nfrag3  + prv3;
      cur53 = begin[m] / (float) nfrag53 + prv53;

      /* sanity check, we should only set truncation penalty once per state */
      if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][m]))  goto ERROR;
      if((i1 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
      if((i2 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;

      trp->l_ptyAA[TRPENALTY_5P_AND_3P][m]  = (m_psi  / summed_psi) * cur53;
      if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * cur53;
      if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * cur53;

      trp->l_ptyAA[TRPENALTY_5P_ONLY][m]  = (m_psi  / summed_psi) * cur5;
      if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * cur5;
      if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * cur5;

      trp->l_ptyAA[TRPENALTY_3P_ONLY][m]  = (m_psi  / summed_psi) * cur3;
      if(i1 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * cur3;
      if(i2 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * cur3;

      prv5  = (cm->ndtype[nd] == MATL_nd) ? cur5 : 0.;
      prv3  = (cm->ndtype[nd] == MATR_nd) ? cur3 : 0.;
      prv53 = cur53;
    }
  }

  /* all penalties are currently probabilities, convert them to log
   * probs and set integer penalties (careful, we have to check if
   * IMPOSSIBLE first)
   */
  for(v = 0; v < cm->M; v++) 
    { 
      if((cm->stid[v] == MATP_MP || cm->stid[v] == MATL_ML || cm->stid[v] == MATR_MR || cm->stid[v] == BIF_B) || 
	 ((cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) && (! StateIsDetached(cm, v)))) 
	{
	  /* Check for rare special case: if we're a MATP_IL and next
	   * two states are MATP_IR and END_E, then we won't have set
	   * a trunction penalty. This state will keep an impossible
	   * truncated begin score, if we did a truncated begin into
	   * it we'd just emit from the MATP_IL and then go to the
	   * END_E anyway (the MATP_IR will be detached.
	   */
	  if(cm->stid[v] == MATP_IL && cm->ndtype[cm->ndidx[v]+1] == END_nd) continue;

	  /* glocal 5P AND 3P: all of these should have been set to a non-IMPOSSIBLE value */
	  if(! NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
	  trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
	  trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]);
	  
	  /* glocal 5P only: some may be IMPOSSIBLE */
	  if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) { 
	    trp->ig_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
	    trp->g_ptyAA[TRPENALTY_5P_ONLY][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_ONLY][v]);
	  }
	  /* glocal 5P only: some may be IMPOSSIBLE */
	  if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) { 
	    trp->ig_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
	    trp->g_ptyAA[TRPENALTY_3P_ONLY][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_3P_ONLY][v]);
	  }

	  /* local penalties all of these should have been set to a non-IMPOSSIBLE value */
	  if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
	  if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_ONLY][v]))   goto ERROR;
	  if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_3P_ONLY][v]))   goto ERROR;

	  trp->il_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
	  trp->il_ptyAA[TRPENALTY_5P_ONLY][v]   = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
	  trp->il_ptyAA[TRPENALTY_3P_ONLY][v]   = Prob2Score(trp->l_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
	  trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]  = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]);
	  trp->l_ptyAA[TRPENALTY_5P_ONLY][v]    = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]);
	  trp->l_ptyAA[TRPENALTY_3P_ONLY][v]    = sreLOG2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]);
	}
    }

  if(ignore_inserts) { 
    if((status = cm_tr_penalties_Validate(trp, cm, 0.0001, errbuf)) != eslOK) { printf("%s", errbuf);  goto ERROR; }
  }

  /* cm_tr_penalties_Dump(stdout, cm, trp); */

  if(mexpocc != NULL) free(mexpocc);
  if(iexpocc != NULL) free(iexpocc);
  if(psi     != NULL) free(psi);
  if(begin   != NULL) free(begin);

  return trp;

 ERROR:
  if(mexpocc != NULL) free(mexpocc);
  if(iexpocc != NULL) free(iexpocc);
  if(psi     != NULL) free(psi);
  if(begin   != NULL) free(begin);
  if(trp     != NULL) cm_tr_penalties_Destroy(trp);

  return NULL;
}