/* Function:  p7_prior_CreateLaplace()
 * Synopsis:  Creates Laplace plus-one prior.
 * Incept:    SRE, Sat Jun 30 09:48:13 2007 [Janelia]
 *
 * Purpose:   Create a Laplace plus-one prior for alphabet <abc>.
 */
P7_PRIOR *
p7_prior_CreateLaplace(const ESL_ALPHABET *abc)
{
  P7_PRIOR *pri = NULL;
  int        status;

  ESL_ALLOC(pri, sizeof(P7_PRIOR));
  pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL;

  pri->tm = esl_mixdchlet_Create(1, 3);	     /* single component; 3 params */
  pri->ti = esl_mixdchlet_Create(1, 2);	     /* single component; 2 params */
  pri->td = esl_mixdchlet_Create(1, 2);	     /* single component; 2 params */
  pri->em = esl_mixdchlet_Create(1, abc->K); /* single component; K params */
  pri->ei = esl_mixdchlet_Create(1, abc->K); /* single component; K params */

  if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR;

  pri->tm->pq[0] = 1.0;   esl_vec_DSet(pri->tm->alpha[0], 3,      1.0);  /* match transitions  */
  pri->ti->pq[0] = 1.0;   esl_vec_DSet(pri->ti->alpha[0], 2,      1.0);  /* insert transitions */
  pri->td->pq[0] = 1.0;   esl_vec_DSet(pri->td->alpha[0], 2,      1.0);  /* delete transitions */
  pri->em->pq[0] = 1.0;   esl_vec_DSet(pri->em->alpha[0], abc->K, 1.0);  /* match emissions    */
  pri->ei->pq[0] = 1.0;   esl_vec_DSet(pri->ei->alpha[0], abc->K, 1.0);  /* insert emissions   */
  return pri;

 ERROR:
  p7_prior_Destroy(pri);
  return NULL;
}
Exemple #2
0
/* seq_generation()
 *
 * Generating sequences.
 */
static int
seq_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  ESL_ALPHABET *abc = NULL;
  ESL_SQ       *sq  = NULL;
  double       *fq  = NULL;
  int           alphatype = eslUNKNOWN;   // static checkers can't see that 1 of --rna, --dna, --amino must be true
  int           N         = esl_opt_GetInteger(go, "-N");
  int           L         = esl_opt_GetInteger(go, "-L");
  int           i;
  int           status;

  if (L <= 0) esl_fatal("To generate sequences, set -L option (length of generated seqs) > 0 ");
  if (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sq_GrowTo(sq, L);

  /* Pick the iid frequency distribution to use */
  ESL_ALLOC(fq, sizeof(double) * abc->K);
  switch (alphatype) {
  case eslRNA:
  case eslDNA:    esl_vec_DSet(fq, 4, 0.25); break;
  case eslAMINO:  esl_composition_SW34(fq);  break;
  default:        esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K); break;
  }
    
  /* generate */
  for (i = 0; i < N; i++)
    {
      esl_rsq_xIID(r, fq, abc->K, L, sq->dsq);
      if (N > 1) esl_sq_FormatName(sq, "random%d", i);
      else       esl_sq_SetName(sq, "random");
      sq->n = L;
      esl_sqio_Write(ofp, sq, outfmt, FALSE);
    }

  free(fq);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return eslOK;

 ERROR:
  if (fq != NULL) free(fq);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return status;
}
Exemple #3
0
/* The gradient of the NLL w.r.t. each free parameter in p.
 */
static void
hyperexp_complete_gradient(double *p, int np, void *dptr, double *dp)
{
  struct hyperexp_data *data = (struct hyperexp_data *) dptr;
  ESL_HYPEREXP         *h    = data->h;
  double pdf;
  int i,k;
  int pidx;			
  
  hyperexp_unpack_paramvector(p, np, h);
  esl_vec_DSet(dp, np, 0.);
  for (i = 0; i < data->n; i++)
    {
      /* FIXME: I think the calculation below may need to be done
       * in log space, to avoid underflow errors; see complete_binned_gradient()
       */
      /* Precalculate q_k PDF_k(x) terms, and their sum */
      for (k = 0; k < h->K; k++)
	h->wrk[k] = h->q[k] * esl_exp_pdf(data->x[i], h->mu, h->lambda[k]);
      pdf = esl_vec_DSum(h->wrk, h->K);

      pidx = 0;
      if (! h->fixmix) {
	for (k = 1; k < h->K; k++) /* generic d/dQ solution for mixture models */
	  dp[pidx++] -= h->wrk[k]/pdf - h->q[k];
      }
      
      for (k = 0; k < h->K; k++)
	if (! h->fixlambda[k])
	  dp[pidx++] -= (1.-h->lambda[k]*(data->x[i]-h->mu))*h->wrk[k]/pdf; /* d/dw */
    }
}
Exemple #4
0
static void
hyperexp_complete_binned_gradient(double *p, int np, void *dptr, double *dp)
{
  struct hyperexp_binned_data *data = (struct hyperexp_binned_data *) dptr;
  ESL_HISTOGRAM               *g    = data->g;
  ESL_HYPEREXP                *h    = data->h;
  int i,k;
  int pidx;			
  double z;
  double tmp;
  double ai, delta;
  
  hyperexp_unpack_paramvector(p, np, h);
  esl_vec_DSet(dp, np, 0.);
  delta = g->w;

  /* counting over occupied, uncensored histogram bins */
  for (i = g->cmin; i <= g->imax; i++)
    {
      if (g->obs[i] == 0) continue;
      ai = esl_histogram_Bin2LBound(g, i);
      if (ai < h->mu) ai = h->mu; /* careful about the left boundary: no x < h->mu */

      /* Calculate log (q_m alpha_m(a_i) terms
       */
      for (k = 0; k < h->K; k++)
	{
	  h->wrk[k] = log(h->q[k]) - h->lambda[k]*(ai-h->mu);
	  if (delta * h->lambda[k] < eslSMALLX1) 
	    h->wrk[k] += log(delta * h->lambda[k]);
	  else
	    h->wrk[k] += log(1 - exp(-delta * h->lambda[k]));
	}
      z = esl_vec_DLogSum(h->wrk, h->K); /* z= log \sum_k q_k alpha_k(a_i) */

      /* Bump the gradients for Q_1..Q_{K-1} */
      pidx = 0;
      if (! h->fixmix) {
	for (k = 1; k < h->K; k++)
	  dp[pidx++] -= g->obs[i] * (exp(h->wrk[k] - z) - h->q[k]);
      }
	
      /* Bump the gradients for w_0..w_{K-1}
       */
      for (k = 0; k < h->K; k++)
	if (! h->fixlambda[k])
	  {
	    tmp  = log(h->q[k]) + log(h->lambda[k])- h->lambda[k]*(ai-h->mu);
	    tmp  = exp(tmp - z);
	    tmp *= (ai + delta - h->mu) * exp(-delta * h->lambda[k]) - (ai - h->mu);
	    dp[pidx++] -= g->obs[i] * tmp;
	  }
    }  
}
Exemple #5
0
int main(void)
{
  double *p;
  char    labels[] = "ACGT";
  int     n = 4;

  p = malloc(sizeof(double) * n);
  esl_vec_DSet(p, n, 1.0);
  esl_vec_DNorm(p, n);
  esl_vec_DDump(stdout, p, n, labels);
  free(p);
  return 0;
}
Exemple #6
0
/* set_relative_weights():
 * Set msa->wgt vector, using user's choice of relative weighting algorithm.
 */
static int
relative_weights(P7_BUILDER *bld, ESL_MSA *msa)
{
  int status = eslOK;

  if      (bld->wgt_strategy == p7_WGT_NONE)                    { esl_vec_DSet(msa->wgt, msa->nseq, 1.); }
  else if (bld->wgt_strategy == p7_WGT_GIVEN)                   ;
  else if (bld->wgt_strategy == p7_WGT_PB)                      status = esl_msaweight_PB(msa); 
  else if (bld->wgt_strategy == p7_WGT_GSC)                     status = esl_msaweight_GSC(msa); 
  else if (bld->wgt_strategy == p7_WGT_BLOSUM)                  status = esl_msaweight_BLOSUM(msa, bld->wid); 
  else ESL_EXCEPTION(eslEINCONCEIVABLE, "no such weighting strategy");

  if (status != eslOK) ESL_FAIL(status, bld->errbuf, "failed to set relative weights in alignment");
  return eslOK;
}
Exemple #7
0
/* Function:  esl_paml_ReadE()
 * Incept:    SRE, Fri Jul  9 09:27:24 2004 [St. Louis]
 *
 * Purpose:   Read an amino acid rate matrix in PAML format from stream
 *            <fp>. Return it in two pieces: the symmetric E
 *            exchangeability matrix in <E>, and the stationary
 *            probability vector $\pi$ in <pi>.
 *            Caller provides the memory for both <E> and <pi>.  <E>
 *            is a $20 \times 20$ matrix allocated as
 *            <esl_dmatrix_Create(20, 20)>. <pi> is an array with
 *            space for at least 20 doubles.
 *            
 *            The <E> matrix is symmetric for off-diagonal elements:
 *            $E_{ij} = E_{ij}$ for $i \neq j$.  The on-diagonal
 *            elements $E_{ii}$ are not valid and should not be
 *            accessed.  (They are set to zero.)

 *            The rate matrix will later be obtained from <E>
 *            and <pi> as 
 *                $Q_{ij} = E_{ij} \pi_j$ for $i \neq j$ 
 *            and
 *                $Q_{ii} = -\sum_{j \neq i} Q_{ij}$ 
 *            then scaled to units of one
 *            substitution/site; see <esl_ratemx_E2Q()> and
 *            <esl_ratemx_ScaleTo()>.
 *
 *            Data file format: First 190 numbers are a
 *            lower-triangular matrix E of amino acid
 *            exchangeabilities $E_{ij}$. Next 20 numbers are the
 *            amino acid frequencies $\pi_i$. Remainder of the
 *            datafile is ignored.
 *            
 *            The alphabet order in the matrix and the frequency
 *            vector is assumed to be "ARNDCQEGHILKMFPSTWYV"
 *            (alphabetical by three-letter code), which appears to be
 *            PAML's default order. This is transformed to Easel's
 *            "ACDEFGHIKLMNPQRSTVWY" (alphabetical by one-letter code)
 *            in the $E_{ij}$ and $\pi_i$ that are returned.
 *            
 * Args:      fp   - open datafile for reading.
 *            E    - RETURN: E matrix of amino acid exchangeabilities e_ij,
 *                     symmetric (E_ij = E_ji),
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *            pi   - RETURN: \pi_i vector of amino acid frequencies,
 *                    in Easel amino acid alphabet order A..Y.
 *                    Caller provides appropriately allocated space.
 *
 * Returns:   <eslOK> on success.
 *            Returns <eslEOF> on premature end of file (parse failed), in which
 *            case the contents of <E> and <pi> are undefined.
 *            
 * Throws:    <eslEMEM> on internal allocation failure,
 *            and the contents of <E> and <pi> are undefined.
 *
 * Xref:      STL8/p.56.
 */
int
esl_paml_ReadE(FILE *fp, ESL_DMATRIX *E, double *pi)
{
  int             status;
  ESL_FILEPARSER *efp = NULL;
  char           *tok;
  int             i,j;
  char           *pamlorder = "ARNDCQEGHILKMFPSTWYV";
  char           *eslorder  = "ACDEFGHIKLMNPQRSTVWY";
  int             perm[20];

  if ((status =  esl_dmatrix_SetZero(E))                 != eslOK) goto ERROR;
  esl_vec_DSet(pi, 20, 0.);

  if ((efp =    esl_fileparser_Create(fp))               == NULL)  goto ERROR;
  if ((status = esl_fileparser_SetCommentChar(efp, '#')) != eslOK) goto ERROR;

  /* Construct the alphabet permutation we need.
   * perm[i] -> original row/column i goes to row/column perm[i]
   */
   for (i = 0; i < 20; i++)
     perm[i] = (int) (strchr(eslorder, pamlorder[i]) - eslorder);

   /* Read the s_ij matrix data in, permuting as we go. */

   for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
	if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
	E->mx[perm[i]][perm[j]] = atof(tok);
	E->mx[perm[j]][perm[i]] = E->mx[perm[i]][perm[j]];
      }

   /* Read the pi_i vector in, permuting as we read. */
  for (i = 0; i < 20; i++)
    {
      if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
      pi[perm[i]] = atof(tok);
    }

  esl_fileparser_Destroy(efp);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Destroy(efp);
  return status;
}
static void
utest_SetWAG(void)
{
  char errbuf[eslERRBUFSIZE];
  ESL_DMATRIX *Q = NULL;
  ESL_DMATRIX *P = NULL;
  double       t = 50.0;	/* sufficiently large to drive e^tQ to stationarity  */
  double       pi[20];
  int          i;

  if ((Q = esl_dmatrix_Create(20, 20))     == NULL)  esl_fatal("malloc failed");
  if ((P = esl_dmatrix_Create(20, 20))     == NULL)  esl_fatal("malloc failed");

  /* This tests that exponentiating WAG gives a stable conditional 
   * probability matrix solution. (It doesn't particularly test that
   * WAG was set correctly, but how could we have screwed that up?)
   */
  if (esl_rmx_SetWAG(Q, NULL)              != eslOK) esl_fatal("_SetWAG() failed");
  if (esl_dmx_Exp(Q, t, P)                 != eslOK) esl_fatal("matrix exponentiation failed");
  if (esl_rmx_ValidateP(P, 1e-7, errbuf)   != eslOK) esl_fatal("P validation failed: %s", errbuf);
  if (esl_rmx_ValidateQ(Q, 1e-7, errbuf)   != eslOK) esl_fatal("Q validation failed: %s", errbuf);

  /* This tests setting WAG to different stationary pi's than default,
   * then tests that exponentiating to large t reaches those stationaries.
   */
  esl_vec_DSet(pi, 20, 0.05);
  if (esl_rmx_SetWAG(Q, pi)                != eslOK) esl_fatal("_SetWAG() failed");
  if (esl_dmx_Exp(Q, t, P)                 != eslOK) esl_fatal("matrix exponentiation failed");
  if (esl_rmx_ValidateP(P, 1e-7, errbuf)   != eslOK) esl_fatal("P validation failed: %s", errbuf);
  if (esl_rmx_ValidateQ(Q, 1e-7, errbuf)   != eslOK) esl_fatal("Q validation failed: %s", errbuf);
  for (i = 0; i < 20; i++)
    if (esl_vec_DCompare(P->mx[i], pi, 20, 1e-7) != eslOK) esl_fatal("P didn't converge to right pi's");

  esl_dmatrix_Destroy(Q);
  esl_dmatrix_Destroy(P);
  return;
}
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration      */
  struct cfg_s  cfg;     	/* application configuration       */
  char         *basename= NULL;	/* base of the output file names   */
  char         *alifile = NULL;	/* alignment file name             */
  char         *dbfile  = NULL;	/* name of seq db file             */
  char          outfile[256];	/* name of an output file          */
  int           alifmt;		/* format code for alifile         */
  int           dbfmt;		/* format code for dbfile          */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file             */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed     */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned           */
  ESL_STACK    *teststack=NULL; /* test set: stack of ESL_SQ ptrs  */
  int           status;		/* easel return code               */
  int           nfrags;		/* # of fragments removed          */
  int           ntestdom;       /* # of test domains               */
  int           ntest;		/* # of test sequences created     */
  int           nali;		/* number of alignments read       */
  double        avgid;
  
  
  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 3)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1); 
  alifile  = esl_opt_GetArg(go, 2);
  dbfile   = esl_opt_GetArg(go, 3);
  alifmt   = eslMSAFILE_STOCKHOLM;
  dbfmt    = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed"))   cfg.r = esl_randomness_CreateTimeseeded();
  else                                   cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc       = NULL;		          /* until we open the MSA file, below */
  cfg.fragfrac  = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest     = 0;

  /* Open the output files */ 
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256)  esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa",  basename) >= 256)  esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256)  esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256)  esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256)  esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp     = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);

  /* Open the MSA file; determine alphabet */
  status = esl_msafile_Open(alifile, alifmt, NULL, &afp);
  if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
  else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
  else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);

  if      (esl_opt_GetBoolean(go, "--amino"))   cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg.abc = esl_alphabet_Create(eslRNA);
  else {
    int type;
    status = esl_msafile_GuessAlphabet(afp, &type);
    if (status == eslEAMBIGUOUS)    esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile);
    else if (status == eslEFORMAT)  esl_fatal("Alignment file parse failed: %s\n", afp->errbuf);
    else if (status == eslENODATA)  esl_fatal("Alignment file %s is empty\n", alifile);
    else if (status != eslOK)       esl_fatal("Failed to read alignment file %s\n", alifile);
    cfg.abc = esl_alphabet_Create(type);
  }
  esl_msafile_SetDigital(afp, cfg.abc);

  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time  */
  nali = 0;
  while ((status = esl_msa_Read(afp, &origmsa)) == eslOK)
    {
      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);
      ntestdom = esl_stack_ObjectCount(teststack);

      if (ntestdom >= 2) 
	{
	  esl_stack_Shuffle(cfg.r, teststack);
	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  esl_msa_MinimGaps(trainmsa, NULL, NULL);
	  esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if      (status == eslEFORMAT)  esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", 
					    afp->linenumber, afp->fname, afp->errbuf, afp->buf);	
  else if (status != eslEOF)      esl_fatal("Alignment file read failed with error code %d\n", status);
  else if (nali   == 0)           esl_fatal("No alignments found in file %s\n", alifile);

  if (nali > 0)
    synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
/* Function:  p7_prior_CreateNucleic()
 *
 * Purpose:   Creates the default mixture Dirichlet prior for nucleotide
 *            sequences.
 *
 *            The transition priors (match, insert, delete) are all
 *            single Dirichlets, trained on a portion of the rmark dataset
 *
 *            The match emission prior is an eight-component mixture
 *            trained against a portion of the rmark dataset
 *
 *            The insert emission prior is a single Dirichlet with
 *            high $|\alpha|$, such that insert emission probabilities
 *            are essentially fixed by the prior, regardless of
 *            observed count data.
 *
 * Returns:   a pointer to the new <P7_PRIOR> structure.
 */
P7_PRIOR *
p7_prior_CreateNucleic(void)
{
  int status;
  P7_PRIOR *pri = NULL;
  int q;

  /* Plus-1 Laplace prior
  int num_comp = 1;
  static double defmq[2] =  { 1.0  };
  static double defm[1][4] = {
         { 1.0, 1.0, 1.0, 1.0} //
  };
*/

  /* Match emission priors are trained on Rmark3 database
   * Xref: ~wheelert/notebook/2011/0325_nhmmer_new_parameters
   */
   int num_comp = 4;
   static double defmq[4] = { 0.24, 0.26, 0.08, 0.42  };
   static double defm[4][4] = {
         { 0.16,  0.45,  0.12,   0.39},
         { 0.09,  0.03,  0.09,   0.04},
         { 1.29,  0.40,  6.58,   0.51},
         { 1.74,  1.49,  1.57,   1.95}
	};



  ESL_ALLOC(pri, sizeof(P7_PRIOR));
  pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL;


  pri->tm = esl_mixdchlet_Create(1, 3);  // match transitions; single component; 3 params
  pri->ti = esl_mixdchlet_Create(1, 2);  // insert transitions; single component; 2 params
  pri->td = esl_mixdchlet_Create(1, 2);  // delete transitions; single component; 2 params
  pri->em = esl_mixdchlet_Create(num_comp, 4); // match emissions; X component; 4 params
  pri->ei = esl_mixdchlet_Create(1, 4); // insert emissions; single component; 4 params


  if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR;

  /* Transition priors: roughly, learned from rmark benchmark - hand-beautified (trimming overspecified significant digits)
   */
  pri->tm->pq[0]       = 1.0;
  pri->tm->alpha[0][0] = 2.0; // TMM
  pri->tm->alpha[0][1] = 0.1; // TMI
  pri->tm->alpha[0][2] = 0.1; // TMD


  pri->ti->pq[0]       = 1.0;
  pri->ti->alpha[0][0] = 0.06; // TIM
  pri->ti->alpha[0][1] = 0.2; // TII

  pri->td->pq[0]       = 1.0;
  pri->td->alpha[0][0] = 0.1; // TDM
  pri->td->alpha[0][1] = 0.2; // TDD



  /* Match emission priors  */
  for (q = 0; q < num_comp; q++)
    {
      pri->em->pq[q] = defmq[q];
      esl_vec_DCopy(defm[q], 4, pri->em->alpha[q]);
    }


  /* Insert emission priors. Should that alphas be lower? higher?
   */
  pri->ei->pq[0] = 1.0;
  esl_vec_DSet(pri->ei->alpha[0], 4, 1.0);

  return pri;

 ERROR:
  if (pri != NULL) p7_prior_Destroy(pri);
  return NULL;
}
Exemple #11
0
/* Function:  esl_msaweight_PB()
 * Synopsis:  PB (position-based) weights.
 * Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
 *
 * Purpose:   Given a multiple alignment <msa>, calculate sequence
 *            weights according to the position-based weighting
 *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
 *            1994). These weights are stored internally in the <msa>
 *            object, replacing any weights that may have already been
 *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the algorithm
 *            deals with degenerate residue symbols properly.
 *            
 *            The Henikoffs' algorithm does not give rules for dealing
 *            with gaps or degenerate residue symbols. The rule here
 *            is to ignore them. This means that longer sequences
 *            initially get more weight; hence a "double
 *            normalization" in which the weights are first divided by
 *            sequence length in canonical residues (to compensate for
 *            that effect), then normalized to sum to nseq.
 *            
 *            An advantage of the PB method is efficiency.
 *            It is $O(1)$ in memory and $O(NL)$ time, for an alignment of
 *            N sequences and L columns. This makes it a good method 
 *            for ad hoc weighting of very deep alignments.
 *            
 *            When the alignment is in simple text mode, IUPAC
 *            degenerate symbols are not dealt with correctly; instead,
 *            the algorithm simply uses the 26 letters as "residues"
 *            (case-insensitively), and treats all other residues as
 *            gaps.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified. 
 *
 * Throws:    <eslEMEM> on allocation error, in which case <msa> is
 *            returned unmodified.
 *
 * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
 */
int
esl_msaweight_PB(ESL_MSA *msa)
{
  int    *nres = NULL;   	/* counts of each residue observed in a column */
  int     ntotal;		/* number of different symbols observed in a column */
  int     rlen;			/* number of residues in a sequence */
  int     idx, pos, i;
  int     K;			/* alphabet size */
  int     status;

  /* Contract checks
   */
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* Initialize
   */
  if (! (msa->flags & eslMSA_DIGITAL)) 
    { ESL_ALLOC(nres, sizeof(int) * 26);          K = 26;          }
#ifdef eslAUGMENT_ALPHABET
  else 
    { ESL_ALLOC(nres, sizeof(int) * msa->abc->K); K = msa->abc->K; }
#endif

  esl_vec_DSet(msa->wgt, msa->nseq, 0.);

  /* This section handles text alignments */
  if (! (msa->flags & eslMSA_DIGITAL)) 
    {
      for (pos = 0; pos < msa->alen; pos++)
	{
	  /* Collect # of letters A..Z in this column, and total */
	  esl_vec_ISet(nres, K, 0.);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (isalpha((int) msa->aseq[idx][pos]))
	      nres[toupper((int) msa->aseq[idx][pos]) - 'A'] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each seq by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (isalpha((int) msa->aseq[idx][pos]))
		msa->wgt[idx] += 1. / 
		  (double) (ntotal * nres[toupper((int) msa->aseq[idx][pos]) - 'A'] );
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++) {
	for (rlen = 0, pos = 0; pos < msa->alen; pos++) 
      	  if (isalpha((int) msa->aseq[idx][pos])) rlen++;
	if (ntotal > 0) msa->wgt[idx] /= (double) rlen;
	/* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
      }
    }

  /* This section handles digital alignments. */
#ifdef eslAUGMENT_ALPHABET
  else
    {
      for (pos = 1; pos <= msa->alen; pos++)
	{
	  /* Collect # of residues 0..K-1 in this column, and total # */
	  esl_vec_ISet(nres, K, 0.);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
	      nres[(int) msa->ax[idx][pos]] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each sequence by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
		msa->wgt[idx] += 1. / (double) (ntotal * nres[msa->ax[idx][pos]]);
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
	{
	  for (rlen = 0, pos = 1; pos <= msa->alen; pos++) 
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos])) rlen++;
	  if (rlen > 0) msa->wgt[idx] /= (double) rlen;
	  /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
	}
    }
#endif

  /* Make weights normalize up to nseq, and return.  In pathological
   * case where all wgts were 0 (no seqs contain any unambiguous
   * residues), weights become 1.0.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);	
  msa->flags |= eslMSA_HASWGTS;

  free(nres);
  return eslOK;

 ERROR:
  if (nres != NULL) free(nres);
  return status;
}
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration      */
  struct cfg_s  cfg;     	/* application configuration       */
  char         *basename= NULL;	/* base of the output file names   */
  char         *alifile = NULL;	/* alignment file name             */
  char         *dbfile  = NULL;	/* name of seq db file             */
  char          outfile[256];	/* name of an output file          */
  int           alifmt;		/* format code for alifile         */
  int           dbfmt;		/* format code for dbfile          */
  ESLX_MSAFILE  *afp    = NULL;	/* open alignment file             */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed     */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned           */
  ESL_STACK    *teststack=NULL; /* test set: stack of ESL_SQ ptrs  */
  int           status;		/* easel return code               */
  int           nfrags;		/* # of fragments removed          */
  int           ntestdom;       /* # of test domains               */
  int           ntest;		/* # of test sequences created     */
  int           nali;		/* number of alignments read       */
  double        avgid;
  
  
  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 3)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1); 
  alifile  = esl_opt_GetArg(go, 2);
  dbfile   = esl_opt_GetArg(go, 3);
  alifmt   = eslMSAFILE_STOCKHOLM;
  dbfmt    = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed"))   cfg.r = esl_randomness_CreateTimeseeded();
  else                                   cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc        = NULL;		          /* until we open the MSA file, below */
  cfg.fragfrac   = esl_opt_GetReal(go, "-F");
  cfg.idthresh1  = esl_opt_GetReal(go, "-1");
  cfg.idthresh2  = esl_opt_GetReal(go, "-2");
  cfg.test_lens  = NULL;
  cfg.ntest      = 0;
  cfg.max_ntest  = (esl_opt_IsOn(go, "--maxtest")  ? esl_opt_GetInteger(go, "--maxtest")  : 0); 
  cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); 

  /* Open the output files */ 
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256)  esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa",  basename) >= 256)  esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256)  esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256)  esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256)  esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp     = fopen(outfile, "w"))      == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);
  if (esl_opt_GetBoolean(go, "--pid")) {
    if (snprintf(outfile, 256, "%s.pid", basename) >= 256)  esl_fatal("Failed to construct %%id table file name");
    if ((cfg.pidfp   = fopen(outfile, "w"))        == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile);
  } else cfg.pidfp   = NULL;

  /* Open the MSA file, digital mode; determine alphabet */
  if      (esl_opt_GetBoolean(go, "--amino"))   cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg.abc = esl_alphabet_Create(eslRNA);

  status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else                           esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time  */
  nali = 0;
  while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      esl_msa_ConvertDegen2X(origmsa); 
      esl_msa_Hash(origmsa);

      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets   (&cfg, msa, &trainmsa, &teststack);

      if ( esl_stack_ObjectCount(teststack) >= 2) 
	{
	  /* randomize test domain order, and apply size limit if any */
	  esl_stack_Shuffle(cfg.r, teststack);
	  if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest);
	  ntestdom =  esl_stack_ObjectCount(teststack);

	  /* randomize training set alignment order, and apply size limit if any */
	  esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa);
	  if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain);
	  esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE);
	  
	  if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack);

	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  fprintf(cfg.tblfp, "%-20s  %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if  (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);
  
  synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  if (cfg.pidfp) fclose(cfg.pidfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  eslx_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
/* dump_insert_info
 *                   
 * Given an MSA with RF annotation, print out information about how many 'insertions' come
 * after each non-gap RF column (consensus column). 
 */
static int dump_insert_info(FILE *fp, ESL_MSA *msa, int use_weights, int nali, int *i_am_rf, char *alifile, char *errbuf)
{
  int status;
  int apos, rfpos;
  double **ict;
  double *total_ict;
  int i;
  int rflen;
  double seqwt; /* weight of current sequence */
  double nseq;

  /* contract check */
  if(! (msa->flags & eslMSA_DIGITAL)) ESL_XFAIL(eslEINVAL, errbuf, "in dump_insert_info(), msa must be digitized.");
  if(msa->rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "No #=GC RF markup in alignment, it is needed for --iinfo.");
  if(i_am_rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "internal error, dump_insert_info() i_am_rf is NULL.");
  if(use_weights && msa->wgt == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "dump_insert_info(): use_weights==TRUE but msa->wgt == NULL");

  ESL_ALLOC(total_ict, sizeof(double) * (msa->alen+2));
  esl_vec_DSet(total_ict, (msa->alen+2), 0.);

  ESL_ALLOC(ict,  sizeof(double *) * (msa->alen+2));
  for(i = 0; i <= msa->alen; i++)
    {
      ESL_ALLOC(ict[i],  sizeof(double) * (msa->nseq));
      esl_vec_DSet(ict[i], (msa->nseq), 0.);
    }

  fprintf(fp, "# Insert information:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# rfpos is the nongap RF position after which insertions occur\n");
  fprintf(fp, "# An rfpos of '0' indicates insertions before the first nongap RF position\n");
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  fprintf(fp, "# %8s  %10s  %8s  %8s\n", "rfpos",    "nseq w/ins",  "freq ins", "avg len");
  fprintf(fp, "# %8s  %10s  %8s  %8s\n", "--------", "----------", "--------", "--------");

  rflen = 0;
  for(apos = 1; apos <= msa->alen; apos++)
    if(i_am_rf[apos-1]) rflen++;

  rfpos = 0;
  for(apos = 1; apos <= msa->alen; apos++)
    {
      if(i_am_rf[apos-1]) rfpos++;
      else {
	for(i = 0; i < msa->nseq; i++) { 
	  seqwt = use_weights ? msa->wgt[i] : 1.0;
	  if(esl_abc_XIsResidue(msa->abc, msa->ax[i][apos])) { 
	    ict[rfpos][i]++;
	    total_ict[rfpos] += seqwt;
	  }
	}	
      }  
    }
  rflen = rfpos;

  for(rfpos = 0; rfpos <= rflen; rfpos++)
    {
      nseq = 0.;
      for(i = 0; i < msa->nseq; i++) { 
	if(ict[rfpos][i] >= 1) { 
	  seqwt = use_weights ? msa->wgt[i] : 1.0;
	  nseq += seqwt;
	}
      }
      if(nseq > 0.) 
	fprintf(fp, "  %8d  %10.1f  %8.6f  %8.3f\n", rfpos, nseq, nseq / (float) msa->nseq, ((float) total_ict[rfpos] / (float) nseq));
    }
  fprintf(fp, "//\n");

  for(i = 0; i <= msa->alen; i++) free(ict[i]);
  free(ict);
  free(total_ict);

  return eslOK;

 ERROR:
  return status;
}
/* dump_infocontent_info
 *                   
 * Given an MSA with RF annotation, dump information content per column data to 
 * an open output file.
 */
static int dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int status;
  int apos, rfpos;
  double bg_ent;
  double *bg = NULL;
  double *abc_freq = NULL;
  double nnongap;

  ESL_ALLOC(bg, sizeof(double) * abc->K);
  esl_vec_DSet(bg, abc->K, 1./(abc->K));
  bg_ent = esl_vec_DEntropy(bg, abc->K);
  free(bg);

  ESL_ALLOC(abc_freq, sizeof(double) * abc->K);


  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  if(i_am_rf != NULL) { 
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "rfpos",    "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "-------", "-------",  "----------", "----------");
  }  
  else { 
    fprintf(fp, "# %7s  %10s  %10s\n", "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %10s  %10s\n", "-------", "----------", "----------");
  }

  rfpos = 0;
  for(apos = 0; apos < alen; apos++) {
    if(i_am_rf != NULL) { 
      if(i_am_rf[apos]) { 
	fprintf(fp, "  %7d", rfpos+1);
	rfpos++; 
      }
      else { 
	fprintf(fp, "  %7s", "-");
      }
    }
    nnongap = esl_vec_DSum(abc_ct[apos], abc->K);
    esl_vec_DCopy(abc_ct[apos], abc->K, abc_freq);
    esl_vec_DNorm(abc_freq, abc->K);
    fprintf(fp, "  %7d  %10.8f  %10.8f\n", apos+1, 
	    nnongap / (nnongap + abc_ct[apos][abc->K]),
	    (bg_ent - esl_vec_DEntropy(abc_freq, abc->K)));
  }
  fprintf(fp, "//\n");

  if(abc_freq != NULL) free(abc_freq);

  return eslOK;

 ERROR:
  ESL_FAIL(eslEINVAL, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
/* count_msa()
 *                   
 * Given an msa, count residues, and optionally base pairs and
 * posterior probabilities per column and store them in <ret_abc_ct>
 * and <ret_pp_ct>.
 * 
 * <ret_abc_ct> [0..apos..alen-1][0..abc->K]:
 * - per position count of each symbol in alphabet over all seqs.
 * 
 * <ret_bp_ct>  [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1] 
 * - per (non-pknotted) consensus basepair count of each possible basepair 
 *   over all seqs basepairs are indexed by 'i' the minimum of 'i:j' for a 
 *   pair between i and j, where i < j. Note that non-canonicals and 
 *   gaps and the like are all stored independently.
 *
 * <ret_pp_ct> [0..apos..alen-1][0..11]
 * - per position count of each posterior probability code over all seqs.
 * 
 * A 'gap' has a looser definition than in esl_abc here, esl_abc's gap, 
 * missing residues and nonresidues are all considered 'gaps' here.
 * 
 * If we encounter an error, we return non-eslOK status and fill
 * errbuf with error message.
 * 
 * Returns eslOK upon success.
 */
static int count_msa(ESL_MSA *msa, char *errbuf, int nali, int no_ambig, int use_weights, double ***ret_abc_ct, double ****ret_bp_ct, double ***ret_pp_ct)
{
  int status;
  double  **abc_ct = NULL;
  double ***bp_ct = NULL;
  int       apos, rpos, i, x;
  int       nppvals = 12;         /* '0'-'9' = 0-9, '*' = 10, gap = '11' */
  double  **pp_ct = NULL;         /* [0..alen-1][0..nppvals-1] per position count of each possible PP char over all seqs */
  int       ppidx; 
  /* variables related to getting bp counts */
  int      *ct = NULL;            /* 0..alen-1 base pair partners array for current sequence */
  char     *ss_nopseudo = NULL;   /* no-pseudoknot version of structure */
  double    seqwt;  /* weight of current sequence, always 1.0 if !use_weights */

  if(! (msa->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "count_msa() contract violation, MSA is not digitized");
  if(use_weights && msa->wgt == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "count_msa(): use_weights==TRUE but msa->wgt == NULL");

  /* allocate pp_ct array, if nec */
  if(ret_pp_ct != NULL) { 
    if(msa->pp == NULL) ESL_FAIL(eslEINVAL, errbuf, "count_msa() ret_pp_ct != NULL, but msa->pp is NULL");
    ESL_ALLOC(pp_ct, sizeof(double *) * msa->alen);
    for(apos = 0; apos < msa->alen; apos++) { 
      ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals);
      esl_vec_DSet(pp_ct[apos], nppvals, 0.);
    }
  }

  /* allocate and initialize bp_ct, if nec */
  if(ret_bp_ct != NULL) { 
    ESL_ALLOC(bp_ct,  sizeof(double **) * msa->alen); 
    /* get ct array which defines the consensus base pairs */
    ESL_ALLOC(ct,  sizeof(int)  * (msa->alen+1));
    ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1));
    esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo);
    if ((status = esl_wuss2ct(ss_nopseudo, msa->alen, ct)) != eslOK) ESL_FAIL(status, errbuf, "Consensus structure string is inconsistent.");
    for(apos = 0; apos < msa->alen; apos++) { 
      /* careful ct is indexed 1..alen, not 0..alen-1 */
      if(ct[(apos+1)] > (apos+1)) { /* apos+1 is an 'i' in an i:j pair, where i < j */
	ESL_ALLOC(bp_ct[apos], sizeof(double *) * (msa->abc->Kp));
	for(x = 0; x < msa->abc->Kp; x++) { 
	  ESL_ALLOC(bp_ct[apos][x], sizeof(double) * (msa->abc->Kp));
	  esl_vec_DSet(bp_ct[apos][x], msa->abc->Kp, 0.);
	}
      }
      else { /* apos+1 is not an 'i' in an i:j pair, where i < j, set to NULL */
	bp_ct[apos] = NULL;
      }
    }
  }

  ESL_ALLOC(abc_ct, sizeof(double *) * msa->alen); 
  for(apos = 0; apos < msa->alen; apos++) { 
    ESL_ALLOC(abc_ct[apos], sizeof(double) * (msa->abc->K+1));
    esl_vec_DSet(abc_ct[apos], (msa->abc->K+1), 0.);
  }

  for(i = 0; i < msa->nseq; i++) { 
    seqwt = use_weights ? msa->wgt[i] : 1.0;

    for(apos = 0; apos < msa->alen; apos++) { /* update appropriate abc count, careful, ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */
      if((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1]))) { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
	if((status = esl_abc_DCount(msa->abc, abc_ct[apos], msa->ax[i][apos+1], seqwt)) != eslOK) ESL_FAIL(status, errbuf, "problem counting residue %d of seq %d", apos, i);
      }
    }

    /* get bp counts, if nec */
    if(bp_ct != NULL) { 
      for(apos = 0; apos < msa->alen; apos++) { /* update appropriate abc count, careful, ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */
	if(bp_ct[apos] != NULL) { /* our flag for whether position (apos+1) is an 'i' in an i:j pair where i < j */
	  rpos = ct[apos+1] - 1; /* ct is indexed 1..alen */
	  bp_ct[apos][msa->ax[i][apos+1]][msa->ax[i][rpos+1]] += seqwt;
	}
      }
    }

    /* get PP counts, if nec  */
    if(pp_ct != NULL) { 
      if(msa->pp[i] != NULL) { 
	for(apos = 0; apos < msa->alen; apos++) { 
	  if((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1]))) { /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
	    if((ppidx = get_pp_idx(msa->abc, msa->pp[i][apos])) == -1) ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]);
	    pp_ct[apos][ppidx] += seqwt;
	  }
	}
      }
    }
  }

  *ret_abc_ct  = abc_ct;
  if(ret_bp_ct != NULL) *ret_bp_ct = bp_ct; /* we only allocated bp_ct if ret_bp_ct != NULL */
  if(ret_pp_ct != NULL) *ret_pp_ct = pp_ct; /* we only allocated pp_ct if ret_pp_ct != NULL */

  if(ss_nopseudo != NULL) free(ss_nopseudo);
  if(ct != NULL) free(ct);

  return eslOK;

 ERROR:
  if(abc_ct != NULL)  esl_Free2D((void **) abc_ct, msa->alen);
  if(bp_ct != NULL)   esl_Free3D((void ***) bp_ct, msa->alen, msa->abc->Kp);
  if(pp_ct != NULL)   esl_Free2D((void **) pp_ct, msa->alen);
  ESL_FAIL(status, errbuf, "Error, out of memory while counting important values in the msa.");
  return status; /* NEVERREACHED */
}
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go        = NULL;
  char           *seqfile   = NULL;
  ESL_SQFILE     *sqfp      = NULL;
  int             infmt     = eslSQFILE_UNKNOWN;
  int             alphatype = eslUNKNOWN;
  ESL_ALPHABET   *abc       = NULL;
  ESL_SQ         *sq        = NULL;
  int64_t         nseq      = 0;   
  int64_t         nres      = 0;
  int64_t         small     = 0;
  int64_t         large     = 0;
  double         *monoc     = NULL; /* monoresidue composition per sequence  */
  double         *monoc_all = NULL; /* monoresidue composition over all seqs */
  int             do_comp   = FALSE;
  int             status    = eslOK;
  int             wstatus;
  int             i;
  int             x;

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n",   go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 1)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  seqfile = esl_opt_GetArg(go, 1);
  do_comp = esl_opt_GetBoolean(go, "-c");

  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_FormatCode(esl_opt_GetString(go, "--informat"));
    if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); 
  }

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  if      (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  else if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  else {
    status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
    if      (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile);
    else if (status == eslEFORMAT)    esl_fatal("Sequence file parse error, line %d of file %s:\n%s\n",
					       sqfp->linenumber, seqfile, sqfp->errbuf);
    else if (status == eslENODATA)    esl_fatal("Sequence file %s contains no data?", seqfile);
    else if (status != eslOK)         esl_fatal("Failed to guess alphabet (error code %d)\n", status);
  }
  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sqfile_SetDigital(sqfp, abc);

  if (do_comp) {
    ESL_ALLOC(monoc,     (abc->Kp) * sizeof(double));  
    ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double));  
    esl_vec_DSet(monoc_all, abc->Kp, 0.0);
    esl_vec_DSet(monoc,     abc->Kp, 0.0);
  }

  while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF)
    {
      if (wstatus == eslOK)
	{
	  if (do_comp) 
	    for (i = 1; i <= sq->n; i++) 
	      monoc[sq->dsq[i]]++;
	}
      else if (wstatus == eslEOD) 
	{			
	  if (nseq == 0) { small = large = sq->L; }
	  else {
	    small = ESL_MIN(small, sq->L);
	    large = ESL_MAX(large, sq->L);
	  }

	  if (esl_opt_GetBoolean(go, "-a")) {
	    printf("= %-20s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : "");
	  }

	  nres += sq->L;
	  nseq++;
	  esl_sq_Reuse(sq);
	  if (do_comp) {
	    esl_vec_DAdd(monoc_all, monoc, abc->Kp);
	    esl_vec_DSet(monoc, abc->Kp, 0.0);
	  }
	}
      else if (wstatus == eslEFORMAT)
	{
	  esl_fatal("Failed to parse sequence at line %ld, file %s:\n%s", 
		    (long) sqfp->linenumber, sqfp->filename, sqfp->errbuf);
	}
      else 
	esl_fatal("Failed in reading sequence:\n%s\n", sqfp->errbuf);
    }

  printf("Format:              %s\n",   esl_sqio_DescribeFormat(sqfp->format));
  printf("Alphabet type:       %s\n",   esl_abc_DescribeType(abc->type));
  printf("Number of sequences: %" PRId64 "\n", nseq);
  printf("Total # residues:    %" PRId64 "\n", nres);
  printf("Smallest:            %" PRId64 "\n", small);
  printf("Largest:             %" PRId64 "\n", large);
  printf("Average length:      %.1f\n", (float) nres / (float) nseq);

  if (do_comp) {
    printf("\nResidue composition:\n");
    for (x = 0; x < abc->Kp; x++)
      if (x < abc->K || monoc_all[x] > 0)
	printf("residue: %c   %10.0f  %.4f\n", abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
    free(monoc);
    free(monoc_all);
  }

  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  return status;
}
Exemple #17
0
/* Function:  p7_prior_CreateNucleicNew()
 * Incept:    TJW, Thu Nov 12 21:15:11 EST 2009 [Couch at home]
 *
 * Purpose:   Creates the default mixture Dirichlet prior for nucleotiden
 *            sequences.
 *
 *            The transition priors (match, insert, delete) are all
 *            single Dirichlets, originally trained by Graeme
 *            Mitchison in the mid-1990's. Notes have been lost, but
 *            we believe they were trained on an early version of
 *            Pfam.
 *
 *            The match emission prior is an eight-component mixture
 *            trained against a portion of the rmark dataset
 *
 *            The insert emission prior is a single Dirichlet with
 *            high $|\alpha|$, such that insert emission probabilities
 *            are essentially fixed by the prior, regardless of
 *            observed count data.
 *
 * Returns:   a pointer to the new <P7_PRIOR> structure.
 */
P7_PRIOR *
p7_prior_CreateNucleic(void)
{
  int status;
  P7_PRIOR *pri = NULL;
  int q;


  /* Match emission priors are trained on rmark database [Nawrocki 08]
   */

  /* Plus-1 Laplace prior
  int num_comp = 1;
  static double defmq[2] =  { 1.0  };
  static double defm[1][4] = {
         { 1.0, 1.0, 1.0, 1.0} //
  };
*/
/*
  int num_comp = 2;
  static double defmq[2] =  { 0.42,  0.58   };
  static double defm[2][4] = {
         { 0.94,   0.90,   0.89,   1.13}, //
         { 0.096,  0.078,  0.093,   0.089} //
  };
*/
/*
   //weird - but this performs marginally better than the best 2- 5- or 8-component mixtures tested
  // (on rmark - MER: 2 better than 5/8-comp  , 3 better than 2-comp )
   int num_comp = 4;
   static double defmq[4] = { 0.16, 0.29, 0.12, 0.43  };
   static double defm[4][4] = {
         { 0.36,   0.10,   5.3,   0.13}, // G
         { 0.05,  0.18,  0.03,   0.19}, // CT
         { 7.1,  0.13,  0.35,   0.17}, // A
         { 0.96,  0.92,  0.91,   1.19} // uniform
	};
 */

   /*On rmark, this model does only slightly better than the 2-component model
     It's chosen as the default on grounds of reasonableness, given that it shows
     a non-uniform transition:transversion ratio. It's based on the results
     of training against a portion of rmark, but the overspecified numbers
     resulting from that training have been rounded/simplified.
   */
   int num_comp = 5;
   static double defmq[5] = { 0.16, 0.13, 0.17, 0.15, 0.39 };
   static double defm[5][4] = {
         { 6.0,   0.2,  0.5,   0.2}, // A
         { 0.2,  8.0,  0.2,   0.5}, // C
         { 0.5,  0.2,  8.0,   0.2}, // G
         { 0.2,  0.5,  0.2,   4.0}, // T
         { 1.3,  1.2,  1.2,   1.4}   // uniform
	};


/* gives no improved performance in my hands over the 5-component model
  int num_comp = 8;
  static double defmq[8] = { 0.13, 0.08, 0.08, 0.13, 0.08, 0.08, 0.17, 0.25 } ;
  static double defm[8][4] = {
       { 4.0,   0.3,   0.5,   0.4}, // A
       { 0.3,   22.0,  0.3,   0.8}, // C
       { 1.0,   0.4,   28.0,  0.4}, // G
       { 0.5,   0.8,   0.3,   6.0}, // T
       { 1.8,   0.8,   6.0,   1.0}, // AG
       { 0.6,   6.0,   0.6,   2.4}, // CT
       { 0.03,  0.01,  0.02,  0.02}, // anything, but highly conserved
       { 2.0,   2.0,   2.0,   2.0}  // anything, not much conservation
       };
 */

  ESL_ALLOC(pri, sizeof(P7_PRIOR));
  pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL;


  pri->tm = esl_mixdchlet_Create(1, 3);  // match transitions; single component; 3 params
  pri->ti = esl_mixdchlet_Create(1, 2);  // insert transitions; single component; 2 params
  pri->td = esl_mixdchlet_Create(1, 2);  // delete transitions; single component; 2 params
  pri->em = esl_mixdchlet_Create(num_comp, 4); // match emissions; X component; 4 params
  pri->ei = esl_mixdchlet_Create(1, 4); // insert emissions; single component; 4 params


  if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR;

  /* Transition priors: roughly, learned from rmark benchmark - hand-beautified (trimming overspecified significant digits)
   */
  pri->tm->pq[0]       = 1.0;
  pri->tm->alpha[0][0] = 2.0; // TMM
  pri->tm->alpha[0][1] = 0.1; // TMI
  pri->tm->alpha[0][2] = 0.1; // TMD


  pri->ti->pq[0]       = 1.0;
  pri->ti->alpha[0][0] = 0.06; // TIM
  pri->ti->alpha[0][1] = 0.2; // TII

  pri->td->pq[0]       = 1.0;
  pri->td->alpha[0][0] = 0.1; // TDM
  pri->td->alpha[0][1] = 0.2; // TDD



  /* Match emission priors  */
  for (q = 0; q < num_comp; q++)
    {
      pri->em->pq[q] = defmq[q];
      esl_vec_DCopy(defm[q], 4, pri->em->alpha[q]);
    }


  /* Insert emission priors. Should that alphas be lower? higher?
   */
  pri->ei->pq[0] = 1.0;
  esl_vec_DSet(pri->ei->alpha[0], 4, 1.0);

  return pri;

 ERROR:
  if (pri != NULL) p7_prior_Destroy(pri);
  return NULL;
}
int
main(int argc, char **argv)
{

  int i,j;

  ESL_GETOPTS     *go = NULL;	/* command line processing                 */
  ESL_STOPWATCH   *w  = esl_stopwatch_Create();

  int              status;
  ESL_MSA      *msa         = NULL;
  FILE         *ofp         = NULL;    /* output file (default is stdout) */
  ESL_ALPHABET *abc         = NULL;    /* digital alphabet */

  char         *alifile;  /* name of the alignment file we're building HMMs from  */
  ESLX_MSAFILE *afp         = NULL;            /* open alifile  */
  int           infmt       = eslMSAFILE_UNKNOWN;    /* autodetect alignment format by default. */
  int           outfmt      = eslMSAFILE_STOCKHOLM;


  char         *postmsafile;  /* optional file to resave annotated, modified MSAs to  */
  FILE         *postmsafp = NULL;  /* open <postmsafile>, or NULL */

  int           mask_range_cnt = 0;
  uint32_t      mask_starts[100]; // over-the-top allocation.
  uint32_t      mask_ends[100];

  char         *rangestr;
  char         *range;


  int     *map = NULL; /* map[i]=j,  means model position i comes from column j of the alignment; 1..alen */

  int    keep_mm;

  /* Set processor specific flags */
  impl_Init();
  alifile     = NULL;
  postmsafile = NULL;

  /* Parse the command line
   */
  process_commandline(argc, argv, &go, &alifile, &postmsafile);
  keep_mm = esl_opt_IsUsed(go, "--apendmask");

  /* Initialize what we can in the config structure (without knowing the alphabet yet).
   * Fields controlled by masters are set up in usual_master() or mpi_master()
   * Fields used by workers are set up in mpi_worker()
   */
  ofp         = NULL;
  infmt         = eslMSAFILE_UNKNOWN;
  afp         = NULL;
  abc         = NULL;

  if (esl_opt_IsOn(go, "--informat")) {
    infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"));
    if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat"));
  }

  /* Determine output alignment file format */
  outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
  if (outfmt == eslMSAFILE_UNKNOWN)    p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat"));



  /* Parse the ranges */

  if (esl_opt_IsUsed(go, "--alirange")) {
    esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ;
  } else if (esl_opt_IsUsed(go, "--modelrange")) {
    esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ;
  } else if (esl_opt_IsUsed(go, "--model2ali")) {
    esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ;
  } else if (esl_opt_IsUsed(go, "--ali2model")) {
    esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ;
  } else {
    if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR;
  }

  while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) {
    status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt );
    if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range);
    if (status == eslFAIL)    esl_fatal("Failed to find <from> or <to> coord in %s", range);

    mask_range_cnt++;
  }


  /* Start timing. */
  esl_stopwatch_Start(w);


  /* Open files, set alphabet.
   *   afp       - open alignment file for input
   *   abc       - alphabet expected or guessed in ali file
   *   postmsafp - open MSA output file
   *   ofp       - optional open output file, or stdout
   */
  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);
  else                                          abc = NULL;
  
  status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) {
    postmsafp = fopen(postmsafile, "w");
    if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile);
  }

  if (esl_opt_IsUsed(go, "-o")) 
    {
      ofp = fopen(esl_opt_GetString(go, "-o"), "w");
      if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
    } 
  else ofp = stdout;


  /* Looks like the i/o is set up successfully...
   * Initial output to the user
   */
  output_header(go, ofp, alifile, postmsafile);                                  /* cheery output header                                */

  /* read the alignment */
  if ((status = eslx_msafile_Read(afp, &msa)) != eslOK)  eslx_msafile_ReadFailure(afp, status);


  if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) {
    /* add/modify mmline for the mask */
    if (msa->mm == NULL) {
      ESL_ALLOC(msa->mm, msa->alen);
      keep_mm = FALSE;
    }

    if (!keep_mm)
      for (i=0; i<msa->alen; i++) msa->mm[i] = '.';

  }

  // convert model coordinates to alignment coordinates, if necessary
  if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) {

    float symfrac = esl_opt_GetReal(go, "--symfrac");
    int do_hand  =  esl_opt_IsOn(go, "--hand");
    int L;

    //same as p7_builder relative_weights
    if      (esl_opt_IsOn(go, "--wnone")  )                  { esl_vec_DSet(msa->wgt, msa->nseq, 1.); }
    else if (esl_opt_IsOn(go, "--wgiven") )                  ;
    else if (esl_opt_IsOn(go, "--wpb")    )                  status = esl_msaweight_PB(msa);
    else if (esl_opt_IsOn(go, "--wgsc")   )                  status = esl_msaweight_GSC(msa);
    else if (esl_opt_IsOn(go, "--wblosum"))                  status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid"));

    if ((status =  esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh")))           != eslOK) goto ERROR;

    //build a map of model mask coordinates to alignment coords
    ESL_ALLOC(map, sizeof(int)     * (msa->alen+1));
    L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map );


    if ( esl_opt_IsUsed(go, "--model2ali") ) {
      //print mapping
      printf ("model coordinates     alignment coordinates\n");
      for (i=0; i<mask_range_cnt; i++)
        printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]);
      /* If I wanted to, I could print all the map values independently:
        printf("\n\n-----------\n");
        printf("Map\n");
        printf("---\n");
        for (i=0; i<L; i++)
          printf("%d -> %d\n", i+1, map[i]);
      */
    } else if ( esl_opt_IsUsed(go, "--ali2model") ) {
      //print mapping  (requires scanning the inverted map
      int alistart = 0;
      int aliend = 0;
      printf ("alignment coordinates     model coordinates\n");
      for (i=0; i<mask_range_cnt; i++) {
        //find j for ali positions
        while (map[alistart] < mask_starts[i] )
          alistart++;
        aliend = alistart;
        while (map[aliend] < mask_ends[i] )
          aliend++;

        printf ("   %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1);
      }
    } else {
      //convert the mask coords based on map
      for (i=0; i<mask_range_cnt; i++) {
          mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment
          mask_ends[i]   = map[mask_ends[i]-1];
      }
    }
  }

  if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) {
    //overwrite '.' with 'm' everywhere the range says to do it
    for (i=0; i<mask_range_cnt; i++)
      for (j=mask_starts[i]; j<=mask_ends[i]; j++)
        msa->mm[j-1] = 'm';

    if ((status = eslx_msafile_Write(postmsafp, msa, outfmt))  != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
  }

  esl_stopwatch_Stop(w);

  if (esl_opt_IsOn(go, "-o"))  fclose(ofp);
  if (postmsafp) fclose(postmsafp);
  if (afp)   eslx_msafile_Close(afp);
  if (abc)   esl_alphabet_Destroy(abc);

  esl_getopts_Destroy(go);
  esl_stopwatch_Destroy(w);
  return 0;


  ERROR:
   return eslFAIL;
}
Exemple #19
0
/* annotate_posterior_probability()
 * Synopsis:  Add posterior probability annotation lines to new MSA.
 */
static int
annotate_posterior_probability(ESL_MSA *msa, P7_TRACE **tr, const int *matmap, int M, int optflags)
{
  double *totp   = NULL;	/* total posterior probability in column <apos>: [0..alen-1] */
  int    *matuse = NULL;	/* #seqs with pp annotation in column <apos>: [0..alen-1] */
  int     idx;    		/* counter over sequences [0..nseq-1] */
  int     apos;			/* counter for alignment columns: pp's are [0..alen-1] (unlike ax) */
  int     z;			/* counter over trace positions [0..tr->N-1] */
  int     status;

  /* Determine if any of the traces have posterior probability annotation. */
  for (idx = 0; idx < msa->nseq; idx++)
    if (tr[idx]->pp != NULL) break;
  if (idx == msa->nseq) return eslOK;

  ESL_ALLOC(matuse, sizeof(double) * (msa->alen)); esl_vec_ISet(matuse, msa->alen, 0);
  ESL_ALLOC(totp,   sizeof(double) * (msa->alen)); esl_vec_DSet(totp,   msa->alen, 0.0);

  ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      if (tr[idx]->pp == NULL) { msa->pp[idx] = NULL; continue; }

      ESL_ALLOC(msa->pp[idx], sizeof(char) * (msa->alen+1));
      for (apos = 0; apos < msa->alen; apos++) msa->pp[idx][apos] = '.';
      msa->pp[idx][msa->alen] = '\0';

      apos = 0;
      for (z = 0; z < tr[idx]->N; z++)
	{
	  switch (tr[idx]->st[z]) {
	  case p7T_M: 
	    msa->pp[idx][matmap[tr[idx]->k[z]]-1] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);  
	    totp  [matmap[tr[idx]->k[z]]-1]+= tr[idx]->pp[z];
	    matuse[matmap[tr[idx]->k[z]]-1]++;
	  case p7T_D:
	    apos = matmap[tr[idx]->k[z]]; 
	    break;

	  case p7T_I:
	    if ( !(optflags & p7_TRIM) || (tr[idx]->k[z] != 0 && tr[idx]->k[z] != M)) {
	      msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);  
	      apos++;
	    }
	    break;

	  case p7T_N:
	  case p7T_C:
	    if (! (optflags & p7_TRIM) && tr[idx]->i[z] > 0) {
	      msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
	      apos++;
	    }
	    break;

	  case p7T_E:
	    apos = matmap[M];	/* set position for C-terminal tail */
	    break;
  
	  default:
	    break;
	  }
	}
    }
  for (; idx < msa->sqalloc; idx++) msa->pp[idx] = NULL; /* for completeness, following easel MSA conventions, but should be a no-op: nseq==sqalloc */

  /* Consensus posterior probability annotation: only on match columns */
  ESL_ALLOC(msa->pp_cons, sizeof(char) * (msa->alen+1));
  for (apos = 0; apos < msa->alen; apos++) msa->pp_cons[apos] = '.';
  msa->pp_cons[msa->alen] = '\0';
  for (apos = 0; apos < msa->alen; apos++)
    if (matuse[apos]) msa->pp_cons[apos] = p7_alidisplay_EncodePostProb( totp[apos] / (double) matuse[apos]);
  
  free(matuse);
  free(totp);
  return eslOK;

 ERROR:
  if (matuse  != NULL) free(matuse);
  if (totp    != NULL) free(totp);  
  if (msa->pp != NULL) esl_Free2D((void **) msa->pp, msa->sqalloc);
  return status;
}