/* Function: default_nucleic_prior()
 *
 * Purpose:  Build the built-in DNA/RNA prior: a single Dirichlet
 *           component for each of transitions, match emissions and
 *           insert emissions. Emission terms are all 1.0 (plus-one,
 *           i.e. "almost a Laplace"); only the transition terms differ
 *           from a plain Laplace prior.
 *
 * Args:     (void)
 *
 * Return:   the prior obtained from P7AllocPrior(), filled in.
 */
static struct p7prior_s *
default_nucleic_prior(void)
{
  struct p7prior_s *prior = P7AllocPrior();
  float            *trans = prior->t[0];   /* the one transition component */

  prior->strategy = PRI_DCHLET;

  /* Emissions: one flat component for match states, one for inserts;
   * every one of the Alphabet_size terms is set to 1.0.
   */
  prior->mnum  = 1;
  prior->mq[0] = 1.;
  FSet(prior->m[0], Alphabet_size, 1.);

  prior->inum  = 1;
  prior->iq[0] = 1.;
  FSet(prior->i[0], Alphabet_size, 1.);

  /* Transitions: these are the Pfam-trained *amino acid* terms, which
   * is admittedly bogus for nucleic acid models. They are kept because
   * they behave better than a flat Laplace, especially for
   * Maxmodelmaker(): a Laplace prior builds an M=1 model for the single
   * sequence GAATTC (once an open "bug").
   */
  prior->tnum  = 1;
  prior->tq[0] = 1.;
  trans[TMM]   = 0.7939;
  trans[TMI]   = 0.0278;
  trans[TMD]   = 0.0135;
  trans[TIM]   = 0.1551;
  trans[TII]   = 0.1331;
  trans[TDM]   = 0.9002;
  trans[TDD]   = 0.5630;

  return prior;
}
/* Example #2
 * 0
 */
/* Function: default_amino_prior()
 *
 * Purpose:  Allocate a prior and fill in only its match emission
 *           mixture: 5 components, each with a mixture coefficient and
 *           the first 6 Dirichlet terms. No other field of the prior
 *           (strategy, transition or insert terms) is assigned here.
 *
 * Args:     (void)
 *
 * Return:   the prior obtained from P7AllocPrior().
 */
struct p7prior_s *
default_amino_prior(void)
{
  enum { N_COMPONENTS = 5, N_TERMS = 6 };

  /* mixture coefficient of each match component */
  static const float coefficient[N_COMPONENTS] = {
    0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977
  };
  /* Dirichlet terms, one row per match component */
  static const float term[N_COMPONENTS][N_TERMS] = {
    { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268, 0.216147 },
    { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651, 0.029156 },
    { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364, 0.583402 },
    { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162, 0.073732 },
    { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602, 0.012049 }
  };

  struct p7prior_s *prior = P7AllocPrior();
  int               comp;
  int               sym;

  prior->mnum = N_COMPONENTS;
  for (comp = 0; comp < N_COMPONENTS; comp++) {
    prior->mq[comp] = coefficient[comp];
    for (sym = 0; sym < N_TERMS; sym++) {
      prior->m[comp][sym] = term[comp][sym];
    }
  }

  return prior;
}
/* Function: P7LaplacePrior()
 * 
 * Purpose:  Create a Laplace plus-one prior: a single Dirichlet component
 *           for transitions, match emissions and insert emissions, with
 *           every term set to 1.0.
 *           Global alphabet info (Alphabet_size) is assumed to have been
 *           set already.
 *
 * Args:     (void)
 *
 * Return:   prior. Allocated here with P7AllocPrior(); the caller must
 *           free it (the original note names FreePrior() -- confirm the
 *           exact routine).
 */ 
struct p7prior_s *
P7LaplacePrior(void)
{
  struct p7prior_s *pri;
  
  pri = P7AllocPrior();
  pri->strategy = PRI_DCHLET;

  /* A transition component carries 7 terms (TMM, TMI, TMD, TIM, TII,
   * TDM, TDD): that is what the default priors assign and what
   * P7ReadPrior() reads. The count used to be 8 here, which touches one
   * element more than any other routine in this file.
   */
  pri->tnum     = 1;
  pri->tq[0]    = 1.;
  FSet(pri->t[0], 7, 1.); 
  
  pri->mnum  = 1;
  pri->mq[0] = 1.;
  FSet(pri->m[0], Alphabet_size, 1.);

  pri->inum  = 1;
  pri->iq[0] = 1.;
  FSet(pri->i[0], Alphabet_size, 1.);

  return pri;
}
/* Function: read_prior_word()
 *
 * Purpose:  Fetch the next field of a prior file with Getword() and stop
 *           with a diagnostic rather than return a NULL pointer, so the
 *           caller can pass the result straight to strcmp()/atoi()/atof().
 *           NOTE(review): assumes Getword() reports end-of-file or a
 *           field of the wrong type by returning NULL -- confirm against
 *           its definition. If it never returns NULL the check is inert.
 *
 * Args:     fp      - open prior file
 *           type    - expected field type (one of the sqdARG_* codes)
 *           prifile - file name, used only in the error message
 *
 * Return:   the non-NULL word returned by Getword().
 */
static char *
read_prior_word(FILE *fp, int type, char *prifile)
{
  char *word;

  word = Getword(fp, type);
  if (word == NULL)
    Die("Premature end of data or malformed field in prior file %s", prifile);
  return word;
}

/* Function: P7ReadPrior()
 * 
 * Purpose:  Input a prior from disk file.
 *
 *           Fields are read in this order:
 *             strategy word  (only "DIRICHLET" is accepted)
 *             alphabet word  ("AMINO" or "NUCLEIC"; must agree with the
 *                             global Alphabet_type)
 *             transition mixture, match emission mixture, insert
 *             emission mixture -- each a component count followed, per
 *             component, by its coefficient and its Dirichlet terms.
 *           Both words are compared case-insensitively.
 *
 * Args:     prifile - name of the prior file to open and parse
 *
 * Return:   the prior, allocated here with P7AllocPrior().
 *           Any open or parse failure is reported through Die().
 */
struct p7prior_s *
P7ReadPrior(char *prifile) 
{
  FILE             *fp;
  struct p7prior_s *pri;
  char             *sptr;
  int               q, x;

  if ((fp = fopen(prifile, "r")) == NULL)
    Die("Failed to open HMMER prior file %s\n", prifile);
  pri = P7AllocPrior();

  /* First entry is the strategy: 
   * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far
   */
  sptr = read_prior_word(fp, sqdARG_STRING, prifile);
  s2upper(sptr);
  if      (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET;
  else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile);

  /* Second entry is the alphabet type:
   * Amino or Nucleic
   */
  sptr = read_prior_word(fp, sqdARG_STRING, prifile);
  s2upper(sptr);
  if (strcmp(sptr, "AMINO") == 0)
    { 
      if (Alphabet_type != hmmAMINO)
        Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile);
    }
  else if (strcmp(sptr, "NUCLEIC") == 0)
    {
      if (Alphabet_type != hmmNUCLEIC)
        Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile);
    }
  else 
    Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile);

  /* State transition priors:
   * # of mixtures.
   * then for each mixture:
   *    prior P(q)
   *    7 Dirichlet terms (the slots named TMM, TMI, TMD, TIM, TII, TDM,
   *    TDD elsewhere in this file)
   */
  pri->tnum = atoi(read_prior_word(fp, sqdARG_INT, prifile));
  if (pri->tnum < 1)            /* zero components is as unusable as a negative count */
    Die("%d is bad; need at least one state transition mixture component", pri->tnum);
  if (pri->tnum > MAXDCHLET)    /* both %d now have a matching argument */
    Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", pri->tnum, MAXDCHLET);
  for (q = 0; q < pri->tnum; q++)
    {
      pri->tq[q]    = (float) atof(read_prior_word(fp, sqdARG_FLOAT, prifile));
      for (x = 0; x < 7; x++) 
        pri->t[q][x] = (float) atof(read_prior_word(fp, sqdARG_FLOAT, prifile));
    }

  /* Match emission priors:
   * # of mixtures.
   * then for each mixture:
   *    prior P(q)
   *    Dirichlet terms for Alphabet_size symbols in Alphabet
   */
  pri->mnum = atoi(read_prior_word(fp, sqdARG_INT, prifile));
  if (pri->mnum < 1)
    Die("%d is bad; need at least one match emission mixture component", pri->mnum);
  if (pri->mnum > MAXDCHLET)
    Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET);

  for (q = 0; q < pri->mnum; q++)
    {
      pri->mq[q] = (float) atof(read_prior_word(fp, sqdARG_FLOAT, prifile));
      for (x = 0; x < Alphabet_size; x++) 
        pri->m[q][x] = (float) atof(read_prior_word(fp, sqdARG_FLOAT, prifile));
    }
  
  /* Insert emission priors:
   * # of mixtures.
   * then for each mixture component:
   *    prior P(q)
   *    Dirichlet terms for Alphabet_size symbols in Alphabet
   */
  pri->inum = atoi(read_prior_word(fp, sqdARG_INT, prifile));
  if (pri->inum < 1)
    Die("%d is bad; need at least one insert emission mixture component", pri->inum);
  if (pri->inum > MAXDCHLET)
    Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum,  MAXDCHLET);
  for (q = 0; q < pri->inum; q++)
    {
      pri->iq[q]  = (float) atof(read_prior_word(fp, sqdARG_FLOAT, prifile));
      for (x = 0; x < Alphabet_size; x++) 
        pri->i[q][x] = (float) atof(read_prior_word(fp, sqdARG_FLOAT, prifile));
    }

  fclose(fp);
  return pri;
}
/* Function: default_amino_prior()
 * 
 * Purpose:  Build the built-in protein prior: one transition component,
 *           a 9-component Dirichlet mixture for match emissions, and one
 *           insert emission component, all over 20 residues.
 *
 * Args:     (void)
 *
 * Return:   the prior obtained from P7AllocPrior(), filled in.
 */
static struct p7prior_s *
default_amino_prior(void)
{
  enum { N_MATCH_MIX = 9, N_RESIDUES = 20 };

  /* Match mixture coefficients, one per component. */
  static const float match_coef[N_MATCH_MIX] = {
    0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977, 
    0.0904123, 0.114468, 0.0682132, 0.234585
  };

  /* Match mixture Dirichlet terms, one row per component. */
  static const float match_term[N_MATCH_MIX][N_RESIDUES] = {
    { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268, 
      0.131916, 0.012391, 0.022599, 0.020358, 0.030727, 
      0.015315, 0.048298, 0.053803, 0.020662, 0.023612,
      0.216147, 0.147226, 0.065438, 0.003758, 0.009621 },
    { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651, 
      0.016416, 0.076196, 0.035329, 0.013921, 0.093517, 
      0.022034, 0.028593, 0.013086, 0.023011, 0.018866, 
      0.029156, 0.018153, 0.036100, 0.071770, 0.419641 },
    { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364,
      0.259114, 0.214940, 0.145928, 0.762204, 0.247320,
      0.118662, 0.441564, 0.174822, 0.530840, 0.465529, 
      0.583402, 0.445586, 0.227050, 0.029510, 0.121090 },
    { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162, 
      0.048038, 0.077000, 0.032939, 0.576639, 0.072293, 
      0.028240, 0.080372, 0.037661, 0.185037, 0.506783, 
      0.073732, 0.071587, 0.042532, 0.011254, 0.028723 },
    { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602, 
      0.007797, 0.007175, 0.299635, 0.010849, 0.999446, 
      0.210189, 0.006127, 0.013021, 0.019798, 0.014509, 
      0.012049, 0.035799, 0.180085, 0.012744, 0.026466 },
    { 0.115607, 0.037381, 0.012414, 0.018179, 0.051778, 
      0.017255, 0.004911, 0.796882, 0.017074, 0.285858, 
      0.075811, 0.014548, 0.015092, 0.011382, 0.012696, 
      0.027535, 0.088333, 0.944340, 0.004373, 0.016741 },
    { 0.093461, 0.004737, 0.387252, 0.347841, 0.010822, 
      0.105877, 0.049776, 0.014963, 0.094276, 0.027761, 
      0.010040, 0.187869, 0.050018, 0.110039, 0.038668, 
      0.119471, 0.065802, 0.025430, 0.003215, 0.018742 },
    { 0.452171, 0.114613, 0.062460, 0.115702, 0.284246,
      0.140204, 0.100358, 0.550230, 0.143995, 0.700649, 
      0.276580, 0.118569, 0.097470, 0.126673, 0.143634, 
      0.278983, 0.358482, 0.661750, 0.061533, 0.199373 },
    { 0.005193, 0.004039, 0.006722, 0.006121, 0.003468, 
      0.016931, 0.003647, 0.002184, 0.005019, 0.005990, 
      0.001473, 0.004158, 0.009055, 0.003630, 0.006583, 
      0.003172, 0.003690, 0.002967, 0.002772, 0.002686 }
  };

  /* Insert emission terms, indexed in the residue order
   * A C D E F G H I K L M N P Q R S T V W Y.
   */
  static const float insert_term[N_RESIDUES] = {
    681., 120., 623., 651., 313.,    /* A C D E F */
    902., 241., 371., 687., 676.,    /* G H I K L */
    143., 548., 647., 415., 551.,    /* M N P Q R */
    926., 623., 505., 102., 269.     /* S T V W Y */
  };

  struct p7prior_s *prior = P7AllocPrior();
  float            *trans = prior->t[0];   /* the one transition component */
  int               comp;
  int               res;

  prior->strategy = PRI_DCHLET;

  /* Transition priors are subjective, but borrowed from GJM's
   * estimations on Pfam.
   */
  prior->tnum  = 1;
  prior->tq[0] = 1.0;
  trans[TMM]   = 0.7939;
  trans[TMI]   = 0.0278;        /* Markus suggests: ~10x MD: ~0.036: test!  */
  trans[TMD]   = 0.0135;        /* Markus suggests: ~0.1x MI: ~0.004 */
  trans[TIM]   = 0.1551;
  trans[TII]   = 0.1331;
  trans[TDM]   = 0.9002;
  trans[TDD]   = 0.5630;

  /* Match emission priors are a mixture Dirichlet,
   * from Kimmen Sjolander (Blocks9).
   */
  prior->mnum = N_MATCH_MIX;
  for (comp = 0; comp < N_MATCH_MIX; comp++) {
    prior->mq[comp] = match_coef[comp];
    for (res = 0; res < N_RESIDUES; res++) {
      prior->m[comp][res] = match_term[comp][res];
    }
  }

  /* These insert emission priors are subjective. Observed frequencies
   * were obtained from PFAM 1.0, 10 Nov 96; 
   *      see ~/projects/plan7/InsertStatistics.
   * Inserts are slightly biased towards polar residues and away from
   * hydrophobic residues.
   */
  prior->inum  = 1;
  prior->iq[0] = 1.;
  for (res = 0; res < N_RESIDUES; res++) {
    prior->i[0][res] = insert_term[res];
  }

  return prior;
}