Esempio n. 1
0
/* utest_ReadWrite()
 *
 * Round-trip test of p7_bg_Write() and p7_bg_Read(): sample a random
 * frequency vector for a randomly chosen alphabet, write the background
 * to a temporary file, wipe the in-memory frequencies, read the file
 * back, and require the recovered frequencies to match the sampled
 * ones within a tolerance of 0.01. Any failure is fatal.
 */
static void
utest_ReadWrite(ESL_RANDOMNESS *rng)
{
  char          failmsg[]   = "bg Read/Write unit test failed";
  char          tmpname[32] = "esltmpXXXXXX";
  FILE         *tmpfp       = NULL;
  ESL_ALPHABET *abc         = NULL;
  float        *expect      = NULL;   /* the frequencies we sampled and expect to read back */
  P7_BG        *bg          = NULL;
  int           abctype     = esl_rnd_Roll(rng, 5) + 1;   /* random alphabet choice eslRNA..eslDICE */

  abc = esl_alphabet_Create(abctype);
  if (abc == NULL) esl_fatal(failmsg);

  bg = p7_bg_Create(abc);
  if (bg == NULL) esl_fatal(failmsg);

  expect = malloc(sizeof(float) * abc->K);
  if (expect == NULL) esl_fatal(failmsg);

  /* Resample until every frequency is at least 0.001:
   * small p's will get rounded off and fail FCompare().
   */
  for (;;)
    {
      if (esl_dirichlet_FSampleUniform(rng, abc->K, expect) != eslOK) esl_fatal(failmsg);
      if (! (esl_vec_FMin(expect, abc->K) < 0.001)) break;
    }
  esl_vec_FCopy(expect, abc->K, bg->f);

  /* Save the background to a named temporary file. */
  if (esl_tmpfile_named(tmpname, &tmpfp) != eslOK) esl_fatal(failmsg);
  if (p7_bg_Write(tmpfp, bg)             != eslOK) esl_fatal(failmsg);
  fclose(tmpfp);

  /* Zero the frequencies so that a successful compare proves they were re-read. */
  esl_vec_FSet(bg->f, bg->abc->K, 0.0);
  if (p7_bg_Read(tmpname, bg, NULL) != eslOK) esl_fatal(failmsg);
  if (esl_vec_FCompare(expect, bg->f, bg->abc->K, 0.01) != eslOK) esl_fatal(failmsg);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  free(expect);
  remove(tmpname);
}
Esempio n. 2
0
/* Function:  p7_Seqmodel()
 * Synopsis:  Make a profile HMM from a single sequence.
 *
 * Purpose:   Make a profile HMM from a single sequence, for
 *            probabilistic Smith/Waterman alignment, HMMER3-style.
 *            
 *            The query is digital sequence <dsq> of length <M>
 *            residues in alphabet <abc>, named <name>. 
 *            
 *            The scoring system is given by <Q>, <f>, <popen>, and
 *            <pextend>. <Q> is a $K \times K$ matrix giving
 *            conditional residue probabilities $P(a \mid b)$; these
 *            are typically obtained by reverse engineering a score
 *            matrix like BLOSUM62. <f> is a vector of $K$ background
 *            frequencies $p_a$. <popen> and <pextend> are the
 *            probabilities assigned to gap-open ($t_{MI}$ and
 *            $t_{MD}$) and gap-extend ($t_{II}$ and $t_{DD}$)
 *            transitions.
 *            
 *            The <p7H_SINGLE> flag is set on the <hmm>. Model
 *            configuration (<p7_profile_Config()>, friends) detects
 *            this flag. <B->Mk> entry transitions include a match
 *            state occupancy term for profile HMMs, but for single
 *            queries, that <occ[]> term is assumed 1.0 for all
 *            positions. See commentary in modelconfig.c.
 *            
 * Args:      abc     - alphabet of the query; <abc->K> sizes the emission vectors
 *            dsq     - digital query sequence; residues <dsq[1..M]> are used
 *            M       - length of the query, and of the resulting model
 *            name    - name to assign to the new HMM
 *            Q       - conditional residue probabilities; row <dsq[k]> becomes
 *                      the match emission vector of node <k>
 *            f       - background frequencies, copied into every insert emission vector
 *            popen   - gap-open probability
 *            pextend - gap-extend probability
 *            ret_hmm - RETURN: the newly allocated HMM
 *
 * Returns:   <eslOK> on success, and a newly allocated HMM is returned
 *            in <ret_hmm>. 
 *
 * Throws:    <eslEMEM> on allocation error, and <*ret_hmm> is <NULL>.
 *            Any non-<eslOK> status from setting the name, command log
 *            or creation time is propagated the same way.
 */
int
p7_Seqmodel(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int M, char *name,
	    ESL_DMATRIX *Q, float *f, double popen, double pextend,
	    P7_HMM **ret_hmm)
{
  int     status;
  P7_HMM *hmm    = NULL;
  char   *logmsg = "[HMM created from a query sequence]";
  int     k;

  if ((hmm = p7_hmm_Create(M, abc)) == NULL) { status = eslEMEM; goto ERROR; }
  
  for (k = 0; k <= M; k++)
    {
      /* Use rows of P matrix as source of match emission vectors.
       * Node 0 has no match state, hence the k > 0 guard.
       */
      if (k > 0) esl_vec_D2F(Q->mx[(int) dsq[k]], abc->K, hmm->mat[k]);

      /* Set inserts to background for now. This will be improved. */
      esl_vec_FCopy(f, abc->K, hmm->ins[k]);

      hmm->t[k][p7H_MM] = 1.0 - 2 * popen;
      hmm->t[k][p7H_MI] = popen;
      hmm->t[k][p7H_MD] = popen;
      hmm->t[k][p7H_IM] = 1.0 - pextend;
      hmm->t[k][p7H_II] = pextend;
      hmm->t[k][p7H_DM] = 1.0 - pextend;
      hmm->t[k][p7H_DD] = pextend;
    }

  /* Deal w/ special stuff at node M, overwriting a little of what we
   * just did: there is no delete state to move to after the last node.
   */
  hmm->t[M][p7H_MM] = 1.0 - popen;
  hmm->t[M][p7H_MD] = 0.;
  hmm->t[M][p7H_DM] = 1.0;
  hmm->t[M][p7H_DD] = 0.;
  
  /* Add mandatory annotation. Each setter can fail; do not hand back
   * a half-annotated model with an <eslOK> status.
   */
  if ((status = p7_hmm_SetName(hmm, name))            != eslOK) goto ERROR;
  if ((status = p7_hmm_AppendComlog(hmm, 1, &logmsg)) != eslOK) goto ERROR;
  hmm->nseq     = 1;
  if ((status = p7_hmm_SetCtime(hmm))                 != eslOK) goto ERROR;
  hmm->checksum = 0;

  hmm->flags |= p7H_SINGLE;
  *ret_hmm = hmm;
  return eslOK;
  
 ERROR:
  if (hmm != NULL) p7_hmm_Destroy(hmm);
  *ret_hmm = NULL;
  return status;
}
Esempio n. 3
0
/* Function:  p7_profile_Copy()
 * Synopsis:  Copy a profile.
 *
 * Purpose:   Copies profile <src> to profile <dst>, where <dst>
 *            has already been allocated to be of sufficient size,
 *            and has the same alphabet.
 *
 *            Scores, length configuration, annotation strings,
 *            statistical parameters, and disk offsets are all copied.
 *            Any <name>, <acc>, <desc> strings previously held by
 *            <dst> are freed and replaced by duplicates of those in
 *            <src>.
 *
 * Args:      src - profile to copy from
 *            dst - preallocated profile to copy into; needs <dst->allocM >= src->M>
 *
 * Returns:   <eslOK> on success.
 * 
 * Throws:    <eslEMEM> on allocation error; <eslEINVAL> if <dst> is too small 
 *            to fit <src> or is for a different alphabet. If duplicating
 *            an annotation string fails, the status from <esl_strdup()> is
 *            returned; <dst> is then only partially copied, but its
 *            <name>, <acc>, <desc> fields are each either valid or <NULL>.
 */
int
p7_profile_Copy(const P7_PROFILE *src, P7_PROFILE *dst)
{
  int x,z;
  int status;

  if (src->M         >   dst->allocM)   ESL_EXCEPTION(eslEINVAL, "destination profile is too small to hold a copy of source profile");
  if (src->abc->type != dst->abc->type) ESL_EXCEPTION(eslEINVAL, "destination profile has different alphabet than source");

  dst->M = src->M;
  esl_vec_FCopy(src->tsc, (src->M+1)*p7P_NTRANS, dst->tsc);
  for (x = 0; x < src->abc->Kp;   x++) esl_vec_FCopy(src->rsc[x], (src->M+1)*p7P_NR, dst->rsc[x]);
  for (x = 0; x < p7P_NXSTATES;   x++) esl_vec_FCopy(src->xsc[x], p7P_NXTRANS,       dst->xsc[x]);

  dst->L           = src->L;
  dst->nj          = src->nj;
  dst->pglocal     = src->pglocal;

  /* Release the old annotation and clear the pointers right away, so
   * that an early return below cannot leave <dst> holding freed memory.
   */
  free(dst->name);  dst->name = NULL;
  free(dst->acc);   dst->acc  = NULL;
  free(dst->desc);  dst->desc = NULL;
  if ((status = esl_strdup(src->name, -1, &(dst->name)))      != eslOK) return status; 
  if ((status = esl_strdup(src->acc,  -1, &(dst->acc)))       != eslOK) return status; 
  if ((status = esl_strdup(src->desc, -1, &(dst->desc)))      != eslOK) return status; 

  strcpy(dst->rf,        src->rf);         /* RF is optional: if it's not set, *rf=0, and strcpy still works fine */
  strcpy(dst->mm,        src->mm);         /* MM is also optional annotation */
  strcpy(dst->cs,        src->cs);         /* CS is also optional annotation */
  strcpy(dst->consensus, src->consensus);  /* consensus though is always present on a valid profile */

  for (z = 0; z < p7_NEVPARAM; z++) dst->evparam[z] = src->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) dst->cutoff[z]  = src->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) dst->compo[z]   = src->compo[z];

  for (x = 0; x < p7_NOFFSETS; ++x) dst->offs[x] = src->offs[x];
  dst->roff        = src->roff;
  dst->eoff        = src->eoff;

  dst->max_length  = src->max_length;
  return eslOK;
}
Esempio n. 4
0
/* Function:  cp9_Copy()
 * Synopsis:  Copy a CM plan 9 HMM.
 *
 * Purpose:   Copy the contents of cp9 HMM <src> into the already
 *            allocated cp9 HMM <dst>. The two models must have the
 *            same number of nodes <M>.
 *
 *            Probability parameters (transitions, match and insert
 *            emissions, begin/end, null model) are always copied;
 *            no check is made that <src> is normalized. Integer bit
 *            scores are copied only when <src->flags> has
 *            <CPLAN9_HASBITS> set. The alphabet pointer is shared
 *            with <src>, not duplicated.
 *
 *            For each position with a nonzero <el_from_ct>, fresh
 *            <el_from_idx> and <el_from_cmnd> arrays are allocated
 *            in <dst>.
 *            NOTE(review): any arrays <dst> already held in those
 *            slots are overwritten without being freed -- confirm
 *            that callers pass a <dst> whose slots are empty.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <src->M != dst->M>; the status set by
 *            <ESL_ALLOC()> on allocation failure.
 */
int
cp9_Copy(const CP9_t *src, CP9_t *dst)
{
    int       status;
    int       pos;
    const int M        = src->M;
    const int has_bits = ((src->flags & CPLAN9_HASBITS) != 0);

    if (M != dst->M) return eslEINVAL;

    dst->abc = src->abc;

    /* per-node probability parameters */
    for (pos = 0; pos <= M; pos++) {
        esl_vec_FCopy(src->t[pos],   cp9_NTRANS,  dst->t[pos]);
        esl_vec_FCopy(src->mat[pos], src->abc->K, dst->mat[pos]);
        esl_vec_FCopy(src->ins[pos], src->abc->K, dst->ins[pos]);
    }
    esl_vec_FCopy(src->begin, M+1, dst->begin);
    esl_vec_FCopy(src->end,   M+1, dst->end);

    /* integer scores, valid only if <src> is logoddsified */
    if (has_bits) {
        esl_vec_ICopy(src->bsc_mem, M+1, dst->bsc_mem);
        esl_vec_ICopy(src->esc_mem, M+1, dst->esc_mem);

        /* exploit linear-memory of these 2d arrays */
        esl_vec_ICopy(src->tsc_mem, cp9_NTRANS   * (M+1), dst->tsc_mem);
        esl_vec_ICopy(src->msc_mem, src->abc->Kp * (M+1), dst->msc_mem);
        esl_vec_ICopy(src->isc_mem, src->abc->Kp * (M+1), dst->isc_mem);
        esl_vec_ICopy(src->otsc,    cp9O_NTRANS  * (M+1), dst->otsc);
    }

    /* EL info */
    dst->el_self   = src->el_self;
    dst->el_selfsc = src->el_selfsc;
    esl_vec_ICopy(src->has_el,     M+1, dst->has_el);
    esl_vec_ICopy(src->el_from_ct, M+2, dst->el_from_ct);
    for (pos = 0; pos <= M+1; pos++) {
        if (src->el_from_ct[pos] <= 0) continue;   /* nothing to copy for this position */

        ESL_ALLOC(dst->el_from_idx[pos],  sizeof(int) * src->el_from_ct[pos]);
        ESL_ALLOC(dst->el_from_cmnd[pos], sizeof(int) * src->el_from_ct[pos]);
        esl_vec_ICopy(src->el_from_idx[pos],  src->el_from_ct[pos], dst->el_from_idx[pos]);
        esl_vec_ICopy(src->el_from_cmnd[pos], src->el_from_ct[pos], dst->el_from_cmnd[pos]);
    }

    /* null model parameters */
    dst->null2_omega = src->null2_omega;
    dst->null3_omega = src->null3_omega;
    esl_vec_FCopy(src->null, src->abc->K, dst->null);

    dst->p1    = src->p1;
    dst->flags = src->flags;

    return eslOK;

ERROR:
    return status;
}
Esempio n. 5
0
/* Function:  p7_bg_SetFilter()
 * Synopsis:  Configure filter HMM with new model composition.
 * Incept:    SRE, Fri Dec  5 09:08:15 2008 [Janelia]
 *
 * Purpose:   Parameterize the two-state "filter HMM" held in
 *            <bg->fhmm>. State 0 emits from the background
 *            frequencies <bg->f> and has a mean length of 400; state 1
 *            emits from <compo> and has a mean length of <M>/8.
 *
 *            The filter HMM is an experimental step in the
 *            acceleration pipeline for rejecting biased composition
 *            sequences early, before domain postprocessing. It has no
 *            effect on the final score of a sequence that passes the
 *            whole pipeline. At least at present it does not work as
 *            well as one would hope; a better null model of real
 *            protein sequences, incorporated into the flanking NCJ
 *            states of the profile, is future work.
 *
 *            <compo> is the average model residue composition, from
 *            either the HMM or the copy in a profile or optimized
 *            profile. <M> is the length of the model in nodes.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 *
 * Xref:      J4/25: generalized to use composition vector, not
 *                   specifically an HMM. 
 *
 * Note:      Although this looks like a two-state HMM with a fixed
 *            L=400 length expectation, it is set up as a
 *            conditional-on-L model: the transitions to the end state
 *            are 1.0, and the P(L) term is added separately in
 *            p7_bg_FilterScore().
 */
int
p7_bg_SetFilter(P7_BG *bg, int M, const float *compo)
{
  float bg_len   = 400.0;             /* mean length in state 0 of filter HMM (normal background) */
  float bias_len = (float) M / 8.0;   /* mean length in state 1 of filter HMM (biased segment)    */

  /* Transitions out of state 0, the normal iid model. */
  bg->fhmm->t[0][0] = bg_len / (bg_len+1.0f);
  bg->fhmm->t[0][1] = 1.0f   / (bg_len+1.0f);
  bg->fhmm->t[0][2] = 1.0f;   /* 1.0 transition to E means we'll set length distribution externally. */

  /* Transitions out of state 1, the potentially biased composition. */
  bg->fhmm->t[1][0] = 1.0f     / (bias_len+1.0f);
  bg->fhmm->t[1][1] = bias_len / (bias_len+1.0f);
  bg->fhmm->t[1][2] = 1.0f;   /* 1.0 transition to E means we'll set length distribution externally. */

  /* Emissions: background for state 0, model composition for state 1. */
  esl_vec_FCopy(bg->f, bg->abc->K, bg->fhmm->e[0]);
  esl_vec_FCopy(compo, bg->abc->K, bg->fhmm->e[1]);

  /* Starting probabilities. */
  bg->fhmm->pi[0] = 0.999;
  bg->fhmm->pi[1] = 0.001;

  esl_hmm_Configure(bg->fhmm, bg->f);
  return eslOK;
}
Esempio n. 6
0
/* Function:  esl_vec_FLogValidate()
 * Synopsis:  Validate a vector of log probabilities.
 *
 * Purpose:   Check a vector <vec> of <n> log-space values by
 *            exponentiating a temporary copy and handing that copy to
 *            <esl_vec_FValidate()> with tolerance <tol>. <vec> itself
 *            is not modified.
 *
 * Args:      vec    - log-space vector to check
 *            n      - number of elements in <vec>
 *            tol    - tolerance, passed through to <esl_vec_FValidate()>
 *            errbuf - OPTIONAL: buffer for an error message, or NULL;
 *                     reset to an empty string on entry
 *
 * Returns:   <eslOK> if <n> is 0 or the exponentiated vector validates.
 *            Otherwise the non-OK status from <esl_vec_FValidate()> is
 *            returned unchanged.
 *
 * Throws:    the status set by <ESL_ALLOC()> if the temporary copy
 *            cannot be allocated.
 */
int
esl_vec_FLogValidate(float *vec, int n, float tol, char *errbuf)
{
  int     status;
  float  *expvec = NULL;

  if (errbuf) *errbuf = 0;
  if (n == 0) return eslOK;

  ESL_ALLOC(expvec, sizeof(float)*n);
  esl_vec_FCopy(vec, n, expvec);
  esl_vec_FExp(expvec, n); 
  if ((status = esl_vec_FValidate(expvec, n, tol, errbuf)) != eslOK) goto ERROR;
  free(expvec);
  return eslOK;

 ERROR:
  free(expvec);
  return status;   /* report the failure; it must not be masked as success */
}
Esempio n. 7
0
/* utest_pvectors()
 *
 * Exercises the probability-vector routines in both double and float
 * flavors: type conversion, validation, entropy, relative entropy,
 * log/exp round trips, and (log-space) normalization. Two reference
 * distributions over four outcomes are used: a uniform one and
 * {0, 0.5, 0.5, 0}. Any mismatch is fatal.
 */
static void
utest_pvectors(void)
{
  char  *failmsg   = "pvector unit test failed";
  double unif_d[4] = { 0.25, 0.25, 0.25, 0.25 };
  double half_d[4];
  double work_d[4];
  float  unif_f[4];
  float  half_f[4] = { 0.0,   0.5, 0.5,  0.0  };
  float  work_f[4];
  int    len = 4;
  double got;

  /* Fill in the missing flavor of each reference vector by conversion. */
  esl_vec_D2F(unif_d, len, unif_f);
  esl_vec_F2D(half_f, len, half_d);

  if (esl_vec_DValidate(unif_d, len, 1e-12, NULL) != eslOK) esl_fatal(failmsg);
  if (esl_vec_FValidate(unif_f, len, 1e-7,  NULL) != eslOK) esl_fatal(failmsg);

  /* Entropy: 2.0 expected for the uniform vector, 1.0 for the half/half one. */
  got = esl_vec_DEntropy(unif_d, len);
  if (esl_DCompare(2.0, got, 1e-9) != eslOK) esl_fatal(failmsg);
  got = esl_vec_FEntropy(unif_f, len);
  if (esl_DCompare(2.0, got, 1e-9) != eslOK) esl_fatal(failmsg);
  got = esl_vec_DEntropy(half_d, len);
  if (esl_DCompare(1.0, got, 1e-9) != eslOK) esl_fatal(failmsg);
  got = esl_vec_FEntropy(half_f, len);
  if (esl_DCompare(1.0, got, 1e-9) != eslOK) esl_fatal(failmsg);

  /* Relative entropy of half/half against uniform: 1.0 expected. */
  got = esl_vec_DRelEntropy(half_d, unif_d, len);
  if (esl_DCompare(1.0, got, 1e-9) != eslOK) esl_fatal(failmsg);
  got = esl_vec_FRelEntropy(half_f, unif_f, len);
  if (esl_DCompare(1.0, got, 1e-9) != eslOK) esl_fatal(failmsg);

  /* The other direction hits zero probabilities: infinity expected. */
  got = esl_vec_DRelEntropy(unif_d, half_d, len);
  if (got != eslINFINITY) esl_fatal(failmsg);
  got = esl_vec_FRelEntropy(unif_f, half_f, len);
  if (got != eslINFINITY) esl_fatal(failmsg);

  /* Log/exp round trip, double: the zero entry must come back as exactly zero. */
  esl_vec_DLog(half_d, len);
  if (esl_vec_DLogValidate(half_d, len, 1e-12, NULL) != eslOK) esl_fatal(failmsg);
  esl_vec_DExp(half_d, len);
  if (half_d[0] != 0.) esl_fatal(failmsg);

  /* Log/exp round trip, float. */
  esl_vec_FLog(half_f, len);
  if (esl_vec_FLogValidate(half_f, len, 1e-7, NULL) != eslOK) esl_fatal(failmsg);
  esl_vec_FExp(half_f, len);
  if (half_f[0] != 0.) esl_fatal(failmsg);

  /* Double: scaling followed by normalization must restore the original. */
  esl_vec_DCopy(half_d, len, work_d);
  esl_vec_DScale(work_d, len, 10.);
  esl_vec_DNorm(work_d, len);
  if (esl_vec_DCompare(half_d, work_d, len, 1e-12) != eslOK) esl_fatal(failmsg);

  /* Double, log space: a constant shift followed by log-normalization likewise. */
  esl_vec_DLog(work_d, len);
  got = esl_vec_DLogSum(work_d, len);
  if (esl_DCompare(0.0, got, 1e-12) != eslOK) esl_fatal(failmsg);
  esl_vec_DIncrement(work_d, len, 2.0);
  esl_vec_DLogNorm(work_d, len);
  if (esl_vec_DCompare(half_d, work_d, len, 1e-12) != eslOK) esl_fatal(failmsg);

  /* Float: the same two checks. */
  esl_vec_FCopy(half_f, len, work_f);
  esl_vec_FScale(work_f, len, 10.);
  esl_vec_FNorm(work_f, len);
  if (esl_vec_FCompare(half_f, work_f, len, 1e-7) != eslOK) esl_fatal(failmsg);

  esl_vec_FLog(work_f, len);
  got = esl_vec_FLogSum(work_f, len);
  if (esl_DCompare(0.0, got, 1e-7) != eslOK) esl_fatal(failmsg);
  esl_vec_FIncrement(work_f, len, 2.0);
  esl_vec_FLogNorm(work_f, len);
  if (esl_vec_FCompare(half_f, work_f, len, 1e-7) != eslOK) esl_fatal(failmsg);

  return;
}
/**
 * int main(int argc, char **argv)
 * Main driver.
 *
 * Takes two command line arguments, <input hmmfile> and <output hmmfile>.
 * For every HMM read from the input, the transition vectors of nodes
 * 1..M-1 are replaced by their average (renormalized separately for the
 * match, insert and delete groups). The modified HMM is validated and
 * written in ASCII format to the output file, and one line of summary
 * statistics is printed to stdout.
 *
 * Exits with status 0 on success. On a command line error, exits with
 * status 1 after printing usage. File open, validation, and write
 * failures are reported through p7_Fail(); read failures through
 * esl_fatal().
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go	   = NULL;      /* command line processing                   */
  ESL_ALPHABET    *abc     = NULL;
  char            *hmmfile = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp     = NULL;
  FILE            *outhmmfp = NULL;     /* HMM output file handle                    */
  P7_HMM          *hmm     = NULL;
  P7_BG           *bg      = NULL;
  int              nhmm;	
  double           x;
  float            KL;
  int              status;
  char             errbuf[eslERRBUFSIZE];

  float average_internal_transitions[ p7H_NTRANSITIONS ];
  int k;

  char        errmsg[eslERRBUFSIZE];

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || 
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE) 
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2) 
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) 
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL) 
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);
  
  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);  

  /* Initializations: open the output HMM file for writing.
   * Fail loudly: <status> is eslOK here, so returning it would make
   * the program exit "successfully" without having written anything.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL) p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) 
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Sum the transition vectors of the internal nodes 1..M-1 ... */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);
      }
      /* ... then renormalize each group of transitions on its own,
       * which turns the sums into averages.
       */
      // Match transitions (MM, MI, MD)
      esl_vec_FNorm(average_internal_transitions + p7H_MM, 3);
      // Insert transitions (IM, II)
      esl_vec_FNorm(average_internal_transitions + p7H_IM, 2);
      // Delete transitions (DM, DD)
      esl_vec_FNorm(average_internal_transitions + p7H_DM, 2);
      // Ok now set them.
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] );
      }

      /* Report why we stop, instead of silently returning a status code. */
      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001))     != eslOK) p7_Fail("HMM %s failed validation after averaging transitions:\n%s\n", hmm->name, errmsg);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK) p7_Fail("HMM save failed for %s, writing to %s\n", hmm->name, outhmmfile);
  
      p7_MeanPositionRelativeEntropy(hmm, bg, &x); 
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
	     nhmm,
	     hmm->name,
	     hmm->acc == NULL ? "-" : hmm->acc,
	     hmm->nseq,
	     hmm->eff_nseq,
	     hmm->M,
	     p7_MeanMatchRelativeEntropy(hmm, bg),
	     p7_MeanMatchInfo(hmm, bg),
	     x,
	     KL);

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  /* fclose() flushes buffered output; a failure here means the output file is incomplete. */
  if (fclose(outhmmfp) != 0) p7_Fail("Failed to close HMM file %s after writing\n", outhmmfile);
  esl_getopts_Destroy(go);
  exit(0);
}
Esempio n. 9
0
/* process_workunit()
 *
 * This is the routine that actually does the work.
 *
 * A work unit consists of one HMM, <hmm>.
 * The result is the <scores> array, which contains an array of N scores;
 * caller provides this memory.
 * How those scores are generated is controlled by the application configuration in <cfg>.
 *
 * For each of <cfg->N> random sequences of length -L, a raw score is
 * computed with the algorithm selected on the command line (--vit,
 * --fwd, --msv; vectorized when --vector is set, falling back to the
 * reference implementation if the vector code leaves the score at
 * infinity). The null model score is subtracted and the result is
 * stored in bits in <scores[i]>. With -a, <alilens[i]> additionally
 * receives the number of M, D and I states used by the Viterbi trace.
 *
 * With --bgcomp, <cfg->bg->f> is overwritten by the model's average
 * composition.
 *
 * Returns <eslOK> on success. On an allocation failure returns
 * <eslEMEM> with "allocation failure" written to <errbuf>. A failure
 * status from p7_hmm_CompositionKLDist() is returned as is.
 */
static int
process_workunit(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens)
{
  int             L   = esl_opt_GetInteger(go, "-L");
  P7_PROFILE     *gm  = NULL;
  P7_OPROFILE    *om  = NULL;
  P7_REFMX       *rmx = NULL;
  P7_CHECKPTMX   *cx  = NULL;
  P7_FILTERMX    *fx  = NULL;
  P7_TRACE       *tr  = NULL;
  ESL_DSQ        *dsq = NULL;
  P7_HARDWARE    *hw  = NULL;
  int             i;
  int             scounts[p7T_NSTATETYPES]; /* state usage counts from a trace */
  float           sc;
  float           nullsc;
  int             status;

  /* NOTE(review): <hw> is never released in this function; no destructor
   * for it is visible here -- confirm how it is supposed to be freed.
   */
  if ((hw = p7_hardware_Create ()) == NULL)  p7_Fail("Couldn't get HW information data structure"); 

  /* Optionally set a custom background, determined by model composition;
   * an experimental hack. 
   */
  if (esl_opt_GetBoolean(go, "--bgcomp")) 
    {
      float *p = NULL;
      float  KL;

      if ((status = p7_hmm_CompositionKLDist(hmm, cfg->bg, &KL, &p)) != eslOK) goto ERROR;
      esl_vec_FCopy(p, cfg->abc->K, cfg->bg->f);
      free(p);			/* the composition vector is handed to us; we've copied it, so release it */
    }

  /* Create and configure our generic profile, as requested */
  if ((gm = p7_profile_Create(hmm->M, cfg->abc)) == NULL) { status = eslEMEM; goto ERROR; }
  if (esl_opt_GetBoolean(go, "--multi")) 
    {
      if      (esl_opt_GetBoolean(go, "--dual"))   { p7_profile_Config      (gm, hmm, cfg->bg);    }
      else if (esl_opt_GetBoolean(go, "--local"))  { p7_profile_ConfigLocal (gm, hmm, cfg->bg, L); }
      else if (esl_opt_GetBoolean(go, "--glocal")) { p7_profile_ConfigGlocal(gm, hmm, cfg->bg, L); }
    }
  else if (esl_opt_GetBoolean(go, "--uni")) 
    {
      if      (esl_opt_GetBoolean(go, "--dual"))   { p7_profile_ConfigCustom   (gm, hmm, cfg->bg, L, 0.0, 0.5); }
      else if (esl_opt_GetBoolean(go, "--local"))  { p7_profile_ConfigUnilocal (gm, hmm, cfg->bg, L);           }
      else if (esl_opt_GetBoolean(go, "--glocal")) { p7_profile_ConfigUniglocal(gm, hmm, cfg->bg, L);           }
    }
  p7_profile_SetLength(gm, L);
  p7_bg_SetLength(cfg->bg, L);  

  if (esl_opt_GetBoolean(go, "--x-no-lengthmodel")) elide_length_model(gm, cfg->bg);

  /* Allocate DP matrix for <gm>.
   */
  if ((rmx = p7_refmx_Create(gm->M, L)) == NULL) { status = eslEMEM; goto ERROR; }

  /* Create and configure the vectorized profile, if needed;
   * and allocate its DP matrix.
   * The SIMD flavor has to come from <hw>: <om> does not exist yet
   * at the point where it is being created.
   */
  if (esl_opt_GetBoolean(go, "--vector"))
    {
      if ((om = p7_oprofile_Create(gm->M, cfg->abc, hw->simd)) == NULL) { status = eslEMEM; goto ERROR; }
      p7_oprofile_Convert(gm, om);
      if ((cx = p7_checkptmx_Create(gm->M, L, ESL_MBYTES(32), om->simd)) == NULL) { status = eslEMEM; goto ERROR; }
      if ((fx = p7_filtermx_Create(gm->M, om->simd))                     == NULL) { status = eslEMEM; goto ERROR; }
    }
  
  /* Remaining allocation */
  ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
  if ((tr = p7_trace_Create()) == NULL) { status = eslEMEM; goto ERROR; }

  /* Collect scores from N random sequences of length L  */
  for (i = 0; i < cfg->N; i++)
    {
      esl_rsq_xfIID(cfg->r, cfg->bg->f, cfg->abc->K, L, dsq);
      sc = eslINFINITY;

      /* Vectorized implementations of Viterbi, MSV may overflow.
       * In this case, they'll leave sc=eslINFINITY.
       * Then we fail over to the nonvector "generic" implementation.
       * That's why this next block isn't an if/else.
       */
      if (esl_opt_GetBoolean(go, "--vector")) 
	{
	  if      (esl_opt_GetBoolean(go, "--vit")) p7_ViterbiFilter(dsq, L, om, fx, &sc);
	  else if (esl_opt_GetBoolean(go, "--fwd")) p7_ForwardFilter(dsq, L, om, cx, &sc);
	  else if (esl_opt_GetBoolean(go, "--msv")) p7_MSVFilter    (dsq, L, om, fx, &sc);
	} 

      /* If we tried a vector calculation above but it overflowed,
       * or if we're to do --generic DP calculations, sc==eslINFINITY now;
       * hence the if condition here:
       */
      if (sc == eslINFINITY)
	{
	  if      (esl_opt_GetBoolean(go, "--fwd"))  p7_ReferenceForward(dsq, L, gm, rmx,     &sc); /* any mode: dual,local,glocal; gm's config takes care of this */
	  else if (esl_opt_GetBoolean(go, "--vit"))  p7_ReferenceViterbi(dsq, L, gm, rmx, tr, &sc); /* local-only mode. cmdline opts processing has already assured that --local set */
	  else if (esl_opt_GetBoolean(go, "--msv"))  p7_Die("We used to be able to do a generic MSV algorithm - but no longer");
	}

      /* Optional: get Viterbi alignment length too. */
      if (esl_opt_GetBoolean(go, "-a"))  /* -a only works with Viterbi; getopts has checked this already; <tr> must be valid */
	{
	  p7_trace_GetStateUseCounts(tr, scounts);

	  /* There are various ways we could count "alignment length".
	   * Here we sum the uses of all local and glocal M, D, and I states.
	   */
	  alilens[i] = scounts[p7T_ML] + scounts[p7T_DL] + scounts[p7T_IL] +
	    scounts[p7T_MG] + scounts[p7T_DG] + scounts[p7T_IG];
	  p7_trace_Reuse(tr);
	}

      p7_bg_NullOne(cfg->bg, dsq, L, &nullsc);
      scores[i] = (sc - nullsc) / eslCONST_LOG2;

      if (cx) p7_checkptmx_Reuse(cx);
      if (fx) p7_filtermx_Reuse(fx);
      p7_refmx_Reuse(rmx);
    }
  status      = eslOK;
  /* deliberate flowthru */
 ERROR:
  free(dsq);
  p7_checkptmx_Destroy(cx);
  p7_filtermx_Destroy(fx);
  p7_oprofile_Destroy(om);
  p7_profile_Destroy(gm);
  p7_refmx_Destroy(rmx);
  p7_trace_Destroy(tr);
  if (status == eslEMEM) sprintf(errbuf, "allocation failure");
  return status;
}
Esempio n. 10
0
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the 
 *            <P7_BG> object <bg>.
 *            
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            The file is parsed with '#' as the comment character.
 *            Its first token names the alphabet, which must match
 *            <bg->abc>. Each following line holds one canonical
 *            residue letter and its probability, and nothing else.
 *            Every one of the <K> residues must appear exactly once,
 *            no probability may be negative, and the probabilities
 *            must sum to 1.0 within a tolerance of 0.001; they are
 *            renormalized before being stored.
 *            
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 * 
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;
  int             n     = 0;
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status =  esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);
  
  /* Frequencies are collected in a scratch vector, so <bg> stays
   * untouched until the whole file has been checked. -1.0 marks a
   * residue that has not been seen yet.
   */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* field 1: residue letter */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if      (toklen != 1 ||   ! esl_abc_CIsCanonical(bg->abc, *tok))
	ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)         ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* field 2: its probability */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok)) ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);
      /* A negative value is not a probability. Rejecting it also keeps
       * the -1.0 "not seen yet" marker unambiguous, so the duplicate
       * check above cannot be bypassed.
       */
      if (fq[x] < 0.0)           ESL_XFAIL(eslEFORMAT, errbuf, "expected a nonnegative probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K) 
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK) 
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);
  
  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
Esempio n. 11
0
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Calculate the "null2" model for the envelope encompassed
 *            by a posterior probability calculation <pp> for model
 *            <gm>.  Return the null2 odds emission probabilities
 *            $\frac{f'(x)}{f(x)}$ in <null2>, which caller
 *            provides as space for at least <alphabet->Kp> residues.
 *            
 *            The expectation method is applied to envelopes in
 *            simple, well resolved regions (regions containing just a
 *            single envelope, where no stochastic traceback
 *            clustering was required).
 *            
 *            The caller must have calculated the posterior
 *            probability matrix <pp> for only the envelope; thus
 *            its rows are numbered <1..Ld>, for envelope <ienv..jenv>
 *            of length <Ld=jenv-ienv+1>.
 *            
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> contains the null2 odds
 *            ratios. Row 0 of <pp> has been used as temp space: on
 *            return it holds, in log space, the per-state sums over
 *            rows <1..Ld> divided by <Ld>.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  const int M        = gm->M;
  const int Ld       = pp->L;
  const int ncells   = (M+1) * p7G_NSCELLS;	/* width of one row of main DP cells */
  float   **dp       = pp->dp;			/* presumably referenced by the MMX()/IMX() macros: keep this name */
  float    *xmx      = pp->xmx;		/* presumably referenced by the XMX() macro: keep this name */
  float     neglogLd = -log((float) Ld);
  float     xfactor;
  float     acc;
  int       x;			/* over symbols 0..K-1                       */
  int       i;			/* over offset envelope dsq positions 1..Ld  */
  int       k;			/* over model M states 1..M, I states 1..M-1 */

  /* Main states: sum rows 1..Ld into row 0, giving the expected number
   * of times each state was used in generating the Ld residues; then
   * convert to log frequencies, which serve as log posterior weights.
   */
  esl_vec_FCopy(dp[1], ncells, dp[0]);
  for (i = 2; i <= Ld; i++)
    esl_vec_FAdd(dp[0], dp[i], ncells);
  esl_vec_FLog(dp[0], ncells);
  esl_vec_FIncrement(dp[0], ncells, neglogLd);

  /* Special states: the same, for the special-state cells. */
  esl_vec_FCopy(xmx + p7G_NXCELLS, p7G_NXCELLS, xmx);
  for (i = 2; i <= Ld; i++)
    esl_vec_FAdd(xmx, xmx + i*p7G_NXCELLS, p7G_NXCELLS);
  esl_vec_FLog(xmx, p7G_NXCELLS);
  esl_vec_FIncrement(xmx, p7G_NXCELLS, neglogLd);

  /* N, C, J contribute the same term for every residue x. */
  xfactor = XMX(0,p7G_N);
  xfactor = p7_FLogsum(xfactor, XMX(0,p7G_C));
  xfactor = p7_FLogsum(xfactor, XMX(0,p7G_J));

  /* Calculate null2's log odds emission probabilities, by taking
   * posterior weighted sum over all emission vectors used in paths
   * explaining the domain.
   * This is dog-slow; a point for future optimization.
   */
  for (x = 0; x < gm->abc->K; x++)
    { 
      acc = -eslINFINITY;
      for (k = 1; k < M; k++)
	{
	  acc = p7_FLogsum(acc, MMX(0,k) + p7P_MSC(gm, k, x));
	  acc = p7_FLogsum(acc, IMX(0,k) + p7P_ISC(gm, k, x));
	}
      acc = p7_FLogsum(acc, MMX(0,M) + p7P_MSC(gm, M, x));	/* last node: match state only */
      acc = p7_FLogsum(acc, xfactor);
      null2[x] = acc;
    }

  esl_vec_FExp (null2, gm->abc->K);
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(gm->abc, null2); /* does not set gap, nonres, missing  */
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;	     /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;	     /* missing data "~" */

  return eslOK;
}