/* Function: PAMPrior()
 * 
 * Purpose:  Build an ad hoc "Dirichlet mixture" prior for match
 *           emissions out of a PAM scoring matrix, and store it
 *           in the match emission section of <pri>.
 *           
 *           Side effect notice: the match emission section of the
 *           existing Dirichlet prior is overwritten; that prior is
 *           /expected/ to be a simple one-component prior. The
 *           insert emissions /must/ be a one-component prior
 *           (because of details in how PriorifyEmissionVector()
 *           is done), and this is checked. The transitions /could/
 *           be a mixture Dirichlet prior without causing problems.
 *           So the -p and -P options of hmmb can coexist, though
 *           the error message produced here on a conflict might
 *           be confusing to a user.
 *
 * Args:     pamfile - name of the PAM scoring matrix file. Tried
 *                     as given, then via the BLASTMAT environment
 *                     variable, then as aa/<pamfile> via BLASTMAT.
 *           pri     - existing prior; must have strategy PRI_DCHLET
 *                     and inum == 1. On return its strategy is
 *                     PRI_PAM, mnum is 20, and m[][] / mq[] are
 *                     overwritten.
 *           wt      - value each row of pri->m is scaled to sum to.
 *
 * Returns:  (void). Every precondition or file failure is reported
 *           through Die().
 */
void
PAMPrior(char *pamfile, struct p7prior_s *pri, float wt)
{
  FILE  *pamfp;
  char  *aa_pamfile;              /* BLAST looks in aa/ subdirectory of BLASTMAT */
  int  **pam;
  float  scale;
  int    row, col;
  int    rowletter, colletter;    /* offsets of residue letters from 'A' */

  /* Preconditions on the alphabet, the incoming prior, and the build config. */
  if (Alphabet_type != hmmAMINO)
    Die("PAM prior is only valid for protein sequences");
  if (pri->strategy != PRI_DCHLET)
    Die("PAM prior may only be applied over an existing Dirichlet prior");
  if (pri->inum != 1)
    Die("PAM prior requires that the insert emissions be a single Dirichlet");
  if (MAXDCHLET < 20)
    Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior");

  aa_pamfile = FileConcat("aa", pamfile);

  /* Try each candidate location in turn; stop at the first that opens. */
  pamfp = fopen(pamfile, "r");
  if (pamfp == NULL)
    pamfp = EnvFileOpen(pamfile, "BLASTMAT", NULL);
  if (pamfp == NULL)
    pamfp = EnvFileOpen(aa_pamfile, "BLASTMAT", NULL);
  if (pamfp == NULL)
    Die("Failed to open PAM scoring matrix file %s", pamfile);

  if (! ParsePAMFile(pamfp, &pam, &scale))
    Die("Failed to parse PAM scoring matrix file %s", pamfile);
  fclose(pamfp);
  free(aa_pamfile);

  pri->strategy = PRI_PAM;
  pri->mnum     = 20;

  /* One row per residue xi: turn the PAM entries back into
   * conditional prob's P(xj | xi), then normalize the row and
   * scale it so that Sum(xj) m[xi][xj] = wt. These serve as
   * "pseudocounts" weighted by wt.
   */
  for (row = 0; row < Alphabet_size; row++)
    {
      rowletter = Alphabet[row] - 'A';   /* pam[][] is indexed by letter offset */

      for (col = 0; col < Alphabet_size; col++)
        {
          colletter = Alphabet[col] - 'A';
          pri->m[row][col] = aafq[col] * exp((float) pam[rowletter][colletter] * scale);
        }

      pri->mq[row] = 1. / Alphabet_size;
      FNorm(pri->m[row], Alphabet_size);
      FScale(pri->m[row], Alphabet_size, wt);
    }

  Free2DArray((void **)pam,27);
}
/* Example 2 */
/* Function: MSAFileOpen()
 * Date:     SRE, Tue May 18 13:22:01 1999 [St. Louis]
 *
 * Purpose:  Open an alignment database file and prepare
 *           for reading one alignment, or sequentially
 *           in the (rare) case of multiple MSA databases
 *           (e.g. Stockholm format).
 *           
 * Args:     filename - name of file to open
 *                      if "-", read stdin
 *                      if it ends in ".gz", read from pipe to gunzip -dc
 *                      (unless compiled with SRE_STRICT_ANSI)
 *           format   - format of file (e.g. MSAFILE_STOCKHOLM);
 *                      MSAFILE_UNKNOWN requests autodetection, which
 *                      is not possible for stdin or gzip pipes.
 *           env      - environment variable for path (e.g. BLASTDB)
 *
 * Returns:  opened MSAFILE * on success.
 *           NULL on failure; nothing is left allocated in that case:
 *             for plain files, because the file could not be opened
 *             either directly or via <env>;
 *             for gzip'ed files, because popen() failed.
 *
 *           Calls Die() instead of returning if a ".gz" file does
 *           not exist, if its name is too long for the command
 *           buffer, or if the format cannot be autodetected.
 */
MSAFILE *
MSAFileOpen(char *filename, int format, char *env)
{
  MSAFILE *afp;
  
  afp        = MallocOrDie(sizeof(MSAFILE));
  if (strcmp(filename, "-") == 0)
    {
      afp->f         = stdin;
      afp->do_stdin  = TRUE; 
      afp->do_gzip   = FALSE;
      afp->fname     = sre_strdup("[STDIN]", -1);
      afp->ssi       = NULL;	/* can't index stdin because we can't seek*/
    }
#ifndef SRE_STRICT_ANSI		
  /* popen(), pclose() aren't portable to non-POSIX systems; disable */
  else if (Strparse("^.*\\.gz$", filename, 0))
    {
      char cmd[256];

      /* Note that popen() will return "successfully"
       * if file doesn't exist, because gzip works fine
       * and prints an error! So we have to check for
       * existence of file ourself.
       */
      if (! FileExists(filename))
        Die("%s: file does not exist", filename);
      if (strlen(filename) + strlen("gzip -dc ") >= sizeof(cmd))
        Die("filename > 255 char in MSAFileOpen()"); 

      /* The length check above guarantees the command fits in cmd.
       * NOTE(review): filename is handed to the shell unquoted, so
       * names containing shell metacharacters will be interpreted
       * by the shell. Left as-is here; callers must pass trusted names.
       */
      sprintf(cmd, "gzip -dc %s", filename);
      if ((afp->f = popen(cmd, "r")) == NULL)
        {
          free(afp);		/* caller only sees NULL; don't leak the handle */
          return NULL;
        }

      afp->do_stdin = FALSE;
      afp->do_gzip  = TRUE;
      afp->fname    = sre_strdup(filename, -1);
      /* we can't index a .gz file, because we can't seek in a pipe afaik */
      afp->ssi      = NULL;	
    }
#endif /*SRE_STRICT_ANSI*/
  else
    {
      char *ssifile;
      char *dir;

      /* When we open a file, it may be either in the current
       * directory, or in the directory indicated by the env
       * argument - and we have to construct the SSI filename accordingly.
       * The "+ 5" is room for ".ssi" and the terminating NUL.
       */
      if ((afp->f = fopen(filename, "r")) != NULL)
        {
          ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5));
          sprintf(ssifile, "%s.ssi", filename);
        }
      else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL)
        {
          char *full;

          full    = FileConcat(dir, filename);
          ssifile = MallocOrDie(sizeof(char) * (strlen(full) + 5));
          sprintf(ssifile, "%s.ssi", full);
          free(full);		/* only needed to build the SSI name */
          free(dir);
        }
      else
        {
          free(afp);		/* caller only sees NULL; don't leak the handle */
          return NULL;
        }

      afp->do_stdin = FALSE;
      afp->do_gzip  = FALSE;
      afp->fname    = sre_strdup(filename, -1);
      afp->ssi      = NULL;

      /* Open the SSI index file. If it doesn't exist, or
       * it's corrupt, or some error happens, afp->ssi stays NULL.
       */
      SSIOpen(ssifile, &(afp->ssi));
      free(ssifile);
    }

  /* Invoke autodetection if we haven't already been told what
   * to expect.
   */
  if (format == MSAFILE_UNKNOWN)
    {
      if (afp->do_stdin == TRUE || afp->do_gzip)
        Die("Can't autodetect alignment file format from a stdin or gzip pipe");
      format = MSAFileFormat(afp);
      if (format == MSAFILE_UNKNOWN)
        Die("Can't determine format of multiple alignment file %s", afp->fname);
    }

  afp->format     = format;
  afp->linenumber = 0;
  afp->buf        = NULL;
  afp->buflen     = 0;

  return afp;
}
/* Function: CP9_HMMFileOpen()
 * 
 * Purpose:  Open an HMM file for reading. The file may be either
 *           an index for a library of HMMs, or an HMM. 
 *           
 * Args:     hmmfile - name of file
 *           env     - NULL, or environment variable for HMM database.
 *           
 * Return:   Valid CP9HMMFILE *, or NULL on failure.
 */
CP9HMMFILE * 
CP9_HMMFileOpen(char *hmmfile, char *env)
{
  CP9HMMFILE     *hmmfp;
  unsigned int magic;
  char         buf[512];
  char        *ssifile;
  char        *dir;        /* dir name in which HMM file was found */
  int          status;

  hmmfp = (CP9HMMFILE *) MallocOrDie (sizeof(CP9HMMFILE));
  hmmfp->f          = NULL; 
  hmmfp->parser     = NULL;
  hmmfp->is_binary  = FALSE;
  hmmfp->byteswap   = FALSE;
  hmmfp->is_seekable= TRUE;	/* always; right now, an HMM must always be in a file. */
  
  /* Open the file. Look in current directory.
   * If that doesn't work, check environment var for
   * a second possible directory (usually the location
   * of a system-wide HMM library).
   * Using dir name if necessary, construct correct SSI file name.
   */
  hmmfp->f   = NULL;
  hmmfp->ssi = NULL;
  if ((hmmfp->f = fopen(hmmfile, "r")) != NULL)
    {
      ssifile = MallocOrDie(sizeof(char) * (strlen(hmmfile) + 5));
      sprintf(ssifile, "%s.ssi", hmmfile);

      if ((hmmfp->mode = SSIRecommendMode(hmmfile)) == -1)
	Die("SSIRecommendMode() failed");
    }
  else if ((hmmfp->f = EnvFileOpen(hmmfile, env, &dir)) != NULL)
    {
      char *full;
      full    = FileConcat(dir, hmmfile);

      ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(hmmfile) + 5));
      sprintf(ssifile, "%s.ssi", full);

      if ((hmmfp->mode = SSIRecommendMode(full)) == -1)
	Die("SSIRecommendMode() failed");

      free(full);
      free(dir);
    }
  else return NULL;
  
  /* Open the SSI index file. If it doesn't exist, or it's corrupt, or 
   * some error happens, hmmfp->ssi stays NULL.
   */
  SQD_DPRINTF1(("Opening ssifile %s...\n", ssifile));
  SSIOpen(ssifile, &(hmmfp->ssi));
  free(ssifile);

  /* Initialize the disk offset stuff.
   */
  status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset));
  if (status != 0) Die("SSIGetFilePosition() failed");

  /* Check for binary or byteswapped binary format
   * by peeking at first 4 bytes.
   */ 
  if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) {
    CP9_HMMFileClose(hmmfp);
    return NULL;
  }
  rewind(hmmfp->f);

  if (magic == vCP9magic) { 
    hmmfp->parser    = CP9_read_bin_hmm;
    hmmfp->is_binary = TRUE;
    return hmmfp;
  } 
  else if (magic == vCP9swap) { 
    SQD_DPRINTF1(("Opened an Infernal CP9 HMM binary file [byteswapped]\n"));
    hmmfp->parser    = CP9_read_bin_hmm;
    hmmfp->is_binary = TRUE;
    hmmfp->byteswap  = TRUE;
    return hmmfp;
  }
  /* else we fall thru; it may be an ASCII file. */

  /* If magic looks binary but we don't recognize it, choke and die.
   */
  if (magic & 0x80000000) {
    Warn("\
%s appears to be a binary but not a CM plan 9 format that we recognize\n\
It may be from HMMER,\n\
or may be a different kind of binary altogether.\n", hmmfile);
    CP9_HMMFileClose(hmmfp);
    return NULL;
  }