/* Function: PAMPrior() * * Purpose: Produces an ad hoc "Dirichlet mixture" prior for * match emissions, using a PAM matrix. * * Side effect notice: PAMPrior() replaces the match * emission section of an existing Dirichlet prior, * which is /expected/ to be a simple one-component * kind of prior. The insert emissions /must/ be a * one-component prior (because of details in how * PriorifyEmissionVector() is done). However, * the transitions /could/ be a mixture Dirichlet prior * without causing problems. In other words, the * -p and -P options of hmmb can coexist, but there * may be conflicts. PAMPrior() checks for these, * so there's no serious problem, except that the * error message from PAMPrior() might be confusing to * a user. */ void PAMPrior(char *pamfile, struct p7prior_s *pri, float wt) { FILE *fp; char *blastpamfile; /* BLAST looks in aa/ subdirectory of BLASTMAT */ int **pam; float scale; int xi, xj; int idx1, idx2; if (Alphabet_type != hmmAMINO) Die("PAM prior is only valid for protein sequences"); if (pri->strategy != PRI_DCHLET) Die("PAM prior may only be applied over an existing Dirichlet prior"); if (pri->inum != 1) Die("PAM prior requires that the insert emissions be a single Dirichlet"); if (MAXDCHLET < 20) Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior"); blastpamfile = FileConcat("aa", pamfile); if ((fp = fopen(pamfile, "r")) == NULL && (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL && (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL) Die("Failed to open PAM scoring matrix file %s", pamfile); if (! ParsePAMFile(fp, &pam, &scale)) Die("Failed to parse PAM scoring matrix file %s", pamfile); fclose(fp); free(blastpamfile); pri->strategy = PRI_PAM; pri->mnum = 20; /* Convert PAM entries back to conditional prob's P(xj | xi), * which we'll use as "pseudocounts" weighted by wt. */ for (xi = 0; xi < Alphabet_size; xi++) for (xj = 0; xj < Alphabet_size; xj++) { idx1 = Alphabet[xi] - 'A'; idx2 = Alphabet[xj] - 'A'; pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale); } /* Normalize so that rows add up to wt. * i.e. Sum(xj) mat[xi][xj] = wt for every row xi */ for (xi = 0; xi < Alphabet_size; xi++) { pri->mq[xi] = 1. / Alphabet_size; FNorm(pri->m[xi], Alphabet_size); FScale(pri->m[xi], Alphabet_size, wt); } Free2DArray((void **)pam,27); }
/* Function: MSAFileOpen() * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis] * * Purpose: Open an alignment database file and prepare * for reading one alignment, or sequentially * in the (rare) case of multiple MSA databases * (e.g. Stockholm format). * * Args: filename - name of file to open * if "-", read stdin * if it ends in ".gz", read from pipe to gunzip -dc * format - format of file (e.g. MSAFILE_STOCKHOLM) * env - environment variable for path (e.g. BLASTDB) * * Returns: opened MSAFILE * on success. * NULL on failure: * usually, because the file doesn't exist; * for gzip'ed files, may also mean that gzip isn't in the path. */ MSAFILE * MSAFileOpen(char *filename, int format, char *env) { MSAFILE *afp; afp = MallocOrDie(sizeof(MSAFILE)); if (strcmp(filename, "-") == 0) { afp->f = stdin; afp->do_stdin = TRUE; afp->do_gzip = FALSE; afp->fname = sre_strdup("[STDIN]", -1); afp->ssi = NULL; /* can't index stdin because we can't seek*/ } #ifndef SRE_STRICT_ANSI /* popen(), pclose() aren't portable to non-POSIX systems; disable */ else if (Strparse("^.*\\.gz$", filename, 0)) { char cmd[256]; /* Note that popen() will return "successfully" * if file doesn't exist, because gzip works fine * and prints an error! So we have to check for * existence of file ourself. */ if (! FileExists(filename)) Die("%s: file does not exist", filename); if (strlen(filename) + strlen("gzip -dc ") >= 256) Die("filename > 255 char in MSAFileOpen()"); sprintf(cmd, "gzip -dc %s", filename); if ((afp->f = popen(cmd, "r")) == NULL) return NULL; afp->do_stdin = FALSE; afp->do_gzip = TRUE; afp->fname = sre_strdup(filename, -1); /* we can't index a .gz file, because we can't seek in a pipe afaik */ afp->ssi = NULL; } #endif /*SRE_STRICT_ANSI*/ else { char *ssifile; char *dir; /* When we open a file, it may be either in the current * directory, or in the directory indicated by the env * argument - and we have to construct the SSI filename accordingly. */ if ((afp->f = fopen(filename, "r")) != NULL) { ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5)); sprintf(ssifile, "%s.ssi", filename); } else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL) { char *full; full = FileConcat(dir, filename); ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5)); sprintf(ssifile, "%s.ssi", full); free(dir); } else return NULL; afp->do_stdin = FALSE; afp->do_gzip = FALSE; afp->fname = sre_strdup(filename, -1); afp->ssi = NULL; /* Open the SSI index file. If it doesn't exist, or * it's corrupt, or some error happens, afp->ssi stays NULL. */ SSIOpen(ssifile, &(afp->ssi)); free(ssifile); } /* Invoke autodetection if we haven't already been told what * to expect. */ if (format == MSAFILE_UNKNOWN) { if (afp->do_stdin == TRUE || afp->do_gzip) Die("Can't autodetect alignment file format from a stdin or gzip pipe"); format = MSAFileFormat(afp); if (format == MSAFILE_UNKNOWN) Die("Can't determine format of multiple alignment file %s", afp->fname); } afp->format = format; afp->linenumber = 0; afp->buf = NULL; afp->buflen = 0; return afp; }
/* Function: CP9_HMMFileOpen() * * Purpose: Open an HMM file for reading. The file may be either * an index for a library of HMMs, or an HMM. * * Args: hmmfile - name of file * env - NULL, or environment variable for HMM database. * * Return: Valid HMMFILE *, or NULL on failure. */ CP9HMMFILE * CP9_HMMFileOpen(char *hmmfile, char *env) { CP9HMMFILE *hmmfp; unsigned int magic; char buf[512]; char *ssifile; char *dir; /* dir name in which HMM file was found */ int status; hmmfp = (CP9HMMFILE *) MallocOrDie (sizeof(CP9HMMFILE)); hmmfp->f = NULL; hmmfp->parser = NULL; hmmfp->is_binary = FALSE; hmmfp->byteswap = FALSE; hmmfp->is_seekable= TRUE; /* always; right now, an HMM must always be in a file. */ /* Open the file. Look in current directory. * If that doesn't work, check environment var for * a second possible directory (usually the location * of a system-wide HMM library). * Using dir name if necessary, construct correct SSI file name. */ hmmfp->f = NULL; hmmfp->ssi = NULL; if ((hmmfp->f = fopen(hmmfile, "r")) != NULL) { ssifile = MallocOrDie(sizeof(char) * (strlen(hmmfile) + 5)); sprintf(ssifile, "%s.ssi", hmmfile); if ((hmmfp->mode = SSIRecommendMode(hmmfile)) == -1) Die("SSIRecommendMode() failed"); } else if ((hmmfp->f = EnvFileOpen(hmmfile, env, &dir)) != NULL) { char *full; full = FileConcat(dir, hmmfile); ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(hmmfile) + 5)); sprintf(ssifile, "%s.ssi", full); if ((hmmfp->mode = SSIRecommendMode(full)) == -1) Die("SSIRecommendMode() failed"); free(full); free(dir); } else return NULL; /* Open the SSI index file. If it doesn't exist, or it's corrupt, or * some error happens, hmmfp->ssi stays NULL. */ SQD_DPRINTF1(("Opening ssifile %s...\n", ssifile)); SSIOpen(ssifile, &(hmmfp->ssi)); free(ssifile); /* Initialize the disk offset stuff. */ status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset)); if (status != 0) Die("SSIGetFilePosition() failed"); /* Check for binary or byteswapped binary format * by peeking at first 4 bytes. */ if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) { CP9_HMMFileClose(hmmfp); return NULL; } rewind(hmmfp->f); if (magic == vCP9magic) { hmmfp->parser = CP9_read_bin_hmm; hmmfp->is_binary = TRUE; return hmmfp; } else if (magic == vCP9swap) { SQD_DPRINTF1(("Opened an Infernal CP9 HMM binary file [byteswapped]\n")); hmmfp->parser = CP9_read_bin_hmm; hmmfp->is_binary = TRUE; hmmfp->byteswap = TRUE; return hmmfp; } /* else we fall thru; it may be an ASCII file. */ /* If magic looks binary but we don't recognize it, choke and die. */ if (magic & 0x80000000) { Warn("\ %s appears to be a binary but not a CM plan 9 format that we recognize\n\ It may be from HMMER,\n\ or may be a different kind of binary altogether.\n", hmmfile); CP9_HMMFileClose(hmmfp); return NULL; }