/* read_mask_file
 *
 * Open <filename> with an Easel file parser ('#' is registered as the
 * parser's comment character), read the first token of the file, and
 * return a freshly allocated, NUL-terminated copy of it in <*ret_mask>,
 * with its length in <*ret_mask_len>. Every character of the token must
 * be '0' or '1'.
 *
 * Returns:  eslOK on success; the caller then owns <*ret_mask> and must
 *           free() it.
 *
 *           eslFAIL, with a message in <errbuf>, if the file cannot be
 *           opened, if no token can be read, or if the token contains a
 *           character other than '0' or '1'.
 *
 *           If the copy cannot be allocated, whatever status ESL_ALLOC()
 *           sets is returned.
 *
 *           On any failure the two return arguments are left untouched.
 */
int
read_mask_file(char *filename, char *errbuf, char **ret_mask, int *ret_mask_len)
{
  ESL_FILEPARSER *parser  = NULL;
  char           *copy    = NULL;
  char           *word;
  int             wordlen;
  int             pos;
  int             status;

  if (esl_fileparser_Open(filename, NULL, &parser) != eslOK) {
    ESL_XFAIL(eslFAIL, errbuf, "failed to open %s in read_mask_file\n", filename);
  }
  esl_fileparser_SetCommentChar(parser, '#');

  status = esl_fileparser_GetToken(parser, &word, &wordlen);
  if (status != eslOK) {
    ESL_XFAIL(eslFAIL, errbuf, "failed to read a single token from %s\n", filename);
  }

  /* one extra byte for the terminating NUL */
  ESL_ALLOC(copy, sizeof(char) * (wordlen+1));

  /* validate and copy in a single pass; bail out at the first bad character */
  pos = 0;
  while (pos < wordlen)
    {
      char c = word[pos];

      if (c != '0' && c != '1') {
        ESL_XFAIL(eslFAIL, errbuf, "read a non-0 and non-1 character (%c) in the mask file %s\n", c, filename);
      }
      copy[pos] = c;
      pos++;
    }
  copy[pos] = '\0';

  *ret_mask     = copy;
  *ret_mask_len = pos;

  esl_fileparser_Close(parser);
  return eslOK;

 ERROR:
  if (parser != NULL) esl_fileparser_Close(parser);
  free(copy);                   /* free(NULL) is a no-op */
  return status;
}
/* Function:  p7_builder_SetScoreSystem()
 * Synopsis:  Initialize score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using a substitution matrix
 *            from a file.
 *
 *            Read a standard substitution score matrix from file
 *            <mxfile>. If <mxfile> is <NULL>, default to BLOSUM62
 *            scores for an amino acid alphabet, or to the "DNA1"
 *            matrix for any other alphabet. If <mxfile> is "-", read
 *            score matrix from <stdin> stream. If <env> is non-<NULL>
 *            and <mxfile> is not found in the current working
 *            directory, look for <mxfile> in colon-delimited directory
 *            list contained in environment variable <env>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first. If this call then fails, <bld->S> and <bld->Q>
 *            are either <NULL> or valid objects still owned by <bld>;
 *            they are never left pointing at freed memory.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            mxfile   - score matrix file to use, or NULL for the built-in default
 *            env      - env variable containing directory list where <mxfile> may reside
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            <eslENOTFOUND> if <mxfile> can't be found or opened, even
 *            in any of the directories specified by the <env> variable.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            On either error, <bld->errbuf> contains a useful error message
 *            for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_SetScoreSystem(P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg)
{
  ESL_FILEPARSER *efp    = NULL;
  double         *f      = NULL;
  const char     *mxname = NULL;   /* printable matrix name for messages; <mxfile> itself may be NULL */
  double          slambda;
  int             status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. Reset the pointers so
   * that an early error exit below cannot leave <bld> holding freed memory.
   */
  if (bld->S != NULL) { esl_scorematrix_Destroy(bld->S); bld->S = NULL; }
  if (bld->Q != NULL) { esl_dmatrix_Destroy(bld->Q);     bld->Q = NULL; }

  /* Get the scoring matrix */
  if (mxfile == NULL)
    {
      mxname = (bld->abc->type == eslAMINO) ? "BLOSUM62" : "DNA1";

      if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
      if ((status = esl_scorematrix_Set(mxname, bld->S)) != eslOK) goto ERROR;
    }
  else
    {
      mxname = mxfile;

      if ((status = esl_fileparser_Open(mxfile, env, &efp)) != eslOK) {
        ESL_XFAIL(status, bld->errbuf, "Failed to find or open matrix file %s", mxfile);
      }

      /* esl_scorematrix_Read() hands back its own matrix through its last
       * argument, so no matrix is pre-allocated on this path: one created
       * beforehand would be overwritten and leaked.
       */
      if ((status = esl_scorematrix_Read(efp, bld->abc, &(bld->S))) != eslOK) {
        ESL_XFAIL(status, bld->errbuf, "Failed to read matrix from %s:\n%s", mxfile, efp->errbuf);
      }

      esl_fileparser_Close(efp);
      efp = NULL;
    }

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.
   * Every failure here is reported to the caller as <eslEINVAL>; <mxname> is used in the
   * messages because <mxfile> is NULL when a built-in matrix was selected.
   */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if (status == eslEINVAL) {
    ESL_XFAIL(eslEINVAL, bld->errbuf, "input score matrix %s has no valid solution for lambda", mxname);
  } else if (status == eslENOHALT) {
    ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve input score matrix %s for lambda: are you sure it's valid?", mxname);
  } else if (status != eslOK) {
    ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving input score matrix %s for probability parameters", mxname);
  }

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;

  free(f);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Close(efp);
  free(f);
  return status;
}
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the HMMs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire HMM file in a single pass, outputting HMMs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the HMMs in the order they
 * appear in the <keyfile>, but without an SSI index, you get HMMs in
 * the order they occur in the HMM file.
 *
 * <go> is only passed through to onefetch(). HMMs are written to <ofp>;
 * when <ofp> is not stdout, a one-line summary of how many HMMs were
 * retrieved is printed to stdout. All failures are reported through
 * p7_Fail().
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, P7_HMMFILE *hfp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  ESL_ALPHABET   *abc    = NULL;
  P7_HMM         *hmm    = NULL;
  int             nhmm   = 0;
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (keys == NULL) p7_Fail("Failed to allocate key hash for key file %s\n", keyfile);

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    p7_Fail("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        p7_Fail("Failed to read HMM name on line %d of file %s\n", efp->linenumber, keyfile);

      /* Keys are stored even in SSI mode, so duplicates are caught either way.
       * Any failure other than a duplicate is reported too, rather than being
       * silently dropped (which would later show up as a missing HMM).
       */
      status = esl_key_Store(keys, key, &keyidx);
      if      (status == eslEDUP) p7_Fail("HMM key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   p7_Fail("Failed to store HMM key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (hfp->ssi != NULL) { onefetch(go, ofp, key, hfp); nhmm++; }
    }

  /* Without an SSI index nothing has been fetched yet: scan the whole HMM file. */
  if (hfp->ssi == NULL)
    {
      while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
        {
          if      (status == eslEOD)       p7_Fail("read failed, HMM file %s may be truncated?", hfp->fname);
          else if (status == eslEFORMAT)   p7_Fail("bad file format in HMM file %s",             hfp->fname);
          else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets",   hfp->fname);
          else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s",   hfp->fname);

          /* An HMM is selected by its name, or by its accession when it has one. */
          if (esl_key_Lookup(keys, hmm->name, &keyidx) == eslOK ||
              ((hmm->acc) && esl_key_Lookup(keys, hmm->acc, &keyidx) == eslOK))
            {
              if (p7_hmmfile_WriteASCII(ofp, -1, hmm) != eslOK)
                p7_Fail("Failed to write HMM %s\n", hmm->name);
              nhmm++;
            }

          p7_hmm_Destroy(hmm);
        }
    }

  if (ofp != stdout) printf("\nRetrieved %d HMMs.\n", nhmm);

  if (abc != NULL) esl_alphabet_Destroy(abc);
  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the MSAs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire MSA file in a single pass, outputting MSAs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the MSAs in the order they
 * appear in the <keyfile>, but without an SSI index, you get MSAs in
 * the order they occur in the MSA file.
 *
 * <go> is only passed through to onefetch(). Alignments are written to
 * <ofp> in format <outfmt>; when <ofp> is not stdout, a one-line summary
 * of how many alignments were retrieved is printed to stdout. All
 * failures are fatal.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, int outfmt, char *keyfile, ESLX_MSAFILE *afp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  ESL_MSA        *msa    = NULL;
  int             nread  = 0;   /* alignments read from <afp>; used to number them in messages */
  int             nali   = 0;   /* alignments actually retrieved                               */
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (keys == NULL) esl_fatal("Failed to allocate key hash for key file %s\n", keyfile);

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read MSA name on line %d of file %s\n", efp->linenumber, keyfile);

      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if      (status == eslEDUP) esl_fatal("MSA key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store MSA key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (afp->ssi) { onefetch(go, ofp, outfmt, key, afp); nali++; }
    }

  /* Without an SSI index nothing has been fetched yet: scan the whole MSA file. */
  if (! afp->ssi)
    {
      while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
        {
          if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
          nread++;

          if (msa->name == NULL)
            esl_fatal("Every alignment in file must have a name to be retrievable. Failed to find name of alignment #%d\n", nread);

          /* Count only the alignments that are written, so the summary
           * below reports retrievals rather than everything that was read.
           */
          if ( (esl_keyhash_Lookup(keys, msa->name, -1, NULL) == eslOK) ||
               (msa->acc != NULL && esl_keyhash_Lookup(keys, msa->acc, -1, NULL) == eslOK))
            {
              eslx_msafile_Write(ofp, msa, outfmt);
              nali++;
            }

          esl_msa_Destroy(msa);
        }
    }

  if (ofp != stdout) printf("\nRetrieved %d alignments.\n", nali);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
/* read_tabfile:
 * Fill matrix <D> from the whitespace-delimited table in <tabfile>.
 *
 * The options "-v", "-q" and "-t" in <go> give the 1-based field numbers
 * of the value, the query key and the target key on each data line. Both
 * keys must already be present in keyhash <kh>; their indices in <kh>
 * select the cell of <D> that receives the value.
 *
 * <D> is first set to eslINFINITY everywhere. For each line, cell
 * [query][target] is assigned the value; the mirror cell [target][query]
 * is assigned too, but only if it still holds eslINFINITY, so a value
 * given explicitly for the reverse pair is not overwritten.
 *
 * All failures are fatal.
 */
static void
read_tabfile(ESL_GETOPTS *go, char *tabfile, ESL_KEYHASH *kh, ESL_DMATRIX *D)
{
  ESL_FILEPARSER *efp    = NULL;
  int             vfield = esl_opt_GetInteger(go, "-v");
  int             qfield = esl_opt_GetInteger(go, "-q");
  int             tfield = esl_opt_GetInteger(go, "-t");
  char           *tok;
  char           *endp;
  int             toklen;
  int             ntok;
  double          value;
  int             qidx, tidx;

  if (esl_fileparser_Open(tabfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open tabular file %s\n", tabfile);
  esl_fileparser_SetCommentChar(efp, '#');

  esl_dmatrix_Set(D, eslINFINITY);

  /* Line numbers in messages come from the parser's own counter. A private
   * count of the lines returned here would presumably drift from the real
   * position in the file once comment lines are skipped -- TODO confirm
   * against esl_fileparser_NextLine().
   */
  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      ntok  = 0;
      qidx  = tidx = -1;
      value = eslNaN;           /* NaN marks "value field not seen on this line" */

      while (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) == eslOK)
        {
          ntok++;

          if (ntok == vfield)
            {
              /* strtod() with an end-pointer check: atof() would quietly
               * turn a non-numeric field into 0.0.
               */
              value = strtod(tok, &endp);
              if (endp == tok || *endp != '\0')
                esl_fatal("Failed to parse value %s on line %d of file %s\n", tok, efp->linenumber, tabfile);
            }
          if (ntok == qfield && esl_keyhash_Lookup(kh, tok, toklen, &qidx) != eslOK)
            esl_fatal("failed to find query key %s", tok);
          if (ntok == tfield && esl_keyhash_Lookup(kh, tok, toklen, &tidx) != eslOK)
            esl_fatal("failed to find target key %s", tok);
        }

      if (qidx == -1)   esl_fatal("Failed to find query name on line %d of file %s (looking for field %d)\n",  efp->linenumber, tabfile, qfield);
      if (tidx == -1)   esl_fatal("Failed to find target name on line %d of file %s (looking for field %d)\n", efp->linenumber, tabfile, tfield);
      if (isnan(value)) esl_fatal("Failed to find value on line %d of file %s (looking for field %d)\n",       efp->linenumber, tabfile, vfield);

      D->mx[qidx][tidx] = value;
      if (D->mx[tidx][qidx] == eslINFINITY) D->mx[tidx][qidx] = value;
    }

  esl_fileparser_Close(efp);
}
/* multifetch_subseq:
 * Fetch one subsequence per data line of <gdffile>.
 *
 * Each line supplies four fields, in this order:
 *     <new name> <start> <end> <source seq name>
 * <start> must be a positive integer; <end> must be a positive integer,
 * or 0 for full length. Every parsed line is handed to onefetch_subseq()
 * together with <go>, <ofp> and <sqfp>.
 *
 * Any open or parse failure is fatal.
 */
static void
multifetch_subseq(ESL_GETOPTS *go, FILE *ofp, char *gdffile, ESL_SQFILE *sqfp)
{
  ESL_FILEPARSER *parser = NULL;
  char           *subname;      /* name to give the fetched subsequence   */
  char           *srcname;      /* name of the sequence it is taken from  */
  char           *field;        /* scratch pointer for the coordinates    */
  int             subname_len;
  int             srcname_len;
  int             from;
  int             to;

  if (esl_fileparser_Open(gdffile, NULL, &parser) != eslOK)
    {
      esl_fatal("Failed to open key file %s\n", gdffile);
    }
  esl_fileparser_SetCommentChar(parser, '#');

  for (;;)
    {
      if (esl_fileparser_NextLine(parser) != eslOK) break;

      /* field 1: new name */
      if (esl_fileparser_GetTokenOnLine(parser, &subname, &subname_len) != eslOK)
        {
          esl_fatal("Failed to read subseq name on line %d of file %s\n", parser->linenumber, gdffile);
        }

      /* field 2: start coordinate */
      if (esl_fileparser_GetTokenOnLine(parser, &field, NULL) != eslOK)
        {
          esl_fatal("Failed to read start coord on line %d of file %s\n", parser->linenumber, gdffile);
        }
      from = atoi(field);
      if (from <= 0)
        {
          esl_fatal("Read invalid start coord %d on line %d of file %s (must be positive integer)\n", from, parser->linenumber, gdffile);
        }

      /* field 3: end coordinate */
      if (esl_fileparser_GetTokenOnLine(parser, &field, NULL) != eslOK)
        {
          esl_fatal("Failed to read end coord on line %d of file %s\n", parser->linenumber, gdffile);
        }
      to = atoi(field);
      if (to < 0)
        {
          esl_fatal("Read invalid end coord %d on line %d of file %s (must be positive integer, or 0 for full length)\n", to, parser->linenumber, gdffile);
        }

      /* field 4: source sequence name */
      if (esl_fileparser_GetTokenOnLine(parser, &srcname, &srcname_len) != eslOK)
        {
          esl_fatal("Failed to read source seq name on line %d of file %s\n", parser->linenumber, gdffile);
        }

      onefetch_subseq(go, ofp, sqfp, subname, srcname, from, to);
    }

  esl_fileparser_Close(parser);
}
/* read_keyfile:
 * Store the first token of every data line of <keyfile> as a key in
 * keyhash <kh>. Keys must be unique.
 *
 * <go> is not used here. All failures are fatal.
 */
static void
read_keyfile(ESL_GETOPTS *go, char *keyfile, ESL_KEYHASH *kh)
{
  ESL_FILEPARSER *efp    = NULL;
  char           *tok    = NULL;
  int             toklen;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  /* Line numbers in messages come from the parser's own counter. A private
   * count of the lines returned here would presumably drift from the real
   * position in the file once comment lines are skipped -- TODO confirm
   * against esl_fileparser_NextLine().
   */
  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) != eslOK)
        esl_fatal("No token found on line %d of file %s\n", efp->linenumber, keyfile);

      status = esl_keyhash_Store(kh, tok, toklen, NULL);
      if      (status == eslEDUP) esl_fatal("Saw key %s twice in file %s: keys must be unique\n", tok, keyfile);
      else if (status != eslOK)   esl_fatal("unknown error in storing key %s\n", tok);
    }

  esl_fileparser_Close(efp);
}
/* main:
 * Mask regions of sequences.
 *
 * Takes two command line arguments, <seqfile> and <maskfile>. Each data
 * line of <maskfile> has the fields <seqname> <start> <end>, with 1-based
 * coordinates. For each line the named sequence is obtained (by random
 * access when -R is set, else by reading <seqfile> sequentially, in which
 * case both files must list sequences in the same order), the requested
 * residues are masked, and the sequence is written in FASTA format to the
 * -o file, or to stdout.
 *
 * Options read here:
 *   -R   fetch sequences through the SSI index
 *   -l   mask by lowercasing instead of by a mask character
 *   -m   mask character (default 'X')
 *   -x   number of extra residues to mask beyond each region edge
 *   -r   reverse masking: keep start..end, mask everything else
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go        = NULL;               /* application configuration       */
  char           *seqfile   = NULL;               /* sequence file name              */
  char           *maskfile  = NULL;               /* mask coordinate file name       */
  int             infmt     = eslSQFILE_UNKNOWN;  /* format code for seqfile         */
  int             outfmt    = eslSQFILE_FASTA;    /* format code for output seqs     */
  ESL_SQFILE     *sqfp      = NULL;               /* open sequence file              */
  ESL_FILEPARSER *maskefp   = NULL;               /* open mask coord file            */
  FILE           *ofp       = NULL;               /* output stream for masked seqs   */
  char           *source    = NULL;               /* name of current seq to mask     */
  char           *p1, *p2;                        /* pointers used in parsing        */
  int64_t         start, end;                     /* start, end coord for masking    */
  int64_t         i, j, pos;                      /* coords in a sequence            */
  int64_t         overmask;                       /* # of extra residues to mask     */
  ESL_SQ         *sq        = esl_sq_Create();    /* current sequence                */
  int             do_fetching;
  int             do_lowercase;
  int             maskchar;
  int             status;                         /* easel return code               */

  /****************************************************************************
   * Parse command line
   ****************************************************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  do_fetching  = esl_opt_GetBoolean(go, "-R");
  do_lowercase = esl_opt_GetBoolean(go, "-l");
  overmask     = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0);
  maskchar     = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m")    : 'X');

  seqfile  = esl_opt_GetArg(go, 1);
  maskfile = esl_opt_GetArg(go, 2);

  /* Open the <seqfile>: text mode, not digital */
  if (esl_opt_GetString(go, "--informat") != NULL)
    {
      infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
      /* the %s below must be given its argument: the offending format name */
      if (infmt == eslSQFILE_UNKNOWN)
        cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
    }
  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n",     seqfile);
  else if (status == eslEFORMAT)   cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile);
  else if (status == eslEINVAL)    cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n");
  else if (status != eslOK)        cmdline_failure(argv[0], "Open failed, code %d.\n", status);

  if (do_fetching)
    {
      status = esl_sqfile_OpenSSI(sqfp, NULL);
      if      (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n");
      else if (status == eslERANGE)  cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
      else if (status != eslOK)      cmdline_failure(argv[0], "Failed to open SSI index\n");
    }

  /* Open the <maskfile> */
  if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK)
    cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile);
  esl_fileparser_SetCommentChar(maskefp, '#');

  /* Open the output file, if any */
  if (esl_opt_GetString(go, "-o") != NULL)
    {
      if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
        cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o"));
    }
  else ofp = stdout;

  /****************************************************************************
   * Main loop over lines in <maskfile>
   ****************************************************************************/

  /* Read one data line at a time from the <maskfile>;
   * parse into data fields <seqname> <start> <end>
   */
  while (esl_fileparser_NextLine(maskefp) == eslOK)
    {
      /* First field is sequence name */
      if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK)
        esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile);

      /* Get the sequence */
      if (do_fetching)
        {  /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */
          status = esl_sqio_Fetch(sqfp, source, sq);
          if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename);
          else if (status == eslEINVAL)    esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename);
          else if (status == eslEFORMAT)   esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)        esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename);
        }
      else
        { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */
          status = esl_sqio_Read(sqfp, sq);
          if      (status == eslEOF)     esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source);
          else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)      esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename);

          if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0))
            esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R");
        }

      /* If we're masking by lowercase, first make sure everything's uppercase.
       * (The <ctype.h> functions need an unsigned char value, hence the casts.)
       */
      if (do_lowercase)
        for (pos = 0; pos < sq->n; pos++)
          if (isalpha((unsigned char) sq->seq[pos]))
            sq->seq[pos] = toupper((unsigned char) sq->seq[pos]);

      /* Next two fields are <start>, <end> for the masking  */
      /* possible future extension: wrap loop around this, enable multiple masked regions */
      /* Coordinates are 1-based in the file, 0-based here. The end pointer from
       * strtoll() is checked so a non-numeric field is rejected instead of
       * silently becoming coordinate 0.
       */
      if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK)
        esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      start = strtoll(p1, &p2, 0) - 1;
      if (p2 == p1 || *p2 != '\0')
        esl_fatal("Failed to parse start coord %s on line %d of file %s\n", p1, maskefp->linenumber, maskfile);

      if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK)
        esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      end = strtoll(p2, &p1, 0) - 1;
      if (p1 == p2 || *p1 != '\0')
        esl_fatal("Failed to parse end coord %s on line %d of file %s\n", p2, maskefp->linenumber, maskfile);

      /* Do the masking */
      if (esl_opt_GetBoolean(go, "-r")) /* Reverse masking */
        { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */
          i = 0;
          j = ESL_MIN(sq->n-1, start - 1 + overmask);
          for (pos = i; pos <= j; pos++)
            if (isalpha((unsigned char) sq->seq[pos]))
              sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);

          i = ESL_MAX(0, end + 1 - overmask);
          j = sq->n-1;
          for (pos = i; pos <= j; pos++)
            if (isalpha((unsigned char) sq->seq[pos]))
              sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);
        }
      else
        {  /* normal: mask start..end */
          i = ESL_MAX(0,       start - overmask);
          j = ESL_MIN(sq->n-1, end   + overmask);
          for (pos = i; pos <= j; pos++)
            if (isalpha((unsigned char) sq->seq[pos]))
              sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);
        }

      esl_sqio_Write(ofp, sq, outfmt, FALSE);
      esl_sq_Reuse(sq);
    }

  esl_sq_Destroy(sq);
  esl_fileparser_Close(maskefp);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  if (ofp != stdout) fclose(ofp);
  return 0;
}
int main(int argc, char **argv) { int status; ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded(); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); ESL_SCOREMATRIX *S = NULL; ESL_DSQ *x = NULL; /* iid query */ ESL_DSQ *y = NULL; /* iid target */ double lambda; double bg[20]; /* iid background probabilities */ int L; /* query length */ int M; /* target length */ int nseq; /* number of target seqs to simulate */ int i; int gop; int gex; char *mxfile = "PMX"; int raw_sc; /* Configuration */ L = 400; /* query length */ M = 400; /* target length */ nseq = 50000; gop = -11; gex = -1; lambda = 0.3207; ESL_ALLOC(x, sizeof(ESL_DSQ) * (L+2)); ESL_ALLOC(y, sizeof(ESL_DSQ) * (M+2)); /* Input an amino acid score matrix from a file. */ if (mxfile != NULL) { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_sco_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } else { /* default = BLOSUM62 */ S = esl_scorematrix_Create(abc); esl_scorematrix_SetBLOSUM62(S); } esl_composition_BL62(bg); esl_rsq_xIID(r, bg, 20, L, x); for (i = 0; i < nseq; i++) { esl_rsq_xIID(r, bg, 20, M, y); esl_swat_Score(x, L, y, M, S, gop, gex, &raw_sc); printf("%d\n", raw_sc); } free(x); free(y); esl_scorematrix_Destroy(S); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); exit(0); ERROR: exit(status); }
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the
 *            <P7_BG> object <bg>.
 *
 *            The first token of the file names the alphabet type and
 *            must match <bg>'s alphabet. Each following data line
 *            holds exactly two fields: a canonical residue letter and
 *            its probability, which must lie in the range 0..1. Every
 *            residue must appear exactly once, and the probabilities
 *            must sum to 1.0 (within 0.001); they are renormalized
 *            before being stored.
 *
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 *
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;     /* frequencies parsed so far; -1.0 marks "not seen yet" */
  int             n     = 0;        /* number of residues parsed                            */
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status = esl_fileparser_Open(bgfile, NULL, &efp);
  if (status == eslENOTFOUND) {
    ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  } else if (status != eslOK) goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if (status == eslEOF) {
    ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  } else if (status != eslOK) goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if (alphatype == eslUNKNOWN) {
    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  } else if (alphatype != bg->abc->type) {
    ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);
  }

  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* field 1: the residue letter */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if (status == eslEOL) {
        ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      } else if (status != eslOK) goto ERROR;

      if (toklen != 1 || ! esl_abc_CIsCanonical(bg->abc, *tok)) {
        ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
      }

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0) {
        ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      }
      n++;

      /* field 2: its probability */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if (status == eslEOL) {
        ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      } else if (status != eslOK) goto ERROR;

      if (! esl_str_IsReal(tok)) {
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
      }

      fq[x] = atof(tok);

      /* Reject values outside 0..1. Besides being invalid probabilities,
       * a literal -1.0 would be indistinguishable from the "not seen yet"
       * marker, and negative entries could still pass the sum check below.
       */
      if (fq[x] < 0.0 || fq[x] > 1.0) {
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability in the range 0..1, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
      }

      /* nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if (status == eslOK) {
        ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      } else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K) {
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  }
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK) {
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);
  }

  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  free(fq);
  if (efp != NULL) esl_fileparser_Close(efp);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", 
matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the seqs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire seq file in a single pass, outputting seqs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the seqs in the order they
 * appear in the <keyfile>, but without an SSI index, you get seqs in
 * the order they occur in the seq file.
 *
 * Sequences are written to <ofp> in FASTA format; in the single-pass
 * mode they are reverse complemented first when option "-r" is set in
 * <go>. It is a fatal error if the number of sequences retrieved differs
 * from the number of keys. When <ofp> is not stdout, a one-line summary
 * is printed to stdout. All failures are fatal.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  int             nseq   = 0;   /* sequences retrieved          */
  int             nkeys  = 0;   /* keys read from <keyfile>     */
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (keys == NULL) esl_fatal("Failed to allocate key hash for key file %s\n", keyfile);

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);

      /* Report any store failure, not just duplicates: a key that silently
       * failed to store would only surface later as a count mismatch.
       */
      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if      (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store seq key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp); nseq++; }
      nkeys++;
    }

  /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();

      while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
        {
          /* A sequence is selected by its name or by its accession; empty strings never match. */
          if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
               (sq->acc[0]  != '\0' && esl_keyhash_Lookup(keys, sq->acc,  -1, NULL) == eslOK))
            {
              if (esl_opt_GetBoolean(go, "-r") && esl_sq_ReverseComplement(sq) != eslOK)
                esl_fatal("Failed to reverse complement %s\n", sq->name);
              esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
              nseq++;
            }
          esl_sq_Reuse(sq);
        }
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
                                               sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
                                               status, sqfp->filename);
      esl_sq_Destroy(sq);
    }

  if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);

  if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}