/* onefetch(): * Given one <key> (a seq name or accession), retrieve the corresponding sequence. * In SSI mode, we can do this quickly by positioning the file, then regurgitating * every line until the end-of-record marker; we don't even have to parse. * Without an SSI index, we have to parse the file sequentially 'til we find * the one we're after. */ static void onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp) { ESL_SQ *sq = esl_sq_Create(); int do_revcomp = esl_opt_GetBoolean(go, "-r"); char *newname = esl_opt_GetString(go, "-n"); int status; /* Try to position the file at the desired sequence with SSI. */ if (sqfp->data.ascii.ssi != NULL) { status = esl_sqfile_PositionByKey(sqfp, key); if (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", key, sqfp->filename); else if (status == eslEFORMAT) esl_fatal("Failed to parse SSI index for %s\n", sqfp->filename); else if (status != eslOK) esl_fatal("Failed to look up location of seq %s in SSI index of file %s\n", key, sqfp->filename); status = esl_sqio_Read(sqfp, sq); if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status == eslEOF) esl_fatal("Unexpected EOF reading sequence file %s", status, sqfp->filename); else if (status != eslOK) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); if (strcmp(key, sq->name) != 0 && strcmp(key, sq->acc) != 0) esl_fatal("whoa, internal error; found the wrong sequence %s, not %s", sq->name, key); } else { /* Else, we have to read the whole damn file sequentially until we find the seq */ while ((status = esl_sqio_Read(sqfp, sq)) != eslEOF) { if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); if (strcmp(key, sq->name) == 0 || strcmp(key, sq->acc) == 0) break; esl_sq_Reuse(sq); } if (status == eslEOF) esl_fatal("Failed to find sequence %s in file %s\n", key, sqfp->filename); } if (do_revcomp == FALSE && newname == NULL && ! esl_sqio_IsAlignment(sqfp->format)) { /* If we're not manipulating the sequence in any way, and it's not from an alignment file, we can Echo() it. */ if (esl_sqio_Echo(sqfp, sq, ofp) != eslOK) esl_fatal("Echo failed: %s\n", esl_sqfile_GetErrorBuf(sqfp)); } else { /* Otherwise we Write() the parsed version. */ if (do_revcomp && esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name); if (newname != NULL) esl_sq_SetName(sq, newname); esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE); } esl_sq_Destroy(sq); }
static void onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end) { int start, end; int do_revcomp; ESL_SQ *sq = esl_sq_Create(); if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index"); /* reverse complement indicated by coords. */ /* -c 52: would be 52,0, so watch out for given_end = 0 case */ if (given_end != 0 && given_start > given_end) { start = given_end; end = given_start; do_revcomp = TRUE; } else { start = given_start; end = given_end; do_revcomp = FALSE; } if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK) esl_fatal(esl_sqfile_GetErrorBuf(sqfp)); if (newname != NULL) esl_sq_SetName(sq, newname); else esl_sq_FormatName(sq, "%s/%d-%d", key, given_start, (given_end == 0) ? sq->L : given_end); /* Two ways we might have been asked to revcomp: by coord, or by -r option */ /* (If both happen, they'll cancel each other out) */ if (do_revcomp) if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name); if (esl_opt_GetBoolean(go, "-r")) if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name); esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE); esl_sq_Destroy(sq); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; char *seqfile = NULL; ESL_SQFILE *sqfp = NULL; int infmt = eslSQFILE_UNKNOWN; int alphatype = eslUNKNOWN; ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; int64_t nseq = 0; int64_t nres = 0; int64_t small = 0; int64_t large = 0; double *monoc = NULL; /* monoresidue composition per sequence */ double *monoc_all = NULL; /* monoresidue composition over all seqs */ int do_comp = FALSE; int status = eslOK; int wstatus; int i; int do_stall; /* used to stall when debugging */ /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); seqfile = esl_opt_GetArg(go, 1); do_comp = esl_opt_GetBoolean(go, "-c"); if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */ while (do_stall); /* open input file */ status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA; else if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA; else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO; else { status = esl_sqfile_GuessAlphabet(sqfp, &alphatype); if (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status == eslENODATA) esl_fatal("Sequence file %s contains no data?", seqfile); else if (status != eslOK) esl_fatal("Failed to guess alphabet (error code %d)\n", status); } abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); esl_sqfile_SetDigital(sqfp, abc); if (do_comp) { ESL_ALLOC(monoc, (abc->Kp) * sizeof(double)); ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double)); esl_vec_DSet(monoc_all, abc->Kp, 0.0); esl_vec_DSet(monoc, abc->Kp, 0.0); } while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF) { if (wstatus == eslOK) { if (do_comp) for (i = 1; i <= sq->n; i++) monoc[sq->dsq[i]]++; } else if (wstatus == eslEOD) { if (nseq == 0) { small = large = sq->L; } else { small = ESL_MIN(small, sq->L); large = ESL_MAX(large, sq->L); } if (esl_opt_GetBoolean(go, "-a")) { printf("= %-20s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : ""); } nres += sq->L; nseq++; esl_sq_Reuse(sq); if (do_comp) { esl_vec_DAdd(monoc_all, monoc, abc->Kp); esl_vec_DSet(monoc, abc->Kp, 0.0); } } else if (wstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else esl_fatal("Unexpected error %d reading sequence file %s", wstatus, sqfp->filename); } printf("Format: %s\n", esl_sqio_DecodeFormat(sqfp->format)); printf("Alphabet type: %s\n", esl_abc_DecodeType(abc->type)); printf("Number of sequences: %" PRId64 "\n", nseq); printf("Total # residues: %" PRId64 "\n", nres); printf("Smallest: %" PRId64 "\n", small); printf("Largest: %" PRId64 "\n", large); printf("Average length: %.1f\n", (float) nres / (float) nseq); if (do_comp) { show_overall_composition(abc, monoc_all, nres); free(monoc); free(monoc_all); } esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); esl_getopts_Destroy(go); return 0; ERROR: return status; }
static int do_by_windows(ESL_GENCODE *gcode, ESL_GENCODE_WORKSTATE *wrk, ESL_SQFILE *sqfp) { ESL_SQ *sq = esl_sq_CreateDigital(gcode->nt_abc); int windowsize = 4092; // can be any value, but a multiple of 3 makes most sense. windowsize can be +/-; + means reading top strand; - means bottom strand. int contextsize = 2; // contextsize (adjacent window overlap) must be 2, or translation won't work properly. int wstatus; ESL_DASSERT1(( windowsize % 3 == 0 )); while (( wstatus = esl_sqio_ReadWindow(sqfp, contextsize, windowsize, sq)) != eslEOF) { if (wstatus == eslEOD) { if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick)) esl_gencode_ProcessEnd(wrk, sq); if (windowsize > 0 && ! wrk->do_crick) { esl_sq_Reuse(sq); continue; } // Don't switch to revcomp if we don't need do. Allows -W --watson to work on nonrewindable streams if (windowsize < 0) esl_sq_Reuse(sq); // Do not Reuse the sq on the switch from watson to crick; ReadWindow needs sq->L windowsize = -windowsize; // switch to other strand. continue; } else if (wstatus == eslEFORMAT) esl_fatal("Parsing failed in sequence file %s:\n%s", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (wstatus == eslEINVAL) esl_fatal("Invalid residue(s) found in sequence file %s\n%s", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (wstatus != eslOK) esl_fatal("Unexpected error %d reading sequence file %s", wstatus, sqfp->filename); /* If we're the first window in this input DNA sequence * (or the first window in its revcomp), then initialize. * sq->C is the actual context overlap; 0=1st window; 2 (i.e. C)= subsequent. */ if (sq->C == 0) { if (sq->n < 3) continue; // DNA sequence too short; skip it, don't even bother to revcomp, go to next sequence. if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick)) esl_gencode_ProcessStart(gcode, wrk, sq); } if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick)) esl_gencode_ProcessPiece(gcode, wrk, sq); } esl_sq_Destroy(sq); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *seqfile = NULL; /* sequence file name */ char *maskfile = NULL; /* mask coordinate file name */ int infmt = eslSQFILE_UNKNOWN; /* format code for seqfile */ int outfmt = eslSQFILE_FASTA; /* format code for output seqs */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ ESL_FILEPARSER *maskefp = NULL; /* open mask coord file */ FILE *ofp = NULL; /* output stream for masked seqs */ char *source = NULL; /* name of current seq to mask */ char *p1, *p2; /* pointers used in parsing */ int64_t start, end; /* start, end coord for masking */ int64_t i, j, pos; /* coords in a sequence */ int64_t overmask; /* # of extra residues to mask */ ESL_SQ *sq = esl_sq_Create(); /* current sequence */ int do_fetching; int do_lowercase; int maskchar; int status; /* easel return code */ /**************************************************************************** * Parse command line ****************************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); do_fetching = esl_opt_GetBoolean(go, "-R"); do_lowercase = esl_opt_GetBoolean(go, "-l"); overmask = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0); maskchar = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m") : 'X'); seqfile = esl_opt_GetArg(go, 1); maskfile = esl_opt_GetArg(go, 2); /* Open the <seqfile>: text mode, not digital */ if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n", seqfile); else if (status == eslEFORMAT) cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile); else if (status == eslEINVAL) cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n"); else if (status != eslOK) cmdline_failure(argv[0], "Open failed, code %d.\n", status); if(do_fetching) { status = esl_sqfile_OpenSSI(sqfp, NULL); if (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n"); else if (status == eslERANGE) cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n"); else if (status != eslOK) cmdline_failure(argv[0], "Failed to open SSI index\n"); } /* Open the <maskfile> */ if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK) cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile); esl_fileparser_SetCommentChar(maskefp, '#'); /* Open the output file, if any */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /**************************************************************************** * Main loop over lines in <maskfile> ****************************************************************************/ /* Read one data line at a time from the <maskfile>; * parse into data fields <seqname> <start> <end> */ while (esl_fileparser_NextLine(maskefp) == eslOK) { /* First field is sequence name */ if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK) esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile); /* Get the sequence */ if (do_fetching) { /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */ status = esl_sqio_Fetch(sqfp, source, sq); if (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename); else if (status == eslEINVAL) esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename); } else { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */ status = esl_sqio_Read(sqfp, sq); if (status == eslEOF) esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename); if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0)) esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R"); } /* If we're masking by lowercase, first make sure everything's uppercase */ if (do_lowercase) for (pos = 0; pos < sq->n; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = toupper(sq->seq[pos]); /* Next two fields are <start>, <end> for the masking */ /* possible future extension: wrap loop around this, enable multiple masked regions */ if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK) esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile); start = strtoll(p1, &p2, 0) - 1; if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK) esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile); end = strtoll(p2, &p1, 0) - 1; /* Do the masking */ if (esl_opt_GetBoolean(go, "-r")) /* Reverse masking */ { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */ i = 0; j = ESL_MIN(sq->n-1, start - 1 + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); i = ESL_MAX(0, end + 1 - overmask); j = sq->n-1; for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } else { /* normal: mask start..end */ i = ESL_MAX(0, start - overmask); j = ESL_MIN(sq->n-1, end + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } esl_sqio_Write(ofp, sq, outfmt, FALSE); esl_sq_Reuse(sq); } esl_sq_Destroy(sq); esl_fileparser_Close(maskefp); esl_sqfile_Close(sqfp); esl_getopts_Destroy(go); if (ofp != stdout) fclose(ofp); return 0; }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmdb> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* output file for results (default stdout) */ FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */ FILE *dfamtblfp = NULL; /* output stream for tabular Dfam format (--dfamtblout) */ FILE *aliscoresfp = NULL; /* output stream for alignment scores (--aliscoresout) */ // P7_HMM *hmm = NULL; /* one HMM query */ // P7_SCOREDATA *scoredata = NULL; int seqfmt = eslSQFILE_UNKNOWN; /* format of seqfile */ ESL_SQFILE *sqfp = NULL; /* open seqfile */ P7_HMMFILE *hfp = NULL; /* open HMM database file */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ P7_OPROFILE *om = NULL; /* target profile */ ESL_STOPWATCH *w = NULL; /* timing */ ESL_SQ *qsq = NULL; /* query sequence */ int nquery = 0; int textw; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; #ifdef HMMER_THREADS P7_OM_BLOCK *block = NULL; ESL_THREADS *threadObj= NULL; ESL_WORK_QUEUE *queue = NULL; #endif char errbuf[eslERRBUFSIZE]; double window_beta = -1.0 ; int window_length = -1; if (esl_opt_IsUsed(go, "--w_beta")) { if ( ( window_beta = esl_opt_GetReal(go, "--w_beta") ) < 0 || window_beta > 1 ) esl_fatal("Invalid window-length beta value\n"); } if (esl_opt_IsUsed(go, "--w_length")) { if (( window_length = esl_opt_GetInteger(go, "--w_length")) < 4 ) esl_fatal("Invalid window length value\n"); } w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--qformat")) { seqfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat")); if (seqfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat")); } /* validate options if running as a daemon */ // if (esl_opt_IsOn(go, "--daemon")) { /* running as a daemon, the input format must be type daemon */ // if (seqfmt != eslSQFILE_UNKNOWN && seqfmt != eslSQFILE_DAEMON) // esl_fatal("Input format %s not supported. Must be daemon\n", esl_opt_GetString(go, "--qformat")); // seqfmt = eslSQFILE_DAEMON; // if (strcmp(cfg->seqfile, "-") != 0) esl_fatal("Query sequence file must be '-'\n"); // } /* Open the target profile database to get the sequence alphabet */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem, trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); if (! hfp->is_pressed) p7_Fail("Failed to open binary auxfiles for %s: use hmmpress first\n", hfp->fname); hstatus = p7_oprofile_ReadMSV(hfp, &abc, &om); if (hstatus == eslEFORMAT) p7_Fail("bad format, binary auxfiles, %s:\n%s", cfg->hmmfile, hfp->errbuf); else if (hstatus == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); else if (hstatus != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); p7_oprofile_Destroy(om); p7_hmmfile_Close(hfp); /* Open the query sequence database */ status = esl_sqfile_OpenDigital(abc, cfg->seqfile, seqfmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->seqfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->seqfile); if (sqfp->format > 100) // breaking the law! That range is reserved for msa, for aligned formats p7_Fail("%s contains a multiple sequence alignment; expect unaligned sequences, like FASTA\n", cfg->seqfile); qsq = esl_sq_CreateDigital(abc); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } if (esl_opt_IsOn(go, "--tblout")) { if ((tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL) esl_fatal("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); } if (esl_opt_IsOn(go, "--dfamtblout")) { if ((dfamtblfp = fopen(esl_opt_GetString(go, "--dfamtblout"),"w")) == NULL) esl_fatal("Failed to open tabular dfam output file %s for writing\n", esl_opt_GetString(go, "--dfamtblout")); } if (esl_opt_IsOn(go, "--aliscoresout")) { if ((aliscoresfp = fopen(esl_opt_GetString(go, "--aliscoresout"),"w")) == NULL) esl_fatal("Failed to open alignment scores output file %s for writing\n", esl_opt_GetString(go, "--aliscoresout")); } output_header(ofp, go, cfg->hmmfile, cfg->seqfile); #ifdef HMMER_THREADS /* initialize thread data */ if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&ncpus); if (ncpus > 0) { threadObj = esl_threads_Create(&pipeline_thread); queue = esl_workqueue_Create(ncpus * 2); } #endif infocnt = (ncpus == 0) ? 1 : ncpus; ESL_ALLOC(info, sizeof(*info) * infocnt); for (i = 0; i < infocnt; ++i) { info[i].bg = p7_bg_Create(abc); #ifdef HMMER_THREADS info[i].queue = queue; #endif } #ifdef HMMER_THREADS for (i = 0; i < ncpus * 2; ++i) { block = p7_oprofile_CreateBlock(BLOCK_SIZE); if (block == NULL) esl_fatal("Failed to allocate sequence block"); status = esl_workqueue_Init(queue, block); if (status != eslOK) esl_fatal("Failed to add block to work queue"); } #endif /* Outside loop: over each query sequence in <seqfile>. */ while ((sstatus = esl_sqio_Read(sqfp, qsq)) == eslOK) { if (sstatus == eslEMEM) p7_Fail("Memory allocation error reading sequence file\n", status); if (sstatus == eslEINCONCEIVABLE) p7_Fail("Unexpected error %d reading sequence file\n", status); // if (qsq->L > NHMMER_MAX_RESIDUE_COUNT) p7_Fail("Input sequence %s in file %s exceeds maximum length of %d bases.\n", qsq->name, cfg->seqfile, NHMMER_MAX_RESIDUE_COUNT); nquery++; esl_stopwatch_Start(w); /* Open the target profile database */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, NULL); if (status != eslOK) p7_Fail("Unexpected error %d in opening hmm file %s.\n", status, cfg->hmmfile); #ifdef HMMER_THREADS /* if we are threaded, create a lock to prevent multiple readers */ if (ncpus > 0) { status = p7_hmmfile_CreateLock(hfp); if (status != eslOK) p7_Fail("Unexpected error %d creating lock\n", status); } #endif if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->acc[0] != 0 && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->desc[0] != 0 && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].pli = p7_pipeline_Create(go, 100, 100, TRUE, p7_SCAN_MODELS); /* M_hint = 100, L_hint = 100 are just dummies for now */ info[i].pli->hfp = hfp; /* for two-stage input, pipeline needs <hfp> */ p7_pli_NewSeq(info[i].pli, qsq); info[i].qsq = qsq; if ( esl_opt_IsUsed(go, "--toponly") ) info[i].pli->strand = p7_STRAND_TOPONLY; else if ( esl_opt_IsUsed(go, "--bottomonly") ) info[i].pli->strand = p7_STRAND_BOTTOMONLY; else info[i].pli->strand = p7_STRAND_BOTH; #ifdef HMMER_THREADS if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]); #endif } #ifdef HMMER_THREADS if (ncpus > 0) hstatus = thread_loop(threadObj, queue, hfp); else hstatus = serial_loop(info, hfp); #else hstatus = serial_loop(info, hfp); #endif switch(hstatus) { case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing */ break; default: p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); } /* modify e-value to account for number of models */ for (i = 0; i < info->th->N ; i++) { info->th->unsrt[i].lnP += log((float)info->pli->nmodels); info->th->unsrt[i].dcl[0].lnP = info->th->unsrt[i].lnP; info->th->unsrt[i].sortkey = -1.0 * info->th->unsrt[i].lnP; } /* it's possible to have duplicates based on how viterbi ranges can overlap */ p7_tophits_SortByModelnameAndAlipos(info->th); p7_tophits_RemoveDuplicates(info->th, info->pli->use_bit_cutoffs); /* Print results */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); //tally up total number of hits and target coverage info->pli->n_output = info->pli->pos_output = 0; for (i = 0; i < info->th->N; i++) { if ( (info->th->hit[i]->flags & p7_IS_REPORTED) || info->th->hit[i]->flags & p7_IS_INCLUDED) { info->pli->n_output++; info->pli->pos_output += abs(info->th->hit[i]->dcl[0].jali - info->th->hit[i]->dcl[0].iali) + 1; } } p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1)); if (dfamtblfp) p7_tophits_TabularXfam(dfamtblfp, qsq->name, NULL, info->th, info->pli); if (aliscoresfp) p7_tophits_AliScores(aliscoresfp, qsq->name, info->th ); esl_stopwatch_Stop(w); info->pli->nseqs = 1; p7_pli_Statistics(ofp, info->pli, w); if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); fflush(ofp); p7_hmmfile_Close(hfp); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); esl_sq_Reuse(qsq); } if (sstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (sstatus != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", sstatus, sqfp->filename); /* Terminate outputs - any last words? */ if (tblfp) p7_tophits_TabularTail(tblfp, "hmmscan", p7_SCAN_MODELS, cfg->seqfile, cfg->hmmfile, go); if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Cleanup - prepare for successful exit */ for (i = 0; i < infocnt; ++i) p7_bg_Destroy(info[i].bg); #ifdef HMMER_THREADS if (ncpus > 0) { esl_workqueue_Reset(queue); while (esl_workqueue_Remove(queue, (void **) &block) == eslOK) p7_oprofile_DestroyBlock(block); esl_workqueue_Destroy(queue); esl_threads_Destroy(threadObj); } #endif free(info); esl_sq_Destroy(qsq); esl_stopwatch_Destroy(w); esl_alphabet_Destroy(abc); esl_sqfile_Close(sqfp); if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return eslOK; ERROR: if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return status; }
/* seq_shuffling() * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg] * * Shuffling of input sequences. * * Fixed-length (L>0) vs. full-length (L=0) modes handled differently. * In fixed-length mode: * <shuff->seq> only needs to be allocated once, for L * <targ> is an allocated copy of a random subseq of length L * sequences < L residues long can't be shuffled * In full-length mode: * <shuff->seq> is grown to length <sq->n> for each input seq * <targ> just points to <sq->seq> */ static int seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt) { char *seqfile = esl_opt_GetArg(go, 1); int infmt = eslSQFILE_UNKNOWN; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = esl_sq_Create(); ESL_SQ *shuff = esl_sq_Create(); char *targ = NULL; int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */ int kmers = 0; int i; int status; if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k"); status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); if (L>0) { esl_sq_GrowTo(shuff, L); shuff->n = L; ESL_ALLOC(targ, sizeof(char) * (L+1)); } while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { if (L == 0) { /* shuffling entire sequence */ esl_sq_GrowTo(shuff, sq->n); /* make sure shuff can hold sq */ shuff->n = sq->n; targ = sq->seq; } else { if (sq->n < L) continue; /* reject seqs < L long */ } for (i = 0; i < N; i++) { if (L > 0) { /* fixed-len mode: copy a random subseq */ int pos = esl_rnd_Roll(r, sq->n - L + 1); strncpy(targ, sq->seq + pos, L); targ[L] = '\0'; } /* Do the requested kind of shuffling */ if (esl_opt_GetBoolean(go, "-m")) esl_rsq_CShuffle (r, targ, shuff->seq); /* monoresidue shuffling */ else if (esl_opt_GetBoolean(go, "-d")) esl_rsq_CShuffleDP (r, targ, shuff->seq); /* diresidue shuffling */ else if (esl_opt_IsOn (go, "-k")) esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq); /* diresidue shuffling */ else if (esl_opt_GetBoolean(go, "-0")) esl_rsq_CMarkov0 (r, targ, shuff->seq); /* 0th order Markov */ else if (esl_opt_GetBoolean(go, "-1")) esl_rsq_CMarkov1 (r, targ, shuff->seq); /* 1st order Markov */ else if (esl_opt_GetBoolean(go, "-r")) esl_rsq_CReverse ( targ, shuff->seq); /* reverse */ else if (esl_opt_IsOn (go, "-w")) { /* regionally shuffle */ int W= esl_opt_GetInteger(go, "-w"); esl_rsq_CShuffleWindows(r, targ, W, shuff->seq); } /* Set the name of the shuffled sequence */ if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i); else esl_sq_FormatName(shuff, "%s-shuffled", sq->name); /* Output the resulting sequence */ esl_sqio_Write(ofp, shuff, outfmt, FALSE); /* don't need to reuse the shuffled sequence: we will use exactly the same memory */ } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); if (L>0) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return eslOK; ERROR: if (targ != NULL) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return status; }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmfile> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). We also use the * ESL_EXCEPTION and ERROR: mechanisms, but only because we know we're * using a fatal exception handler. */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* results output file (-o) */ P7_HMMFILE *hfp = NULL; /* open input HMM file */ ESL_SQFILE *dbfp = NULL; /* open input sequence file */ P7_HMM *hmm = NULL; /* one HMM query */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ int dbfmt = eslSQFILE_UNKNOWN; /* format code for sequence database file */ ESL_STOPWATCH *w; int textw = 0; int nquery = 0; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; char errbuf[eslERRBUFSIZE]; w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); if (esl_opt_IsOn(go, "--tformat")) { dbfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat")); if (dbfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat")); } /* Open the target sequence database */ status = esl_sqfile_Open(cfg->dbfile, dbfmt, p7_SEQDBENV, &dbfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->dbfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->dbfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->dbfile); if (esl_opt_IsUsed(go, "--restrictdb_stkey") || esl_opt_IsUsed(go, "--restrictdb_n")) { if (esl_opt_IsUsed(go, "--ssifile")) esl_sqfile_OpenSSI(dbfp, esl_opt_GetString(go, "--ssifile")); else esl_sqfile_OpenSSI(dbfp, NULL); } /* Open the query profile HMM file */ status = p7_hmmfile_OpenE(cfg->hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w+")) == NULL) p7_Fail("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } infocnt = 1; ESL_ALLOC(info, sizeof(*info) * infocnt); /* <abc> is not known 'til first HMM is read. */ hstatus = p7_hmmfile_Read(hfp, &abc, &hmm); if (hstatus == eslOK) { /* One-time initializations after alphabet <abc> becomes known */ // output_header(ofp, go, cfg->hmmfile, cfg->dbfile); // dbfp->abc = abc; //ReadBlock requires knowledge of the alphabet to decide how best to read blocks // for (i = 0; i < infocnt; ++i) // { // info[i].bg = p7_bg_Create(abc); // } } /* Outer loop: over each query HMM in <hmmfile>. */ // while (hstatus == eslOK) // { P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; /* optimized query profile */ nquery++; esl_stopwatch_Start(w); /* seqfile may need to be rewound (multiquery mode) */ if (nquery > 1) { if (! esl_sqfile_IsRewindable(dbfp)) esl_fatal("Target sequence file %s isn't rewindable; can't search it with multiple queries", cfg->dbfile); if (! esl_opt_IsUsed(go, "--restrictdb_stkey") ) esl_sqfile_Position(dbfp, 0); //only re-set current position to 0 if we're not planning to set it in a moment } if ( cfg->firstseq_key != NULL ) { //it's tempting to want to do this once and capture the offset position for future passes, but ncbi files make this non-trivial, so this keeps it general sstatus = esl_sqfile_PositionByKey(dbfp, cfg->firstseq_key); if (sstatus != eslOK) p7_Fail("Failure setting restrictdb_stkey to %d\n", cfg->firstseq_key); } // if (fprintf(ofp, "Query: %s [M=%d]\n", hmm->name, hmm->M) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // if (hmm->acc) { if (fprintf(ofp, "Accession: %s\n", hmm->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } // if (hmm->desc) { if (fprintf(ofp, "Description: %s\n", hmm->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Convert to an optimized model */ gm = p7_profile_Create (hmm->M, abc); om = p7_oprofile_Create(hmm->M, abc); // p7_ProfileConfig(hmm, info->bg, gm, 100, p7_LOCAL); /* 100 is a dummy length for now; and MSVFilter requires local mode */ p7_oprofile_Convert(gm, om); /* <om> is now p7_LOCAL, multihit */ for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].om = p7_oprofile_Clone(om); info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */ P7_PIPELINE *pli = info[i].pli; pli->nmodels++; pli->nnodes += info[i].om->M; // if (pli->Z_setby == p7_ZSETBY_NTARGETS && pli->mode == p7_SCAN_MODELS) pli->Z = pli->nmodels; // if (pli->do_biasfilter) p7_bg_SetFilter(info[i].bg, info[i].om->M, info[i].om->compo); // if (pli->mode == p7_SEARCH_SEQS) // status = p7_pli_NewModelThresholds(pli, info[i].om); pli->W = info[i].om->max_length; } sstatus = serial_loop(info, dbfp, cfg->n_targetseq, ofp); switch(sstatus) { case eslEFORMAT: esl_fatal("Parse failed (sequence file %s):\n%s\n", dbfp->filename, esl_sqfile_GetErrorBuf(dbfp)); break; case eslEOF: /* do nothing */ break; default: esl_fatal("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); p7_oprofile_Destroy(info[i].om); } /* Print the results. */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); // p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); esl_stopwatch_Stop(w); // p7_pli_Statistics(ofp, info->pli, w); // if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); p7_oprofile_Destroy(info->om); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); // hstatus = p7_hmmfile_Read(hfp, &abc, &hmm); // } /* end outer loop over query HMMs */ switch(hstatus) { case eslEOD: p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile); break; case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing. EOF is what we want. */ break; default: p7_Fail("Unexpected error (%d) in reading HMMs from %s", hstatus, cfg->hmmfile); } /* Terminate outputs... any last words? */ /* Cleanup - prepare for exit */ // for (i = 0; i < infocnt; ++i) // p7_bg_Destroy(info[i].bg); free(info); p7_hmmfile_Close(hfp); esl_sqfile_Close(dbfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); if (ofp != stdout) fclose(ofp); printf("44HHHH \n"); return eslOK; ERROR: return eslFAIL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *hmmfile = NULL; /* HMM file name */ char *seqfile = NULL; /* sequence file name */ char *mapfile = NULL; /* optional mapped MSA file name */ int infmt = eslSQFILE_UNKNOWN; int outfmt = eslMSAFILE_STOCKHOLM; P7_HMMFILE *hfp = NULL; /* open HMM file */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ char *outfile = NULL; /* output filename */ FILE *ofp = stdout; /* output stream */ ESL_SQ **sq = NULL; /* array of sequences */ void *p = NULL; /* tmp ptr for reallocation */ int nseq = 0; /* # of sequences in <seqfile> */ int mapseq = 0; /* # of sequences in mapped MSA */ int totseq = 0; /* # of seqs in all sources */ ESL_ALPHABET *abc = NULL; /* alphabet (set from the HMM file)*/ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; /* array of tracebacks */ ESL_MSA *msa = NULL; /* resulting multiple alignment */ int msaopts = 0; /* flags to p7_tracealign_Seqs() */ int idx; /* counter over seqs, traces */ int status; /* easel/hmmer return code */ char errbuf[eslERRBUFSIZE]; /* Parse the command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); seqfile = esl_opt_GetArg(go, 2); if (strcmp(hmmfile, "-") == 0 && strcmp(seqfile, "-") == 0) cmdline_failure(argv[0], "Either <hmmfile> or <seqfile> may be '-' (to read from stdin), but not both.\n"); msaopts |= p7_ALL_CONSENSUS_COLS; /* default as of 3.1 */ if (esl_opt_GetBoolean(go, "--trim")) msaopts |= p7_TRIM; /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--informat")) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Open output stream */ if ( (outfile = esl_opt_GetString(go, "-o")) != NULL) { if ((ofp = fopen(outfile, "w")) == NULL) cmdline_failure(argv[0], "failed to open -o output file %s for writing\n", outfile); } /* If caller forced an alphabet on us, create the one the caller wants */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* Read one HMM, and make sure there's only one. */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", hfp->fname); status = p7_hmmfile_Read(hfp, &abc, NULL); if (status != eslEOF) p7_Fail("HMM file %s does not contain just one HMM\n", hfp->fname); p7_hmmfile_Close(hfp); /* We're going to build up two arrays: sequences and traces. * If --mapali option is chosen, the first set of sequences/traces is from the provided alignment */ if ( (mapfile = esl_opt_GetString(go, "--mapali")) != NULL) { map_alignment(mapfile, hmm, &sq, &tr, &mapseq); } totseq = mapseq; /* Read digital sequences into an array (possibly concat'ed onto mapped seqs) */ status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", seqfile); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, seqfile); ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq + 1)); sq[totseq] = esl_sq_CreateDigital(abc); nseq = 0; while ((status = esl_sqio_Read(sqfp, sq[totseq+nseq])) == eslOK) { nseq++; ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq+nseq+1)); sq[totseq+nseq] = esl_sq_CreateDigital(abc); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); totseq += nseq; /* Remaining initializations, including trace array allocation */ ESL_RALLOC(tr, p, sizeof(P7_TRACE *) * totseq); for (idx = mapseq; idx < totseq; idx++) tr[idx] = p7_trace_CreateWithPP(); p7_tracealign_computeTraces(hmm, sq, mapseq, totseq - mapseq, tr); p7_tracealign_Seqs(sq, tr, totseq, hmm->M, msaopts, hmm, &msa); eslx_msafile_Write(ofp, msa, outfmt); for (idx = 0; idx <= totseq; idx++) esl_sq_Destroy(sq[idx]); /* including sq[nseq] because we overallocated */ for (idx = 0; idx < totseq; idx++) p7_trace_Destroy(tr[idx]); free(sq); free(tr); esl_msa_Destroy(msa); p7_hmm_Destroy(hmm); if (ofp != stdout) fclose(ofp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return eslOK; ERROR: return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); float nu = esl_opt_GetReal(go, "--nu"); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; int format = eslSQFILE_UNKNOWN; float sc, nullsc, seqscore, P; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Open sequence file */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* Allocate matrix */ fwd = p7_gmx_Create(gm->M, sq->n); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_gmx_GrowTo(fwd, gm->M, sq->n); /* Run MSV */ p7_GMSV(sq->dsq, sq->n, gm, fwd, nu, &sc); /* Calculate bit score and P-value using standard null1 model*/ p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); seqscore = (sc - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(seqscore, gm->evparam[p7_MMU], gm->evparam[p7_MLAMBDA]); /* output suitable for direct use in profmark benchmark postprocessors: * <Pvalue> <bitscore> <target name> <query name> */ printf("%g\t%.2f\t%s\t%s\n", P, seqscore, sq->name, hmm->name); esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Cleanup */ esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_gmx_Destroy(fwd); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* multifetch: * given a file containing lines with one name or key per line; * parse the file line-by-line; * if we have an SSI index available, retrieve the seqs by key * as we see each line; * else, without an SSI index, store the keys in a hash, then * read the entire seq file in a single pass, outputting seqs * that are in our keylist. * * Note that with an SSI index, you get the seqs in the order they * appear in the <keyfile>, but without an SSI index, you get seqs in * the order they occur in the seq file. */ static void multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp) { ESL_KEYHASH *keys = esl_keyhash_Create(); ESL_FILEPARSER *efp = NULL; int nseq = 0; int nkeys = 0; char *key; int keylen; int keyidx; int status; if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("Failed to open key file %s\n", keyfile); esl_fileparser_SetCommentChar(efp, '#'); while (esl_fileparser_NextLine(efp) == eslOK) { if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK) esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile); status = esl_keyhash_Store(keys, key, keylen, &keyidx); if (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile); /* if we have an SSI index, just fetch them as we go. */ if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp); nseq++; } nkeys++; } /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */ if (sqfp->data.ascii.ssi == NULL) { ESL_SQ *sq = esl_sq_Create(); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) || (sq->acc[0] != '\0' && esl_keyhash_Lookup(keys, sq->acc, -1, NULL) == eslOK)) { if (esl_opt_GetBoolean(go, "-r") ) if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s\n", sq->name); esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE); nseq++; } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sq_Destroy(sq); } if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq); if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq); esl_keyhash_Destroy(keys); esl_fileparser_Close(efp); return; }
/* Create an SSI index file for open sequence file <sqfp>. * Both name and accession of sequences are stored as keys. */ static void create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp) { ESL_NEWSSI *ns = NULL; ESL_SQ *sq = esl_sq_Create(); int nseq = 0; char *ssifile = NULL; uint16_t fh; int status; esl_strdup(sqfp->filename, -1, &ssifile); esl_strcat(&ssifile, -1, ".ssi", 4); status = esl_newssi_Open(ssifile, TRUE, &ns); /* TRUE is for allowing overwrite. */ if (status == eslENOTFOUND) esl_fatal("failed to open SSI index %s", ssifile); else if (status == eslEOVERWRITE) esl_fatal("SSI index %s already exists; delete or rename it", ssifile); /* won't happen, see TRUE above... */ else if (status != eslOK) esl_fatal("failed to create a new SSI index"); if (esl_newssi_AddFile(ns, sqfp->filename, sqfp->format, &fh) != eslOK) esl_fatal("Failed to add sequence file %s to new SSI index\n", sqfp->filename); printf("Creating SSI index for %s... ", sqfp->filename); fflush(stdout); while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK) { nseq++; if (sq->name == NULL) esl_fatal("Every sequence must have a name to be indexed. Failed to find name of seq #%d\n", nseq); if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK) esl_fatal("Failed to add key %s to SSI index", sq->name); if (sq->acc[0] != '\0') { if (esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK) esl_fatal("Failed to add secondary key %s to SSI index", sq->acc); } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Determine if the file was suitable for fast subseq lookup. */ if (sqfp->data.ascii.bpl > 0 && sqfp->data.ascii.rpl > 0) { if ((status = esl_newssi_SetSubseq(ns, fh, sqfp->data.ascii.bpl, sqfp->data.ascii.rpl)) != eslOK) esl_fatal("Failed to set %s for fast subseq lookup."); } /* Save the SSI file to disk */ if (esl_newssi_Write(ns) != eslOK) esl_fatal("Failed to write keys to ssi file %s\n", ssifile); /* Done - output and exit. */ printf("done.\n"); if (ns->nsecondary > 0) printf("Indexed %d sequences (%ld names and %ld accessions).\n", nseq, (long) ns->nprimary, (long) ns->nsecondary); else printf("Indexed %d sequences (%ld names).\n", nseq, (long) ns->nprimary); printf("SSI index written to file %s\n", ssifile); free(ssifile); esl_sq_Destroy(sq); esl_newssi_Close(ns); return; }