/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmdb> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* output file for results (default stdout) */ FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */ FILE *dfamtblfp = NULL; /* output stream for tabular Dfam format (--dfamtblout) */ FILE *aliscoresfp = NULL; /* output stream for alignment scores (--aliscoresout) */ // P7_HMM *hmm = NULL; /* one HMM query */ // P7_SCOREDATA *scoredata = NULL; int seqfmt = eslSQFILE_UNKNOWN; /* format of seqfile */ ESL_SQFILE *sqfp = NULL; /* open seqfile */ P7_HMMFILE *hfp = NULL; /* open HMM database file */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ P7_OPROFILE *om = NULL; /* target profile */ ESL_STOPWATCH *w = NULL; /* timing */ ESL_SQ *qsq = NULL; /* query sequence */ int nquery = 0; int textw; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; #ifdef HMMER_THREADS P7_OM_BLOCK *block = NULL; ESL_THREADS *threadObj= NULL; ESL_WORK_QUEUE *queue = NULL; #endif char errbuf[eslERRBUFSIZE]; double window_beta = -1.0 ; int window_length = -1; if (esl_opt_IsUsed(go, "--w_beta")) { if ( ( window_beta = esl_opt_GetReal(go, "--w_beta") ) < 0 || window_beta > 1 ) esl_fatal("Invalid window-length beta value\n"); } if (esl_opt_IsUsed(go, "--w_length")) { if (( window_length = esl_opt_GetInteger(go, "--w_length")) < 4 ) esl_fatal("Invalid window length value\n"); } w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--qformat")) { seqfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat")); if (seqfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat")); } /* validate options if running as a daemon */ // if (esl_opt_IsOn(go, "--daemon")) { /* running as a daemon, the input format must be type daemon */ // if (seqfmt != eslSQFILE_UNKNOWN && seqfmt != eslSQFILE_DAEMON) // esl_fatal("Input format %s not supported. Must be daemon\n", esl_opt_GetString(go, "--qformat")); // seqfmt = eslSQFILE_DAEMON; // if (strcmp(cfg->seqfile, "-") != 0) esl_fatal("Query sequence file must be '-'\n"); // } /* Open the target profile database to get the sequence alphabet */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem, trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); if (! hfp->is_pressed) p7_Fail("Failed to open binary auxfiles for %s: use hmmpress first\n", hfp->fname); hstatus = p7_oprofile_ReadMSV(hfp, &abc, &om); if (hstatus == eslEFORMAT) p7_Fail("bad format, binary auxfiles, %s:\n%s", cfg->hmmfile, hfp->errbuf); else if (hstatus == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); else if (hstatus != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); p7_oprofile_Destroy(om); p7_hmmfile_Close(hfp); /* Open the query sequence database */ status = esl_sqfile_OpenDigital(abc, cfg->seqfile, seqfmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->seqfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->seqfile); if (sqfp->format > 100) // breaking the law! That range is reserved for msa, for aligned formats p7_Fail("%s contains a multiple sequence alignment; expect unaligned sequences, like FASTA\n", cfg->seqfile); qsq = esl_sq_CreateDigital(abc); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } if (esl_opt_IsOn(go, "--tblout")) { if ((tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL) esl_fatal("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); } if (esl_opt_IsOn(go, "--dfamtblout")) { if ((dfamtblfp = fopen(esl_opt_GetString(go, "--dfamtblout"),"w")) == NULL) esl_fatal("Failed to open tabular dfam output file %s for writing\n", esl_opt_GetString(go, "--dfamtblout")); } if (esl_opt_IsOn(go, "--aliscoresout")) { if ((aliscoresfp = fopen(esl_opt_GetString(go, "--aliscoresout"),"w")) == NULL) esl_fatal("Failed to open alignment scores output file %s for writing\n", esl_opt_GetString(go, "--aliscoresout")); } output_header(ofp, go, cfg->hmmfile, cfg->seqfile); #ifdef HMMER_THREADS /* initialize thread data */ if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&ncpus); if (ncpus > 0) { threadObj = esl_threads_Create(&pipeline_thread); queue = esl_workqueue_Create(ncpus * 2); } #endif infocnt = (ncpus == 0) ? 1 : ncpus; ESL_ALLOC(info, sizeof(*info) * infocnt); for (i = 0; i < infocnt; ++i) { info[i].bg = p7_bg_Create(abc); #ifdef HMMER_THREADS info[i].queue = queue; #endif } #ifdef HMMER_THREADS for (i = 0; i < ncpus * 2; ++i) { block = p7_oprofile_CreateBlock(BLOCK_SIZE); if (block == NULL) esl_fatal("Failed to allocate sequence block"); status = esl_workqueue_Init(queue, block); if (status != eslOK) esl_fatal("Failed to add block to work queue"); } #endif /* Outside loop: over each query sequence in <seqfile>. */ while ((sstatus = esl_sqio_Read(sqfp, qsq)) == eslOK) { if (sstatus == eslEMEM) p7_Fail("Memory allocation error reading sequence file\n", status); if (sstatus == eslEINCONCEIVABLE) p7_Fail("Unexpected error %d reading sequence file\n", status); // if (qsq->L > NHMMER_MAX_RESIDUE_COUNT) p7_Fail("Input sequence %s in file %s exceeds maximum length of %d bases.\n", qsq->name, cfg->seqfile, NHMMER_MAX_RESIDUE_COUNT); nquery++; esl_stopwatch_Start(w); /* Open the target profile database */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, NULL); if (status != eslOK) p7_Fail("Unexpected error %d in opening hmm file %s.\n", status, cfg->hmmfile); #ifdef HMMER_THREADS /* if we are threaded, create a lock to prevent multiple readers */ if (ncpus > 0) { status = p7_hmmfile_CreateLock(hfp); if (status != eslOK) p7_Fail("Unexpected error %d creating lock\n", status); } #endif if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->acc[0] != 0 && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->desc[0] != 0 && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].pli = p7_pipeline_Create(go, 100, 100, TRUE, p7_SCAN_MODELS); /* M_hint = 100, L_hint = 100 are just dummies for now */ info[i].pli->hfp = hfp; /* for two-stage input, pipeline needs <hfp> */ p7_pli_NewSeq(info[i].pli, qsq); info[i].qsq = qsq; if ( esl_opt_IsUsed(go, "--toponly") ) info[i].pli->strand = p7_STRAND_TOPONLY; else if ( esl_opt_IsUsed(go, "--bottomonly") ) info[i].pli->strand = p7_STRAND_BOTTOMONLY; else info[i].pli->strand = p7_STRAND_BOTH; #ifdef HMMER_THREADS if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]); #endif } #ifdef HMMER_THREADS if (ncpus > 0) hstatus = thread_loop(threadObj, queue, hfp); else hstatus = serial_loop(info, hfp); #else hstatus = serial_loop(info, hfp); #endif switch(hstatus) { case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing */ break; default: p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); } /* modify e-value to account for number of models */ for (i = 0; i < info->th->N ; i++) { info->th->unsrt[i].lnP += log((float)info->pli->nmodels); info->th->unsrt[i].dcl[0].lnP = info->th->unsrt[i].lnP; info->th->unsrt[i].sortkey = -1.0 * info->th->unsrt[i].lnP; } /* it's possible to have duplicates based on how viterbi ranges can overlap */ p7_tophits_SortByModelnameAndAlipos(info->th); p7_tophits_RemoveDuplicates(info->th, info->pli->use_bit_cutoffs); /* Print results */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); //tally up total number of hits and target coverage info->pli->n_output = info->pli->pos_output = 0; for (i = 0; i < info->th->N; i++) { if ( (info->th->hit[i]->flags & p7_IS_REPORTED) || info->th->hit[i]->flags & p7_IS_INCLUDED) { info->pli->n_output++; info->pli->pos_output += abs(info->th->hit[i]->dcl[0].jali - info->th->hit[i]->dcl[0].iali) + 1; } } p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1)); if (dfamtblfp) p7_tophits_TabularXfam(dfamtblfp, qsq->name, NULL, info->th, info->pli); if (aliscoresfp) p7_tophits_AliScores(aliscoresfp, qsq->name, info->th ); esl_stopwatch_Stop(w); info->pli->nseqs = 1; p7_pli_Statistics(ofp, info->pli, w); if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); fflush(ofp); p7_hmmfile_Close(hfp); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); esl_sq_Reuse(qsq); } if (sstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (sstatus != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", sstatus, sqfp->filename); /* Terminate outputs - any last words? */ if (tblfp) p7_tophits_TabularTail(tblfp, "hmmscan", p7_SCAN_MODELS, cfg->seqfile, cfg->hmmfile, go); if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Cleanup - prepare for successful exit */ for (i = 0; i < infocnt; ++i) p7_bg_Destroy(info[i].bg); #ifdef HMMER_THREADS if (ncpus > 0) { esl_workqueue_Reset(queue); while (esl_workqueue_Remove(queue, (void **) &block) == eslOK) p7_oprofile_DestroyBlock(block); esl_workqueue_Destroy(queue); esl_threads_Destroy(threadObj); } #endif free(info); esl_sq_Destroy(qsq); esl_stopwatch_Destroy(w); esl_alphabet_Destroy(abc); esl_sqfile_Close(sqfp); if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return eslOK; ERROR: if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; ESL_RANDOMNESS *rng = esl_randomness_CreateFast(0); P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx = NULL; P7_OMX *fwd = NULL; P7_TRACE *tr = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; int N = esl_opt_GetInteger(go, "-N"); int i; float vsc, fsc, tsc; char errbuf[eslERRBUFSIZE]; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); /* create default null model, then create and optimize profile */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); if (esl_opt_GetBoolean(go, "-p")) p7_oprofile_Dump(stdout, om); fwd = p7_omx_Create(gm->M, sq->n, sq->n); gx = p7_gmx_Create(gm->M, sq->n); tr = p7_trace_Create(); if (esl_opt_GetBoolean(go, "-m") == TRUE) p7_omx_SetDumpMode(stdout, fwd, TRUE); p7_GViterbi(sq->dsq, sq->n, gm, gx, &vsc); p7_Forward (sq->dsq, sq->n, om, fwd, &fsc); for (i = 0; i < N; i++) { p7_StochasticTrace(rng, sq->dsq, sq->n, om, fwd, tr); p7_trace_Score(tr, sq->dsq, gm, &tsc); if (esl_opt_GetBoolean(go, "-t") == TRUE) p7_trace_Dump(stdout, tr, gm, sq->dsq); if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) p7_Die("trace %d fails validation:\n%s\n", i, errbuf); printf("Sampled trace: %.4f nats\n", tsc); p7_trace_Reuse(tr); } printf("Forward score: %.4f nats\n", fsc); printf("Viterbi score: %.4f nats\n", vsc); /* cleanup */ esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); p7_trace_Destroy(tr); p7_omx_Destroy(fwd); p7_gmx_Destroy(gx); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_randomness_Destroy(rng); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* seq_shuffling() * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg] * * Shuffling of input sequences. * * Fixed-length (L>0) vs. full-length (L=0) modes handled differently. * In fixed-length mode: * <shuff->seq> only needs to be allocated once, for L * <targ> is an allocated copy of a random subseq of length L * sequences < L residues long can't be shuffled * In full-length mode: * <shuff->seq> is grown to length <sq->n> for each input seq * <targ> just points to <sq->seq> */ static int seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt) { char *seqfile = esl_opt_GetArg(go, 1); int infmt = eslSQFILE_UNKNOWN; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = esl_sq_Create(); ESL_SQ *shuff = esl_sq_Create(); char *targ = NULL; int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */ int kmers = 0; int i; int status; if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k"); status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); if (L>0) { esl_sq_GrowTo(shuff, L); shuff->n = L; ESL_ALLOC(targ, sizeof(char) * (L+1)); } while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { if (L == 0) { /* shuffling entire sequence */ esl_sq_GrowTo(shuff, sq->n); /* make sure shuff can hold sq */ shuff->n = sq->n; targ = sq->seq; } else { if (sq->n < L) continue; /* reject seqs < L long */ } for (i = 0; i < N; i++) { if (L > 0) { /* fixed-len mode: copy a random subseq */ int pos = esl_rnd_Roll(r, sq->n - L + 1); strncpy(targ, sq->seq + pos, L); targ[L] = '\0'; } /* Do the requested kind of shuffling */ if (esl_opt_GetBoolean(go, "-m")) esl_rsq_CShuffle (r, targ, shuff->seq); /* monoresidue shuffling */ else if (esl_opt_GetBoolean(go, "-d")) esl_rsq_CShuffleDP (r, targ, shuff->seq); /* diresidue shuffling */ else if (esl_opt_IsOn (go, "-k")) esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq); /* diresidue shuffling */ else if (esl_opt_GetBoolean(go, "-0")) esl_rsq_CMarkov0 (r, targ, shuff->seq); /* 0th order Markov */ else if (esl_opt_GetBoolean(go, "-1")) esl_rsq_CMarkov1 (r, targ, shuff->seq); /* 1st order Markov */ else if (esl_opt_GetBoolean(go, "-r")) esl_rsq_CReverse ( targ, shuff->seq); /* reverse */ else if (esl_opt_IsOn (go, "-w")) { /* regionally shuffle */ int W= esl_opt_GetInteger(go, "-w"); esl_rsq_CShuffleWindows(r, targ, W, shuff->seq); } /* Set the name of the shuffled sequence */ if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i); else esl_sq_FormatName(shuff, "%s-shuffled", sq->name); /* Output the resulting sequence */ esl_sqio_Write(ofp, shuff, outfmt, FALSE); /* don't need to reuse the shuffled sequence: we will use exactly the same memory */ } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); if (L>0) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return eslOK; ERROR: if (targ != NULL) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_GMX *bck = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float fsc, bsc; float nullsc; int status; /* Initialize log-sum calculator */ p7_FLogsumInit(); /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); /* Now reconfig the models however we were asked to */ if (esl_opt_GetBoolean(go, "--fs")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); else if (esl_opt_GetBoolean(go, "--sw")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL); else if (esl_opt_GetBoolean(go, "--ls")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_GLOCAL); else if (esl_opt_GetBoolean(go, "--s")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNIGLOCAL); /* Allocate matrices */ fwd = p7_gmx_Create(gm->M, sq->n); bck = p7_gmx_Create(gm->M, sq->n); printf("%-30s %-10s %-10s %-10s %-10s\n", "# seq name", "fwd (raw)", "bck (raw) ", "fwd (bits)", "bck (bits)"); printf("%-30s %10s %10s %10s %10s\n", "#--------------", "----------", "----------", "----------", "----------"); while ( (status = esl_sqio_Read(sqfp, sq)) != eslEOF) { if (status == eslEFORMAT) p7_Fail("Parse failed (sequence file %s)\n%s\n", sqfp->filename, sqfp->get_error(sqfp)); else if (status != eslOK) p7_Fail("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Resize the DP matrices if necessary */ p7_gmx_GrowTo(fwd, gm->M, sq->n); p7_gmx_GrowTo(bck, gm->M, sq->n); /* Set the profile and null model's target length models */ p7_bg_SetLength(bg, sq->n); p7_ReconfigLength(gm, sq->n); /* Run Forward, Backward */ p7_GForward (sq->dsq, sq->n, gm, fwd, &fsc); p7_GBackward(sq->dsq, sq->n, gm, bck, &bsc); p7_gmx_Dump(stdout, fwd, p7_DEFAULT); /* Those scores are partial log-odds likelihoods in nats. * Subtract off the rest of the null model, convert to bits. */ p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc); printf("%-30s %10.4f %10.4f %10.4f %10.4f\n", sq->name, fsc, bsc, (fsc - nullsc) / eslCONST_LOG2, (bsc - nullsc) / eslCONST_LOG2); p7_gmx_Reuse(fwd); p7_gmx_Reuse(bck); esl_sq_Reuse(sq); } /* Cleanup */ esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); p7_gmx_Destroy(fwd); p7_gmx_Destroy(bck); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; ESL_SQ *sq = NULL; float nullsc, filtersc, H; int status; /* Read one HMM from <hmmfile> */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Open <seqfile> for reading */ status = esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file."); else if (status == eslEFORMAT) esl_fatal("Format unrecognized."); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); sq = esl_sq_CreateDigital(abc); bg = p7_bg_Create(abc); p7_bg_SetFilterByHMM(bg, hmm); H = esl_vec_FEntropy(bg->f, bg->abc->K); printf("bg iid H = %.4f\n", H); H = esl_vec_FEntropy(bg->mcomp, bg->abc->K); printf("modelcomp H = %.4f\n", H); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_bg_SetLength(bg, sq->n); p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); p7_bg_FilterScore(bg, sq->dsq, sq->n, &filtersc); printf("%-20s %5d %8.5f %8.5f %8.5f\n", sq->name, (int) sq->n, nullsc, filtersc, filtersc-nullsc); esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", sqfp->filename, sqfp->linenumber, sqfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_GMX *gx = NULL; P7_OMX *fwd = NULL; P7_OMX *bck = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float fraw, braw, nullsc, fsc; float gfraw, gbraw, gfsc; double P, gP; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); /* Open sequence file for reading */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* create default null model, then create and optimize profile */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); /* p7_oprofile_Dump(stdout, om); */ /* allocate DP matrices for O(M+L) parsers */ fwd = p7_omx_Create(gm->M, 0, sq->n); bck = p7_omx_Create(gm->M, 0, sq->n); gx = p7_gmx_Create(gm->M, sq->n); /* allocate DP matrices for O(ML) fills */ /* fwd = p7_omx_Create(gm->M, sq->n, sq->n); */ /* bck = p7_omx_Create(gm->M, sq->n, sq->n); */ /* p7_omx_SetDumpMode(stdout, fwd, TRUE); */ /* makes the fast DP algorithms dump their matrices */ /* p7_omx_SetDumpMode(stdout, bck, TRUE); */ while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_oprofile_ReconfigLength(om, sq->n); p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_omx_GrowTo(fwd, om->M, 0, sq->n); p7_omx_GrowTo(bck, om->M, 0, sq->n); p7_gmx_GrowTo(gx, gm->M, sq->n); p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); p7_ForwardParser (sq->dsq, sq->n, om, fwd, &fraw); p7_BackwardParser(sq->dsq, sq->n, om, fwd, bck, &braw); /* p7_Forward (sq->dsq, sq->n, om, fwd, &fsc); printf("forward: %.2f nats\n", fsc); */ /* p7_Backward(sq->dsq, sq->n, om, fwd, bck, &bsc); printf("backward: %.2f nats\n", bsc); */ /* Comparison to other F/B implementations */ p7_GForward (sq->dsq, sq->n, gm, gx, &gfraw); p7_GBackward (sq->dsq, sq->n, gm, gx, &gbraw); /* p7_gmx_Dump(stdout, gx); */ fsc = (fraw-nullsc) / eslCONST_LOG2; gfsc = (gfraw-nullsc) / eslCONST_LOG2; P = esl_exp_surv(fsc, om->evparam[p7_FTAU], om->evparam[p7_FLAMBDA]); gP = esl_exp_surv(gfsc, gm->evparam[p7_FTAU], gm->evparam[p7_FLAMBDA]); if (esl_opt_GetBoolean(go, "-1")) { printf("%-30s\t%-20s\t%9.2g\t%6.1f\t%9.2g\t%6.1f\n", sq->name, hmm->name, P, fsc, gP, gfsc); } else if (esl_opt_GetBoolean(go, "-P")) { /* output suitable for direct use in profmark benchmark postprocessors: */ printf("%g\t%.2f\t%s\t%s\n", P, fsc, sq->name, hmm->name); } else { printf("target sequence: %s\n", sq->name); printf("fwd filter raw score: %.2f nats\n", fraw); printf("bck filter raw score: %.2f nats\n", braw); printf("null score: %.2f nats\n", nullsc); printf("per-seq score: %.2f bits\n", fsc); printf("P-value: %g\n", P); printf("GForward raw score: %.2f nats\n", gfraw); printf("GBackward raw score: %.2f nats\n", gbraw); printf("GForward seq score: %.2f bits\n", gfsc); printf("GForward P-value: %g\n", gP); } esl_sq_Reuse(sq); } /* cleanup */ esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); p7_omx_Destroy(bck); p7_omx_Destroy(fwd); p7_gmx_Destroy(gx); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; int format = eslSQFILE_UNKNOWN; char errbuf[eslERRBUFSIZE]; float sc; int d; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* Allocate matrix and a trace */ fwd = p7_gmx_Create(gm->M, sq->n); tr = p7_trace_Create(); /* Run Viterbi; do traceback */ p7_GViterbi (sq->dsq, sq->n, gm, fwd, &sc); p7_GTrace (sq->dsq, sq->n, gm, fwd, tr); /* Dump and validate the trace. */ p7_trace_Dump(stdout, tr, gm, sq->dsq); if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) p7_Die("trace fails validation:\n%s\n", errbuf); /* Domain info in the trace. */ p7_trace_Index(tr); printf("# Viterbi: %d domains : ", tr->ndom); for (d = 0; d < tr->ndom; d++) printf("%6d %6d %6d %6d ", tr->sqfrom[d], tr->sqto[d], tr->hmmfrom[d], tr->hmmto[d]); printf("\n"); /* Cleanup */ p7_trace_Destroy(tr); p7_gmx_Destroy(fwd); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_sq_Destroy(sq); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *hmmfile = NULL; /* HMM file name */ char *seqfile = NULL; /* sequence file name */ char *mapfile = NULL; /* optional mapped MSA file name */ int infmt = eslSQFILE_UNKNOWN; int outfmt = eslMSAFILE_STOCKHOLM; P7_HMMFILE *hfp = NULL; /* open HMM file */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ char *outfile = NULL; /* output filename */ FILE *ofp = stdout; /* output stream */ ESL_SQ **sq = NULL; /* array of sequences */ void *p = NULL; /* tmp ptr for reallocation */ int nseq = 0; /* # of sequences in <seqfile> */ int mapseq = 0; /* # of sequences in mapped MSA */ int totseq = 0; /* # of seqs in all sources */ ESL_ALPHABET *abc = NULL; /* alphabet (set from the HMM file)*/ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; /* array of tracebacks */ ESL_MSA *msa = NULL; /* resulting multiple alignment */ int msaopts = 0; /* flags to p7_tracealign_Seqs() */ int idx; /* counter over seqs, traces */ int status; /* easel/hmmer return code */ char errbuf[eslERRBUFSIZE]; /* Parse the command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); seqfile = esl_opt_GetArg(go, 2); if (strcmp(hmmfile, "-") == 0 && strcmp(seqfile, "-") == 0) cmdline_failure(argv[0], "Either <hmmfile> or <seqfile> may be '-' (to read from stdin), but not both.\n"); msaopts |= p7_ALL_CONSENSUS_COLS; /* default as of 3.1 */ if (esl_opt_GetBoolean(go, "--trim")) msaopts |= p7_TRIM; /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--informat")) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Open output stream */ if ( (outfile = esl_opt_GetString(go, "-o")) != NULL) { if ((ofp = fopen(outfile, "w")) == NULL) cmdline_failure(argv[0], "failed to open -o output file %s for writing\n", outfile); } /* If caller forced an alphabet on us, create the one the caller wants */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* Read one HMM, and make sure there's only one. */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", hfp->fname); status = p7_hmmfile_Read(hfp, &abc, NULL); if (status != eslEOF) p7_Fail("HMM file %s does not contain just one HMM\n", hfp->fname); p7_hmmfile_Close(hfp); /* We're going to build up two arrays: sequences and traces. * If --mapali option is chosen, the first set of sequences/traces is from the provided alignment */ if ( (mapfile = esl_opt_GetString(go, "--mapali")) != NULL) { map_alignment(mapfile, hmm, &sq, &tr, &mapseq); } totseq = mapseq; /* Read digital sequences into an array (possibly concat'ed onto mapped seqs) */ status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", seqfile); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, seqfile); ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq + 1)); sq[totseq] = esl_sq_CreateDigital(abc); nseq = 0; while ((status = esl_sqio_Read(sqfp, sq[totseq+nseq])) == eslOK) { nseq++; ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq+nseq+1)); sq[totseq+nseq] = esl_sq_CreateDigital(abc); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); totseq += nseq; /* Remaining initializations, including trace array allocation */ ESL_RALLOC(tr, p, sizeof(P7_TRACE *) * totseq); for (idx = mapseq; idx < totseq; idx++) tr[idx] = p7_trace_CreateWithPP(); p7_tracealign_computeTraces(hmm, sq, mapseq, totseq - mapseq, tr); p7_tracealign_Seqs(sq, tr, totseq, hmm->M, msaopts, hmm, &msa); eslx_msafile_Write(ofp, msa, outfmt); for (idx = 0; idx <= totseq; idx++) esl_sq_Destroy(sq[idx]); /* including sq[nseq] because we overallocated */ for (idx = 0; idx < totseq; idx++) p7_trace_Destroy(tr[idx]); free(sq); free(tr); esl_msa_Destroy(msa); p7_hmm_Destroy(hmm); if (ofp != stdout) fclose(ofp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return eslOK; ERROR: return status; }
static int serial_loop(WORKER_INFO *info, ESL_SQFILE *dbfp, int n_targetseqs, FILE *ofp) { int sstatus; ESL_SQ *dbsq = NULL; /* one target sequence (digital) */ int seq_cnt = 0; dbsq = esl_sq_CreateDigital(info->om->abc); SEQ_INFO *sqinfo = (SEQ_INFO *)malloc(INFO_SIZE * sizeof(SEQ_INFO)); size_t capacity = INFO_SIZE; FILE *ftmp = fopen("tmp.txt", "w+"); /* Main loop: */ while ( (n_targetseqs==-1 || seq_cnt<n_targetseqs) && (sstatus = esl_sqio_Read(dbfp, dbsq)) == eslOK) { // p7_pli_NewSeq(info->pli, dbsq); // p7_bg_SetLength(info->bg, dbsq->n); // p7_oprofile_ReconfigLength(info->om, dbsq->n); // p7_Pipeline(info->pli, info->om, info->bg, dbsq, info->th); if (seq_cnt > capacity) { capacity = capacity << 1; SEQ_INFO * newsq = (SEQ_INFO * )realloc(sqinfo, capacity * sizeof(SEQ_INFO)); if (newsq) sqinfo = newsq; else { printf("Mem Error\n"); exit(1); } } sqinfo[seq_cnt].index = seq_cnt; sqinfo[seq_cnt].length = dbsq->n; esl_sq_Textize(dbsq); // if (dbsq->seq) printf("Text -- %s\n", dbsq->seq); // if (dbsq->name) printf("Name -- %s\n", dbsq->name); // if (dbsq->ss) printf("%s\n", dbsq->ss); // if (dbsq->desc) printf("Desc -- %s\n", dbsq->desc); // printf("%s\n", dbsq->desc); // printf("Pos1 -- %d ", pos); sqinfo[seq_cnt].pos = ftell(ftmp); fprintf(ftmp, ">%s %s\n%s\n", dbsq->name, dbsq->desc, dbsq->seq); // printf(" %d\n", pos); sqinfo[seq_cnt].size = ftell(ftmp) - sqinfo[seq_cnt].pos; //strlen(dbsq->name) + strlen(dbsq->desc) + strlen(dbsq->seq) + 4; seq_cnt++; esl_sq_Reuse(dbsq); // p7_pipeline_Reuse(info->pli); } qsort(sqinfo, seq_cnt, sizeof(SEQ_INFO), seqSortByLength); // esl_sqfile_Position(dbfp, 0); char *buf = (char *)malloc(sqinfo[0].size + 1); buf[sqinfo[0].size] = '\0'; for (int i = 0; i < seq_cnt; ++i) { // printf("index: %d length: %d \n", sqinfo[i].index, sqinfo[i].length); // int ret = esl_sqfile_Position(dbfp, i);// sqinfo[i].index); // printf("return: %d \n", ret); fseek(ftmp, sqinfo[i].pos, SEEK_SET); fread(buf, 1, sqinfo[i].size, ftmp); if ( i == 0 ) printf("II %s", buf); fwrite(buf, 1, sqinfo[i].size, ofp); int ret = esl_sqio_Read(dbfp, dbsq); // printf("2return: %d \n", ret); // printf("i: %d length: %d \n", i, dbsq->n); esl_sq_Reuse(dbsq); } if (n_targetseqs!=-1 && seq_cnt==n_targetseqs) sstatus = eslEOF; free(sqinfo); fclose(ftmp); esl_sq_Destroy(dbsq); return sstatus; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *qfile = esl_opt_GetArg(go, 1); char *tfile = esl_opt_GetArg(go, 2); ESL_SQFILE *qfp = NULL; ESL_SQFILE *tfp = NULL; ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQ *tsq = esl_sq_CreateDigital(abc); ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_REFMX *vit = p7_refmx_Create(200, 400); /* will grow as needed */ double *fa = malloc(sizeof(double) * abc->K); double popen = 0.02; double pextend = 0.4; double lambda; float vsc; float nullsc; int status; esl_composition_BL62(fa); esl_vec_D2F(fa, abc->K, bg->f); esl_scorematrix_Set("BLOSUM62", S); esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); esl_scorematrix_JointToConditionalOnQuery(abc, Q); if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile); if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq"); p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); p7_hmm_SetComposition(hmm); p7_hmm_SetConsensus(hmm, qsq); gm = p7_profile_Create(hmm->M, abc); p7_profile_ConfigUnilocal(gm, hmm, bg, 400); if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile); while ((status = esl_sqio_Read(tfp, tsq)) == eslOK) { p7_bg_SetLength (bg, tsq->n); p7_profile_SetLength(gm, tsq->n); p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc); p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc); printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name); esl_sq_Reuse(tsq); p7_refmx_Reuse(vit); } p7_refmx_Destroy(vit); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); p7_bg_Destroy(bg); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); esl_sq_Destroy(qsq); esl_sq_Destroy(tsq); esl_sqfile_Close(qfp); esl_sqfile_Close(tfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); float nu = esl_opt_GetReal(go, "--nu"); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; int format = eslSQFILE_UNKNOWN; float sc, nullsc, seqscore, P; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Open sequence file */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* Allocate matrix */ fwd = p7_gmx_Create(gm->M, sq->n); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_gmx_GrowTo(fwd, gm->M, sq->n); /* Run MSV */ p7_GMSV(sq->dsq, sq->n, gm, fwd, nu, &sc); /* Calculate bit score and P-value using standard null1 model*/ p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); seqscore = (sc - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(seqscore, gm->evparam[p7_MMU], gm->evparam[p7_MLAMBDA]); /* output suitable for direct use in profmark benchmark postprocessors: * <Pvalue> <bitscore> <target name> <query name> */ printf("%g\t%.2f\t%s\t%s\n", P, seqscore, sq->name, hmm->name); esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Cleanup */ esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_gmx_Destroy(fwd); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int p7_seqcache_Open(char *seqfile, P7_SEQCACHE **ret_cache, char *errbuf) { int i; int inx; int val; int status; int32_t seq_cnt; int32_t db_cnt; int32_t db_inx[32]; uint32_t db_key; uint64_t res_cnt; uint64_t res_size; uint64_t hdr_size; char *hdr_ptr; char *res_ptr; char *desc_ptr; char *ptr; char buffer[512]; off_t offset; uint64_t total_mem; SEQ_DB *db = NULL; P7_SEQCACHE *cache = NULL; ESL_RANDOMNESS *rnd = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = NULL; ESL_ALPHABET *abc = NULL; ESL_SQASCII_DATA *ascii = NULL; if (errbuf) errbuf[0] = '\0'; /* CURRENTLY UNUSED. FIXME */ /* Open the target sequence database */ if ((status = esl_sqfile_Open(seqfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) return status; /* This is a bit of a hack. The first line contains database information. * * #<res_count> <seq_count> <db_count> <db_sequences_1> <db_sequences_before_removing_duplicates_1> <db_sequences_2> <db_sequences_before_removing_duplicates_2> ... <date_stamp> * * The rest of the file is a fasta format. The fasta header is just * sequence number followed by a binary number indicating which * database this sequence occurs in. * * The header line will be read in, parsed and saved. Then the * parser will be repositioned after the line and used normally. */ ascii = &sqfp->data.ascii; fseek(ascii->fp, 0L, SEEK_SET); if (fgets(buffer, sizeof(buffer), ascii->fp) == NULL) return eslEFORMAT; if (buffer[0] != '#') return eslEFORMAT; ptr = buffer + 1; res_cnt = strtoll(ptr, &ptr, 10); seq_cnt = strtol(ptr, &ptr, 10); db_cnt = strtol(ptr, &ptr, 10); if (db_cnt > (sizeof(db_inx)/sizeof(db_inx[0]))) return eslEFORMAT; total_mem = sizeof(P7_SEQCACHE); ESL_ALLOC(cache, sizeof(P7_SEQCACHE)); memset(cache, 0, sizeof(P7_SEQCACHE)); if (esl_strdup(seqfile, -1, &cache->name) != eslOK) goto ERROR; total_mem += (sizeof(HMMER_SEQ) * seq_cnt); ESL_ALLOC(cache->list, sizeof(HMMER_SEQ) * seq_cnt); memset(cache->list, 0, sizeof(HMMER_SEQ) * seq_cnt); total_mem += (sizeof(SEQ_DB) * db_cnt); ESL_ALLOC(db, sizeof(SEQ_DB) * db_cnt); for (i = 0; i < db_cnt; ++i) { db[i].count = strtol(ptr, &ptr, 10); db[i].K = strtol(ptr, &ptr, 10); total_mem += (sizeof(HMMER_SEQ *) * db[i].count); ESL_ALLOC(db[i].list, sizeof(HMMER_SEQ *) * db[i].count); memset(db[i].list, 0, sizeof(HMMER_SEQ *) * db[i].count); } /* grab the unique identifier */ while (*ptr && isspace(*ptr)) ++ptr; i = strlen(ptr); ESL_ALLOC(cache->id, i+1); strcpy(cache->id, ptr); while (--i > 0 && isspace(cache->id[i])) cache->id[i] = 0; res_size = res_cnt + seq_cnt + 1; hdr_size = seq_cnt * 10; total_mem += res_size + hdr_size; ESL_ALLOC(cache->residue_mem, res_size); ESL_ALLOC(cache->header_mem, hdr_size); /* position the sequence file to the start of the first sequence. * this will force any buffers associated with the file to be reset. */ offset = ftell(ascii->fp); if ((status = esl_sqfile_Position(sqfp, offset)) != eslOK) goto ERROR; abc = esl_alphabet_Create(eslAMINO); sq = esl_sq_CreateDigital(abc); cache->db_cnt = db_cnt; cache->db = db; cache->abc = abc; cache->res_size = res_size; cache->hdr_size = hdr_size; cache->count = seq_cnt; hdr_ptr = cache->header_mem; res_ptr = cache->residue_mem; for (i = 0; i < db_cnt; ++i) db_inx[i] = 0; strcpy(buffer, "000000001"); inx = 0; while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { /* sanity checks */ if (inx >= seq_cnt) { printf("inx: %d\n", inx); return eslEFORMAT; } if (sq->n + 1 > res_size) { printf("inx: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; } if (hdr_size <= 0) { printf("inx: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; } /* generate the database key - modified to take the first word in the desc line. * The remaining part of the desc is then cached as the description. */ ptr = sq->desc;; desc_ptr = strchr(sq->desc, ' '); if(desc_ptr != NULL) { *desc_ptr= '\0'; ++desc_ptr; } val = 1; db_key = 0; while (*ptr) { if (*ptr == '1') db_key += val; val <<= 1; ++ptr; } if (db_key >= (1 << (db_cnt + 1))) { printf("inx: %d db %d %s\n", inx, db_key, sq->desc); return eslEFORMAT; } cache->list[inx].name = hdr_ptr; cache->list[inx].dsq = (ESL_DSQ *)res_ptr; cache->list[inx].n = sq->n; cache->list[inx].idx = inx; cache->list[inx].db_key = db_key; if(desc_ptr != NULL) esl_strdup(desc_ptr, -1, &(cache->list[inx].desc)); /* copy the digitized sequence */ memcpy(res_ptr, sq->dsq, sq->n + 1); res_ptr += (sq->n + 1); res_size -= (sq->n + 1); /* copy the index to the header */ strcpy(hdr_ptr, buffer); hdr_ptr += 10; hdr_size -= 10; /* increment the buffer string */ ++buffer[8]; for (i = 8; i > 0; --i) { if (buffer[i] > '9') { buffer[i] = '0'; buffer[i-1]++; } } esl_sq_Reuse(sq); ++inx; } if (status != eslEOF) { printf("Unexpected error %d at %d\n", status, inx); return status; } if (inx != seq_cnt) { printf("inx:: %d %d\n", inx, seq_cnt); return eslEFORMAT; } if (hdr_size != 0) { printf("inx:: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; } if (res_size != 1) { printf("inx:: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; } /* copy the final sentinel character */ *res_ptr++ = eslDSQ_SENTINEL; --res_size; /* sort the order of the database sequences */ rnd = esl_randomness_CreateFast(seq_cnt); for (i = 0 ; i < seq_cnt; ++i) { rnd->x = rnd->x * 69069 + 1; cache->list[i].idx = rnd->x; } esl_randomness_Destroy(rnd); qsort(cache->list, seq_cnt, sizeof(HMMER_SEQ), sort_seq); /* fill in the different databases and fix the index */ for (i = 0 ; i < seq_cnt; ++i) { inx = 0; db_key = cache->list[i].db_key; while (db_key) { if (db_key & 1) { SEQ_DB *db = cache->db + inx; if (db_inx[inx] >= db->count) { printf("sort:: %d %d\n", db_inx[inx], db->count); return eslEFORMAT; } db->list[db_inx[inx]] = &cache->list[i]; ++db_inx[inx]; } db_key >>= 1; ++inx; } cache->list[i].idx = (cache->list[i].name - cache->header_mem) / 10 + 1; } for (i = 0; i < cache->db_cnt; ++i) { printf("sequence database (%d):: %d %d\n", i, cache->db[i].count, db_inx[i]); } printf("\nLoaded sequence db file %s; total memory %" PRId64 "\n", seqfile, total_mem); esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); *ret_cache = cache; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (abc != NULL) esl_alphabet_Destroy(abc); if (cache != NULL) { if (cache->header_mem != NULL) free(cache->header_mem); if (cache->residue_mem != NULL) free(cache->residue_mem); if (cache->name != NULL) free(cache->name); if (cache->id != NULL) free(cache->id); free(cache); } for (i = 0; i < db_cnt; ++i) { if (db[i].list != NULL) free(db[i].list); } return eslEMEM; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; P7_GMX *bck = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float fsc, bsc; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL); /* Allocate matrices */ fwd = p7_gmx_Create(gm->M, sq->n); bck = p7_gmx_Create(gm->M, sq->n); /* Run Forward, Backward */ p7_GForward (sq->dsq, sq->n, gm, fwd, &fsc); p7_GBackward(sq->dsq, sq->n, gm, bck, &bsc); printf("fwd = %.4f nats\n", fsc); printf("bck = %.4f nats\n", bsc); /* Cleanup */ esl_sq_Destroy(sq); p7_gmx_Destroy(fwd); p7_gmx_Destroy(bck); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { char *hmmfile = argv[1]; char *seqfile = argv[2]; ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_OMX *ox = NULL; P7_GMX *gx = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float sc; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); /* create default null model, then create and optimize profile */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); p7_oprofile_Logify(om); /* allocate DP matrices, both a generic and an optimized one */ ox = p7_omx_Create(gm->M, 0, sq->n); gx = p7_gmx_Create(gm->M, sq->n); /* Useful to place and compile in for debugging: p7_oprofile_Dump(stdout, om); dumps the optimized profile p7_omx_SetDumpMode(ox, TRUE); makes the fast DP algorithms dump their matrices p7_gmx_Dump(stdout, gx); dumps a generic DP matrix */ p7_ViterbiScore(sq->dsq, sq->n, om, ox, &sc); printf("viterbi (non-optimized): %.2f nats\n", sc); p7_GViterbi (sq->dsq, sq->n, gm, gx, &sc); printf("viterbi (generic): %.2f nats\n", sc); /* now in a real app, you'd need to convert raw nat scores to final bit * scores, by subtracting the null model score and rescaling. */ /* cleanup */ esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); p7_omx_Destroy(ox); p7_gmx_Destroy(gx); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_OMX *ox = NULL; P7_GMX *gx = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; float msvraw, nullsc, msvscore; float graw, gscore; double P, gP; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); /* Open sequence file for reading */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* create default null model, then create and optimize profile */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); om = p7_oprofile_Create(gm->M, abc); p7_oprofile_Convert(gm, om); /* allocate DP matrices, both a generic and an optimized one */ ox = p7_omx_Create(gm->M, 0, 0); /* one row version */ gx = p7_gmx_Create(gm->M, sq->n); /* Useful to place and compile in for debugging: p7_oprofile_Dump(stdout, om); dumps the optimized profile p7_omx_SetDumpMode(stdout, ox, TRUE); makes the fast DP algorithms dump their matrices p7_gmx_Dump(stdout, gx, p7_DEFAULT); dumps a generic DP matrix p7_oprofile_SameMSV(om, gm); */ //p7_oprofile_Dump(stdout, om); //p7_omx_SetDumpMode(stdout, ox, TRUE); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_oprofile_ReconfigLength(om, sq->n); p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_omx_GrowTo(ox, om->M, 0, sq->n); p7_gmx_GrowTo(gx, gm->M, sq->n); p7_MSVFilter (sq->dsq, sq->n, om, ox, &msvraw); p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); msvscore = (msvraw - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(msvscore, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); p7_GMSV(sq->dsq, sq->n, gm, gx, 2.0, &graw); gscore = (graw - nullsc) / eslCONST_LOG2; gP = esl_gumbel_surv(gscore, gm->evparam[p7_MMU], gm->evparam[p7_MLAMBDA]); if (esl_opt_GetBoolean(go, "-1")) { printf("%-30s %-20s %9.2g %7.2f %9.2g %7.2f\n", sq->name, hmm->name, P, msvscore, gP, gscore); } else if (esl_opt_GetBoolean(go, "-P")) { /* output suitable for direct use in profmark benchmark postprocessors: */ printf("%g %.2f %s %s\n", P, msvscore, sq->name, hmm->name); } else { printf("target sequence: %s\n", sq->name); printf("msv filter raw score: %.2f nats\n", msvraw); printf("null score: %.2f nats\n", nullsc); printf("per-seq score: %.2f bits\n", msvscore); printf("P-value: %g\n", P); printf("GMSV raw score: %.2f nats\n", graw); printf("GSMV per-seq score: %.2f bits\n", gscore); printf("GSMV P-value: %g\n", gP); } esl_sq_Reuse(sq); } /* cleanup */ esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); p7_omx_Destroy(ox); p7_gmx_Destroy(gx); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS*r = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET*abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm1, *gm2; int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N") / SSE16_NVALS; int MaxPart = esl_opt_GetInteger(go, "-M"); int NROUNDS = esl_opt_GetInteger(go, "-R"); int check = esl_opt_GetBoolean(go, "-c"); __m128 resdata[10]; int i, j; float *sc1 = (float*) resdata; ESL_SQFILE *sqfp = NULL; DATA_COPS16 *dcops; struct timeb tbstart, tbend; int sumlengths = 0; float* results = NULL; srand(time(NULL)); if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm1 = p7_profile_Create(hmm->M, abc); gm2 = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm1, L, p7_UNILOCAL); p7_ProfileConfig(hmm, bg, gm2, L, p7_UNILOCAL); dcops = p7_ViterbiCOPSw_Create(gm1); p7_ViterbiCOPSW_Setup(dcops, L+100, MaxPart); // use max L dcops->L = L; int dbsize = SSE16_NVALS*N; SEQ **seqsdb= calloc(dbsize+1, sizeof(SEQ*)); int equallength = 1; if (esl_sqfile_OpenDigital(abc, seqfile, eslSQFILE_FASTA, NULL, &sqfp) == eslOK) { // Use Sequence file ESL_SQ* sq = esl_sq_CreateDigital(abc); int maxseqs, len=0; if (esl_opt_IsDefault(go, "-N")) // N not specified in cmdline maxseqs = INT_MAX; // no limit else maxseqs = SSE16_NVALS*N; // use cmdline limit for (j = 0; j < maxseqs && esl_sqio_Read(sqfp, sq) == eslOK; j++) { if (equallength && sq->n != len && j > 0) equallength = 0; len = sq->n; if (j > dbsize) { seqsdb = realloc(seqsdb, 2*(dbsize+1)*sizeof(SEQ*)); dbsize *= 2; } ESL_DSQ* dsq = sq->dsq; seqsdb[j] = malloc(sizeof(SEQ)); seqsdb[j]->length = len; seqsdb[j]->seq = malloc((len+4)*sizeof(ESL_DSQ)); memcpy(seqsdb[j]->seq, dsq, len+2); sumlengths += len; esl_sq_Reuse(sq); } N = j/SSE16_NVALS; } else // Not found database. Generate random sequences for (i = 0; i < N; i++) for (j = 0; j < SSE16_NVALS; j++) { int len = L; // - rand()%1000; seqsdb[i*SSE16_NVALS+j] = malloc(sizeof(SEQ)); seqsdb[i*SSE16_NVALS+j]->seq = malloc(len+4); seqsdb[i*SSE16_NVALS+j]->length = len; esl_rsq_xfIID(r, bg->f, abc->K, len, seqsdb[i*SSE16_NVALS+j]->seq); sumlengths += len; } printf("Viterbi COPS Word with %d threads, model %s. ModelLen: %d, #Segms: %d, SeqL.: %d, #seqs: %d, Partition: %d, #parts: %d\n", NTHREADS, hmmfile, gm1->M, (int) ceil(gm1->M/SSE16_NVALS), L, SSE16_NVALS*N*NROUNDS, dcops->partition, dcops->Npartitions); /* // No. of partitions computed without full parallelism ( == no. of threads active while some are idle) int Niters_part = dcops->Npartitions % NTHREADS; // No. of Model lines that could be computed but are wasted by idle threads waiting on the end int Nwasted_threads = dcops->partition * ((NTHREADS-Niters_part) % NTHREADS); // No. of lines in the last partition that go beyond M. It's wasted comp time by a single thread int Nwasted_leftover= (dcops->partition - gm1->M % dcops->partition) % dcops->partition; // Total number of wasted lines int wastedcomp = Nwasted_threads + Nwasted_leftover; // Total number of lines computed and waited for int totalcomp = wastedcomp + gm1->M; // same as: roundtop(gm1->M, dcops->partition * NTHREADS); printf("Total Comp Lines: %d | Wasted Comp Lines: %d\n", totalcomp, wastedcomp); */ if (check) results = (float*) alloc_m128_aligned64((N+1)*2); ftime(&tbstart); if (!equallength) { // Sort sequences by length qsort(seqsdb, N*SSE16_NVALS, sizeof(SEQ*), compare_seqs); } for (j = 0; j < NROUNDS; j++) for (i = 0; i < N; i++) { // if (i % 1000 == 0) printf("Seq %d\n", i); p7_ViterbiCOPSw_run(dcops, seqsdb+i*SSE16_NVALS, sc1); if (check) memcpy(results+i*SSE16_NVALS, sc1, 32); // 32 bytes indeed! SSE16_NVALS floats } ftime(&tbend); double secs = TIMEDIFF(tbstart,tbend); w->elapsed = w->user = secs; esl_stopwatch_Display(stdout, w, "# Opt CPU time: "); double compmillioncells = NROUNDS * (double) sumlengths * (double) hmm->M * 1e-6; printf("# %.0fM cells in %.1f Mc/s\n", compmillioncells, compmillioncells / secs); if (check) { P7_OPROFILE *om = p7_oprofile_Create(hmm->M, gm1->abc); p7_oprofile_Convert(gm1, om); P7_OMX *ox = p7_omx_Create(hmm->M, 0, 0); printf("Compare results against base version\n"); for (i = 0; i < N; i++) { int maxll = 0; float sc2; for (j = 0; j < SSE16_NVALS; j++) if (maxll < seqsdb[i*SSE16_NVALS+j]->length) maxll = seqsdb[i*SSE16_NVALS+j]->length; p7_oprofile_ReconfigRestLength(om, maxll); // p7_ReconfigLength(gm2, maxll); // emulate the lock-step inter-sequence reconfigs for (j = 0; j < SSE16_NVALS; j++) { // p7_ReconfigLength(gm2, seqsdb[i*SSE16_NVALS+j]->length); // p7_Viterbi_unilocal(seqsdb[i*SSE16_NVALS+j]->seq, seqsdb[i*SSE16_NVALS+j]->length, gm2, &sc3); // p7_Viterbi_unilocal_word(seqsdb[i*SSE16_NVALS+j]->seq, seqsdb[i*SSE16_NVALS+j]->length, gm2, &sc2); // p7_oprofile_ReconfigLength(om, seqsdb[i*SSE16_NVALS+j]->length); p7_ViterbiFilter(seqsdb[i*SSE16_NVALS+j]->seq, seqsdb[i*SSE16_NVALS+j]->length, om, ox, &sc2); sc2 += 1.0; // -2.0nat optimization, Local to Unilocal mode if (fabs(results[i*SSE16_NVALS+j] - sc2) > 0.0001) { printf("Seq %d Len %4d: %f - %f\tdiff: %f\n", i*SSE16_NVALS+j, seqsdb[i*SSE16_NVALS+j]->length, results[i*SSE16_NVALS+j], sc2, fabs(results[i*SSE16_NVALS+j] - sc2)); } } } } return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; int format = eslSQFILE_UNKNOWN; P7_ANCHORS *anch = p7_anchors_Create(); P7_ANCHORHASH *ah = p7_anchorhash_Create(); P7_ENVELOPES *env = p7_envelopes_Create(); P7_REFMX *rxf = NULL; P7_REFMX *rxd = NULL; P7_REFMX *afu = NULL; P7_REFMX *afd = NULL; P7_REFMX *apu = NULL; P7_REFMX *apd = NULL; P7_TRACE *tr = NULL; float *wrk = NULL; P7_MPAS_PARAMS prm; P7_MPAS_STATS stats; float fsc, vsc, asc, asc_b; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Open sequence file */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Read one sequence */ status = esl_sqio_Read(sqfp, sq); if (status == eslEFORMAT) p7_Fail("Parse failed (sequence file %s)\n%s\n", sqfp->filename, sqfp->get_error(sqfp)); else if (status != eslOK) p7_Fail("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_profile_Config(gm, hmm, bg); /* Set the profile and null model's target length models */ p7_bg_SetLength (bg, sq->n); p7_profile_SetLength(gm, sq->n); /* Allocate DP matrices and tracebacks */ rxf = p7_refmx_Create(gm->M, sq->n); rxd = p7_refmx_Create(gm->M, sq->n); tr = p7_trace_Create(); afu = p7_refmx_Create(gm->M, sq->n); afd = p7_refmx_Create(gm->M, sq->n); /* First pass analysis */ p7_ReferenceViterbi (sq->dsq, sq->n, gm, rxf, tr, &vsc); p7_ReferenceForward (sq->dsq, sq->n, gm, rxf, &fsc); p7_ReferenceBackward(sq->dsq, sq->n, gm, rxd, NULL); p7_ReferenceDecoding(sq->dsq, sq->n, gm, rxf, rxd, rxd); /* Customize MPAS parameters if you want; these are the defaults. */ prm.max_iterations = 1000; prm.loss_threshold = 0.001; prm.nmax_sampling = FALSE; prm.be_verbose = FALSE; /* MPAS algorithm gets us an anchor set */ p7_reference_Anchors(rng, sq->dsq, sq->n, gm, rxf, rxd, tr, &wrk, ah, afu, afd, anch, &asc, &prm, &stats); //printf("# ASC Forward UP:\n"); p7_refmx_Dump(stdout, afu); //printf("# ASC Forward DOWN:\n"); p7_refmx_Dump(stdout, afd); /* We no longer need rxf and rxd. * Use their space for apu/apd pair, which will briefly * hold ASC Backward matrices, then get used for ASC Decoding. */ apu = rxf; p7_refmx_Reuse(apu); apd = rxd; p7_refmx_Reuse(apd); p7_ReferenceASCBackward(sq->dsq, sq->n, gm, anch->a, anch->D, apu, apd, &asc_b); //printf("# Backward score (raw, nats): %.2f\n", asc_b); //printf("# ASC Backward UP:\n"); p7_refmx_Dump(stdout, apu); //printf("# ASC Backward DOWN:\n"); p7_refmx_Dump(stdout, apd); /* ASC Decoding takes afu/afd and abu/abd as input; * overwrites abu/abd with decoding matrices */ p7_ReferenceASCDecoding(sq->dsq, sq->n, gm, anch->a, anch->D, afu, afd, apu, apd, apu, apd); //printf("# ASC Decoding UP matrix:\n"); p7_refmx_Dump(stdout, apu); //printf("# ASC Decoding DOWN:\n"); p7_refmx_Dump(stdout, apu); /* Envelope calculation needs to get four matrices: * ASC Decoding pair, apu/apd, and it will leave these constant; * ASC Forward pair, afu/afd, and it will overwrite these. */ p7_reference_Envelopes(sq->dsq, sq->n, gm, anch->a, anch->D, apu, apd, afu, afd, env); p7_envelopes_Dump(stdout, env); p7_envelopes_Destroy(env); p7_anchorhash_Destroy(ah); p7_anchors_Destroy(anch); if (wrk) free(wrk); p7_trace_Destroy(tr); p7_refmx_Destroy(afd); p7_refmx_Destroy(afu); p7_refmx_Destroy(rxd); p7_refmx_Destroy(rxf); esl_sq_Destroy(sq); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *seqfile = NULL; /* sequence file name */ char *maskfile = NULL; /* mask coordinate file name */ int infmt = eslSQFILE_UNKNOWN; /* format code for seqfile */ int outfmt = eslSQFILE_FASTA; /* format code for output seqs */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ ESL_FILEPARSER *maskefp = NULL; /* open mask coord file */ FILE *ofp = NULL; /* output stream for masked seqs */ char *source = NULL; /* name of current seq to mask */ char *p1, *p2; /* pointers used in parsing */ int64_t start, end; /* start, end coord for masking */ int64_t i, j, pos; /* coords in a sequence */ int64_t overmask; /* # of extra residues to mask */ ESL_SQ *sq = esl_sq_Create(); /* current sequence */ int do_fetching; int do_lowercase; int maskchar; int status; /* easel return code */ /**************************************************************************** * Parse command line ****************************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); do_fetching = esl_opt_GetBoolean(go, "-R"); do_lowercase = esl_opt_GetBoolean(go, "-l"); overmask = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0); maskchar = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m") : 'X'); seqfile = esl_opt_GetArg(go, 1); maskfile = esl_opt_GetArg(go, 2); /* Open the <seqfile>: text mode, not digital */ if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n", seqfile); else if (status == eslEFORMAT) cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile); else if (status == eslEINVAL) cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n"); else if (status != eslOK) cmdline_failure(argv[0], "Open failed, code %d.\n", status); if(do_fetching) { status = esl_sqfile_OpenSSI(sqfp, NULL); if (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n"); else if (status == eslERANGE) cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n"); else if (status != eslOK) cmdline_failure(argv[0], "Failed to open SSI index\n"); } /* Open the <maskfile> */ if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK) cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile); esl_fileparser_SetCommentChar(maskefp, '#'); /* Open the output file, if any */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /**************************************************************************** * Main loop over lines in <maskfile> ****************************************************************************/ /* Read one data line at a time from the <maskfile>; * parse into data fields <seqname> <start> <end> */ while (esl_fileparser_NextLine(maskefp) == eslOK) { /* First field is sequence name */ if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK) esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile); /* Get the sequence */ if (do_fetching) { /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */ status = esl_sqio_Fetch(sqfp, source, sq); if (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename); else if (status == eslEINVAL) esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename); } else { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */ status = esl_sqio_Read(sqfp, sq); if (status == eslEOF) esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename); if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0)) esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R"); } /* If we're masking by lowercase, first make sure everything's uppercase */ if (do_lowercase) for (pos = 0; pos < sq->n; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = toupper(sq->seq[pos]); /* Next two fields are <start>, <end> for the masking */ /* possible future extension: wrap loop around this, enable multiple masked regions */ if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK) esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile); start = strtoll(p1, &p2, 0) - 1; if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK) esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile); end = strtoll(p2, &p1, 0) - 1; /* Do the masking */ if (esl_opt_GetBoolean(go, "-r")) /* Reverse masking */ { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */ i = 0; j = ESL_MIN(sq->n-1, start - 1 + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); i = ESL_MAX(0, end + 1 - overmask); j = sq->n-1; for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } else { /* normal: mask start..end */ i = ESL_MAX(0, start - overmask); j = ESL_MIN(sq->n-1, end + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } esl_sqio_Write(ofp, sq, outfmt, FALSE); esl_sq_Reuse(sq); } esl_sq_Destroy(sq); esl_fileparser_Close(maskefp); esl_sqfile_Close(sqfp); esl_getopts_Destroy(go); if (ofp != stdout) fclose(ofp); return 0; }
int main(int argc, char **argv) { char *cmfile; ESL_ALPHABET *abc; char *seqfile; ESL_SQFILE *sqfp; int format; CM_FILE *cmfp; CM_t *cm; ESL_SQ *seq; float sc, rev_sc; Parsetree_t *tr; Fancyali_t *fali; Fancyali_t *rev_fali; int do_local; /* int status; */ /* char *optname; */ /* char *optarg; */ int optind; int status; char errbuf[eslERRBUFSIZE]; cmfile = seqfile = NULL; abc = NULL; sqfp = NULL; cmfp = NULL; cm = NULL; seq = NULL; tr = NULL; fali = NULL; rev_fali = NULL; format = eslSQFILE_UNKNOWN; do_local = TRUE; /* Should process options, but for now assume none and set optind */ optind = 1; if ( argc - optind != 2 ) cm_Die("Incorrect number of arguments\n"); cmfile = argv[optind++]; seqfile = argv[optind++]; if((status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf)) != eslOK) cm_Die("Failed to open covariance model save file\n"); if ((status = cm_file_Read(cmfp, TRUE, &abc, &cm)) != eslOK) cm_Die("Failed to read a CM from cm file\n"); if (cm == NULL) cm_Die("CM file empty?\n"); cm_file_Close(cmfp); if ( esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK ) cm_Die("Failed to open sequence database file\n"); if (do_local) cm->config_opts |= CM_CONFIG_LOCAL; if((status = cm_Configure(cm, errbuf, -1)) != eslOK) cm_Die(errbuf); /*SetMarginalScores_reproduce_bug_i27(cm);*/ seq = esl_sq_Create(); while ( esl_sqio_Read(sqfp, seq) == eslOK ) { if (seq->n == 0) continue; int i0 = 1; int j0 = seq->n; if (seq->dsq == NULL) esl_sq_Digitize(abc, seq); sc = TrCYK_DnC(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */ /* sc = TrCYK_Inside(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, FALSE, &tr); */ fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons, seq->dsq, FALSE, NULL); /* float sc, struct_sc; * ParsetreeScore(cm, NULL, NULL, tr, seq->dsq, FALSE, &sc, &struct_sc, NULL, NULL, NULL); * printf("Parsetree score: %.4f\n", sc); * ParsetreeDump(stdout, tr, cm, seq->dsq); */ FreeParsetree(tr); revcomp(abc, seq, seq); rev_sc = TrCYK_DnC(cm,seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */ rev_fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons,seq->dsq, FALSE, NULL); /*ParsetreeDump(stdout, tr, cm, seq->dsq);*/ FreeParsetree(tr); if (sc > rev_sc) { printf("sequence: %s\n", seq->name); printf("score: %.2f\n",sc); PrintFancyAli(stdout, fali, 0, FALSE, FALSE, 60); } else { printf("sequence: %s (reversed)\n", seq->name); printf("score: %.2f\n",rev_sc); PrintFancyAli(stdout, fali, seq->n, TRUE, FALSE, 60); } FreeFancyAli(fali); FreeFancyAli(rev_fali); esl_sq_Destroy(seq); seq = esl_sq_Create(); } esl_sq_Destroy(seq); FreeCM(cm); esl_sqfile_Close(sqfp); return EXIT_SUCCESS; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *gx1 = NULL; P7_GMX *gx2 = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; int format = eslSQFILE_UNKNOWN; char errbuf[eslERRBUFSIZE]; float fsc, bsc, vsc; float accscore; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_OpenDigital(abc, seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* multihit local: H3 default */ /* Allocations */ gx1 = p7_gmx_Create(gm->M, sq->n); gx2 = p7_gmx_Create(gm->M, sq->n); tr = p7_trace_CreateWithPP(); p7_FLogsumInit(); /* Run Forward, Backward; do OA fill and trace */ p7_GForward (sq->dsq, sq->n, gm, gx1, &fsc); p7_GBackward(sq->dsq, sq->n, gm, gx2, &bsc); p7_GDecoding(gm, gx1, gx2, gx2); /* <gx2> is now the posterior decoding matrix */ p7_GOptimalAccuracy(gm, gx2, gx1, &accscore); /* <gx1> is now the OA matrix */ p7_GOATrace(gm, gx2, gx1, tr); if (esl_opt_GetBoolean(go, "-d")) p7_gmx_Dump(stdout, gx2, p7_DEFAULT); if (esl_opt_GetBoolean(go, "-m")) p7_gmx_Dump(stdout, gx1, p7_DEFAULT); p7_trace_Dump(stdout, tr, gm, sq->dsq); if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) p7_Die("trace fails validation:\n%s\n", errbuf); printf("fwd = %.4f nats\n", fsc); printf("bck = %.4f nats\n", bsc); printf("acc = %.4f (%.2f%%)\n", accscore, accscore * 100. / (float) sq->n); p7_trace_Reuse(tr); p7_GViterbi(sq->dsq, sq->n, gm, gx1, &vsc); p7_GTrace (sq->dsq, sq->n, gm, gx1, tr); p7_trace_SetPP(tr, gx2); p7_trace_Dump(stdout, tr, gm, sq->dsq); printf("vit = %.4f nats\n", vsc); printf("acc = %.4f\n", p7_trace_GetExpectedAccuracy(tr)); /* Cleanup */ esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_gmx_Destroy(gx1); p7_gmx_Destroy(gx2); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* multifetch: * given a file containing lines with one name or key per line; * parse the file line-by-line; * if we have an SSI index available, retrieve the seqs by key * as we see each line; * else, without an SSI index, store the keys in a hash, then * read the entire seq file in a single pass, outputting seqs * that are in our keylist. * * Note that with an SSI index, you get the seqs in the order they * appear in the <keyfile>, but without an SSI index, you get seqs in * the order they occur in the seq file. */ static void multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp) { ESL_KEYHASH *keys = esl_keyhash_Create(); ESL_FILEPARSER *efp = NULL; int nseq = 0; int nkeys = 0; char *key; int keylen; int keyidx; int status; if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("Failed to open key file %s\n", keyfile); esl_fileparser_SetCommentChar(efp, '#'); while (esl_fileparser_NextLine(efp) == eslOK) { if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK) esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile); status = esl_keyhash_Store(keys, key, keylen, &keyidx); if (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile); /* if we have an SSI index, just fetch them as we go. */ if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp); nseq++; } nkeys++; } /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */ if (sqfp->data.ascii.ssi == NULL) { ESL_SQ *sq = esl_sq_Create(); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) || (sq->acc[0] != '\0' && esl_keyhash_Lookup(keys, sq->acc, -1, NULL) == eslOK)) { if (esl_opt_GetBoolean(go, "-r") ) if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s\n", sq->name); esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE); nseq++; } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sq_Destroy(sq); } if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq); if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq); esl_keyhash_Destroy(keys); esl_fileparser_Close(efp); return; }