/* output_filter_power() * * Used for testing whether the filters (MSV scores, Viterbi scores) * have the power they're supposed to have: for example, if MSV filter * is set at a P-value threshold of 0.02, ~2% of sequences should get * through, regardless of things like model and target sequence * length. * * Output a file suitable for constructing histograms over many HMMs, * for a particular choice of hmmsim'ed L and N targets: * <hmm name> <# of seqs passing threshold> <fraction of seqs passing threshold> * * SRE, Thu Apr 9 08:57:32 2009 [Janelia] xref J4/133 */ static int output_filter_power(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores) { double pthresh = esl_opt_GetReal(go, "--pthresh"); /* P-value threshold set for the filter score */ double P; /* calculated P-value (using HMM's own calibration) */ int npass = 0; /* number of scores that pass the P threshold */ double fpass; /* fraction of scores that pass the P threshold */ int i; /* counter over scores */ int do_gumbel; /* flag for how to determine P values */ double pmu, plambda; if (esl_opt_GetBoolean(go, "--vit")) { pmu = hmm->evparam[p7_VMU]; plambda = hmm->evparam[p7_VLAMBDA]; do_gumbel = TRUE; } else if (esl_opt_GetBoolean(go, "--msv")) { pmu = hmm->evparam[p7_MMU]; plambda = hmm->evparam[p7_MLAMBDA]; do_gumbel = TRUE; } else if (esl_opt_GetBoolean(go, "--fwd")) { pmu = hmm->evparam[p7_FTAU]; plambda = hmm->evparam[p7_FLAMBDA]; do_gumbel = FALSE; } else ESL_FAIL(eslEINVAL, errbuf, "can only use --ffile with viterbi, msv, or fwd scores"); for (i = 0; i < cfg->N; i++) { P = (do_gumbel ? esl_gumbel_surv(scores[i], pmu, plambda) : esl_exp_surv (scores[i], pmu, plambda)); if (P <= pthresh) npass++; } fpass = (double) npass / (double) cfg->N; fprintf(cfg->ffp, "%s\t%d\t%.4f\n", hmm->name, npass, fpass); return eslOK; }
static void emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm) { ESL_SQ *sq = NULL; float minl = esl_opt_GetReal(go, "--minl"); float minu = esl_opt_GetReal(go, "--minu"); if ((sq = esl_sq_Create()) == NULL) esl_fatal("failed to allocate sequence"); if (p7_emit_FancyConsensus(hmm, minl, minu, sq) != eslOK) esl_fatal("failed to create consensus seq"); if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name"); if (esl_sqio_Write(ofp, sq, outfmt, FALSE) != eslOK) esl_fatal("failed to write sequence"); esl_sq_Destroy(sq); return; }
int main(int argc, char **argv) { char *msg = "p7_gmx unit test driver failed"; ESL_GETOPTS *go = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; int M = esl_opt_GetInteger(go, "-M"); int L = esl_opt_GetInteger(go, "-L"); float tol = esl_opt_GetReal (go, "-t"); p7_FLogsumInit(); if (p7_hmm_Sample(r, M, abc, &hmm) != eslOK) esl_fatal(msg); if ((gm = p7_profile_Create(hmm->M, abc)) == NULL) esl_fatal(msg); if (p7_bg_SetLength(bg, L) != eslOK) esl_fatal(msg); if (p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL) != eslOK) esl_fatal(msg); utest_GrowTo(); utest_Compare(r, gm, bg, L, tol); esl_getopts_Destroy(go); esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); p7_profile_Destroy(gm); return eslOK; }
void output_clusters(ESL_GETOPTS *go, ESL_TREE *T, ESL_KEYHASH *kh) { ESL_STACK *ns = esl_stack_ICreate(); ESL_STACK *s2 = esl_stack_ICreate(); double threshold = esl_opt_GetReal(go, "-x"); int v,v2; int nc = 0; esl_stack_IPush(ns, 0); while (esl_stack_IPop(ns, &v) == eslOK) { /* v may only be an internal node. */ if (T->ld[v] < threshold) { nc++; printf("Cluster %d: %g\n", nc, T->ld[v]); esl_stack_IPush(s2, T->right[v]); esl_stack_IPush(s2, T->left[v]); while (esl_stack_IPop(s2, &v2) == eslOK) { if (v2 <= 0) printf("= %s \t%d\t%g\n", esl_keyhash_Get(kh, -v2), nc, T->ld[v]); else { esl_stack_IPush(s2, T->right[v2]); esl_stack_IPush(s2, T->left[v2]); } } printf("\n\n"); continue; } if (T->right[v] > 0) esl_stack_IPush(ns, T->right[v]); else printf("Singleton:\n= %s\t%c\t%c\n\n", esl_keyhash_Get(kh, -T->right[v]), '-', '-'); if (T->left[v] > 0) esl_stack_IPush(ns, T->left[v]); else printf("Singleton:\n= %s\t%c\t%c\n\n", esl_keyhash_Get(kh, -T->left[v]), '-', '-'); } esl_stack_Destroy(ns); esl_stack_Destroy(s2); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); P7_BG *bg = p7_bg_Create(abc); int M = esl_opt_GetInteger(go, "-M"); int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); float tol = esl_opt_GetReal (go, "-t"); p7_FLogsumInit(); utest_decoding(r, abc, bg, M, L, N, tol); esl_getopts_Destroy(go); esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); p7_bg_Destroy(bg); return eslOK; }
/* recalibrate_model() * * Optionally, user can choose (with --recal) to replace the * statistical parameters of the HMM. The calibrated params are used * to generate "expected" distributions in output plots. see * p7_Calibrate(), which this is a partial copy of. (p7_Calibrate() * requires a P7_BUILDER object that we don't have.) * * On success, returns <eslOK> and the statistical parameters of the * <hmm> have been recalibrated and replaced. */ static int recalibrate_model(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm) { P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; int EmL = esl_opt_GetInteger(go, "--EmL"); int EmN = esl_opt_GetInteger(go, "--EmN"); int EvL = esl_opt_GetInteger(go, "--EvL"); int EvN = esl_opt_GetInteger(go, "--EvN"); int EfL = esl_opt_GetInteger(go, "--EfL"); int EfN = esl_opt_GetInteger(go, "--EfN"); double Eft = esl_opt_GetReal (go, "--Eft"); double lambda, mmu, vmu, ftau; gm = p7_profile_Create(hmm->M, cfg->abc); p7_profile_Config(gm, hmm, cfg->bg); /* dual-mode multihit; L=0 (no length model needed right now) */ P7_HARDWARE *hw; if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure"); om = p7_oprofile_Create(gm->M, cfg->abc, hw->simd); p7_oprofile_Convert(gm, om); /* om is now *local* multihit */ p7_Lambda(hmm, cfg->bg, &lambda); p7_MSVMu (cfg->r, om, cfg->bg, EmL, EmN, lambda, &mmu); p7_ViterbiMu(cfg->r, om, cfg->bg, EvL, EvN, lambda, &vmu); p7_Tau (cfg->r, om, cfg->bg, EfL, EfN, lambda, Eft, &ftau); hmm->evparam[p7_MLAMBDA] = lambda; hmm->evparam[p7_VLAMBDA] = lambda; hmm->evparam[p7_FLAMBDA] = lambda; hmm->evparam[p7_MMU] = mmu; hmm->evparam[p7_VMU] = vmu; hmm->evparam[p7_FTAU] = ftau; hmm->flags |= p7H_STATS; p7_profile_Destroy(gm); p7_oprofile_Destroy(om); return eslOK; }
static void utest_FLogsumError(ESL_GETOPTS *go, ESL_RANDOMNESS *r) { int N = esl_opt_GetInteger(go, "-N"); float maxval = esl_opt_GetReal(go, "-S"); int be_verbose = esl_opt_GetBoolean(go, "-v"); float maxerr = 0.0; float avgerr = 0.0; int i; float a,b,result,exact,err; for (i = 0; i < N; i++) { a = (esl_random(r) - 0.5) * maxval * 2.; /* uniform draws on -maxval..maxval */ b = (esl_random(r) - 0.5) * maxval * 2.; exact = log(exp(a) + exp(b)); result = p7_FLogsum(a,b); err = fabs(exact-result) / maxval; avgerr += err; maxerr = ESL_MAX(maxerr, err); if (be_verbose) printf("%8.4f %8.4f %8.4f %8.4f %8.4f\n", a, b, exact, result, err); } avgerr /= (float) N; if (be_verbose) { printf("average error = %f\n", avgerr); printf("max error = %f\n", maxerr); } if (maxerr > 0.0001) esl_fatal("maximum error of %f is too high: logsum unit test fails", maxerr); if (avgerr > 0.0001) esl_fatal("average error of %f is too high: logsum unit test fails", avgerr); }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmdb> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* output file for results (default stdout) */ FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */ FILE *dfamtblfp = NULL; /* output stream for tabular Dfam format (--dfamtblout) */ FILE *aliscoresfp = NULL; /* output stream for alignment scores (--aliscoresout) */ // P7_HMM *hmm = NULL; /* one HMM query */ // P7_SCOREDATA *scoredata = NULL; int seqfmt = eslSQFILE_UNKNOWN; /* format of seqfile */ ESL_SQFILE *sqfp = NULL; /* open seqfile */ P7_HMMFILE *hfp = NULL; /* open HMM database file */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ P7_OPROFILE *om = NULL; /* target profile */ ESL_STOPWATCH *w = NULL; /* timing */ ESL_SQ *qsq = NULL; /* query sequence */ int nquery = 0; int textw; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; #ifdef HMMER_THREADS P7_OM_BLOCK *block = NULL; ESL_THREADS *threadObj= NULL; ESL_WORK_QUEUE *queue = NULL; #endif char errbuf[eslERRBUFSIZE]; double window_beta = -1.0 ; int window_length = -1; if (esl_opt_IsUsed(go, "--w_beta")) { if ( ( window_beta = esl_opt_GetReal(go, "--w_beta") ) < 0 || window_beta > 1 ) esl_fatal("Invalid window-length beta value\n"); } if (esl_opt_IsUsed(go, "--w_length")) { if (( window_length = esl_opt_GetInteger(go, "--w_length")) < 4 ) esl_fatal("Invalid window length value\n"); } w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--qformat")) { seqfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat")); if (seqfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat")); } /* validate options if running as a daemon */ // if (esl_opt_IsOn(go, "--daemon")) { /* running as a daemon, the input format must be type daemon */ // if (seqfmt != eslSQFILE_UNKNOWN && seqfmt != eslSQFILE_DAEMON) // esl_fatal("Input format %s not supported. Must be daemon\n", esl_opt_GetString(go, "--qformat")); // seqfmt = eslSQFILE_DAEMON; // if (strcmp(cfg->seqfile, "-") != 0) esl_fatal("Query sequence file must be '-'\n"); // } /* Open the target profile database to get the sequence alphabet */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem, trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); if (! hfp->is_pressed) p7_Fail("Failed to open binary auxfiles for %s: use hmmpress first\n", hfp->fname); hstatus = p7_oprofile_ReadMSV(hfp, &abc, &om); if (hstatus == eslEFORMAT) p7_Fail("bad format, binary auxfiles, %s:\n%s", cfg->hmmfile, hfp->errbuf); else if (hstatus == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); else if (hstatus != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); p7_oprofile_Destroy(om); p7_hmmfile_Close(hfp); /* Open the query sequence database */ status = esl_sqfile_OpenDigital(abc, cfg->seqfile, seqfmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->seqfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->seqfile); if (sqfp->format > 100) // breaking the law! That range is reserved for msa, for aligned formats p7_Fail("%s contains a multiple sequence alignment; expect unaligned sequences, like FASTA\n", cfg->seqfile); qsq = esl_sq_CreateDigital(abc); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } if (esl_opt_IsOn(go, "--tblout")) { if ((tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL) esl_fatal("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); } if (esl_opt_IsOn(go, "--dfamtblout")) { if ((dfamtblfp = fopen(esl_opt_GetString(go, "--dfamtblout"),"w")) == NULL) esl_fatal("Failed to open tabular dfam output file %s for writing\n", esl_opt_GetString(go, "--dfamtblout")); } if (esl_opt_IsOn(go, "--aliscoresout")) { if ((aliscoresfp = fopen(esl_opt_GetString(go, "--aliscoresout"),"w")) == NULL) esl_fatal("Failed to open alignment scores output file %s for writing\n", esl_opt_GetString(go, "--aliscoresout")); } output_header(ofp, go, cfg->hmmfile, cfg->seqfile); #ifdef HMMER_THREADS /* initialize thread data */ if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&ncpus); if (ncpus > 0) { threadObj = esl_threads_Create(&pipeline_thread); queue = esl_workqueue_Create(ncpus * 2); } #endif infocnt = (ncpus == 0) ? 1 : ncpus; ESL_ALLOC(info, sizeof(*info) * infocnt); for (i = 0; i < infocnt; ++i) { info[i].bg = p7_bg_Create(abc); #ifdef HMMER_THREADS info[i].queue = queue; #endif } #ifdef HMMER_THREADS for (i = 0; i < ncpus * 2; ++i) { block = p7_oprofile_CreateBlock(BLOCK_SIZE); if (block == NULL) esl_fatal("Failed to allocate sequence block"); status = esl_workqueue_Init(queue, block); if (status != eslOK) esl_fatal("Failed to add block to work queue"); } #endif /* Outside loop: over each query sequence in <seqfile>. */ while ((sstatus = esl_sqio_Read(sqfp, qsq)) == eslOK) { if (sstatus == eslEMEM) p7_Fail("Memory allocation error reading sequence file\n", status); if (sstatus == eslEINCONCEIVABLE) p7_Fail("Unexpected error %d reading sequence file\n", status); // if (qsq->L > NHMMER_MAX_RESIDUE_COUNT) p7_Fail("Input sequence %s in file %s exceeds maximum length of %d bases.\n", qsq->name, cfg->seqfile, NHMMER_MAX_RESIDUE_COUNT); nquery++; esl_stopwatch_Start(w); /* Open the target profile database */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, NULL); if (status != eslOK) p7_Fail("Unexpected error %d in opening hmm file %s.\n", status, cfg->hmmfile); #ifdef HMMER_THREADS /* if we are threaded, create a lock to prevent multiple readers */ if (ncpus > 0) { status = p7_hmmfile_CreateLock(hfp); if (status != eslOK) p7_Fail("Unexpected error %d creating lock\n", status); } #endif if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->acc[0] != 0 && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->desc[0] != 0 && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].pli = p7_pipeline_Create(go, 100, 100, TRUE, p7_SCAN_MODELS); /* M_hint = 100, L_hint = 100 are just dummies for now */ info[i].pli->hfp = hfp; /* for two-stage input, pipeline needs <hfp> */ p7_pli_NewSeq(info[i].pli, qsq); info[i].qsq = qsq; if ( esl_opt_IsUsed(go, "--toponly") ) info[i].pli->strand = p7_STRAND_TOPONLY; else if ( esl_opt_IsUsed(go, "--bottomonly") ) info[i].pli->strand = p7_STRAND_BOTTOMONLY; else info[i].pli->strand = p7_STRAND_BOTH; #ifdef HMMER_THREADS if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]); #endif } #ifdef HMMER_THREADS if (ncpus > 0) hstatus = thread_loop(threadObj, queue, hfp); else hstatus = serial_loop(info, hfp); #else hstatus = serial_loop(info, hfp); #endif switch(hstatus) { case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing */ break; default: p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); } /* modify e-value to account for number of models */ for (i = 0; i < info->th->N ; i++) { info->th->unsrt[i].lnP += log((float)info->pli->nmodels); info->th->unsrt[i].dcl[0].lnP = info->th->unsrt[i].lnP; info->th->unsrt[i].sortkey = -1.0 * info->th->unsrt[i].lnP; } /* it's possible to have duplicates based on how viterbi ranges can overlap */ p7_tophits_SortByModelnameAndAlipos(info->th); p7_tophits_RemoveDuplicates(info->th, info->pli->use_bit_cutoffs); /* Print results */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); //tally up total number of hits and target coverage info->pli->n_output = info->pli->pos_output = 0; for (i = 0; i < info->th->N; i++) { if ( (info->th->hit[i]->flags & p7_IS_REPORTED) || info->th->hit[i]->flags & p7_IS_INCLUDED) { info->pli->n_output++; info->pli->pos_output += abs(info->th->hit[i]->dcl[0].jali - info->th->hit[i]->dcl[0].iali) + 1; } } p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1)); if (dfamtblfp) p7_tophits_TabularXfam(dfamtblfp, qsq->name, NULL, info->th, info->pli); if (aliscoresfp) p7_tophits_AliScores(aliscoresfp, qsq->name, info->th ); esl_stopwatch_Stop(w); info->pli->nseqs = 1; p7_pli_Statistics(ofp, info->pli, w); if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); fflush(ofp); p7_hmmfile_Close(hfp); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); esl_sq_Reuse(qsq); } if (sstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (sstatus != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", sstatus, sqfp->filename); /* Terminate outputs - any last words? */ if (tblfp) p7_tophits_TabularTail(tblfp, "hmmscan", p7_SCAN_MODELS, cfg->seqfile, cfg->hmmfile, go); if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Cleanup - prepare for successful exit */ for (i = 0; i < infocnt; ++i) p7_bg_Destroy(info[i].bg); #ifdef HMMER_THREADS if (ncpus > 0) { esl_workqueue_Reset(queue); while (esl_workqueue_Remove(queue, (void **) &block) == eslOK) p7_oprofile_DestroyBlock(block); esl_workqueue_Destroy(queue); esl_threads_Destroy(threadObj); } #endif free(info); esl_sq_Destroy(qsq); esl_stopwatch_Destroy(w); esl_alphabet_Destroy(abc); esl_sqfile_Close(sqfp); if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return eslOK; ERROR: if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return status; }
static int output_header(FILE *ofp, ESL_GETOPTS *go, char *hmmfile, char *seqfile) { p7_banner(ofp, go->argv[0], banner); if (fprintf(ofp, "# query sequence file: %s\n", seqfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# target HMM database: %s\n", hmmfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--tblout") && fprintf(ofp, "# per-seq hits tabular output: %s\n", esl_opt_GetString(go, "--tblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--dfamtblout") && fprintf(ofp, "# hits output in Dfam format: %s\n", esl_opt_GetString(go, "--dfamtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--aliscoresout") && fprintf(ofp, "# alignment scores output: %s\n", esl_opt_GetString(go, "--aliscoresout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--acc") && fprintf(ofp, "# prefer accessions over names: yes\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-E") && fprintf(ofp, "# profile reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-T") && fprintf(ofp, "# profile reporting threshold: score >= %g\n", esl_opt_GetReal(go, "-T")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incE") && fprintf(ofp, "# profile inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incT") && fprintf(ofp, "# profile inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_ga") && fprintf(ofp, "# model-specific thresholding: GA cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_nc") && fprintf(ofp, "# model-specific thresholding: NC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_tc") && fprintf(ofp, "# model-specific thresholding: TC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--max") && fprintf(ofp, "# Max sensitivity mode: on [all heuristic filters off]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F1") && fprintf(ofp, "# MSV filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F2") && fprintf(ofp, "# Vit filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F3") && fprintf(ofp, "# Fwd filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nobias") && fprintf(ofp, "# biased composition HMM filter: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--B1") && fprintf(ofp, "# biased comp MSV window len: %d\n", esl_opt_GetInteger(go, "--B1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--B2") && fprintf(ofp, "# biased comp Viterbi window len: %d\n", esl_opt_GetInteger(go, "--B2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--B3") && fprintf(ofp, "# biased comp Forward window len: %d\n", esl_opt_GetInteger(go, "--B3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nonull2") && fprintf(ofp, "# null2 bias corrections: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--toponly") && fprintf(ofp, "# search only top strand: on\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--bottomonly") && fprintf(ofp, "# search only bottom strand: on\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-Z") && fprintf(ofp, "# sequence search space set to: %.0f\n", esl_opt_GetReal(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed")==0 && fprintf(ofp, "# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); else if ( fprintf(ofp, "# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--qformat") && fprintf(ofp, "# input seqfile format asserted: %s\n", esl_opt_GetString(go, "--qformat")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--w_beta") && fprintf(ofp, "# window length beta value: %g\n", esl_opt_GetReal(go, "--w_beta")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--w_length") && fprintf(ofp, "# window length : %d\n", esl_opt_GetInteger(go, "--w_length")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // if (esl_opt_IsUsed(go, "--daemon") && fprintf(ofp, "run as a daemon process\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); #ifdef HMMER_THREADS if (esl_opt_IsUsed(go, "--cpu") && fprintf(ofp, "# number of worker threads: %d\n", esl_opt_GetInteger(go, "--cpu")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); #endif if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0); cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); if (esl_opt_GetBoolean(go, "--pid")) { if (snprintf(outfile, 256, "%s.pid", basename) >= 256) esl_fatal("Failed to construct %%id table file name"); if ((cfg.pidfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile); } else cfg.pidfp = NULL; /* Open the MSA file, digital mode; determine alphabet */ if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); esl_msa_ConvertDegen2X(origmsa); esl_msa_Hash(origmsa); remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); if ( esl_stack_ObjectCount(teststack) >= 2) { /* randomize test domain order, and apply size limit if any */ esl_stack_Shuffle(cfg.r, teststack); if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest); ntestdom = esl_stack_ObjectCount(teststack); /* randomize training set alignment order, and apply size limit if any */ esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa); if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain); esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE); if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); if (cfg.pidfp) fclose(cfg.pidfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); eslx_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; double lambda = 0.0; double mmu = 0.0; double vmu = 0.0; double ftau = 0.0; int Z = esl_opt_GetInteger(go, "-Z"); int EmL = esl_opt_GetInteger(go, "--EmL"); int EmN = esl_opt_GetInteger(go, "--EmN"); int EvL = esl_opt_GetInteger(go, "--EvL"); int EvN = esl_opt_GetInteger(go, "--EvN"); int EfL = esl_opt_GetInteger(go, "--EfL"); int EfN = esl_opt_GetInteger(go, "--EfN"); int Eft = esl_opt_GetReal (go, "--Eft"); int iteration; int do_msv, do_vit, do_fwd; int status; if (esl_opt_GetBoolean(go, "--msvonly") == TRUE) { do_msv = TRUE; do_vit = FALSE; do_fwd = FALSE; } else if (esl_opt_GetBoolean(go, "--vitonly") == TRUE) { do_msv = FALSE; do_vit = TRUE; do_fwd = FALSE; } else if (esl_opt_GetBoolean(go, "--fwdonly") == TRUE) { do_msv = FALSE; do_vit = FALSE; do_fwd = TRUE; } else { do_msv = TRUE; do_vit = TRUE; do_fwd = TRUE; } if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (bg == NULL) bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, EvL, p7_LOCAL); /* the EvL doesn't matter */ om = p7_oprofile_Create(hmm->M, abc); p7_oprofile_Convert(gm, om); if (esl_opt_IsOn(go, "--lambda")) lambda = esl_opt_GetReal(go, "--lambda"); else p7_Lambda(hmm, bg, &lambda); for (iteration = 0; iteration < Z; iteration++) { if (do_msv) p7_MSVMu (r, om, bg, EmL, EmN, lambda, &mmu); if (do_vit) p7_ViterbiMu (r, om, bg, EvL, EvN, lambda, &vmu); if (do_fwd) p7_Tau (r, om, bg, EfL, EfN, lambda, Eft, &ftau); printf("%s %.4f %.4f %.4f %.4f\n", hmm->name, lambda, mmu, vmu, ftau); } p7_hmm_Destroy(hmm); p7_profile_Destroy(gm); p7_oprofile_Destroy(om); } p7_hmmfile_Close(hfp); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt; /* format code for alifiles */ ESL_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *msa = NULL; /* multiple sequence alignment */ int status; /* easel return code */ int do_info = TRUE; /* TRUE if -i */ int do_max = FALSE; /* TRUE if -x */ int do_ffreq = FALSE; /* TRUE if --ffreq */ int do_fmin = FALSE; /* TRUE if --fmin */ float fthresh = 0.; /* <x> from -f <x> */ int do_remove_bps = FALSE; /* TRUE if -r */ int do_consistent = FALSE; /* TRUE if -c */ int do_indi2cons = FALSE; /* TRUE if --indi <x> */ int have_cons; /* TRUE if first alignment has consensus sequence */ int do_newcons = FALSE; /* TRUE if we're creating a new consensus structure * and outputing a new alignment (if -x -f -c or --indi) */ int do_a = FALSE; /* TRUE if -a */ char *indi; /* for <x> from --indi <x> */ int nindi_read; /* number of individual sequence SS lines we've read for current alignment */ int a; /* counter over seqs */ int i, i2; /* counter over residues */ int j, j2; /* counter over residues */ int nali; /* counter over alignments */ int **bp = NULL; /* bp[i][j] is number of individual bps exist between aln cols i and j */ int *cur_ct = NULL; /* ct array of basepairs for current sequence */ int *cons_ct = NULL; /* ct array of basepairs for SS_cons being created */ int *xcons_ct = NULL; /* ct array of basepairs for existing SS_cons */ int *ngaps = NULL; /* number of gaps in each alignment position */ FILE *ofp; /* output file (default is stdout) */ int be_verbose = FALSE; /* TRUE to print extra info */ int seqthresh; /* sequence number threshold for defining a bp as consensus (int) ((fthresh * nseq) + 0.5)*/ char *sscons = NULL; /* the new SS_cons line */ FILE *lfp = NULL; /* file to list sequences with conflicting bps to */ int nlist = 0; /* number of sequences listed to list file */ int *nconflictsA; /* number of conflicting bps in seq a's individual structure annotation */ int nconflicts_total = 0; /* total number of conflicts */ int nconflicts_list = 0; /* total number of conflicts in sequences listed to file <x> from -l <x> */ int noverlaps_total = 0; /* total number of overlaps */ int nconsistent_total = 0; /* total number of consistent bps */ int nbps_total = 0; /* total number of bps */ int *nconsistentA; /* number of consistent bps in seq a's individual structure annotation */ int *noverlapsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int *nbpsA; /* number of bps in seq a's indi structure that overlap with consensus structure */ int ncons_bps = 0; /* number of bps in consensus structure */ int max_noverlaps_aidx; int max_nconsistent_aidx; int max_nbps_aidx; int *removebp; /* removebp[i] is TRUE remove consensus bp [i]:xcons_ct[i] */ int *has_conflict; int *nmates_l2r; /* half matrix, nmate_l2r[i] = <x>, i < nmate_l2r[i], there are <x> different right mates j for i */ int *nmates_r2l; /* half matrix, nmate_r2l[j] = <x>, j < nmate_r2l[j], there are <x> different left mates i for j */ int lmax; /* with -l, maximum number of conflicts to allow */ int namewidth = 18; /* length of 'SS_cons(consensus)' */ char *namedashes = NULL; /* to store underline for seq name */ /* --fmin related variables */ int nbps = 0; int prev_nbps = -1; float fmin; int inconsistent_flag; int pknot_flag; int k,l; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\noptions for defining a new consensus structure (all of these require -o):"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\noptions for listing sequences based on structure:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (status = esl_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); /* open output file */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = NULL; if (esl_opt_GetString(go, "-l") != NULL) { if ((lfp = fopen(esl_opt_GetString(go, "-l"), "w")) == NULL) esl_fatal("Failed to open -l output file %s\n", esl_opt_GetString(go, "-l")); } /* determine if we're creating a structure */ do_max = esl_opt_GetBoolean(go, "-x"); if(!(esl_opt_IsDefault(go, "--ffreq"))) { do_ffreq = TRUE; fthresh = esl_opt_GetReal(go, "--ffreq"); } if(!(esl_opt_IsDefault(go, "--fmin"))) { do_fmin = TRUE; } do_remove_bps = esl_opt_GetBoolean(go, "-r"); do_consistent = esl_opt_GetBoolean(go, "-c"); if(!(esl_opt_IsDefault(go, "--indi"))) { do_indi2cons = TRUE; } if(do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_newcons = TRUE; } do_a = esl_opt_GetBoolean(go, "-a"); if(do_a || do_max || do_ffreq || do_fmin || do_remove_bps || do_consistent || do_indi2cons) { do_info = FALSE; } /*********************************************** * Read MSAs one at a time. ***********************************************/ nali = 0; have_cons = FALSE; lmax = esl_opt_GetInteger(go, "--lmax"); if(esl_opt_GetBoolean(go, "-v")) be_verbose = TRUE; while ((status = esl_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) esl_msafile_ReadFailure(afp, status); nali++; /* determine max length name */ namewidth = 18; /* length of 'SS_cons(consensus)' */ for(i = 0; i < msa->nseq; i++) namewidth = ESL_MAX(namewidth, strlen(msa->sqname[i])); if(namedashes != NULL) { free(namedashes); } ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(i = 0; i < namewidth; i++) namedashes[i] = '-'; ESL_ALLOC(sscons, sizeof(char) * (msa->alen+1)); ESL_ALLOC(cur_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(cons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(xcons_ct, sizeof(int) * (msa->alen+1)); ESL_ALLOC(bp, sizeof(int *) * (msa->alen+1)); ESL_ALLOC(removebp, sizeof(int) * (msa->alen+1)); ESL_ALLOC(has_conflict, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_l2r, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmates_r2l, sizeof(int) * (msa->alen+1)); esl_vec_ISet(cur_ct, (msa->alen+1), 0); esl_vec_ISet(cons_ct, (msa->alen+1), 0); esl_vec_ISet(xcons_ct, (msa->alen+1), 0); esl_vec_ISet(removebp, (msa->alen+1), FALSE); esl_vec_ISet(has_conflict, (msa->alen+1), FALSE); esl_vec_ISet(nmates_l2r, (msa->alen+1), 0); esl_vec_ISet(nmates_r2l, (msa->alen+1), 0); ESL_ALLOC(nconflictsA, sizeof(int) * msa->nseq); ESL_ALLOC(noverlapsA, sizeof(int) * msa->nseq); ESL_ALLOC(nconsistentA, sizeof(int) * msa->nseq); ESL_ALLOC(nbpsA, sizeof(int) * msa->nseq); esl_vec_ISet(nconflictsA, msa->nseq, 0); esl_vec_ISet(noverlapsA, msa->nseq, 0); esl_vec_ISet(nconsistentA, msa->nseq, 0); esl_vec_ISet(nbpsA, msa->nseq, 0); max_noverlaps_aidx = max_nconsistent_aidx = max_nbps_aidx = 0; nconsistent_total = nbps_total = noverlaps_total = nconflicts_total = nconflicts_list = 0; for(i = 1; i <= msa->alen; i++) { ESL_ALLOC(bp[i], sizeof(int) * (msa->alen+1)); esl_vec_ISet(bp[i], (msa->alen+1), 0); } /* make sure we have ss_cons and indi ss if we need it */ if(msa->ss_cons == NULL && do_remove_bps) esl_fatal("-r requires all alignments have SS_cons annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_max) esl_fatal("-x requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_consistent) esl_fatal("-c requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_indi2cons) esl_fatal("--indi requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_ffreq) esl_fatal("--ffreq requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss == NULL && do_fmin) esl_fatal("--fmin requires all alignments have individual SS annotation, alignment %d does not.", nali); if(msa->ss_cons != NULL) { if((status = esl_wuss2ct(msa->ss_cons, msa->alen, xcons_ct)) != eslOK) { esl_fatal("Existing SS_cons for alignment %d is invalid.", nali); } ncons_bps = 0; for(i = 1; i <= msa->alen; i++) if(xcons_ct[i] != 0 && i < xcons_ct[i]) ncons_bps++; if(nali > 1 && !have_cons) esl_fatal("the first aln has SS_cons but aln %d lacks it, if one has it, they all must.", nali); if(nali == 1) have_cons = TRUE; } else if (lfp != NULL) { esl_fatal("the -l option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_remove_bps) { esl_fatal("the -r option requires existing SS_cons annotation, aln %d lacks it.", nali); } else if (do_consistent) { esl_fatal("the -c option requires existing SS_cons annotation, aln %d lacks it.", nali); } else { if(nali > 1 && have_cons) esl_fatal("the first aln does not have SS_cons but aln %d does, if one has it, they all must.", nali); } if(do_info) { printf("# Per-sequence basepair information:\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); if(msa->name != NULL) { printf("# Alignment name: %s\n", msa->name); } if(have_cons) { printf("#\n"); printf("# indibp: number of basepairs in the individual sequence SS annotation\n"); printf("# ovrlap: number of indibp basepairs that also exist as consensus basepairs\n"); printf("# cnsist: number of indibp basepairs that do not conflict with any consensus basepairs\n"); printf("# cnflct: number of indibp basepairs that conflict with >= 1 consensus basepairs\n"); printf("#\n"); printf("# A conflict exists between two basepairs in different structures, one between columns i and j\n"); printf("# and the other between columns k and l, if (i == k and j != l) or (j == l and i != k).\n"); printf("#\n"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, "seqname", "indibp", "ovrlap", "cnsist", "cnflct"); printf("# %-*s %6s %6s %6s %6s\n", namewidth, namedashes, "------", "------", "-----", "------"); } else { printf("# %-*s %6s\n", namewidth, "seqname", "nbp"); printf("# %-*s %6s\n", namewidth, namedashes, "------"); } } nindi_read = 0; for (a = 0; a < msa->nseq; a++) { if(msa->ss != NULL && msa->ss[a] != NULL) { if((status = esl_wuss2ct(msa->ss[a], msa->alen, cur_ct)) != eslOK) { esl_fatal("SS annotation for sequence %d, aln %d is invalid.\n", (a+1), nali); } nindi_read++; for(i = 1; i <= msa->alen; i++) { if(i < cur_ct[i]) { bp[i][cur_ct[i]]++; if(bp[i][cur_ct[i]] == 1) { nmates_l2r[i]++; nmates_r2l[cur_ct[i]]++; } } } for(i = 1; i <= msa->alen; i++) { if(cur_ct[i] != 0 && i < cur_ct[i]) { if(xcons_ct[i] == cur_ct[i]) noverlapsA[a]++; if((xcons_ct[i] != 0) && (xcons_ct[i] != cur_ct[i])) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], i, cur_ct[i], i, xcons_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with i:xcons_ct[i] */ removebp[i] = TRUE; removebp[xcons_ct[i]] = TRUE; } else if((xcons_ct[cur_ct[i]] != 0) && (xcons_ct[cur_ct[i]] != i) && (cur_ct[xcons_ct[cur_ct[i]]] == 0)) { if(be_verbose) { printf("ali: %2d seq %3d (%s) bp %4d:%4d conflicts with consensus bp %4d:%4d\n", nali, a, msa->sqname[a], xcons_ct[i], cur_ct[xcons_ct[i]], xcons_ct[cur_ct[i]], cur_ct[i]); } nconflictsA[a]++; /* indi bp i:cur_ct[i] conflicts with xcons_ct[cur_ct[i]]:cur_ct[i] */ removebp[cur_ct[i]] = TRUE; removebp[xcons_ct[cur_ct[i]]] = TRUE; } else nconsistentA[a]++; } } if(nconflictsA[a] > lmax) { if(lfp != NULL) fprintf(lfp, "%s\n", msa->sqname[a]); nconflicts_list += nconflictsA[a]; nlist++; } nbpsA[a] = nconflictsA[a] + nconsistentA[a]; nconflicts_total += nconflictsA[a]; nconsistent_total += nconsistentA[a]; noverlaps_total += noverlapsA[a]; nbps_total += nbpsA[a]; if(do_info && have_cons) printf(" %-*s %6d %6d %6d %6d\n", namewidth, msa->sqname[a], nbpsA[a], noverlapsA[a], nconsistentA[a], nconflictsA[a]); if(do_info && !have_cons) printf(" %-*s %6d\n", namewidth, msa->sqname[a], nbpsA[a]); if(nbpsA[a] > nbpsA[max_nbps_aidx]) max_nbps_aidx = a; if((noverlapsA[a] > noverlapsA[max_noverlaps_aidx]) || ((noverlapsA[a] == noverlapsA[max_noverlaps_aidx]) && (nbpsA[a] > nbpsA[max_noverlaps_aidx]))) max_noverlaps_aidx = a; if((nconsistentA[a] > nconsistentA[max_nconsistent_aidx]) || ((nconsistentA[a] == nconsistentA[max_nconsistent_aidx]) && (nbpsA[a] > nbpsA[max_nconsistent_aidx]))) max_nconsistent_aidx = a; } else if(do_newcons || esl_opt_GetBoolean(go, "-a")) { esl_fatal("No SS annotation for sequence %d, aln %d.\n", (a+1), nali); } } if(do_info && have_cons) { if(nindi_read > 0) printf("\n"); printf(" %-*s %6d %6d %6d %6d\n", namewidth, "SS_cons(consensus)", ncons_bps, ncons_bps, ncons_bps, 0); if(nindi_read > 0) { printf("\n# %6d/%6d (%.3f) overlap\n", noverlaps_total, nbps_total, nbps_total > 0 ? (float) noverlaps_total / (float) nbps_total : 0.); printf("# %6d/%6d (%.3f) consistent\n", nconsistent_total, nbps_total, nbps_total > 0 ? (float) nconsistent_total / (float) nbps_total: 0.); printf("# %6d/%6d (%.3f) conflict\n", nconflicts_total, nbps_total, nbps_total > 0 ? (float) nconflicts_total / (float) nbps_total: 0.); } else { printf("# No sequences in the alignment have GR SS annotation.\n"); } } if(lfp != NULL) { printf("# %d/%d sequences with %.3f individual bps on avg that conflict with SS_cons written to %s\n", nlist, msa->nseq, (float) nconflicts_list / (float) nlist, esl_opt_GetString(go, "-l")); } /* determine number of gaps per alignment column */ if((status = get_gaps_per_column(msa, &ngaps)) != eslOK) goto ERROR; /* -x: determine max bp structure OR * -a: list all conflicts in individual structures */ if(do_max || do_a) { for(i = 1; i <= msa->alen; i++) { if(nmates_l2r[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[i][j] > 0) { if(do_a) printf("More than 1 right mates for left mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, i, j, bp[i][j], msa->nseq - ngaps[i], (float) bp[i][j] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { if(nmates_r2l[i] > 1) {/* list the conflicts */ has_conflict[i] = TRUE; for(j = 1; j <= msa->alen; j++) { if(bp[j][i] > 0) { if(do_a) printf("More than 1 left mates for right mate %4d %4d:%4d bp exists in %4d/%4d seqs (%.3f)\n", i, j, i, bp[j][i], msa->nseq - ngaps[i], (float) bp[j][i] / (float) (msa->nseq - ngaps[i])); has_conflict[j] = TRUE; } } } } for(i = 1; i <= msa->alen; i++) { /*printf("conflict[%4d]: %d\n", i, has_conflict[i]);*/ if(nmates_l2r[i] == 1 && (!(has_conflict[i]))) { j = i+1; while(bp[i][j] == 0) j++; cons_ct[i] = j; cons_ct[j] = i; } } /* remove pseudoknotted bps greedily */ for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { for(i2 = i+1; i2 <= msa->alen; i2++) { j2 = cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { /*printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]);*/ /* note: remove both if they have equal number of sequences */ if(bp[i][j] <= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i, j);*/ cons_ct[cons_ct[i]] = 0; cons_ct[i] = 0; } if(bp[i][j] >= bp[i2][j2]) { /*printf("rm %4d:%4d\n", i2, j2);*/ cons_ct[cons_ct[i2]] = 0; cons_ct[i2] = 0; } } } } } } } /***************************************/ /*PARANOID, second check for knots for(i = 1; i <= msa->alen; i++) { j = cons_ct[i]; if(j != 0 && i < j) { printf("BP: %4d:%4d\n", i, j); for(i2 = 1; i2 <= msa->alen; i2++) { j2 = cons_ct[i2]; if(j2 != 0 && i2 < j2) { if((i2 < j) && (j < j2)) { if((i < i2)) { printf("KNOT %4d:%4d (%4d) %4d:%4d (%4d)\n", i, j, bp[i][j], i2, j2, bp[i2][j2]); } } } } } } ******************************************/ /***************************************/ /*PARANOID, check cons_ct for consistency for(i = 1; i <= msa->alen; i++) { if(cons_ct[i] != 0) { if(cons_ct[cons_ct[i]] != i) { printf("ERROR: i: %4d cons_ct[i]: %4d cons_ct[cons_ct[i]]: %4d\n", i, cons_ct[i], cons_ct[cons_ct[i]]); } } } */ /*PARANOID, write out SS_cons for(i = 1; i <= msa->alen; i++) { if(i < cons_ct[i]) printf("<"); else if(cons_ct[i] != 0) { printf(">"); } else printf("."); } printf("\n"); */ /***************************************/ /* textize alignment */ if((status = esl_msa_Textize(msa)) != eslOK) esl_fatal("ERROR textizing alignment %d\n", nali); /* --fmin */ if(do_fmin) { /* define ss_cons */ prev_nbps = -1; fthresh = 0.99; inconsistent_flag = pknot_flag = FALSE; printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > <x> individual SS contain i:j as a pair\n"); printf("# We'll search for minimal <x> that gives a consistent consensus structure.\n"); printf("# A consistent structure has each position involved in 0 or 1 basepairs.\n"); printf("#\n"); printf("# Alignment file: %s\n", alifile); printf("# Alignment idx: %d\n", nali); printf("# Number of seqs: %d\n", msa->nseq); printf("#\n"); printf("# %5s %23s %6s\n", "<x>", "nseq-required-with-bp", "numbps"); printf("# %5s %23s %6s\n", "-----", "-----------------------", "------"); while(fthresh >= 0.00 && (inconsistent_flag == FALSE) && (pknot_flag == FALSE)) { nbps = 0; seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ esl_vec_ISet(cons_ct, msa->alen+1, 0); for(i = 1; i <= msa->alen; i++) { for(j = i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0 || cons_ct[j] != 0) { inconsistent_flag = TRUE; } /* check for pseudoknots */ for(k = i+1; k < j; k++) { l = cons_ct[k]; if((k < l) && (l > j)) { pknot_flag = TRUE; } if((k > l) && (l != 0) && (l < i)) { pknot_flag = TRUE; } } cons_ct[i] = j; cons_ct[j] = i; nbps++; } } } if(inconsistent_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "inconsistent"); else if(pknot_flag) printf(" %.3f %23d %s\n", fthresh, seqthresh+1, "pseudoknotted"); else { if(nbps != prev_nbps) { printf(" %.3f %23d %6d\n", fthresh, seqthresh+1, nbps); } fmin = fthresh; } fthresh -= 0.01; prev_nbps = nbps; } fthresh = fmin; esl_vec_ISet(cons_ct, msa->alen+1, 0); } /* --ffreq: determine structure by defining consensus bps that occur in <x> fraction of indi structures */ if(do_ffreq || do_fmin) { if(do_fmin) { printf("#\n# <x> determined to be %.3f\n", fthresh); } if(do_ffreq) { printf("# Defining consensus structure:\n"); printf("# indi SS basepair aln columns i:j (from at least 1 indi SS) will become consensus basepair\n"); printf("# if > %f individual SS contain i:j as a pair\n", fthresh); } esl_vec_ISet(cons_ct, msa->alen+1, 0); /* define ss_cons */ seqthresh = (int) (fthresh * msa->nseq); /*printf("fthresh: %f seqthresh: %d nseq: %d\n", fthresh, seqthresh, msa->nseq);*/ for(i = 1; i <= msa->alen; i++) { for(j = i+1; j <= msa->alen; j++) { if(bp[i][j] > seqthresh) { if(cons_ct[i] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", i, i, cons_ct[i], i, j); } if(cons_ct[j] != 0) { esl_fatal("ERROR, two base pairs including position %d satisfy threshold (%d:%d and %d:%d)!\n", j, j, cons_ct[j], i, j); } cons_ct[i] = j; cons_ct[j] = i; } } } } /* -r: redefine consensus struct by removing any bps that conflict with individual structures */ if(do_remove_bps) { for(i = 1; i <= msa->alen; i++) { if(!(removebp[i])) { cons_ct[i] = xcons_ct[i]; cons_ct[cons_ct[i]] = i; } else { printf("# Removing consensus bp: %d:%d\n", i, xcons_ct[i]); cons_ct[xcons_ct[i]] = 0; cons_ct[i] = 0; } } } /* -c: define consensus structure as indi sequence with highest number of consistent bps with structure OR */ /* --indi: define consensus structure as indi sequence <x> from --indi <x> */ if(do_consistent || do_indi2cons) { if(do_indi2cons) { indi = esl_opt_GetString(go, "--indi"); for(a = 0; a < msa->nseq; a++) { if(strcmp(indi, msa->sqname[a]) == 0) break; } if(a == msa->nseq) esl_fatal("ERROR, could not find a sequence named %s in the alignment.\n", indi); } else { /* do_consistent */ a = max_nconsistent_aidx; } if(msa->ss == NULL || msa->ss[a] == NULL) esl_fatal("ERROR, no individual SS annotation for %s in the alignment.\n", msa->sqname[a]); if((status = esl_wuss2ct(msa->ss[a], msa->alen, cons_ct)) != eslOK) { esl_fatal("Second pass... SS annotation for sequence %d, aln %d is invalid.\n", (a), nali); } printf("# Defined new SS_cons as SS annotation for %s (%d basepairs)\n", msa->sqname[a], nbpsA[a]); if(esl_opt_GetBoolean(go, "--rfc") || esl_opt_GetBoolean(go, "--rfindi")) { if(msa->rf != NULL) { free(msa->rf); msa->rf = NULL; } if((status = esl_strcat(&(msa->rf), -1, msa->aseq[a], msa->alen)) != eslOK) goto ERROR; printf("# Defined new RF as %s sequence\n", msa->sqname[a]); } } /* write out alignment with new SS_cons */ if(do_newcons) { if((status = esl_ct2wuss(cons_ct, msa->alen, sscons)) != eslOK) goto ERROR; if(msa->ss_cons != NULL) { free(msa->ss_cons); msa->ss_cons = NULL; } if((status = esl_strcat(&(msa->ss_cons), -1, sscons, msa->alen)) != eslOK) goto ERROR; status = esl_msafile_Write(ofp, msa, (esl_opt_GetBoolean(go, "--pfam") ? eslMSAFILE_PFAM : eslMSAFILE_STOCKHOLM)); if (status == eslEMEM) esl_fatal("Memory error when outputting alignment\n"); else if (status != eslOK) esl_fatal("Writing alignment file failed with error %d\n", status); } free(sscons); free(cur_ct); free(cons_ct); free(xcons_ct); for(i = 1; i <= msa->alen; i++) free(bp[i]); free(bp); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); /* Cleanup, normal return */ if(lfp != NULL) fclose(lfp); if(ofp != NULL) { printf("# Alignment(s) saved to file %s\n", esl_opt_GetString(go, "-o")); fclose(ofp); } esl_msafile_Close(afp); esl_getopts_Destroy(go); return 0; ERROR: if(afp) esl_msafile_Close(afp); if(go) esl_getopts_Destroy(go); if(msa) esl_msa_Destroy(msa); if(lfp) fclose(lfp); if(ofp) fclose(ofp); esl_fatal("ERROR\n"); return 1; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; char *ghmmfile = esl_opt_GetArg(go, 1); /* HMMs parameterized for sequence generation */ char *ahmmfile = esl_opt_GetArg(go, 2); /* HMMs parameterized for alignment */ int N = esl_opt_GetInteger(go, "-N"); P7_HMMFILE *ghfp = NULL; P7_HMMFILE *ahfp = NULL; P7_HMM *ghmm = NULL; P7_HMM *ahmm = NULL; P7_PROFILE *ggm = NULL; P7_PROFILE *agm = NULL; P7_OPROFILE *aom = NULL; P7_BG *bg = NULL; ESL_SQ *sq = NULL; P7_TRACE *reftr = p7_trace_Create(); P7_TRACE *testtr = p7_trace_Create(); P7_TRACE_METRICS *tmetrics = p7_trace_metrics_Create(); P7_REFMX *rmx = p7_refmx_Create(100,100); // P7_FILTERMX *ox = NULL; P7_HARDWARE *hw; if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure"); P7_SPARSEMASK *sm = p7_sparsemask_Create(100, 100, hw->simd); P7_SPARSEMX *sxv = p7_sparsemx_Create(NULL); int idx; char errbuf[eslERRBUFSIZE]; int status; p7_Init(); /* open HMM file containing models parameterized for generation (sampling) of seqs */ status = p7_hmmfile_OpenE(ghmmfile, NULL, &ghfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ghmmfile, errbuf); /* open HMM file containing models parameterized for alignment (may be the same as ghmmfile) */ status = p7_hmmfile_OpenE(ahmmfile, NULL, &ahfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ahmmfile, errbuf); while ( (status = p7_hmmfile_Read(ghfp, &abc, &ghmm)) == eslOK) /* <abc> gets set on first read */ { /* read the counterpart HMM from <ahfp> */ status = p7_hmmfile_Read(ahfp, &abc, &ahmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ahfp->fname, ahfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ahfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", ahfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", ahfp->fname); /* try to validate that they're the "same" */ if (ahmm->M != ghmm->M || strcmp(ahmm->name, ghmm->name) != 0) p7_Fail("<gen-hmmfile>, <ali-hmmfile> contain different set or order of models"); /* deferred one-time creation of structures that need to know the alphabet */ if (!bg) bg = p7_bg_Create(abc); if (!sq) sq = esl_sq_CreateDigital(abc); ggm = p7_profile_Create(ghmm->M, abc); agm = p7_profile_Create(ahmm->M, abc); aom = p7_oprofile_Create(ahmm->M, abc, hw->simd); p7_profile_ConfigCustom(ggm, ghmm, bg, esl_opt_GetInteger(go, "--gL"), esl_opt_GetReal(go, "--gnj"), esl_opt_GetReal(go, "--gpglocal")); p7_profile_ConfigCustom(agm, ahmm, bg, 100, esl_opt_GetReal(go, "--anj"), esl_opt_GetReal(go, "--apglocal")); p7_oprofile_Convert(agm, aom); for (idx = 1; idx <= N; idx++) { p7_ProfileEmit(rng, ghmm, ggm, bg, sq, reftr); if (esl_opt_GetBoolean(go, "--dumpseqs")) { esl_sq_FormatName(sq, "seq%d", idx); esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE); } p7_bg_SetLength(bg, sq->n); p7_profile_SetLength(agm, sq->n); p7_sparsemask_Reinit(sm, agm->M, sq->n); p7_sparsemask_AddAll(sm); if (esl_opt_GetBoolean(go, "--vit")) p7_ReferenceViterbi(sq->dsq, sq->n, agm, rmx, testtr, /*opt_vsc=*/NULL); else p7_SparseViterbi (sq->dsq, sq->n, agm, sm, sxv, testtr, /*opt_vsc=*/NULL); p7_trace_metrics(reftr, testtr, tmetrics); p7_sparsemask_Reuse(sm); p7_sparsemx_Reuse(sxv); //p7_filtermx_Reuse(ox); p7_refmx_Reuse(rmx); esl_sq_Reuse(sq); p7_trace_Reuse(reftr); p7_trace_Reuse(testtr); } p7_oprofile_Destroy(aom); p7_profile_Destroy(ggm); p7_profile_Destroy(agm); p7_hmm_Destroy(ghmm); p7_hmm_Destroy(ahmm); } /* we leave the loop with <status> set by a p7_hmmfile_Read() on ghfp; if all is well, status=eslEOF */ if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ghfp->fname, ghfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ghfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslEOF) p7_Fail("Unexpected error in reading HMMs from %s\n", ghfp->fname); p7_trace_metrics_Dump(stdout, tmetrics); p7_hmmfile_Close(ghfp); p7_hmmfile_Close(ahfp); // p7_filtermx_Destroy(ox); p7_sparsemask_Destroy(sm); p7_sparsemx_Destroy(sxv); p7_refmx_Destroy(rmx); p7_trace_metrics_Destroy(tmetrics); p7_trace_Destroy(testtr); p7_trace_Destroy(reftr); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); float nu = esl_opt_GetReal(go, "--nu"); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_GMX *fwd = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; int format = eslSQFILE_UNKNOWN; float sc, nullsc, seqscore, P; int status; /* Read in one HMM */ if (p7_hmmfile_Open(hmmfile, NULL, &hfp) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Open sequence file */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); /* Allocate matrix */ fwd = p7_gmx_Create(gm->M, sq->n); while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { p7_ReconfigLength(gm, sq->n); p7_bg_SetLength(bg, sq->n); p7_gmx_GrowTo(fwd, gm->M, sq->n); /* Run MSV */ p7_GMSV(sq->dsq, sq->n, gm, fwd, nu, &sc); /* Calculate bit score and P-value using standard null1 model*/ p7_bg_NullOne (bg, sq->dsq, sq->n, &nullsc); seqscore = (sc - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(seqscore, gm->evparam[p7_MMU], gm->evparam[p7_MLAMBDA]); /* output suitable for direct use in profmark benchmark postprocessors: * <Pvalue> <bitscore> <target name> <query name> */ printf("%g\t%.2f\t%s\t%s\n", P, seqscore, sq->name, hmm->name); esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); /* Cleanup */ esl_sqfile_Close(sqfp); esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_gmx_Destroy(fwd); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
static int output_result(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens) { ESL_HISTOGRAM *h = NULL; int i; double tailp; double x10; double mu, lambda, E10; double mufix, E10fix; double mufix2, E10fix2; double E10p; double almean, alvar; /* alignment length mean and variance (optional output) */ double pmu, plambda; int status; /* fetch statistical params from HMM for expected distribution */ if (esl_opt_GetBoolean(go, "--vit")) { pmu = hmm->evparam[p7_VMU]; plambda = hmm->evparam[p7_VLAMBDA]; } else if (esl_opt_GetBoolean(go, "--msv")) { pmu = hmm->evparam[p7_MMU]; plambda = hmm->evparam[p7_MLAMBDA]; } else if (esl_opt_GetBoolean(go, "--fwd")) { pmu = hmm->evparam[p7_FTAU]; plambda = hmm->evparam[p7_FLAMBDA]; } /* Optional output of scores/alignment lengths: */ if (cfg->xfp) fwrite(scores, sizeof(double), cfg->N, cfg->xfp); if (cfg->alfp) for (i = 0; i < cfg->N; i++) fprintf(cfg->alfp, "%d %.3f\n", alilens[i], scores[i]); if (esl_opt_GetBoolean(go, "-v")) for (i = 0; i < cfg->N; i++) printf("%.3f\n", scores[i]); /* optional "filter power" data file: <hmm name> <# seqs <= P threshold> <fraction of seqs <= P threshold> */ if (cfg->ffp) output_filter_power(go, cfg, errbuf, hmm, scores); /* Count the scores into a histogram object. */ if ((h = esl_histogram_CreateFull(-50., 50., 0.2)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "allocation failed"); for (i = 0; i < cfg->N; i++) esl_histogram_Add(h, scores[i]); /* For viterbi, MSV, and hybrid, fit data to a Gumbel, either with known lambda or estimated lambda. */ if (esl_opt_GetBoolean(go, "--vit") || esl_opt_GetBoolean(go, "--msv")) { esl_histogram_GetRank(h, 10, &x10); tailp = 1.0; /* mu, lambda, E10 fields are for ML Gumbel fit to the observed data */ if (esl_gumbel_FitComplete(scores, cfg->N, &mu, &lambda) != eslOK) esl_fatal("gumbel complete data fit failed"); E10 = cfg->N * esl_gumbel_surv(x10, mu, lambda); /* mufix, E10fix fields: assume lambda = log2; fit an ML mu to the data */ if (esl_gumbel_FitCompleteLoc(scores, cfg->N, 0.693147, &mufix) != eslOK) esl_fatal("gumbel mu- (location-)only data fit failed for lambda = log2"); E10fix = cfg->N * esl_gumbel_surv(x10, mufix, 0.693147); /* mufix2, E10fix2 fields: assume H3's own lambda estimate; fit ML mu */ if (esl_gumbel_FitCompleteLoc(scores, cfg->N, plambda, &mufix2) != eslOK) esl_fatal("gumbel mu- (location-)only data fit failed for fitted lambda"); E10fix2 = cfg->N * esl_gumbel_surv(x10, mufix2, plambda); /* pmu, plambda, E10p: use H3 expectation estimates (pmu, plambda) */ E10p = cfg->N * esl_gumbel_surv(x10, pmu, plambda); fprintf(cfg->ofp, "%-20s %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f", hmm->name, tailp, mu, lambda, E10, mufix, E10fix, mufix2, E10fix2, pmu, plambda, E10p); if (esl_opt_GetBoolean(go, "-a")) { esl_stats_IMean(alilens, cfg->N, &almean, &alvar); fprintf(cfg->ofp, " %8.4f %8.4f\n", almean, sqrt(alvar)); } else fprintf(cfg->ofp, "\n"); if (cfg->survfp != NULL) { double xmax = esl_opt_IsOn(go, "--xmax") ? esl_opt_GetReal(go, "--xmax") : h->xmax + 5.; esl_histogram_PlotSurvival(cfg->survfp, h); esl_gumbel_Plot(cfg->survfp, pmu, plambda, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); esl_gumbel_Plot(cfg->survfp, mu, lambda, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); esl_gumbel_Plot(cfg->survfp, mufix, 0.693147, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); } if (cfg->efp != NULL) { double x; fprintf(cfg->efp, "# %s\n", hmm->name); for (i = 1; i <= 1000 && i <= cfg->N; i++) { esl_histogram_GetRank(h, i, &x); fprintf(cfg->efp, "%d %g\n", i, cfg->N * esl_gumbel_surv(x, pmu, plambda)); } fprintf(cfg->efp, "&\n"); } } /* For Forward, fit tail to exponential tails, for a range of tail mass choices. */ else if (esl_opt_GetBoolean(go, "--fwd")) { double tmin = esl_opt_GetReal(go, "--tmin"); double tmax = esl_opt_GetReal(go, "--tmax"); double tpoints = (double) esl_opt_GetInteger(go, "--tpoints"); int do_linear = esl_opt_GetBoolean(go, "--tlinear"); double *xv; double tau; int n; esl_histogram_GetRank(h, 10, &x10); tailp = tmin; do { if (tailp > 1.0) tailp = 1.0; esl_histogram_GetTailByMass(h, tailp, &xv, &n, NULL); if (esl_exp_FitComplete(xv, n, &mu, &lambda) != eslOK) esl_fatal("exponential fit failed"); E10 = cfg->N * tailp * esl_exp_surv(x10, mu, lambda); mufix = mu; E10fix = cfg->N * tailp * esl_exp_surv(x10, mu, 0.693147); E10p = cfg->N * esl_exp_surv(x10, pmu, plambda); /* the pmu is relative to a P=1.0 tail origin. */ tau = mu + log(tailp) / lambda; fprintf(cfg->ofp, "%-20s %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f\n", hmm->name, tailp, mu, lambda, E10, mufix, E10fix, pmu, plambda, E10p); if (tpoints == 1) break; else if (do_linear) tailp += (tmax-tmin) / (tpoints-1); else tailp *= exp(log(tmax/tmin) / (tpoints-1)); } while (tailp <= tmax+1e-7); if (cfg->survfp) { double xmax = esl_opt_IsOn(go, "--xmax") ? esl_opt_GetReal(go, "--xmax") : h->xmax + 5.; esl_histogram_PlotSurvival(cfg->survfp, h); esl_exp_Plot(cfg->survfp, pmu, plambda, esl_exp_surv, pmu, xmax, 0.1); esl_exp_Plot(cfg->survfp, tau, lambda, esl_exp_surv, tau, xmax, 0.1); esl_exp_Plot(cfg->survfp, tau, 0.693147, esl_exp_surv, tau, xmax, 0.1); } if (cfg->efp != NULL) { double x; fprintf(cfg->efp, "# %s\n", hmm->name); for (i = 1; i <= 1000 && i <= cfg->N; i++) { esl_histogram_GetRank(h, i, &x); fprintf(cfg->efp, "%d %g\n", i, cfg->N * esl_exp_surv(x, pmu, plambda)); } fprintf(cfg->efp, "&\n"); } } /* fallthrough: both normal, error cases execute same cleanup code */ status = eslOK; ERROR: esl_histogram_Destroy(h); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; int nsmall, nbig; int i; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("%s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE){ puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) { if (msa->wgt[i] < 0.2) nsmall++; if (msa->wgt[i] > 5.0) nbig++; } printf("%-20s %5d %5d %8.4f %8.4f %5d %5d\n", msa->name, msa->nseq, msa->alen, esl_vec_DMin(msa->wgt, msa->nseq), esl_vec_DMax(msa->wgt, msa->nseq), nsmall, nbig); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); return eslOK; }
int main(int argc, char **argv) { ESL_STOPWATCH *w; ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; int do_gsc; int do_pb; int do_blosum; int maxN; double maxid; double cpu; int status; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("failed to parse cmd line: %s", go->errbuf); esl_getopts_Destroy(go); w = esl_stopwatch_Create(); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } esl_stopwatch_Start(w); if (do_gsc) esl_msaweight_GSC(msa); else if (do_pb) esl_msaweight_PB(msa); else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid); esl_stopwatch_Stop(w); cpu = w->user; printf("%-20s %6d %6d %.3f\n", msa->name, msa->alen, msa->nseq, cpu); esl_msa_Destroy(msa); } eslx_msafile_Close(afp); esl_stopwatch_Destroy(w); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; float *sqd; int status; int nbad; int nali = 0; int nbadali = 0; int nwgt = 0; int nbadwgt = 0; int i; int be_quiet; int do_gsc; int do_pb; int do_blosum; double maxid; double tol; int maxN; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } be_quiet = esl_opt_GetBoolean(go, "-q"); do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); tol = esl_opt_GetReal (go, "--tol"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } msafile = esl_opt_GetArg(go, 1); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } nali++; nwgt += msa->nseq; ESL_ALLOC(sqd, sizeof(float) * msa->nseq); if (do_gsc) { esl_msaweight_GSC(msa); GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_pb) { esl_msaweight_PB(msa); PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_blosum) { esl_msaweight_BLOSUM(msa, maxid); BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd); /* workaround SQUID bug: BLOSUM weights weren't renormalized to sum to nseq. */ esl_vec_FNorm (sqd, msa->nseq); esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq); } if (! be_quiet) { for (i = 0; i < msa->nseq; i++) fprintf(stdout, "%-20s %.3f %.3f\n", msa->sqname[i], msa->wgt[i], sqd[i]); } nbad = 0; for (i = 0; i < msa->nseq; i++) if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) nbad++; if (nbad > 0) nbadali++; nbadwgt += nbad; if (nbad > 0) printf("%-20s :: alignment shows %d weights that differ (out of %d) \n", msa->name, nbad, msa->nseq); esl_msa_Destroy(msa); free(sqd); } eslx_msafile_Close(afp); if (nbadali == 0) printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali); else { printf("%d of %d weights mismatched at (> %f fractional difference)\n", nbadwgt, nwgt, tol); printf("involving %d of %d total alignments\n", nbadali, nali); } return eslOK; ERROR: return status; }
static int output_header(FILE *ofp, const ESL_GETOPTS *go) { p7_banner(ofp, go->argv[0], banner); if (esl_opt_IsUsed(go, "--eval2score")) { if ( fprintf(ofp, "# show score required to reach E-value: %.2g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--score2eval")) { if ( fprintf(ofp, "# show E-value corresponding to score: %.2g\n", esl_opt_GetReal(go, "-S")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--eval2score") || esl_opt_IsUsed(go, "--score2eval")) { if (esl_opt_IsUsed(go, "--baseZ") ) { if ( fprintf(ofp, "# using base count (search both strands): %d Mb\n", esl_opt_GetInteger(go, "--baseZ")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { if ( fprintf(ofp, "# using base count (search single strand): %d Mb\n", esl_opt_GetInteger(go, "--baseZ1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } else { if ( fprintf(ofp, "# using database sequence count: %d\n", esl_opt_GetInteger(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } } if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; float nseq; int do_eval2score = 0; int do_score2eval = 0; int z_val; float e_val; float s_val; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read <hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } output_header(stdout, go); if ( esl_opt_IsOn(go, "--eval2score") ) { do_eval2score = TRUE; e_val = esl_opt_GetReal(go, "-E"); } else if ( esl_opt_IsOn(go, "--score2eval") ) { do_score2eval = TRUE; s_val = esl_opt_GetReal(go, "-S"); } else if ( esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1") || esl_opt_IsUsed(go, "-Z") ) { puts("The flags -Z, --baseZ, and --baseZ1 are for use with --eval2score and --score2eval."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_IsUsed(go, "--baseZ") ) { z_val = 1000000 * 2 * (long)(esl_opt_GetInteger(go, "--baseZ")); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { z_val = 1000000 * (long)(esl_opt_GetInteger(go, "--baseZ1")); } else { z_val = esl_opt_GetInteger(go, "-Z"); } /* Initializations: open the HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); if (do_eval2score) printf (" %6s %6.2g", "sc for", e_val); if (do_score2eval) printf (" %6s %6.2f", "E-val for", s_val); printf("\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); if (do_eval2score) printf (" %13s", "-------------"); if (do_score2eval) printf (" %13s", "-------------"); printf("\n"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if ( esl_opt_IsOn(go, "--eval2score") || esl_opt_IsOn(go, "--score2eval") ) { if (esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1" ) ) { if ( hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --baseZ and --baseZ1 can't be used with non-nucleotide models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } else if ( hmm->abc->type != eslAMINO && hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --eval2score and --score2eval can't be used with non-sequence models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } if (esl_opt_IsUsed(go, "--baseZ") ) { nseq = (float)z_val / (float)(hmm->max_length); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { nseq = (float)z_val / (float)(hmm->max_length); } else { nseq = z_val; } if (bg == NULL) bg = p7_bg_Create(abc); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); if ( do_eval2score ) { float sc; sc = esl_exp_invsurv( e_val / nseq , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2f", sc); } else if ( do_score2eval) { float e; e = nseq * esl_exp_surv( s_val , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2g", e); } printf("\n"); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); double mu = esl_opt_GetReal (go, "-m"); double lambda = esl_opt_GetReal (go, "-l"); double tau = esl_opt_GetReal (go, "-t"); int n = esl_opt_GetInteger(go, "-n"); double binwidth = esl_opt_GetReal (go, "-w"); int plot_cdf = esl_opt_GetBoolean(go, "--cdf"); int plot_logcdf = esl_opt_GetBoolean(go, "--logcdf"); int plot_pdf = esl_opt_GetBoolean(go, "--pdf"); int plot_logpdf = esl_opt_GetBoolean(go, "--logpdf"); int plot_surv = esl_opt_GetBoolean(go, "--surv"); int plot_logsurv = esl_opt_GetBoolean(go, "--logsurv"); int be_verbose = esl_opt_GetBoolean(go, "-v"); char *plotfile = esl_opt_GetString (go, "-o"); ESL_HISTOGRAM *h = NULL; int xmin_set = esl_opt_IsOn(go, "--xL"); double xmin = xmin_set ? esl_opt_GetReal(go, "--xL") : mu; int xmax_set = esl_opt_IsOn(go, "--xH"); double xmax = xmax_set ? esl_opt_GetReal(go, "--xH") : mu+40*(1./lambda); int xstep_set = esl_opt_IsOn(go, "--xH"); double xstep = xstep_set ? esl_opt_GetReal(go, "--xS") : 0.1; FILE *pfp = stdout; double emu, elambda, etau; int i; double x; double *data; int ndata; fprintf(stderr, "## %s\n", argv[0]); fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng)); if (be_verbose) printf("Parametric: mu = %f lambda = %f tau = %f\n", mu, lambda, tau); h = esl_histogram_CreateFull(mu, 100., binwidth); if (plotfile && (pfp = fopen(plotfile, "w")) == NULL) ESL_EXCEPTION(eslFAIL, "Failed to open plotfile"); for (i = 0; i < n; i++) { x = esl_wei_Sample(rng, mu, lambda, tau); esl_histogram_Add(h, x); } esl_histogram_GetData(h, &data, &ndata); esl_wei_FitComplete(data, ndata, &emu, &elambda, &etau); if (be_verbose) printf("Complete data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) ESL_EXCEPTION(eslFAIL, "Error in (complete) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (complete) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (complete) fitted tau > 10%\n"); esl_wei_FitCompleteBinned(h, &emu, &elambda, &etau); if (be_verbose) printf("Binned data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) ESL_EXCEPTION(eslFAIL, "Error in (binned) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (binned) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (binned) fitted lambda > 10%\n"); if (plot_pdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_pdf, xmin, xmax, xstep); if (plot_logpdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_logpdf, xmin, xmax, xstep); if (plot_cdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_cdf, xmin, xmax, xstep); if (plot_logcdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_logcdf, xmin, xmax, xstep); if (plot_surv) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_surv, xmin, xmax, xstep); if (plot_logsurv) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_logsurv, xmin, xmax, xstep); if (plotfile) fclose(pfp); esl_histogram_Destroy(h); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); fprintf(stderr, "# status = ok\n"); return 0; }
static int output_header(const ESL_GETOPTS *go, const struct cfg_s *cfg) { if (cfg->my_rank > 0) return eslOK; /* if (! cfg->be_verbose) return eslOK; */ p7_banner(cfg->ofp, go->argv[0], banner); fprintf(cfg->ofp, "# input alignment file: %s\n", cfg->alifile); fprintf(cfg->ofp, "# output HMM file: %s\n", cfg->hmmfile); if (esl_opt_IsUsed(go, "-n")) fprintf(cfg->ofp, "# name (the single) HMM: %s\n", esl_opt_GetString(go, "-n")); if (esl_opt_IsUsed(go, "-o")) fprintf(cfg->ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")); if (esl_opt_IsUsed(go, "-O")) fprintf(cfg->ofp, "# processed alignment resaved to: %s\n", esl_opt_GetString(go, "-O")); if (esl_opt_IsUsed(go, "--amino")) fprintf(cfg->ofp, "# input alignment is asserted as: protein\n"); if (esl_opt_IsUsed(go, "--dna")) fprintf(cfg->ofp, "# input alignment is asserted as: DNA\n"); if (esl_opt_IsUsed(go, "--rna")) fprintf(cfg->ofp, "# input alignment is asserted as: RNA\n"); if (esl_opt_IsUsed(go, "--fast")) fprintf(cfg->ofp, "# model architecture construction: fast/heuristic\n"); if (esl_opt_IsUsed(go, "--hand")) fprintf(cfg->ofp, "# model architecture construction: hand-specified by RF annotation\n"); if (esl_opt_IsUsed(go, "--symfrac")) fprintf(cfg->ofp, "# sym fraction for model structure: %.3f\n", esl_opt_GetReal(go, "--symfrac")); if (esl_opt_IsUsed(go, "--fragthresh"))fprintf(cfg->ofp, "# seq called fragment if < xL : %.3f\n", esl_opt_GetReal(go, "--fragthresh")); if (esl_opt_IsUsed(go, "--wpb")) fprintf(cfg->ofp, "# relative weighting scheme: Henikoff PB\n"); if (esl_opt_IsUsed(go, "--wgsc")) fprintf(cfg->ofp, "# relative weighting scheme: G/S/C\n"); if (esl_opt_IsUsed(go, "--wblosum")) fprintf(cfg->ofp, "# relative weighting scheme: BLOSUM filter\n"); if (esl_opt_IsUsed(go, "--wnone")) fprintf(cfg->ofp, "# relative weighting scheme: none\n"); if (esl_opt_IsUsed(go, "--wid")) fprintf(cfg->ofp, "# frac id cutoff for BLOSUM wgts: %f\n", esl_opt_GetReal(go, "--wid")); if (esl_opt_IsUsed(go, "--eent")) fprintf(cfg->ofp, "# effective seq number scheme: entropy weighting\n"); if (esl_opt_IsUsed(go, "--eclust")) fprintf(cfg->ofp, "# effective seq number scheme: single linkage clusters\n"); if (esl_opt_IsUsed(go, "--enone")) fprintf(cfg->ofp, "# effective seq number scheme: none\n"); if (esl_opt_IsUsed(go, "--eset")) fprintf(cfg->ofp, "# effective seq number: set to %f\n", esl_opt_GetReal(go, "--eset")); if (esl_opt_IsUsed(go, "--ere") ) fprintf(cfg->ofp, "# minimum rel entropy target: %f bits\n", esl_opt_GetReal(go, "--ere")); if (esl_opt_IsUsed(go, "--esigma") ) fprintf(cfg->ofp, "# entropy target sigma parameter: %f bits\n", esl_opt_GetReal(go, "--esigma")); if (esl_opt_IsUsed(go, "--eid") ) fprintf(cfg->ofp, "# frac id cutoff for --eclust: %f\n", esl_opt_GetReal(go, "--eid")); if (esl_opt_IsUsed(go, "--EmL") ) fprintf(cfg->ofp, "# seq length for MSV Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EmL")); if (esl_opt_IsUsed(go, "--EmN") ) fprintf(cfg->ofp, "# seq number for MSV Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EmN")); if (esl_opt_IsUsed(go, "--EvL") ) fprintf(cfg->ofp, "# seq length for Vit Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EvL")); if (esl_opt_IsUsed(go, "--EvN") ) fprintf(cfg->ofp, "# seq number for Vit Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EvN")); if (esl_opt_IsUsed(go, "--EfL") ) fprintf(cfg->ofp, "# seq length for Fwd exp tau fit: %d\n", esl_opt_GetInteger(go, "--EfL")); if (esl_opt_IsUsed(go, "--EfN") ) fprintf(cfg->ofp, "# seq number for Fwd exp tau fit: %d\n", esl_opt_GetInteger(go, "--EfN")); if (esl_opt_IsUsed(go, "--Eft") ) fprintf(cfg->ofp, "# tail mass for Fwd exp tau fit: %f\n", esl_opt_GetReal(go, "--Eft")); #ifdef HMMER_THREADS if (esl_opt_IsUsed(go, "--cpu")) fprintf(cfg->ofp, "# number of worker threads: %d\n", esl_opt_GetInteger(go, "--cpu")); #endif #ifdef HAVE_MPI if (esl_opt_IsUsed(go, "--mpi") ) fprintf(cfg->ofp, "# parallelization mode: MPI\n"); #endif if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed") == 0) fprintf(cfg->ofp,"# random number seed: one-time arbitrary\n"); else fprintf(cfg->ofp,"# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")); } if (esl_opt_IsUsed(go, "--laplace") ) fprintf(cfg->ofp, "# prior: Laplace +1\n"); fprintf(cfg->ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); return eslOK; }
static int output_header(const ESL_GETOPTS *go, FILE *ofp, char *alifile, char *postmsafile) { p7_banner(ofp, go->argv[0], banner); if (fprintf(ofp, "# input alignment file: %s\n", alifile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { if (fprintf(ofp, "# output alignment file: %s\n", postmsafile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--alirange") && fprintf(ofp, "# alignment range: %s\n", esl_opt_GetString(go, "--alirange")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--modelrange") && fprintf(ofp, "# model range: %s\n", esl_opt_GetString(go, "--modelrange"))< 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--apendmask") && fprintf(ofp, "# add to existing mask: [on]\n" )< 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--model2ali") && fprintf(ofp, "# ali ranges for model range: %s\n", esl_opt_GetString(go, "--model2ali")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--ali2model") && fprintf(ofp, "# model ranges for ali range: %s\n", esl_opt_GetString(go, "--ali2model")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--amino") && fprintf(ofp, "# input alignment is asserted as: protein\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--dna") && fprintf(ofp, "# input alignment is asserted as: DNA\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--rna") && fprintf(ofp, "# input alignment is asserted as: RNA\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--fast") && fprintf(ofp, "# model architecture construction: fast/heuristic\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--hand") && fprintf(ofp, "# model architecture construction: hand-specified by RF annotation\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--symfrac") && fprintf(ofp, "# sym fraction for model structure: %.3f\n", esl_opt_GetReal(go, "--symfrac")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--fragthresh") && fprintf(ofp, "# seq called frag if L <= x*alen: %.3f\n", esl_opt_GetReal(go, "--fragthresh")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wpb") && fprintf(ofp, "# relative weighting scheme: Henikoff PB\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wgsc") && fprintf(ofp, "# relative weighting scheme: G/S/C\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wblosum") && fprintf(ofp, "# relative weighting scheme: BLOSUM filter\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wnone") && fprintf(ofp, "# relative weighting scheme: none\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wid") && fprintf(ofp, "# frac id cutoff for BLOSUM wgts: %f\n", esl_opt_GetReal(go, "--wid")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed") == 0 && fprintf(ofp,"# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); else if ( fprintf(ofp,"# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
static void search_thread(void *arg) { int i; int count; int seed; int status; int workeridx; WORKER_INFO *info; ESL_THREADS *obj; ESL_SQ dbsq; ESL_STOPWATCH *w = NULL; /* timing stopwatch */ P7_BUILDER *bld = NULL; /* HMM construction configuration */ P7_BG *bg = NULL; /* null model */ P7_PIPELINE *pli = NULL; /* work pipeline */ P7_TOPHITS *th = NULL; /* top hit results */ P7_PROFILE *gm = NULL; /* generic model */ P7_OPROFILE *om = NULL; /* optimized query profile */ obj = (ESL_THREADS *) arg; esl_threads_Started(obj, &workeridx); info = (WORKER_INFO *) esl_threads_GetData(obj, workeridx); w = esl_stopwatch_Create(); bg = p7_bg_Create(info->abc); esl_stopwatch_Start(w); /* set up the dummy description and accession fields */ dbsq.desc = ""; dbsq.acc = ""; /* process a query sequence or hmm */ if (info->seq != NULL) { bld = p7_builder_Create(NULL, info->abc); if ((seed = esl_opt_GetInteger(info->opts, "--seed")) > 0) { esl_randomness_Init(bld->r, seed); bld->do_reseeding = TRUE; } bld->EmL = esl_opt_GetInteger(info->opts, "--EmL"); bld->EmN = esl_opt_GetInteger(info->opts, "--EmN"); bld->EvL = esl_opt_GetInteger(info->opts, "--EvL"); bld->EvN = esl_opt_GetInteger(info->opts, "--EvN"); bld->EfL = esl_opt_GetInteger(info->opts, "--EfL"); bld->EfN = esl_opt_GetInteger(info->opts, "--EfN"); bld->Eft = esl_opt_GetReal (info->opts, "--Eft"); if (esl_opt_IsOn(info->opts, "--mxfile")) status = p7_builder_SetScoreSystem (bld, esl_opt_GetString(info->opts, "--mxfile"), NULL, esl_opt_GetReal(info->opts, "--popen"), esl_opt_GetReal(info->opts, "--pextend"), bg); else status = p7_builder_LoadScoreSystem(bld, esl_opt_GetString(info->opts, "--mx"), esl_opt_GetReal(info->opts, "--popen"), esl_opt_GetReal(info->opts, "--pextend"), bg); if (status != eslOK) { //client_error(info->sock, status, "hmmgpmd: failed to set single query sequence score system: %s", bld->errbuf); fprintf(stderr, "hmmpgmd: failed to set single query sequence score system: %s", bld->errbuf); pthread_exit(NULL); return; } p7_SingleBuilder(bld, info->seq, bg, NULL, NULL, NULL, &om); /* bypass HMM - only need model */ p7_builder_Destroy(bld); } else { gm = p7_profile_Create (info->hmm->M, info->abc); om = p7_oprofile_Create(info->hmm->M, info->abc); p7_ProfileConfig(info->hmm, bg, gm, 100, p7_LOCAL); p7_oprofile_Convert(gm, om); } /* Create processing pipeline and hit list */ th = p7_tophits_Create(); pli = p7_pipeline_Create(info->opts, om->M, 100, FALSE, p7_SEARCH_SEQS); p7_pli_NewModel(pli, om, bg); if (pli->Z_setby == p7_ZSETBY_NTARGETS) pli->Z = info->db_Z; /* loop until all sequences have been processed */ count = 1; while (count > 0) { int inx; int blksz; HMMER_SEQ **sq; /* grab the next block of sequences */ if (pthread_mutex_lock(info->inx_mutex) != 0) p7_Fail("mutex lock failed"); inx = *info->inx; blksz = *info->blk_size; if (inx > *info->limit) { blksz /= 5; if (blksz < 1000) { *info->limit = info->sq_cnt * 2; } else { *info->limit = inx + (info->sq_cnt - inx) * 2 / 3; } } *info->blk_size = blksz; *info->inx += blksz; if (pthread_mutex_unlock(info->inx_mutex) != 0) p7_Fail("mutex unlock failed"); sq = info->sq_list + inx; count = info->sq_cnt - inx; if (count > blksz) count = blksz; /* Main loop: */ for (i = 0; i < count; ++i, ++sq) { if ( !(info->range_list) || hmmpgmd_IsWithinRanges ((*sq)->idx, info->range_list)) { dbsq.name = (*sq)->name; dbsq.dsq = (*sq)->dsq; dbsq.n = (*sq)->n; dbsq.idx = (*sq)->idx; if((*sq)->desc != NULL) dbsq.desc = (*sq)->desc; p7_bg_SetLength(bg, dbsq.n); p7_oprofile_ReconfigLength(om, dbsq.n); p7_Pipeline(pli, om, bg, &dbsq, th); p7_pipeline_Reuse(pli); } } } /* make available the pipeline objects to the main thread */ info->th = th; info->pli = pli; /* clean up */ p7_bg_Destroy(bg); p7_oprofile_Destroy(om); if (gm != NULL) p7_profile_Destroy(gm); esl_stopwatch_Stop(w); info->elapsed = w->elapsed; esl_stopwatch_Destroy(w); esl_threads_Finished(obj, workeridx); pthread_exit(NULL); return; }
/* Function: p7_builder_Create() * Synopsis: Create a default HMM construction configuration. * * Purpose: Create a construction configuration for building * HMMs in alphabet <abc>, and return a pointer to it. * * An application configuration <go> may optionally be * provided. If <go> is <NULL>, default parameters are * used. If <go> is non-<NULL>, it must include appropriate * settings for all of the following ``standard build options'': * * Model construction: --fast --hand --symfrac --fragthresh * Relative weighting: --wgsc --wblosum --wpb --wgiven --wid * Effective seq #: --eent --eclust --enone --eset --ere --esigma --eid * Prior scheme: --pnone --plaplace * E-val calibration: --EmL --EmN --EvL --EvN --EfL --EfN --Eft * run-to-run variation: --seed * * See <hmmbuild.c> or other big users of the build * pipeline for an example of appropriate <ESL_GETOPTS> * initializations of these 24 options. */ P7_BUILDER * p7_builder_Create(const ESL_GETOPTS *go, const ESL_ALPHABET *abc) { P7_BUILDER *bld = NULL; int seed; int status; ESL_ALLOC(bld, sizeof(P7_BUILDER)); bld->prior = NULL; bld->r = NULL; bld->S = NULL; bld->Q = NULL; bld->eset = -1.0; /* -1.0 = unset; must be set if effn_strategy is p7_EFFN_SET */ bld->re_target = -1.0; if (go == NULL) { bld->arch_strategy = p7_ARCH_FAST; bld->wgt_strategy = p7_WGT_PB; bld->effn_strategy = p7_EFFN_ENTROPY; seed = 42; } else { if (esl_opt_GetBoolean(go, "--fast")) bld->arch_strategy = p7_ARCH_FAST; else if (esl_opt_GetBoolean(go, "--hand")) bld->arch_strategy = p7_ARCH_HAND; if (esl_opt_GetBoolean(go, "--wpb")) bld->wgt_strategy = p7_WGT_PB; else if (esl_opt_GetBoolean(go, "--wgsc")) bld->wgt_strategy = p7_WGT_GSC; else if (esl_opt_GetBoolean(go, "--wblosum")) bld->wgt_strategy = p7_WGT_BLOSUM; else if (esl_opt_GetBoolean(go, "--wnone")) bld->wgt_strategy = p7_WGT_NONE; else if (esl_opt_GetBoolean(go, "--wgiven")) bld->wgt_strategy = p7_WGT_GIVEN; if (esl_opt_GetBoolean(go, "--eent")) bld->effn_strategy = p7_EFFN_ENTROPY; else if (esl_opt_GetBoolean(go, "--eclust")) bld->effn_strategy = p7_EFFN_CLUST; else if (esl_opt_GetBoolean(go, "--enone")) bld->effn_strategy = p7_EFFN_NONE; else if (esl_opt_IsOn (go, "--eset")) { bld->effn_strategy = p7_EFFN_SET; bld->eset = esl_opt_GetReal(go, "--eset"); } seed = esl_opt_GetInteger(go, "--seed"); } bld->max_insert_len = 0; /* The default RE target is alphabet dependent. */ if (go != NULL && esl_opt_IsOn (go, "--ere")) bld->re_target = esl_opt_GetReal(go, "--ere"); else { switch (abc->type) { case eslAMINO: bld->re_target = p7_ETARGET_AMINO; break; case eslDNA: bld->re_target = p7_ETARGET_DNA; break; case eslRNA: bld->re_target = p7_ETARGET_DNA; break; default: bld->re_target = p7_ETARGET_OTHER; break; } } bld->symfrac = (go != NULL) ? esl_opt_GetReal (go, "--symfrac") : 0.5; bld->fragthresh = (go != NULL) ? esl_opt_GetReal (go, "--fragthresh") : 0.5; bld->wid = (go != NULL) ? esl_opt_GetReal (go, "--wid") : 0.62; bld->esigma = (go != NULL) ? esl_opt_GetReal (go, "--esigma") : 45.0; bld->eid = (go != NULL) ? esl_opt_GetReal (go, "--eid") : 0.62; bld->EmL = (go != NULL) ? esl_opt_GetInteger(go, "--EmL") : 200; bld->EmN = (go != NULL) ? esl_opt_GetInteger(go, "--EmN") : 200; bld->EvL = (go != NULL) ? esl_opt_GetInteger(go, "--EvL") : 200; bld->EvN = (go != NULL) ? esl_opt_GetInteger(go, "--EvN") : 200; bld->EfL = (go != NULL) ? esl_opt_GetInteger(go, "--EfL") : 100; bld->EfN = (go != NULL) ? esl_opt_GetInteger(go, "--EfN") : 200; bld->Eft = (go != NULL) ? esl_opt_GetReal (go, "--Eft") : 0.04; /* Normally we reinitialize the RNG to original seed before calibrating each model. * This eliminates run-to-run variation. * As a special case, seed==0 means choose an arbitrary seed and shut off the * reinitialization; this allows run-to-run variation. */ bld->r = esl_randomness_CreateFast(seed); bld->do_reseeding = (seed == 0) ? FALSE : TRUE; if (go && esl_opt_GetBoolean(go, "--pnone") ) bld->prior = NULL; else if (go && esl_opt_GetBoolean(go, "--plaplace") ) bld->prior = p7_prior_CreateLaplace(abc); else { switch (abc->type) { case eslAMINO: bld->prior = p7_prior_CreateAmino(); break; case eslDNA: bld->prior = p7_prior_CreateNucleic(); break; case eslRNA: bld->prior = p7_prior_CreateNucleic(); break; default: bld->prior = p7_prior_CreateLaplace(abc); break; } if (bld->prior == NULL) goto ERROR; } bld->abc = abc; bld->errbuf[0] = '\0'; bld->popen = -1; bld->pextend = -1; return bld; ERROR: p7_builder_Destroy(bld); return NULL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESL_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); /* Open the MSA file; determine alphabet */ status = esl_msafile_Open(alifile, alifmt, NULL, &afp); if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile); else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of alignment %s\n", alifile); else if (status != eslOK) esl_fatal("Alignment file open failed with error %d\n", status); if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); else { int type; status = esl_msafile_GuessAlphabet(afp, &type); if (status == eslEAMBIGUOUS) esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile); else if (status == eslEFORMAT) esl_fatal("Alignment file parse failed: %s\n", afp->errbuf); else if (status == eslENODATA) esl_fatal("Alignment file %s is empty\n", alifile); else if (status != eslOK) esl_fatal("Failed to read alignment file %s\n", alifile); cfg.abc = esl_alphabet_Create(type); } esl_msafile_SetDigital(afp, cfg.abc); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = esl_msa_Read(afp, &origmsa)) == eslOK) { remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); ntestdom = esl_stack_ObjectCount(teststack); if (ntestdom >= 2) { esl_stack_Shuffle(cfg.r, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); esl_msa_MinimGaps(trainmsa, NULL, NULL); esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", afp->linenumber, afp->fname, afp->errbuf, afp->buf); else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status); else if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); if (nali > 0) synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); esl_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); double mu = esl_opt_GetReal(go, "-m"); double lambda = esl_opt_GetReal(go, "-l"); double tau = esl_opt_GetReal(go, "-t"); ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); ESL_HISTOGRAM *h = esl_histogram_CreateFull(mu, 100., esl_opt_GetReal(go, "-w"));; int n = esl_opt_GetInteger(go, "-n"); int be_verbose = esl_opt_GetBoolean(go, "-v"); char *plotfile = esl_opt_GetString(go, "-o"); FILE *pfp = stdout; int plot_pdf = esl_opt_GetBoolean(go, "--P"); int plot_logpdf = esl_opt_GetBoolean(go, "--LP"); int plot_cdf = esl_opt_GetBoolean(go, "--C"); int plot_logcdf = esl_opt_GetBoolean(go, "--LC"); int plot_surv = esl_opt_GetBoolean(go, "--S"); int plot_logsurv = esl_opt_GetBoolean(go, "--LS"); double xmin = esl_opt_IsOn(go, "--XL") ? esl_opt_GetReal(go, "--XL") : mu; double xmax = esl_opt_IsOn(go, "--XH") ? esl_opt_GetReal(go, "--XH") : mu+40*(1./lambda); double xstep = esl_opt_IsOn(go, "--XS") ? esl_opt_GetReal(go, "--XS") : 0.1; double emu, elambda, etau; int i; double x; double *data; int ndata; if (be_verbose) printf("Parametric: mu = %f lambda = %f tau = %f\n", mu, lambda, tau); if (plotfile != NULL) { if ((pfp = fopen(plotfile, "w")) == NULL) esl_fatal("Failed to open plotfile"); } for (i = 0; i < n; i++) { x = esl_sxp_Sample(r, mu, lambda, tau); esl_histogram_Add(h, x); } esl_histogram_GetData(h, &data, &ndata); esl_sxp_FitComplete(data, ndata, &emu, &elambda, &etau); if (be_verbose) printf("Complete data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) esl_fatal("Error in (complete) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) esl_fatal("Error in (complete) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) esl_fatal("Error in (complete) fitted tau > 10%\n"); esl_sxp_FitCompleteBinned(h, &emu, &elambda, &etau); if (be_verbose) printf("Binned data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) esl_fatal("Error in (binned) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) esl_fatal("Error in (binned) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) esl_fatal("Error in (binned) fitted tau > 10%\n"); if (plot_pdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_pdf, xmin, xmax, xstep); if (plot_logpdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_logpdf, xmin, xmax, xstep); if (plot_cdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_cdf, xmin, xmax, xstep); if (plot_logcdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_logcdf, xmin, xmax, xstep); if (plot_surv) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_surv, xmin, xmax, xstep); if (plot_logsurv) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_logsurv, xmin, xmax, xstep); if (plotfile != NULL) fclose(pfp); esl_histogram_Destroy(h); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). * Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. * afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if (esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
static int output_header(FILE *ofp, const ESL_GETOPTS *go, char *hmmfile, char *seqfile) { p7_banner(ofp, go->argv[0], banner); if (fprintf(ofp, "# query HMM file: %s\n", hmmfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# target sequence database: %s\n", seqfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-A") && fprintf(ofp, "# MSA of all hits saved to file: %s\n", esl_opt_GetString(go, "-A")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--tblout") && fprintf(ofp, "# per-seq hits tabular output: %s\n", esl_opt_GetString(go, "--tblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domtblout") && fprintf(ofp, "# per-dom hits tabular output: %s\n", esl_opt_GetString(go, "--domtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--pfamtblout") && fprintf(ofp, "# pfam-style tabular hit output: %s\n", esl_opt_GetString(go, "--pfamtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--acc") && fprintf(ofp, "# prefer accessions over names: yes\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-E") && fprintf(ofp, "# sequence reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-T") && fprintf(ofp, "# sequence reporting threshold: score >= %g\n", esl_opt_GetReal(go, "-T")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domE") && fprintf(ofp, "# domain reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "--domE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domT") && fprintf(ofp, "# domain reporting threshold: score >= %g\n", esl_opt_GetReal(go, "--domT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incE") && fprintf(ofp, "# sequence inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incT") && fprintf(ofp, "# sequence inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incdomE") && fprintf(ofp, "# domain inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incdomE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incdomT") && fprintf(ofp, "# domain inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incdomT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_ga") && fprintf(ofp, "# model-specific thresholding: GA cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_nc") && fprintf(ofp, "# model-specific thresholding: NC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_tc") && fprintf(ofp, "# model-specific thresholding: TC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--max") && fprintf(ofp, "# Max sensitivity mode: on [all heuristic filters off]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F1") && fprintf(ofp, "# MSV filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F2") && fprintf(ofp, "# Vit filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F3") && fprintf(ofp, "# Fwd filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nobias") && fprintf(ofp, "# biased composition HMM filter: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--restrictdb_stkey") && fprintf(ofp, "# Restrict db to start at seq key: %s\n", esl_opt_GetString(go, "--restrictdb_stkey")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--restrictdb_n") && fprintf(ofp, "# Restrict db to # target seqs: %d\n", esl_opt_GetInteger(go, "--restrictdb_n")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--ssifile") && fprintf(ofp, "# Override ssi file to: %s\n", esl_opt_GetString(go, "--ssifile")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nonull2") && fprintf(ofp, "# null2 bias corrections: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-Z") && fprintf(ofp, "# sequence search space set to: %.0f\n", esl_opt_GetReal(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domZ") && fprintf(ofp, "# domain search space set to: %.0f\n", esl_opt_GetReal(go, "--domZ")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed") == 0 && fprintf(ofp, "# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); else if ( fprintf(ofp, "# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--tformat") && fprintf(ofp, "# targ <seqfile> format asserted: %s\n", esl_opt_GetString(go, "--tformat")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }