/* The esl_random() unit test: * a binned frequency test. */ static void utest_random(long seed, int n, int nbins, int be_verbose) { ESL_RANDOMNESS *r = NULL; int *counts = NULL; double X2p = 0.; int i; double X2, exp, diff; if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed"); esl_vec_ISet(counts, nbins, 0); /* This contrived call sequence exercises CreateTimeseeded() and * Init(), while leaving us a reproducible chain. Because it's * reproducible, we know this test succeeds, despite being * statistical in nature. */ if ((r = esl_randomness_CreateTimeseeded()) == NULL) esl_fatal("randomness create failed"); if (esl_randomness_Init(r, seed) != eslOK) esl_fatal("randomness init failed"); for (i = 0; i < n; i++) counts[esl_rnd_Roll(r, nbins)]++; /* X^2 value: \sum (o_i - e_i)^2 / e_i */ for (X2 = 0., i = 0; i < nbins; i++) { exp = (double) n / (double) nbins; diff = (double) counts[i] - exp; X2 += diff*diff/exp; } if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed"); if (be_verbose) printf("random(): \t%g\n", X2p); if (X2p < 0.01) esl_fatal("chi squared test failed"); esl_randomness_Destroy(r); free(counts); return; }
/* Function: p7_Calibrate() * Synopsis: Calibrate the E-value parameters of a model. * Incept: SRE, Thu Dec 25 09:29:31 2008 [Magallon] * * Purpose: Calibrate the E-value parameters of a model with * one calculation ($\lambda$) and two brief simulations * (Viterbi $\mu$, Forward $\tau$). * * Args: hmm - HMM to be calibrated * cfg_b - OPTCFG: ptr to optional build configuration; * if <NULL>, use default parameters. * byp_rng - BYPASS optimization: pass ptr to <ESL_RANDOMNESS> generator * if already known; * <*byp_rng> == NULL> if <rng> return is desired; * pass <NULL> to use and discard internal default. * byp_bg - BYPASS optimization: pass ptr to <P7_BG> if already known; * <*byp_bg == NULL> if <bg> return is desired; * pass <NULL> to use and discard internal default. * byp_gm - BYPASS optimization: pass ptr to <gm> profile if already known; * pass <*byp_gm == NULL> if <gm> return desired; * pass <NULL> to use and discard internal default. * byp_om - BYPASS optimization: pass ptr to <om> profile if already known; * pass <*byp_om == NULL> if <om> return desired; * pass <NULL> to use and discard internal default. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failure. * <eslEINVAL> if <hmm>, <gm>, <om> aren't compatible somehow. * * Xref: J4/41 */ int p7_Calibrate(P7_HMM *hmm, P7_BUILDER *cfg_b, ESL_RANDOMNESS **byp_rng, P7_BG **byp_bg, P7_PROFILE **byp_gm, P7_OPROFILE **byp_om) { P7_BG *bg = (esl_byp_IsProvided(byp_bg) ? *byp_bg : NULL); P7_PROFILE *gm = (esl_byp_IsProvided(byp_gm) ? *byp_gm : NULL); P7_OPROFILE *om = (esl_byp_IsProvided(byp_om) ? *byp_om : NULL); ESL_RANDOMNESS *r = (esl_byp_IsProvided(byp_rng) ? *byp_rng : NULL); char *errbuf = ((cfg_b != NULL) ? cfg_b->errbuf : NULL); int EmL = ((cfg_b != NULL) ? cfg_b->EmL : 200); int EmN = ((cfg_b != NULL) ? cfg_b->EmN : 200); int EvL = ((cfg_b != NULL) ? cfg_b->EvL : 200); int EvN = ((cfg_b != NULL) ? cfg_b->EvN : 200); int EfL = ((cfg_b != NULL) ? cfg_b->EfL : 100); int EfN = ((cfg_b != NULL) ? cfg_b->EfN : 200); double Eft = ((cfg_b != NULL) ? cfg_b->Eft : 0.04); double lambda, mmu, vmu, tau; int status; /* Configure any objects we need * that weren't already passed to us as a bypass optimization */ if (r == NULL) { if ((r = esl_randomness_CreateFast(42)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "failed to create RNG"); } else if (cfg_b != NULL && cfg_b->do_reseeding) { esl_randomness_Init(r, esl_randomness_GetSeed(r)); } if (bg == NULL) { if ((bg = p7_bg_Create(hmm->abc)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "failed to allocate background"); } /* there's an odd case where the <om> is provided and a <gm> isn't going to be returned * where we don't need a <gm> at all, and <gm> stays <NULL> after the next block. * Note that the <EvL> length in the ProfileConfig doesn't matter; the individual * calibration routines MSVMu(), etc. contain their own length reconfig calls. */ if ((esl_byp_IsInternal(byp_gm) && ! esl_byp_IsProvided(byp_om)) || esl_byp_IsReturned(byp_gm)) { if ( (gm = p7_profile_Create(hmm->M, hmm->abc)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "failed to allocate profile"); if ( (status = p7_ProfileConfig(hmm, bg, gm, EvL, p7_LOCAL)) != eslOK) ESL_XFAIL(status, errbuf, "failed to configure profile"); } if (om == NULL) { if ((om = p7_oprofile_Create(hmm->M, hmm->abc)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "failed to create optimized profile"); if ((status = p7_oprofile_Convert(gm, om)) != eslOK) ESL_XFAIL(status, errbuf, "failed to convert to optimized profile"); } /* The calibration steps themselves */ if ((status = p7_Lambda(hmm, bg, &lambda)) != eslOK) ESL_XFAIL(status, errbuf, "failed to determine lambda"); if ((status = p7_MSVMu (r, om, bg, EmL, EmN, lambda, &mmu)) != eslOK) ESL_XFAIL(status, errbuf, "failed to determine msv mu"); if ((status = p7_ViterbiMu(r, om, bg, EvL, EvN, lambda, &vmu)) != eslOK) ESL_XFAIL(status, errbuf, "failed to determine vit mu"); if ((status = p7_Tau (r, om, bg, EfL, EfN, lambda, Eft, &tau)) != eslOK) ESL_XFAIL(status, errbuf, "failed to determine fwd tau"); /* Store results */ hmm->evparam[p7_MLAMBDA] = om->evparam[p7_MLAMBDA] = lambda; hmm->evparam[p7_VLAMBDA] = om->evparam[p7_VLAMBDA] = lambda; hmm->evparam[p7_FLAMBDA] = om->evparam[p7_FLAMBDA] = lambda; hmm->evparam[p7_MMU] = om->evparam[p7_MMU] = mmu; hmm->evparam[p7_VMU] = om->evparam[p7_VMU] = vmu; hmm->evparam[p7_FTAU] = om->evparam[p7_FTAU] = tau; hmm->flags |= p7H_STATS; if (gm != NULL) { gm->evparam[p7_MLAMBDA] = lambda; gm->evparam[p7_VLAMBDA] = lambda; gm->evparam[p7_FLAMBDA] = lambda; gm->evparam[p7_MMU] = mmu; gm->evparam[p7_VMU] = vmu; gm->evparam[p7_FTAU] = tau; } if (byp_rng != NULL) *byp_rng = r; else esl_randomness_Destroy(r); /* bypass convention: no-op if rng was provided.*/ if (byp_bg != NULL) *byp_bg = bg; else p7_bg_Destroy(bg); /* bypass convention: no-op if bg was provided. */ if (byp_gm != NULL) *byp_gm = gm; else p7_profile_Destroy(gm); /* bypass convention: no-op if gm was provided. */ if (byp_om != NULL) *byp_om = om; else p7_oprofile_Destroy(om); /* bypass convention: no-op if om was provided. */ return eslOK; ERROR: if (! esl_byp_IsProvided(byp_rng)) esl_randomness_Destroy(r); if (! esl_byp_IsProvided(byp_bg)) p7_bg_Destroy(bg); if (! esl_byp_IsProvided(byp_gm)) p7_profile_Destroy(gm); if (! esl_byp_IsProvided(byp_om)) p7_oprofile_Destroy(om); return status; }
static void search_thread(void *arg) { int i; int count; int seed; int status; int workeridx; WORKER_INFO *info; ESL_THREADS *obj; ESL_SQ dbsq; ESL_STOPWATCH *w = NULL; /* timing stopwatch */ P7_BUILDER *bld = NULL; /* HMM construction configuration */ P7_BG *bg = NULL; /* null model */ P7_PIPELINE *pli = NULL; /* work pipeline */ P7_TOPHITS *th = NULL; /* top hit results */ P7_PROFILE *gm = NULL; /* generic model */ P7_OPROFILE *om = NULL; /* optimized query profile */ obj = (ESL_THREADS *) arg; esl_threads_Started(obj, &workeridx); info = (WORKER_INFO *) esl_threads_GetData(obj, workeridx); w = esl_stopwatch_Create(); bg = p7_bg_Create(info->abc); esl_stopwatch_Start(w); /* set up the dummy description and accession fields */ dbsq.desc = ""; dbsq.acc = ""; /* process a query sequence or hmm */ if (info->seq != NULL) { bld = p7_builder_Create(NULL, info->abc); if ((seed = esl_opt_GetInteger(info->opts, "--seed")) > 0) { esl_randomness_Init(bld->r, seed); bld->do_reseeding = TRUE; } bld->EmL = esl_opt_GetInteger(info->opts, "--EmL"); bld->EmN = esl_opt_GetInteger(info->opts, "--EmN"); bld->EvL = esl_opt_GetInteger(info->opts, "--EvL"); bld->EvN = esl_opt_GetInteger(info->opts, "--EvN"); bld->EfL = esl_opt_GetInteger(info->opts, "--EfL"); bld->EfN = esl_opt_GetInteger(info->opts, "--EfN"); bld->Eft = esl_opt_GetReal (info->opts, "--Eft"); if (esl_opt_IsOn(info->opts, "--mxfile")) status = p7_builder_SetScoreSystem (bld, esl_opt_GetString(info->opts, "--mxfile"), NULL, esl_opt_GetReal(info->opts, "--popen"), esl_opt_GetReal(info->opts, "--pextend"), bg); else status = p7_builder_LoadScoreSystem(bld, esl_opt_GetString(info->opts, "--mx"), esl_opt_GetReal(info->opts, "--popen"), esl_opt_GetReal(info->opts, "--pextend"), bg); if (status != eslOK) { //client_error(info->sock, status, "hmmgpmd: failed to set single query sequence score system: %s", bld->errbuf); fprintf(stderr, "hmmpgmd: failed to set single query sequence score system: %s", bld->errbuf); pthread_exit(NULL); return; } p7_SingleBuilder(bld, info->seq, bg, NULL, NULL, NULL, &om); /* bypass HMM - only need model */ p7_builder_Destroy(bld); } else { gm = p7_profile_Create (info->hmm->M, info->abc); om = p7_oprofile_Create(info->hmm->M, info->abc); p7_ProfileConfig(info->hmm, bg, gm, 100, p7_LOCAL); p7_oprofile_Convert(gm, om); } /* Create processing pipeline and hit list */ th = p7_tophits_Create(); pli = p7_pipeline_Create(info->opts, om->M, 100, FALSE, p7_SEARCH_SEQS); p7_pli_NewModel(pli, om, bg); if (pli->Z_setby == p7_ZSETBY_NTARGETS) pli->Z = info->db_Z; /* loop until all sequences have been processed */ count = 1; while (count > 0) { int inx; int blksz; HMMER_SEQ **sq; /* grab the next block of sequences */ if (pthread_mutex_lock(info->inx_mutex) != 0) p7_Fail("mutex lock failed"); inx = *info->inx; blksz = *info->blk_size; if (inx > *info->limit) { blksz /= 5; if (blksz < 1000) { *info->limit = info->sq_cnt * 2; } else { *info->limit = inx + (info->sq_cnt - inx) * 2 / 3; } } *info->blk_size = blksz; *info->inx += blksz; if (pthread_mutex_unlock(info->inx_mutex) != 0) p7_Fail("mutex unlock failed"); sq = info->sq_list + inx; count = info->sq_cnt - inx; if (count > blksz) count = blksz; /* Main loop: */ for (i = 0; i < count; ++i, ++sq) { if ( !(info->range_list) || hmmpgmd_IsWithinRanges ((*sq)->idx, info->range_list)) { dbsq.name = (*sq)->name; dbsq.dsq = (*sq)->dsq; dbsq.n = (*sq)->n; dbsq.idx = (*sq)->idx; if((*sq)->desc != NULL) dbsq.desc = (*sq)->desc; p7_bg_SetLength(bg, dbsq.n); p7_oprofile_ReconfigLength(om, dbsq.n); p7_Pipeline(pli, om, bg, &dbsq, th); p7_pipeline_Reuse(pli); } } } /* make available the pipeline objects to the main thread */ info->th = th; info->pli = pli; /* clean up */ p7_bg_Destroy(bg); p7_oprofile_Destroy(om); if (gm != NULL) p7_profile_Destroy(gm); esl_stopwatch_Stop(w); info->elapsed = w->elapsed; esl_stopwatch_Destroy(w); esl_threads_Finished(obj, workeridx); pthread_exit(NULL); return; }
/* glocal_region_trace_ensemble() * EPN, Tue Oct 5 10:13:25 2010 * * Based on p7_domaindef.c's region_trace_ensemble(). Modified so that * generic matrices (which can be used for glocally configured models) * can be used. An additional parameter <do_null2> has been added, * so that null2-related calculations are only done if necessary. * That is, they're skipped if null2 has been turned off in the pipeline. * * Notes from p7_domaindef.c::region_trace_ensemble(): *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * SRE, Fri Feb 8 11:49:44 2008 [Janelia] * * Here, we've decided that region <ireg>..<jreg> in sequence <dsq> might be * composed of more than one domain, and we're going to use clustering * of a posterior ensemble of stochastic tracebacks to sort it out. * * Caller provides a filled Forward matrix in <fwd> for the sequence * region <dsq+ireg-1>, length <jreg-ireg+1>, for the model <om> * configured in multihit mode with its target length distribution * set to the total length of <dsq>: i.e., the same model * configuration used to score the complete sequence (if it weren't * multihit, we wouldn't be worried about multiple domains). * * Caller also provides a DP matrix in <wrk> containing at least one * row, for use as temporary workspace. (This will typically be the * caller's Backwards matrix, which we haven't yet used at this point * in the processing pipeline.) * * Caller provides <ddef>, which defines heuristic parameters that * control the clustering, and provides working space for the * calculation and the answers. The <ddef->sp> object must have been * reused (i.e., it needs to be fresh; we're going to use it here); * the caller needs to Reuse() it specifically, because it can't just * Reuse() the whole <ddef>, when it's in the process of analyzing * regions. * * Upon return, <*ret_nc> contains the number of clusters that were * defined. * * The caller can retrieve info on each cluster by calling * <p7_spensemble_GetClusterCoords(ddef->sp...)> on the * <P7_SPENSEMBLE> object in <ddef>. * * Other information on what's happened in working memory: * * <ddef->n2sc[ireg..jreg]> now contains log f'(x_i) / f(x_i) null2 scores * for each residue. * * <ddef->sp> gets filled in, and upon return, it's holding the answers * (the cluster definitions). When the caller is done retrieving those * answers, it needs to <esl_spensemble_Reuse()> it before calling * <region_trace_ensemble()> again. * * <ddef->tr> is used as working memory for sampled traces. * * <wrk> has had its zero row clobbered as working space for a null2 calculation. *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ static int glocal_region_trace_ensemble(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_DSQ *dsq, int ireg, int jreg, const P7_GMX *fwd, P7_GMX *wrk, int do_null2, int *ret_nc) { int Lr = jreg-ireg+1; int t, d, d2; int nov, n; int nc; int pos; float null2[p7_MAXCODE]; esl_vec_FSet(ddef->n2sc+ireg, Lr, 0.0); /* zero the null2 scores in region */ /* By default, we make results reproducible by forcing a reset of * the RNG to its originally seeded state. */ if (ddef->do_reseeding) esl_randomness_Init(ddef->r, esl_randomness_GetSeed(ddef->r)); /* Collect an ensemble of sampled traces; calculate null2 odds ratios from these if nec */ for (t = 0; t < ddef->nsamples; t++) { p7_GStochasticTrace(ddef->r, dsq+ireg-1, Lr, gm, fwd, ddef->tr); p7_trace_Index(ddef->tr); pos = 1; for (d = 0; d < ddef->tr->ndom; d++) { p7_spensemble_Add(ddef->sp, t, ddef->tr->sqfrom[d]+ireg-1, ddef->tr->sqto[d]+ireg-1, ddef->tr->hmmfrom[d], ddef->tr->hmmto[d]); if(do_null2) { p7_GNull2_ByTrace(gm, ddef->tr, ddef->tr->tfrom[d], ddef->tr->tto[d], wrk, null2); /* residues outside domains get bumped +1: because f'(x) = f(x), so f'(x)/f(x) = 1 in these segments */ for (; pos <= ddef->tr->sqfrom[d]; pos++) ddef->n2sc[ireg+pos-1] += 1.0; /* Residues inside domains get bumped by their null2 ratio */ for (; pos <= ddef->tr->sqto[d]; pos++) ddef->n2sc[ireg+pos-1] += null2[dsq[ireg+pos-1]]; } } if(do_null2) { /* the remaining residues in the region outside any domains get +1 */ for (; pos <= Lr; pos++) ddef->n2sc[ireg+pos-1] += 1.0; } p7_trace_Reuse(ddef->tr); } /* Convert the accumulated n2sc[] ratios in this region to log odds null2 scores on each residue. */ if(do_null2) { for (pos = ireg; pos <= jreg; pos++) ddef->n2sc[pos] = logf(ddef->n2sc[pos] / (float) ddef->nsamples); } /* Cluster the ensemble of traces to break region into envelopes. */ p7_spensemble_Cluster(ddef->sp, ddef->min_overlap, ddef->of_smaller, ddef->max_diagdiff, ddef->min_posterior, ddef->min_endpointp, &nc); /* A little hacky now. Remove "dominated" domains relative to seq coords. */ for (d = 0; d < nc; d++) ddef->sp->assignment[d] = 0; /* overload <assignment> to flag that a domain is dominated */ /* who dominates who? (by post prob) */ for (d = 0; d < nc; d++) { for (d2 = d+1; d2 < nc; d2++) { nov = ESL_MIN(ddef->sp->sigc[d].j, ddef->sp->sigc[d2].j) - ESL_MAX(ddef->sp->sigc[d].i, ddef->sp->sigc[d2].i) + 1; if (nov == 0) break; n = ESL_MIN(ddef->sp->sigc[d].j - ddef->sp->sigc[d].i + 1, ddef->sp->sigc[d2].j - ddef->sp->sigc[d2].i + 1); if ((float) nov / (float) n >= 0.8) /* overlap */ { if (ddef->sp->sigc[d].prob > ddef->sp->sigc[d2].prob) ddef->sp->assignment[d2] = 1; else ddef->sp->assignment[d] = 1; } } } /* shrink the sigc list, removing dominated domains */ d = 0; for (d2 = 0; d2 < nc; d2++) { if (ddef->sp->assignment[d2]) continue; /* skip domain d2, it's dominated. */ if (d != d2) memcpy(ddef->sp->sigc + d, ddef->sp->sigc + d2, sizeof(struct p7_spcoord_s)); d++; } ddef->sp->nc = d; *ret_nc = d; return eslOK; }