/* Function:  esl_msashuffle_Shuffle()
 * Synopsis:  Shuffle an alignment's columns.
 *
 * Purpose:   Returns a column-shuffled version of <msa> in <shuf>,
 *            using random generator <r>. Shuffling by columns
 *            preserves the \% identity of the original
 *            alignment. <msa> and <shuf> can be identical, to shuffle
 *            in place.
 *
 *            The caller sets up the rest of the data (everything but
 *            the alignment itself) in <shuf> the way it wants,
 *            including sequence names, MSA name, and other
 *            annotation. The easy thing to do is to make <shuf>
 *            a copy of <msa>: the caller might create <shuf> by
 *            a call to <esl_msa_Clone()>.
 *
 *            The alignments <msa> and <shuf> can both be in digital
 *            mode, or can both be in text mode; you cannot mix
 *            digital and text modes.
 *
 * Args:      r    - source of random numbers
 *            msa  - alignment to shuffle (not modified unless msa == shuf)
 *            shuf - caller-allocated alignment that receives the result;
 *                   its rows must have room for <msa->alen> columns
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <msa>,<shuf> aren't in the same mode (digital vs. text).
 */
int
esl_msashuffle_Shuffle(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *shuf)
{
  int i, pos, alen;

  if (! (msa->flags & eslMSA_DIGITAL))
    {
      char c;

      if (shuf->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in text mode if <msa> is");

      /* Start from a copy of the source alignment, then permute <shuf> in place. */
      if (msa != shuf) {
        for (i = 0; i < msa->nseq; i++)
          strcpy(shuf->aseq[i], msa->aseq[i]);
      }
      for (i = 0; i < msa->nseq; i++)
        shuf->aseq[i][msa->alen] = '\0';

      /* Fisher-Yates over columns: swap a random column in 0..alen-1 with column alen-1.
       * Both sides of the swap must come from <shuf>. Reading the temporary from <msa>
       * (as this code once did) re-reads the unshuffled column when msa != shuf, so
       * columns get duplicated and lost instead of permuted.
       */
      for (alen = msa->alen; alen > 1; alen--)
        {
          pos = esl_rnd_Roll(r, alen);
          for (i = 0; i < msa->nseq; i++)
            {
              c                     = shuf->aseq[i][pos];
              shuf->aseq[i][pos]    = shuf->aseq[i][alen-1];
              shuf->aseq[i][alen-1] = c;
            }
        }
    }
#ifdef eslAUGMENT_ALPHABET
  else
    {
      ESL_DSQ x;

      if (! (shuf->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in digital mode if <msa> is");

      /* Digital rows are indexed 1..alen with sentinels at 0 and alen+1: copy alen+2 symbols. */
      if (msa != shuf) {
        for (i = 0; i < msa->nseq; i++)
          memcpy(shuf->ax[i], msa->ax[i], (msa->alen + 2) * sizeof(ESL_DSQ));
      }
      for (i = 0; i < msa->nseq; i++)
        shuf->ax[i][msa->alen+1] = eslDSQ_SENTINEL;

      /* Same Fisher-Yates as above, on 1-based columns; again swap within <shuf> only. */
      for (alen = msa->alen; alen > 1; alen--)
        {
          pos = esl_rnd_Roll(r, alen) + 1;
          for (i = 0; i < msa->nseq; i++)
            {
              x                 = shuf->ax[i][pos];
              shuf->ax[i][pos]  = shuf->ax[i][alen];
              shuf->ax[i][alen] = x;
            }
        }
    }
#endif /*eslAUGMENT_ALPHABET*/

  return eslOK;
}
/* set_random_segment()
 *
 * Fetch a random sequence of length >= <L> from the pre-digitized
 * concatenated sequence database, select a random subseq of length
 * <L> from it, shuffle it by the chosen algorithm, and copy the
 * resulting randomized segment into dsq[0..L-1] of the caller's
 * buffer (callers pass a pointer into their 1-based dsq).
 *
 * With "--iid", no database sequence is fetched; residues are
 * generated directly by <esl_rsq_xIID()> from <cfg->fq>.
 *
 * If <logfp> is non-NULL, append a "<sqname> <from> <to>" field to
 * the current line, to record where the random segment was selected
 * from. This is useful in cases where we want to track back the
 * origin of a high-scoring segment, in case the randomization wasn't
 * good enough to obscure the identity of a segment.
 *
 * A segment length of <L>=0 is a no-op.
 *
 * Returns <eslOK> on success; all failures are fatal.
 */
static int
set_random_segment(ESL_GETOPTS *go, struct cfg_s *cfg, FILE *logfp, ESL_DSQ *dsq, int L)
{
  ESL_SQ  *sq           = NULL;
  int      minDPL       = esl_opt_GetInteger(go, "--minDPL");
  int      db_dependent = (esl_opt_GetBoolean(go, "--iid") == TRUE ? FALSE : TRUE);
  char    *pkey         = NULL;
  int      start        = 0;
  int      end          = 0;
  int64_t  Lseq;
  int      status;

  /* Nothing to do for an empty segment. Test this before allocating anything, so we don't leak <sq>. */
  if (L == 0) return eslOK;
  if (L > cfg->db_maxL) esl_fatal("can't fetch a segment of length %d; database max is %d\n", L, cfg->db_maxL);

  if ((sq = esl_sq_CreateDigital(cfg->abc)) == NULL) esl_fatal("failed to allocate a sequence");

  /* fetch a random subseq from the source database */
  esl_sq_GrowTo(sq, L);
  if (db_dependent)
    {
      /* Rejection-sample database sequences until we get one long enough to hold <L> residues. */
      do {
        free(pkey);
        pkey = NULL;
        if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &Lseq, &pkey) != eslOK)
          esl_fatal("failed to look up a random seq");
      } while (Lseq < L);

      /* There are Lseq-L+1 valid start positions, 1..Lseq-L+1. Rolling on Lseq-L (as this
       * code once did) could never pick the last one, and rolled on 0 when Lseq == L.
       */
      start = 1 + esl_rnd_Roll(cfg->r, (int) (Lseq - L + 1));
      end   = start + L - 1;
      if (esl_sqio_FetchSubseq(cfg->dbfp, pkey, start, end, sq) != eslOK) esl_fatal("failed to fetch subseq");
      esl_sq_ConvertDegen2X(sq);
    }

  /* log sequence source info: <name> <start> <end> */
  if (logfp != NULL && db_dependent)
    fprintf(logfp, " %-15s %5d %5d", pkey, start, end);

  /* Now apply the appropriate randomization algorithm, in place on sq->dsq */
  if      (esl_opt_GetBoolean(go, "--mono"))    status = esl_rsq_XShuffle  (cfg->r, sq->dsq, L, sq->dsq);
  else if (esl_opt_GetBoolean(go, "--di")) {
    /* segments shorter than --minDPL fall back to a monoresidue shuffle */
    if (L < minDPL)                             status = esl_rsq_XShuffle  (cfg->r, sq->dsq, L, sq->dsq);
    else                                        status = esl_rsq_XShuffleDP(cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
  }
  else if (esl_opt_GetBoolean(go, "--markov0")) status = esl_rsq_XMarkov0  (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
  else if (esl_opt_GetBoolean(go, "--markov1")) status = esl_rsq_XMarkov1  (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
  else if (esl_opt_GetBoolean(go, "--reverse")) status = esl_rsq_XReverse  (        sq->dsq, L, sq->dsq);
  else if (esl_opt_GetBoolean(go, "--iid"))     status = esl_rsq_xIID      (cfg->r, cfg->fq, cfg->abc->K, L, sq->dsq);
  else                                          status = eslEINCONCEIVABLE;
  if (status != eslOK) esl_fatal("esl's shuffling failed");

  /* sq->dsq is 1-based (sentinel at 0); the caller's <dsq> points at the first residue slot. */
  memcpy(dsq, sq->dsq+1, sizeof(ESL_DSQ) * L);
  esl_sq_Destroy(sq);
  free(pkey);
  return eslOK;
}
/* Function:  p7_anchors_SampleFromTrace()
 * Synopsis:  Make a reasonable anchor set from a trace.
 *
 * Purpose:   Build an anchor set from trace <tr> by picking, for each
 *            domain, one of its match states uniformly at random
 *            using <rng>. The result is stored in <anch>, which is
 *            resized to hold <tr->ndom> anchors.
 *
 *            <tr> must be indexed by the caller with <p7_trace_Index()>,
 *            because the domain bounds <tr->tfrom[]>, <tr->tto[]> are
 *            read here.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    whatever non-<eslOK> status <p7_anchors_Resize()> returns
 *            (documented upstream as <eslEMEM> on reallocation failure).
 */
int
p7_anchors_SampleFromTrace(P7_ANCHORS *anch, ESL_RANDOMNESS *rng, const P7_TRACE *tr)
{
  const int ndom = tr->ndom;
  int       dom;             /* anchor index 1..ndom; the trace numbers the same domain dom-1 */
  int       zpos;            /* position in the trace                                         */
  int       zanchor;         /* trace position of the chosen match state                      */
  int       nmatch;          /* number of match states in the current domain                  */
  int       pick;            /* countdown: which match state (1..nmatch) becomes the anchor   */
  int       status;

  status = p7_anchors_Resize(anch, ndom);
  if (status != eslOK) return status;

  for (dom = 1; dom <= ndom; dom++)
    {
      const int zfirst = tr->tfrom[dom-1];
      const int zlast  = tr->tto[dom-1];

      /* Count the match states in this domain. */
      nmatch = 0;
      for (zpos = zfirst; zpos <= zlast; zpos++)
        if (p7_trace_IsM(tr->st[zpos])) nmatch++;
      ESL_DASSERT1(( nmatch ));

      /* Choose one of them, then walk forward until that many M states have gone by.
       * The walk steps one position past the chosen state, hence the -1 below.
       */
      pick = 1 + esl_rnd_Roll(rng, nmatch);
      zpos = zfirst;
      while (pick)
        {
          if (p7_trace_IsM(tr->st[zpos])) pick--;
          zpos++;
        }
      zanchor = zpos - 1;
      ESL_DASSERT1(( p7_trace_IsM(tr->st[zanchor]) ));

      anch->a[dom].i0 = tr->i[zanchor];
      anch->a[dom].k0 = tr->k[zanchor];
    }

  p7_anchor_SetSentinels(anch->a, ndom, tr->L, tr->M);
  anch->D = ndom;
  return eslOK;
}
/* utest_ReadWrite()
 * Round-trip test for p7_bg_Write() / p7_bg_Read(): write a background
 * with randomly sampled frequencies to a temp file, read it back, and
 * require the frequencies to match within a relative tolerance of 0.01.
 * Any failure is fatal.
 */
static void
utest_ReadWrite(ESL_RANDOMNESS *rng)
{
  char          msg[]       = "bg Read/Write unit test failed";
  char          tmpfile[32] = "esltmpXXXXXX";
  FILE         *out         = NULL;
  ESL_ALPHABET *abc         = NULL;
  P7_BG        *bg          = NULL;
  float        *sampled     = NULL;
  int           abctype;

  /* Alphabet type code is drawn from 1..5 (noted in the original as eslRNA..eslDICE). */
  abctype = esl_rnd_Roll(rng, 5) + 1;
  abc     = esl_alphabet_Create(abctype);
  if (abc == NULL) esl_fatal(msg);

  bg = p7_bg_Create(abc);
  if (bg == NULL) esl_fatal(msg);

  sampled = malloc(sizeof(float) * abc->K);
  if (sampled == NULL) esl_fatal(msg);

  /* Resample until every frequency is at least 0.001: smaller
   * probabilities get rounded off on output and would fail FCompare().
   */
  for (;;)
    {
      if (esl_dirichlet_FSampleUniform(rng, abc->K, sampled) != eslOK) esl_fatal(msg);
      if (! (esl_vec_FMin(sampled, abc->K) < 0.001)) break;
    }
  esl_vec_FCopy(sampled, abc->K, bg->f);

  /* Write the background out... */
  if (esl_tmpfile_named(tmpfile, &out) != eslOK) esl_fatal(msg);
  if (p7_bg_Write(out, bg)             != eslOK) esl_fatal(msg);
  fclose(out);

  /* ...wipe the in-memory copy, so a successful compare proves the values came from the file... */
  esl_vec_FSet(bg->f, bg->abc->K, 0.0);

  /* ...and read it back. */
  if (p7_bg_Read(tmpfile, bg, NULL)                       != eslOK) esl_fatal(msg);
  if (esl_vec_FCompare(sampled, bg->f, bg->abc->K, 0.01)  != eslOK) esl_fatal(msg);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  free(sampled);
  remove(tmpfile);
}
/* sample_endpoints()
 * Incept:    SRE, Mon Jan 22 10:43:20 2007 [Janelia]
 *
 * Purpose:   Using random number source <r>, sample an entry node and
 *            an exit node for profile <gm>, and hand them back through
 *            <ret_kstart> and <ret_kend>.
 *
 *            The entry node is drawn from a distribution that is
 *            back-calculated from the profile's B->Mk transition
 *            scores; entry is always into a match state. The exit
 *            node is then drawn uniformly from kstart..M, on the
 *            assumption that for a given entry i all exits j are
 *            equiprobable ($a_{ij} =$ constant $\forall j$). The exit
 *            may turn out to be from a match or a delete state.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error; <*ret_kstart> and
 *            <*ret_kend> are then both set to 0.
 *
 * Xref:      STL11/138
 */
static int
sample_endpoints(ESL_RANDOMNESS *r, const P7_PROFILE *gm, int *ret_kstart, int *ret_kend)
{
  float *pentry = NULL;        /* pentry[k]: weight for entering at node k, k=1..M; [0] is unused (0) */
  int    node;
  int    entry, leave;
  int    status;

  /* Recovering a distribution from the B->Mk lod scores of a local model
   * takes a little time, but this isn't called often.
   */
  ESL_ALLOC(pentry, sizeof(float) * (gm->M+1));

  pentry[0] = 0.0f;
  node      = 1;
  while (node <= gm->M)
    {
      /* p_ij times the number of exits j available from entry <node> */
      pentry[node] = exp(p7P_TSC(gm, node-1, p7P_BM)) * (gm->M - node + 1);
      node++;
    }

  entry = esl_rnd_FChoose(r, pentry, gm->M+1);               /* entry node, from that distribution       */
  leave = entry + esl_rnd_Roll(r, gm->M - entry + 1);        /* exit node, uniform on entry..M           */

  free(pentry);
  *ret_kstart = entry;
  *ret_kend   = leave;
  return eslOK;

 ERROR:
  if (pentry != NULL) free(pentry);
  *ret_kstart = 0;
  *ret_kend   = 0;
  return status;
}
/* Function:  esl_dst_XAverageId()
 * Synopsis:  Calculate avg identity for digital MSA
 * Incept:    SRE, Fri May 18 15:19:14 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a digital multiple sequence alignment <ax>, consisting of <N>
 *            aligned digital sequences of identical length.
 *
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs. The stochastic sample uses a time-seeded
 *            random number generator, so it is not reproducible.
 *
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$)
 *            is calculated using <esl_dst_XPairId()>.
 *
 *            For <N> $\leq 1$, the average identity is defined as 1.
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
{
  int    status;
  double id;
  double sum = 0.;      /* running total of pairwise identities; must start at 0 */
  double npairs;        /* N choose 2, as a double so it cannot overflow an int   */
  int    i,j,n;

  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  npairs = 0.5 * (double) N * (double) (N-1);

  /* Is N small enough that we can average over all pairwise comparisons?
   * The first two tests are cheap screens; the pair count itself is
   * compared in floating point because Pfam N's easily overflow when squared.
   */
  if (N <= max_comparisons &&
      N <= sqrt(2. * max_comparisons) &&
      npairs <= (double) max_comparisons)
    {
      for (i = 0; i < N; i++)
        for (j = i+1; j < N; j++)
          {
            if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
            sum += id;
          }
      sum /= npairs;
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else
    {
      ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded();

      if (r == NULL) return eslEMEM;

      for (n = 0; n < max_comparisons; n++)
        {
          do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
          if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK)
            {
              esl_randomness_Destroy(r);   /* don't leak the generator on the error path */
              return status;
            }
          sum += id;
        }
      sum /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = sum;
  return eslOK;
}
/* Function:  esl_msashuffle_Bootstrap()
 * Synopsis:  Bootstrap sample an MSA.
 * Incept:    SRE, Tue Jan 22 11:05:07 2008 [Janelia]
 *
 * Purpose:   Draw <msa->alen> columns from <msa> uniformly at random,
 *            with replacement, using generator <r>, and write them
 *            left to right into <bootsample>.
 *
 *            <bootsample> must be allocated by the caller and must not
 *            be the same space as <msa>: a bootstrap cannot be taken
 *            in place. Only the aligned residues are written here;
 *            names and all other annotation in <bootsample> are the
 *            caller's business. Cloning <msa> is the easy way to set
 *            <bootsample> up.
 *
 *            Both alignments must be in the same mode, either both
 *            digital or both text.
 *
 * Returns:   <eslOK> on success, with a bootstrap resample of <msa>
 *            in <bootsample>.
 *
 * Throws:    <eslEINVAL> if <msa> and <bootsample> are in different
 *            modes (digital vs. text).
 */
int
esl_msashuffle_Bootstrap(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *bootsample)
{
  const int src_digital = ((msa->flags        & eslMSA_DIGITAL) != 0);
  const int dst_digital = ((bootsample->flags & eslMSA_DIGITAL) != 0);
  int       idx;               /* sequence index        */
  int       dstcol;            /* column being written  */
  int       srccol;            /* column being sampled  */

  /* contract check: modes must agree */
  if (src_digital != dst_digital)
    ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");

  if (! src_digital)
    {
      /* text mode: columns are 0..alen-1, rows are NUL-terminated */
      dstcol = 0;
      while (dstcol < msa->alen)
        {
          srccol = esl_rnd_Roll(r, msa->alen);
          for (idx = 0; idx < msa->nseq; idx++)
            bootsample->aseq[idx][dstcol] = msa->aseq[idx][srccol];
          dstcol++;
        }

      for (idx = 0; idx < msa->nseq; idx++)
        bootsample->aseq[idx][msa->alen] = '\0';
    }
#ifdef eslAUGMENT_ALPHABET
  else
    {
      /* digital mode: columns are 1..alen, with sentinels at 0 and alen+1 */
      for (idx = 0; idx < msa->nseq; idx++)
        bootsample->ax[idx][0] = eslDSQ_SENTINEL;

      dstcol = 1;
      while (dstcol <= msa->alen)
        {
          srccol = esl_rnd_Roll(r, msa->alen) + 1;
          for (idx = 0; idx < msa->nseq; idx++)
            bootsample->ax[idx][dstcol] = msa->ax[idx][srccol];
          dstcol++;
        }

      for (idx = 0; idx < msa->nseq; idx++)
        bootsample->ax[idx][msa->alen+1] = eslDSQ_SENTINEL;
    }
#endif /*eslAUGMENT_ALPHABET*/

  return eslOK;
}
/* Function: esl_dst_CAverageId() * Synopsis: Calculate avg identity for multiple alignment * Incept: SRE, Fri May 18 15:02:38 2007 [Janelia] * * Purpose: Calculates the average pairwise fractional identity in * a multiple sequence alignment <as>, consisting of <N> * aligned character sequences of identical length. * * If an exhaustive calculation would require more than * <max_comparisons> pairwise comparisons, then instead of * looking at all pairs, calculate the average over a * stochastic sample of <max_comparisons> random pairs. * This allows the routine to work efficiently even on very * deep MSAs. * * Each fractional pairwise identity (range $[0..$ pid $..1]$ * is calculated using <esl_dsq_CPairId()>. * * Returns: <eslOK> on success, and <*ret_id> contains the average * fractional identity. * * Throws: <eslEMEM> on allocation failure. * <eslEINVAL> if any of the aligned sequence pairs aren't * of the same length. * In either case, <*ret_id> is set to 0. */ int esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id) { int status; double id; double sum; int i,j,n; if (N <= 1) { *ret_id = 1.; return eslOK; } *ret_id = 0.; /* Is nseq small enough that we can average over all pairwise comparisons? */ if ((N * (N-1) / 2) <= max_comparisons) { for (i = 0; i < N; i++) for (j = i+1; j < N; j++) { if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } id /= (double) (N * (N-1) / 2); } /* If nseq is large, calculate average over a stochastic sample. */ else { ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded(); for (n = 0; n < max_comparisons; n++) { do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */ if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status; sum += id; } id /= (double) max_comparisons; esl_randomness_Destroy(r); } *ret_id = id; return eslOK; }
/* generate_testfile()
 * Write an <nlines>-line test file to a new temp file, whose name is
 * returned in <tmpfile> by esl_tmpfile_named(). Each line is one
 * marker character followed by its 0-based line number: '#' for a
 * "data" line, ' ' otherwise. Lines come in alternating runs of data
 * and non-data lines, each run 1..10 lines long; whether the file
 * starts with data is random. On return, is_data[i] is TRUE/FALSE
 * according to whether line i is a data line.
 */
static void
generate_testfile(ESL_RANDOMNESS *rng, char *tmpfile, int *is_data, int nlines)
{
  char *msg       = "esl_recorder:: test file generator failed";
  FILE *out       = NULL;
  int   data_run  = esl_rnd_Roll(rng, 2);        /* TRUE | FALSE: are we inside a run of data lines?  */
  int   run_left  = 1 + esl_rnd_Roll(rng, 10);   /* 1..10: lines remaining in the current run         */
  int   line;
  char  marker;

  if (esl_tmpfile_named(tmpfile, &out) != eslOK) esl_fatal(msg);

  for (line = 0; line < nlines; line++)
    {
      if (data_run) { is_data[line] = TRUE;  marker = '#'; }
      else          { is_data[line] = FALSE; marker = ' '; }
      fprintf(out, "%c%d\n", marker, line);

      run_left--;
      if (run_left == 0)
        {
          /* run finished: flip to the other kind of line, and draw a new run length */
          data_run = ! data_run;
          run_left = 1 + esl_rnd_Roll(rng, 10);  /* 1..10 */
        }
    }

  fclose(out);
}
/* Function:  p7_anchors_Sample()
 * Synopsis:  Sample a randomized anchor set, for testing.
 *
 * Purpose:   Using <rng>, generate a random anchor set in <anch> for
 *            a profile of length <M> against a sequence of length
 *            <L>. The number of anchors D is drawn from 1..<maxD>.
 *            The anchors' sequence positions i0 are distinct and
 *            increasing; their model positions k0 are drawn
 *            independently and uniformly from 1..<M>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    a non-<eslOK> status if resizing <anch> or allocating
 *            the temporary array fails.
 *
 * Notes:     NOTE(review): if D > L, only the first L slots of the
 *            temporary array are ever written before it is sorted and
 *            read; callers presumably keep <maxD> <= <L> -- confirm.
 */
int
p7_anchors_Sample(ESL_RANDOMNESS *rng, int L, int M, int maxD, P7_ANCHORS *anch)
{
  int      D       = 1 + esl_rnd_Roll(rng, maxD);
  int32_t *chosen  = NULL;    /* the D sampled i0 positions */
  int      pos;               /* 0-based sequence position  */
  int      slot;
  int      dom;
  int      status;

  status = p7_anchors_Resize(anch, D);
  if (status != eslOK) goto ERROR;

  /* Sample a combination of <D> i0 positions without replacement, reservoir-style:
   * seed the reservoir with positions 1..D, then let each later position overwrite
   * a random slot some of the time.
   * NOTE(review): textbook reservoir sampling rolls on pos+1, not L; rolling on L still
   * yields distinct positions but presumably not a uniform combination -- confirm intent.
   */
  ESL_ALLOC(chosen, sizeof(int32_t) * D);
  for (pos = 0; pos < L && pos < D; pos++)
    chosen[pos] = pos+1;
  for (pos = D; pos < L; pos++)
    {
      slot = esl_rnd_Roll(rng, L);
      if (slot < D) chosen[slot] = pos+1;
    }
  esl_vec_ISortIncreasing(chosen, D);

  dom = 1;
  while (dom <= D)
    {
      anch->a[dom].i0 = chosen[dom-1];               /* i0's come out sorted               */
      anch->a[dom].k0 = 1 + esl_rnd_Roll(rng, M);    /* k0's: independent, uniform on 1..M */
      dom++;
    }

  p7_anchor_SetSentinels(anch->a, D, L, M);
  anch->D = D;

  free(chosen);
  return eslOK;

 ERROR:
  if (chosen) free(chosen);
  return status;
}
/* Function:  p7_tophits_TestSample()
 * Synopsis:  Sample a random, bogus, mostly-syntactically-valid P7_TOPHITS
 *
 * Purpose:   Build a random <P7_TOPHITS> object with generator <rng>
 *            and return it through <*ret_th>. It is allocated here;
 *            the caller frees it with <p7_tophits_Destroy()>.
 *
 *            The 'sorted' pointer array <th->hit[]> is put in random
 *            order. The <th->nreported>, <th->nincluded> counts and
 *            the <th->is_sorted*> flags are also set at random and
 *            say nothing about the actual order of <th->hit[]>.
 *            (The main use is testing that the object, including its
 *            sorted pointers, is communicated faithfully.)
 *
 * Returns:   <eslOK> on success, and <*ret_th> points to the sampled
 *            <P7_TOPHITS> object.
 *
 * Throws:    <eslEMEM> if the object can't be created, or whatever
 *            <p7_hit_TestSample()> fails with; <*ret_th> is then NULL.
 *
 * Notes:     Easel code spec requires that TestSample() generates an
 *            object that passes whatever Validate() looks for.
 */
int
p7_tophits_TestSample(ESL_RANDOMNESS *rng, P7_TOPHITS **ret_th)
{
  P7_TOPHITS *th    = NULL;
  P7_HIT     *swap;
  int         nhits = 1049;   /* prime. don't make it divisible by any chunk size. */
  int         idx;
  int         pick;
  int         remaining;
  int         status;

  th = p7_tophits_Create(nhits);
  if (th == NULL) { status = eslEMEM; goto ERROR; }

  th->nreported            = 1 + esl_rnd_Roll(rng, nhits);
  th->nincluded            = 1 + esl_rnd_Roll(rng, nhits);
  th->is_sorted_by_sortkey = esl_rnd_Roll(rng, 2);
  if (! th->is_sorted_by_sortkey)                     /* the two flags are never both set here */
    th->is_sorted_by_seqidx = esl_rnd_Roll(rng, 2);

  /* Sample the hits one at a time. th->N is bumped after each success so
   * that p7_tophits_Destroy() works on a partially built <th> on error.
   */
  idx = 0;
  while (idx < nhits)
    {
      status = p7_hit_TestSample(rng, &(th->unsrt[idx]));
      if (status != eslOK) goto ERROR;
      th->N++;
      idx++;
    }

  /* Point hit[] at the unsorted hits, then randomize its order (Fisher-Yates). */
  for (idx = 0; idx < th->N; idx++)
    th->hit[idx] = &(th->unsrt[idx]);

  for (remaining = th->N; remaining > 1; remaining--)
    {
      pick                   = esl_rnd_Roll(rng, remaining);
      swap                   = th->hit[pick];
      th->hit[pick]          = th->hit[remaining-1];
      th->hit[remaining-1]   = swap;
    }

  *ret_th = th;
  return eslOK;

 ERROR:
  if (th) p7_tophits_Destroy(th);
  *ret_th = NULL;
  return status;
}
/* synthesize_negatives()
 * Generate <nneg> decoy sequences and write them in FASTA format to
 * <cfg->out_seqfp>. Each decoy copies the five-segment length layout
 * (L1, d1n, L2, d2n, L3) of a randomly chosen test sequence; every
 * segment is filled independently by set_random_segment(). One summary
 * line per decoy goes to <cfg->negsummfp>: the name, total length and
 * segment lengths, followed by whatever set_random_segment() logs
 * about the source of each segment.
 *
 * Always returns <eslOK>.
 */
static int
synthesize_negatives(ESL_GETOPTS *go, struct cfg_s *cfg, int nneg)
{
  ESL_SQ  *sq = esl_sq_CreateDigital(cfg->abc);
  ESL_DSQ *dst;             /* where the next segment goes in sq->dsq */
  int      seglen[5];       /* L1, d1n, L2, d2n, L3                   */
  int      which;           /* index of the template test sequence    */
  int      decoy;
  int      s;

  for (decoy = 0; decoy < nneg; decoy++)
    {
      /* Select a random test seq, to use its same segments */
      which     = esl_rnd_Roll(cfg->r, cfg->ntest);
      seglen[0] = cfg->test_lens[which].L1;
      seglen[1] = cfg->test_lens[which].d1n;
      seglen[2] = cfg->test_lens[which].L2;
      seglen[3] = cfg->test_lens[which].d2n;
      seglen[4] = cfg->test_lens[which].L3;

      esl_sq_GrowTo(sq, cfg->test_lens[which].L);
      esl_sq_FormatName(sq, "decoy%d", decoy+1);
      esl_sq_FormatDesc(sq, "L=%d in segments: %d/%d/%d/%d/%d", cfg->test_lens[which].L,
                        seglen[0], seglen[1], seglen[2], seglen[3], seglen[4]);
      sq->n = cfg->test_lens[which].L;

      fprintf(cfg->negsummfp, "%-15s %5d %5d %5d %5d %5d %5d",
              sq->name, (int) sq->n, seglen[0], seglen[1], seglen[2], seglen[3], seglen[4]);

      /* digital sequences are 1..n, bracketed by sentinels */
      sq->dsq[cfg->test_lens[which].L+1] = eslDSQ_SENTINEL;
      sq->dsq[0]                         = eslDSQ_SENTINEL;

      /* fill the five segments left to right */
      dst = sq->dsq + 1;
      for (s = 0; s < 5; s++)
        {
          set_random_segment(go, cfg, cfg->negsummfp, dst, seglen[s]);
          dst += seglen[s];
        }

      fprintf(cfg->negsummfp, "\n");

      esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE);

      esl_sq_Reuse(sq);
    }

  esl_sq_Destroy(sq);
  return eslOK;
}
/* Function:  esl_msashuffle_PermuteSequenceOrder()
 * Synopsis:  Permutes the order of the sequences.
 *
 * Purpose:   Put the sequences of <msa> in a random order, in place,
 *            using generator <r>. All per-sequence data move together
 *            with their sequence: names, weights, and (where present)
 *            accessions, descriptions, SS/SA/PP annotation, the
 *            various length arrays, and #=GS / #=GR markup. If <msa>
 *            has a name index, it is rebuilt for the new order.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 */
int
esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS *r, ESL_MSA *msa)
{
  void    *pswap;        /* scratch for swapping pointers       */
  double   wswap;        /* scratch for swapping weights        */
  int64_t  lswap;        /* scratch for swapping lengths        */
  int      nleft;        /* sequences not yet placed            */
  int      pick;         /* randomly chosen index, 0..nleft-1   */
  int      last;         /* nleft-1: slot the pick is moved to  */
  int      t;            /* tag index for GS/GR markup          */
  int      idx;

  /* Fisher-Yates: each round, exchange a random remaining sequence with the last remaining one. */
  nleft = msa->nseq;
  while (nleft > 1)
    {
      pick = esl_rnd_Roll(r, nleft);
      last = nleft - 1;

      /* the aligned sequence itself: text or digital */
      if ((msa->flags & eslMSA_DIGITAL) == 0)
        {
          pswap           = msa->aseq[pick];
          msa->aseq[pick] = msa->aseq[last];
          msa->aseq[last] = pswap;
        }
#ifdef eslAUGMENT_ALPHABET
      else
        {
          pswap         = msa->ax[pick];
          msa->ax[pick] = msa->ax[last];
          msa->ax[last] = pswap;
        }
#endif

      /* name and weight are swapped unconditionally */
      pswap             = msa->sqname[pick];
      msa->sqname[pick] = msa->sqname[last];
      msa->sqname[last] = pswap;

      wswap          = msa->wgt[pick];
      msa->wgt[pick] = msa->wgt[last];
      msa->wgt[last] = wswap;

      /* optional per-sequence strings */
      if (msa->sqacc)  { pswap = msa->sqacc[pick];  msa->sqacc[pick]  = msa->sqacc[last];  msa->sqacc[last]  = pswap; }
      if (msa->sqdesc) { pswap = msa->sqdesc[pick]; msa->sqdesc[pick] = msa->sqdesc[last]; msa->sqdesc[last] = pswap; }
      if (msa->ss)     { pswap = msa->ss[pick];     msa->ss[pick]     = msa->ss[last];     msa->ss[last]     = pswap; }
      if (msa->sa)     { pswap = msa->sa[pick];     msa->sa[pick]     = msa->sa[last];     msa->sa[last]     = pswap; }
      if (msa->pp)     { pswap = msa->pp[pick];     msa->pp[pick]     = msa->pp[last];     msa->pp[last]     = pswap; }

      /* optional per-sequence lengths */
      if (msa->sqlen)  { lswap = msa->sqlen[pick];  msa->sqlen[pick]  = msa->sqlen[last];  msa->sqlen[last]  = lswap; }
      if (msa->sslen)  { lswap = msa->sslen[pick];  msa->sslen[pick]  = msa->sslen[last];  msa->sslen[last]  = lswap; }
      if (msa->salen)  { lswap = msa->salen[pick];  msa->salen[pick]  = msa->salen[last];  msa->salen[last]  = lswap; }
      if (msa->pplen)  { lswap = msa->pplen[pick];  msa->pplen[pick]  = msa->pplen[last];  msa->pplen[last]  = lswap; }

      /* optional per-sequence (GS) and per-residue (GR) markup, one array per tag */
      for (t = 0; t < msa->ngs; t++)
        {
          if (! msa->gs[t]) continue;
          pswap            = msa->gs[t][pick];
          msa->gs[t][pick] = msa->gs[t][last];
          msa->gs[t][last] = pswap;
        }

      for (t = 0; t < msa->ngr; t++)
        {
          if (! msa->gr[t]) continue;
          pswap            = msa->gr[t][pick];
          msa->gr[t][pick] = msa->gr[t][last];
          msa->gr[t][last] = pswap;
        }

      nleft--;
    }

  /* if <msa> has a keyhash that maps seqname => seqidx, we'll need to rebuild it. */
  if (msa->index)
    {
      esl_keyhash_Reuse(msa->index);
      for (idx = 0; idx < msa->nseq; idx++)
        esl_keyhash_Store(msa->index, msa->sqname[idx], -1, NULL);
    }

  return eslOK;
}
/* Function:  p7_coords2_Sample()
 * Synopsis:  Sample random, sorted, nonoverlapping segment coords.
 *
 * Purpose:   Sample random domain segment positions (start/end pairs,
 *            sorted and nonoverlapping) on a sequence of length <L>,
 *            using <rng>, and store them in <c2>. The number of
 *            segments is drawn from 1..<maxseg>, then capped at
 *            <L>/2, because each segment needs two distinct
 *            coordinates out of 1..<L>.
 *
 *            <byp_wrk> follows the bypass idiom for a workspace of
 *            <L> int32_t's: pass NULL to have one allocated and freed
 *            internally; pass a pointer to a NULL pointer to have the
 *            allocated workspace returned; or pass a pointer to an
 *            existing allocation to have it reallocated and reused.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    a non-<eslOK> status if the workspace or <c2> cannot be
 *            (re)allocated. A caller-provided workspace remains
 *            valid (and <*byp_wrk> points to it); a workspace that
 *            was to be returned is freed and <*byp_wrk> stays NULL.
 */
int
p7_coords2_Sample(ESL_RANDOMNESS *rng, P7_COORDS2 *c2, int32_t maxseg, int32_t L, int32_t **byp_wrk)
{
  int32_t *wrk  = NULL;
  int32_t  nseg = 1 + esl_rnd_Roll(rng, maxseg); /* 1..maxseg */
  int32_t  i;
  int      status;

  /* We read 2*nseg coords out of a workspace of L: never ask for more than it holds. */
  if (nseg > L/2) nseg = L/2;

  /* Using the bypass idiom, make sure we have a workspace for <L> coords */
  if      (esl_byp_IsInternal(byp_wrk) ) ESL_ALLOC(wrk, sizeof(int32_t) * L);
  else if (esl_byp_IsReturned(byp_wrk) ) ESL_ALLOC(wrk, sizeof(int32_t) * L);
  else if (esl_byp_IsProvided(byp_wrk) ) { wrk = *byp_wrk; ESL_REALLOC(wrk, sizeof(int32_t) * L); }

  /* We put the numbers 1..L into the workspace <wrk>; shuffle them;
   * then sort the top nseg*2 of them. This gives us <nseg>
   * nonoverlapping start/end coords, in order.
   */
  for (i = 0; i < L; i++) wrk[i] = i+1;
  esl_vec_IShuffle(rng, wrk, L);
  esl_vec_ISortIncreasing(wrk, nseg*2);

  /* Store those randomized coords now in the data structure.
   * Don't write into c2->arr unless the grow succeeded.
   */
  if ((status = p7_coords2_GrowTo(c2, nseg)) != eslOK) goto ERROR;
  c2->n = nseg;
  for (i = 0; i < nseg; i++)
    {
      c2->arr[i].n1 = wrk[i*2];
      c2->arr[i].n2 = wrk[i*2+1];
    }

  /* Using the bypass idiom, recycle workspace, if we're supposed to */
  if      (esl_byp_IsInternal(byp_wrk)) free(wrk);
  else if (esl_byp_IsReturned(byp_wrk)) *byp_wrk = wrk;
  else if (esl_byp_IsProvided(byp_wrk)) *byp_wrk = wrk;
  return eslOK;

 ERROR:
  /* Internal or to-be-returned workspace: ours to free (and *byp_wrk, if any, is still NULL).
   * Provided workspace: may have been moved by a successful realloc, so hand the
   * current pointer back rather than leaving the caller with a stale one.
   */
  if      (esl_byp_IsInternal(byp_wrk)) free(wrk);
  else if (esl_byp_IsReturned(byp_wrk)) free(wrk);
  else if (esl_byp_IsProvided(byp_wrk)) { if (wrk) *byp_wrk = wrk; }
  return status;
}
/* The esl_random() unit test: * a binned frequency test. */ static void utest_random(long seed, int n, int nbins, int be_verbose) { ESL_RANDOMNESS *r = NULL; int *counts = NULL; double X2p = 0.; int i; double X2, exp, diff; if ((counts = malloc(sizeof(int) * nbins)) == NULL) esl_fatal("malloc failed"); esl_vec_ISet(counts, nbins, 0); /* This contrived call sequence exercises CreateTimeseeded() and * Init(), while leaving us a reproducible chain. Because it's * reproducible, we know this test succeeds, despite being * statistical in nature. */ if ((r = esl_randomness_CreateTimeseeded()) == NULL) esl_fatal("randomness create failed"); if (esl_randomness_Init(r, seed) != eslOK) esl_fatal("randomness init failed"); for (i = 0; i < n; i++) counts[esl_rnd_Roll(r, nbins)]++; /* X^2 value: \sum (o_i - e_i)^2 / e_i */ for (X2 = 0., i = 0; i < nbins; i++) { exp = (double) n / (double) nbins; diff = (double) counts[i] - exp; X2 += diff*diff/exp; } if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi squared eval failed"); if (be_verbose) printf("random(): \t%g\n", X2p); if (X2p < 0.01) esl_fatal("chi squared test failed"); esl_randomness_Destroy(r); free(counts); return; }
/* cqrna_shuffle_columns()
 * Shuffle the <n> alignment columns listed in col[0..n-1], moving the
 * symbols of <xs> and <ys> together so aligned pairs stay paired.
 * <col> is scratch space and is reordered as a side effect.
 */
static void
cqrna_shuffle_columns(ESL_RANDOMNESS *r, char *xs, char *ys, int *col, int n)
{
  int  pos;          /* randomly chosen slot in col[0..n-1] */
  int  a, b;         /* the two alignment columns exchanged */
  char xsym, ysym;

  for (; n > 1; n--)
    {
      pos = esl_rnd_Roll(r, n);
      a   = col[pos];
      b   = col[n-1];

      /* exchange the symbols in columns a and b, in both sequences */
      xsym  = xs[a];   ysym  = ys[a];
      xs[a] = xs[b];   ys[a] = ys[b];
      xs[b] = xsym;    ys[b] = ysym;

      /* exchange the list entries too. Only col[0..n-2] is read from here on,
       * so it is the col[pos] update that matters to later rounds.
       */
      col[pos] = b;
      col[n-1] = a;
    }
}

/* Function:  esl_msashuffle_CQRNA()
 * Synopsis:  Gap-preserving column shuffle of a pairwise alignment.
 * Incept:    SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
 *
 * Purpose:   Shuffle a pairwise alignment <x>,<y> while preserving the
 *            position of gaps, using the random number generator <r>.
 *            Return the shuffled alignment in <xs>,
 *            <ys>. Caller provides allocated space for <xs> and <ys>.
 *
 *            An alphabet <abc> must also be provided, solely for the
 *            definition of gap characters. Because Easel's default
 *            alphabets (DNA, RNA, and protein) all use the same
 *            definition of gap characters <-_.>, you can actually
 *            provide any alphabet here, and get the same results.
 *            (This may save having to determine the alphabet of input
 *            sequences.)
 *
 *            Works by doing three separate
 *            shuffles, of (1) columns with residues in both
 *            <x> and <y>, (2) columns with residue in <x> and gap in <y>,
 *            and (3) columns with gap in <x> and residue in <y>.
 *            Columns that are gaps in both are left where they are.
 *
 *            <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
 *            an alignment "in place", destroying the original
 *            alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
 *
 * Returns:   <eslOK> on success, and the shuffled alignment is
 *            returned in <xs>, <ys>.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if <x> and <y> differ in length; <xs>, <ys>
 *            are not written in that case.
 */
int
esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
{
  size_t xlen;
  int    L;
  int   *xycol = NULL;     /* columns with a residue in both x and y   */
  int   *xcol  = NULL;     /* columns with a residue in x, a gap in y  */
  int   *ycol  = NULL;     /* columns with a gap in x, a residue in y  */
  int    nxy, nx, ny;
  int    i;
  int    xgap, ygap;
  int    status;

  /* Validate before touching the output buffers. */
  xlen = strlen(x);
  if (strlen(y) != xlen) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
  L = (int) xlen;

  if (xs != x) strcpy(xs, x);
  if (ys != y) strcpy(ys, y);

  /* An empty alignment is already shuffled; and we must not make zero-size allocations. */
  if (L == 0) return eslOK;

  /* First, construct three arrays containing lists of the column positions
   * of the three types of columns. (If a column contains gaps in both x and y,
   * we've already simply copied it to the shuffled sequence.)
   */
  ESL_ALLOC(xycol, sizeof(int) * L);
  ESL_ALLOC(xcol,  sizeof(int) * L);
  ESL_ALLOC(ycol,  sizeof(int) * L);
  nxy = nx = ny = 0;

  for (i = 0; i < L; i++)
    {
      xgap = esl_abc_CIsGap(abc, x[i]);
      ygap = esl_abc_CIsGap(abc, y[i]);

      if      (  xgap &&   ygap) { continue;                  }
      else if (! xgap && ! ygap) { xycol[nxy] = i; nxy++;     }
      else if (  xgap)           { ycol[ny]   = i; ny++;      }
      else                       { xcol[nx]   = i; nx++;      }
    }

  /* Second, shuffle the sequences indirectly, one column class at a time.
   * The order of the three calls fixes the order in which random numbers are consumed.
   */
  cqrna_shuffle_columns(r, xs, ys, xycol, nxy);
  cqrna_shuffle_columns(r, xs, ys, xcol,  nx);
  cqrna_shuffle_columns(r, xs, ys, ycol,  ny);

  free(xycol); free(xcol); free(ycol);
  return eslOK;

 ERROR:
  if (xycol != NULL) free(xycol);
  if (xcol  != NULL) free(xcol);
  if (ycol  != NULL) free(ycol);
  return status;
}
/* seq_shuffling()
 * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg]
 *
 * Shuffling of input sequences: read each sequence from the file
 * named by the first command line argument, and write <-N> shuffled
 * versions of it to <ofp> in format <outfmt>, using the randomization
 * method selected by the options in <go>.
 *
 * Fixed-length (L>0) vs. full-length (L=0) modes handled differently.
 * In fixed-length mode:
 *   <shuff->seq> only needs to be allocated once, for L
 *   <targ> is an allocated copy of a random subseq of length L
 *   sequences < L residues long can't be shuffled
 * In full-length mode:
 *   <shuff->seq> is grown to length <sq->n> for each input seq
 *   <targ> just points to <sq->seq>
 *
 * Returns <eslOK> on success. Problems opening or parsing the input
 * are fatal. Returns a non-<eslOK> status if allocating <targ> fails.
 */
static int
seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  char        *seqfile = esl_opt_GetArg(go, 1);
  int          infmt   = eslSQFILE_UNKNOWN;
  ESL_SQFILE  *sqfp    = NULL;
  ESL_SQ      *sq      = esl_sq_Create();
  ESL_SQ      *shuff   = esl_sq_Create();
  char        *targ    = NULL;
  int          N       = esl_opt_GetInteger(go, "-N");
  int          L       = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */
  int          kmers   = 0;
  int          i;
  int          status;

  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
    /* the %s needs its argument: without it, the call has undefined behavior */
    if (infmt == eslSQFILE_UNKNOWN)
      esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
  }

  if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k");

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  if (L>0)
    {
      esl_sq_GrowTo(shuff, L);
      shuff->n = L;
      ESL_ALLOC(targ, sizeof(char) * (L+1));
    }

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      if (L == 0) {                  /* shuffling entire sequence   */
        esl_sq_GrowTo(shuff, sq->n); /* make sure shuff can hold sq */
        shuff->n = sq->n;
        targ = sq->seq;
      } else {
        if (sq->n < L) {             /* reject seqs < L long */
          esl_sq_Reuse(sq);          /* every Read() is paired with a Reuse(), on this path too */
          continue;
        }
      }

      for (i = 0; i < N; i++)
        {
          if (L > 0) {               /* fixed-len mode: copy a random subseq */
            int pos = esl_rnd_Roll(r, sq->n - L + 1);
            memcpy(targ, sq->seq + pos, L);   /* sq->n >= L here, so all L bytes are residues */
            targ[L] = '\0';
          }

          /* Do the requested kind of shuffling */
          if      (esl_opt_GetBoolean(go, "-m"))  esl_rsq_CShuffle     (r, targ,        shuff->seq);  /* monoresidue shuffling */
          else if (esl_opt_GetBoolean(go, "-d"))  esl_rsq_CShuffleDP   (r, targ,        shuff->seq);  /* diresidue shuffling   */
          else if (esl_opt_IsOn      (go, "-k"))  esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq);  /* k-mer shuffling       */
          else if (esl_opt_GetBoolean(go, "-0"))  esl_rsq_CMarkov0     (r, targ,        shuff->seq);  /* 0th order Markov      */
          else if (esl_opt_GetBoolean(go, "-1"))  esl_rsq_CMarkov1     (r, targ,        shuff->seq);  /* 1st order Markov      */
          else if (esl_opt_GetBoolean(go, "-r"))  esl_rsq_CReverse     (   targ,        shuff->seq);  /* reverse               */
          else if (esl_opt_IsOn      (go, "-w")) {                                                    /* regionally shuffle    */
            int W = esl_opt_GetInteger(go, "-w");
            esl_rsq_CShuffleWindows(r, targ, W, shuff->seq);
          }

          /* Set the name of the shuffled sequence */
          if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i);
          else       esl_sq_FormatName(shuff, "%s-shuffled", sq->name);

          /* Output the resulting sequence */
          esl_sqio_Write(ofp, shuff, outfmt, FALSE);

          /* don't need to reuse the shuffled sequence: we will use exactly the same memory */
        }
      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
                                           sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
                                           status, sqfp->filename);

  /* in full-length mode <targ> only borrowed sq->seq; it's ours to free only in fixed-length mode */
  if (L>0) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return eslOK;

 ERROR:
  /* reached only from the ESL_ALLOC above, before <targ> can alias sq->seq */
  if (targ != NULL) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; int nselect = 0; char *filename = NULL; FILE *fp = NULL; char **larr = NULL; char *buf = NULL; int buflen = 0; char *tmp = NULL; int i,j; int n; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); nselect = atoi(esl_opt_GetArg(go, 1)); filename = esl_opt_GetArg(go, 2); r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); if ((larr = malloc(sizeof(char *) * nselect)) == NULL) esl_fatal("allocation failed"); if (strcmp(filename, "-") == 0) fp = stdin; else { if ((fp = fopen(filename, "r")) == NULL) esl_fatal("Failed to open file %s\n", filename); } n = 0; while (esl_fgets(&buf, &buflen, fp) == eslOK) { n++; i = esl_rnd_Roll(r, n); if (i < nselect) { for (j = i; j < nselect && j < n; j++) { tmp = larr[j]; larr[j] = buf; buf = tmp; } free(buf); buf = NULL; buflen = 0; } } for (i = 0; i < nselect; i++) printf("%s", larr[i]); if (fp != stdin) fclose(fp); for (i = 0; i < nselect; i++) free(larr[i]); free(larr); free(buf); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* utest_block()
 *
 * Unit test that reads <tmpfile> through an ESL_RECORDER, marking and
 * retrieving blocks of consecutive lines that start with '#'.
 *
 * Every line of the file is expected to carry its own 0-based line number
 * starting at its second character (checked with atoi(buf+1)). <is_data[i]>
 * is nonzero for each of the <N> lines that is expected to show up inside
 * a retrieved block.
 *
 * Up to two times each, and only while the last read succeeded, the test
 * randomly repositions the recorder within its current window and
 * randomly resizes the recorder's allocation.
 *
 * Any failed check ends the test through esl_fatal().
 */
static void
utest_block(ESL_RANDOMNESS *rng, char *tmpfile, int *is_data, int N)
{
  char          *msg          = "esl_recorder:: block unit test failed";
  ESL_RECORDER  *rc           = NULL;
  FILE          *fp           = NULL;
  int            expected     = 0;	/* line number we expect the next Read() to return */
  int            nrepos_left  = 2;	/* remaining random repositionings allowed         */
  int            nresize_left = 2;	/* remaining random reallocations allowed          */
  int           *nread        = NULL;	/* # of times each line was returned by Read()     */
  int           *nblocked     = NULL;	/* # of times each line was seen inside a block    */
  int            minalloc;
  int            roll;
  int            first, last;
  char          *buf;
  char         **block;
  int            from;
  int            n,i;
  int            status       = eslOK;

  fp = fopen(tmpfile, "r");
  if (fp == NULL) esl_fatal(msg);

  roll = 1 + esl_rnd_Roll(rng, N+1);	/* 1..N+1 */
  rc   = esl_recorder_Create(fp, roll);
  if (rc == NULL) esl_fatal(msg);

  nread = malloc(sizeof(int) * N);
  if (nread == NULL) esl_fatal(msg);
  nblocked = malloc(sizeof(int) * N);
  if (nblocked == NULL) esl_fatal(msg);
  for (i = 0; i < N; i++)
    {
      nread[i]    = 0;
      nblocked[i] = 0;
    }

  while (status == eslOK)
    {
      /* Read forward until we are on a line that starts with '#'. */
      for (;;)
	{
	  if (esl_recorder_Read(rc, &buf) == eslEOF)     goto DONE;
	  if (atoi(buf+1)                 != expected)   esl_fatal(msg);
	  if (esl_recorder_GetCurrent(rc) != expected)   esl_fatal(msg);
	  nread[expected]++;
	  expected++;
	  if (*buf == '#') break;
	}

      /* That line opens a block: mark it, then read until the block ends. */
      from = esl_recorder_GetCurrent(rc);
      esl_recorder_MarkBlock(rc, from);
      for (;;)
	{
	  status = esl_recorder_Read(rc, &buf);
	  if (status == eslEOF) break;
	  if (atoi(buf+1)                 != expected)   esl_fatal(msg);
	  if (esl_recorder_GetCurrent(rc) != expected)   esl_fatal(msg);
	  nread[expected]++;
	  expected++;
	  if (*buf != '#') break;
	}

      /* Fetch the block. If we stopped on a line without '#', rather than
       * on EOF, that last line is not part of the block.
       */
      esl_recorder_GetBlock(rc, &block, NULL, NULL, &n);
      if (status == eslOK) n--;

      /* Each line of the block must be the line number we think it is. */
      for (i = 0; i < n; i++)
	{
	  if (atoi(block[i]+1) != from+i) esl_fatal(msg);
	  nblocked[from+i]++;
	}

      esl_recorder_UnmarkBlock(rc);

      /* One time in five (at most twice), jump to a random line in the window. */
      if (status == eslOK && nrepos_left && (roll = esl_rnd_Roll(rng, 5)) == 0)
	{
	  first    = esl_recorder_GetFirst(rc);
	  last     = esl_recorder_GetLast(rc);
	  expected = first + esl_rnd_Roll(rng, last - first + 1);
	  if (esl_recorder_Position(rc, expected) != eslOK) esl_fatal(msg);
	  nrepos_left--;
	}

      /* One time in five (at most twice), resize the recorder's allocation. */
      if (status == eslOK && nresize_left && (roll = esl_rnd_Roll(rng, 5)) == 0)
	{
	  /* must keep at least nread-ncurr+1 lines, to keep curr line in window */
	  minalloc = rc->nread - rc->ncurr + 1;
	  roll     = minalloc + esl_rnd_Roll(rng, rc->nalloc - minalloc + 1);
	  if (esl_recorder_ResizeTo(rc, roll) != eslOK) esl_fatal(msg);
	  nresize_left--;
	}
    }

 DONE:
  /* At EOF the recorder's current line must be the last line of the file. */
  if (esl_recorder_GetCurrent(rc) != N-1) esl_fatal(msg);

  /* Every line must have been returned by Read() at least once. */
  for (i = 0; i < N; i++)
    if (nread[i] == 0) esl_fatal(msg);

  /* Data lines must have appeared in a block at least once; other lines never. */
  for (i = 0; i < N; i++)
    {
      if (  is_data[i] && ! nblocked[i]) esl_fatal(msg);
      if (! is_data[i] &&   nblocked[i]) esl_fatal(msg);
    }

  fclose(fp);
  esl_recorder_Destroy(rc);
  free(nread);
  free(nblocked);
}
/* ideal_local_endpoints()
 *
 * Purpose:  The "two-step" fragment sampling algorithm. A complete
 *           sequence of length L is emitted from the core model <hmm>;
 *           a fragment <i1..i2> is then chosen uniformly from all
 *           $\frac{L(L+1)}{2}$ possible fragments; finally the local
 *           alignment coordinates with respect to model and sequence
 *           are determined, using the convention that a local
 *           alignment starts and stops on a match state. If the
 *           initially selected i1 or i2 was emitted by an insert
 *           state, the bound is moved inward to the first/last match
 *           state. A fragment with no match state at all is rejected
 *           and the whole procedure is repeated.
 *
 *           The caller provides an allocated sequence <sq> and
 *           traceback <tr> as storage for <p7_CoreEmit()>. On return
 *           they hold the generated global sequence and trace (not a
 *           local trace).
 *
 *           Sequence endpoints are normalized/discretized to
 *           1..<Lbins>, so that statistics can be collated over
 *           sampled sequences of varying L. Note this causes
 *           discretization artifacts, leading to underrepresentation
 *           of j=M and overrepresentation of i=1.
 *
 *           This routine is only intended for collecting endpoint
 *           statistics (i1,i2,k1,k2); it does not generate a local
 *           alignment trace. (xref milestone 2, STL11/115).
 *
 * Returns:  <eslOK> on success; <*ret_i1> and <*ret_i2> are the binned
 *           sequence coords in range <1..Lbins>, and <*ret_k1> and
 *           <*ret_k2> are the model entry/exit coords. By the internal
 *           definition of local alignment endpoints, M_k1 emits
 *           residue x_i1 and M_k2 emits residue x_i2.
 *
 * Throws:   <eslENOHALT> if no fragment containing a match state was
 *           obtained within the failsafe limit of 1000 attempts. Any
 *           non-OK status from <p7_CoreEmit()> is passed on. In both
 *           cases all four results are set to 0.
 *
 * Xref:     STL11/142-143
 */
static int
ideal_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr, int Lbins,
		      int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2)
{
  int status;
  int z;			/* position in the trace                        */
  int i1, i2, k1, k2, t1, t2;
  int all_insert;
  int ntries = 0;		/* failsafe counter for the rejection sampling  */

  do {
    if (ntries++ == 1000) ESL_XEXCEPTION(eslENOHALT, "failed to obtain local alignment that wasn't all inserts");

    status = p7_CoreEmit(r, hmm, sq, tr);
    if (status != eslOK) goto ERROR;

    /* Uniform sampling from the upper triangle, by rejection.
     * This inner loop needs no failsafe of its own.
     */
    do {
      i1 = 1 + esl_rnd_Roll(r, sq->n);
      i2 = 1 + esl_rnd_Roll(r, sq->n);
    } while (i1 > i2);

    /* Locate the trace positions t1,t2: the first position carrying i1
     * (scanning forward) and the last position carrying i2 (scanning
     * backward). k1,k2 are read off the trace after the endpoints have
     * been adjusted below.
     */
    z = 0;
    while (z < tr->N && tr->i[z] != i1) z++;
    if (z < tr->N) t1 = z;

    z = tr->N - 1;
    while (z >= 0 && tr->i[z] != i2) z--;
    if (z >= 0) t2 = z;

    /* Local alignment endpoints are match-delimited: step inward past
     * anything that is not a match state. If the two ends cross, the
     * fragment had no match state at all, and we must resample.
     */
    while (t1 <= t2 && tr->st[t1] != p7T_M) t1++;
    while (t2 >= t1 && tr->st[t2] != p7T_M) t2--;
    all_insert = (t2 < t1) ? TRUE : FALSE;

    i1 = tr->i[t1];
    i2 = tr->i[t2];
    k1 = tr->k[t1];
    k2 = tr->k[t2];
  } while (all_insert);

  /* Normalize sequence coords: 1..L becomes 1..Lbins */
  *ret_i1 = ((i1-1) * Lbins / sq->n) + 1;
  *ret_i2 = ((i2-1) * Lbins / sq->n) + 1;
  *ret_k1 = k1;
  *ret_k2 = k2;
  return eslOK;

 ERROR:
  *ret_i1 = 0;
  *ret_i2 = 0;
  *ret_k1 = 0;
  *ret_k2 = 0;
  return status;
}
/* Function:  p7_hit_TestSample()
 * Synopsis:  Fill a P7_HIT with random, syntactically valid contents.
 *
 * Purpose:   Using random number generator <rng>, sample contents for
 *            one <P7_HIT> and store them in <hit>, which is space
 *            provided by the caller (usually one element of a
 *            <P7_HIT> array the caller holds).
 *
 *            A name is always sampled. An accession and a description
 *            are each sampled only on a coin flip; when the flip
 *            fails, <hit->acc> or <hit->desc> is left as the caller
 *            passed it. Between 1 and 10 domains are created in
 *            <hit->dcl>, each filled by <p7_domain_TestSample()>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if the domain array cannot be created. A non-OK
 *            status from <esl_rsq_Sample()> or
 *            <p7_domain_TestSample()> is passed on unchanged. On
 *            failure nothing inside <hit> is freed here.
 */
int
p7_hit_TestSample(ESL_RANDOMNESS *rng, P7_HIT *hit)
{
  int len;			/* a randomly chosen length */
  int d;
  int status;

  /* Name: always present. */
  len    = 1 + esl_rnd_Roll(rng, 30);
  status = esl_rsq_Sample(rng, eslRSQ_SAMPLE_GRAPH, len, &(hit->name));
  if (status != eslOK) goto ERROR;

  /* Accession: optional. */
  if (esl_rnd_Roll(rng, 2))
    {
      len    = 1 + esl_rnd_Roll(rng, 10);
      status = esl_rsq_Sample(rng, eslRSQ_SAMPLE_ALNUM, len, &(hit->acc));
      if (status != eslOK) goto ERROR;
    }

  /* Description: optional. */
  if (esl_rnd_Roll(rng, 2))
    {
      len    = 1 + esl_rnd_Roll(rng, 120);
      status = esl_rsq_Sample(rng, eslRSQ_SAMPLE_PRINT, len, &(hit->desc));
      if (status != eslOK) goto ERROR;
    }

  /* Scores and log probabilities: each uniform on -1000..1000. */
  hit->window_length = 1 + esl_rnd_Roll(rng, 100000);
  hit->sortkey       = -1000. + 2000. * esl_random(rng);
  hit->score         = -1000. + 2000. * esl_random(rng);
  hit->pre_score     = -1000. + 2000. * esl_random(rng);
  hit->sum_score     = -1000. + 2000. * esl_random(rng);
  hit->lnP           = -1000. + 2000. * esl_random(rng);
  hit->pre_lnP       = -1000. + 2000. * esl_random(rng);
  hit->sum_lnP       = -1000. + 2000. * esl_random(rng);

  hit->ndom          = 1 + esl_rnd_Roll(rng, 10);
  hit->noverlaps     = esl_rnd_Roll(rng, hit->ndom);
  hit->nexpected     = esl_random(rng)*10;

  /* Flags: each one set independently on a coin flip. */
  hit->flags = p7_HITFLAGS_DEFAULT;
  if (esl_rnd_Roll(rng, 2)) hit->flags |= p7_IS_INCLUDED;
  if (esl_rnd_Roll(rng, 2)) hit->flags |= p7_IS_REPORTED;
  if (esl_rnd_Roll(rng, 2)) hit->flags |= p7_IS_NEW;
  if (esl_rnd_Roll(rng, 2)) hit->flags |= p7_IS_DROPPED;
  if (esl_rnd_Roll(rng, 2)) hit->flags |= p7_IS_DUPLICATE;

  hit->nreported     = 1 + esl_rnd_Roll(rng, hit->ndom);
  hit->nincluded     = 1 + esl_rnd_Roll(rng, hit->ndom);
  hit->best_domain   =     esl_rnd_Roll(rng, hit->ndom);

  hit->seqidx        = 1 + esl_rnd_Roll(rng, 1000000);
  hit->subseq_start  = 1 + esl_rnd_Roll(rng, 1000000);
  hit->offset        = 1 + esl_rnd_Roll(rng, 1000000);

  /* Domains. */
  hit->dcl = p7_domain_Create(hit->ndom);
  if (hit->dcl == NULL) { status = eslEMEM; goto ERROR; }

  for (d = 0; d < hit->ndom; d++)
    {
      len    = 1 + esl_rnd_Roll(rng, 100);
      status = p7_domain_TestSample(rng, len, &(hit->dcl[d]));
      if (status != eslOK) goto ERROR;
    }
  return eslOK;

 ERROR:
  /* should free inside hit; caller has the shell of it though */
  return status;
}
/* Step 2. Extract the training set and test set.
 *
 * The sequences of <msa> are clustered by single linkage at threshold
 * <cfg->idthresh1>. The largest cluster becomes the training alignment.
 * The remaining sequences are collected into an alignment of their own
 * and clustered again at <cfg->idthresh2>; from each of those clusters
 * one sequence is picked at random (using <cfg->r>) to be a test
 * sequence.
 *
 * Returns <eslOK> on success. <*ret_trainmsa> is the training alignment
 * and <*ret_teststack> is a pointer stack of the chosen test sequences
 * (ESL_SQ objects); the stack is empty if every sequence went into the
 * training alignment. The caller takes ownership of both.
 *
 * On failure, returns the failing call's status code (<eslEMEM> if the
 * stack cannot be created), releases everything allocated here, and sets
 * <*ret_trainmsa> and <*ret_teststack> to NULL.
 */
static int
separate_sets(struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_trainmsa, ESL_STACK **ret_teststack)
{
  ESL_MSA   *trainmsa   = NULL;
  ESL_MSA   *test_msa   = NULL;
  ESL_STACK *teststack  = NULL;
  ESL_SQ    *sq         = NULL;
  int       *assignment = NULL;
  int       *nin        = NULL;
  int       *useme      = NULL;
  int        nc         = 0;
  int        c;
  int        ctrain;		/* index of the cluster that becomes the training alignment */
  int        nskip;
  int        i;
  int        status;

  if ((teststack = esl_stack_PCreate()) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);

  /* The largest cluster at idthresh1 is the training set. */
  if ((status = esl_msacluster_SingleLinkage(msa, cfg->idthresh1, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  ctrain = esl_vec_IArgMax(nin, nc);

  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] == ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &trainmsa)) != eslOK) goto ERROR;

  /* If all the seqs went into the training msa, none are left for testing; we're done here */
  if (trainmsa->nseq == msa->nseq)
    {
      free(useme);
      free(assignment);
      free(nin);
      *ret_trainmsa  = trainmsa;
      *ret_teststack = teststack;
      return eslOK;
    }

  /* Put all the other sequences into an MSA of their own; from these, we'll
   * choose test sequences.
   */
  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] != ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &test_msa)) != eslOK) goto ERROR;

  /* Cluster those test sequences. */
  free(nin);         nin        = NULL;
  free(assignment);  assignment = NULL;
  if ((status = esl_msacluster_SingleLinkage(test_msa, cfg->idthresh2, &assignment, &nin, &nc)) != eslOK) goto ERROR;

  for (c = 0; c < nc; c++)
    {
      nskip = esl_rnd_Roll(cfg->r, nin[c]); /* pick a random seq in this cluster to be the test. */
      for (i = 0; i < test_msa->nseq; i++)
	{
	  if (assignment[i] != c) continue;
	  if (nskip > 0) { nskip--; continue; }

	  /* This is the chosen member of cluster c. */
	  if ((status = esl_sq_FetchFromMSA(test_msa, i, &sq)) != eslOK) goto ERROR;
	  if ((status = esl_stack_PPush(teststack, (void *) sq)) != eslOK)
	    { /* not on the stack, so the cleanup below would not find it */
	      esl_sq_Destroy(sq);
	      goto ERROR;
	    }
	  break;
	}
    }

  esl_msa_Destroy(test_msa);
  free(useme);
  free(nin);
  free(assignment);

  *ret_trainmsa  = trainmsa;
  *ret_teststack = teststack;
  return eslOK;

 ERROR:
  free(useme);
  free(assignment);
  free(nin);
  esl_msa_Destroy(trainmsa);
  esl_msa_Destroy(test_msa);
  /* teststack is NULL if its creation was what failed; don't pop from it then. */
  if (teststack != NULL)
    {
      while (esl_stack_PPop(teststack, (void **) &sq) == eslOK) esl_sq_Destroy(sq);
      esl_stack_Destroy(teststack);
    }
  *ret_trainmsa  = NULL;
  *ret_teststack = NULL;
  return status;
}
/* synthesize_positives()
 *
 * Build synthetic positive test sequences from the domains on <teststack>.
 * Each test sequence contains one domain if --single is set, else two.
 *
 * While the stack holds enough domains, one test sequence is made per
 * iteration: its total length L is taken from a randomly chosen entry of
 * the sequence database index, the domain(s) are embedded at random
 * positions, and the flanking regions are filled in by
 * set_random_segment(). A summary line is written to <cfg->possummfp>,
 * the segment lengths are recorded in <cfg->test_lens>, and the sequence
 * is written in FASTA format to <cfg->out_seqfp>. The domains that were
 * used are destroyed.
 *
 * Returns <eslOK> on success, with the number of test sequences made in
 * this call in <*ret_ntest>; <cfg->ntest> is advanced by the same amount.
 * Failures are fatal.
 */
static int
synthesize_positives(ESL_GETOPTS *go, struct cfg_s *cfg, char *testname, ESL_STACK *teststack, int *ret_ntest)
{
  ESL_SQ  *domain1, *domain2;
  ESL_SQ  *sq;
  void    *p;
  int64_t  L;			/* total length of synthetic test seq            */
  int64_t  Lflank;		/* L' = total length of nonhomologous test seq   */
  int      d1n, d2n;		/* lengths of two domains                        */
  int      L1,L2,L3;		/* lengths of three random regions               */
  int      i,j;
  int      ntest = 0;
  int      ndomains;
  int      status;

  if (esl_opt_GetBoolean(go, "--single") == TRUE) ndomains = 1;
  else                                            ndomains = 2;

  while (esl_stack_ObjectCount(teststack) >= ndomains)
    {
      /* Make room for one more record of segment lengths. */
      ESL_RALLOC(cfg->test_lens, p, (cfg->ntest+1) * sizeof(struct testseq_s));

      /* Pop our one or two test domains off the stack */
      esl_stack_PPop(teststack, &p);
      domain1 = p;
      d1n     = domain1->n;

      domain2 = NULL;
      d2n     = 0;
      if (ndomains == 2)
	{
	  esl_stack_PPop(teststack, &p);
	  domain2 = p;
	  d2n     = domain2->n;
	}

      /* Select a random total sequence length, long enough to hold the domain(s) */
      if (d1n+d2n > cfg->db_maxL) esl_fatal("can't construct test seq; no db seq >= %d residues\n", d1n+d2n);
      do {
	if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &L, NULL) != eslOK)
	  esl_fatal("failed to look up a random seq");
      } while (L < d1n+d2n);

      /* Now figure out the embedding */
      Lflank = L - d1n - d2n;
      if (ndomains == 2)
	{
	  /* Pick two "insert after" points i <= j in 0..L', by rejection. Then:
	   *   1           .. i            = random region 1 (none if i == 0)
	   *   i+1         .. i+d1n        = domain 1
	   *   i+d1n+1     .. j+d1n        = random region 2 (none if i == j)
	   *   j+d1n+1     .. j+d1n+d2n    = domain 2
	   *   j+d1n+d2n+1 .. L            = random region 3 (none if j == L')
	   */
	  do {
	    i = esl_rnd_Roll(cfg->r, Lflank + 1 ); /* i = 0..L' */
	    j = esl_rnd_Roll(cfg->r, Lflank + 1 ); /* j = 0..L' */
	  } while (i > j);

	  L1 = i;
	  L2 = j-i;
	  L3 = Lflank - j;
	}
      else
	{
	  /* Embedding one domain: pick one "insert after" point i in 0..L'. Then:
	   *   1       .. i     = random region 1 (none if i == 0)
	   *   i+1     .. i+d1n = domain 1
	   *   i+d1n+1 .. L     = random region 2 (none if i == L')
	   */
	  i  = esl_rnd_Roll(cfg->r, Lflank + 1 ); /* i = 0..L' */
	  L1 = i;
	  L2 = Lflank - L1;
	  L3 = 0;
	}

      /* Create the sequence, and name it after the domain coordinates. */
      sq = esl_sq_CreateDigital(cfg->abc);
      esl_sq_GrowTo(sq, L);
      sq->n = L;
      if (ndomains == 2)
	{
	  esl_sq_FormatName(sq, "%s/%d/%d-%d/%d-%d", testname, cfg->ntest, i+1, i+d1n, j+d1n+1, j+d1n+d2n);
	  esl_sq_FormatDesc(sq, "domains: %s %s", domain1->name, domain2->name);
	}
      else
	{
	  esl_sq_FormatName(sq, "%s/%d/%d-%d", testname, cfg->ntest, i+1, i+d1n);
	  esl_sq_FormatDesc(sq, "domain: %s", domain1->name);
	}

      /* Summary line starts with the lengths; each segment then adds to it, in order. */
      fprintf(cfg->possummfp, "%-35s %5d %5d %5d %5d %5d %5d",
	      sq->name, (int) sq->n, L1, d1n, L2, d2n, L3);

      sq->dsq[0] = sq->dsq[L+1] = eslDSQ_SENTINEL;

      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+1, L1);

      memcpy(sq->dsq+i+1, domain1->dsq+1, sizeof(ESL_DSQ) * d1n);
      fprintf(cfg->possummfp, " %-24s %5d %5d", domain1->name, 1, d1n);

      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+i+d1n+1, L2);

      if (ndomains == 2)
	{
	  memcpy(sq->dsq+j+d1n+1, domain2->dsq+1, sizeof(ESL_DSQ) * d2n);
	  fprintf(cfg->possummfp, " %-24s %5d %5d", domain2->name, 1, d2n);

	  set_random_segment(go, cfg, cfg->possummfp, sq->dsq+j+d1n+d2n+1, L3);
	}
      fprintf(cfg->possummfp, "\n");

      /* Record the layout of this test sequence. */
      cfg->test_lens[cfg->ntest].L   = L;
      cfg->test_lens[cfg->ntest].L1  = L1;
      cfg->test_lens[cfg->ntest].d1n = d1n;
      cfg->test_lens[cfg->ntest].L2  = L2;
      cfg->test_lens[cfg->ntest].d2n = d2n;
      cfg->test_lens[cfg->ntest].L3  = L3;
      cfg->ntest++;
      ntest++;

      esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE);

      esl_sq_Destroy(domain1);
      if (ndomains == 2) esl_sq_Destroy(domain2);
      esl_sq_Destroy(sq);
    }

  *ret_ntest = ntest;
  return eslOK;

 ERROR:
  esl_fatal("Failure in synthesize_positives");
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_RANDOMNESS *r = NULL; char **as = NULL; /* aligned character seqs (random, iid) */ int N,L; /* # of seqs, and their aligned lengths */ int seed; int i,j; int status; double p[4]; /* ACGT probabilities */ #ifdef eslAUGMENT_ALPHABET ESL_DSQ **ax = NULL; /* digitized alignment */ ESL_ALPHABET *abc = NULL; #endif /* Process command line */ go = esl_getopts_Create(options); esl_opt_ProcessCmdline(go, argc, argv); esl_opt_VerifyConfig(go); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } L = esl_opt_GetInteger(go, "-L"); N = esl_opt_GetInteger(go, "-N"); seed = esl_opt_GetInteger(go, "--seed"); if (esl_opt_ArgNumber(go) != 0) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } esl_getopts_Destroy(go); /* Create a random DNA alignment; * force it to obey the conventions of the unit tests: * 0,1 are identical * 0,2 are completely dissimilar */ r = esl_randomness_Create(seed); for (i = 0; i < 4; i++) p[i] = 0.25; ESL_ALLOC(as, sizeof(char *) * N); for (i = 0; i < N; i++) ESL_ALLOC(as[i], sizeof(char) * (L+1)); esl_rsq_IID(r, "ACGT", p, 4, L, as[0]); strcpy(as[1], as[0]); esl_rsq_IID(r, "ACGT", p, 4, L, as[2]); for (j = 0; j < L; j++) while (as[2][j] == as[0][j]) as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)]; for (i = 3; i < N; i++) esl_rsq_IID(r, "ACGT", p, 4, L, as[i]); #ifdef eslAUGMENT_ALPHABET abc = esl_alphabet_Create(eslDNA); ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N); for (i = 0; i < N; i++) esl_abc_CreateDsq(abc, as[i], &(ax[i])); #endif /*eslAUGMENT_ALPHABET*/ /* Unit tests */ if (utest_CPairId(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantor(4, as, N) != eslOK) return eslFAIL; #ifdef eslAUGMENT_ALPHABET if (utest_XPairId(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL; #endif 
/*eslAUGMENT_ALPHABET*/ #ifdef eslAUGMENT_DMATRIX if (utest_CPairIdMx(as, N) != eslOK) return eslFAIL; if (utest_CDiffMx(as, N) != eslOK) return eslFAIL; if (utest_CJukesCantorMx(4, as, N) != eslOK) return eslFAIL; #endif /* eslAUGMENT_DMATRIX*/ #if defined (eslAUGMENT_ALPHABET) && defined (eslAUGMENT_DMATRIX) if (utest_XPairIdMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XDiffMx(abc, as, ax, N) != eslOK) return eslFAIL; if (utest_XJukesCantorMx(abc, as, ax, N) != eslOK) return eslFAIL; #endif esl_randomness_Destroy(r); esl_Free2D((void **) as, N); #ifdef eslAUGMENT_ALPHABET esl_alphabet_Destroy(abc); esl_Free2D((void **) ax, N); #endif return eslOK; ERROR: return eslFAIL; }