static void emit_alignment(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm) { ESL_MSA *msa = NULL; ESL_SQ **sq = NULL; P7_TRACE **tr = NULL; int N = esl_opt_GetInteger(go, "-N"); int optflags = p7_ALL_CONSENSUS_COLS; int i; if ((tr = malloc(sizeof(P7_TRACE *) * N)) == NULL) esl_fatal("failed to allocate trace array"); if ((sq = malloc(sizeof(ESL_SQ *) * N)) == NULL) esl_fatal("failed to allocate seq array"); for (i = 0; i < N; i++) { if ((sq[i] = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate seq"); if ((tr[i] = p7_trace_Create()) == NULL) esl_fatal("failed to allocate trace"); } for (i = 0; i < N; i++) { if (p7_CoreEmit(r, hmm, sq[i], tr[i]) != eslOK) esl_fatal("Failed to emit sequence"); if (esl_sq_FormatName(sq[i], "%s-sample%d", hmm->name, i+1) != eslOK) esl_fatal("Failed to set sequence name\n"); } p7_tracealign_Seqs(sq, tr, N, hmm->M, optflags, hmm, &msa); eslx_msafile_Write(ofp, msa, outfmt); for (i = 0; i < N; i++) p7_trace_Destroy(tr[i]); free(tr); for (i = 0; i < N; i++) esl_sq_Destroy(sq[i]); free(sq); esl_msa_Destroy(msa); return; }
static void onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end) { int start, end; int do_revcomp; ESL_SQ *sq = esl_sq_Create(); if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index"); /* reverse complement indicated by coords. */ /* -c 52: would be 52,0, so watch out for given_end = 0 case */ if (given_end != 0 && given_start > given_end) { start = given_end; end = given_start; do_revcomp = TRUE; } else { start = given_start; end = given_end; do_revcomp = FALSE; } if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK) esl_fatal(esl_sqfile_GetErrorBuf(sqfp)); if (newname != NULL) esl_sq_SetName(sq, newname); else esl_sq_FormatName(sq, "%s/%d-%d", key, given_start, (given_end == 0) ? sq->L : given_end); /* Two ways we might have been asked to revcomp: by coord, or by -r option */ /* (If both happen, they'll cancel each other out) */ if (do_revcomp) if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name); if (esl_opt_GetBoolean(go, "-r")) if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name); esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE); esl_sq_Destroy(sq); }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); char *hmmfile = esl_opt_GetArg(go, 1); int L = esl_opt_GetInteger(go, "-L"); int N = esl_opt_GetInteger(go, "-N"); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_TRACE *tr = p7_trace_Create(); ESL_SQ *sq = NULL; char errbuf[eslERRBUFSIZE]; int i; int status; status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", hfp->fname); p7_hmmfile_Close(hfp); bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL); sq = esl_sq_CreateDigital(abc); for (i = 0; i < N; i++) { p7_ProfileEmit(rng, hmm, gm, bg, sq, tr); esl_sq_FormatName(sq, "%s-sample%d", hmm->name, i); esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE); if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) esl_fatal(errbuf); esl_sq_Reuse(sq); p7_trace_Reuse(tr); } esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); return 0; }
static void emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm) { ESL_SQ *sq = NULL; P7_TRACE *tr = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; int do_profile = esl_opt_GetBoolean(go, "-p"); int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); int mode = p7_LOCAL; int nseq; int status; if (esl_opt_GetBoolean(go, "--local")) mode = p7_LOCAL; else if (esl_opt_GetBoolean(go, "--unilocal")) mode = p7_UNILOCAL; else if (esl_opt_GetBoolean(go, "--glocal")) mode = p7_GLOCAL; else if (esl_opt_GetBoolean(go, "--uniglocal")) mode = p7_UNIGLOCAL; if ((sq = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate sequence"); if ((tr = p7_trace_Create()) == NULL) esl_fatal("failed to allocate trace"); if ((bg = p7_bg_Create(hmm->abc)) == NULL) esl_fatal("failed to create null model"); if ((gm = p7_profile_Create(hmm->M, hmm->abc)) == NULL) esl_fatal("failed to create profile"); if (p7_ProfileConfig(hmm, bg, gm, L, mode) != eslOK) esl_fatal("failed to configure profile"); if (p7_bg_SetLength(bg, L) != eslOK) esl_fatal("failed to reconfig null model length"); if (p7_hmm_Validate (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!"); if (p7_profile_Validate(gm, NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!"); for (nseq = 1; nseq <= N; nseq++) { if (do_profile) status = p7_ProfileEmit(r, hmm, gm, bg, sq, tr); else status = p7_CoreEmit (r, hmm, sq, tr); if (status) esl_fatal("Failed to emit sequence\n"); status = esl_sq_FormatName(sq, "%s-sample%d", hmm->name, nseq); if (status) esl_fatal("Failed to set sequence name\n"); status = esl_sqio_Write(ofp, sq, outfmt, FALSE); if (status != eslOK) esl_fatal("Failed to write sequence\n"); p7_trace_Reuse(tr); esl_sq_Reuse(sq); } esl_sq_Destroy(sq); p7_trace_Destroy(tr); p7_bg_Destroy(bg); p7_profile_Destroy(gm); return; }
/* seq_generation() * * Generating sequences. */ static int seq_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt) { ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; double *fq = NULL; int alphatype = eslUNKNOWN; // static checkers can't see that 1 of --rna, --dna, --amino must be true int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); int i; int status; if (L <= 0) esl_fatal("To generate sequences, set -L option (length of generated seqs) > 0 "); if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA; if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA; if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO; abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); esl_sq_GrowTo(sq, L); /* Pick the iid frequency distribution to use */ ESL_ALLOC(fq, sizeof(double) * abc->K); switch (alphatype) { case eslRNA: case eslDNA: esl_vec_DSet(fq, 4, 0.25); break; case eslAMINO: esl_composition_SW34(fq); break; default: esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K); break; } /* generate */ for (i = 0; i < N; i++) { esl_rsq_xIID(r, fq, abc->K, L, sq->dsq); if (N > 1) esl_sq_FormatName(sq, "random%d", i); else esl_sq_SetName(sq, "random"); sq->n = L; esl_sqio_Write(ofp, sq, outfmt, FALSE); } free(fq); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); return eslOK; ERROR: if (fq != NULL) free(fq); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); return status; }
static void emit_consensus(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm) { ESL_SQ *sq = NULL; if ((sq = esl_sq_CreateDigital(hmm->abc)) == NULL) esl_fatal("failed to allocate sequence"); if (p7_emit_SimpleConsensus(hmm, sq) != eslOK) esl_fatal("failed to create simple consensus seq"); if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name"); if (esl_sqio_Write(ofp, sq, outfmt, FALSE) != eslOK) esl_fatal("failed to write sequence"); esl_sq_Destroy(sq); return; }
static void emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm) { ESL_SQ *sq = NULL; float minl = esl_opt_GetReal(go, "--minl"); float minu = esl_opt_GetReal(go, "--minu"); if ((sq = esl_sq_Create()) == NULL) esl_fatal("failed to allocate sequence"); if (p7_emit_FancyConsensus(hmm, minl, minu, sq) != eslOK) esl_fatal("failed to create consensus seq"); if (esl_sq_FormatName(sq, "%s-consensus", hmm->name) != eslOK) esl_fatal("failed to set sequence name"); if (esl_sqio_Write(ofp, sq, outfmt, FALSE) != eslOK) esl_fatal("failed to write sequence"); esl_sq_Destroy(sq); return; }
static int synthesize_negatives(ESL_GETOPTS *go, struct cfg_s *cfg, int nneg) { ESL_SQ *sq = esl_sq_CreateDigital(cfg->abc); int a; int i; int L1,L2,L3,d1n,d2n; for (i = 0; i < nneg; i++) { /* Select a random test seq, to use its same segments */ a = esl_rnd_Roll(cfg->r, cfg->ntest); L1 = cfg->test_lens[a].L1; L2 = cfg->test_lens[a].L2; L3 = cfg->test_lens[a].L3; d1n = cfg->test_lens[a].d1n; d2n = cfg->test_lens[a].d2n; esl_sq_GrowTo(sq, cfg->test_lens[a].L); esl_sq_FormatName(sq, "decoy%d", i+1); esl_sq_FormatDesc(sq, "L=%d in segments: %d/%d/%d/%d/%d", cfg->test_lens[a].L, L1, d1n, L2, d2n, L3); sq->n = cfg->test_lens[a].L; fprintf(cfg->negsummfp, "%-15s %5d %5d %5d %5d %5d %5d", sq->name, (int) sq->n, L1, d1n, L2, d2n, L3); sq->dsq[0] = sq->dsq[cfg->test_lens[a].L+1] = eslDSQ_SENTINEL; set_random_segment(go, cfg, cfg->negsummfp, sq->dsq+1, L1); set_random_segment(go, cfg, cfg->negsummfp, sq->dsq+1+L1, d1n); set_random_segment(go, cfg, cfg->negsummfp, sq->dsq+1+L1+d1n, L2); set_random_segment(go, cfg, cfg->negsummfp, sq->dsq+1+L1+d1n+L2, d2n); set_random_segment(go, cfg, cfg->negsummfp, sq->dsq+1+L1+d1n+L2+d2n, L3); fprintf(cfg->negsummfp, "\n"); esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE); esl_sq_Reuse(sq); } esl_sq_Destroy(sq); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; char *ghmmfile = esl_opt_GetArg(go, 1); /* HMMs parameterized for sequence generation */ char *ahmmfile = esl_opt_GetArg(go, 2); /* HMMs parameterized for alignment */ int N = esl_opt_GetInteger(go, "-N"); P7_HMMFILE *ghfp = NULL; P7_HMMFILE *ahfp = NULL; P7_HMM *ghmm = NULL; P7_HMM *ahmm = NULL; P7_PROFILE *ggm = NULL; P7_PROFILE *agm = NULL; P7_OPROFILE *aom = NULL; P7_BG *bg = NULL; ESL_SQ *sq = NULL; P7_TRACE *reftr = p7_trace_Create(); P7_TRACE *testtr = p7_trace_Create(); P7_TRACE_METRICS *tmetrics = p7_trace_metrics_Create(); P7_REFMX *rmx = p7_refmx_Create(100,100); // P7_FILTERMX *ox = NULL; P7_HARDWARE *hw; if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure"); P7_SPARSEMASK *sm = p7_sparsemask_Create(100, 100, hw->simd); P7_SPARSEMX *sxv = p7_sparsemx_Create(NULL); int idx; char errbuf[eslERRBUFSIZE]; int status; p7_Init(); /* open HMM file containing models parameterized for generation (sampling) of seqs */ status = p7_hmmfile_OpenE(ghmmfile, NULL, &ghfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ghmmfile, errbuf); /* open HMM file containing models parameterized for alignment (may be the same as ghmmfile) */ status = p7_hmmfile_OpenE(ahmmfile, NULL, &ahfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ahmmfile, errbuf); while ( (status = p7_hmmfile_Read(ghfp, &abc, &ghmm)) == eslOK) /* <abc> gets set on first read */ { /* read the counterpart HMM from <ahfp> */ status = p7_hmmfile_Read(ahfp, &abc, &ahmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ahfp->fname, ahfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ahfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", ahfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", ahfp->fname); /* try to validate that they're the "same" */ if (ahmm->M != ghmm->M || strcmp(ahmm->name, ghmm->name) != 0) p7_Fail("<gen-hmmfile>, <ali-hmmfile> contain different set or order of models"); /* deferred one-time creation of structures that need to know the alphabet */ if (!bg) bg = p7_bg_Create(abc); if (!sq) sq = esl_sq_CreateDigital(abc); ggm = p7_profile_Create(ghmm->M, abc); agm = p7_profile_Create(ahmm->M, abc); aom = p7_oprofile_Create(ahmm->M, abc, hw->simd); p7_profile_ConfigCustom(ggm, ghmm, bg, esl_opt_GetInteger(go, "--gL"), esl_opt_GetReal(go, "--gnj"), esl_opt_GetReal(go, "--gpglocal")); p7_profile_ConfigCustom(agm, ahmm, bg, 100, esl_opt_GetReal(go, "--anj"), esl_opt_GetReal(go, "--apglocal")); p7_oprofile_Convert(agm, aom); for (idx = 1; idx <= N; idx++) { p7_ProfileEmit(rng, ghmm, ggm, bg, sq, reftr); if (esl_opt_GetBoolean(go, "--dumpseqs")) { esl_sq_FormatName(sq, "seq%d", idx); esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE); } p7_bg_SetLength(bg, sq->n); p7_profile_SetLength(agm, sq->n); p7_sparsemask_Reinit(sm, agm->M, sq->n); p7_sparsemask_AddAll(sm); if (esl_opt_GetBoolean(go, "--vit")) p7_ReferenceViterbi(sq->dsq, sq->n, agm, rmx, testtr, /*opt_vsc=*/NULL); else p7_SparseViterbi (sq->dsq, sq->n, agm, sm, sxv, testtr, /*opt_vsc=*/NULL); p7_trace_metrics(reftr, testtr, tmetrics); p7_sparsemask_Reuse(sm); p7_sparsemx_Reuse(sxv); //p7_filtermx_Reuse(ox); p7_refmx_Reuse(rmx); esl_sq_Reuse(sq); p7_trace_Reuse(reftr); p7_trace_Reuse(testtr); } p7_oprofile_Destroy(aom); p7_profile_Destroy(ggm); p7_profile_Destroy(agm); p7_hmm_Destroy(ghmm); p7_hmm_Destroy(ahmm); } /* we leave the loop with <status> set by a p7_hmmfile_Read() on ghfp; if all is well, status=eslEOF */ if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ghfp->fname, ghfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ghfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslEOF) p7_Fail("Unexpected error in reading HMMs from %s\n", ghfp->fname); p7_trace_metrics_Dump(stdout, tmetrics); p7_hmmfile_Close(ghfp); p7_hmmfile_Close(ahfp); // p7_filtermx_Destroy(ox); p7_sparsemask_Destroy(sm); p7_sparsemx_Destroy(sxv); p7_refmx_Destroy(rmx); p7_trace_metrics_Destroy(tmetrics); p7_trace_Destroy(testtr); p7_trace_Destroy(reftr); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); }
/* Function: p7_alidisplay_Backconvert() * Synopsis: Convert an alidisplay to a faux trace and subsequence. * Incept: SRE, Wed Dec 10 09:49:28 2008 [Janelia] * * Purpose: Convert alignment display object <ad> to a faux subsequence * and faux subsequence trace, returning them in <ret_sq> and * <ret_tr>. * * The subsequence <*ret_sq> is digital; ascii residues in * <ad> are digitized using digital alphabet <abc>. * * The subsequence and trace are suitable for passing as * array elements to <p7_MultipleAlignment>. This is the * main purpose of backconversion. Results of a profile * search are stored in a hit list as a processed * <P7_ALIDISPLAY>, not as a <P7_TRACE> and <ESL_SQ>, to * reduce space and to reduce communication overhead in * parallelized search implementations. After reduction * to a final hit list, a master may want to construct a * multiple alignment of all the significant hits. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failures. <eslECORRUPT> on unexpected internal * data corruption. On any exception, <*ret_sq> and <*ret_tr> are * <NULL>. * * Xref: J4/29. */ int p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr) { ESL_SQ *sq = NULL; /* RETURN: faux subsequence */ P7_TRACE *tr = NULL; /* RETURN: faux trace */ int subL = 0; /* subsequence length in the <ad> */ int a, i, k; /* coords for <ad>, <sq->dsq>, model */ char st; /* state type: MDI */ int status; /* Make a first pass over <ad> just to calculate subseq length */ for (a = 0; a < ad->N; a++) if (! esl_abc_CIsGap(abc, ad->aseq[a])) subL++; /* Allocations */ if ((sq = esl_sq_CreateDigital(abc)) == NULL) { status = eslEMEM; goto ERROR; } if ((status = esl_sq_GrowTo(sq, subL)) != eslOK) goto ERROR; if ((tr = (ad->ppline == NULL) ? p7_trace_Create() : p7_trace_CreateWithPP()) == NULL) { status = eslEMEM; goto ERROR; } if ((status = p7_trace_GrowTo(tr, subL+6)) != eslOK) goto ERROR; /* +6 is for SNB/ECT */ /* Construction of dsq, trace */ sq->dsq[0] = eslDSQ_SENTINEL; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_S, 0, 0) : p7_trace_AppendWithPP(tr, p7T_S, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_N, 0, 0) : p7_trace_AppendWithPP(tr, p7T_N, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_B, 0, 0) : p7_trace_AppendWithPP(tr, p7T_B, 0, 0, 0.0))) != eslOK) goto ERROR; k = ad->hmmfrom; i = 1; for (a = 0; a < ad->N; a++) { if (esl_abc_CIsResidue(abc, ad->model[a])) { st = (esl_abc_CIsResidue(abc, ad->aseq[a]) ? p7T_M : p7T_D); } else st = p7T_I; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, st, k, i) : p7_trace_AppendWithPP(tr, st, k, i, p7_alidisplay_DecodePostProb(ad->ppline[a])))) != eslOK) goto ERROR; switch (st) { case p7T_M: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); k++; i++; break; case p7T_I: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); i++; break; case p7T_D: k++; break; } } if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_E, 0, 0) : p7_trace_AppendWithPP(tr, p7T_E, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_C, 0, 0) : p7_trace_AppendWithPP(tr, p7T_C, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_T, 0, 0) : p7_trace_AppendWithPP(tr, p7T_T, 0, 0, 0.0))) != eslOK) goto ERROR; sq->dsq[i] = eslDSQ_SENTINEL; /* some sanity checks */ if (tr->N != ad->N + 6) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace ended up with unexpected size (%s/%s)", ad->sqname, ad->hmmname); if (k != ad->hmmto + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace didn't end at expected place on model (%s/%s)", ad->sqname, ad->hmmname); if (i != subL + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted subseq didn't end at expected length (%s/%s)", ad->sqname, ad->hmmname); /* Set up <sq> annotation as a subseq of a source sequence */ if ((status = esl_sq_FormatName(sq, "%s/%ld-%ld", ad->sqname, ad->sqfrom, ad->sqto)) != eslOK) goto ERROR; if ((status = esl_sq_FormatDesc(sq, "[subseq from] %s", ad->sqdesc[0] != '\0' ? ad->sqdesc : ad->sqname)) != eslOK) goto ERROR; if ((status = esl_sq_SetSource (sq, ad->sqname)) != eslOK) goto ERROR; if (ad->sqacc[0] != '\0') { if ((status = esl_sq_SetAccession (sq, ad->sqacc)) != eslOK) goto ERROR; } sq->n = subL; sq->start = ad->sqfrom; sq->end = ad->sqto; sq->C = 0; sq->W = subL; sq->L = ad->L; tr->M = ad->M; tr->L = ad->L; *ret_sq = sq; *ret_tr = tr; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (tr != NULL) p7_trace_Destroy(tr); *ret_sq = NULL; *ret_tr = NULL; return status; }
/* seq_shuffling() * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg] * * Shuffling of input sequences. * * Fixed-length (L>0) vs. full-length (L=0) modes handled differently. * In fixed-length mode: * <shuff->seq> only needs to be allocated once, for L * <targ> is an allocated copy of a random subseq of length L * sequences < L residues long can't be shuffled * In full-length mode: * <shuff->seq> is grown to length <sq->n> for each input seq * <targ> just points to <sq->seq> */ static int seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt) { char *seqfile = esl_opt_GetArg(go, 1); int infmt = eslSQFILE_UNKNOWN; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = esl_sq_Create(); ESL_SQ *shuff = esl_sq_Create(); char *targ = NULL; int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */ int kmers = 0; int i; int status; if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k"); status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); if (L>0) { esl_sq_GrowTo(shuff, L); shuff->n = L; ESL_ALLOC(targ, sizeof(char) * (L+1)); } while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { if (L == 0) { /* shuffling entire sequence */ esl_sq_GrowTo(shuff, sq->n); /* make sure shuff can hold sq */ shuff->n = sq->n; targ = sq->seq; } else { if (sq->n < L) continue; /* reject seqs < L long */ } for (i = 0; i < N; i++) { if (L > 0) { /* fixed-len mode: copy a random subseq */ int pos = esl_rnd_Roll(r, sq->n - L + 1); strncpy(targ, sq->seq + pos, L); targ[L] = '\0'; } /* Do the requested kind of shuffling */ if (esl_opt_GetBoolean(go, "-m")) esl_rsq_CShuffle (r, targ, shuff->seq); /* monoresidue shuffling */ else if (esl_opt_GetBoolean(go, "-d")) esl_rsq_CShuffleDP (r, targ, shuff->seq); /* diresidue shuffling */ else if (esl_opt_IsOn (go, "-k")) esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq); /* diresidue shuffling */ else if (esl_opt_GetBoolean(go, "-0")) esl_rsq_CMarkov0 (r, targ, shuff->seq); /* 0th order Markov */ else if (esl_opt_GetBoolean(go, "-1")) esl_rsq_CMarkov1 (r, targ, shuff->seq); /* 1st order Markov */ else if (esl_opt_GetBoolean(go, "-r")) esl_rsq_CReverse ( targ, shuff->seq); /* reverse */ else if (esl_opt_IsOn (go, "-w")) { /* regionally shuffle */ int W= esl_opt_GetInteger(go, "-w"); esl_rsq_CShuffleWindows(r, targ, W, shuff->seq); } /* Set the name of the shuffled sequence */ if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i); else esl_sq_FormatName(shuff, "%s-shuffled", sq->name); /* Output the resulting sequence */ esl_sqio_Write(ofp, shuff, outfmt, FALSE); /* don't need to reuse the shuffled sequence: we will use exactly the same memory */ } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); if (L>0) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return eslOK; ERROR: if (targ != NULL) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return status; }
/* Each test sequence will contain one or two domains, depending on whether --single is set. */ static int synthesize_positives(ESL_GETOPTS *go, struct cfg_s *cfg, char *testname, ESL_STACK *teststack, int *ret_ntest) { ESL_SQ *domain1, *domain2; ESL_SQ *sq; void *p; int64_t L; /* total length of synthetic test seq */ int d1n, d2n; /* lengths of two domains */ int L1,L2,L3; /* lengths of three random regions */ int i,j; int ntest = 0; int ndomains = ( (esl_opt_GetBoolean(go, "--single") == TRUE) ? 1 : 2); int status; while (esl_stack_ObjectCount(teststack) >= ndomains) { ESL_RALLOC(cfg->test_lens, p, (cfg->ntest+1) * sizeof(struct testseq_s)); /* Pop our one or two test domains off the stack */ esl_stack_PPop(teststack, &p); domain1 = p; d1n = domain1->n; if (ndomains == 2) { esl_stack_PPop(teststack, &p); domain2 = p; d2n = domain2->n; } else { domain2 = NULL; d2n = 0; } /* Select a random total sequence length */ if (d1n+d2n > cfg->db_maxL) esl_fatal("can't construct test seq; no db seq >= %d residues\n", d1n+d2n); do { if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &L, NULL) != eslOK) esl_fatal("failed to look up a random seq"); } while (L < d1n+d2n); /* Now figure out the embedding */ if (ndomains == 2) { /* Select random lengths of three flanking domains; * Imagine picking two "insert after" points i,j in sequence 1..L', for * L' = L-d1n-d2n (the total length of nonhomologous test seq) */ do { i = esl_rnd_Roll(cfg->r, L - d1n - d2n + 1 ); /* i = 0..L' */ j = esl_rnd_Roll(cfg->r, L - d1n - d2n + 1 ); /* j = 0..L' */ } while (i > j); /* now 1 .. i = random region 1 (if i==0, there's none); * i+1 .. i+d1n = domain 1 * i+d1n+1 .. j+d1n = random region 2 (if i==j, there's none); * j+d1n+1 .. j+d1n+d2n = domain 2 * j+d1n+d2n+1 .. L = random region 3 (if j == L-d1n-d2n, there's none); */ L1 = i; L2 = j-i; L3 = L - d1n - d2n - j; } else { /* embedding one domain */ i = esl_rnd_Roll(cfg->r, L - d1n + 1 ); /* i = 0..L' */ /* now 1 .. i = random region 1 (if i==0, there's none); * i+1 .. i+d1n = domain 1 * i+d1n+1 .. L = random region 2 (if i==j, there's none); */ L1 = i; L2 = L - d1n - L1; L3 = 0; } sq = esl_sq_CreateDigital(cfg->abc); esl_sq_GrowTo(sq, L); sq->n = L; if (ndomains == 2) { esl_sq_FormatName(sq, "%s/%d/%d-%d/%d-%d", testname, cfg->ntest, i+1, i+d1n, j+d1n+1, j+d1n+d2n); esl_sq_FormatDesc(sq, "domains: %s %s", domain1->name, domain2->name); } else { esl_sq_FormatName(sq, "%s/%d/%d-%d", testname, cfg->ntest, i+1, i+d1n); esl_sq_FormatDesc(sq, "domain: %s", domain1->name); } fprintf(cfg->possummfp, "%-35s %5d %5d %5d %5d %5d %5d", sq->name, (int) sq->n, L1, d1n, L2, d2n, L3); sq->dsq[0] = sq->dsq[L+1] = eslDSQ_SENTINEL; set_random_segment(go, cfg, cfg->possummfp, sq->dsq+1, L1); memcpy(sq->dsq+i+1, domain1->dsq+1, sizeof(ESL_DSQ) * d1n); fprintf(cfg->possummfp, " %-24s %5d %5d", domain1->name, 1, d1n); set_random_segment(go, cfg, cfg->possummfp, sq->dsq+i+d1n+1, L2); if (ndomains == 2) { memcpy(sq->dsq+j+d1n+1, domain2->dsq+1, sizeof(ESL_DSQ) * d2n); fprintf(cfg->possummfp, " %-24s %5d %5d", domain2->name, 1, d2n); set_random_segment(go, cfg, cfg->possummfp, sq->dsq+j+d1n+d2n+1, L3); } fprintf(cfg->possummfp, "\n"); cfg->test_lens[cfg->ntest].L = L; cfg->test_lens[cfg->ntest].L1 = L1; cfg->test_lens[cfg->ntest].d1n = d1n; cfg->test_lens[cfg->ntest].L2 = L2; cfg->test_lens[cfg->ntest].d2n = d2n; cfg->test_lens[cfg->ntest].L3 = L3; cfg->ntest++; ntest++; esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE); esl_sq_Destroy(domain1); if (ndomains == 2) esl_sq_Destroy(domain2); esl_sq_Destroy(sq); } *ret_ntest = ntest; return eslOK; ERROR: esl_fatal("Failure in synthesize_positives"); return status; }