/* Fetch in a random sequence of length <L> from the the pre-digitized * concatenated sequence database, select a random subseq, shuffle it * by the chosen algorithm; set dsq[1..L] to the resulting randomized * segment. * * If <logfp> is non-NULL, append one or more "<sqname> <from> <to>" * fields to current line, to record where the random segment was * selected from. This is useful in cases where we want to track back * the origin of a high-scoring segment, in case the randomization * wasn't good enough to obscure the identity of a segment. * */ static int set_random_segment(ESL_GETOPTS *go, struct cfg_s *cfg, FILE *logfp, ESL_DSQ *dsq, int L) { ESL_SQ *sq = esl_sq_CreateDigital(cfg->abc); int minDPL = esl_opt_GetInteger(go, "--minDPL"); int db_dependent = (esl_opt_GetBoolean(go, "--iid") == TRUE ? FALSE : TRUE); char *pkey = NULL; int start, end; int64_t Lseq; int status; if (L==0) return eslOK; if (L > cfg->db_maxL) esl_fatal("can't fetch a segment of length %d; database max is %d\n", L, cfg->db_maxL); /* fetch a random subseq from the source database */ esl_sq_GrowTo(sq, L); if (db_dependent) { do { if (pkey != NULL) free(pkey); if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &Lseq, &pkey) != eslOK) esl_fatal("failed to look up a random seq"); } while (Lseq < L); start = 1 + esl_rnd_Roll(cfg->r, Lseq-L); end = start + L - 1; if (esl_sqio_FetchSubseq(cfg->dbfp, pkey, start, end, sq) != eslOK) esl_fatal("failed to fetch subseq"); esl_sq_ConvertDegen2X(sq); } /* log sequence source info: <name> <start> <end> */ if (logfp != NULL && db_dependent) fprintf(logfp, " %-15s %5d %5d", pkey, start, end); /* Now apply the appropriate randomization algorithm */ if (esl_opt_GetBoolean(go, "--mono")) status = esl_rsq_XShuffle (cfg->r, sq->dsq, L, sq->dsq); else if (esl_opt_GetBoolean(go, "--di")) { if (L < minDPL) status = esl_rsq_XShuffle (cfg->r, sq->dsq, L, sq->dsq); else status = esl_rsq_XShuffleDP(cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq); } else if (esl_opt_GetBoolean(go, "--markov0")) status = esl_rsq_XMarkov0 (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq); else if (esl_opt_GetBoolean(go, "--markov1")) status = esl_rsq_XMarkov1 (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq); else if (esl_opt_GetBoolean(go, "--reverse")) status = esl_rsq_XReverse (sq->dsq, L, sq->dsq); else if (esl_opt_GetBoolean(go, "--iid")) status = esl_rsq_xIID (cfg->r, cfg->fq, cfg->abc->K, L, sq->dsq); else status = eslEINCONCEIVABLE; if (status != eslOK) esl_fatal("esl's shuffling failed"); memcpy(dsq, sq->dsq+1, sizeof(ESL_DSQ) * L); esl_sq_Destroy(sq); free(pkey); return eslOK; }
/* seq_generation() * * Generating sequences. */ static int seq_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt) { ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; double *fq = NULL; int alphatype = eslUNKNOWN; // static checkers can't see that 1 of --rna, --dna, --amino must be true int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); int i; int status; if (L <= 0) esl_fatal("To generate sequences, set -L option (length of generated seqs) > 0 "); if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA; if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA; if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO; abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); esl_sq_GrowTo(sq, L); /* Pick the iid frequency distribution to use */ ESL_ALLOC(fq, sizeof(double) * abc->K); switch (alphatype) { case eslRNA: case eslDNA: esl_vec_DSet(fq, 4, 0.25); break; case eslAMINO: esl_composition_SW34(fq); break; default: esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K); break; } /* generate */ for (i = 0; i < N; i++) { esl_rsq_xIID(r, fq, abc->K, L, sq->dsq); if (N > 1) esl_sq_FormatName(sq, "random%d", i); else esl_sq_SetName(sq, "random"); sq->n = L; esl_sqio_Write(ofp, sq, outfmt, FALSE); } free(fq); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); return eslOK; ERROR: if (fq != NULL) free(fq); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); return status; }
int main(int argc, char **argv) { int status; ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded(); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); ESL_SCOREMATRIX *S = NULL; ESL_DSQ *x = NULL; /* iid query */ ESL_DSQ *y = NULL; /* iid target */ double lambda; double bg[20]; /* iid background probabilities */ int L; /* query length */ int M; /* target length */ int nseq; /* number of target seqs to simulate */ int i; int gop; int gex; char *mxfile = "PMX"; int raw_sc; /* Configuration */ L = 400; /* query length */ M = 400; /* target length */ nseq = 50000; gop = -11; gex = -1; lambda = 0.3207; ESL_ALLOC(x, sizeof(ESL_DSQ) * (L+2)); ESL_ALLOC(y, sizeof(ESL_DSQ) * (M+2)); /* Input an amino acid score matrix from a file. */ if (mxfile != NULL) { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_sco_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } else { /* default = BLOSUM62 */ S = esl_scorematrix_Create(abc); esl_scorematrix_SetBLOSUM62(S); } esl_composition_BL62(bg); esl_rsq_xIID(r, bg, 20, L, x); for (i = 0; i < nseq; i++) { esl_rsq_xIID(r, bg, 20, M, y); esl_swat_Score(x, L, y, M, S, gop, gex, &raw_sc); printf("%d\n", raw_sc); } free(x); free(y); esl_scorematrix_Destroy(S); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); exit(0); ERROR: exit(status); }