/* onefetch_subseq()
 * Fetch one subsequence of <key> from the SSI-indexed file <sqfp> and write
 * it to <ofp> in FASTA format.
 *
 * <given_start>,<given_end> are the coordinates as the caller gave them:
 *   - given_end == 0 is passed through unchanged as the end coordinate
 *     (e.g. "-c 52" arrives here as 52,0); the output name then uses sq->L
 *     in its place.
 *   - given_start > given_end (with given_end != 0) means "fetch
 *     given_end..given_start and reverse complement it".
 *
 * The fetched sequence is named <newname> if that is non-NULL, otherwise
 * "<key>/<given_start>-<end>". The -r option in <go> requests a reverse
 * complement as well; if both apply they cancel each other out.
 *
 * Returns nothing; every failure is fatal.
 */
static void
onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end)
{
  int64_t  start, end;          /* wide enough to hold any uint32_t coordinate */
  int      do_revcomp;
  ESL_SQ  *sq = esl_sq_Create();

  if (sq == NULL)                   esl_fatal("failed to allocate sequence");
  if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index");

  /* reverse complement indicated by coords. */
  /* -c 52: would be 52,0, so watch out for given_end = 0 case */
  if (given_end != 0 && given_start > given_end)
    { start = given_end;   end = given_start; do_revcomp = TRUE;  }
  else
    { start = given_start; end = given_end;   do_revcomp = FALSE; }

  /* The error buffer is data, not a format: never hand it to a printf-style
   * function as the format string.
   */
  if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK)
    esl_fatal("%s", esl_sqfile_GetErrorBuf(sqfp));

  if (newname != NULL)
    esl_sq_SetName(sq, newname);
  else
    {
      /* Convert explicitly so the arguments match the conversion specifiers
       * whatever the width of sq->L is.
       */
      long long name_start = (long long) given_start;
      long long name_end   = (given_end == 0) ? (long long) sq->L : (long long) given_end;

      esl_sq_FormatName(sq, "%s/%lld-%lld", key, name_start, name_end);
    }

  /* Two ways we might have been asked to revcomp: by coord, or by -r option */
  /* (If both happen, they'll cancel each other out) */
  if (do_revcomp)
    if (esl_sq_ReverseComplement(sq) != eslOK)
      esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);

  if (esl_opt_GetBoolean(go, "-r"))
    if (esl_sq_ReverseComplement(sq) != eslOK)
      esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);

  if (esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE) != eslOK)
    esl_fatal("failed to write sequence");

  esl_sq_Destroy(sq);
}
/* main()
 * Example driver: read the first HMM from the file named by the first
 * command line argument, configure a local-mode profile of length <-L>,
 * then emit <-N> sequences from it with p7_ProfileEmit().
 *
 * Each sequence is named "<hmm name>-sample<i>" (i = 0..N-1) and written to
 * stdout in FASTA format; the trace produced with it is validated against
 * the sequence. The random number generator is seeded from <-s>.
 *
 * Returns 0; all failures are fatal.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  ESL_RANDOMNESS *rng     = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
  char           *hmmfile = esl_opt_GetArg(go, 1);
  int             L       = esl_opt_GetInteger(go, "-L");
  int             N       = esl_opt_GetInteger(go, "-N");
  ESL_ALPHABET   *abc     = NULL;
  P7_HMMFILE     *hfp     = NULL;
  P7_HMM         *hmm     = NULL;
  P7_BG          *bg      = NULL;
  P7_PROFILE     *gm      = NULL;
  P7_TRACE       *tr      = p7_trace_Create();
  ESL_SQ         *sq      = NULL;
  char            errbuf[eslERRBUFSIZE];
  int             i;
  int             status;

  /* Open the HMM file */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Read one HMM from it; <abc> is passed in as NULL and is used afterwards */
  status = p7_hmmfile_Read(hfp, &abc, &hmm);
  if      (status == eslEFORMAT)   p7_Fail("Bad file format in HMM file %s:\n%s\n",          hfp->fname, hfp->errbuf);
  else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type));
  else if (status == eslEOF)       p7_Fail("Empty HMM file %s? No HMM data found.\n",        hfp->fname);
  else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s\n",     hfp->fname);

  p7_hmmfile_Close(hfp);

  /* Null model and profile, both set up for target length L */
  bg = p7_bg_Create(abc);
  p7_bg_SetLength(bg, L);
  gm = p7_profile_Create(hmm->M, abc);
  p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL);

  sq = esl_sq_CreateDigital(abc);

  for (i = 0; i < N; i++)
    {
      p7_ProfileEmit(rng, hmm, gm, bg, sq, tr);
      esl_sq_FormatName(sq, "%s-sample%d", hmm->name, i);
      esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE);

      /* <errbuf> is a message, not a format string: print it through "%s"
       * so that any '%' it contains is not interpreted.
       */
      if (p7_trace_Validate(tr, abc, sq->dsq, errbuf) != eslOK) esl_fatal("%s", errbuf);

      esl_sq_Reuse(sq);
      p7_trace_Reuse(tr);
    }

  esl_sq_Destroy(sq);
  p7_trace_Destroy(tr);
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(hmm);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(rng);
  esl_getopts_Destroy(go);
  return 0;
}
/* onefetch():
 * Given one <key> (a seq name or accession), retrieve the corresponding
 * sequence from <sqfp> and write it to <ofp>.
 *
 * With an SSI index, the file is positioned at the record by key and that
 * one record is read. Without an SSI index, the file is read sequentially
 * until a record whose name or accession equals <key> is found.
 *
 * If the sequence is not being changed (no -r, no -n) and the file is not an
 * alignment format, the record is echoed with esl_sqio_Echo(); otherwise the
 * parsed sequence is optionally reverse complemented (-r), optionally
 * renamed (-n), and written in FASTA format.
 *
 * Returns nothing; every failure is fatal.
 */
static void
onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp)
{
  ESL_SQ  *sq         = esl_sq_Create();
  int      do_revcomp = esl_opt_GetBoolean(go, "-r");
  char    *newname    = esl_opt_GetString(go, "-n");
  int      status;

  if (sqfp->data.ascii.ssi != NULL)
    {
      /* Try to position the file at the desired sequence with SSI. */
      status = esl_sqfile_PositionByKey(sqfp, key);
      if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", key, sqfp->filename);
      else if (status == eslEFORMAT)   esl_fatal("Failed to parse SSI index for %s\n", sqfp->filename);
      else if (status != eslOK)        esl_fatal("Failed to look up location of seq %s in SSI index of file %s\n", key, sqfp->filename);

      status = esl_sqio_Read(sqfp, sq);
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      /* This message has a single %s: pass the filename only. Passing
       * <status> first, as before, fed an int to %s.
       */
      else if (status == eslEOF)     esl_fatal("Unexpected EOF reading sequence file %s", sqfp->filename);
      else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      /* Sanity check that the index took us to the record we asked for */
      if (strcmp(key, sq->name) != 0 && strcmp(key, sq->acc) != 0)
        esl_fatal("whoa, internal error; found the wrong sequence %s, not %s", sq->name, key);
    }
  else
    {
      /* Else, we have to read the whole damn file sequentially until we find the seq */
      while ((status = esl_sqio_Read(sqfp, sq)) != eslEOF)
        {
          if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

          if (strcmp(key, sq->name) == 0 || strcmp(key, sq->acc) == 0) break;
          esl_sq_Reuse(sq);
        }
      if (status == eslEOF) esl_fatal("Failed to find sequence %s in file %s\n", key, sqfp->filename);
    }

  if (do_revcomp == FALSE && newname == NULL && ! esl_sqio_IsAlignment(sqfp->format))
    {
      /* If we're not manipulating the sequence in any way, and it's not from an alignment file, we can Echo() it. */
      if (esl_sqio_Echo(sqfp, sq, ofp) != eslOK) esl_fatal("Echo failed: %s\n", esl_sqfile_GetErrorBuf(sqfp));
    }
  else
    {
      /* Otherwise we Write() the parsed version. */
      if (do_revcomp && esl_sq_ReverseComplement(sq) != eslOK)
        esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
      if (newname != NULL) esl_sq_SetName(sq, newname);
      esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
    }

  esl_sq_Destroy(sq);
}
/* emit_sequences()
 * Sample <-N> sequences from <hmm> and write each one to <ofp> in format
 * <outfmt>.
 *
 * With -p, sequences are emitted by p7_ProfileEmit() from a profile
 * configured for length <-L> in the mode selected by --local, --unilocal,
 * --glocal or --uniglocal (p7_LOCAL when none is set); without -p,
 * p7_CoreEmit() is used on the HMM. Sequences are named
 * "<hmm name>-sample<n>" for n = 1..N.
 *
 * Returns nothing; every failure is fatal.
 */
static void
emit_sequences(ESL_GETOPTS *go, FILE *ofp, int outfmt, ESL_RANDOMNESS *r, P7_HMM *hmm)
{
  ESL_SQ     *seq         = NULL;
  P7_TRACE   *trace       = NULL;
  P7_BG      *null_model  = NULL;
  P7_PROFILE *profile     = NULL;
  int         use_profile = esl_opt_GetBoolean(go, "-p");
  int         nwanted     = esl_opt_GetInteger(go, "-N");
  int         target_len  = esl_opt_GetInteger(go, "-L");
  int         mode        = p7_LOCAL;
  int         n;
  int         rc;

  /* --local needs no assignment: p7_LOCAL is already the default. */
  if (! esl_opt_GetBoolean(go, "--local"))
    {
      if      (esl_opt_GetBoolean(go, "--unilocal"))  mode = p7_UNILOCAL;
      else if (esl_opt_GetBoolean(go, "--glocal"))    mode = p7_GLOCAL;
      else if (esl_opt_GetBoolean(go, "--uniglocal")) mode = p7_UNIGLOCAL;
    }

  /* Allocate the working objects */
  seq = esl_sq_CreateDigital(hmm->abc);
  if (seq == NULL)        esl_fatal("failed to allocate sequence");

  trace = p7_trace_Create();
  if (trace == NULL)      esl_fatal("failed to allocate trace");

  null_model = p7_bg_Create(hmm->abc);
  if (null_model == NULL) esl_fatal("failed to create null model");

  profile = p7_profile_Create(hmm->M, hmm->abc);
  if (profile == NULL)    esl_fatal("failed to create profile");

  /* Configure and sanity check the models */
  rc = p7_ProfileConfig(hmm, null_model, profile, target_len, mode);
  if (rc != eslOK) esl_fatal("failed to configure profile");

  rc = p7_bg_SetLength(null_model, target_len);
  if (rc != eslOK) esl_fatal("failed to reconfig null model length");

  rc = p7_hmm_Validate(hmm, NULL, 0.0001);
  if (rc != eslOK) esl_fatal("whoops, HMM is bad!");

  rc = p7_profile_Validate(profile, NULL, 0.0001);
  if (rc != eslOK) esl_fatal("whoops, profile is bad!");

  /* Emit, name, and write the sequences one at a time */
  n = 1;
  while (n <= nwanted)
    {
      rc = use_profile ? p7_ProfileEmit(r, hmm, profile, null_model, seq, trace)
                       : p7_CoreEmit   (r, hmm, seq, trace);
      if (rc != 0) esl_fatal("Failed to emit sequence\n");

      rc = esl_sq_FormatName(seq, "%s-sample%d", hmm->name, n);
      if (rc != 0) esl_fatal("Failed to set sequence name\n");

      rc = esl_sqio_Write(ofp, seq, outfmt, FALSE);
      if (rc != eslOK) esl_fatal("Failed to write sequence\n");

      p7_trace_Reuse(trace);
      esl_sq_Reuse(seq);
      n++;
    }

  esl_sq_Destroy(seq);
  p7_trace_Destroy(trace);
  p7_bg_Destroy(null_model);
  p7_profile_Destroy(profile);
}
/* seq_generation()
 *
 * Generate <-N> random sequences of length <-L> and write them to <ofp> in
 * format <outfmt>.
 *
 * The alphabet is chosen by --rna, --dna or --amino. Residues are drawn by
 * esl_rsq_xIID() from a frequency vector that is 0.25 for the first four
 * symbols for RNA and DNA, filled by esl_composition_SW34() for amino, and
 * uniform over abc->K otherwise. A single sequence is named "random";
 * several are named "random<i>", i = 0..N-1.
 *
 * Returns eslOK on success, or the status set by ESL_ALLOC() if the
 * frequency vector cannot be allocated. L <= 0 is fatal.
 */
static int
seq_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  ESL_ALPHABET *abc       = NULL;
  ESL_SQ       *sq        = NULL;
  double       *fq        = NULL;
  int           alphatype = eslUNKNOWN; /* static checkers can't see that 1 of --rna, --dna, --amino must be true */
  int           N         = esl_opt_GetInteger(go, "-N");
  int           L         = esl_opt_GetInteger(go, "-L");
  int           nth;
  int           status;

  if (L <= 0)
    esl_fatal("To generate sequences, set -L option (length of generated seqs) > 0 ");

  /* Later options override earlier ones if more than one is set. */
  if (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;

  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sq_GrowTo(sq, L);

  /* Pick the iid frequency distribution to use */
  ESL_ALLOC(fq, sizeof(double) * abc->K);
  if (alphatype == eslRNA || alphatype == eslDNA)
    esl_vec_DSet(fq, 4, 0.25);
  else if (alphatype == eslAMINO)
    esl_composition_SW34(fq);
  else
    esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K);

  /* generate */
  for (nth = 0; nth < N; nth++)
    {
      esl_rsq_xIID(r, fq, abc->K, L, sq->dsq);

      if (N > 1) esl_sq_FormatName(sq, "random%d", nth);
      else       esl_sq_SetName(sq, "random");

      sq->n = L;
      esl_sqio_Write(ofp, sq, outfmt, FALSE);
    }

  /* Success falls through into the shared cleanup below. */
  status = eslOK;

 ERROR:
  free(fq);                     /* free(NULL) is a no-op */
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return status;
}
/* emit_consensus()
 * Build the simple consensus sequence of <hmm> with
 * p7_emit_SimpleConsensus(), name it "<hmm name>-consensus", and write it
 * to <ofp> in format <outfmt>.
 *
 * <go> is not used here. Returns nothing; every failure is fatal.
 */
static void
emit_consensus(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
{
  ESL_SQ *cons = esl_sq_CreateDigital(hmm->abc);

  if (cons == NULL)
    esl_fatal("failed to allocate sequence");

  if (p7_emit_SimpleConsensus(hmm, cons) != eslOK)
    esl_fatal("failed to create simple consensus seq");

  if (esl_sq_FormatName(cons, "%s-consensus", hmm->name) != eslOK)
    esl_fatal("failed to set sequence name");

  if (esl_sqio_Write(ofp, cons, outfmt, FALSE) != eslOK)
    esl_fatal("failed to write sequence");

  esl_sq_Destroy(cons);
}
/* emit_fancycons()
 * Build a consensus sequence of <hmm> with p7_emit_FancyConsensus(), using
 * the --minl and --minu option values as its two thresholds, name it
 * "<hmm name>-consensus", and write it to <ofp> in format <outfmt>.
 *
 * The sequence object is created with esl_sq_Create(), not the digital
 * constructor used by emit_consensus().
 *
 * Returns nothing; every failure is fatal.
 */
static void
emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
{
  float   min_lower = esl_opt_GetReal(go, "--minl");
  float   min_upper = esl_opt_GetReal(go, "--minu");
  ESL_SQ *cons      = esl_sq_Create();

  if (cons == NULL)
    esl_fatal("failed to allocate sequence");

  if (p7_emit_FancyConsensus(hmm, min_lower, min_upper, cons) != eslOK)
    esl_fatal("failed to create consensus seq");

  if (esl_sq_FormatName(cons, "%s-consensus", hmm->name) != eslOK)
    esl_fatal("failed to set sequence name");

  if (esl_sqio_Write(ofp, cons, outfmt, FALSE) != eslOK)
    esl_fatal("failed to write sequence");

  esl_sq_Destroy(cons);
}
/* synthesize_negatives()
 * Create <nneg> decoy sequences and write them to cfg->out_seqfp in FASTA
 * format.
 *
 * Each decoy copies the segment layout (L1/d1n/L2/d2n/L3 and total length L)
 * of a randomly chosen entry of cfg->test_lens, and each of the five
 * segments is filled by set_random_segment(). One summary line per decoy is
 * written to cfg->negsummfp; set_random_segment() is handed the same stream.
 *
 * Decoys are named "decoy<i>" for i = 1..nneg. Returns eslOK.
 */
static int
synthesize_negatives(ESL_GETOPTS *go, struct cfg_s *cfg, int nneg)
{
  ESL_SQ *decoy = esl_sq_CreateDigital(cfg->abc);
  int     pick;                 /* index of the test seq whose layout we copy */
  int     n;
  int     L1, L2, L3, d1n, d2n;
  int     off;                  /* offset of the next segment within decoy->dsq */

  for (n = 0; n < nneg; n++)
    {
      /* Select a random test seq, to use its same segments */
      pick = esl_rnd_Roll(cfg->r, cfg->ntest);
      L1   = cfg->test_lens[pick].L1;
      L2   = cfg->test_lens[pick].L2;
      L3   = cfg->test_lens[pick].L3;
      d1n  = cfg->test_lens[pick].d1n;
      d2n  = cfg->test_lens[pick].d2n;

      esl_sq_GrowTo(decoy, cfg->test_lens[pick].L);

      esl_sq_FormatName(decoy, "decoy%d", n+1);
      esl_sq_FormatDesc(decoy, "L=%d in segments: %d/%d/%d/%d/%d", cfg->test_lens[pick].L, L1, d1n, L2, d2n, L3);
      decoy->n = cfg->test_lens[pick].L;

      fprintf(cfg->negsummfp, "%-15s %5d %5d %5d %5d %5d %5d",
              decoy->name, (int) decoy->n,
              L1, d1n, L2, d2n, L3);

      /* Sentinels go at positions 0 and L+1; residues occupy 1..L */
      decoy->dsq[0] = decoy->dsq[cfg->test_lens[pick].L+1] = eslDSQ_SENTINEL;

      /* Fill the five segments back to back, starting at position 1 */
      off = 1;
      set_random_segment(go, cfg, cfg->negsummfp, decoy->dsq + off, L1);   off += L1;
      set_random_segment(go, cfg, cfg->negsummfp, decoy->dsq + off, d1n);  off += d1n;
      set_random_segment(go, cfg, cfg->negsummfp, decoy->dsq + off, L2);   off += L2;
      set_random_segment(go, cfg, cfg->negsummfp, decoy->dsq + off, d2n);  off += d2n;
      set_random_segment(go, cfg, cfg->negsummfp, decoy->dsq + off, L3);

      fprintf(cfg->negsummfp, "\n");

      esl_sqio_Write(cfg->out_seqfp, decoy, eslSQFILE_FASTA, FALSE);
      esl_sq_Reuse(decoy);
    }

  esl_sq_Destroy(decoy);
  return eslOK;
}
int main(int argc, char **argv) { ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = NULL; ESL_SQ *dsq = NULL; ESL_SQ **prot; int c; ESL_ALPHABET *abc, *prot_abc; ESL_SQ *prot6[6]; int x; abc = esl_alphabet_Create(eslDNA); prot_abc = esl_alphabet_Create(eslAMINO); if(argc != 2) { printf("You need to pass an argument for a filepath to a dna/rna fasta file\n"); exit(0); } if(eslOK != esl_sqfile_Open(argv[1], eslSQFILE_FASTA, NULL, &sqfp)) { printf("Invalid filepath: %s\n", argv[1]); exit(0); } sq = esl_sq_Create(); if(sq == NULL) { printf("could not allocate new sequence\n"); exit(0); } if(esl_sqio_Read(sqfp, sq) != eslOK) { printf("Not a valid fasta file %s\n", argv[1]); exit(0); } dsq = esl_sq_Create(); if(dsq == NULL) { printf("could not allocate digital sequence\n"); exit(0); } if(esl_sq_Copy(sq, dsq) != eslOK) { printf("could not copy sequence\n"); exit(0); } if(esl_sq_Digitize(abc, dsq) != eslOK) { printf("could not digitize sequence\n"); exit(0); } esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, 0); if(esl_trans_6frame(sq, prot6) != eslOK) { printf("could not generate six frame translation\n"); exit(0); } for(x = 0; x < 6; x++) { esl_sqio_Write(stdout, prot6[x], eslSQFILE_FASTA, 0); } if(esl_trans_orf(dsq, &prot, &c, 10) != eslOK) { printf("could not translate open reading frames\n"); exit(0); } for(x = 0; x < c; x++) { esl_sqio_Write(stdout, prot[x], eslSQFILE_FASTA, 0); } return 0; }
/* main()
 * Mask regions of sequences.
 *
 * Usage: <seqfile> <maskfile>. Each data line of <maskfile> has the fields
 * <seqname> <start> <end> (1-based coordinates; '#' starts a comment). For
 * each line, the named sequence is obtained from <seqfile> - by SSI lookup
 * with -R, otherwise by reading the next sequence and checking that its name
 * or accession matches - masked, and written in FASTA format to the -o file
 * or stdout.
 *
 * Options used here:
 *   -l  mask by lowercasing (the sequence is uppercased first) instead of
 *       replacing with the mask character
 *   -m  mask character (default 'X')
 *   -r  reverse masking: mask everything outside start..end
 *   -x  extend each masked region by this many residues
 *   --informat  assert the input file format
 *
 * Returns 0; all failures are fatal.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = NULL;               /* application configuration       */
  char           *seqfile  = NULL;               /* sequence file name              */
  char           *maskfile = NULL;               /* mask coordinate file name       */
  int             infmt    = eslSQFILE_UNKNOWN;  /* format code for seqfile         */
  int             outfmt   = eslSQFILE_FASTA;    /* format code for output seqs     */
  ESL_SQFILE     *sqfp     = NULL;               /* open sequence file              */
  ESL_FILEPARSER *maskefp  = NULL;               /* open mask coord file            */
  FILE           *ofp      = NULL;               /* output stream for masked seqs   */
  char           *source   = NULL;               /* name of current seq to mask     */
  char           *p1, *p2;                       /* pointers used in parsing        */
  int64_t         start, end;                    /* start, end coord for masking    */
  int64_t         i, j, pos;                     /* coords in a sequence            */
  int64_t         overmask;                      /* # of extra residues to mask     */
  ESL_SQ         *sq       = esl_sq_Create();    /* current sequence                */
  int             do_fetching;
  int             do_lowercase;
  int             maskchar;
  int             status;                        /* easel return code               */

  /****************************************************************************
   * Parse command line
   ****************************************************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  do_fetching  = esl_opt_GetBoolean(go, "-R");
  do_lowercase = esl_opt_GetBoolean(go, "-l");
  overmask     = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0);
  maskchar     = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m")    : 'X');

  seqfile  = esl_opt_GetArg(go, 1);
  maskfile = esl_opt_GetArg(go, 2);

  /* Open the <seqfile>: text mode, not digital */
  if (esl_opt_GetString(go, "--informat") != NULL)
    {
      infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
      /* The message has a %s: it must be given the offending format name. */
      if (infmt == eslSQFILE_UNKNOWN)
        cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
    }
  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n",     seqfile);
  else if (status == eslEFORMAT)   cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile);
  else if (status == eslEINVAL)    cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n");
  else if (status != eslOK)        cmdline_failure(argv[0], "Open failed, code %d.\n", status);

  if (do_fetching)
    {
      status = esl_sqfile_OpenSSI(sqfp, NULL);
      if      (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n");
      else if (status == eslERANGE)  cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
      else if (status != eslOK)      cmdline_failure(argv[0], "Failed to open SSI index\n");
    }

  /* Open the <maskfile> */
  if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK)
    cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile);
  esl_fileparser_SetCommentChar(maskefp, '#');

  /* Open the output file, if any */
  if (esl_opt_GetString(go, "-o") != NULL)
    {
      if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
        cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o"));
    }
  else ofp = stdout;

  /****************************************************************************
   * Main loop over lines in <maskfile>
   ****************************************************************************/

  /* Read one data line at a time from the <maskfile>;
   * parse into data fields <seqname> <start> <end>
   */
  while (esl_fileparser_NextLine(maskefp) == eslOK)
    {
      /* First field is sequence name */
      if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK)
        esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile);

      /* Get the sequence */
      if (do_fetching)
        {  /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */
          status = esl_sqio_Fetch(sqfp, source, sq);
          if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename);
          else if (status == eslEINVAL)    esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename);
          else if (status == eslEFORMAT)   esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)        esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename);
        }
      else
        { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */
          status = esl_sqio_Read(sqfp, sq);
          if      (status == eslEOF)     esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source);
          else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)      esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename);

          if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0))
            esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R");
        }

      /* If we're masking by lowercase, first make sure everything's uppercase.
       * <ctype.h> functions take an unsigned char value (or EOF), hence the casts.
       */
      if (do_lowercase)
        for (pos = 0; pos < sq->n; pos++)
          if (isalpha((unsigned char) sq->seq[pos]))
            sq->seq[pos] = toupper((unsigned char) sq->seq[pos]);

      /* Next two fields are <start>, <end> for the masking; convert to 0-based */
      /* possible future extension: wrap loop around this, enable multiple masked regions */
      if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK)
        esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      start = strtoll(p1, &p2, 0) - 1;

      if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK)
        esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      end = strtoll(p2, &p1, 0) - 1;

      /* Do the masking */
      if (esl_opt_GetBoolean(go, "-r"))	/* Reverse masking */
        { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */
          i = 0;
          j = ESL_MIN(sq->n-1, start - 1 + overmask);
          for (pos = i; pos <= j; pos++)
            if (isalpha((unsigned char) sq->seq[pos]))
              sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);

          i = ESL_MAX(0, end + 1 - overmask);
          j = sq->n-1;
          for (pos = i; pos <= j; pos++)
            if (isalpha((unsigned char) sq->seq[pos]))
              sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);
        }
      else
        {  /* normal: mask start..end */
          i = ESL_MAX(0,       start - overmask);
          j = ESL_MIN(sq->n-1, end   + overmask);
          for (pos = i; pos <= j; pos++)
            if (isalpha((unsigned char) sq->seq[pos]))
              sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);
        }

      esl_sqio_Write(ofp, sq, outfmt, FALSE);
      esl_sq_Reuse(sq);
    }

  esl_sq_Destroy(sq);
  esl_fileparser_Close(maskefp);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  if (ofp != stdout) fclose(ofp);
  return 0;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; char *ghmmfile = esl_opt_GetArg(go, 1); /* HMMs parameterized for sequence generation */ char *ahmmfile = esl_opt_GetArg(go, 2); /* HMMs parameterized for alignment */ int N = esl_opt_GetInteger(go, "-N"); P7_HMMFILE *ghfp = NULL; P7_HMMFILE *ahfp = NULL; P7_HMM *ghmm = NULL; P7_HMM *ahmm = NULL; P7_PROFILE *ggm = NULL; P7_PROFILE *agm = NULL; P7_OPROFILE *aom = NULL; P7_BG *bg = NULL; ESL_SQ *sq = NULL; P7_TRACE *reftr = p7_trace_Create(); P7_TRACE *testtr = p7_trace_Create(); P7_TRACE_METRICS *tmetrics = p7_trace_metrics_Create(); P7_REFMX *rmx = p7_refmx_Create(100,100); // P7_FILTERMX *ox = NULL; P7_HARDWARE *hw; if ((hw = p7_hardware_Create ()) == NULL) p7_Fail("Couldn't get HW information data structure"); P7_SPARSEMASK *sm = p7_sparsemask_Create(100, 100, hw->simd); P7_SPARSEMX *sxv = p7_sparsemx_Create(NULL); int idx; char errbuf[eslERRBUFSIZE]; int status; p7_Init(); /* open HMM file containing models parameterized for generation (sampling) of seqs */ status = p7_hmmfile_OpenE(ghmmfile, NULL, &ghfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", ghmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ghmmfile, errbuf); /* open HMM file containing models parameterized for alignment (may be the same as ghmmfile) */ status = p7_hmmfile_OpenE(ahmmfile, NULL, &ahfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", ahmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", 
ahmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, ahmmfile, errbuf); while ( (status = p7_hmmfile_Read(ghfp, &abc, &ghmm)) == eslOK) /* <abc> gets set on first read */ { /* read the counterpart HMM from <ahfp> */ status = p7_hmmfile_Read(ahfp, &abc, &ahmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ahfp->fname, ahfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ahfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", ahfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", ahfp->fname); /* try to validate that they're the "same" */ if (ahmm->M != ghmm->M || strcmp(ahmm->name, ghmm->name) != 0) p7_Fail("<gen-hmmfile>, <ali-hmmfile> contain different set or order of models"); /* deferred one-time creation of structures that need to know the alphabet */ if (!bg) bg = p7_bg_Create(abc); if (!sq) sq = esl_sq_CreateDigital(abc); ggm = p7_profile_Create(ghmm->M, abc); agm = p7_profile_Create(ahmm->M, abc); aom = p7_oprofile_Create(ahmm->M, abc, hw->simd); p7_profile_ConfigCustom(ggm, ghmm, bg, esl_opt_GetInteger(go, "--gL"), esl_opt_GetReal(go, "--gnj"), esl_opt_GetReal(go, "--gpglocal")); p7_profile_ConfigCustom(agm, ahmm, bg, 100, esl_opt_GetReal(go, "--anj"), esl_opt_GetReal(go, "--apglocal")); p7_oprofile_Convert(agm, aom); for (idx = 1; idx <= N; idx++) { p7_ProfileEmit(rng, ghmm, ggm, bg, sq, reftr); if (esl_opt_GetBoolean(go, "--dumpseqs")) { esl_sq_FormatName(sq, "seq%d", idx); esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, FALSE); } p7_bg_SetLength(bg, sq->n); p7_profile_SetLength(agm, sq->n); p7_sparsemask_Reinit(sm, agm->M, sq->n); p7_sparsemask_AddAll(sm); if (esl_opt_GetBoolean(go, "--vit")) p7_ReferenceViterbi(sq->dsq, sq->n, agm, rmx, testtr, /*opt_vsc=*/NULL); else p7_SparseViterbi (sq->dsq, sq->n, agm, sm, 
sxv, testtr, /*opt_vsc=*/NULL); p7_trace_metrics(reftr, testtr, tmetrics); p7_sparsemask_Reuse(sm); p7_sparsemx_Reuse(sxv); //p7_filtermx_Reuse(ox); p7_refmx_Reuse(rmx); esl_sq_Reuse(sq); p7_trace_Reuse(reftr); p7_trace_Reuse(testtr); } p7_oprofile_Destroy(aom); p7_profile_Destroy(ggm); p7_profile_Destroy(agm); p7_hmm_Destroy(ghmm); p7_hmm_Destroy(ahmm); } /* we leave the loop with <status> set by a p7_hmmfile_Read() on ghfp; if all is well, status=eslEOF */ if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", ghfp->fname, ghfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", ghfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslEOF) p7_Fail("Unexpected error in reading HMMs from %s\n", ghfp->fname); p7_trace_metrics_Dump(stdout, tmetrics); p7_hmmfile_Close(ghfp); p7_hmmfile_Close(ahfp); // p7_filtermx_Destroy(ox); p7_sparsemask_Destroy(sm); p7_sparsemx_Destroy(sxv); p7_refmx_Destroy(rmx); p7_trace_metrics_Destroy(tmetrics); p7_trace_Destroy(testtr); p7_trace_Destroy(reftr); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); }
/* seq_shuffling()
 * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg]
 *
 * Shuffling of input sequences: reads each sequence of the file named by the
 * first command line argument and writes <-N> shuffled versions of it to
 * <ofp> in format <outfmt>. The kind of shuffling is chosen by the first of
 * -m, -d, -k, -0, -1, -r, -w that is set. Outputs are named
 * "<name>-shuffled" (N == 1) or "<name>-shuffled-<i>" (i = 0..N-1).
 *
 * Fixed-length (L>0) vs. full-length (L=0) modes handled differently.
 * In fixed-length mode:
 *   <shuff->seq> only needs to be allocated once, for L
 *   <targ> is an allocated copy of a random subseq of length L
 *   sequences < L residues long can't be shuffled
 * In full-length mode:
 *   <shuff->seq> is grown to length <sq->n> for each input seq
 *   <targ> just points to <sq->seq>
 *
 * Returns eslOK on success, or the status set by ESL_ALLOC() if <targ>
 * cannot be allocated. File open and parse failures are fatal.
 */
static int
seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  char       *seqfile = esl_opt_GetArg(go, 1);
  int         infmt   = eslSQFILE_UNKNOWN;
  ESL_SQFILE *sqfp    = NULL;
  ESL_SQ     *sq      = esl_sq_Create();
  ESL_SQ     *shuff   = esl_sq_Create();
  char       *targ    = NULL;
  int         N       = esl_opt_GetInteger(go, "-N");
  int         L       = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */
  int         kmers   = 0;
  int         i;
  int         status;

  if (esl_opt_GetString(go, "--informat") != NULL)
    {
      infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
      /* The message has a %s: it must be given the offending format name. */
      if (infmt == eslSQFILE_UNKNOWN)
        esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
    }

  if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k");

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  if (L > 0)
    {
      esl_sq_GrowTo(shuff, L);
      shuff->n = L;
      ESL_ALLOC(targ, sizeof(char) * (L+1));
    }

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      if (L == 0)
        {                               /* shuffling entire sequence   */
          esl_sq_GrowTo(shuff, sq->n);  /* make sure shuff can hold sq */
          shuff->n = sq->n;
          targ     = sq->seq;
        }

      /* Seqs < L long are rejected in fixed-length mode. They are skipped by
       * guarding the work rather than with `continue`, so that <sq> is still
       * handed to esl_sq_Reuse() before the next read, as it is for every
       * other sequence.
       */
      if (L == 0 || sq->n >= L)
        {
          for (i = 0; i < N; i++)
            {
              if (L > 0)
                {                       /* fixed-len mode: copy a random subseq */
                  int pos = esl_rnd_Roll(r, sq->n - L + 1);

                  /* pos + L <= sq->n, so exactly L residues are available */
                  memcpy(targ, sq->seq + pos, L);
                  targ[L] = '\0';
                }

              /* Do the requested kind of shuffling */
              if      (esl_opt_GetBoolean(go, "-m"))  esl_rsq_CShuffle     (r, targ,        shuff->seq);  /* monoresidue shuffling */
              else if (esl_opt_GetBoolean(go, "-d"))  esl_rsq_CShuffleDP   (r, targ,        shuff->seq);  /* diresidue shuffling   */
              else if (esl_opt_IsOn      (go, "-k"))  esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq);  /* k-mer shuffling       */
              else if (esl_opt_GetBoolean(go, "-0"))  esl_rsq_CMarkov0     (r, targ,        shuff->seq);  /* 0th order Markov      */
              else if (esl_opt_GetBoolean(go, "-1"))  esl_rsq_CMarkov1     (r, targ,        shuff->seq);  /* 1st order Markov      */
              else if (esl_opt_GetBoolean(go, "-r"))  esl_rsq_CReverse     (   targ,        shuff->seq);  /* reverse               */
              else if (esl_opt_IsOn      (go, "-w"))
                { /* regionally shuffle */
                  int W = esl_opt_GetInteger(go, "-w");
                  esl_rsq_CShuffleWindows(r, targ, W, shuff->seq);
                }

              /* Set the name of the shuffled sequence */
              if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i);
              else       esl_sq_FormatName(shuff, "%s-shuffled",    sq->name);

              /* Output the resulting sequence */
              esl_sqio_Write(ofp, shuff, outfmt, FALSE);

              /* don't need to reuse the shuffled sequence: we will use exactly the same memory */
            }
        }

      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

  /* Only fixed-length mode owns <targ>; in full-length mode it aliases sq->seq */
  if (L > 0) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return eslOK;

 ERROR:
  if (targ != NULL) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return status;
}
/* synthesize_positives()
 * Build synthetic positive test sequences from the domains on <teststack>
 * and write them to cfg->out_seqfp in FASTA format.
 *
 * Each test sequence will contain one or two domains, depending on whether
 * --single is set. Domains are popped from <teststack> (and destroyed here)
 * until fewer than that many remain. For each test sequence, a total length
 * L is taken from a randomly chosen sequence in the cfg->dbfp SSI index that
 * is at least as long as the domains together; the domains are embedded at
 * random positions, and the flanking regions are filled by
 * set_random_segment(). One summary line per sequence goes to
 * cfg->possummfp, and the segment lengths are appended to cfg->test_lens.
 *
 * The number of sequences made is returned in <*ret_ntest>.
 * Returns eslOK; failures are fatal.
 */
static int
synthesize_positives(ESL_GETOPTS *go, struct cfg_s *cfg, char *testname, ESL_STACK *teststack, int *ret_ntest)
{
  ESL_SQ *domain1, *domain2;
  ESL_SQ *sq;
  void   *p;
  int64_t L;                    /* total length of synthetic test seq */
  int     d1n, d2n;             /* lengths of two domains             */
  int     L1, L2, L3;           /* lengths of three random regions    */
  int     i, j;                 /* "insert after" points              */
  int     nmade    = 0;
  int     ndomains = ( (esl_opt_GetBoolean(go, "--single") == TRUE) ? 1 : 2);
  int     status;

  while (esl_stack_ObjectCount(teststack) >= ndomains)
    {
      /* Room for one more entry in the table of test seq layouts */
      ESL_RALLOC(cfg->test_lens, p, (cfg->ntest+1) * sizeof(struct testseq_s));

      /* Pop our one or two test domains off the stack */
      esl_stack_PPop(teststack, &p);
      domain1 = p;
      d1n     = domain1->n;

      domain2 = NULL;
      d2n     = 0;
      if (ndomains == 2)
        {
          esl_stack_PPop(teststack, &p);
          domain2 = p;
          d2n     = domain2->n;
        }

      /* Select a random total sequence length */
      if (d1n+d2n > cfg->db_maxL)
        esl_fatal("can't construct test seq; no db seq >= %d residues\n", d1n+d2n);
      do {
        if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &L, NULL) != eslOK)
          esl_fatal("failed to look up a random seq");
      } while (L < d1n+d2n);

      /* Now figure out the embedding */
      if (ndomains == 2)
        {
          /* Select random lengths of three flanking domains;
           * Imagine picking two "insert after" points i,j in sequence 1..L', for
           * L' = L-d1n-d2n (the total length of nonhomologous test seq)
           */
          do {
            i = esl_rnd_Roll(cfg->r, L - d1n - d2n + 1 ); /* i = 0..L' */
            j = esl_rnd_Roll(cfg->r, L - d1n - d2n + 1 ); /* j = 0..L' */
          } while (i > j);

          /* now 1           .. i         = random region 1 (if i==0, there's none);
           *     i+1         .. i+d1n     = domain 1
           *     i+d1n+1     .. j+d1n     = random region 2 (if i==j, there's none);
           *     j+d1n+1     .. j+d1n+d2n = domain 2
           *     j+d1n+d2n+1 .. L         = random region 3 (if j == L-d1n-d2n, there's none);
           */
          L1 = i;
          L2 = j-i;
          L3 = L - d1n - d2n - j;
        }
      else
        {
          /* embedding one domain */
          i = esl_rnd_Roll(cfg->r, L - d1n + 1 ); /* i = 0..L' */

          /* now 1       .. i     = random region 1 (if i==0, there's none);
           *     i+1     .. i+d1n = domain 1
           *     i+d1n+1 .. L     = random region 2 (if i==L-d1n, there's none);
           */
          L1 = i;
          L2 = L - d1n - L1;
          L3 = 0;
        }

      sq = esl_sq_CreateDigital(cfg->abc);
      esl_sq_GrowTo(sq, L);
      sq->n = L;

      /* The name encodes the test seq number and the domain coordinates */
      if (ndomains == 2)
        {
          esl_sq_FormatName(sq, "%s/%d/%d-%d/%d-%d", testname, cfg->ntest, i+1, i+d1n, j+d1n+1, j+d1n+d2n);
          esl_sq_FormatDesc(sq, "domains: %s %s", domain1->name, domain2->name);
        }
      else
        {
          esl_sq_FormatName(sq, "%s/%d/%d-%d", testname, cfg->ntest, i+1, i+d1n);
          esl_sq_FormatDesc(sq, "domain: %s", domain1->name);
        }

      fprintf(cfg->possummfp, "%-35s %5d %5d %5d %5d %5d %5d",
              sq->name, (int) sq->n, L1, d1n, L2, d2n, L3);

      /* Sentinels go at positions 0 and L+1; residues occupy 1..L */
      sq->dsq[0] = sq->dsq[L+1] = eslDSQ_SENTINEL;

      /* random region 1, domain 1, random region 2 */
      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+1, L1);

      memcpy(sq->dsq+i+1, domain1->dsq+1, sizeof(ESL_DSQ) * d1n);
      fprintf(cfg->possummfp, " %-24s %5d %5d", domain1->name, 1, d1n);

      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+i+d1n+1, L2);

      /* domain 2 and random region 3, when there are two domains */
      if (ndomains == 2)
        {
          memcpy(sq->dsq+j+d1n+1, domain2->dsq+1, sizeof(ESL_DSQ) * d2n);
          fprintf(cfg->possummfp, " %-24s %5d %5d", domain2->name, 1, d2n);

          set_random_segment(go, cfg, cfg->possummfp, sq->dsq+j+d1n+d2n+1, L3);
        }
      fprintf(cfg->possummfp, "\n");

      /* Record this layout in the table of test seq layouts */
      cfg->test_lens[cfg->ntest].L   = L;
      cfg->test_lens[cfg->ntest].L1  = L1;
      cfg->test_lens[cfg->ntest].d1n = d1n;
      cfg->test_lens[cfg->ntest].L2  = L2;
      cfg->test_lens[cfg->ntest].d2n = d2n;
      cfg->test_lens[cfg->ntest].L3  = L3;
      cfg->ntest++;
      nmade++;

      esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE);

      esl_sq_Destroy(domain1);
      if (ndomains == 2) esl_sq_Destroy(domain2);
      esl_sq_Destroy(sq);
    }

  *ret_ntest = nmade;
  return eslOK;

 ERROR:
  esl_fatal("Failure in synthesize_positives");
  return status;
}
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the seqs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire seq file in a single pass, outputting seqs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the seqs in the order they
 * appear in the <keyfile>, but without an SSI index, you get seqs in
 * the order they occur in the seq file.
 *
 * A key that occurs twice in <keyfile> is fatal, as is ending up with a
 * number of retrieved sequences different from the number of keys. When
 * <ofp> is not stdout, a one-line count is printed to stdout at the end.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys       = esl_keyhash_Create();
  ESL_FILEPARSER *efp        = NULL;
  int             nretrieved = 0;
  int             nkeys      = 0;
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  /* Pass 1: collect the keys, fetching immediately when an SSI index exists */
  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);

      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if (status == eslEDUP)
        esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL)
        {
          onefetch(go, ofp, key, sqfp);
          nretrieved++;
        }
      nkeys++;
    }

  /* Pass 2: if we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();
      int     wanted;

      for (;;)
        {
          status = esl_sqio_Read(sqfp, sq);
          if (status != eslOK) break;

          /* A seq is wanted if its (nonempty) name, or failing that its
           * (nonempty) accession, is one of the keys.
           */
          wanted = FALSE;
          if (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK)
            wanted = TRUE;
          else if (sq->acc[0] != '\0' && esl_keyhash_Lookup(keys, sq->acc, -1, NULL) == eslOK)
            wanted = TRUE;

          if (wanted)
            {
              if (esl_opt_GetBoolean(go, "-r") && esl_sq_ReverseComplement(sq) != eslOK)
                esl_fatal("Failed to reverse complement %s\n", sq->name);

              esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
              nretrieved++;
            }
          esl_sq_Reuse(sq);
        }

      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      esl_sq_Destroy(sq);
    }

  if (nkeys != nretrieved)
    esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nretrieved);

  if (ofp != stdout)
    printf("\nRetrieved %d sequences.\n", nretrieved);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
}