/* onefetch_subseq():
 * Fetch the subsequence <given_start>..<given_end> of the sequence <key>
 * from <sqfp> and write it to <ofp> in FASTA format. <sqfp> must have an
 * open SSI index; it is a fatal error if it does not.
 *
 * Coordinates:
 *   - If <given_end> is nonzero and <given_start> > <given_end>, the two are
 *     swapped for the fetch and the fetched sequence is reverse complemented.
 *   - <given_end> == 0 is handed to esl_sqio_FetchSubseq() unchanged (a
 *     "-c 52" style request arrives here as 52,0); the output name then uses
 *     <sq->L> as its end coordinate.
 *
 * Naming: the result is named <newname> if that is non-NULL, otherwise
 * "<key>/<given_start>-<end>".
 *
 * The "-r" option in <go> requests an additional reverse complement; if the
 * coordinates already requested one, the two cancel out.
 *
 * Returns nothing; every failure is reported through esl_fatal().
 */
static void
onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end)
{
  int64_t  start, end;          /* wide enough for any uint32_t coordinate */
  int      do_revcomp;
  ESL_SQ  *sq = esl_sq_Create();

  if (sq == NULL)                   esl_fatal("failed to allocate sequence");
  if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index");

  /* reverse complement indicated by coords. */
  /* -c 52: would be 52,0, so watch out for given_end = 0 case */
  if (given_end != 0 && given_start > given_end)
    {
      start      = given_end;
      end        = given_start;
      do_revcomp = TRUE;
    }
  else
    {
      start      = given_start;
      end        = given_end;
      do_revcomp = FALSE;
    }

  /* The error buffer is data, not a format string: always print it via "%s". */
  if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK)
    esl_fatal("%s", esl_sqfile_GetErrorBuf(sqfp));

  if (newname != NULL)
    esl_sq_SetName(sq, newname);
  else
    {
      /* Cast explicitly so the arguments match the conversion specifiers
       * whatever the width of <sq->L> is.
       */
      long long name_end = (given_end == 0) ? (long long) sq->L : (long long) given_end;

      esl_sq_FormatName(sq, "%s/%lld-%lld", key, (long long) given_start, name_end);
    }

  /* Two ways we might have been asked to revcomp: by coord, or by -r option */
  /* (If both happen, they'll cancel each other out) */
  if (do_revcomp)
    if (esl_sq_ReverseComplement(sq) != eslOK)
      esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);

  if (esl_opt_GetBoolean(go, "-r"))
    if (esl_sq_ReverseComplement(sq) != eslOK)
      esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);

  if (esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE) != eslOK)
    esl_fatal("failed to write sequence");

  esl_sq_Destroy(sq);
}
/* onefetch():
 * Given one <key> (a seq name or accession), retrieve the corresponding
 * sequence from <sqfp> and write it to <ofp>.
 *
 * With an SSI index open on <sqfp>, the file is positioned directly at the
 * record for <key> and that one record is read. Without an SSI index, the
 * file is read sequentially until a sequence whose name or accession equals
 * <key> is found.
 *
 * Options read from <go>:
 *   -r   reverse complement the sequence before writing it
 *   -n   new name to give the sequence (NULL if not set)
 *
 * If neither option is in effect and the input is not an alignment format,
 * the record is echoed with esl_sqio_Echo(); otherwise the parsed sequence
 * is written in FASTA format.
 *
 * Returns nothing; every failure is reported through esl_fatal().
 */
static void
onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp)
{
  ESL_SQ *sq         = esl_sq_Create();
  int     do_revcomp = esl_opt_GetBoolean(go, "-r");
  char   *newname    = esl_opt_GetString(go, "-n");
  int     status;

  if (sq == NULL) esl_fatal("failed to allocate sequence");

  /* Try to position the file at the desired sequence with SSI. */
  if (sqfp->data.ascii.ssi != NULL)
    {
      status = esl_sqfile_PositionByKey(sqfp, key);
      if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", key, sqfp->filename);
      else if (status == eslEFORMAT)   esl_fatal("Failed to parse SSI index for %s\n", sqfp->filename);
      else if (status != eslOK)        esl_fatal("Failed to look up location of seq %s in SSI index of file %s\n", key, sqfp->filename);

      status = esl_sqio_Read(sqfp, sq);
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      /* The only conversion in this message is %s, so the only argument is the file name. */
      else if (status == eslEOF)     esl_fatal("Unexpected EOF reading sequence file %s", sqfp->filename);
      else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      /* Sanity check: the record the index pointed at must actually be <key>. */
      if (strcmp(key, sq->name) != 0 && strcmp(key, sq->acc) != 0)
        esl_fatal("whoa, internal error; found the wrong sequence %s, not %s", sq->name, key);
    }
  else
    {
      /* Else, we have to read the whole damn file sequentially until we find the seq */
      while ((status = esl_sqio_Read(sqfp, sq)) != eslEOF)
        {
          if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

          if (strcmp(key, sq->name) == 0 || strcmp(key, sq->acc) == 0) break;
          esl_sq_Reuse(sq);
        }
      if (status == eslEOF)
        esl_fatal("Failed to find sequence %s in file %s\n", key, sqfp->filename);
    }

  if (do_revcomp == FALSE && newname == NULL && ! esl_sqio_IsAlignment(sqfp->format))
    {
      /* If we're not manipulating the sequence in any way, and it's not from an alignment file, we can Echo() it. */
      if (esl_sqio_Echo(sqfp, sq, ofp) != eslOK)
        esl_fatal("Echo failed: %s\n", esl_sqfile_GetErrorBuf(sqfp));
    }
  else
    {
      /* Otherwise we Write() the parsed version. */
      if (do_revcomp && esl_sq_ReverseComplement(sq) != eslOK)
        esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
      if (newname != NULL)
        esl_sq_SetName(sq, newname);
      if (esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE) != eslOK)
        esl_fatal("failed to write sequence");
    }

  esl_sq_Destroy(sq);
}
/* emit_fancycons():
 * Build a consensus sequence for <hmm> with p7_emit_FancyConsensus(), name
 * it "<hmm->name>-consensus", and write it to <ofp> in format <outfmt>.
 *
 * The values of the "--minl" and "--minu" options in <go> are passed through
 * to p7_emit_FancyConsensus() as its two threshold arguments.
 *
 * Returns nothing; any failure is fatal via esl_fatal().
 */
static void
emit_fancycons(ESL_GETOPTS *go, FILE *ofp, int outfmt, P7_HMM *hmm)
{
  float   lower_thresh = esl_opt_GetReal(go, "--minl");
  float   upper_thresh = esl_opt_GetReal(go, "--minu");
  ESL_SQ *consensus    = esl_sq_Create();
  int     rc;

  if (consensus == NULL)
    esl_fatal("failed to allocate sequence");

  rc = p7_emit_FancyConsensus(hmm, lower_thresh, upper_thresh, consensus);
  if (rc != eslOK)
    esl_fatal("failed to create consensus seq");

  rc = esl_sq_FormatName(consensus, "%s-consensus", hmm->name);
  if (rc != eslOK)
    esl_fatal("failed to set sequence name");

  rc = esl_sqio_Write(ofp, consensus, outfmt, FALSE);
  if (rc != eslOK)
    esl_fatal("failed to write sequence");

  esl_sq_Destroy(consensus);
}
int main(int argc, char **argv) { ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = NULL; ESL_SQ *dsq = NULL; ESL_SQ **prot; int c; ESL_ALPHABET *abc, *prot_abc; ESL_SQ *prot6[6]; int x; abc = esl_alphabet_Create(eslDNA); prot_abc = esl_alphabet_Create(eslAMINO); if(argc != 2) { printf("You need to pass an argument for a filepath to a dna/rna fasta file\n"); exit(0); } if(eslOK != esl_sqfile_Open(argv[1], eslSQFILE_FASTA, NULL, &sqfp)) { printf("Invalid filepath: %s\n", argv[1]); exit(0); } sq = esl_sq_Create(); if(sq == NULL) { printf("could not allocate new sequence\n"); exit(0); } if(esl_sqio_Read(sqfp, sq) != eslOK) { printf("Not a valid fasta file %s\n", argv[1]); exit(0); } dsq = esl_sq_Create(); if(dsq == NULL) { printf("could not allocate digital sequence\n"); exit(0); } if(esl_sq_Copy(sq, dsq) != eslOK) { printf("could not copy sequence\n"); exit(0); } if(esl_sq_Digitize(abc, dsq) != eslOK) { printf("could not digitize sequence\n"); exit(0); } esl_sqio_Write(stdout, sq, eslSQFILE_FASTA, 0); if(esl_trans_6frame(sq, prot6) != eslOK) { printf("could not generate six frame translation\n"); exit(0); } for(x = 0; x < 6; x++) { esl_sqio_Write(stdout, prot6[x], eslSQFILE_FASTA, 0); } if(esl_trans_orf(dsq, &prot, &c, 10) != eslOK) { printf("could not translate open reading frames\n"); exit(0); } for(x = 0; x < c; x++) { esl_sqio_Write(stdout, prot[x], eslSQFILE_FASTA, 0); } return 0; }
/* mask_seq_range():
 * Mask positions <from>..<to> (0-based, inclusive) of the text sequence in
 * <sq>, in place. Only alphabetic characters are changed: each is converted
 * to lowercase if <do_lowercase> is TRUE, otherwise replaced by <maskchar>.
 * An empty range (<from> > <to>) does nothing. The caller is responsible for
 * clamping the range to 0..sq->n-1.
 */
static void
mask_seq_range(ESL_SQ *sq, int64_t from, int64_t to, int do_lowercase, int maskchar)
{
  int64_t pos;

  for (pos = from; pos <= to; pos++)
    /* <ctype.h> functions require a value representable as unsigned char. */
    if (isalpha((unsigned char) sq->seq[pos]))
      sq->seq[pos] = (do_lowercase ? tolower((unsigned char) sq->seq[pos]) : maskchar);
}

/* main():
 * Mask regions of sequences. Takes two command line arguments, <seqfile> and
 * <maskfile>. Each data line of <maskfile> has the fields
 * <seqname> <start> <end>, with 1-based coordinates; '#' starts a comment.
 * For each line, the named sequence is obtained from <seqfile>, masked, and
 * written in FASTA format.
 *
 * Options read here:
 *   -h           show help
 *   -R           fetch each sequence by name through an SSI index instead of
 *                assuming <seqfile> and <maskfile> are in the same order
 *   -l           mask by lowercasing (the sequence is uppercased first)
 *   -m <c>       mask character (default 'X')
 *   -x <n>       number of extra residues to mask beyond the given coords
 *   -r           reverse masking: mask everything outside start..end
 *   -o <f>       write output to <f> instead of stdout
 *   --informat   assert the input sequence file format
 *
 * Returns 0 on success; failures exit through cmdline_failure()/esl_fatal().
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = NULL;               /* application configuration       */
  char           *seqfile  = NULL;               /* sequence file name              */
  char           *maskfile = NULL;               /* mask coordinate file name       */
  int             infmt    = eslSQFILE_UNKNOWN;  /* format code for seqfile         */
  int             outfmt   = eslSQFILE_FASTA;    /* format code for output seqs     */
  ESL_SQFILE     *sqfp     = NULL;               /* open sequence file              */
  ESL_FILEPARSER *maskefp  = NULL;               /* open mask coord file            */
  FILE           *ofp      = NULL;               /* output stream for masked seqs   */
  char           *source   = NULL;               /* name of current seq to mask     */
  char           *tok;                           /* current coordinate token        */
  char           *endp;                          /* where strtoll() stopped parsing */
  int64_t         start, end;                    /* start, end coord for masking    */
  int64_t         i, j, pos;                     /* coords in a sequence            */
  int64_t         overmask;                      /* # of extra residues to mask     */
  ESL_SQ         *sq       = esl_sq_Create();    /* current sequence                */
  int             do_fetching;
  int             do_lowercase;
  int             maskchar;
  int             status;                        /* easel return code               */

  if (sq == NULL) esl_fatal("Failed to allocate sequence\n");

  /****************************************************************************
   * Parse command line
   ****************************************************************************/
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  do_fetching  = esl_opt_GetBoolean(go, "-R");
  do_lowercase = esl_opt_GetBoolean(go, "-l");
  overmask     = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0);
  maskchar     = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m")    : 'X');

  seqfile  = esl_opt_GetArg(go, 1);
  maskfile = esl_opt_GetArg(go, 2);

  /* Open the <seqfile>: text mode, not digital */
  if (esl_opt_GetString(go, "--informat") != NULL)
    {
      infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
      if (infmt == eslSQFILE_UNKNOWN)
        cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat\n",
                        esl_opt_GetString(go, "--informat"));
    }

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n",     seqfile);
  else if (status == eslEFORMAT)   cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile);
  else if (status == eslEINVAL)    cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n");
  else if (status != eslOK)        cmdline_failure(argv[0], "Open failed, code %d.\n", status);

  if (do_fetching)
    {
      status = esl_sqfile_OpenSSI(sqfp, NULL);
      if      (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n");
      else if (status == eslERANGE)  cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
      else if (status != eslOK)      cmdline_failure(argv[0], "Failed to open SSI index\n");
    }

  /* Open the <maskfile> */
  if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK)
    cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile);
  esl_fileparser_SetCommentChar(maskefp, '#');

  /* Open the output file, if any */
  if (esl_opt_GetString(go, "-o") != NULL)
    {
      if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
        cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o"));
    }
  else ofp = stdout;

  /****************************************************************************
   * Main loop over lines in <maskfile>
   ****************************************************************************/

  /* Read one data line at a time from the <maskfile>;
   * parse into data fields <seqname> <start> <end>
   */
  while (esl_fileparser_NextLine(maskefp) == eslOK)
    {
      /* First field is sequence name */
      if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK)
        esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile);

      /* Get the sequence */
      if (do_fetching)
        { /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */
          status = esl_sqio_Fetch(sqfp, source, sq);
          if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename);
          else if (status == eslEINVAL)    esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename);
          else if (status == eslEFORMAT)   esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)        esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename);
        }
      else
        { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */
          status = esl_sqio_Read(sqfp, sq);
          if      (status == eslEOF)     esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source);
          else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
          else if (status != eslOK)      esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename);

          if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0))
            esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R");
        }

      /* If we're masking by lowercase, first make sure everything's uppercase */
      if (do_lowercase)
        for (pos = 0; pos < sq->n; pos++)
          if (isalpha((unsigned char) sq->seq[pos]))
            sq->seq[pos] = toupper((unsigned char) sq->seq[pos]);

      /* Next two fields are <start>, <end> for the masking */
      /* possible future extension: wrap loop around this, enable multiple masked regions */
      /* Coordinates in the file are 1-based; convert to 0-based. Reject a
       * token that is not entirely a number rather than silently using 0.
       */
      if (esl_fileparser_GetTokenOnLine(maskefp, &tok, NULL) != eslOK)
        esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      start = strtoll(tok, &endp, 0) - 1;
      if (endp == tok || *endp != '\0')
        esl_fatal("Failed to parse start coord %s on line %d of file %s\n", tok, maskefp->linenumber, maskfile);

      if (esl_fileparser_GetTokenOnLine(maskefp, &tok, NULL) != eslOK)
        esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      end = strtoll(tok, &endp, 0) - 1;
      if (endp == tok || *endp != '\0')
        esl_fatal("Failed to parse end coord %s on line %d of file %s\n", tok, maskefp->linenumber, maskfile);

      /* Do the masking */
      if (esl_opt_GetBoolean(go, "-r"))  /* Reverse masking */
        { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */
          i = 0;
          j = ESL_MIN(sq->n-1, start - 1 + overmask);
          mask_seq_range(sq, i, j, do_lowercase, maskchar);

          i = ESL_MAX(0, end + 1 - overmask);
          j = sq->n-1;
          mask_seq_range(sq, i, j, do_lowercase, maskchar);
        }
      else
        { /* normal: mask start..end */
          i = ESL_MAX(0,       start - overmask);
          j = ESL_MIN(sq->n-1, end   + overmask);
          mask_seq_range(sq, i, j, do_lowercase, maskchar);
        }

      esl_sqio_Write(ofp, sq, outfmt, FALSE);
      esl_sq_Reuse(sq);
    }

  esl_sq_Destroy(sq);
  esl_fileparser_Close(maskefp);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  if (ofp != stdout) fclose(ofp);
  return 0;
}
/* seq_shuffling()
 * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg]
 *
 * Shuffling of input sequences: read each sequence from the file named by
 * command line argument 1, produce <N> ("-N") shuffled versions of it, and
 * write them to <ofp> in format <outfmt>.
 *
 * Fixed-length (L>0) vs. full-length (L=0) modes handled differently.
 * In fixed-length mode:
 *   <shuff->seq> only needs to be allocated once, for L
 *   <targ> is an allocated copy of a random subseq of length L
 *   sequences < L residues long can't be shuffled
 * In full-length mode:
 *   <shuff->seq> is grown to length <sq->n> for each input seq
 *   <targ> just points to <sq->seq>
 *
 * The shuffling method is chosen by the first of these options that is set:
 * -m, -d, -k, -0, -1, -r, -w. Output names are "<name>-shuffled", with a
 * "-<i>" suffix when N > 1.
 *
 * Returns <eslOK> on success, or the failing status if allocating <targ>
 * fails. File open and parse failures are fatal via esl_fatal().
 */
static int
seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  char       *seqfile = esl_opt_GetArg(go, 1);
  int         infmt   = eslSQFILE_UNKNOWN;
  ESL_SQFILE *sqfp    = NULL;
  ESL_SQ     *sq      = esl_sq_Create();
  ESL_SQ     *shuff   = esl_sq_Create();
  char       *targ    = NULL;
  int         N       = esl_opt_GetInteger(go, "-N");
  int         L       = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */
  int         kmers   = 0;
  int         i;
  int         status;

  if (sq == NULL || shuff == NULL) esl_fatal("Failed to allocate sequence");

  if (esl_opt_GetString(go, "--informat") != NULL)
    {
      infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
      if (infmt == eslSQFILE_UNKNOWN)
        esl_fatal("%s is not a valid input sequence file format for --informat",
                  esl_opt_GetString(go, "--informat"));
    }

  if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k");

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  if (L>0)
    {
      esl_sq_GrowTo(shuff, L);
      shuff->n = L;
      ESL_ALLOC(targ, sizeof(char) * (L+1));
    }

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      if (L == 0)
        {                               /* shuffling entire sequence   */
          esl_sq_GrowTo(shuff, sq->n);  /* make sure shuff can hold sq */
          shuff->n = sq->n;
          targ = sq->seq;
        }
      else
        {
          if (sq->n < L)
            {                           /* reject seqs < L long */
              esl_sq_Reuse(sq);         /* reset <sq> before the next read, as every other iteration does */
              continue;
            }
        }

      for (i = 0; i < N; i++)
        {
          if (L > 0)
            { /* fixed-len mode: copy a random subseq */
              int pos = esl_rnd_Roll(r, sq->n - L + 1);

              /* sq->n >= L here, so pos..pos+L-1 lies inside sq->seq. */
              memcpy(targ, sq->seq + pos, L);
              targ[L] = '\0';
            }

          /* Do the requested kind of shuffling */
          if      (esl_opt_GetBoolean(go, "-m")) esl_rsq_CShuffle     (r, targ,        shuff->seq); /* monoresidue shuffling */
          else if (esl_opt_GetBoolean(go, "-d")) esl_rsq_CShuffleDP   (r, targ,        shuff->seq); /* diresidue shuffling   */
          else if (esl_opt_IsOn      (go, "-k")) esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq); /* k-mer shuffling       */
          else if (esl_opt_GetBoolean(go, "-0")) esl_rsq_CMarkov0     (r, targ,        shuff->seq); /* 0th order Markov      */
          else if (esl_opt_GetBoolean(go, "-1")) esl_rsq_CMarkov1     (r, targ,        shuff->seq); /* 1st order Markov      */
          else if (esl_opt_GetBoolean(go, "-r")) esl_rsq_CReverse     (   targ,        shuff->seq); /* reverse               */
          else if (esl_opt_IsOn      (go, "-w"))
            { /* regionally shuffle */
              int W = esl_opt_GetInteger(go, "-w");
              esl_rsq_CShuffleWindows(r, targ, W, shuff->seq);
            }

          /* Set the name of the shuffled sequence */
          if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i);
          else       esl_sq_FormatName(shuff, "%s-shuffled", sq->name);

          /* Output the resulting sequence */
          esl_sqio_Write(ofp, shuff, outfmt, FALSE);

          /* don't need to reuse the shuffled sequence: we will use exactly the same memory */
        }
      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

  /* In full-length mode <targ> only aliases sq->seq, so it is freed in fixed-length mode only. */
  if (L>0) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return eslOK;

 ERROR:
  /* Reached only from ESL_ALLOC() above, before <targ> can alias sq->seq. */
  if (targ != NULL) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return status;
}
int main(int argc, char **argv) { char *cmfile; ESL_ALPHABET *abc; char *seqfile; ESL_SQFILE *sqfp; int format; CM_FILE *cmfp; CM_t *cm; ESL_SQ *seq; float sc, rev_sc; Parsetree_t *tr; Fancyali_t *fali; Fancyali_t *rev_fali; int do_local; /* int status; */ /* char *optname; */ /* char *optarg; */ int optind; int status; char errbuf[eslERRBUFSIZE]; cmfile = seqfile = NULL; abc = NULL; sqfp = NULL; cmfp = NULL; cm = NULL; seq = NULL; tr = NULL; fali = NULL; rev_fali = NULL; format = eslSQFILE_UNKNOWN; do_local = TRUE; /* Should process options, but for now assume none and set optind */ optind = 1; if ( argc - optind != 2 ) cm_Die("Incorrect number of arguments\n"); cmfile = argv[optind++]; seqfile = argv[optind++]; if((status = cm_file_Open(cmfile, NULL, FALSE, &cmfp, errbuf)) != eslOK) cm_Die("Failed to open covariance model save file\n"); if ((status = cm_file_Read(cmfp, TRUE, &abc, &cm)) != eslOK) cm_Die("Failed to read a CM from cm file\n"); if (cm == NULL) cm_Die("CM file empty?\n"); cm_file_Close(cmfp); if ( esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK ) cm_Die("Failed to open sequence database file\n"); if (do_local) cm->config_opts |= CM_CONFIG_LOCAL; if((status = cm_Configure(cm, errbuf, -1)) != eslOK) cm_Die(errbuf); /*SetMarginalScores_reproduce_bug_i27(cm);*/ seq = esl_sq_Create(); while ( esl_sqio_Read(sqfp, seq) == eslOK ) { if (seq->n == 0) continue; int i0 = 1; int j0 = seq->n; if (seq->dsq == NULL) esl_sq_Digitize(abc, seq); sc = TrCYK_DnC(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */ /* sc = TrCYK_Inside(cm, seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, FALSE, &tr); */ fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons, seq->dsq, FALSE, NULL); /* float sc, struct_sc; * ParsetreeScore(cm, NULL, NULL, tr, seq->dsq, FALSE, &sc, &struct_sc, NULL, NULL, NULL); * printf("Parsetree score: %.4f\n", sc); * ParsetreeDump(stdout, tr, cm, seq->dsq); */ FreeParsetree(tr); 
revcomp(abc, seq, seq); rev_sc = TrCYK_DnC(cm,seq->dsq, seq->n, 0, i0, j0, PLI_PASS_5P_AND_3P_ANY, TRUE, &tr); /* TRUE: reproduce v1.0 behavior */ rev_fali = CreateFancyAli(cm->abc, tr, cm, cm->cmcons,seq->dsq, FALSE, NULL); /*ParsetreeDump(stdout, tr, cm, seq->dsq);*/ FreeParsetree(tr); if (sc > rev_sc) { printf("sequence: %s\n", seq->name); printf("score: %.2f\n",sc); PrintFancyAli(stdout, fali, 0, FALSE, FALSE, 60); } else { printf("sequence: %s (reversed)\n", seq->name); printf("score: %.2f\n",rev_sc); PrintFancyAli(stdout, fali, seq->n, TRUE, FALSE, 60); } FreeFancyAli(fali); FreeFancyAli(rev_fali); esl_sq_Destroy(seq); seq = esl_sq_Create(); } esl_sq_Destroy(seq); FreeCM(cm); esl_sqfile_Close(sqfp); return EXIT_SUCCESS; }
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the seqs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire seq file in a single pass, outputting seqs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the seqs in the order they
 * appear in the <keyfile>, but without an SSI index, you get seqs in
 * the order they occur in the seq file.
 *
 * '#' starts a comment in <keyfile>. A key that occurs twice is a fatal
 * error, as is retrieving a different number of sequences than there were
 * keys. In the non-SSI pass, "-r" in <go> reverse complements each sequence
 * before it is written in FASTA format. When <ofp> is not stdout, a summary
 * line is printed to stdout.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys  = esl_keyhash_Create();
  ESL_FILEPARSER *efp   = NULL;
  int             nseq  = 0;    /* number of sequences written       */
  int             nkeys = 0;    /* number of keys read from keyfile  */
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (keys == NULL) esl_fatal("Failed to allocate key hash\n");

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);

      /* Report a failed store here, rather than letting it surface later
       * as a confusing key/sequence count mismatch.
       */
      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if      (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store seq key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL)
        {
          onefetch(go, ofp, key, sqfp);
          nseq++;
        }
      nkeys++;
    }

  /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();

      if (sq == NULL) esl_fatal("failed to allocate sequence");

      while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
        {
          /* A sequence is wanted if either its name or its accession is a key. */
          if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
               (sq->acc[0]  != '\0' && esl_keyhash_Lookup(keys, sq->acc,  -1, NULL) == eslOK))
            {
              if (esl_opt_GetBoolean(go, "-r") )
                if (esl_sq_ReverseComplement(sq) != eslOK)
                  esl_fatal("Failed to reverse complement %s\n", sq->name);
              esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
              nseq++;
            }
          esl_sq_Reuse(sq);
        }
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      esl_sq_Destroy(sq);
    }

  if (nkeys != nseq)
    esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);

  if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
/* create_ssi_index():
 * Create an SSI index file for open sequence file <sqfp>.
 * Both name and accession of sequences are stored as keys: the name as the
 * primary key, and a nonempty accession as an alias of it.
 *
 * The index is written to "<sqfp->filename>.ssi", overwriting an existing
 * file of that name. If <sqfp> reports positive <bpl> and <rpl> values, the
 * file is also marked for fast subsequence lookup. Progress and a summary
 * are printed to stdout.
 *
 * <go> is not used by this function.
 *
 * Returns nothing; every failure is reported through esl_fatal().
 */
static void
create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp)
{
  ESL_NEWSSI *ns      = NULL;
  ESL_SQ     *sq      = esl_sq_Create();
  int         nseq    = 0;
  char       *ssifile = NULL;
  uint16_t    fh;               /* file handle assigned by esl_newssi_AddFile() */
  int         status;

  if (sq == NULL) esl_fatal("failed to allocate sequence");

  if (esl_strdup(sqfp->filename, -1, &ssifile) != eslOK ||
      esl_strcat(&ssifile, -1, ".ssi", 4)      != eslOK)
    esl_fatal("failed to construct SSI index file name for %s", sqfp->filename);

  status = esl_newssi_Open(ssifile, TRUE, &ns); /* TRUE is for allowing overwrite. */
  if      (status == eslENOTFOUND)   esl_fatal("failed to open SSI index %s", ssifile);
  else if (status == eslEOVERWRITE)  esl_fatal("SSI index %s already exists; delete or rename it", ssifile); /* won't happen, see TRUE above... */
  else if (status != eslOK)          esl_fatal("failed to create a new SSI index");

  if (esl_newssi_AddFile(ns, sqfp->filename, sqfp->format, &fh) != eslOK)
    esl_fatal("Failed to add sequence file %s to new SSI index\n", sqfp->filename);

  printf("Creating SSI index for %s...    ", sqfp->filename);
  fflush(stdout);

  while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
    {
      nseq++;

      /* Elsewhere in this file an absent name or accession is detected as
       * an empty string, so test for that as well as for NULL.
       */
      if (sq->name == NULL || sq->name[0] == '\0')
        esl_fatal("Every sequence must have a name to be indexed. Failed to find name of seq #%d\n", nseq);

      if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK)
        esl_fatal("Failed to add key %s to SSI index", sq->name);

      if (sq->acc[0] != '\0')
        {
          if (esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK)
            esl_fatal("Failed to add secondary key %s to SSI index", sq->acc);
        }
      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

  /* Determine if the file was suitable for fast subseq lookup. */
  if (sqfp->data.ascii.bpl > 0 && sqfp->data.ascii.rpl > 0)
    {
      if ((status = esl_newssi_SetSubseq(ns, fh, sqfp->data.ascii.bpl, sqfp->data.ascii.rpl)) != eslOK)
        esl_fatal("Failed to set %s for fast subseq lookup.", sqfp->filename);
    }

  /* Save the SSI file to disk */
  if (esl_newssi_Write(ns) != eslOK)
    esl_fatal("Failed to write keys to ssi file %s\n", ssifile);

  /* Done - output and exit. */
  printf("done.\n");
  if (ns->nsecondary > 0)
    printf("Indexed %d sequences (%ld names and %ld accessions).\n", nseq, (long) ns->nprimary, (long) ns->nsecondary);
  else
    printf("Indexed %d sequences (%ld names).\n", nseq, (long) ns->nprimary);
  printf("SSI index written to file %s\n", ssifile);

  free(ssifile);
  esl_sq_Destroy(sq);
  esl_newssi_Close(ns);
  return;
}