/**
 * Reads past the next ambiguous or unambiguous stretch of sequence
 * from the given FASTA file and returns its length.  Does not do
 * anything with the sequence characters themselves; this is purely for
 * measuring lengths.
 *
 * The function is meant to be called repeatedly on the same stream.
 * It remembers the last character it consumed in a function-local
 * static ('lastc'), so successive calls pick up where the previous one
 * stopped: '>' means "at a name line", -1 means "input exhausted", and
 * any other value is the gap character that ended the previous record.
 * Because of that static, calls for different streams must not be
 * interleaved.
 *
 * @param in     buffered FASTA input, read via get(), getPastNewline()
 *               and getPastWhitespace()
 * @param rparms read-in parameters; this function consults 'nsToAs',
 *               'color' and 'bisulfite'
 * @param first  true iff this is the first call for 'in'; leading
 *               whitespace is skipped and a '>' is expected (checked
 *               only by assert)
 * @param bpout  if non-NULL, every unambiguous character counted in
 *               'len' is also written here (as a color when
 *               rparms.color is set, as a nucleotide code otherwise)
 * @return RefRecord(off, len, first) where 'off' is the number of
 *         ambiguous characters skipped before the stretch, 'len' is
 *         the number of unambiguous characters in the stretch, and
 *         'first' is true iff the record starts a new sequence
 */
RefRecord fastaRefReadSize(FileBuf& in,
                           const RefReadInParams& rparms,
                           bool first,
                           BitpairOutFileBuf* bpout)
{
	int c;
	static int lastc = '>'; // last character seen; persists across calls

	// RefRecord params
	size_t len = 0; // 'len' counts toward total length
	// 'off' counts number of ambiguous characters before first
	// unambiguous character
	size_t off = 0;

	// Pick off the first caret ('>') and any preceding whitespace
	if(first) {
		assert(!in.eof());
		lastc = '>';
		c = in.getPastWhitespace();
		if(in.eof()) {
			// Got eof right away; emit warning
			cerr << "Warning: Empty input file" << endl;
			lastc = -1;
			return RefRecord(0, 0, true);
		}
		// Only enforced in debug builds
		assert(c == '>');
	}

	// From here on 'first' is reused as the "this record begins a new
	// sequence" flag that goes into the returned RefRecord
	first = true;
	// Skip to the end of the id line; if the next line is either
	// another id line or a comment line, keep skipping
	if(lastc == '>') {
		// Skip to the end of the name line
		do {
			if((c = in.getPastNewline()) == -1) {
				// No more input
				cerr << "Warning: Encountered empty reference sequence" << endl;
				lastc = -1;
				return RefRecord(0, 0, true);
			}
			if(c == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			// continue until a non-name, non-comment line
		} while (c == '>');
	} else {
		first = false; // not the first in a sequence
		off = 1; // The gap has already been consumed, so count it
		if((c = in.get()) == -1) {
			// Don't emit a warning, since this might legitimately be
			// a gap on the end of the final sequence in the file
			lastc = -1;
			return RefRecord(off, len, first);
		}
	}

	// Now skip to the first DNA character, counting gap characters
	// as we go
	int lc = -1; // last-DNA char variable for color conversion
	while(true) {
		int cat = dna4Cat[c];
		// NOTE(review): 'cat' is not recomputed after c becomes 'A', so
		// in this loop an N is still handled by the cat == 2 branch
		// below even when nsToAs is set.  The read-in loop further down
		// converts before classifying.  Confirm whether the difference
		// is intended before changing it; other readers of the same
		// stream may rely on the offsets produced here.
		if(rparms.nsToAs && cat == 2) c = 'A';
		if(cat == 1) {
			// This is a DNA character
			if(rparms.color) {
				if(lc != -1) {
					// Got two consecutive unambiguous DNAs
					break; // to read-in loop
				}
				// Keep going; we need two consecutive unambiguous DNAs
				lc = charToDna5[(int)c];
				// The 'if(off > 0)' takes care of the case where
				// the reference is entirely unambiguous and we don't
				// want to incorrectly increment off.
				if(off > 0) off++;
			} else {
				break; // to read-in loop
			}
		} else if(cat == 2) {
			// In color space a lone DNA character followed by a gap
			// cannot form a color, so it is counted as part of the gap
			if(lc != -1 && off == 0) off++;
			lc = -1;
			off++; // skip over gap character and increment
		} else if(c == '>') {
			// Hit the next name line before finding any sequence
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = '>';
			return RefRecord(off, 0, first);
		}
		c = in.get();
		if(c == -1) {
			// End-of-file
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = -1;
			return RefRecord(off, 0, first);
		}
	}
	assert(!rparms.color || (lc != -1));
	assert_eq(1, dna4Cat[c]); // C must be unambiguous base
	if(off > 0 && rparms.color && first) {
		// Handle the case where the first record has ambiguous
		// characters but we're in color space; one of those counts is
		// spurious
		off--;
	}
	// in now points just past the first character of a sequence
	// line, and c holds the first character
	while(c != -1 && c != '>') {
		// Here the N -> A conversion happens before classification
		if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
		uint8_t cat = dna4Cat[c];
		int cc = toupper(c);
		// Bisulfite mode rewrites every C as T
		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
		if(cat == 1) {
			// It's a DNA character
			assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
			// Consume it
			len++;
			// Output it
			if(bpout != NULL) {
				if(rparms.color) {
					// output color
					bpout->write(dinuc2color[charToDna5[(int)c]][lc]);
				} else if(!rparms.color) {
					// output nucleotide
					bpout->write(charToDna5[c]);
				}
			}
			lc = charToDna5[(int)c];
		} else if(cat == 2) {
			// It's an N or a gap; the stretch ends here.  Remember the
			// gap character so the next call knows it was already
			// consumed.
			lastc = c;
			assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
			return RefRecord(off, len, first);
		} else {
			// Not DNA and not a gap, ignore it
#ifndef NDEBUG
			// Debug builds report anything that is not whitespace
			if(!isspace(c)) {
				cerr << "Unexpected character in sequence: ";
				if(isprint(c)) {
					cerr << ((char)c) << endl;
				} else {
					cerr << "(" << c << ")" << endl;
				}
			}
#endif
		}
		c = in.get();
	}
	// Stopped at end-of-file (-1) or at the next name line ('>')
	lastc = c;
	return RefRecord(off, len, first);
}
static void driver( const string& infile, EList<string>& infiles, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& outfile, bool packed, int reverse) { initializeCntLut(); initializeCntBit(); EList<FileBuf*> is(MISC_CAT); bool bisulfite = false; RefReadInParams refparams(false, reverse, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i].c_str() << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "r"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i].c_str() << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); if(fb->peek() == -1 || fb->eof()) { cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl; continue; } assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } if(is.empty()) { cerr << "Warning: All fasta inputs were empty" << endl; throw 1; } filesWritten.push_back(outfile + ".1." + gfm_ext); filesWritten.push_back(outfile + ".2." + gfm_ext); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList<RefRecord> szs(MISC_CAT); std::pair<size_t, size_t> sztot; { if(verbose) cerr << "Reading reference sizes" << endl; Timer _t(cerr, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { filesWritten.push_back(outfile + ".3." + gfm_ext); filesWritten.push_back(outfile + ".4." 
+ gfm_ext); sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck); } else { sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck); } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct index from input strings and parameters filesWritten.push_back(outfile + ".5." + gfm_ext); filesWritten.push_back(outfile + ".6." + gfm_ext); filesWritten.push_back(outfile + ".7." + gfm_ext); filesWritten.push_back(outfile + ".8." + gfm_ext); TStr s; HGFM<TIndexOffU> hGFM( s, packed, 1, // TODO: maybe not? lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc localOffRate, localFtabChars, nthreads, snpfile, htfile, ssfile, exonfile, svfile, outfile, // basename for .?.ht2 files reverse == 0, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats hGFM.gh().print(cerr); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) hGFM.loadIntoMemory( reverse ? (refparams.reverse == REF_READ_REVERSE) : 0, true, // load SA sample? true, // load ftab? true, // load rstarts? 
false, false); SString<char> s2; hGFM.restore(s2); hGFM.evictFromMemory(); { SString<char> joinedss = GFM<>::join<SString<char> >( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { joinedss.reverse(); } assert_eq(joinedss.length(), s2.length()); assert(sstr_eq(joinedss, s2)); } if(verbose) { if(s2.length() < 1000) { cout << "Passed restore check: " << s2.toZBuf() << endl; } else { cout << "Passed restore check: (" << s2.length() << " chars)" << endl; } } } }
/** * Parse a single quality string from fb and store qualities in r. * Assume the next character obtained via fb.get() is the first * character of the quality string. When returning, the next * character returned by fb.peek() or fb.get() should be the first * character of the following line. */ int parseQuals( Read& r, FileBuf& fb, int firstc, int readLen, int trim3, int trim5, bool intQuals, bool phred64, bool solexa64) { int c = firstc; assert(c != '\n' && c != '\r'); r.qual.clear(); if (intQuals) { while (c != '\r' && c != '\n' && c != -1) { bool neg = false; int num = 0; while(!isspace(c) && !fb.eof()) { if(c == '-') { neg = true; assert_eq(num, 0); } else { if(!isdigit(c)) { char buf[2048]; cerr << "Warning: could not parse quality line:" << endl; fb.getPastNewline(); cerr << fb.copyLastN(buf); buf[2047] = '\0'; cerr << buf; throw 1; } assert(isdigit(c)); num *= 10; num += (c - '0'); } c = fb.get(); } if(neg) num = 0; // Phred-33 ASCII encode it and add it to the back of the // quality string r.qual.append('!' + num); // Skip over next stretch of whitespace while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) { c = fb.get(); } } } else { while (c != '\r' && c != '\n' && c != -1) { r.qual.append(charToPhred33(c, solexa64, phred64)); c = fb.get(); while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) { c = fb.get(); } } } if ((int)r.qual.length() < readLen-1 || ((int)r.qual.length() < readLen && !r.color)) { tooFewQualities(r.name); } r.qual.trimEnd(trim3); if(r.qual.length()-trim5 < r.patFw.length()) { assert(gColor && r.primer != -1); assert_gt(trim5, 0); trim5--; } r.qual.trimBegin(trim5); if(r.qual.length() <= 0) return 0; assert_eq(r.qual.length(), r.patFw.length()); while(fb.peek() == '\n' || fb.peek() == '\r') fb.get(); return (int)r.qual.length(); }
static void driver(const string& infile, vector<string>& infiles, const string& outfile, bool reverse = false) { vector<FileBuf*> is; bool bisulfite = false; RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i] << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "rb"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i] << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. vector<RefRecord> szs; vector<uint32_t> plens; std::pair<size_t, size_t> sztot; { if(verbose) cout << "Reading reference sizes" << endl; Timer _t(cout, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { // For forward reference, dump it to .3.ebwt and .4.ebwt // files string file3 = outfile + ".3." + gEbwt_ext; string file4 = outfile + ".4." + gEbwt_ext; // Open output stream for the '.3.ebwt' file which will // hold the size records. ofstream fout3(file3.c_str(), ios::binary); if(!fout3.good()) { cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl << "Please make sure the directory exists and that permissions allow writing by" << endl << "Bowtie." 
<< endl; throw 1; } BitpairOutFileBuf bpout(file4.c_str()); // Read in the sizes of all the unambiguous stretches of // the genome into a vector of RefRecords. The input // streams are reset once it's done. writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel if(color) { refparams.color = false; // Make sure the .3.ebwt and .4.ebwt files contain // nucleotides; not colors TIndexOff numSeqs = 0; fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs); refparams.color = true; writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records for(size_t i = 0; i < szs.size(); i++) { szs[i].write(fout3, bigEndian); } szs.clear(); plens.clear(); // Now read in the colorspace size records; these are // the ones that were indexed TIndexOff numSeqs2 = 0; sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2); assert_geq(numSeqs, numSeqs2); } else { TIndexOff numSeqs = 0; sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs); writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian); } if(sztot.first == 0) { cerr << "Error: No unambiguous stretches of characters in the input. Aborting..." << endl; throw 1; } assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); bpout.close(); fout3.close(); #ifndef NDEBUG if(sanityCheck) { BitPairReference bpr( outfile, // ebwt basename color, // expect color? true, // sanity check? &infiles,// files to check against NULL, // sequences to check against format == CMDLINE, // whether infiles contains strings true, // load sequence? 
false, // use memory-mapped files false, // use shared memory false, // sweep through memory-mapped memory false, // be talkative false); // be talkative } #endif } else { // Read in the sizes of all the unambiguous stretches of the // genome into a vector of RefRecords TIndexOff numSeqs = 0; sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs); #ifndef NDEBUG if(refparams.color) { refparams.color = false; vector<RefRecord> szs2; vector<uint32_t> plens2; TIndexOff numSeqs2 = 0; fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2); assert_leq(numSeqs, numSeqs2); // One less color than base refparams.color = true; } #endif } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct Ebwt from input strings and parameters Ebwt<TStr> ebwt(refparams.color ? 1 : 0, lineRate, linesPerSide, offRate, // suffix-array sampling rate -1, // ISA sampling rate ftabChars, // number of chars in initial arrow-pair calc nthreads, outfile, // basename for .?.ebwt files !reverse, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes plens, // list of not-all-gap reference sequence lengths (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate -1, // override isaRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. 
To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats ebwt.eh().print(cout); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) ebwt.loadIntoMemory( refparams.color ? 1 : 0, -1, false, false); TStr s2; ebwt.restore(s2); ebwt.evictFromMemory(); { TStr joinedss = Ebwt<TStr>::join( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { reverseInPlace(joinedss); } assert_eq(length(joinedss), length(s2)); assert_eq(joinedss, s2); } if(verbose) { if(length(s2) < 1000) { cout << "Passed restore check: " << s2 << endl; } else { cout << "Passed restore check: (" << length(s2) << " chars)" << endl; } } } }