/**
 * Sanity-check various pieces of the Ebwt
 */
void Ebwt::sanityCheckAll(int reverse) const {
    const EbwtParams& eh = this->_eh;
    assert(isInMemory());
    // Check ftab
    for(uint32_t i = 1; i < eh._ftabLen; i++) {
        assert_geq(this->ftabHi(i), this->ftabLo(i-1));
        assert_geq(this->ftabLo(i), this->ftabHi(i-1));
        assert_leq(this->ftabHi(i), eh._bwtLen+1);
    }
    assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);

    // Check offs
    int seenLen = (eh._bwtLen + 31) >> 5;
    uint32_t *seen;
    try {
        seen = new uint32_t[seenLen]; // bitvector marking seen offsets
    } catch(bad_alloc& e) {
        cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
        throw e;
    }
    memset(seen, 0, 4 * seenLen);
    uint32_t offsLen = eh._offsLen;
    for(uint32_t i = 0; i < offsLen; i++) {
        assert_lt(this->offs()[i], eh._bwtLen);
        int w = this->offs()[i] >> 5;
        int r = this->offs()[i] & 31;
        assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
        seen[w] |= (1 << r);
    }
    delete[] seen;

    // Check nPat
    assert_gt(this->_nPat, 0);

    // Check plen, flen
    for(uint32_t i = 0; i < this->_nPat; i++) {
        assert_geq(this->plen()[i], 0);
    }

    // Check rstarts
    if(this->rstarts() != NULL) {
        for(uint32_t i = 0; i < this->_nFrag-1; i++) {
            assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]);
            if(reverse == REF_READ_REVERSE) {
                assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]);
            } else {
                assert(this->rstarts()[(i*3)+1] <= this->rstarts()[((i+1)*3)+1]);
            }
        }
    }

    // Check ebwt
    sanityCheckUpToSide(eh._numSides);
    VMSG_NL("Ebwt::sanityCheck passed");
}
/**
 * Calculate a vector containing the sizes of all of the patterns in
 * all of the given input files, in order.  Returns a pair giving the
 * total number of unambiguous characters and the total number of
 * characters (ambiguous and unambiguous) across all references.
 * Rewinds each input stream before returning.
 */
std::pair<size_t, size_t> fastaRefReadSizes(
    EList<FileBuf*>& in,
    EList<RefRecord>& recs,
    const RefReadInParams& rparms,
    BitpairOutFileBuf* bpout,
    int& numSeqs)
{
    uint32_t unambigTot = 0;
    uint32_t bothTot = 0;
    RefReadInParams rpcp = rparms;
    assert_gt(in.size(), 0);
    // For each input istream
    for(size_t i = 0; i < in.size(); i++) {
        bool first = true;
        assert(!in[i]->eof());
        // For each pattern in this istream
        while(!in[i]->eof()) {
            RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
            if((unambigTot + rec.len) < unambigTot) {
                cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please divide the" << endl
                     << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
                     << "and index each independently." << endl;
                throw 1;
            }
            // Add the length of this record.
            if(rec.first) numSeqs++;
            unambigTot += rec.len;
            bothTot += rec.len;
            bothTot += rec.off;
            first = false;
            if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
            recs.push_back(rec);
        }
        // Reset the input stream
        in[i]->reset();
        assert(!in[i]->eof());
#ifndef NDEBUG
        // Check that it's really reset
        int c = in[i]->get();
        assert_eq('>', c);
        in[i]->reset();
        assert(!in[i]->eof());
#endif
    }
    assert_geq(bothTot, 0);
    assert_geq(unambigTot, 0);
    return make_pair(
        unambigTot, // total number of unambiguous DNA characters read
        bothTot);   // total number of DNA characters read, incl. ambiguous ones
}
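// A minimal usage sketch for fastaRefReadSizes(), kept out of compilation.
// The input list `in` is assumed to already hold open FASTA inputs; the
// RefReadInParams constructor arguments mirror the (color, reverse, nsToAs,
// bisulfite) usage in driver() below and are otherwise an assumption.
#if 0
static void exampleReadSizes(EList<FileBuf*>& in) {
    EList<RefRecord> recs;  // receives one record per unambiguous stretch
    RefReadInParams rparms(false, REF_READ_FORWARD, false, false);
    int numSeqs = 0;
    std::pair<size_t, size_t> tot =
        fastaRefReadSizes(in, recs, rparms, NULL, numSeqs);
    // tot.first  = unambiguous characters only
    // tot.second = all characters, including ambiguous (N) stretches
    assert_leq(tot.first, tot.second);
}
#endif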
template <int partition = 0>
inline ftype pullCapacity(int dim) const {
    assert_leq(alpha[dim], -edges[dim]);
    assert_geq(alpha[dim],  edges[dim]);
    // Actually the amount that can be pushed to the other node, so in
    // the 1-partition it's -edges[dim] + alpha[dim], and in the
    // 0-partition it's -edges[dim] - alpha[dim].
    return -edges[dim] + ((partition == 1) ? 1 : -1) * alpha[dim];
}
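// A worked example of the sign convention above, using made-up values
// (purely illustrative, not taken from this code):
//   edges[dim] = -5  -> the asserts require alpha[dim] in [-5, 5]
//   alpha[dim] =  2
//   partition == 1 :  pullCapacity = -(-5) + 2 = 7
//   partition == 0 :  pullCapacity = -(-5) - 2 = 3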
inline double TimeTracker::elapsedSeconds() const {
    if(started) {
        clock_t cur_time_diff = paused ? 0 : clock() - start_time;
        double r = double(cur_time_diff + offset) / CLOCKS_PER_SEC;
        assert_geq(r, 0);
        return r;
    } else {
        return 0;
    }
}
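// Illustration of the arithmetic above with assumed member values (not taken
// from this file): if offset == 2 * CLOCKS_PER_SEC has been accumulated and
// the tracker is started and not paused, with clock() - start_time at half a
// second of ticks, elapsedSeconds() returns roughly 2.5.  While paused, only
// the accumulated offset contributes.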
inline ftype pushCapacity(const node_cptr& src, const node_cptr& dest,
                          int dim, int dir) const {
    assert(dest == lattice.neighbor(src, dim, dir));
    ftype v;
    if(dir == 1) {
        v = src->template pushCapacity<partition>(dim);
    } else {
        v = dest->template pullCapacity<partition>(dim);
    }
    assert_geq(v, 0);
    return v;
}
inline void pushExcess(const node_ptr& src, const node_ptr& dest,
                       const DirDim& dd, ftype amount) {
    int direction = dd.direction();
    int dim = dd.dimension();

#ifndef NDEBUG
    assert(direction == -1 || direction == 1);
    assert(dest == lattice.neighbor(src, dim, direction));
    assert_equal(src->on(), partition);
    assert_equal(dest->on(), partition);
    assert_geq(dim, 0);
    assert_leq(amount, pushCapacity<partition>(src, dest, dd));
    assert_leq(-amount, pushCapacity<partition>(dest, src, dd.reversed()));
    assert(direction == 1 || direction == -1);
    _debugVerifyNodeReduction(src);
    _debugVerifyNodeReduction(dest);
    ftype src_pr_excess = src->template excess<partition>();
    ftype dest_pr_excess = dest->template excess<partition>();
#endif

    src->reduction  -= ((partition == 1) ? 1 : -1) * amount;
    dest->reduction += ((partition == 1) ? 1 : -1) * amount;
    ((direction > 0) ? src : dest)->alpha[dim] +=
        direction * ((partition == 1) ? 1 : -1) * amount;

#ifndef NDEBUG
    _debugVerifyNodeReduction(src);
    _debugVerifyNodeReduction(dest);
    ftype src_af_excess = src->template excess<partition>();
    ftype dest_af_excess = dest->template excess<partition>();
    assert_equal(src_pr_excess - amount, src_af_excess);
    assert_equal(dest_pr_excess + amount, dest_af_excess);
#endif
}
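// Bookkeeping sketch for pushExcess(), with assumed values (illustrative only,
// for partition == 1 and direction == +1): pushing amount = 3 subtracts 3 from
// src->reduction, adds 3 to dest->reduction, and records the flow on the
// forward edge by adding 3 to src->alpha[dim].  The post-conditions asserted
// above are exactly that src's excess drops by `amount` and dest's excess
// rises by `amount`, so the total excess across the pair is unchanged.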
inline ftype pushCapacity(const node_cptr& src, const node_cptr& dest,
                          const DirDim& dd) const {
    assert(dest == lattice.neighbor(src, dd));
    ftype v;
    if(dd.direction() == 1) {
        v = src->template pushCapacity<partition>(dd.dimension());
    } else {
        v = dest->template pullCapacity<partition>(dd.dimension());
    }
    assert_geq(v, 0);
    // cout << "Push capacity from node " << (src - lattice.begin())
    //      << " to node " << (dest - lattice.begin()) << " is "
    //      << v << ". " << endl;
    return v;
}
static void driver(const string& infile,
                   vector<string>& infiles,
                   const string& outfile,
                   bool reverse = false)
{
    vector<FileBuf*> is;
    bool bisulfite = false;
    RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite);
    assert_gt(infiles.size(), 0);
    if(format == CMDLINE) {
        // Adapt sequence strings to stringstreams open for input
        stringstream *ss = new stringstream();
        for(size_t i = 0; i < infiles.size(); i++) {
            (*ss) << ">" << i << endl << infiles[i] << endl;
        }
        FileBuf *fb = new FileBuf(ss);
        assert(fb != NULL);
        assert(!fb->eof());
        assert(fb->get() == '>');
        ASSERT_ONLY(fb->reset());
        assert(!fb->eof());
        is.push_back(fb);
    } else {
        // Adapt sequence files to FileBufs
        for(size_t i = 0; i < infiles.size(); i++) {
            FILE *f = fopen(infiles[i].c_str(), "rb");
            if(f == NULL) {
                cerr << "Error: could not open " << infiles[i] << endl;
                throw 1;
            }
            FileBuf *fb = new FileBuf(f);
            assert(fb != NULL);
            assert(!fb->eof());
            assert(fb->get() == '>');
            ASSERT_ONLY(fb->reset());
            assert(!fb->eof());
            is.push_back(fb);
        }
    }
    // Vector for the ordered list of "records" comprising the input
    // sequences.  A record represents a stretch of unambiguous
    // characters in one of the input sequences.
    vector<RefRecord> szs;
    vector<uint32_t> plens;
    std::pair<size_t, size_t> sztot;
    {
        if(verbose) cout << "Reading reference sizes" << endl;
        Timer _t(cout, "  Time reading reference sizes: ", verbose);
        if(!reverse && (writeRef || justRef)) {
            // For forward reference, dump it to .3.ebwt and .4.ebwt files
            string file3 = outfile + ".3." + gEbwt_ext;
            string file4 = outfile + ".4." + gEbwt_ext;
            // Open output stream for the '.3.ebwt' file which will
            // hold the size records.
            ofstream fout3(file3.c_str(), ios::binary);
            if(!fout3.good()) {
                cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl
                     << "Please make sure the directory exists and that permissions allow writing by" << endl
                     << "Bowtie." << endl;
                throw 1;
            }
            BitpairOutFileBuf bpout(file4.c_str());
            // Read in the sizes of all the unambiguous stretches of
            // the genome into a vector of RefRecords.  The input
            // streams are reset once it's done.
            writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel
            if(color) {
                refparams.color = false;
                // Make sure the .3.ebwt and .4.ebwt files contain
                // nucleotides; not colors
                TIndexOff numSeqs = 0;
                fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
                refparams.color = true;
                writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
                for(size_t i = 0; i < szs.size(); i++) {
                    szs[i].write(fout3, bigEndian);
                }
                szs.clear();
                plens.clear();
                // Now read in the colorspace size records; these are
                // the ones that were indexed
                TIndexOff numSeqs2 = 0;
                sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2);
                assert_geq(numSeqs, numSeqs2);
            } else {
                TIndexOff numSeqs = 0;
                sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
                writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
                for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
            }
            if(sztot.first == 0) {
                cerr << "Error: No unambiguous stretches of characters in the input.  Aborting..." << endl;
                throw 1;
            }
            assert_gt(sztot.first, 0);
            assert_gt(sztot.second, 0);
            bpout.close();
            fout3.close();
#ifndef NDEBUG
            if(sanityCheck) {
                BitPairReference bpr(
                    outfile,            // ebwt basename
                    color,              // expect color?
                    true,               // sanity check?
                    &infiles,           // files to check against
                    NULL,               // sequences to check against
                    format == CMDLINE,  // whether infiles contains strings
                    true,               // load sequence?
                    false,              // use memory-mapped files
                    false,              // use shared memory
                    false,              // sweep through memory-mapped memory
                    false,              // be talkative
                    false);             // be talkative
            }
#endif
        } else {
            // Read in the sizes of all the unambiguous stretches of the
            // genome into a vector of RefRecords
            TIndexOff numSeqs = 0;
            sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs);
#ifndef NDEBUG
            if(refparams.color) {
                refparams.color = false;
                vector<RefRecord> szs2;
                vector<uint32_t> plens2;
                TIndexOff numSeqs2 = 0;
                fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2);
                assert_leq(numSeqs, numSeqs2); // One less color than base
                refparams.color = true;
            }
#endif
        }
    }
    if(justRef) return;
    assert_gt(sztot.first, 0);
    assert_gt(sztot.second, 0);
    assert_gt(szs.size(), 0);
    // Construct Ebwt from input strings and parameters
    Ebwt<TStr> ebwt(
        refparams.color ? 1 : 0,
        lineRate,
        linesPerSide,
        offRate,        // suffix-array sampling rate
        -1,             // ISA sampling rate
        ftabChars,      // number of chars in initial arrow-pair calc
        nthreads,
        outfile,        // basename for .?.ebwt files
        !reverse,       // fw
        !entireSA,      // useBlockwise
        bmax,           // block size for blockwise SA builder
        bmaxMultSqrt,   // block size as multiplier of sqrt(len)
        bmaxDivN,       // block size as divisor of len
        noDc ? 0 : dcv, // difference-cover period
        is,             // list of input streams
        szs,            // list of reference sizes
        plens,          // list of not-all-gap reference sequence lengths
        (TIndexOffU)sztot.first, // total size of all unambiguous ref chars
        refparams,      // reference read-in parameters
        seed,           // pseudo-random number generator seed
        -1,             // override offRate
        -1,             // override isaRate
        verbose,        // be talkative
        autoMem,        // pass exceptions up to the toplevel so that we can
                        // adjust memory settings automatically
        sanityCheck);   // verify results and internal consistency
    // Note that the Ebwt is *not* resident in memory at this time.  To
    // load it into memory, call ebwt.loadIntoMemory()
    if(verbose) {
        // Print Ebwt's vital stats
        ebwt.eh().print(cout);
    }
    if(sanityCheck) {
        // Try restoring the original string (if there were
        // multiple texts, what we'll get back is the joined,
        // padded string, not a list)
        ebwt.loadIntoMemory(
            refparams.color ? 1 : 0,
            -1,
            false,
            false);
        TStr s2;
        ebwt.restore(s2);
        ebwt.evictFromMemory();
        {
            TStr joinedss = Ebwt<TStr>::join(
                is,          // list of input streams
                szs,         // list of reference sizes
                (TIndexOffU)sztot.first, // total size of all unambiguous ref chars
                refparams,   // reference read-in parameters
                seed);       // pseudo-random number generator seed
            if(refparams.reverse == REF_READ_REVERSE) {
                reverseInPlace(joinedss);
            }
            assert_eq(length(joinedss), length(s2));
            assert_eq(joinedss, s2);
        }
        if(verbose) {
            if(length(s2) < 1000) {
                cout << "Passed restore check: " << s2 << endl;
            } else {
                cout << "Passed restore check: (" << length(s2) << " chars)" << endl;
            }
        }
    }
}