/** * Sanity-check various pieces of the Ebwt */ void Ebwt::sanityCheckAll(int reverse) const { const EbwtParams& eh = this->_eh; assert(isInMemory()); // Check ftab for(uint32_t i = 1; i < eh._ftabLen; i++) { assert_geq(this->ftabHi(i), this->ftabLo(i-1)); assert_geq(this->ftabLo(i), this->ftabHi(i-1)); assert_leq(this->ftabHi(i), eh._bwtLen+1); } assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen); // Check offs int seenLen = (eh._bwtLen + 31) >> 5; uint32_t *seen; try { seen = new uint32_t[seenLen]; // bitvector marking seen offsets } catch(bad_alloc& e) { cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl; throw e; } memset(seen, 0, 4 * seenLen); uint32_t offsLen = eh._offsLen; for(uint32_t i = 0; i < offsLen; i++) { assert_lt(this->offs()[i], eh._bwtLen); int w = this->offs()[i] >> 5; int r = this->offs()[i] & 31; assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before seen[w] |= (1 << r); } delete[] seen; // Check nPat assert_gt(this->_nPat, 0); // Check plen, flen for(uint32_t i = 0; i < this->_nPat; i++) { assert_geq(this->plen()[i], 0); } // Check rstarts if(this->rstarts() != NULL) { for(uint32_t i = 0; i < this->_nFrag-1; i++) { assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]); if(reverse == REF_READ_REVERSE) { assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]); } else { assert(this->rstarts()[(i*3)+1] <= this->rstarts()[((i+1)*3)+1]); } } } // Check ebwt sanityCheckUpToSide(eh._numSides); VMSG_NL("Ebwt::sanityCheck passed"); }
// This way of handling positivity violations is // not continuous in time. One would have to // transition gradually from one level to another. // stringencyLevel should be changed to double // (integer values could serve as points where // complete transition is made). // void PositivityState::increaseStringency() { if(stringencyLevel==AVERAGE) { dprintf("increasing stringency level to POSITIVITY_POINTS"); stringencyLevel = POSITIVITY_POINTS; repeatStageFlag = true; } //else if(stringencyLevel==POSITIVITY_POINTS) //{ // dprintf("increasing stringency level to IMPLICIT_SOURCE"); // stringencyLevel = IMPLICIT_SOURCE; // repeatStepFlag = true; //} //else if(stringencyLevel==IMPLICIT_SOURCE) else if(stringencyLevel==POSITIVITY_POINTS) { // make this number configurable? // could rescale based on comparison // of minval with minval computed with // larger time step. // // want to modify dt by a factor that will // cause minval to be zero. // // time step must be repeated if step length is changed. repeatStepFlag = true; #if 0 if(suggested_dt_changeFactor < .8) dt_changeFactor = .8; else if(suggested_dt_changeFactor > .95) dt_changeFactor = .95; else dt_changeFactor = suggested_dt_changeFactor; #endif //double dt_changeFactor = .95; //cflFactor *= dt_changeFactor; cflFactor -= .0625; //dt *= dt_changeFactor; //dprint(dt); dprintf1("decreased cflFactor to %f",cflFactor); // if cflFactor gets too small something must have gone wrong. // There should exist a positivity-guaranteeing CFL number // that is independent of the solution (assuming that there // is a positivity-guaranteeing CFL number that is independent // of the solution for the HLLE method, which maybe isn't quite // true because we don't have a perfect way to obtain an upper // bound on physical wave speeds for the Riemann problem between // two cell states (Einfeldt's prescription is a not a guarantee // for all strictly hyperbolic systems). 
//assert_gt(cflFactor, .08); assert_gt(cflFactor, 0.); } else { invalid_value_error(stringencyLevel); } }
/**
 * Write the raw bytes of an array to a binary output stream.
 *
 * @param fileStream   stream to append to
 * @param array        first of sizeOfArray contiguous elements of type T
 * @param sizeOfArray  number of elements; asserted to be non-zero
 *
 * The write is issued as a plain statement rather than inside an
 * assertion: if the assert_* macros expand to nothing in release
 * builds (the usual convention -- confirm for this project), wrapping
 * the call in assert_true() would silently drop the write.  The
 * assertions only verify the outcome: the stream did not fail and the
 * put position advanced by exactly the number of bytes requested.
 */
static void writeArrayToBinaryFileStream(std::ofstream& fileStream, const T* array, const size_t sizeOfArray) {
	assert_gt(sizeOfArray, 0);
	const auto bufferSize = static_cast<std::streamsize>(sizeOfArray * sizeof(T));
	const auto previousNumberOfBytes = fileStream.tellp();
	fileStream.write(reinterpret_cast<const char*>(array), bufferSize);
	// Same condition the stream's bool conversion tests.
	assert_true(!fileStream.fail());
	const auto currentNumberOfBytes = fileStream.tellp();
	assert_eq(bufferSize, currentNumberOfBytes - previousNumberOfBytes);
}
/** * Calculate a vector containing the sizes of all of the patterns in * all of the given input files, in order. Returns the total size of * all references combined. Rewinds each istream before returning. */ std::pair<size_t, size_t> fastaRefReadSizes( EList<FileBuf*>& in, EList<RefRecord>& recs, const RefReadInParams& rparms, BitpairOutFileBuf* bpout, int& numSeqs) { uint32_t unambigTot = 0; uint32_t bothTot = 0; RefReadInParams rpcp = rparms; assert_gt(in.size(), 0); // For each input istream for(size_t i = 0; i < in.size(); i++) { bool first = true; assert(!in[i]->eof()); // For each pattern in this istream while(!in[i]->eof()) { RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout); if((unambigTot + rec.len) < unambigTot) { cerr << "Error: Reference sequence has more than 2^32-1 characters! Please divide the" << endl << "reference into batches or chunks of about 3.6 billion characters or less each" << endl << "and index each independently." << endl; throw 1; } // Add the length of this record. if(rec.first) numSeqs++; unambigTot += rec.len; bothTot += rec.len; bothTot += rec.off; first = false; if(rec.len == 0 && rec.off == 0 && !rec.first) continue; recs.push_back(rec); } // Reset the input stream in[i]->reset(); assert(!in[i]->eof()); #ifndef NDEBUG // Check that it's really reset int c = in[i]->get(); assert_eq('>', c); in[i]->reset(); assert(!in[i]->eof()); #endif } assert_geq(bothTot, 0); assert_geq(unambigTot, 0); return make_pair( unambigTot, // total number of unambiguous DNA characters read bothTot); // total number of DNA characters read, incl. ambiguous ones }
// ratio = v0/v1. inline double secant_rule(double x0,double x1,double ratio, int num_roots_in_cluster) { switch(num_roots_in_cluster) { case 1: break; case 2: eprintf("disallowed"); break; case 3: ratio = cbrt(ratio); break; default: assert_eq(1,num_roots_in_cluster%2); ratio = copysign(pow(fabs(ratio),1./num_roots_in_cluster),ratio); } double denominator = (1.-ratio); assert_gt(denominator,0.); return (x0-ratio*x1)/denominator; }
/** * Start the driver. The driver will begin by conducting a best-first, * index-assisted search through the space of possible full and partial * alignments. This search may be followed up with a dynamic programming * extension step, taking a prioritized set of partial SA ranges found * during the search and extending each with DP. The process might also be * iterated, with the search being occasioanally halted so that DPs can be * tried, then restarted, etc. */ int AlignerDriver::go( const Scoring& sc, const Ebwt& ebwtFw, const Ebwt& ebwtBw, const BitPairReference& ref, DescentMetrics& met, WalkMetrics& wlm, PerReadMetrics& prm, RandomSource& rnd, AlnSinkWrap& sink) { if(paired_) { // Paired-end - alternate between advancing dr1_ / dr2_ whenever a // new full alignment is discovered in the one currently being // advanced. Whenever a new full alignment is found, check to see // if it pairs with a previously discovered alignment. bool first1 = rnd.nextBool(); bool first = true; DescentStoppingConditions stopc1 = stop_; DescentStoppingConditions stopc2 = stop_; size_t totszIncr = (stop_.totsz + 7) / 8; stopc1.totsz = totszIncr; stopc2.totsz = totszIncr; while(stopc1.totsz <= stop_.totsz && stopc2.totsz <= stop_.totsz) { if(first && first1 && stopc1.totsz <= stop_.totsz) { dr1_.advance(stop_, sc, ebwtFw, ebwtBw, met, prm); stopc1.totsz += totszIncr; } if(stopc2.totsz <= stop_.totsz) { dr2_.advance(stop_, sc, ebwtFw, ebwtBw, met, prm); stopc2.totsz += totszIncr; } first = false; } } else { // Unpaired size_t iter = 1; while(true) { int ret = dr1_.advance(stop_, sc, ebwtFw, ebwtBw, met, prm); if(ret == DESCENT_DRIVER_ALN) { //cerr << iter << ". DESCENT_DRIVER_ALN" << endl; } else if(ret == DESCENT_DRIVER_MEM) { //cerr << iter << ". 
DESCENT_DRIVER_MEM" << endl; break; } else if(ret == DESCENT_DRIVER_STRATA) { // DESCENT_DRIVER_STRATA is returned by DescentDriver.advance() // when it has finished with a "non-empty" stratum: a stratum // in which at least one alignment was found. Here we report // the alignments in an arbitrary order. AlnRes res; // Initialize alignment selector with the DescentDriver's // alignment sink alsel_.init( dr1_.query(), dr1_.sink(), ebwtFw, ref, rnd, wlm); while(!alsel_.done() && !sink.state().doneWithMate(true)) { res.reset(); bool ret2 = alsel_.next( dr1_, ebwtFw, ref, rnd, res, wlm, prm); if(ret2) { // Got an alignment assert(res.matchesRef( dr1_.query(), ref, tmp_rf_, tmp_rdseq_, tmp_qseq_, raw_refbuf_, raw_destU32_, raw_matches_)); // Get reference interval involved in alignment Interval refival(res.refid(), 0, res.fw(), res.reflen()); assert_gt(res.refExtent(), 0); // Does alignment falls off end of reference? if(gReportOverhangs && !refival.containsIgnoreOrient(res.refival())) { res.clipOutside(true, 0, res.reflen()); if(res.refExtent() == 0) { continue; } } assert(gReportOverhangs || refival.containsIgnoreOrient(res.refival())); // Alignment fell entirely outside the reference? if(!refival.overlapsIgnoreOrient(res.refival())) { continue; // yes, fell outside } // Alignment redundant with one we've seen previously? if(red1_.overlap(res)) { continue; // yes, redundant } red1_.add(res); // so we find subsequent redundancies // Report an unpaired alignment assert(!sink.state().doneWithMate(true)); assert(!sink.maxed()); if(sink.report(0, &res, NULL)) { // Short-circuited because a limit, e.g. -k, -m or // -M, was exceeded return ALDRIVER_POLICY_FULFILLED; } } } dr1_.sink().advanceStratum(); } else if(ret == DESCENT_DRIVER_BWOPS) { //cerr << iter << ". DESCENT_DRIVER_BWOPS" << endl; } else if(ret == DESCENT_DRIVER_DONE) { //cerr << iter << ". DESCENT_DRIVER_DONE" << endl; break; } else { assert(false); } iter++; } } return ALDRIVER_EXHAUSTED_CANDIDATES; }
static void driver( const string& infile, EList<string>& infiles, const string& snpfile, const string& htfile, const string& ssfile, const string& exonfile, const string& svfile, const string& outfile, bool packed, int reverse) { initializeCntLut(); initializeCntBit(); EList<FileBuf*> is(MISC_CAT); bool bisulfite = false; RefReadInParams refparams(false, reverse, nsToAs, bisulfite); assert_gt(infiles.size(), 0); if(format == CMDLINE) { // Adapt sequence strings to stringstreams open for input stringstream *ss = new stringstream(); for(size_t i = 0; i < infiles.size(); i++) { (*ss) << ">" << i << endl << infiles[i].c_str() << endl; } FileBuf *fb = new FileBuf(ss); assert(fb != NULL); assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } else { // Adapt sequence files to ifstreams for(size_t i = 0; i < infiles.size(); i++) { FILE *f = fopen(infiles[i].c_str(), "r"); if (f == NULL) { cerr << "Error: could not open "<< infiles[i].c_str() << endl; throw 1; } FileBuf *fb = new FileBuf(f); assert(fb != NULL); if(fb->peek() == -1 || fb->eof()) { cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl; continue; } assert(!fb->eof()); assert(fb->get() == '>'); ASSERT_ONLY(fb->reset()); assert(!fb->eof()); is.push_back(fb); } } if(is.empty()) { cerr << "Warning: All fasta inputs were empty" << endl; throw 1; } filesWritten.push_back(outfile + ".1." + gfm_ext); filesWritten.push_back(outfile + ".2." + gfm_ext); // Vector for the ordered list of "records" comprising the input // sequences. A record represents a stretch of unambiguous // characters in one of the input sequences. EList<RefRecord> szs(MISC_CAT); std::pair<size_t, size_t> sztot; { if(verbose) cerr << "Reading reference sizes" << endl; Timer _t(cerr, " Time reading reference sizes: ", verbose); if(!reverse && (writeRef || justRef)) { filesWritten.push_back(outfile + ".3." + gfm_ext); filesWritten.push_back(outfile + ".4." 
+ gfm_ext); sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck); } else { sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck); } } if(justRef) return; assert_gt(sztot.first, 0); assert_gt(sztot.second, 0); assert_gt(szs.size(), 0); // Construct index from input strings and parameters filesWritten.push_back(outfile + ".5." + gfm_ext); filesWritten.push_back(outfile + ".6." + gfm_ext); filesWritten.push_back(outfile + ".7." + gfm_ext); filesWritten.push_back(outfile + ".8." + gfm_ext); TStr s; HGFM<TIndexOffU> hGFM( s, packed, 1, // TODO: maybe not? lineRate, offRate, // suffix-array sampling rate ftabChars, // number of chars in initial arrow-pair calc localOffRate, localFtabChars, nthreads, snpfile, htfile, ssfile, exonfile, svfile, outfile, // basename for .?.ht2 files reverse == 0, // fw !entireSA, // useBlockwise bmax, // block size for blockwise SA builder bmaxMultSqrt, // block size as multiplier of sqrt(len) bmaxDivN, // block size as divisor of len noDc? 0 : dcv,// difference-cover period is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed, // pseudo-random number generator seed -1, // override offRate verbose, // be talkative autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically sanityCheck); // verify results and internal consistency // Note that the Ebwt is *not* resident in memory at this time. To // load it into memory, call ebwt.loadIntoMemory() if(verbose) { // Print Ebwt's vital stats hGFM.gh().print(cerr); } if(sanityCheck) { // Try restoring the original string (if there were // multiple texts, what we'll get back is the joined, // padded string, not a list) hGFM.loadIntoMemory( reverse ? (refparams.reverse == REF_READ_REVERSE) : 0, true, // load SA sample? true, // load ftab? true, // load rstarts? 
false, false); SString<char> s2; hGFM.restore(s2); hGFM.evictFromMemory(); { SString<char> joinedss = GFM<>::join<SString<char> >( is, // list of input streams szs, // list of reference sizes (TIndexOffU)sztot.first, // total size of all unambiguous ref chars refparams, // reference read-in parameters seed); // pseudo-random number generator seed if(refparams.reverse == REF_READ_REVERSE) { joinedss.reverse(); } assert_eq(joinedss.length(), s2.length()); assert(sstr_eq(joinedss, s2)); } if(verbose) { if(s2.length() < 1000) { cout << "Passed restore check: " << s2.toZBuf() << endl; } else { cout << "Passed restore check: (" << s2.length() << " chars)" << endl; } } } }
/**
 * Build an Ebwt index from the given reference inputs.
 *
 * Steps, in order:
 *   1. wrap the inputs as FileBufs (one shared stringstream when
 *      format == CMDLINE, otherwise one FILE per entry of infiles);
 *   2. read the reference size records; when building the forward
 *      index and writeRef or justRef is set, also write the
 *      ".3." and ".4." files;
 *   3. return early if justRef is set;
 *   4. construct the Ebwt;
 *   5. when sanityCheck is set, restore the text from the index and
 *      compare it with the joined input.
 *
 * The order of the fastaRefReadSizes() calls and of the
 * refparams.color toggles around them is significant: each call reads
 * the same input streams again.
 *
 * NOTE(review): TStr is used below but not declared in this block;
 * presumably it comes from an enclosing template header -- confirm.
 *
 * @param infile   not referenced by the body
 * @param infiles  FASTA file names, or literal sequences when
 *                 format == CMDLINE; must be non-empty
 * @param outfile  basename for the index files
 * @param reverse  false: forward index; true: refparams is set up with
 *                 reverseType and the Ebwt is built with fw == false
 * @throws int (1) when an input or output file cannot be opened, or the
 *                 input holds no unambiguous characters (the latter is
 *                 only checked on the branch that writes the reference)
 */
static void driver(const string& infile,
                   vector<string>& infiles,
                   const string& outfile,
                   bool reverse = false)
{
	vector<FileBuf*> is;
	bool bisulfite = false;
	RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input:
		// every string becomes one FASTA record named by its index
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i] << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		// The get() inside assert and the reset() inside ASSERT_ONLY
		// belong together: both are meant to exist only in assert builds
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "rb");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i] << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			assert(!fb->eof());
			// See the note on the get()/reset() pair above
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	vector<RefRecord> szs;
	vector<uint32_t> plens;
	// first: total unambiguous characters (see its use below);
	// second: asserted to be positive alongside it
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cout << "Reading reference sizes" << endl;
		Timer _t(cout, " Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			// For forward reference, dump it to .3.ebwt and .4.ebwt
			// files
			string file3 = outfile + ".3." + gEbwt_ext;
			string file4 = outfile + ".4." + gEbwt_ext;
			// Open output stream for the '.3.ebwt' file which will
			// hold the size records.
			ofstream fout3(file3.c_str(), ios::binary);
			if(!fout3.good()) {
				cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl
					 << "Please make sure the directory exists and that permissions allow writing by" << endl
					 << "Bowtie." << endl;
				throw 1;
			}
			BitpairOutFileBuf bpout(file4.c_str());
			// Read in the sizes of all the unambiguous stretches of
			// the genome into a vector of RefRecords.  The input
			// streams are reset once it's done.
			writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel
			if(color) {
				refparams.color = false;
				// Make sure the .3.ebwt and .4.ebwt files contain
				// nucleotides; not colors
				TIndexOff numSeqs = 0;
				fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				refparams.color = true;
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) {
					szs[i].write(fout3, bigEndian);
				}
				// Discard the nucleotide records; they were only needed
				// for the files written above
				szs.clear();
				plens.clear();
				// Now read in the colorspace size records; these are
				// the ones that were indexed
				TIndexOff numSeqs2 = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2);
				assert_geq(numSeqs, numSeqs2);
			} else {
				TIndexOff numSeqs = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
			}
			if(sztot.first == 0) {
				cerr << "Error: No unambiguous stretches of characters in the input. Aborting..." << endl;
				throw 1;
			}
			assert_gt(sztot.first, 0);
			assert_gt(sztot.second, 0);
			bpout.close();
			fout3.close();
#ifndef NDEBUG
			if(sanityCheck) {
				// Constructed only for the checks performed by its
				// constructor; the object itself is not used afterwards
				BitPairReference bpr(
					outfile, // ebwt basename
					color,   // expect color?
					true,    // sanity check?
					&infiles,// files to check against
					NULL,    // sequences to check against
					format == CMDLINE, // whether infiles contains strings
					true,    // load sequence?
					false,   // use memory-mapped files
					false,   // use shared memory
					false,   // sweep through memory-mapped memory
					false,   // be talkative
					false);  // NOTE(review): original comment repeated "be talkative" -- confirm what this flag controls
			}
#endif
		} else {
			// Read in the sizes of all the unambiguous stretches of the
			// genome into a vector of RefRecords
			TIndexOff numSeqs = 0;
			sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs);
#ifndef NDEBUG
			if(refparams.color) {
				// Debug-only cross-check: re-read as nucleotides and
				// compare the sequence counts
				refparams.color = false;
				vector<RefRecord> szs2;
				vector<uint32_t> plens2;
				TIndexOff numSeqs2 = 0;
				fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2);
				assert_leq(numSeqs, numSeqs2); // One less color than base
				refparams.color = true;
			}
#endif
		}
	}
	// Only the reference files were requested
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
	// Construct Ebwt from input strings and parameters
	Ebwt<TStr> ebwt(refparams.color ? 1 : 0,
	                lineRate,
	                linesPerSide,
	                offRate,      // suffix-array sampling rate
	                -1,           // ISA sampling rate
	                ftabChars,    // number of chars in initial arrow-pair calc
	                nthreads,
	                outfile,      // basename for .?.ebwt files
	                !reverse,     // fw
	                !entireSA,    // useBlockwise
	                bmax,         // block size for blockwise SA builder
	                bmaxMultSqrt, // block size as multiplier of sqrt(len)
	                bmaxDivN,     // block size as divisor of len
	                noDc? 0 : dcv,// difference-cover period
	                is,           // list of input streams
	                szs,          // list of reference sizes
	                plens,        // list of not-all-gap reference sequence lengths
	                (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
	                refparams,    // reference read-in parameters
	                seed,         // pseudo-random number generator seed
	                -1,           // override offRate
	                -1,           // override isaRate
	                verbose,      // be talkative
	                autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
	                sanityCheck); // verify results and internal consistency
	// Note that the Ebwt is *not* resident in memory at this time.  To
	// load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		ebwt.eh().print(cout);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		ebwt.loadIntoMemory(
			refparams.color ? 1 : 0,
			-1,
			false,
			false);
		TStr s2;
		ebwt.restore(s2);
		ebwt.evictFromMemory();
		{
			TStr joinedss = Ebwt<TStr>::join(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			// The restored text is compared against the joined input in
			// the same orientation the index was built with
			if(refparams.reverse == REF_READ_REVERSE) {
				reverseInPlace(joinedss);
			}
			assert_eq(length(joinedss), length(s2));
			assert_eq(joinedss, s2);
		}
		if(verbose) {
			// Short texts are echoed in full; longer ones only by length
			if(length(s2) < 1000) {
				cout << "Passed restore check: " << s2 << endl;
			} else {
				cout << "Passed restore check: (" << length(s2) << " chars)" << endl;
			}
		}
	}
}
/**
 * Precondition (asserted): height() > 0.
 *
 * @return true when `data` has no bits set at or above position
 *         1 + shiftAmount(), i.e. shifting those low bits away leaves
 *         zero.  Going by the name, that is presumably the encoding of
 *         a height of exactly one -- confirm against the layout of
 *         `data`.
 */
inline bool nonzeroHeightEqualsOne() const {
	assert_gt(height(), 0);
	return (data >> (1 + shiftAmount())) == 0;
}