/**
 * Check that the ebwt array is internally consistent up to (and not
 * including) the given side index by re-counting the chars and
 * comparing against the embedded occ[] arrays.
 */
void Ebwt::sanityCheckUpToSide(int upToSide) const {
	assert(isInMemory());
	uint32_t occ[] = {0, 0, 0, 0};
	ASSERT_ONLY(uint32_t occ_save[] = {0, 0, 0, 0});
	uint32_t cur = 0; // byte pointer
	const EbwtParams& eh = this->_eh;
	bool fw = false;
	while(cur < (upToSide * eh._sideSz)) {
		assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
		for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
			uint8_t by = this->ebwt()[cur + (fw ? i : eh._sideBwtSz-i-1)];
			for(int j = 0; j < 4; j++) {
				// Unpack from lowest to highest bit pair
				int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
				occ[twoBit]++;
			}
			assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
		}
		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
		// Finished forward bucket; check saved [A], [C], [G] and [T]
		// against the uint32_ts encoded here
		ASSERT_ONLY(const uint32_t *u32ebwt = reinterpret_cast<const uint32_t*>(&ebwt()[cur + eh._sideBwtSz]));
		ASSERT_ONLY(uint32_t as = u32ebwt[0]);
		ASSERT_ONLY(uint32_t cs = u32ebwt[1]);
		ASSERT_ONLY(uint32_t gs = u32ebwt[2]);
		ASSERT_ONLY(uint32_t ts = u32ebwt[3]);
		assert(as == occ_save[0] || as == occ_save[0]-1);
		assert_eq(cs, occ_save[1]);
		assert_eq(gs, occ_save[2]);
		assert_eq(ts, occ_save[3]);
#ifndef NDEBUG
		occ_save[0] = occ[0];
		occ_save[1] = occ[1];
		occ_save[2] = occ[2];
		occ_save[3] = occ[3];
#endif
		cur += eh._sideSz;
	}
}
Beispiel #2
0
static void driver(const string& infile,
                   vector<string>& infiles,
                   const string& outfile,
                   bool reverse = false)
{
	vector<FileBuf*> is;
	bool bisulfite = false;
	RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i] << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "rb");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i] << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	vector<RefRecord> szs;
	vector<uint32_t> plens;
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cout << "Reading reference sizes" << endl;
		Timer _t(cout, "  Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			// For forward reference, dump it to .3.ebwt and .4.ebwt
			// files
			string file3 = outfile + ".3." + gEbwt_ext;
			string file4 = outfile + ".4." + gEbwt_ext;
			// Open output stream for the '.3.ebwt' file which will
			// hold the size records.
			ofstream fout3(file3.c_str(), ios::binary);
			if(!fout3.good()) {
				cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl
					 << "Please make sure the directory exists and that permissions allow writing by" << endl
					 << "Bowtie." << endl;
				throw 1;
			}
			BitpairOutFileBuf bpout(file4.c_str());
			// Read in the sizes of all the unambiguous stretches of
			// the genome into a vector of RefRecords.  The input
			// streams are reset once it's done.
			writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel
			if(color) {
				refparams.color = false;
				// Make sure the .3.ebwt and .4.ebwt files contain
				// nucleotides; not colors
				TIndexOff numSeqs = 0;
				fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				refparams.color = true;
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) {
					szs[i].write(fout3, bigEndian);
				}
				szs.clear();
				plens.clear();
				// Now read in the colorspace size records; these are
				// the ones that were indexed
				TIndexOff numSeqs2 = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2);
				assert_geq(numSeqs, numSeqs2);
			} else {
				TIndexOff numSeqs = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
			}
			if(sztot.first == 0) {
				cerr << "Error: No unambiguous stretches of characters in the input.  Aborting..." << endl;
				throw 1;
			}
			assert_gt(sztot.first, 0);
			assert_gt(sztot.second, 0);
			bpout.close();
			fout3.close();
#ifndef NDEBUG
			if(sanityCheck) {
				BitPairReference bpr(
					outfile, // ebwt basename
					color,   // expect color?
					true,    // sanity check?
					&infiles,// files to check against
					NULL,    // sequences to check against
					format == CMDLINE, // whether infiles contains strings
					true,    // load sequence?
					false,   // use memory-mapped files
					false,   // use shared memory
					false,   // sweep through memory-mapped memory
					false,   // be talkative
					false);  // be talkative
			}
#endif
		} else {
			// Read in the sizes of all the unambiguous stretches of the
			// genome into a vector of RefRecords
			TIndexOff numSeqs = 0;
			sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs);
#ifndef NDEBUG
			if(refparams.color) {
				refparams.color = false;
				vector<RefRecord> szs2;
				vector<uint32_t> plens2;
				TIndexOff numSeqs2 = 0;
				fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2);
				assert_leq(numSeqs, numSeqs2);
				// One less color than base
				refparams.color = true;
			}
#endif
		}
	}
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
	// Construct Ebwt from input strings and parameters
	Ebwt<TStr> ebwt(refparams.color ? 1 : 0,
	                lineRate,
	                linesPerSide,
	                offRate,      // suffix-array sampling rate
	                -1,           // ISA sampling rate
	                ftabChars,    // number of chars in initial arrow-pair calc
			nthreads,
	                outfile,      // basename for .?.ebwt files
	                !reverse,     // fw
	                !entireSA,    // useBlockwise
	                bmax,         // block size for blockwise SA builder
	                bmaxMultSqrt, // block size as multiplier of sqrt(len)
	                bmaxDivN,     // block size as divisor of len
	                noDc? 0 : dcv,// difference-cover period
	                is,           // list of input streams
	                szs,          // list of reference sizes
	                plens,        // list of not-all-gap reference sequence lengths
	                (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
	                refparams,    // reference read-in parameters
	                seed,         // pseudo-random number generator seed
	                -1,           // override offRate
	                -1,           // override isaRate
	                verbose,      // be talkative
	                autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
	                sanityCheck); // verify results and internal consistency
	// Note that the Ebwt is *not* resident in memory at this time.  To
	// load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		ebwt.eh().print(cout);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		ebwt.loadIntoMemory(
			refparams.color ? 1 : 0,
			-1,
			false,
			false);
		TStr s2; ebwt.restore(s2);
		ebwt.evictFromMemory();
		{
			TStr joinedss = Ebwt<TStr>::join(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			if(refparams.reverse == REF_READ_REVERSE) {
				reverseInPlace(joinedss);
			}
			assert_eq(length(joinedss), length(s2));
			assert_eq(joinedss, s2);
		}
		if(verbose) {
			if(length(s2) < 1000) {
				cout << "Passed restore check: " << s2 << endl;
			} else {
				cout << "Passed restore check: (" << length(s2) << " chars)" << endl;
			}
		}
	}
}
Beispiel #3
0
static void driver(
	const string& infile,
	EList<string>& infiles,
	const string& outfile,
	bool packed,
	int reverse)
{
	EList<FileBuf*> is(MISC_CAT);
	bool bisulfite = false;
	RefReadInParams refparams(false, reverse, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i].c_str() << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "rb");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i].c_str() << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			if(fb->peek() == -1 || fb->eof()) {
				cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl;
				continue;
			}
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	if(is.empty()) {
		cerr << "Warning: All fasta inputs were empty" << endl;
		throw 1;
	}
	if(!reverse) {
#ifdef BOWTIE_64BIT_INDEX
		cerr << "Building a LARGE index" << endl;
#else
		cerr << "Building a SMALL index" << endl;
#endif
	}
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	EList<RefRecord> szs(MISC_CAT);
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cout << "Reading reference sizes" << endl;
		Timer _t(cout, "  Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			filesWritten.push_back(outfile + ".3." + gEbwt_ext);
			filesWritten.push_back(outfile + ".4." + gEbwt_ext);
			sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck);
		} else {
			sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
		}
	}
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
	// Construct index from input strings and parameters
	filesWritten.push_back(outfile + ".1." + gEbwt_ext);
	filesWritten.push_back(outfile + ".2." + gEbwt_ext);
	Ebwt ebwt(
		TStr(),
		packed,
		0,
		1,  // TODO: maybe not?
		lineRate,
		offRate,      // suffix-array sampling rate
		ftabChars,    // number of chars in initial arrow-pair calc
              nthreads,
		outfile,      // basename for .?.ebwt files
		reverse == 0, // fw
		!entireSA,    // useBlockwise
		bmax,         // block size for blockwise SA builder
		bmaxMultSqrt, // block size as multiplier of sqrt(len)
		bmaxDivN,     // block size as divisor of len
		noDc? 0 : dcv,// difference-cover period
		is,           // list of input streams
		szs,          // list of reference sizes
		(TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
		refparams,    // reference read-in parameters
		seed,         // pseudo-random number generator seed
		-1,           // override offRate
		doSaFile,     // make a file with just the suffix array in it
		doBwtFile,    // make a file with just the BWT string in it
		verbose,      // be talkative
		autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
		sanityCheck); // verify results and internal consistency
	// Note that the Ebwt is *not* resident in memory at this time.  To
	// load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		ebwt.eh().print(cout);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		ebwt.loadIntoMemory(
			0,
			reverse ? (refparams.reverse == REF_READ_REVERSE) : 0,
			true,  // load SA sample?
			true,  // load ftab?
			true,  // load rstarts?
			false,
			false);
		SString<char> s2;
		ebwt.restore(s2);
		ebwt.evictFromMemory();
		{
			SString<char> joinedss = Ebwt::join<SString<char> >(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			if(refparams.reverse == REF_READ_REVERSE) {
				joinedss.reverse();
			}
			assert_eq(joinedss.length(), s2.length());
			assert(sstr_eq(joinedss, s2));
		}
		if(verbose) {
			if(s2.length() < 1000) {
				cout << "Passed restore check: " << s2.toZBuf() << endl;
			} else {
				cout << "Passed restore check: (" << s2.length() << " chars)" << endl;
			}
		}
	}
}