예제 #1
0
/**
 * Check that the ebwt array is internally consistent up to (and not
 * including) the given side index by re-counting the chars and
 * comparing against the embedded occ[] arrays.
 */
void Ebwt::sanityCheckUpToSide(int upToSide) const {
	assert(isInMemory());
	uint32_t occ[] = {0, 0, 0, 0};
	ASSERT_ONLY(uint32_t occ_save[] = {0, 0, 0, 0});
	uint32_t cur = 0; // byte pointer
	const EbwtParams& eh = this->_eh;
	bool fw = false;
	while(cur < (upToSide * eh._sideSz)) {
		assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
		for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
			uint8_t by = this->ebwt()[cur + (fw ? i : eh._sideBwtSz-i-1)];
			for(int j = 0; j < 4; j++) {
				// Unpack from lowest to highest bit pair
				int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
				occ[twoBit]++;
			}
			assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
		}
		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
		// Finished forward bucket; check saved [A], [C], [G] and [T]
		// against the uint32_ts encoded here
		ASSERT_ONLY(const uint32_t *u32ebwt = reinterpret_cast<const uint32_t*>(&ebwt()[cur + eh._sideBwtSz]));
		ASSERT_ONLY(uint32_t as = u32ebwt[0]);
		ASSERT_ONLY(uint32_t cs = u32ebwt[1]);
		ASSERT_ONLY(uint32_t gs = u32ebwt[2]);
		ASSERT_ONLY(uint32_t ts = u32ebwt[3]);
		assert(as == occ_save[0] || as == occ_save[0]-1);
		assert_eq(cs, occ_save[1]);
		assert_eq(gs, occ_save[2]);
		assert_eq(ts, occ_save[3]);
#ifndef NDEBUG
		occ_save[0] = occ[0];
		occ_save[1] = occ[1];
		occ_save[2] = occ[2];
		occ_save[3] = occ[3];
#endif
		cur += eh._sideSz;
	}
}
예제 #2
0
static void driver(
                   const string& infile,
                   EList<string>& infiles,
                   const string& snpfile,
                   const string& htfile,
                   const string& ssfile,
                   const string& exonfile,
                   const string& svfile,
                   const string& outfile,
                   bool packed,
                   int reverse)
{
    initializeCntLut();
    initializeCntBit();
	EList<FileBuf*> is(MISC_CAT);
	bool bisulfite = false;
	RefReadInParams refparams(false, reverse, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i].c_str() << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "r");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i].c_str() << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			if(fb->peek() == -1 || fb->eof()) {
				cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl;
				continue;
			}
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	if(is.empty()) {
		cerr << "Warning: All fasta inputs were empty" << endl;
		throw 1;
	}
    filesWritten.push_back(outfile + ".1." + gfm_ext);
    filesWritten.push_back(outfile + ".2." + gfm_ext);
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	EList<RefRecord> szs(MISC_CAT);
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cerr << "Reading reference sizes" << endl;
		Timer _t(cerr, "  Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			filesWritten.push_back(outfile + ".3." + gfm_ext);
			filesWritten.push_back(outfile + ".4." + gfm_ext);
			sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck);
		} else {
			sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
		}
	}
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
    
	// Construct index from input strings and parameters	
    filesWritten.push_back(outfile + ".5." + gfm_ext);
    filesWritten.push_back(outfile + ".6." + gfm_ext);
    filesWritten.push_back(outfile + ".7." + gfm_ext);
    filesWritten.push_back(outfile + ".8." + gfm_ext);
	TStr s;
	HGFM<TIndexOffU> hGFM(
                          s,
                          packed,
                          1,  // TODO: maybe not?
                          lineRate,
                          offRate,      // suffix-array sampling rate
                          ftabChars,    // number of chars in initial arrow-pair calc
                          localOffRate,
                          localFtabChars,
                          nthreads,
                          snpfile,
                          htfile,
                          ssfile,
                          exonfile,
                          svfile,
                          outfile,      // basename for .?.ht2 files
                          reverse == 0, // fw
                          !entireSA,    // useBlockwise
                          bmax,         // block size for blockwise SA builder
                          bmaxMultSqrt, // block size as multiplier of sqrt(len)
                          bmaxDivN,     // block size as divisor of len
                          noDc? 0 : dcv,// difference-cover period
                          is,           // list of input streams
                          szs,          // list of reference sizes
                          (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
                          refparams,    // reference read-in parameters
                          seed,         // pseudo-random number generator seed
                          -1,           // override offRate
                          verbose,      // be talkative
                          autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
                          sanityCheck); // verify results and internal consistency
    // Note that the Ebwt is *not* resident in memory at this time.  To
    // load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		hGFM.gh().print(cerr);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		hGFM.loadIntoMemory(
                            reverse ? (refparams.reverse == REF_READ_REVERSE) : 0,
                            true,  // load SA sample?
                            true,  // load ftab?
                            true,  // load rstarts?
                            false,
                            false);
		SString<char> s2;
		hGFM.restore(s2);
		hGFM.evictFromMemory();
		{
			SString<char> joinedss = GFM<>::join<SString<char> >(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			if(refparams.reverse == REF_READ_REVERSE) {
				joinedss.reverse();
			}
			assert_eq(joinedss.length(), s2.length());
			assert(sstr_eq(joinedss, s2));
		}
		if(verbose) {
			if(s2.length() < 1000) {
				cout << "Passed restore check: " << s2.toZBuf() << endl;
			} else {
				cout << "Passed restore check: (" << s2.length() << " chars)" << endl;
			}
		}
	}
}
예제 #3
0
/**
 * Build an Ebwt index for the given reference inputs and write it under
 * the 'outfile' basename.
 *
 * Steps, in order:
 *  1. Wrap every input in a FileBuf.  When the file-scope 'format' is
 *     CMDLINE the entries of 'infiles' are treated as literal sequences
 *     and packed into one in-memory FASTA stream; otherwise each entry
 *     is opened as a binary file.
 *  2. Scan the inputs with fastaRefReadSizes() to collect the records
 *     of unambiguous stretches.  For a forward build with 'writeRef' or
 *     'justRef' set, this pass also writes the .3 and .4 index files.
 *  3. Unless 'justRef' is set, construct the Ebwt.
 *  4. If 'sanityCheck' is set, reload the index, restore the text and
 *     assert that it equals the joined input.
 *
 * Besides its parameters the function reads file-scope settings that
 * are declared elsewhere (format, color, reverseType, nsToAs, verbose,
 * writeRef, justRef, bigEndian, sanityCheck, lineRate, ...).
 *
 * @param infile  not referenced anywhere in the body
 * @param infiles input file names, or literal sequences when
 *                format == CMDLINE; must be non-empty
 * @param outfile basename of the index files to write
 * @param reverse when true the reference is read with 'reverseType'
 *                and the index is built as the non-forward index
 *
 * Throws the integer 1 when an input cannot be opened, when the .3 file
 * cannot be opened for writing, or when the forward reference pass
 * finds no unambiguous characters.
 */
static void driver(const string& infile,
                   vector<string>& infiles,
                   const string& outfile,
                   bool reverse = false)
{
	vector<FileBuf*> is;
	bool bisulfite = false;
	RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i] << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "rb");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i] << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			// Debug builds only: the input must be non-empty and start
			// with a FASTA header character; reset() undoes the get()
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	vector<RefRecord> szs;
	vector<uint32_t> plens;
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cout << "Reading reference sizes" << endl;
		Timer _t(cout, "  Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			// For forward reference, dump it to .3.ebwt and .4.ebwt
			// files
			string file3 = outfile + ".3." + gEbwt_ext;
			string file4 = outfile + ".4." + gEbwt_ext;
			// Open output stream for the '.3.ebwt' file which will
			// hold the size records.
			ofstream fout3(file3.c_str(), ios::binary);
			if(!fout3.good()) {
				cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl
					 << "Please make sure the directory exists and that permissions allow writing by" << endl
					 << "Bowtie." << endl;
				throw 1;
			}
			BitpairOutFileBuf bpout(file4.c_str());
			// Read in the sizes of all the unambiguous stretches of
			// the genome into a vector of RefRecords.  The input
			// streams are reset once it's done.
			writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel
			if(color) {
				// Two passes over the same streams: first with color
				// switched off (written to the reference files), then
				// with color on (the records that get indexed)
				refparams.color = false;
				// Make sure the .3.ebwt and .4.ebwt files contain
				// nucleotides; not colors
				TIndexOff numSeqs = 0;
				fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				refparams.color = true;
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) {
					szs[i].write(fout3, bigEndian);
				}
				// Discard the nucleotide records before the second pass
				szs.clear();
				plens.clear();
				// Now read in the colorspace size records; these are
				// the ones that were indexed
				TIndexOff numSeqs2 = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2);
				assert_geq(numSeqs, numSeqs2);
			} else {
				TIndexOff numSeqs = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
			}
			if(sztot.first == 0) {
				cerr << "Error: No unambiguous stretches of characters in the input.  Aborting..." << endl;
				throw 1;
			}
			assert_gt(sztot.first, 0);
			assert_gt(sztot.second, 0);
			bpout.close();
			fout3.close();
#ifndef NDEBUG
			// Debug builds only: constructing a BitPairReference with
			// its sanity-check flag set re-reads the files just written
			// and checks them against the original inputs
			if(sanityCheck) {
				BitPairReference bpr(
					outfile, // ebwt basename
					color,   // expect color?
					true,    // sanity check?
					&infiles,// files to check against
					NULL,    // sequences to check against
					format == CMDLINE, // whether infiles contains strings
					true,    // load sequence?
					false,   // use memory-mapped files
					false,   // use shared memory
					false,   // sweep through memory-mapped memory
					false,   // be talkative
					false);  // be talkative
			}
#endif
		} else {
			// Read in the sizes of all the unambiguous stretches of the
			// genome into a vector of RefRecords
			TIndexOff numSeqs = 0;
			sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs);
#ifndef NDEBUG
			// Debug builds only: repeat the scan with color switched
			// off and compare the sequence counts of the two passes
			if(refparams.color) {
				refparams.color = false;
				vector<RefRecord> szs2;
				vector<uint32_t> plens2;
				TIndexOff numSeqs2 = 0;
				fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2);
				assert_leq(numSeqs, numSeqs2);
				// One less color than base
				refparams.color = true;
			}
#endif
		}
	}
	// Caller only wanted the reference files
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
	// Construct Ebwt from input strings and parameters
	Ebwt<TStr> ebwt(refparams.color ? 1 : 0,
	                lineRate,
	                linesPerSide,
	                offRate,      // suffix-array sampling rate
	                -1,           // ISA sampling rate
	                ftabChars,    // number of chars in initial arrow-pair calc
			nthreads,
	                outfile,      // basename for .?.ebwt files
	                !reverse,     // fw
	                !entireSA,    // useBlockwise
	                bmax,         // block size for blockwise SA builder
	                bmaxMultSqrt, // block size as multiplier of sqrt(len)
	                bmaxDivN,     // block size as divisor of len
	                noDc? 0 : dcv,// difference-cover period
	                is,           // list of input streams
	                szs,          // list of reference sizes
	                plens,        // list of not-all-gap reference sequence lengths
	                (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
	                refparams,    // reference read-in parameters
	                seed,         // pseudo-random number generator seed
	                -1,           // override offRate
	                -1,           // override isaRate
	                verbose,      // be talkative
	                autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
	                sanityCheck); // verify results and internal consistency
	// Note that the Ebwt is *not* resident in memory at this time.  To
	// load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		ebwt.eh().print(cout);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		ebwt.loadIntoMemory(
			refparams.color ? 1 : 0,
			-1,
			false,
			false);
		TStr s2; ebwt.restore(s2);
		ebwt.evictFromMemory();
		{
			// Rebuild the expected text straight from the inputs and
			// compare it with what the index restored
			TStr joinedss = Ebwt<TStr>::join(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			if(refparams.reverse == REF_READ_REVERSE) {
				reverseInPlace(joinedss);
			}
			assert_eq(length(joinedss), length(s2));
			assert_eq(joinedss, s2);
		}
		if(verbose) {
			if(length(s2) < 1000) {
				cout << "Passed restore check: " << s2 << endl;
			} else {
				cout << "Passed restore check: (" << length(s2) << " chars)" << endl;
			}
		}
	}
}
예제 #4
0
SPECIALIZE_MIN_MAX_ENCODING_FOR_TYPE(uint8_t) SPECIALIZE_MIN_MAX_ENCODING_FOR_TYPE(int16_t) SPECIALIZE_MIN_MAX_ENCODING_FOR_TYPE(int32_t) SPECIALIZE_MIN_MAX_ENCODING_FOR_TYPE(float) SPECIALIZE_MIN_MAX_ENCODING_FOR_TYPE(double) template <typename T, bool ___2025, int base> ___372 ___4563(FileWriterInterface& file, char const*          ___972, ___81           ___1251, size_t               ___2797, T const*             ___4299, size_t               ___4334  ) { ___372 ___2039 = ___4226; REQUIRE(file.___2041()); REQUIRE(VALID_DESCRIPTION(___972)); REQUIRE("extraID could have any value. NO_EXTRA_ID show only the description"); REQUIRE(___2797>0); REQUIRE(VALID_REF(___4299)); if ( file.___2002() ) { ASSERT_ONLY(___1393 beginningLocation = file.fileLoc()); std::string ___1418 = ___972; if ( ___1251 != ___2745 ) ___1418 += ___4187(___1251+1);
 #if defined ASCII_ANNOTATE_TYPES
___1418.append(1, ' ').append(AsciiTypeString<T>::typeString);
 #endif
if ( ___2797 == 1 ) file.fprintf("%*s  ", -___206, ___1418.c_str()); else file.fprintf("%*s\r\n", -___206, ___1418.c_str()); const int buffSize = 100; char buff[buffSize]; std::string outputBuffer; for ( size_t pos = 1; pos <= ___2797; ++pos ) { ___2039 = ___2039 && encodeAsciiValue<T, ___2025, base>(buff, buffSize, ___4299[pos-1]); outputBuffer.append(buff); if( (pos % ___4334) == 0 || (pos == ___2797 ) ) outputBuffer.append("\r\n"); else outputBuffer.append("  "); } file.fwrite(outputBuffer.c_str(), sizeof(char), outputBuffer.size()); ASSERT_ONLY(___1393 endingLocation = file.fileLoc()); ASSERT_ONLY(___1393 outputSize = (___1393)___206 + 2 + outputBuffer.size()); ___478(endingLocation - beginningLocation == outputSize); } else { file.fwrite(___4299, sizeof(T), ___2797); } ENSURE(VALID_BOOLEAN(___2039)); return ___2039; } template <typename OutType> ___372 ___4528( FileWriterInterface&        file, char const*                 ___972, ___81                  ___1251, size_t                      ___2797, ___2479 const*               ___4299, size_t                      ___4334  ) { ___2240<std::pair<OutType, OutType> > outputArray; ___372 ___2039 = outputArray.alloc(___2797); if (___2039) { for (size_t i = 0; i < ___2797; ++i) { outputArray[i].first = static_cast<OutType>(___4299[i].minValue()); outputArray[i].second = static_cast<OutType>(___4299[i].maxValue()); } ___2039 = ___4563<std::pair<OutType, OutType>, false, 0>(file, ___972, ___1251, ___2797, &outputArray[0], ___4334); } return ___2039; } template <typename T, bool ___2025> uint64_t arrayValueSizeInFile(bool ___2002) { if (___2002) return ___199<T, ___2025>::size + ASCII_SPACING_LEN; return sizeof(T); } template <typename T, bool ___2025> uint64_t arraySizeInFile(size_t ___2797, bool ___2002) { uint64_t charsPerNumber = arrayValueSizeInFile<T, ___2025>(___2002); ___478(charsPerNumber > 0); uint64_t ___3358 = static_cast<uint64_t>(___2797) * charsPerNumber; if (___2002) ___3358 += 
static_cast<uint64_t>(___206) + ASCII_SPACING_LEN; return ___3358; } template <typename T, bool ___2025> uint64_t valueSizeInFile(bool ___2002) { return arraySizeInFile<T, ___2025>(1, ___2002); }