Ejemplo n.º 1
0
/**
 * Sanity-check various pieces of the Ebwt
 */
void Ebwt::sanityCheckAll(int reverse) const {
	const EbwtParams& eh = this->_eh;
	assert(isInMemory());
	// Check ftab
	for(uint32_t i = 1; i < eh._ftabLen; i++) {
		assert_geq(this->ftabHi(i), this->ftabLo(i-1));
		assert_geq(this->ftabLo(i), this->ftabHi(i-1));
		assert_leq(this->ftabHi(i), eh._bwtLen+1);
	}
	assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);
	
	// Check offs
	int seenLen = (eh._bwtLen + 31) >> 5;
	uint32_t *seen;
	try {
		seen = new uint32_t[seenLen]; // bitvector marking seen offsets
	} catch(bad_alloc& e) {
		cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
		throw e;
	}
	memset(seen, 0, 4 * seenLen);
	uint32_t offsLen = eh._offsLen;
	for(uint32_t i = 0; i < offsLen; i++) {
		assert_lt(this->offs()[i], eh._bwtLen);
		int w = this->offs()[i] >> 5;
		int r = this->offs()[i] & 31;
		assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
		seen[w] |= (1 << r);
	}
	delete[] seen;
	
	// Check nPat
	assert_gt(this->_nPat, 0);
	
	// Check plen, flen
	for(uint32_t i = 0; i < this->_nPat; i++) {
		assert_geq(this->plen()[i], 0);
	}
	
	// Check rstarts
	if(this->rstarts() != NULL) {
		for(uint32_t i = 0; i < this->_nFrag-1; i++) {
			assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]);
			if(reverse == REF_READ_REVERSE) {
				assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]);
			} else {
				assert(this->rstarts()[(i*3)+1] <= this->rstarts()[((i+1)*3)+1]);
			}
		}
	}
	
	// Check ebwt
	sanityCheckUpToSide(eh._numSides);
	VMSG_NL("Ebwt::sanityCheck passed");
}
Ejemplo n.º 2
0
/**
 * Calculate a vector containing the sizes of all of the patterns in
 * all of the given input files, in order.  Returns the total size of
 * all references combined.  Rewinds each istream before returning.
 */
std::pair<size_t, size_t>
fastaRefReadSizes(
	EList<FileBuf*>& in,
	EList<RefRecord>& recs,
	const RefReadInParams& rparms,
	BitpairOutFileBuf* bpout,
	int& numSeqs)
{
	uint32_t unambigTot = 0;
	uint32_t bothTot = 0;
	RefReadInParams rpcp = rparms;
	assert_gt(in.size(), 0);
	// For each input istream
	for(size_t i = 0; i < in.size(); i++) {
		bool first = true;
		assert(!in[i]->eof());
		// For each pattern in this istream
		while(!in[i]->eof()) {
			RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
			if((unambigTot + rec.len) < unambigTot) {
				cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please divide the" << endl
				     << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
				     << "and index each independently." << endl;
				throw 1;
			}
			// Add the length of this record.
			if(rec.first) numSeqs++;
			unambigTot += rec.len;
			bothTot += rec.len;
			bothTot += rec.off;
			first = false;
			if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
			recs.push_back(rec);
		}
		// Reset the input stream
		in[i]->reset();
		assert(!in[i]->eof());
#ifndef NDEBUG
		// Check that it's really reset
		int c = in[i]->get();
		assert_eq('>', c);
		in[i]->reset();
		assert(!in[i]->eof());
#endif
	}
	assert_geq(bothTot, 0);
	assert_geq(unambigTot, 0);
	return make_pair(
		unambigTot, // total number of unambiguous DNA characters read
		bothTot); // total number of DNA characters read, incl. ambiguous ones
}
Ejemplo n.º 3
0
  template <int partition=0> inline ftype pullCapacity(int dim) const {
    assert_leq(alpha[dim], -edges[dim]);
    assert_geq(alpha[dim],  edges[dim]);

    // Actually the amount that can be pushed to the other node, so in
    // the 1-partition it's 

    return -edges[dim] + ( (partition == 1) ? 1 : -1) * alpha[dim];
  }
Ejemplo n.º 4
0
inline double TimeTracker::elapsedSeconds() const
{
  if(started)
    {
      clock_t cur_time_diff = paused ? 0 : clock() - start_time;
      double r = double(cur_time_diff + offset) / CLOCKS_PER_SEC;
      assert_geq(r, 0);
      return r;
    }
  else
    {
      return 0;
    }
}
Ejemplo n.º 5
0
inline ftype pushCapacity(const node_cptr& src, const node_cptr& dest, int dim, int dir) const {
  
  assert(dest == lattice.neighbor(src, dim, dir));

  ftype v;

  if(dir == 1) {
    v = src->template pushCapacity<partition>(dim);
  } else {
    v = dest->template pullCapacity<partition>(dim);
  }
  
  assert_geq(v, 0);
  
  return v;
}
Ejemplo n.º 6
0
inline void pushExcess(const node_ptr& src, const node_ptr& dest, const DirDim& dd, ftype amount) {

  int direction = dd.direction();
  int dim = dd.dimension();

#ifndef NDEBUG

  assert(direction == -1 || direction == 1);
  assert(dest == lattice.neighbor(src, dim, direction));
  assert_equal(src->on(), partition);
  assert_equal(dest->on(), partition);
  assert_geq(dim, 0);
  assert_leq(amount, pushCapacity<partition>(src, dest, dd));
  assert_leq(-amount, pushCapacity<partition>(dest, src, dd.reversed()));
  assert(direction == 1 || direction == -1);
  
  _debugVerifyNodeReduction(src);
  _debugVerifyNodeReduction(dest);

  ftype src_pr_excess = src->template excess<partition>();
  ftype dest_pr_excess = dest->template excess<partition>();

#endif

  src->reduction  -= ((partition == 1) ? 1 : -1) * amount;
  dest->reduction += ((partition == 1) ? 1 : -1) * amount;

  ((direction > 0) ? src : dest)
    ->alpha[dim] += direction * ((partition == 1) ? 1 : -1) * amount;
  
#ifndef NDEBUG
  _debugVerifyNodeReduction(src);
  _debugVerifyNodeReduction(dest);

  ftype src_af_excess = src->template excess<partition>();
  ftype dest_af_excess = dest->template excess<partition>();
  
  assert_equal(src_pr_excess - amount, src_af_excess);
  assert_equal(dest_pr_excess + amount, dest_af_excess);
#endif
  
}
Ejemplo n.º 7
0
inline ftype pushCapacity(const node_cptr& src, const node_cptr& dest, const DirDim& dd) const {
  
  assert(dest == lattice.neighbor(src, dd));

  ftype v;

  if(dd.direction() == 1) {
    v = src->template pushCapacity<partition>(dd.dimension());
  } else {
    v = dest->template pullCapacity<partition>(dd.dimension());
  }
  
  assert_geq(v, 0);
  
  // cout << "Push capacity from node " << (src - lattice.begin()) 
  //      << " to node " << (dest - lattice.begin()) << " is " 
  //      << v << ". " << endl;

  return v;
}
Ejemplo n.º 8
0
static void driver(const string& infile,
                   vector<string>& infiles,
                   const string& outfile,
                   bool reverse = false)
{
	vector<FileBuf*> is;
	bool bisulfite = false;
	RefReadInParams refparams(color, reverse ? reverseType : REF_READ_FORWARD, nsToAs, bisulfite);
	assert_gt(infiles.size(), 0);
	if(format == CMDLINE) {
		// Adapt sequence strings to stringstreams open for input
		stringstream *ss = new stringstream();
		for(size_t i = 0; i < infiles.size(); i++) {
			(*ss) << ">" << i << endl << infiles[i] << endl;
		}
		FileBuf *fb = new FileBuf(ss);
		assert(fb != NULL);
		assert(!fb->eof());
		assert(fb->get() == '>');
		ASSERT_ONLY(fb->reset());
		assert(!fb->eof());
		is.push_back(fb);
	} else {
		// Adapt sequence files to ifstreams
		for(size_t i = 0; i < infiles.size(); i++) {
			FILE *f = fopen(infiles[i].c_str(), "rb");
			if (f == NULL) {
				cerr << "Error: could not open "<< infiles[i] << endl;
				throw 1;
			}
			FileBuf *fb = new FileBuf(f);
			assert(fb != NULL);
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb);
		}
	}
	// Vector for the ordered list of "records" comprising the input
	// sequences.  A record represents a stretch of unambiguous
	// characters in one of the input sequences.
	vector<RefRecord> szs;
	vector<uint32_t> plens;
	std::pair<size_t, size_t> sztot;
	{
		if(verbose) cout << "Reading reference sizes" << endl;
		Timer _t(cout, "  Time reading reference sizes: ", verbose);
		if(!reverse && (writeRef || justRef)) {
			// For forward reference, dump it to .3.ebwt and .4.ebwt
			// files
			string file3 = outfile + ".3." + gEbwt_ext;
			string file4 = outfile + ".4." + gEbwt_ext;
			// Open output stream for the '.3.ebwt' file which will
			// hold the size records.
			ofstream fout3(file3.c_str(), ios::binary);
			if(!fout3.good()) {
				cerr << "Could not open index file for writing: \"" << file3 << "\"" << endl
					 << "Please make sure the directory exists and that permissions allow writing by" << endl
					 << "Bowtie." << endl;
				throw 1;
			}
			BitpairOutFileBuf bpout(file4.c_str());
			// Read in the sizes of all the unambiguous stretches of
			// the genome into a vector of RefRecords.  The input
			// streams are reset once it's done.
			writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel
			if(color) {
				refparams.color = false;
				// Make sure the .3.ebwt and .4.ebwt files contain
				// nucleotides; not colors
				TIndexOff numSeqs = 0;
				fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				refparams.color = true;
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) {
					szs[i].write(fout3, bigEndian);
				}
				szs.clear();
				plens.clear();
				// Now read in the colorspace size records; these are
				// the ones that were indexed
				TIndexOff numSeqs2 = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs2);
				assert_geq(numSeqs, numSeqs2);
			} else {
				TIndexOff numSeqs = 0;
				sztot = fastaRefReadSizes(is, szs, plens, refparams, &bpout, numSeqs);
				writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
				for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
			}
			if(sztot.first == 0) {
				cerr << "Error: No unambiguous stretches of characters in the input.  Aborting..." << endl;
				throw 1;
			}
			assert_gt(sztot.first, 0);
			assert_gt(sztot.second, 0);
			bpout.close();
			fout3.close();
#ifndef NDEBUG
			if(sanityCheck) {
				BitPairReference bpr(
					outfile, // ebwt basename
					color,   // expect color?
					true,    // sanity check?
					&infiles,// files to check against
					NULL,    // sequences to check against
					format == CMDLINE, // whether infiles contains strings
					true,    // load sequence?
					false,   // use memory-mapped files
					false,   // use shared memory
					false,   // sweep through memory-mapped memory
					false,   // be talkative
					false);  // be talkative
			}
#endif
		} else {
			// Read in the sizes of all the unambiguous stretches of the
			// genome into a vector of RefRecords
			TIndexOff numSeqs = 0;
			sztot = fastaRefReadSizes(is, szs, plens, refparams, NULL, numSeqs);
#ifndef NDEBUG
			if(refparams.color) {
				refparams.color = false;
				vector<RefRecord> szs2;
				vector<uint32_t> plens2;
				TIndexOff numSeqs2 = 0;
				fastaRefReadSizes(is, szs2, plens2, refparams, NULL, numSeqs2);
				assert_leq(numSeqs, numSeqs2);
				// One less color than base
				refparams.color = true;
			}
#endif
		}
	}
	if(justRef) return;
	assert_gt(sztot.first, 0);
	assert_gt(sztot.second, 0);
	assert_gt(szs.size(), 0);
	// Construct Ebwt from input strings and parameters
	Ebwt<TStr> ebwt(refparams.color ? 1 : 0,
	                lineRate,
	                linesPerSide,
	                offRate,      // suffix-array sampling rate
	                -1,           // ISA sampling rate
	                ftabChars,    // number of chars in initial arrow-pair calc
			nthreads,
	                outfile,      // basename for .?.ebwt files
	                !reverse,     // fw
	                !entireSA,    // useBlockwise
	                bmax,         // block size for blockwise SA builder
	                bmaxMultSqrt, // block size as multiplier of sqrt(len)
	                bmaxDivN,     // block size as divisor of len
	                noDc? 0 : dcv,// difference-cover period
	                is,           // list of input streams
	                szs,          // list of reference sizes
	                plens,        // list of not-all-gap reference sequence lengths
	                (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
	                refparams,    // reference read-in parameters
	                seed,         // pseudo-random number generator seed
	                -1,           // override offRate
	                -1,           // override isaRate
	                verbose,      // be talkative
	                autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
	                sanityCheck); // verify results and internal consistency
	// Note that the Ebwt is *not* resident in memory at this time.  To
	// load it into memory, call ebwt.loadIntoMemory()
	if(verbose) {
		// Print Ebwt's vital stats
		ebwt.eh().print(cout);
	}
	if(sanityCheck) {
		// Try restoring the original string (if there were
		// multiple texts, what we'll get back is the joined,
		// padded string, not a list)
		ebwt.loadIntoMemory(
			refparams.color ? 1 : 0,
			-1,
			false,
			false);
		TStr s2; ebwt.restore(s2);
		ebwt.evictFromMemory();
		{
			TStr joinedss = Ebwt<TStr>::join(
				is,          // list of input streams
				szs,         // list of reference sizes
				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
				refparams,   // reference read-in parameters
				seed);       // pseudo-random number generator seed
			if(refparams.reverse == REF_READ_REVERSE) {
				reverseInPlace(joinedss);
			}
			assert_eq(length(joinedss), length(s2));
			assert_eq(joinedss, s2);
		}
		if(verbose) {
			if(length(s2) < 1000) {
				cout << "Passed restore check: " << s2 << endl;
			} else {
				cout << "Passed restore check: (" << length(s2) << " chars)" << endl;
			}
		}
	}
}