Ejemplo n.º 1
0
/**
 * Dispense the next read or read pair from this pattern source.
 *
 * The success/done/paired flags are cleared, the actual reading is
 * delegated to nextReadPairImpl() (implemented in concrete
 * subclasses) and, if a read was obtained, each non-empty mate is
 * finalized and given a random seed.
 *
 * The fixName argument is accepted but not consulted by this
 * function.
 *
 * Returns true iff a read or pair was parsed successfully, i.e. the
 * value left in 'success'.
 */
bool PatternSource::nextReadPair(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	// Start from a clean slate; the implementation reports its
	// outcome back through these flags
	paired = false;
	done = false;
	success = false;
	nextReadPairImpl(ra, rb, rdid, endid, success, done, paired);
	if(!success) {
		// Nothing was read, so there is nothing to post-process
		return false;
	}
	// Construct reversed versions of fw and rc seqs/quals
	ra.finalize();
	if(!rb.empty()) {
		rb.finalize();
	}
	// The random seed of each mate is derived from the user-specified
	// seed combined with the mate's sequence, qualities and name
	ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
	if(!rb.empty()) {
		rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
	}
	return true;
}
Ejemplo n.º 2
0
/**
 * Dispense the next pair of reads from the in-memory vectors.
 *
 * This is unused, but implementation is given for completeness.
 *
 * Consecutive elements of v_ / quals_ / trimmed3_ / trimmed5_ are
 * handed out as mate 1 (ra) and mate 2 (rb).  Both mates are named
 * after the current value of readCnt_, which is also reported through
 * rdid and endid.
 *
 * On return:
 *   paired   is always true
 *   success  is true iff ra and rb were filled in
 *   done     is true once fewer than two elements remain
 *
 * Returns the same value as 'success'.
 */
bool VectorPatternSource::nextReadPairImpl(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired)
{
	// Let Strings begin at the beginning of the respective bufs
	ra.reset();
	rb.reset();
	paired = true;
	lock();
	// cur_ and paired_ are shared state, so the switch to paired mode
	// is made while holding the lock, like every other update of cur_
	if(!paired_) {
		paired_ = true;
		// NOTE(review): presumably converts a read index into an index
		// of individual mates -- confirm against the unpaired path
		cur_ <<= 1;
	}
	// A pair needs elements cur_ and cur_+1.  The test is phrased as
	// an addition because v_.size()-1 wraps around to a huge value
	// when v_ is empty, which would let the out-of-range access below
	// go ahead.
	if(cur_ + 1 >= v_.size()) {
		unlock();
		// Clear all the Strings, as a signal to the caller that
		// we're out of reads
		ra.reset();
		rb.reset();
		assert(ra.empty());
		assert(rb.empty());
		success = false;
		done = true;
		return false;
	}
	// Copy v_*, quals_* strings into the respective Strings
	ra.patFw  = v_[cur_];
	ra.qual = quals_[cur_];
	ra.trimmed3 = trimmed3_[cur_];
	ra.trimmed5 = trimmed5_[cur_];
	cur_++;
	rb.patFw  = v_[cur_];
	rb.qual = quals_[cur_];
	rb.trimmed3 = trimmed3_[cur_];
	rb.trimmed5 = trimmed5_[cur_];
	// Both mates share a name: the decimal form of the read counter
	ostringstream os;
	os << readCnt_;
	ra.name = os.str();
	rb.name = os.str();
	ra.color = rb.color = gColor;
	cur_++;
	// Done as soon as another complete pair is no longer available
	done = cur_ + 1 >= v_.size();
	rdid = endid = readCnt_;
	readCnt_++;
	unlock();
	success = true;
	return true;
}
Ejemplo n.º 3
0
/**
 * The main member function for dispensing pairs of reads or
 * singleton reads from a list of PatternSources that are consumed
 * one after another.
 *
 * The current source is asked for a read until it either delivers
 * one or reports that it is done; in the latter case the next source
 * in the list is tried.  A delivered read gets its random seed,
 * read/end ids and mate numbers filled in; when rb is non-empty and
 * fixName is set, the mate names are fixed up as well.
 *
 * Returns true as soon as a read (or pair) was obtained.  Returns
 * false once the list is exhausted, in which case 'done' tells
 * whether the local cursor reached the end of the list.
 */
bool PairedSoloPatternSource::nextReadPair(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	// Index of the PatternSource currently being drained
	uint32_t srcIdx = cur_;
	success = false;
	while(srcIdx < src_->size()) {
		// Keep asking the current source until it either yields a
		// read or announces that it has run dry
		do {
			(*src_)[srcIdx]->nextReadPair(
				ra, rb, rdid, endid, success, done, paired, fixName);
		} while(!(success || done));
		if(success) {
			ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
			if(!rb.empty()) {
				rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
				if(fixName) {
					ra.fixMateName(1);
					rb.fixMateName(2);
				}
			}
			ra.rdid = rdid;
			ra.endid = endid;
			if(!rb.empty()) {
				// The second mate takes the end id following the first
				rb.rdid = rdid;
				rb.endid = endid+1;
			}
			ra.mate = 1;
			rb.mate = 2;
			return true;
		}
		// The source is exhausted.  Advance the shared cursor, unless
		// it has already moved past this source, and resume from
		// wherever it now points.
		assert(done);
		lock();
		if(cur_ < srcIdx + 1) ++cur_;
		srcIdx = cur_;
		unlock();
	}
	assert_leq(srcIdx, src_->size());
	done = (srcIdx == src_->size());
	return false;
}
Ejemplo n.º 4
0
/**
 * Dispense the next unpaired read from the in-memory vectors.
 *
 * Element cur_ of v_ / quals_ / trimmed3_ / trimmed5_ is copied into
 * r, which is named after the decimal value of cur_.  rdid and endid
 * both receive the current value of readCnt_.
 *
 * On return 'success' is true iff r was filled in, and 'done' is
 * true once the last element has been handed out.
 *
 * Returns the same value as 'success'.
 */
bool VectorPatternSource::nextReadImpl(
	Read& r,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done)
{
	// Let Strings begin at the beginning of the respective bufs
	r.reset();
	lock();
	if(cur_ < v_.size()) {
		// Copy v_*, quals_* strings into the respective Strings
		r.color = gColor;
		r.patFw  = v_[cur_];
		r.qual = quals_[cur_];
		r.trimmed3 = trimmed3_[cur_];
		r.trimmed5 = trimmed5_[cur_];
		// The read is named after its position in the vector
		ostringstream nameStream;
		nameStream << cur_;
		r.name = nameStream.str();
		cur_++;
		done = (cur_ == v_.size());
		endid = readCnt_;
		rdid = endid;
		readCnt_++;
		unlock();
		success = true;
		return true;
	}
	unlock();
	// Clear all the Strings, as a signal to the caller that
	// we're out of reads
	r.reset();
	success = false;
	done = true;
	assert(r.empty());
	return false;
}
Ejemplo n.º 5
0
/**
 * The main member function for dispensing pairs of reads or
 * singleton reads.
 *
 * srca_ and srcb_ are parallel lists of PatternSources.  Where the
 * srcb_ entry is NULL, the srca_ entry supplies unpaired reads;
 * otherwise the srca_ entry supplies mate 1 and the srcb_ entry
 * supplies mate 2.  Entries are consumed in order, moving on to the
 * next one when the current one reports that it is done.
 *
 * On return 'paired' tells which kind of entry was consulted last and
 * 'success' tells whether ra (and rb, for a pair) was filled in.
 *
 * Returns the final value of 'success'.
 *
 * Throws the int 1 if the two mate sources of an entry do not hold
 * the same number of reads.
 */
bool PairedDualPatternSource::nextReadPair(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	// 'cur' indexes the current pair of PatternSources
	uint32_t cur;
	{
		lock();
		cur = cur_;
		unlock();
	}
	success = false;
	done = true;
	while(cur < srca_->size()) {
		if((*srcb_)[cur] == NULL) {
			paired = false;
			// Patterns from srca_ are unpaired
			do {
				(*srca_)[cur]->nextRead(ra, rdid, endid, success, done);
			} while(!success && !done);
			if(!success) {
				assert(done);
				lock();
				// Only advance the shared cursor if no other caller
				// has already moved it past this source
				if(cur + 1 > cur_) cur_++;
				cur = cur_; // Move on to next PatternSource
				unlock();
				continue; // on to next pair of PatternSources
			}
			ra.rdid = rdid;
			ra.endid = endid;
			ra.mate  = 0;
			return success;
		} else {
			paired = true;
			// Patterns from srca_[cur_] and srcb_[cur_] are paired
			TReadId rdid_a = 0, endid_a = 0;
			TReadId rdid_b = 0, endid_b = 0;
			bool success_a = false, done_a = false;
			bool success_b = false, done_b = false;
			// Lock to ensure that this thread gets parallel reads
			// in the two mate files
			lock();
			do {
				(*srca_)[cur]->nextRead(ra, rdid_a, endid_a, success_a, done_a);
			} while(!success_a && !done_a);
			do {
				(*srcb_)[cur]->nextRead(rb, rdid_b, endid_b, success_b, done_b);
			} while(!success_b && !done_b);
			if(!success_a && success_b) {
				// Release the lock before throwing; otherwise it would
				// stay held after the exception leaves this function
				unlock();
				cerr << "Error, fewer reads in file specified with -1 than in file specified with -2" << endl;
				throw 1;
			} else if(!success_a) {
				assert(done_a && done_b);
				if(cur + 1 > cur_) cur_++;
				cur = cur_; // Move on to next PatternSource
				unlock();
				continue; // on to next pair of PatternSources
			} else if(!success_b) {
				// Same as above: never throw while holding the lock
				unlock();
				cerr << "Error, fewer reads in file specified with -2 than in file specified with -1" << endl;
				throw 1;
			}
			assert_eq(rdid_a, rdid_b);
			//assert_eq(endid_a+1, endid_b);
			assert_eq(success_a, success_b);
			unlock();
			if(fixName) {
				ra.fixMateName(1);
				rb.fixMateName(2);
			}
			// Report the ids and flags of mate 1 for the whole pair
			rdid = rdid_a;
			endid = endid_a;
			success = success_a;
			done = done_a;
			ra.rdid = rdid;
			ra.endid = endid;
			if(!rb.empty()) {
				rb.rdid = rdid;
				rb.endid = endid+1;
			}
			ra.mate = 1;
			rb.mate = 2;
			return success;
		}
	}
	return success;
}
Ejemplo n.º 6
0
/**
 * Finalize FASTQ parsing outside critical section.
 *
 * Parses the raw record text held in r.readOrigBuf (r.readOrigBufLen
 * bytes) into r's name, sequence and quality fields, applying the 5'
 * and 3' trimming settings.  If rb holds raw data that has not been
 * parsed yet, it is parsed afterwards by a recursive call with the
 * roles of the two reads swapped.
 *
 * r     read to parse; must be empty and must have raw data
 * rb    opposite mate, parsed as well when it has unparsed raw data
 * rdid  read id; used to synthesize a name when the record has none
 *
 * Returns false after reporting a quality string in the wrong
 * format, or with too few or too many qualities; true otherwise.
 */
bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const {
	// We assume the light parser has put the raw data for the separate ends
	// into separate Read objects.  That doesn't have to be the case, but
	// that's how we've chosen to do it for FastqPatternSource
	assert_gt(r.readOrigBufLen, 0);
	assert(r.empty());
	int c;
	// Parsing starts at the second byte of the record
	// NOTE(review): presumably the first byte is the '@' marker -- confirm
	size_t cur = 1;
	const size_t buflen = r.readOrigBufLen;

	// Parse read name
	assert_eq(0, seqan::length(r.name));
	int nameoff = 0;
	while(true) {
		assert_lt(cur, buflen);
		c = r.readOrigBuf[cur++];
		if(c == '\n' || c == '\r') {
			// Skip the rest of the line terminator, but never read
			// beyond the end of the record
			while(cur < buflen) {
				c = r.readOrigBuf[cur++];
				if(c != '\n' && c != '\r') {
					break;
				}
			}
			break;
		}
		r.nameBuf[nameoff++] = c;
	}
	r.nameBuf[nameoff] = '\0';
	_setBegin(r.name, r.nameBuf);
	_setLength(r.name, nameoff);
	
	// Parse sequence
	int nchar = 0, seqoff = 0;
	assert_eq(0, seqan::length(r.patFw));
	if(color_ && asc2dnacat[c] > 0) {
		// First char is a DNA char (primer)
		if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
			// 2nd char isn't a color, so don't assume 'c' is primer
			cur -= 2;
		} else {
			// 'c' is primer
			r.primer = c;
		}
		c = r.readOrigBuf[cur++];
	}
	if(color_) {
		while(c != '+' && cur < buflen) {
			// Colors 0-3 are stored using the nucleotide alphabet
			if(c >= '0' && c < '4') {
				c = "ACGTN"[(int)c - '0'];
			}
			if(c == '.') {
				c = 'N';
			}
			if(isalpha(c)) {
				assert_in(toupper(c), "ACGTN");
				if(nchar++ >= this->trim5_) {
					assert_neq(0, asc2dnacat[c]);
					r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
				}
			}
			c = r.readOrigBuf[cur++];
		}
		r.color = true;
	} else {
		while(c != '+' && cur < buflen) {
			if(c == '.') {
				c = 'N';
			}
			if(isalpha(c)) {
				// If it's past the 5'-end trim point
				if(nchar++ >= this->trim5_) {
					r.patBufFw[seqoff++] = charToDna5[c];
				}
			}
			assert_lt(cur, buflen);
			c = r.readOrigBuf[cur++];
		}
	}
	_setBegin(r.patFw, (Dna5*)r.patBufFw);
	// record amt trimmed from 5' end due to --trim5
	r.trimmed5 = (int)(nchar - seqoff);
	// record amt trimmed from 3' end due to --trim3
	int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
	_setLength(r.patFw, seqoff - trim3);
	r.patBufFw[seqan::length(r.patFw)] = '\0';
	r.trimmed3 = trim3;

	// Skip the remainder of the '+' line and its line terminator
	assert_eq('+', c);
	do {
		assert_lt(cur, buflen);
		c = r.readOrigBuf[cur++];
	} while(c != '\n' && c != '\r');
	while(cur < buflen && (c == '\n' || c == '\r')) {
		c = r.readOrigBuf[cur++];
	}
	
	// Parse qualities; 'c' now holds the first quality character
	assert_eq(0, seqan::length(r.qual));
	int nqual = 0, qualoff = 0;
	if (intQuals_) {
		// Qualities are decimal integers separated by whitespace
		int cur_int = 0;
		while(c != '\t' && c != '\n' && c != '\r') {
			cur_int *= 10;
			cur_int += (int)(c - '0');
			// Treat the end of the record like a line terminator so
			// that the last value is still emitted and the loop never
			// reads beyond the record
			c = (cur < buflen) ? r.readOrigBuf[cur++] : '\n';
			if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
				char cadd = intToPhred33(cur_int, solQuals_);
				cur_int = 0;
				if (c == ' ')
					c = (cur < buflen) ? r.readOrigBuf[cur++] : '\n';
				assert_geq(cadd, 33);
				if(++nqual > this->trim5_) {
					r.qualBuf[qualoff++] = cadd;
				}
			}
		}
	} else {
		c = charToPhred33(c, solQuals_, phred64Quals_);
		if(nqual++ >= r.trimmed5) {
			r.qualBuf[qualoff++] = c;
		}
		while(cur < buflen) {
			c = r.readOrigBuf[cur++];
			if (c == ' ') {
				wrongQualityFormat(r.name);
				return false;
			}
			if(c == '\r' || c == '\n') {
				break;
			}
			c = charToPhred33(c, solQuals_, phred64Quals_);
			if(nqual++ >= r.trimmed5) {
				r.qualBuf[qualoff++] = c;
			}
		}
		if(qualoff < seqoff) {
			tooFewQualities(r.name);
			return false;
		} else if(qualoff > seqoff) {
			// if qualoff is at most 2 characters longer than the sequence
			// then the extra characters will most likely be the quality values
			// of the primer and the first base (which get discarded by bowtie).
			// In this case move the remainder of the sequence the (qualoff - seqoff)
			// positions left.
			if (r.color && qualoff - seqoff <= 2) {
				memmove(r.qualBuf, r.qualBuf + (qualoff - seqoff), seqoff);
			}
			else {
				tooManyQualities(r.name);
				return false;
			}
		}
	}
	// The quality string is cut to the length of the trimmed sequence
	r.qualBuf[seqan::length(r.patFw)] = '\0';
	_setBegin(r.qual, r.qualBuf);
	_setLength(r.qual, seqan::length(r.patFw));

	// Set up a default name if one hasn't been set.  The name is the
	// decimal form of the read id, and its length is taken from the
	// text just written (nameoff is 0 here and would leave the name
	// empty).
	if(seqan::length(r.name) == 0) {
		itoa10<TReadId>(static_cast<TReadId>(rdid), r.nameBuf);
		_setBegin(r.name, r.nameBuf);
		_setLength(r.name, strlen(r.nameBuf));
	}
	r.parsed = true;
	// Parse the opposite mate too if it is still waiting
	if(!rb.parsed && rb.readOrigBufLen > 0) {
		return parse(rb, r, rdid);
	}
	return true;
}
Ejemplo n.º 7
0
/**
 * Finalize FASTA-continuous parsing outside critical section.
 *
 * The raw record in ra.readOrigBuf (ra.readOrigBufLen bytes) consists
 * of a name, a tab, and the sequence.  The name and the sequence are
 * installed in ra, the 5' and 3' trimming settings are applied, and
 * every remaining base is given the placeholder quality 'I'.
 *
 * rb and rdid are not modified or consulted beyond the sanity checks
 * on rb.
 *
 * Returns false if the record ends before the sequence starts; true
 * otherwise.
 */
bool FastaContinuousPatternSource::parse(
	Read& ra,
	Read& rb,
	TReadId rdid) const
{
	// Light parser (nextBatchFromFile) puts unparsed data
	// into Read& r, even when the read is paired.
	assert(ra.empty());
	assert(rb.empty());
	assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here
	assert_eq(0, rb.readOrigBufLen);
	int c = '\t';
	size_t cur = 0;
	const size_t buflen = ra.readOrigBufLen;
	
	// Parse read name
	assert_eq(0, seqan::length(ra.name));
	int nameoff = 0;
	c = ra.readOrigBuf[cur++];
	while(c != '\t' && cur < buflen) {
		ra.nameBuf[nameoff++] = c;
		c = ra.readOrigBuf[cur++];
	}
	assert_eq('\t', c);
	if(cur >= buflen) {
		return false; // record ended prematurely
	}
	ra.nameBuf[nameoff] = '\0';
	_setBegin(ra.name, ra.nameBuf);
	_setLength(ra.name, nameoff);

	// Parse sequence
	// NOTE(review): the last byte of the record is fetched but never
	// examined by this loop; presumably it is a terminator rather
	// than a base -- confirm against the light parser
	assert_eq(0, seqan::length(ra.patFw));
	c = ra.readOrigBuf[cur++];
	int nchar = 0, seqoff = 0;
	while(cur < buflen) {
		if(isalpha(c)) {
			assert_in(toupper(c), "ACGTN");
			if(nchar++ >= this->trim5_) {
				assert_neq(0, asc2dnacat[c]);
				ra.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
			}
		}
		c = ra.readOrigBuf[cur++];
	}
	ra.patBufFw[seqoff] = '\0';
	_setBegin(ra.patFw, (Dna5*)ra.patBufFw);
	// record amt trimmed from 5' end due to --trim5
	ra.trimmed5 = (int)(nchar - seqoff);
	// record amt trimmed from 3' end due to --trim3
	int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
	_setLength(ra.patFw, seqoff - trim3);
	ra.trimmed3 = trim3;
	
	// Make fake qualities: exactly one per base that survived
	// trimming, so that the quality string and the sequence have the
	// same length
	assert_eq(0, seqan::length(ra.qual));
	const int readLen = seqoff - trim3;
	for(int i = 0; i < readLen; i++) {
		ra.qualBuf[i] = 'I';
	}
	ra.qualBuf[readLen] = '\0';
	_setBegin(ra.qual, ra.qualBuf);
	_setLength(ra.qual, readLen);
	ra.parsed = true;
	return true;
}
Ejemplo n.º 8
0
/**
 * Finalize FASTA parsing outside critical section.
 *
 * Turns the raw record in r.readOrigBuf (r.readOrigBufLen bytes) into
 * r's name and sequence, applies the 5' and 3' trimming settings and
 * gives every remaining base the placeholder quality 'I'.  A record
 * without a name is named after rdid.  If rb holds raw data that has
 * not been parsed yet, it is parsed afterwards by a recursive call
 * with the roles of the two reads swapped.
 *
 * Returns false if the record ends before the sequence line starts;
 * true otherwise.
 */
bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
	// The light parser is assumed to have placed the raw data of each
	// end in its own Read object; that is the convention chosen for
	// FastaPatternSource
	assert_gt(r.readOrigBufLen, 0);
	assert(r.empty());
	int ch = -1;
	// Parsing starts at the second byte of the record
	size_t pos = 1;
	const size_t rawLen = r.readOrigBufLen;
	
	// Read name: everything up to the first line terminator
	assert_eq(0, seqan::length(r.name));
	int nameLen = 0;
	while(pos < rawLen) {
		ch = r.readOrigBuf[pos++];
		const bool atLineEnd = (ch == '\n' || ch == '\r');
		if(!atLineEnd) {
			r.nameBuf[nameLen++] = ch;
			continue;
		}
		// Swallow the rest of the line terminator; afterwards 'ch'
		// holds the first character of the sequence line
		do {
			ch = r.readOrigBuf[pos++];
		} while((ch == '\n' || ch == '\r') && pos < rawLen);
		break;
	}
	if(pos >= rawLen) {
		return false; // FASTA ended prematurely
	}
	if(nameLen > 0) {
		r.nameBuf[nameLen] = '\0';
		_setBegin(r.name, r.nameBuf);
		_setLength(r.name, nameLen);
	}
	
	// Sequence
	int basesSeen = 0;  // alphabetic characters encountered so far
	int basesKept = 0;  // of those, how many lie past the 5' trim point
	assert_eq(0, seqan::length(r.patFw));
	assert(ch != '\n' && ch != '\r');
	assert_lt(pos, rawLen);

	if(color_ && asc2dnacat[ch] > 0) {
		// The first character is a DNA character, i.e. maybe a primer
		if(asc2colcat[toupper(r.readOrigBuf[pos++])] > 0) {
			// It is followed by a color, so it really is the primer
			r.primer = ch;
		} else {
			// Not followed by a color: it is not a primer, step back
			pos -= 2;
		}
		ch = r.readOrigBuf[pos++];
	}
	// One pass over the sequence line.  Only colorspace input has its
	// digits translated and its characters sanity-checked.
	for(; ch != '\n' && pos < rawLen; ch = r.readOrigBuf[pos++]) {
		if(color_ && ch >= '0' && ch < '4') {
			ch = "ACGTN"[(int)ch - '0'];
		}
		if(ch == '.') {
			ch = 'N';
		}
		if(!isalpha(ch)) {
			continue;
		}
		if(color_) {
			assert_in(toupper(ch), "ACGTN");
		}
		// Keep the character only if it's past the 5'-end trim point
		if(basesSeen++ >= this->trim5_) {
			if(color_) {
				assert_neq(0, asc2dnacat[ch]);
			}
			r.patBufFw[basesKept++] = charToDna5[ch]; // ascii to int
		}
	}
	if(color_) {
		r.color = true;
	}
	r.patBufFw[basesKept] = '\0';
	_setBegin(r.patFw, (Dna5*)r.patBufFw);
	// amount trimmed from the 5' end due to --trim5
	r.trimmed5 = (int)(basesSeen - basesKept);
	// amount trimmed from the 3' end due to --trim3
	int trim3 = this->trim3_;
	if(basesKept < this->trim3_) {
		trim3 = basesKept;
	}
	const int readLen = basesKept - trim3;
	_setLength(r.patFw, readLen);
	r.trimmed3 = trim3;
	
	// FASTA carries no qualities, so use a placeholder for each base
	for(int i = 0; i < readLen; ++i) {
		r.qualBuf[i] = 'I';
	}
	r.qualBuf[readLen] = '\0';
	_setBegin(r.qual, r.qualBuf);
	_setLength(r.qual, readLen);

	// A record without a name is named after its read id
	if(nameLen == 0) {
		itoa10<TReadId>(static_cast<TReadId>(rdid), r.nameBuf);
		_setBegin(r.name, r.nameBuf);
		_setLength(r.name, strlen(r.nameBuf));
	}
	r.parsed = true;
	// Parse the opposite mate too if it is still waiting
	if(!rb.parsed && rb.readOrigBufLen > 0) {
		return parse(rb, r, rdid);
	}
	return true;
}
Ejemplo n.º 9
0
/**
 * Finishes parsing outside the critical section.
 *
 * The raw record for the read, or for both ends of a pair, is held in
 * ra.readOrigBuf (ra.readOrigBufLen bytes) as tab-separated fields:
 * a name, a sequence and a quality string for the first end,
 * optionally followed by the fields of the second end.  The second
 * end has a name field of its own only when paired_ is set;
 * otherwise it inherits the name of the first end.
 *
 * The fields of the first end are installed in ra and those of the
 * second end, if present, in rb; the 5' and 3' trimming settings are
 * applied to both.
 *
 * Returns false if the record ends prematurely or after reporting a
 * quality string in the wrong format, or with too few or too many
 * qualities; true otherwise.
 */
bool VectorPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
	// Very similar to TabbedPatternSource

	// Light parser (nextBatchFromFile) puts unparsed data
	// into Read& r, even when the read is paired.
	assert(ra.empty());
	assert_gt(ra.readOrigBufLen, 0); // raw data for read/pair is here
	int c = '\t';
	size_t cur = 0;
	const size_t buflen = ra.readOrigBufLen;
	
	// Loop over the two ends.  The second iteration only runs if the
	// first end's qualities were terminated by a tab.
	for(int endi = 0; endi < 2 && c == '\t'; endi++) {
		Read& r = ((endi == 0) ? ra : rb);
		assert_eq(0, seqan::length(r.name));
		// Parse name if (a) this is the first end, or
		// (b) this is tab6
		size_t nameoff = 0;
		if(endi < 1 || paired_) {
			// Parse read name
			c = ra.readOrigBuf[cur++];
			while(c != '\t' && cur < buflen) {
				r.nameBuf[nameoff++] = c;
				c = ra.readOrigBuf[cur++];
			}
			assert_eq('\t', c);
			if(cur >= buflen) {
				return false; // record ended prematurely
			}
		} else {
			// This is the second end and we're parsing tab5, so the
			// name comes from the first end.  The characters are
			// copied into this end's own name buffer so that the
			// common code below installs them rather than replacing
			// the name with an empty one.
			const size_t inherited = seqan::length(ra.name);
			for(; nameoff < inherited; nameoff++) {
				r.nameBuf[nameoff] = ra.nameBuf[nameoff];
			}
		}
		r.nameBuf[nameoff] = '\0';
		_setBegin(r.name, r.nameBuf);
		_setLength(r.name, nameoff);

		// Parse sequence
		assert_eq(0, seqan::length(r.patFw));
		c = ra.readOrigBuf[cur++];
		int nchar = 0, seqoff = 0;
		if(color_ && asc2dnacat[c] > 0) {
			// First char is a DNA char (primer).  All raw data,
			// including that of the second end, lives in ra's buffer,
			// so the look-ahead has to read from there too.
			if(asc2colcat[toupper(ra.readOrigBuf[cur++])] <= 0) {
				// 2nd char isn't a color, so don't assume 'c' is primer
				cur -= 2;
			} else {
				// 'c' is primer
				r.primer = c;
			}
			c = ra.readOrigBuf[cur++];
		}
		if(color_) {
			while(c != '\t' && cur < buflen) {
				// Colors 0-3 are stored using the nucleotide alphabet
				if(c >= '0' && c < '4') {
					c = "ACGTN"[(int)c - '0'];
				}
				if(c == '.') {
					c = 'N';
				}
				if(isalpha(c)) {
					assert_in(toupper(c), "ACGTN");
					if(nchar++ >= this->trim5_) {
						assert_neq(0, asc2dnacat[c]);
						r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
					}
				}
				c = ra.readOrigBuf[cur++];
			}
			// Flag the end that was just parsed, not only the first one
			r.color = true;
		} else {
			while(c != '\t' && cur < buflen) {
				if(isalpha(c)) {
					assert_in(toupper(c), "ACGTN");
					if(nchar++ >= this->trim5_) {
						assert_neq(0, asc2dnacat[c]);
						r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
					}
				}
				c = ra.readOrigBuf[cur++];
			}
		}
		assert_eq('\t', c);
		if(cur >= buflen) {
			return false; // record ended prematurely
		}
		r.patBufFw[seqoff] = '\0';
		_setBegin(r.patFw, (Dna5*)r.patBufFw);
		// record amt trimmed from 5' end due to --trim5
		r.trimmed5 = (int)(nchar - seqoff);
		// record amt trimmed from 3' end due to --trim3
		int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
		_setLength(r.patFw, seqoff - trim3);
		r.trimmed3 = trim3;
		
		// Parse qualities
		assert_eq(0, seqan::length(r.qual));
		c = ra.readOrigBuf[cur++];
		int nqual = 0;
		size_t qualoff = 0;
		while(c != '\t' && c != '\n' && c != '\r') {
			if(c == ' ') {
				wrongQualityFormat(r.name);
				return false;
			}
			char cadd = charToPhred33(c, false, false);
			if(++nqual > this->trim5_) {
				r.qualBuf[qualoff++] = cadd;
			}
			if(cur >= buflen) break;
			c = ra.readOrigBuf[cur++];
		}
		// The untrimmed counts of bases and qualities must agree
		if(nchar > nqual) {
			tooFewQualities(r.name);
			return false;
		} else if(nqual > nchar) {
			tooManyQualities(r.name);
			return false;
		}
		r.qualBuf[seqoff] = '\0';
		_setBegin(r.qual, r.qualBuf);
		_setLength(r.qual, seqan::length(r.patFw));
		assert(c == '\t' || c == '\n' || c == '\r' || cur >= buflen);
	}
	ra.parsed = true;
	if(!rb.parsed && rb.readOrigBufLen > 0) {
		return parse(rb, ra, rdid);
	}
	return true;
}
Ejemplo n.º 10
0
/**
 * Finalize raw parsing outside critical section.
 *
 * The raw record in r.readOrigBuf (r.readOrigBufLen bytes) holds
 * nothing but the sequence.  It is installed in r with the 5' and 3'
 * trimming settings applied, the read is named after rdid, and every
 * remaining base is given the placeholder quality 'I'.  If rb holds
 * raw data that has not been parsed yet, it is parsed afterwards by a
 * recursive call with the roles of the two reads swapped.
 *
 * Returns true, or the result of the recursive call for rb.
 */
bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
	assert(r.empty());
	assert_gt(r.readOrigBufLen, 0);
	size_t cur = 0;
	const size_t buflen = r.readOrigBufLen;

	// Parse sequence
	assert_eq(0, seqan::length(r.patFw));
	int nchar = 0, seqoff = 0;
	int c = r.readOrigBuf[cur++];

	if(color_ && asc2dnacat[c] > 0) {
		// First char is a DNA char (primer)
		if(asc2colcat[toupper(r.readOrigBuf[cur++])] <= 0) {
			// 2nd char isn't a color, so don't assume 'c' is primer
			cur -= 2;
		} else {
			// 'c' is primer
			r.primer = c;
		}
		c = r.readOrigBuf[cur++];
	}
	if(color_) {
		while(c != '\0') {
			assert(c != '\r' && c != '\n');
			// Colors 0-3 are stored using the nucleotide alphabet
			if(c >= '0' && c < '4') {
				c = "ACGTN"[(int)c - '0'];
			}
			if(c == '.') {
				c = 'N';
			}
			if(isalpha(c)) {
				assert_in(toupper(c), "ACGTN");
				if(nchar++ >= this->trim5_) {
					assert_neq(0, asc2dnacat[c]);
					r.patBufFw[seqoff++] = charToDna5[c]; // ascii to int
				}
			}
			// Stop at the end of the record even when it is not
			// followed by a NUL, as the non-colorspace loop does
			if(cur >= buflen) {
				break;
			}
			c = r.readOrigBuf[cur++];
		}
		r.color = true;
	} else {
		// Step back so the loop re-reads the character fetched above
		cur--;
		while(cur < buflen) {
			c = r.readOrigBuf[cur++];
			assert(c != '\r' && c != '\n');
			if(isalpha(c)) {
				assert_in(toupper(c), "ACGTN");
				if(nchar++ >= this->trim5_) {
					assert_neq(0, asc2dnacat[c]);
					r.patBufFw[seqoff++] = charToDna5[c];
				}
			}
		}
	}
	_setBegin(r.patFw, (Dna5*)r.patBufFw);
	// record amt trimmed from 5' end due to --trim5
	r.trimmed5 = (int)(nchar - seqoff);
	// record amt trimmed from 3' end due to --trim3
	int trim3 = (seqoff < this->trim3_) ? seqoff : this->trim3_;
	_setLength(r.patFw, seqoff - trim3);
	r.patBufFw[seqan::length(r.patFw)] = '\0';
	r.trimmed3 = trim3;
	
	// Give the name field a dummy value
	itoa10<TReadId>(rdid, r.nameBuf);
	_setBegin(r.name, r.nameBuf);
	_setLength(r.name, strlen(r.nameBuf));
	
	// Give the base qualities dummy values
	assert_eq(0, seqan::length(r.qual));
	const size_t len = seqan::length(r.patFw);
	for(size_t i = 0; i < len; i++) {
		r.qualBuf[i] = 'I';
	}
	_setBegin(r.qual, r.qualBuf);
	_setLength(r.qual, seqan::length(r.patFw));
	
	r.parsed = true;
	// Parse the opposite mate too if it is still waiting
	if(!rb.parsed && rb.readOrigBufLen > 0) {
		return parse(rb, r, rdid);
	}
	return true;
}
Ejemplo n.º 11
0
/**
 * The main member function for dispensing pairs of reads or
 * singleton reads.
 *
 * srca_ and srcb_ are parallel lists of PatternSources.  Where the
 * srcb_ entry is NULL, the srca_ entry supplies unpaired reads;
 * otherwise the srca_ entry supplies mate 1 and the srcb_ entry
 * supplies mate 2.  Entries are consumed in order, moving on to the
 * next one when the current one reports that it is done.
 *
 * This version takes no lock, neither around the shared cursor cur_
 * nor around the two reads of a pair.
 *
 * On return 'paired' tells which kind of entry was consulted last and
 * 'success' tells whether ra (and rb, for a pair) was filled in.
 *
 * Returns the final value of 'success'.
 *
 * Throws the int 1 if the two mate sources of an entry do not hold
 * the same number of reads.
 */
bool PairedDualPatternSource::nextReadPair(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	// Index of the entry (single source or pair of sources) that is
	// currently being drained
	uint32_t srcIdx = cur_;
	success = false;
	done = true;
	while(srcIdx < srca_->size()) {
		const bool unpairedEntry = ((*srcb_)[srcIdx] == NULL);
		paired = !unpairedEntry;
		if(unpairedEntry) {
			// Reads from this srca_ entry have no mate
			do {
				(*srca_)[srcIdx]->nextRead(ra, rdid, endid, success, done);
			} while(!(success || done));
			if(success) {
				ra.rdid = rdid;
				ra.endid = endid;
				ra.mate  = 0;
				return success;
			}
			// Entry exhausted: advance the cursor unless it has
			// already moved on, then try the next entry
			assert(done);
			if(cur_ < srcIdx + 1) ++cur_;
			srcIdx = cur_;
			continue;
		}
		// Mate 1 comes from the srca_ entry, mate 2 from the srcb_ entry
		TReadId idA = 0, endA = 0;
		TReadId idB = 0, endB = 0;
		bool okA = false, doneA = false;
		bool okB = false, doneB = false;
		do {
			(*srca_)[srcIdx]->nextRead(ra, idA, endA, okA, doneA);
		} while(!(okA || doneA));
		do {
			(*srcb_)[srcIdx]->nextRead(rb, idB, endB, okB, doneB);
		} while(!(okB || doneB));
		if(!okA) {
			if(okB) {
				cerr << "Error, fewer reads in file specified with -1 than in file specified with -2" << endl;
				throw 1;
			}
			// Both mate sources ran dry together: on to the next entry
			assert(doneA && doneB);
			if(cur_ < srcIdx + 1) ++cur_;
			srcIdx = cur_;
			continue;
		}
		if(!okB) {
			cerr << "Error, fewer reads in file specified with -2 than in file specified with -1" << endl;
			throw 1;
		}
		assert_eq(idA, idB);
		assert_eq(okA, okB);
		if(fixName) {
			ra.fixMateName(1);
			rb.fixMateName(2);
		}
		// The ids and flags of mate 1 are reported for the whole pair
		rdid = idA;
		endid = endA;
		success = okA;
		done = doneA;
		ra.rdid = rdid;
		ra.endid = endid;
		if(!rb.empty()) {
			rb.rdid = rdid;
			rb.endid = endid+1;
		}
		ra.mate = 1;
		rb.mate = 2;
		return success;
	}
	return success;
}