예제 #1
0
파일: hit.cpp 프로젝트: BenLangmead/bowtie
/**
 * Append a verbose, readable hit to the given output stream.
 *
 * One tab-separated, newline-terminated record is written per pass of
 * the loop below.  Columns are numbered in output order and a column
 * is omitted when its bit is set in 'suppress'; the column counter
 * advances whether or not the column is printed.
 *
 * Column layout when partition == 0:
 *   read name, strand, reference, offset, sequence, qualities, h.oms,
 *   mismatches [, stratum, cost] [, seed]
 *
 * Column layout when partition != 0:
 *   reference, partition id, offset, strand, sequence, qualities,
 *   h.oms, mismatches ('-' if none), mate, label
 *   [, stratum, cost] [, seed]
 *
 * @param o         buffer the record(s) are appended to
 * @param h         the hit to print
 * @param refnames  optional reference names; when non-NULL and
 *                  h.h.first is in range the name is printed via
 *                  printUptoWs, otherwise the numeric index is printed
 * @param fullRef   negated and handed to printUptoWs as its third
 *                  argument (presumably: print the whole name rather
 *                  than stopping at whitespace -- confirm against
 *                  printUptoWs)
 * @param partition 0 selects the plain layout; non-zero selects the
 *                  partitioned layout with abs(partition) as the
 *                  partition size.  Only a positive value causes extra
 *                  records to be emitted for following partitions that
 *                  the alignment spills into
 * @param offBase   added to the reference offset before it is printed
 *                  or partitioned
 * @param colorSeq  if h.color is set, print h.colSeq instead of
 *                  h.patSeq
 * @param colorQual if h.color is set, print h.colQuals instead of
 *                  h.quals
 * @param cost      when true, append the stratum and cost columns
 * @param suppress  bitset of column indexes to omit
 *
 * Note: 'showSeed' is not a parameter; it is resolved from the
 * enclosing scope (presumably a member of the sink).
 */
void VerboseHitSink::append(
	BTString& o,
	const Hit& h,
	const vector<string>* refnames,
	bool fullRef,
	int partition,
	int offBase,
	bool colorSeq,
	bool colorQual,
	bool cost,
	const Bitset& suppress)
{
	bool spill = false;   // set when another record is needed for the next partition
	int spillAmt = 0;     // number of extra (spilled) records emitted so far
	// Sentinels; overwritten on the first pass in partitioned mode
	uint32_t pdiv = 0xffffffff;
	uint32_t pmod = 0xffffffff;
	do {
		bool dospill = false;
		if(spill) {
			// The read spilled over a partition boundary and so
			// needs to be printed more than once
			spill = false;
			dospill = true;
			spillAmt++;
		}
		assert(!spill);
		size_t field = 0;        // index of the next output column
		bool firstfield = true;  // no tab before the first printed column
		if(partition != 0) {
			int pospart = abs(partition);
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				// Output a partitioning key
				// First component of the key is the reference index
				if(refnames != NULL && h.h.first < refnames->size()) {
					printUptoWs(o, (*refnames)[h.h.first], !fullRef);
				} else {
					o << h.h.first;
				}
			}
			// Next component of the key is the partition id.  It is
			// computed once, on the non-spill pass, and reused (plus
			// spillAmt) on the spill passes.
			if(!dospill) {
				pdiv = (h.h.second + offBase) / pospart;
				pmod = (h.h.second + offBase) % pospart;
			}
			assert_neq(0xffffffff, pdiv);
			assert_neq(0xffffffff, pmod);
			if(dospill) assert_gt(spillAmt, 0);
			// This check is deliberately outside the suppress test: the
			// spill decision must not depend on which columns are shown
			if(partition > 0 &&
			   (pmod + h.length()) >= ((uint32_t)pospart * (spillAmt + 1))) {
				// Spills into the next partition so we need to
				// output another alignment for that partition
				spill = true;
			}
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) {
					firstfield = false;
				} else {
					o << '\t';
				}
				// Print partition id with leading 0s so that Hadoop
				// can do lexicographical sort (modern Hadoop versions
				// seem to support numeric).
				// NOTE(review): a partition id of 0 is printed with
				// one more character than a non-zero id, because the
				// digit-counting loop below runs zero times for 0.
				int padding = 10;
				uint32_t part = (pdiv + (dospill ? spillAmt : 0));
				uint32_t parttmp = part;
				while(parttmp > 0) {
					padding--;
					parttmp /= 10;
				}
				assert_geq(padding, 0);
				for(int i = 0; i < padding; i++) {
					o << '0';
				}
				o << part;
			}
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) {
					firstfield = false;
				} else {
					o << '\t';
				}
				// Print offset with leading 0s (same digit-counting
				// scheme as the partition id above)
				int padding = 9;
				uint32_t off = h.h.second + offBase;
				uint32_t offtmp = off;
				while(offtmp > 0) {
					padding--;
					offtmp /= 10;
				}
				assert_geq(padding, 0);
				for(int i = 0; i < padding; i++) {
					o << '0';
				}
				o << off;
			}
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				// Strand
				o << (h.fw? "+":"-");
			}
			// end if(partition != 0)
		} else {
			assert(!dospill);
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				// Read name
				for(size_t i = 0; i < seqan::length(h.patName); i++) {
					o << (char)(h.patName[i]);
				}
			}
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				// Strand
				o << (h.fw? '+' : '-');
			}
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				// .first is text id, .second is offset
				if(refnames != NULL && h.h.first < refnames->size()) {
					printUptoWs(o, (*refnames)[h.h.first], !fullRef);
				} else {
					o << h.h.first;
				}
			}
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				o << (h.h.second + offBase);
			}
			// end else clause of if(partition != 0)
		}
		if(!suppress.test((uint32_t)field++)) {
			if(firstfield) firstfield = false;
			else o << '\t';
			// Read sequence; color sequence if requested and available
			const String<Dna5>* pat = &h.patSeq;
			if(h.color && colorSeq) pat = &h.colSeq;
			for(size_t i = 0; i < seqan::length(*pat); i++) {
				o << (char)((*pat)[i]);
			}
		}
		if(!suppress.test((uint32_t)field++)) {
			if(firstfield) firstfield = false;
			else o << '\t';
			// Qualities; color qualities if requested and available
			const String<char>* qual = &h.quals;
			if(h.color && colorQual) qual = &h.colQuals;
			for(size_t i = 0; i < seqan::length(*qual); i++) {
				o << (char)((*qual)[i]);
			}
		}
		if(!suppress.test((uint32_t)field++)) {
			if(firstfield) firstfield = false;
			else o << '\t';
			o << h.oms;
		}
		if(!suppress.test((uint32_t)field++)) {
			if(firstfield) firstfield = false;
			else o << '\t';
			const size_t len = length(h.patSeq);
			// Output mismatch column: comma-separated "pos:ref>qry"
			bool firstmm = true;
			for (unsigned int i = 0; i < len; ++ i) {
				if(h.mms.test(i)) {
					// There's a mismatch at this position
					if (!firstmm) {
						o << ",";
					}
					o << i; // position
					assert_gt(h.refcs.size(), i);
					char refChar = toupper(h.refcs[i]);
					// For a reverse-strand hit the read character is
					// taken from the mirrored position in patSeq
					char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1]);
					assert_neq(refChar, qryChar);
					o << ":" << refChar << ">" << qryChar;
					firstmm = false;
				}
			}
			// In partitioned mode an empty mismatch list is shown as '-'
			if(partition != 0 && firstmm) o << '-';
		}
		if(partition != 0) {
			// Fields added as of Crossbow 0.1.4
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				o << (int)h.mate;
			}
			// Print label, or whole read name if label isn't found
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				int labelOff = -1;
				// If LB: field is present, print its value.  The field
				// must start the name or follow a ';', and its value
				// runs up to the next ';' or the end of the name.
				for(int i = 0; i < (int)seqan::length(h.patName)-3; i++) {
					if(h.patName[i]   == 'L' &&
					   h.patName[i+1] == 'B' &&
					   h.patName[i+2] == ':' &&
					   ((i == 0) || h.patName[i-1] == ';'))
					{
						labelOff = i+3;
						for(int j = labelOff; j < (int)seqan::length(h.patName); j++) {
							if(h.patName[j] != ';') {
								o << h.patName[j];
							} else {
								break;
							}
						}
						// Stop after the first label; without this a
						// name carrying a second LB: field would have
						// both values run together in this one column
						break;
					}
				}
				// Otherwise, print the whole read name
				if(labelOff == -1) {
					for(size_t i = 0; i < seqan::length(h.patName); i++) {
						o << (char)(h.patName[i]);
					}
				}
			}
		}
		if(cost) {
			// Stratum
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				o << (int)h.stratum;
			}
			// Cost
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				o << (int)h.cost;
			}
		}
		if(showSeed) {
			// Seed
			if(!suppress.test((uint32_t)field++)) {
				if(firstfield) firstfield = false;
				else o << '\t';
				o << h.seed;
			}
		}
		o << '\n';
	} while(spill); // repeat once per additional partition the hit spills into
}
예제 #2
0
/**
 * Append a SAM output record for an aligned read.
 *
 * Writes, tab-separated and in this order: QNAME, FLAG, RNAME, POS,
 * MAPQ, CIGAR, MRNM, MPOS, ISIZE, SEQ, QUAL, followed by the optional
 * fields XA:i (stratum), MD:Z, NM:i, CM:i (colorspace hits only),
 * ZP:Z / Zp:Z (colorspace hits, when gReportColorPrimer is set and the
 * respective character is not '?') and XM:i (only when xms > 0).  The
 * record is terminated with endl.
 *
 * @param ss           stream the record is written to
 * @param h            the hit to print
 * @param mapq         value printed in the MAPQ column
 * @param xms          value for the XM:i field; omitted unless > 0
 * @param refnames     optional reference names
 * @param rmap         optional reference map; when both it and
 *                     refnames are non-NULL the name comes from
 *                     rmap->getName(h.h.first)
 * @param amap         not referenced in this function
 * @param fullRef      negated and handed to printUptoWs as its third
 *                     argument (presumably: print the whole reference
 *                     name -- confirm against printUptoWs)
 * @param noQnameTrunc when false, QNAME is cut at the first whitespace
 *                     character
 * @param offBase      not referenced in this function; POS and MPOS
 *                     are always printed as offset + 1
 */
void SAMHitSink::appendAligned(ostream& ss,
                               const Hit& h,
                               int mapq,
                               int xms, // value for XM:I field
                               const vector<string>* refnames,
                               ReferenceMap *rmap,
                               AnnotationMap *amap,
                               bool fullRef,
                               bool noQnameTrunc,
                               int offBase)
{
	// QNAME
	// For a mate, the final 2 chars of the name are dropped.  If the
	// name is shorter than that the limit goes negative and nothing is
	// printed.
	int qnameLen = (int)seqan::length(h.patName);
	if(h.mate > 0) qnameLen -= 2;
	for(int i = 0; i < qnameLen; i++) {
		// Route the character through unsigned char: handing isspace a
		// negative value (a byte >= 0x80 where char is signed) is
		// undefined behavior
		if(!noQnameTrunc && isspace((unsigned char)(char)h.patName[i])) break;
		ss << h.patName[i];
	}
	ss << '\t';
	// FLAG
	int flags = 0;
	if(h.mate == 1) {
		flags |= SAM_FLAG_PAIRED | SAM_FLAG_FIRST_IN_PAIR | SAM_FLAG_MAPPED_PAIRED;
	} else if(h.mate == 2) {
		flags |= SAM_FLAG_PAIRED | SAM_FLAG_SECOND_IN_PAIR | SAM_FLAG_MAPPED_PAIRED;
	}
	if(!h.fw) flags |= SAM_FLAG_QUERY_STRAND;
	if(h.mate > 0 && !h.mfw) flags |= SAM_FLAG_MATE_STRAND;
	ss << flags << "\t";
	// RNAME
	if(refnames != NULL && rmap != NULL) {
		printUptoWs(ss, rmap->getName(h.h.first), !fullRef);
	} else if(refnames != NULL && h.h.first < refnames->size()) {
		printUptoWs(ss, (*refnames)[h.h.first], !fullRef);
	} else {
		// No usable name; fall back to the numeric reference index
		ss << h.h.first;
	}
	// POS
	ss << '\t' << (h.h.second + 1);
	// MAPQ
	ss << "\t" << mapq;
	// CIGAR (always a single run of matches/mismatches)
	ss << '\t' << h.length() << 'M';
	// MRNM
	if(h.mate > 0) {
		ss << "\t=";
	} else {
		ss << "\t*";
	}
	// MPOS
	if(h.mate > 0) {
		ss << '\t' << (h.mh.second + 1);
	} else {
		ss << "\t0";
	}
	// ISIZE
	ss << '\t';
	if(h.mate > 0) {
		assert_eq(h.h.first, h.mh.first);
		// Computed in 64 bits so the subtraction of the unsigned
		// offsets cannot wrap.  Negative when this mate lies
		// downstream of its opposite mate.
		int64_t inslen = 0;
		if(h.h.second > h.mh.second) {
			inslen = (int64_t)h.h.second - (int64_t)h.mh.second + (int64_t)h.length();
			inslen = -inslen;
		} else {
			inslen = (int64_t)h.mh.second - (int64_t)h.h.second + (int64_t)h.mlen;
		}
		ss << inslen;
	} else {
		ss << '0';
	}
	// SEQ
	ss << '\t' << h.patSeq;
	// QUAL
	ss << '\t' << h.quals;
	//
	// Optional fields
	//
	// Always output stratum
	ss << "\tXA:i:" << (int)h.stratum;
	// Always output cost
	//ss << "\tXC:i:" << (int)h.cost;
	// Look for SNP annotations falling within the alignment
	// Output MD field
	size_t len = length(h.patSeq);
	int nm = 0;   // number of mismatches, reported as NM:i
	int run = 0;  // length of the current run of matching positions
	ss << "\tMD:Z:";
	const FixedBitset<1024> *mms = &h.mms;
	ASSERT_ONLY(const String<Dna5>* pat = &h.patSeq);
	const vector<char>* refcs = &h.refcs;
	if(h.color && false) {
		// Disabled: print MD:Z string w/r/t to colors, not letters
		mms = &h.cmms;
		ASSERT_ONLY(pat = &h.colSeq);
		assert_eq(length(h.colSeq), len+1);
		len = length(h.colSeq);
		refcs = &h.crefcs;
	}
	// Walk the mismatch bitset from index 0 upward for a forward hit
	// and from the last index downward for a reverse hit.  Each
	// mismatch emits the preceding match-run length followed by the
	// reference character.
	const int ilen = (int)len;
	for(int k = 0; k < ilen; ++k) {
		const int i = (h.fw ? k : (ilen - 1 - k));
		if(mms->test(i)) {
			nm++;
			// There's a mismatch at this position
			assert_gt((int)refcs->size(), i);
			char refChar = toupper((*refcs)[i]);
			ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1]));
			assert_neq(refChar, qryChar);
			ss << run << refChar;
			run = 0;
		} else {
			run++;
		}
	}
	// Trailing run of matches (possibly 0)
	ss << run;
	// Add optional edit distance field
	ss << "\tNM:i:" << nm;
	if(h.color) ss << "\tCM:i:" << h.cmms.count();
	// Add optional fields reporting the primer base and the downstream color,
	// which, if they were present, were clipped when the read was read in
	if(h.color && gReportColorPrimer) {
		if(h.primer != '?') {
			ss << "\tZP:Z:" << h.primer;
			assert(isprint(h.primer));
		}
		if(h.trimc != '?') {
			ss << "\tZp:Z:" << h.trimc;
			assert(isprint(h.trimc));
		}
	}
	if(xms > 0)  ss << "\tXM:i:" << xms;
	// endl both terminates the record and flushes the stream
	ss << endl;
}