/** * Append a verbose, readable hit to the given output stream. */ void VerboseHitSink::append( BTString& o, const Hit& h, const vector<string>* refnames, bool fullRef, int partition, int offBase, bool colorSeq, bool colorQual, bool cost, const Bitset& suppress) { bool spill = false; int spillAmt = 0; uint32_t pdiv = 0xffffffff; uint32_t pmod = 0xffffffff; do { bool dospill = false; if(spill) { // The read spilled over a partition boundary and so // needs to be printed more than once spill = false; dospill = true; spillAmt++; } assert(!spill); size_t field = 0; bool firstfield = true; if(partition != 0) { int pospart = abs(partition); if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; // Output a partitioning key // First component of the key is the reference index if(refnames != NULL && h.h.first < refnames->size()) { printUptoWs(o, (*refnames)[h.h.first], !fullRef); } else { o << h.h.first; } } // Next component of the key is the partition id if(!dospill) { pdiv = (h.h.second + offBase) / pospart; pmod = (h.h.second + offBase) % pospart; } assert_neq(0xffffffff, pdiv); assert_neq(0xffffffff, pmod); if(dospill) assert_gt(spillAmt, 0); if(partition > 0 && (pmod + h.length()) >= ((uint32_t)pospart * (spillAmt + 1))) { // Spills into the next partition so we need to // output another alignment for that partition spill = true; } if(!suppress.test((uint32_t)field++)) { if(firstfield) { firstfield = false; } else { o << '\t'; } // Print partition id with leading 0s so that Hadoop // can do lexicographical sort (modern Hadoop versions // seen to support numeric) int padding = 10; uint32_t part = (pdiv + (dospill ? spillAmt : 0)); uint32_t parttmp = part; while(parttmp > 0) { padding--; parttmp /= 10; } assert_geq(padding, 0); for(int i = 0; i < padding; i++) { o << '0'; } o << part; } if(!suppress.test((uint32_t)field++)) { if(firstfield) { firstfield = false; } else { o << '\t'; } // Print offset with leading 0s int padding = 9; uint32_t off = h.h.second + offBase; uint32_t offtmp = off; while(offtmp > 0) { padding--; offtmp /= 10; } assert_geq(padding, 0); for(int i = 0; i < padding; i++) { o << '0'; } o << off; } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << (h.fw? "+":"-"); } // end if(partition != 0) } else { assert(!dospill); if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; for(size_t i = 0; i < seqan::length(h.patName); i++) { o << (char)(h.patName[i]); } } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << (h.fw? '+' : '-'); } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; // .first is text id, .second is offset if(refnames != NULL && h.h.first < refnames->size()) { printUptoWs(o, (*refnames)[h.h.first], !fullRef); } else { o << h.h.first; } } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << (h.h.second + offBase); } // end else clause of if(partition != 0) } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; const String<Dna5>* pat = &h.patSeq; if(h.color && colorSeq) pat = &h.colSeq; for(size_t i = 0; i < seqan::length(*pat); i++) { o << (char)((*pat)[i]); } } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; const String<char>* qual = &h.quals; if(h.color && colorQual) qual = &h.colQuals; for(size_t i = 0; i < seqan::length(*qual); i++) { o << (char)((*qual)[i]); } } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << h.oms; } if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; const size_t len = length(h.patSeq); // Output mismatch column bool firstmm = true; for (unsigned int i = 0; i < len; ++ i) { if(h.mms.test(i)) { // There's a mismatch at this position if (!firstmm) { o << ","; } o << i; // position assert_gt(h.refcs.size(), i); char refChar = toupper(h.refcs[i]); char qryChar = (h.fw ? h.patSeq[i] : h.patSeq[length(h.patSeq)-i-1]); assert_neq(refChar, qryChar); o << ":" << refChar << ">" << qryChar; firstmm = false; } } if(partition != 0 && firstmm) o << '-'; } if(partition != 0) { // Fields addded as of Crossbow 0.1.4 if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << (int)h.mate; } // Print label, or whole read name if label isn't found if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; int labelOff = -1; // If LB: field is present, print its value for(int i = 0; i < (int)seqan::length(h.patName)-3; i++) { if(h.patName[i] == 'L' && h.patName[i+1] == 'B' && h.patName[i+2] == ':' && ((i == 0) || h.patName[i-1] == ';')) { labelOff = i+3; for(int j = labelOff; j < (int)seqan::length(h.patName); j++) { if(h.patName[j] != ';') { o << h.patName[j]; } else { break; } } } } // Otherwise, print the whole read name if(labelOff == -1) { for(size_t i = 0; i < seqan::length(h.patName); i++) { o << (char)(h.patName[i]); } } } } if(cost) { // Stratum if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << (int)h.stratum; } // Cost if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << (int)h.cost; } } if(showSeed) { // Seed if(!suppress.test((uint32_t)field++)) { if(firstfield) firstfield = false; else o << '\t'; o << h.seed; } } o << '\n'; } while(spill); }
/** * Append a SAM output record for an unaligned read. */ void SAMHitSink::appendAligned(ostream& ss, const Hit& h, int mapq, int xms, // value for XM:I field const vector<string>* refnames, ReferenceMap *rmap, AnnotationMap *amap, bool fullRef, bool noQnameTrunc, int offBase) { // QNAME if(h.mate > 0) { // truncate final 2 chars for(int i = 0; i < (int)seqan::length(h.patName)-2; i++) { if(!noQnameTrunc && isspace((int)h.patName[i])) break; ss << h.patName[i]; } } else { for(int i = 0; i < (int)seqan::length(h.patName); i++) { if(!noQnameTrunc && isspace((int)h.patName[i])) break; ss << h.patName[i]; } } ss << '\t'; // FLAG int flags = 0; if(h.mate == 1) { flags |= SAM_FLAG_PAIRED | SAM_FLAG_FIRST_IN_PAIR | SAM_FLAG_MAPPED_PAIRED; } else if(h.mate == 2) { flags |= SAM_FLAG_PAIRED | SAM_FLAG_SECOND_IN_PAIR | SAM_FLAG_MAPPED_PAIRED; } if(!h.fw) flags |= SAM_FLAG_QUERY_STRAND; if(h.mate > 0 && !h.mfw) flags |= SAM_FLAG_MATE_STRAND; ss << flags << "\t"; // RNAME if(refnames != NULL && rmap != NULL) { printUptoWs(ss, rmap->getName(h.h.first), !fullRef); } else if(refnames != NULL && h.h.first < refnames->size()) { printUptoWs(ss, (*refnames)[h.h.first], !fullRef); } else { ss << h.h.first; } // POS ss << '\t' << (h.h.second + 1); // MAPQ ss << "\t" << mapq; // CIGAR ss << '\t' << h.length() << 'M'; // MRNM if(h.mate > 0) { ss << "\t="; } else { ss << "\t*"; } // MPOS if(h.mate > 0) { ss << '\t' << (h.mh.second + 1); } else { ss << "\t0"; } // ISIZE ss << '\t'; if(h.mate > 0) { assert_eq(h.h.first, h.mh.first); int64_t inslen = 0; if(h.h.second > h.mh.second) { inslen = (int64_t)h.h.second - (int64_t)h.mh.second + (int64_t)h.length(); inslen = -inslen; } else { inslen = (int64_t)h.mh.second - (int64_t)h.h.second + (int64_t)h.mlen; } ss << inslen; } else { ss << '0'; } // SEQ ss << '\t' << h.patSeq; // QUAL ss << '\t' << h.quals; // // Optional fields // // Always output stratum ss << "\tXA:i:" << (int)h.stratum; // Always output cost //ss << "\tXC:i:" << (int)h.cost; // Look for SNP annotations falling within the alignment // Output MD field size_t len = length(h.patSeq); int nm = 0; int run = 0; ss << "\tMD:Z:"; const FixedBitset<1024> *mms = &h.mms; ASSERT_ONLY(const String<Dna5>* pat = &h.patSeq); const vector<char>* refcs = &h.refcs; if(h.color && false) { // Disabled: print MD:Z string w/r/t to colors, not letters mms = &h.cmms; ASSERT_ONLY(pat = &h.colSeq); assert_eq(length(h.colSeq), len+1); len = length(h.colSeq); refcs = &h.crefcs; } if(h.fw) { for (int i = 0; i < (int)len; ++ i) { if(mms->test(i)) { nm++; // There's a mismatch at this position assert_gt((int)refcs->size(), i); char refChar = toupper((*refcs)[i]); ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1])); assert_neq(refChar, qryChar); ss << run << refChar; run = 0; } else { run++; } } } else { for (int i = len-1; i >= 0; -- i) { if(mms->test(i)) { nm++; // There's a mismatch at this position assert_gt((int)refcs->size(), i); char refChar = toupper((*refcs)[i]); ASSERT_ONLY(char qryChar = (h.fw ? (*pat)[i] : (*pat)[len-i-1])); assert_neq(refChar, qryChar); ss << run << refChar; run = 0; } else { run++; } } } ss << run; // Add optional edit distance field ss << "\tNM:i:" << nm; if(h.color) ss << "\tCM:i:" << h.cmms.count(); // Add optional fields reporting the primer base and the downstream color, // which, if they were present, were clipped when the read was read in if(h.color && gReportColorPrimer) { if(h.primer != '?') { ss << "\tZP:Z:" << h.primer; assert(isprint(h.primer)); } if(h.trimc != '?') { ss << "\tZp:Z:" << h.trimc; assert(isprint(h.trimc)); } } if(xms > 0) ss << "\tXM:i:" << xms; ss << endl; }