void VcfWriterCandidateSV:: modifyTranslocInfo( const SVCandidate& sv, const bool isFirstOfPair, InfoTag_t& infoTags) const { const SVBreakend& bpA( isFirstOfPair ? sv.bp1 : sv.bp2); infoTags.push_back( str(boost::format("BND_PAIR_COUNT=%i") % bpA.getLocalPairCount()) ); infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) ); }
void VcfWriterCandidateSV:: modifyInvdelInfo( const SVCandidate& sv, const bool isBp1First, InfoTag_t& infoTags) const { const SVBreakend& bpA( isBp1First ? sv.bp1 : sv.bp2); const SVBreakend& bpB( isBp1First ? sv.bp2 : sv.bp1); infoTags.push_back( str(boost::format("UPSTREAM_PAIR_COUNT=%i") % bpA.getLocalPairCount()) ); infoTags.push_back( str(boost::format("DOWNSTREAM_PAIR_COUNT=%i") % bpB.getLocalPairCount()) ); infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) ); }
static void addDebugInfo( const SVBreakend& bp1, const SVBreakend& bp2, const bool isFirstOfPair, const SVCandidateAssemblyData& assemblyData, InfoTag_t& infotags) { if (! isFirstOfPair) return; // store alignment start + cigar string for each section of the jumping alignment. // there can be several contigs per breakend, so we iterate over all of them. // only the first breakpoint gets the alignments attached to its VCF entry if (assemblyData.isSpanning) { const unsigned numAlign(assemblyData.spanningAlignments.size()); std::string cigar1; std::string cigar2; for (unsigned alignIndex(0); alignIndex<numAlign; ++alignIndex) { const SVCandidateAssemblyData::JumpAlignmentResultType align(assemblyData.spanningAlignments[alignIndex]); infotags.push_back( str(boost::format("CTG_JALIGN_%i_POS_A=%d") % alignIndex % (bp1.interval.range.begin_pos()+align.align1.beginPos)) ); infotags.push_back( str(boost::format("CTG_JALIGN_%i_POS_B=%d") % alignIndex % (bp2.interval.range.begin_pos()+align.align2.beginPos)) ); apath_to_cigar(align.align1.apath,cigar1); apath_to_cigar(align.align2.apath,cigar2); infotags.push_back( str(boost::format("CTG_JALIGN_%i_CIGAR_A=%s") % alignIndex % cigar1) ); infotags.push_back( str(boost::format("CTG_JALIGN_%i_CIGAR_B=%s") % alignIndex % cigar2) ); } } }
void VcfWriterRnaSV:: modifyTranslocInfo( const SVCandidate& sv, const bool isFirstOfPair, const SVCandidateAssemblyData& assemblyData, InfoTag_t& infotags) const { const SVScoreInfo& baseInfo(getBaseInfo()); infotags.push_back( str(boost::format("BND_DEPTH=%i") % (isFirstOfPair ? baseInfo.bp1MaxDepth : baseInfo.bp2MaxDepth) ) ); infotags.push_back( str(boost::format("MATE_BND_DEPTH=%i") % (isFirstOfPair ? baseInfo.bp2MaxDepth : baseInfo.bp1MaxDepth) ) ); { ///TODO better multisample handler here: const unsigned sampleIndex(0); const SVSampleAlleleInfo& refinfo(baseInfo.samples[sampleIndex].ref); infotags.push_back(str(boost::format("REF_COUNT=%i") % (isFirstOfPair ? refinfo.confidentSplitReadAndPairCountRefBp1 : refinfo.confidentSplitReadAndPairCountRefBp2))); infotags.push_back(str(boost::format("MATE_REF_COUNT=%i") % (isFirstOfPair ? refinfo.confidentSplitReadAndPairCountRefBp2 : refinfo.confidentSplitReadAndPairCountRefBp1))); } { // if (!assemblyData.isSpanning) return; const bool isFirst = (assemblyData.bporient.isBp1First == isFirstOfPair); if (isFirst) infotags.push_back("RNA_FIRST"); if (assemblyData.bporient.isTranscriptStrandKnown) infotags.push_back("RNA_STRANDED"); if (!isFirstOfPair) return; // only the first breakpoint gets the additional RNA info attached to its VCF entry infotags.push_back(str(boost::format("RNA_FwRvReads=%i,%i") % sv.forwardTranscriptStrandReadCount % sv.reverseTranscriptStrandReadCount)); infotags.push_back(str(boost::format("RNA_Reads=%i") % sv.bp2.lowresEvidence.getTotal())); const unsigned numContigs(assemblyData.contigs.size()); if (numContigs > 0) { if (numContigs != assemblyData.spanningAlignments.size()) infotags.push_back(str(boost::format("ERROR=%i,%i") % numContigs % assemblyData.spanningAlignments.size())); const unsigned int bestAlignmentIdx(assemblyData.bestAlignmentIndex); if (numContigs <= bestAlignmentIdx) infotags.push_back(str(boost::format("ERROR2=%i,%i") % numContigs % bestAlignmentIdx)); infotags.push_back(str(boost::format("RNA_CONTIG=%s") % assemblyData.contigs[bestAlignmentIdx].seq)); const auto& bestAlignment(assemblyData.spanningAlignments[bestAlignmentIdx]); infotags.push_back(str(boost::format("RNA_CONTIG_ALN=%i,%i") % apath_matched_length(bestAlignment.align1.apath) % apath_matched_length(bestAlignment.align2.apath))); } } #ifdef DEBUG_VCF addDebugInfo(isFirstOfPair, sv, assemblyData, infotags); #endif }
void VcfWriterSV:: writeInvdel( const SVCandidate& sv, const SVId& svId, const SVCandidateAssemblyData& /*adata*/, const bool isIndel, const EventInfo& event) { const bool isImprecise(sv.isImprecise()); const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift()); const bool isBp1First(sv.bp1.interval.range.begin_pos()<=sv.bp2.interval.range.begin_pos()); const SVBreakend& bpA(isBp1First ? sv.bp1 : sv.bp2); const SVBreakend& bpB(isBp1First ? sv.bp2 : sv.bp1); InfoTag_t infoTags; SampleTag_t sampleTags; // get CHROM const std::string& chrom(_header.chrom_data[sv.bp1.interval.tid].label); const known_pos_range2& bpArange(bpA.interval.range); const known_pos_range2& bpBrange(bpB.interval.range); if (! isImprecise) { assert(bpArange.size() == bpBrange.size()); } // above this size all records use symbolic alleles (ie. <DEL>): static const unsigned maxNonSymbolicRecordSize(1000); // if the variant is a combination of simple insertion and deletions, and below // a large-event size threshold, it is classified as a small variant. In this case // we report the event using full REF and ALT sequences, plus a CIGAR string for // complex in/del combinations // bool isSmallVariant(false); if ((! isImprecise) && isIndel && (! sv.isUnknownSizeInsertion)) { const unsigned deleteSize(bpBrange.begin_pos() - bpArange.begin_pos()); const unsigned insertSize(sv.insertSeq.size()); const bool isSmallDelete(deleteSize<=maxNonSymbolicRecordSize); const bool isSmallInsert(insertSize<=maxNonSymbolicRecordSize); isSmallVariant = (isSmallDelete && isSmallInsert); } // get POS and endPos pos_t pos(bpArange.center_pos()+1); pos_t endPos(bpBrange.center_pos()); if (! isImprecise) { pos = bpArange.begin_pos()+1; if (isBreakendRangeSameShift) { endPos = bpBrange.begin_pos(); } else { endPos = bpBrange.end_pos()-1; } } else { /// check against the rare case arising when CIEND is a subset of CIPOS: endPos=std::max(endPos,pos+1); } if (pos<1) return; // get REF std::string ref; { const pos_t beginRefPos(pos-1); pos_t endRefPos(beginRefPos); if (isSmallVariant) endRefPos=endPos-1; get_standardized_region_seq(_referenceFilename, chrom, beginRefPos, endRefPos, ref); assert(static_cast<unsigned>(1+endRefPos-beginRefPos) == ref.size()); } // build alt: std::string alt; if (isSmallVariant) { alt = ref[0] + sv.insertSeq; } else { alt = str( boost::format("<%s>") % svId.getLabel()); } // build INFO field std::vector<std::string> words; split_string(svId.getLabel(),':',words); { // note that there's a reasonable argument for displaying these tags only when a // symbolic allele is used (by a strict reading of the vcf spec) -- we instead // print these fields for all variants for uniformity within the manta vcf: // infoTags.push_back( str(boost::format("END=%i") % endPos)); infoTags.push_back( str(boost::format("SVTYPE=%s") % words[0])); const pos_t refLen(endPos-pos); pos_t svLen(refLen); if (! sv.isUnknownSizeInsertion) { if (isIndel) { const pos_t insertLen(static_cast<pos_t>(sv.insertSeq.size())); if ( insertLen > refLen ) { svLen = insertLen; } else { svLen = -refLen; } } infoTags.push_back( str(boost::format("SVLEN=%i") % (svLen))); } } infoTags.push_back( str(boost::format("UPSTREAM_PAIR_COUNT=%i") % bpA.getLocalPairCount()) ); infoTags.push_back( str(boost::format("DOWNSTREAM_PAIR_COUNT=%i") % bpB.getLocalPairCount()) ); infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) ); if (isSmallVariant) { if (! sv.insertAlignment.empty()) { std::string cigar; apath_to_cigar(sv.insertAlignment,cigar); // add the 1M to signify the leading reference base: infoTags.push_back( str(boost::format("CIGAR=1M%s") % cigar)); } } if (isImprecise) { infoTags.push_back("IMPRECISE"); } if (bpArange.size() > 1) { infoTags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) )); } if (! isSmallVariant) { if (bpBrange.size() > 1) { infoTags.push_back( str( boost::format("CIEND=%i,%i") % (bpBrange.begin_pos() - endPos) % ((bpBrange.end_pos()-1) - endPos) )); } } if (! isImprecise) { if (bpArange.size() > 1) { infoTags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) )); std::string homref; get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref); infoTags.push_back( str( boost::format("HOMSEQ=%s") % (homref) )); } } if (! isSmallVariant) { if (! (sv.insertSeq.empty() || sv.isUnknownSizeInsertion)) { infoTags.push_back( str( boost::format("SVINSLEN=%i") % (sv.insertSeq.size()) )); if (isBp1First || (bpA.state != bpB.state)) { infoTags.push_back( str( boost::format("SVINSSEQ=%s") % (sv.insertSeq) )); } else { infoTags.push_back( str( boost::format("SVINSSEQ=%s") % reverseCompCopyStr(sv.insertSeq) )); } } } if (sv.isUnknownSizeInsertion) { if (! sv.unknownSizeInsertionLeftSeq.empty()) { infoTags.push_back( str( boost::format("LEFT_SVINSSEQ=%s") % (sv.unknownSizeInsertionLeftSeq) )); } if (! sv.unknownSizeInsertionRightSeq.empty()) { infoTags.push_back( str( boost::format("RIGHT_SVINSSEQ=%s") % (sv.unknownSizeInsertionRightSeq) )); } } if (svId.svType == EXTENDED_SV_TYPE::INVERSION) { if (sv.bp1.state == SVBreakendState::RIGHT_OPEN) { infoTags.push_back("INV3"); } else if (sv.bp1.state == SVBreakendState::LEFT_OPEN) { infoTags.push_back("INV5"); } else { assert(false && "Unexpected inversion configuration"); } } addSharedInfo(event, infoTags); modifyInfo(event, infoTags); modifySample(sv, sampleTags); // write out record: _os << chrom << '\t' << pos << '\t' << svId.localId // ID << '\t' << ref // REF << '\t' << alt // ALT << '\t'; writeQual(); _os << '\t'; writeFilter(); _os << '\t'; makeInfoField(infoTags,_os); // INFO makeFormatSampleField(sampleTags, _os); // FORMAT + SAMPLE _os << '\n'; }
void VcfWriterSV:: writeTransloc( const SVCandidate& sv, const SVId& svId, const bool isFirstBreakend, const SVCandidateSetData& /*svData*/, const SVCandidateAssemblyData& /*adata*/, const EventInfo& event) { const bool isImprecise(sv.isImprecise()); const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift()); const SVBreakend& bpA( isFirstBreakend ? sv.bp1 : sv.bp2); const SVBreakend& bpB( isFirstBreakend ? sv.bp2 : sv.bp1); InfoTag_t infotags; SampleTag_t sampletags; // get CHROM const std::string& chrom(_header.chrom_data[bpA.interval.tid].label); const std::string& mateChrom(_header.chrom_data[bpB.interval.tid].label); const known_pos_range2& bpArange(bpA.interval.range); const known_pos_range2& bpBrange(bpB.interval.range); if (! isImprecise) { assert(bpArange.size() == bpBrange.size()); } // get POS pos_t pos(bpArange.center_pos()+1); pos_t matePos(bpBrange.center_pos()+1); if (! isImprecise) { pos = bpArange.begin_pos()+1; if (isBreakendRangeSameShift) { matePos = bpBrange.begin_pos()+1; } else { matePos = bpBrange.end_pos(); } } // get ID const std::string& localId(isFirstBreakend ? svId.localId : svId.mateId); const std::string& mateId(isFirstBreakend ? svId.mateId : svId.localId); // get REF std::string ref; get_standardized_region_seq(_referenceFilename,chrom,pos-1,pos-1,ref); assert(1 == ref.size()); const bool isReverseInsertSeq(! (isFirstBreakend || (bpA.state != bpB.state))); std::string tmpString; const std::string* insertSeqPtr(&sv.insertSeq); if (isReverseInsertSeq) { tmpString = reverseCompCopyStr(sv.insertSeq); insertSeqPtr = &tmpString; } const std::string& insertSeq(*insertSeqPtr); // build alt: boost::format altFormat("%4%%3%%1%:%2%%3%%5%"); { std::string altPrefix; std::string altSuffix; if (bpA.state == SVBreakendState::RIGHT_OPEN) { altPrefix = ref + insertSeq; } else if (bpA.state == SVBreakendState::LEFT_OPEN) { altSuffix = insertSeq + ref; } else { assert(false && "Unexpected bpA.state"); } char altSep('?'); if (bpB.state == SVBreakendState::RIGHT_OPEN) { altSep=']'; } else if (bpB.state == SVBreakendState::LEFT_OPEN) { altSep='['; } else { assert(false && "Unexpected bpB.state"); } altFormat % mateChrom % matePos % altSep % altPrefix % altSuffix; } // build INFO field infotags.push_back("SVTYPE=BND"); infotags.push_back("MATEID="+mateId); infotags.push_back( str(boost::format("BND_PAIR_COUNT=%i") % bpA.getLocalPairCount()) ); infotags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) ); if (isImprecise) { infotags.push_back("IMPRECISE"); } if (bpArange.size() > 1) { infotags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) )); } if (! isImprecise) { if (bpArange.size() > 1) { infotags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) )); std::string homref; get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref); infotags.push_back( str( boost::format("HOMSEQ=%s") % (homref) )); } } if (! insertSeq.empty()) { infotags.push_back( str( boost::format("SVINSLEN=%i") % (insertSeq.size()) )); infotags.push_back( str( boost::format("SVINSSEQ=%s") % (insertSeq) )); } addSharedInfo(event, infotags); modifyInfo(event, infotags); modifyTranslocInfo(isFirstBreakend, infotags); modifySample(sv, sampletags); #ifdef DEBUG_VCF addDebugInfo(bpA, bpB, isFirstBreakend, adata, infotags); #endif // write out record: _os << chrom << '\t' << pos << '\t' << localId // ID << '\t' << ref // REF << '\t' << str( altFormat ) // ALT << '\t'; writeQual(); _os << '\t'; writeFilter(); _os << '\t'; makeInfoField(infotags,_os); // INFO makeFormatSampleField(sampletags, _os); // FORMAT + SAMPLE _os << '\n'; }