void VcfWriterRnaSV:: modifySample( const SVCandidate& sv, SampleTag_t& sampletags) const { const SVScoreInfo& baseInfo(getBaseInfo()); const unsigned sampleCount(baseInfo.samples.size()); std::vector<std::string> values(sampleCount); for (unsigned sampleIndex(0); sampleIndex<sampleCount; ++sampleIndex) { const SVSampleInfo& sampleInfo(baseInfo.samples[sampleIndex]); values[sampleIndex] = str(boost::format("%i,%i") % sampleInfo.ref.spanningPairCount % sampleInfo.alt.spanningPairCount); } sampletags.push_back(std::make_pair("PR", values)); if (sv.isImprecise()) return; for (unsigned sampleIndex(0); sampleIndex<sampleCount; ++sampleIndex) { const SVSampleInfo& sampleInfo(baseInfo.samples[sampleIndex]); values[sampleIndex] = str( boost::format("%i,%i") % sampleInfo.ref.splitReadCount % sampleInfo.alt.splitReadCount); } sampletags.push_back(std::make_pair("SR",values)); }
static float largeNoiseSVPriorWeight( const SVCandidate& sv) { static const int smallSize(5000); static const int largeSize(10000); static const LinearScaler<int> svSizeRamp(smallSize, largeSize); return svSizeRamp.getScale(sv.centerSize()); }
/// when an sv is treated as 'small', we skip all paired-read evidence and rely on split reads only: /// /// with further model improvements we can add pairs back into the small variant calls: /// /// this function returns 1 for a variant which is "fully large" and 0 for a variant which is "fully small", /// with intermediate values for sizes in between /// static float getSpanningPairWeight( const SVCandidate& sv, const bool isSmallAssembler) { static const int minSmallSize(300); static const int maxSmallSize(500); static const LinearScaler<int> svSizeRamp(minSmallSize, maxSmallSize); if (! isSmallAssembler) return 1.f; return svSizeRamp.getScale(sv.centerSize()); }
void VcfWriterSomaticSV:: modifySample( const SVCandidate& sv, SampleTag_t& sampletags) const { const SVScoreInfo& baseInfo(getBaseInfo()); std::vector<std::string> values(2); static const std::string pairTag("PR"); values[0] = str( boost::format("%i,%i") % baseInfo.normal.ref.confidentSpanningPairCount % baseInfo.normal.alt.confidentSpanningPairCount); values[1] = str( boost::format("%i,%i") % baseInfo.tumor.ref.confidentSpanningPairCount % baseInfo.tumor.alt.confidentSpanningPairCount); sampletags.push_back(std::make_pair(pairTag,values)); if (sv.isImprecise()) return; static const std::string srTag("SR"); values[0] = str( boost::format("%i,%i") % baseInfo.normal.ref.confidentSplitReadCount % baseInfo.normal.alt.confidentSplitReadCount); values[1] = str( boost::format("%i,%i") % baseInfo.tumor.ref.confidentSplitReadCount % baseInfo.tumor.alt.confidentSplitReadCount); sampletags.push_back(std::make_pair(srTag,values)); }
void VcfWriterSV:: writeInvdel( const SVCandidate& sv, const SVId& svId, const SVCandidateAssemblyData& /*adata*/, const bool isIndel, const EventInfo& event) { const bool isImprecise(sv.isImprecise()); const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift()); const bool isBp1First(sv.bp1.interval.range.begin_pos()<=sv.bp2.interval.range.begin_pos()); const SVBreakend& bpA(isBp1First ? sv.bp1 : sv.bp2); const SVBreakend& bpB(isBp1First ? sv.bp2 : sv.bp1); InfoTag_t infoTags; SampleTag_t sampleTags; // get CHROM const std::string& chrom(_header.chrom_data[sv.bp1.interval.tid].label); const known_pos_range2& bpArange(bpA.interval.range); const known_pos_range2& bpBrange(bpB.interval.range); if (! isImprecise) { assert(bpArange.size() == bpBrange.size()); } // above this size all records use symbolic alleles (ie. <DEL>): static const unsigned maxNonSymbolicRecordSize(1000); // if the variant is a combination of simple insertion and deletions, and below // a large-event size threshold, it is classified as a small variant. In this case // we report the event using full REF and ALT sequences, plus a CIGAR string for // complex in/del combinations // bool isSmallVariant(false); if ((! isImprecise) && isIndel && (! sv.isUnknownSizeInsertion)) { const unsigned deleteSize(bpBrange.begin_pos() - bpArange.begin_pos()); const unsigned insertSize(sv.insertSeq.size()); const bool isSmallDelete(deleteSize<=maxNonSymbolicRecordSize); const bool isSmallInsert(insertSize<=maxNonSymbolicRecordSize); isSmallVariant = (isSmallDelete && isSmallInsert); } // get POS and endPos pos_t pos(bpArange.center_pos()+1); pos_t endPos(bpBrange.center_pos()); if (! isImprecise) { pos = bpArange.begin_pos()+1; if (isBreakendRangeSameShift) { endPos = bpBrange.begin_pos(); } else { endPos = bpBrange.end_pos()-1; } } else { /// check against the rare case arising when CIEND is a subset of CIPOS: endPos=std::max(endPos,pos+1); } if (pos<1) return; // get REF std::string ref; { const pos_t beginRefPos(pos-1); pos_t endRefPos(beginRefPos); if (isSmallVariant) endRefPos=endPos-1; get_standardized_region_seq(_referenceFilename, chrom, beginRefPos, endRefPos, ref); assert(static_cast<unsigned>(1+endRefPos-beginRefPos) == ref.size()); } // build alt: std::string alt; if (isSmallVariant) { alt = ref[0] + sv.insertSeq; } else { alt = str( boost::format("<%s>") % svId.getLabel()); } // build INFO field std::vector<std::string> words; split_string(svId.getLabel(),':',words); { // note that there's a reasonable argument for displaying these tags only when a // symbolic allele is used (by a strict reading of the vcf spec) -- we instead // print these fields for all variants for uniformity within the manta vcf: // infoTags.push_back( str(boost::format("END=%i") % endPos)); infoTags.push_back( str(boost::format("SVTYPE=%s") % words[0])); const pos_t refLen(endPos-pos); pos_t svLen(refLen); if (! sv.isUnknownSizeInsertion) { if (isIndel) { const pos_t insertLen(static_cast<pos_t>(sv.insertSeq.size())); if ( insertLen > refLen ) { svLen = insertLen; } else { svLen = -refLen; } } infoTags.push_back( str(boost::format("SVLEN=%i") % (svLen))); } } infoTags.push_back( str(boost::format("UPSTREAM_PAIR_COUNT=%i") % bpA.getLocalPairCount()) ); infoTags.push_back( str(boost::format("DOWNSTREAM_PAIR_COUNT=%i") % bpB.getLocalPairCount()) ); infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) ); if (isSmallVariant) { if (! sv.insertAlignment.empty()) { std::string cigar; apath_to_cigar(sv.insertAlignment,cigar); // add the 1M to signify the leading reference base: infoTags.push_back( str(boost::format("CIGAR=1M%s") % cigar)); } } if (isImprecise) { infoTags.push_back("IMPRECISE"); } if (bpArange.size() > 1) { infoTags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) )); } if (! isSmallVariant) { if (bpBrange.size() > 1) { infoTags.push_back( str( boost::format("CIEND=%i,%i") % (bpBrange.begin_pos() - endPos) % ((bpBrange.end_pos()-1) - endPos) )); } } if (! isImprecise) { if (bpArange.size() > 1) { infoTags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) )); std::string homref; get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref); infoTags.push_back( str( boost::format("HOMSEQ=%s") % (homref) )); } } if (! isSmallVariant) { if (! (sv.insertSeq.empty() || sv.isUnknownSizeInsertion)) { infoTags.push_back( str( boost::format("SVINSLEN=%i") % (sv.insertSeq.size()) )); if (isBp1First || (bpA.state != bpB.state)) { infoTags.push_back( str( boost::format("SVINSSEQ=%s") % (sv.insertSeq) )); } else { infoTags.push_back( str( boost::format("SVINSSEQ=%s") % reverseCompCopyStr(sv.insertSeq) )); } } } if (sv.isUnknownSizeInsertion) { if (! sv.unknownSizeInsertionLeftSeq.empty()) { infoTags.push_back( str( boost::format("LEFT_SVINSSEQ=%s") % (sv.unknownSizeInsertionLeftSeq) )); } if (! sv.unknownSizeInsertionRightSeq.empty()) { infoTags.push_back( str( boost::format("RIGHT_SVINSSEQ=%s") % (sv.unknownSizeInsertionRightSeq) )); } } if (svId.svType == EXTENDED_SV_TYPE::INVERSION) { if (sv.bp1.state == SVBreakendState::RIGHT_OPEN) { infoTags.push_back("INV3"); } else if (sv.bp1.state == SVBreakendState::LEFT_OPEN) { infoTags.push_back("INV5"); } else { assert(false && "Unexpected inversion configuration"); } } addSharedInfo(event, infoTags); modifyInfo(event, infoTags); modifySample(sv, sampleTags); // write out record: _os << chrom << '\t' << pos << '\t' << svId.localId // ID << '\t' << ref // REF << '\t' << alt // ALT << '\t'; writeQual(); _os << '\t'; writeFilter(); _os << '\t'; makeInfoField(infoTags,_os); // INFO makeFormatSampleField(sampleTags, _os); // FORMAT + SAMPLE _os << '\n'; }
void VcfWriterSV:: writeTransloc( const SVCandidate& sv, const SVId& svId, const bool isFirstBreakend, const SVCandidateSetData& /*svData*/, const SVCandidateAssemblyData& /*adata*/, const EventInfo& event) { const bool isImprecise(sv.isImprecise()); const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift()); const SVBreakend& bpA( isFirstBreakend ? sv.bp1 : sv.bp2); const SVBreakend& bpB( isFirstBreakend ? sv.bp2 : sv.bp1); InfoTag_t infotags; SampleTag_t sampletags; // get CHROM const std::string& chrom(_header.chrom_data[bpA.interval.tid].label); const std::string& mateChrom(_header.chrom_data[bpB.interval.tid].label); const known_pos_range2& bpArange(bpA.interval.range); const known_pos_range2& bpBrange(bpB.interval.range); if (! isImprecise) { assert(bpArange.size() == bpBrange.size()); } // get POS pos_t pos(bpArange.center_pos()+1); pos_t matePos(bpBrange.center_pos()+1); if (! isImprecise) { pos = bpArange.begin_pos()+1; if (isBreakendRangeSameShift) { matePos = bpBrange.begin_pos()+1; } else { matePos = bpBrange.end_pos(); } } // get ID const std::string& localId(isFirstBreakend ? svId.localId : svId.mateId); const std::string& mateId(isFirstBreakend ? svId.mateId : svId.localId); // get REF std::string ref; get_standardized_region_seq(_referenceFilename,chrom,pos-1,pos-1,ref); assert(1 == ref.size()); const bool isReverseInsertSeq(! (isFirstBreakend || (bpA.state != bpB.state))); std::string tmpString; const std::string* insertSeqPtr(&sv.insertSeq); if (isReverseInsertSeq) { tmpString = reverseCompCopyStr(sv.insertSeq); insertSeqPtr = &tmpString; } const std::string& insertSeq(*insertSeqPtr); // build alt: boost::format altFormat("%4%%3%%1%:%2%%3%%5%"); { std::string altPrefix; std::string altSuffix; if (bpA.state == SVBreakendState::RIGHT_OPEN) { altPrefix = ref + insertSeq; } else if (bpA.state == SVBreakendState::LEFT_OPEN) { altSuffix = insertSeq + ref; } else { assert(false && "Unexpected bpA.state"); } char altSep('?'); if (bpB.state == SVBreakendState::RIGHT_OPEN) { altSep=']'; } else if (bpB.state == SVBreakendState::LEFT_OPEN) { altSep='['; } else { assert(false && "Unexpected bpB.state"); } altFormat % mateChrom % matePos % altSep % altPrefix % altSuffix; } // build INFO field infotags.push_back("SVTYPE=BND"); infotags.push_back("MATEID="+mateId); infotags.push_back( str(boost::format("BND_PAIR_COUNT=%i") % bpA.getLocalPairCount()) ); infotags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) ); if (isImprecise) { infotags.push_back("IMPRECISE"); } if (bpArange.size() > 1) { infotags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) )); } if (! isImprecise) { if (bpArange.size() > 1) { infotags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) )); std::string homref; get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref); infotags.push_back( str( boost::format("HOMSEQ=%s") % (homref) )); } } if (! insertSeq.empty()) { infotags.push_back( str( boost::format("SVINSLEN=%i") % (insertSeq.size()) )); infotags.push_back( str( boost::format("SVINSSEQ=%s") % (insertSeq) )); } addSharedInfo(event, infotags); modifyInfo(event, infotags); modifyTranslocInfo(isFirstBreakend, infotags); modifySample(sv, sampletags); #ifdef DEBUG_VCF addDebugInfo(bpA, bpB, isFirstBreakend, adata, infotags); #endif // write out record: _os << chrom << '\t' << pos << '\t' << localId // ID << '\t' << ref // REF << '\t' << str( altFormat ) // ALT << '\t'; writeQual(); _os << '\t'; writeFilter(); _os << '\t'; makeInfoField(infotags,_os); // INFO makeFormatSampleField(sampletags, _os); // FORMAT + SAMPLE _os << '\n'; }