예제 #1
0
void
VcfWriterCandidateSV::
modifyTranslocInfo(
    const SVCandidate& sv,
    const bool isFirstOfPair,
    InfoTag_t& infoTags) const
{
    const SVBreakend& bpA( isFirstOfPair ? sv.bp1 : sv.bp2);

    infoTags.push_back( str(boost::format("BND_PAIR_COUNT=%i") % bpA.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) );
}
예제 #2
0
void
VcfWriterCandidateSV::
modifyInvdelInfo(
    const SVCandidate& sv,
    const bool isBp1First,
    InfoTag_t& infoTags) const
{
    const SVBreakend& bpA( isBp1First ? sv.bp1 : sv.bp2);
    const SVBreakend& bpB( isBp1First ? sv.bp2 : sv.bp1);

    infoTags.push_back( str(boost::format("UPSTREAM_PAIR_COUNT=%i") % bpA.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("DOWNSTREAM_PAIR_COUNT=%i") % bpB.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) );
}
예제 #3
0
/// \brief Append debug INFO tags describing every spanning (jump) alignment.
///
/// For each entry of assemblyData.spanningAlignments, four tags are appended:
/// the start position of each alignment half (offset by the begin position of
/// the corresponding breakend interval) and the CIGAR string of each half.
///
/// Nothing is appended unless isFirstOfPair is true and assemblyData.isSpanning is set.
///
/// \param bp1            breakend whose interval start offsets the "A" positions
/// \param bp2            breakend whose interval start offsets the "B" positions
/// \param isFirstOfPair  only the first breakend's record receives these tags
/// \param assemblyData   source of the spanning alignments
/// \param infotags       INFO tag list, appended to in place
static
void
addDebugInfo(
    const SVBreakend& bp1,
    const SVBreakend& bp2,
    const bool isFirstOfPair,
    const SVCandidateAssemblyData& assemblyData,
    InfoTag_t& infotags)
{
    // only the first breakpoint gets the alignments attached to its VCF entry
    if (! isFirstOfPair) return;

    // store alignment start + cigar string for each section of the jumping alignment.
    // there can be several contigs per breakend, so we iterate over all of them.
    if (! assemblyData.isSpanning) return;

    const unsigned numAlign(assemblyData.spanningAlignments.size());

    // scratch buffers shared across iterations
    // NOTE(review): presumably apath_to_cigar overwrites its output argument -- confirm
    std::string cigar1;
    std::string cigar2;
    for (unsigned alignIndex(0); alignIndex<numAlign; ++alignIndex)
    {
        // bind by reference: copying the alignment result per iteration is unnecessary
        const SVCandidateAssemblyData::JumpAlignmentResultType& align(assemblyData.spanningAlignments[alignIndex]);

        infotags.push_back( str(boost::format("CTG_JALIGN_%i_POS_A=%d") %
                                alignIndex %
                                (bp1.interval.range.begin_pos()+align.align1.beginPos)) );
        infotags.push_back( str(boost::format("CTG_JALIGN_%i_POS_B=%d") %
                                alignIndex %
                                (bp2.interval.range.begin_pos()+align.align2.beginPos)) );

        apath_to_cigar(align.align1.apath,cigar1);
        apath_to_cigar(align.align2.apath,cigar2);

        infotags.push_back( str(boost::format("CTG_JALIGN_%i_CIGAR_A=%s") % alignIndex % cigar1) );
        infotags.push_back( str(boost::format("CTG_JALIGN_%i_CIGAR_B=%s") % alignIndex % cigar2) );
    }
}
예제 #4
0
void
VcfWriterRnaSV::
modifyTranslocInfo(
    const SVCandidate& sv,
    const bool isFirstOfPair,
    const SVCandidateAssemblyData& assemblyData,
    InfoTag_t& infotags) const
{
    const SVScoreInfo& baseInfo(getBaseInfo());

    infotags.push_back( str(boost::format("BND_DEPTH=%i") %
                            (isFirstOfPair ? baseInfo.bp1MaxDepth : baseInfo.bp2MaxDepth) ) );
    infotags.push_back( str(boost::format("MATE_BND_DEPTH=%i") %
                            (isFirstOfPair ? baseInfo.bp2MaxDepth : baseInfo.bp1MaxDepth) ) );
    {
        ///TODO better multisample handler here:
        const unsigned sampleIndex(0);

        const SVSampleAlleleInfo& refinfo(baseInfo.samples[sampleIndex].ref);
        infotags.push_back(str(boost::format("REF_COUNT=%i") %
                               (isFirstOfPair ? refinfo.confidentSplitReadAndPairCountRefBp1 : refinfo.confidentSplitReadAndPairCountRefBp2)));
        infotags.push_back(str(boost::format("MATE_REF_COUNT=%i") %
                               (isFirstOfPair ? refinfo.confidentSplitReadAndPairCountRefBp2 : refinfo.confidentSplitReadAndPairCountRefBp1)));
    }
    {
        // if (!assemblyData.isSpanning) return;

        const bool isFirst = (assemblyData.bporient.isBp1First == isFirstOfPair);
        if (isFirst) infotags.push_back("RNA_FIRST");
        if (assemblyData.bporient.isTranscriptStrandKnown) infotags.push_back("RNA_STRANDED");

        if (!isFirstOfPair) return; // only the first breakpoint gets the additional RNA info attached to its VCF entry

        infotags.push_back(str(boost::format("RNA_FwRvReads=%i,%i") % sv.forwardTranscriptStrandReadCount % sv.reverseTranscriptStrandReadCount));
        infotags.push_back(str(boost::format("RNA_Reads=%i") % sv.bp2.lowresEvidence.getTotal()));
        const unsigned numContigs(assemblyData.contigs.size());
        if (numContigs > 0)
        {
            if (numContigs != assemblyData.spanningAlignments.size())
                infotags.push_back(str(boost::format("ERROR=%i,%i") % numContigs % assemblyData.spanningAlignments.size()));
            const unsigned int bestAlignmentIdx(assemblyData.bestAlignmentIndex);
            if (numContigs <= bestAlignmentIdx)
                infotags.push_back(str(boost::format("ERROR2=%i,%i") % numContigs % bestAlignmentIdx));
            infotags.push_back(str(boost::format("RNA_CONTIG=%s") % assemblyData.contigs[bestAlignmentIdx].seq));
            const auto& bestAlignment(assemblyData.spanningAlignments[bestAlignmentIdx]);
            infotags.push_back(str(boost::format("RNA_CONTIG_ALN=%i,%i")
                                   % apath_matched_length(bestAlignment.align1.apath)
                                   % apath_matched_length(bestAlignment.align2.apath)));
        }
    }
#ifdef DEBUG_VCF
    addDebugInfo(isFirstOfPair, sv, assemblyData, infotags);
#endif
}
예제 #5
0
/// \brief Write a single VCF record for an intrachromosomal SV (inversion/deletion/insertion style).
///
/// The record is written to _os as CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO and
/// FORMAT/SAMPLE columns followed by a newline. Nothing is written if the computed
/// POS is less than 1.
///
/// \param sv       candidate providing the two breakends and any insertion sequence
/// \param svId     supplies the record ID (localId), the label used for SVTYPE and the
///                 symbolic ALT allele, and svType
/// \param isIndel  enables the small-variant (explicit REF/ALT) representation and the
///                 indel-specific SVLEN sign convention
/// \param event    forwarded to addSharedInfo() and modifyInfo()
///
/// The SVCandidateAssemblyData argument is unused in this function.
void
VcfWriterSV::
writeInvdel(
    const SVCandidate& sv,
    const SVId& svId,
    const SVCandidateAssemblyData& /*adata*/,
    const bool isIndel,
    const EventInfo& event)
{
    const bool isImprecise(sv.isImprecise());
    const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift());

    // order the breakends so that bpA is the one with the smaller (or equal) begin position
    const bool isBp1First(sv.bp1.interval.range.begin_pos()<=sv.bp2.interval.range.begin_pos());

    const SVBreakend& bpA(isBp1First ? sv.bp1 : sv.bp2);
    const SVBreakend& bpB(isBp1First ? sv.bp2 : sv.bp1);

    InfoTag_t infoTags;
    SampleTag_t sampleTags;

    // get CHROM
    // NOTE(review): taken from bp1's tid only -- presumably both breakends share a chromosome here; confirm with callers
    const std::string& chrom(_header.chrom_data[sv.bp1.interval.tid].label);

    const known_pos_range2& bpArange(bpA.interval.range);
    const known_pos_range2& bpBrange(bpB.interval.range);

    // precise calls are expected to carry equally sized ranges on both breakends
    if (! isImprecise)
    {
        assert(bpArange.size() == bpBrange.size());
    }

    // above this size all records use symbolic alleles (ie. <DEL>):
    static const unsigned maxNonSymbolicRecordSize(1000);

    // if the variant is a combination of simple insertion and deletions, and below
    // a large-event size threshold, it is classified as a small variant. In this case
    // we report the event using full REF and ALT sequences, plus a CIGAR string for
    // complex in/del combinations
    //
    bool isSmallVariant(false);
    if ((! isImprecise) && isIndel && (! sv.isUnknownSizeInsertion))
    {
        const unsigned deleteSize(bpBrange.begin_pos() - bpArange.begin_pos());
        const unsigned insertSize(sv.insertSeq.size());

        const bool isSmallDelete(deleteSize<=maxNonSymbolicRecordSize);
        const bool isSmallInsert(insertSize<=maxNonSymbolicRecordSize);

        isSmallVariant = (isSmallDelete && isSmallInsert);
    }

    // get POS and endPos
    // imprecise calls start from the range centers; precise calls are overwritten below
    pos_t pos(bpArange.center_pos()+1);
    pos_t endPos(bpBrange.center_pos());
    if (! isImprecise)
    {
        pos = bpArange.begin_pos()+1;
        if (isBreakendRangeSameShift)
        {
            endPos = bpBrange.begin_pos();
        }
        else
        {
            endPos = bpBrange.end_pos()-1;
        }
    }
    else
    {
        /// check against the rare case arising when CIEND is a subset of CIPOS:
        endPos=std::max(endPos,pos+1);
    }

    // a POS below 1 cannot be reported; the record is silently dropped
    if (pos<1) return;

    // get REF
    // a single reference base at pos-1, extended through endPos-1 for small variants
    std::string ref;
    {
        const pos_t beginRefPos(pos-1);
        pos_t endRefPos(beginRefPos);
        if (isSmallVariant) endRefPos=endPos-1;

        get_standardized_region_seq(_referenceFilename, chrom, beginRefPos, endRefPos, ref);

        assert(static_cast<unsigned>(1+endRefPos-beginRefPos) == ref.size());
    }

    // build alt:
    // small variants: leading reference base + inserted sequence; otherwise a symbolic <LABEL> allele
    std::string alt;
    if (isSmallVariant)
    {
        alt = ref[0] + sv.insertSeq;
    }
    else
    {
        alt = str( boost::format("<%s>") % svId.getLabel());
    }

    // build INFO field
    // the label is split on ':' and its first component is reported as SVTYPE
    std::vector<std::string> words;
    split_string(svId.getLabel(),':',words);
    {
        // note that there's a reasonable argument for displaying these tags only when a
        // symbolic allele is used (by a strict reading of the vcf spec) -- we instead
        // print these fields for all variants for uniformity within the manta vcf:
        //
        infoTags.push_back( str(boost::format("END=%i") % endPos));
        infoTags.push_back( str(boost::format("SVTYPE=%s") % words[0]));
        const pos_t refLen(endPos-pos);
        pos_t svLen(refLen);

        // SVLEN is omitted for insertions of unknown size. For indels it is the insert
        // length when that exceeds the reference span, otherwise the negated reference span;
        // for non-indels it is the reference span itself.
        if (! sv.isUnknownSizeInsertion)
        {
            if (isIndel)
            {
                const pos_t insertLen(static_cast<pos_t>(sv.insertSeq.size()));
                if ( insertLen > refLen )
                {
                    svLen = insertLen;
                }
                else
                {
                    svLen = -refLen;
                }
            }
            infoTags.push_back( str(boost::format("SVLEN=%i") % (svLen)));
        }
    }
    // pair counts: local counts for each breakend, overall count read from bpA
    infoTags.push_back( str(boost::format("UPSTREAM_PAIR_COUNT=%i") % bpA.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("DOWNSTREAM_PAIR_COUNT=%i") % bpB.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) );

    if (isSmallVariant)
    {
        if (! sv.insertAlignment.empty())
        {
            std::string cigar;
            apath_to_cigar(sv.insertAlignment,cigar);

            // add the 1M to signify the leading reference base:
            infoTags.push_back( str(boost::format("CIGAR=1M%s") % cigar));
        }
    }

    if (isImprecise)
    {
        infoTags.push_back("IMPRECISE");
    }

    // CIPOS: bounds of the bpA range expressed as offsets from POS
    if (bpArange.size() > 1)
    {
        infoTags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) ));
    }

    // CIEND: bounds of the bpB range expressed as offsets from END (symbolic records only)
    if (! isSmallVariant)
    {
        if (bpBrange.size() > 1)
        {
            infoTags.push_back( str( boost::format("CIEND=%i,%i") % (bpBrange.begin_pos() - endPos) % ((bpBrange.end_pos()-1) - endPos) ));
        }
    }

    // for precise calls a breakend range wider than one position is reported as
    // HOMLEN (range size minus one) with the matching reference sequence as HOMSEQ
    if (! isImprecise)
    {
        if (bpArange.size() > 1)
        {
            infoTags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) ));
            std::string homref;
            get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref);
            infoTags.push_back( str( boost::format("HOMSEQ=%s") % (homref) ));
        }
    }

    // symbolic records carry a known-size insertion as SVINSLEN/SVINSSEQ
    if (! isSmallVariant)
    {
        if (! (sv.insertSeq.empty() || sv.isUnknownSizeInsertion))
        {
            infoTags.push_back( str( boost::format("SVINSLEN=%i") % (sv.insertSeq.size()) ));
            // the sequence is passed through reverseCompCopyStr only when bp1 is not first
            // and both breakends have the same state
            if (isBp1First || (bpA.state != bpB.state))
            {
                infoTags.push_back( str( boost::format("SVINSSEQ=%s") % (sv.insertSeq) ));
            }
            else
            {
                infoTags.push_back( str( boost::format("SVINSSEQ=%s") % reverseCompCopyStr(sv.insertSeq) ));
            }
        }
    }

    // unknown-size insertions report whichever flanking insert sequences are available
    if (sv.isUnknownSizeInsertion)
    {
        if (! sv.unknownSizeInsertionLeftSeq.empty())
        {
            infoTags.push_back( str( boost::format("LEFT_SVINSSEQ=%s") % (sv.unknownSizeInsertionLeftSeq) ));
        }

        if (! sv.unknownSizeInsertionRightSeq.empty())
        {
            infoTags.push_back( str( boost::format("RIGHT_SVINSSEQ=%s") % (sv.unknownSizeInsertionRightSeq) ));
        }
    }

    // inversions are flagged INV3 or INV5 according to the state of bp1
    if (svId.svType == EXTENDED_SV_TYPE::INVERSION)
    {
        if (sv.bp1.state == SVBreakendState::RIGHT_OPEN)
        {
            infoTags.push_back("INV3");
        }
        else if (sv.bp1.state == SVBreakendState::LEFT_OPEN)
        {
            infoTags.push_back("INV5");
        }
        else
        {
            assert(false && "Unexpected inversion configuration");
        }
    }

    addSharedInfo(event, infoTags);

    // hooks that append further INFO / sample tags
    modifyInfo(event, infoTags);
    modifySample(sv, sampleTags);

    // write out record:
    _os << chrom
        << '\t' << pos
        << '\t' << svId.localId // ID
        << '\t' << ref // REF
        << '\t' << alt // ALT
        << '\t';
    writeQual();
    _os << '\t';
    writeFilter();
    _os << '\t';
    makeInfoField(infoTags,_os); // INFO
    makeFormatSampleField(sampleTags, _os); // FORMAT + SAMPLE
    _os << '\n';
}
예제 #6
0
void
VcfWriterSV::
writeTransloc(
    const SVCandidate& sv,
    const SVId& svId,
    const bool isFirstBreakend,
    const SVCandidateSetData& /*svData*/,
    const SVCandidateAssemblyData& /*adata*/,
    const EventInfo& event)
{
    const bool isImprecise(sv.isImprecise());
    const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift());

    const SVBreakend& bpA( isFirstBreakend ? sv.bp1 : sv.bp2);
    const SVBreakend& bpB( isFirstBreakend ? sv.bp2 : sv.bp1);

    InfoTag_t infotags;
    SampleTag_t sampletags;

    // get CHROM
    const std::string& chrom(_header.chrom_data[bpA.interval.tid].label);
    const std::string& mateChrom(_header.chrom_data[bpB.interval.tid].label);

    const known_pos_range2& bpArange(bpA.interval.range);
    const known_pos_range2& bpBrange(bpB.interval.range);

    if (! isImprecise)
    {
        assert(bpArange.size() == bpBrange.size());
    }

    // get POS
    pos_t pos(bpArange.center_pos()+1);
    pos_t matePos(bpBrange.center_pos()+1);
    if (! isImprecise)
    {
        pos = bpArange.begin_pos()+1;
        if (isBreakendRangeSameShift)
        {
            matePos = bpBrange.begin_pos()+1;
        }
        else
        {
            matePos = bpBrange.end_pos();
        }
    }

    // get ID
    const std::string& localId(isFirstBreakend ? svId.localId : svId.mateId);
    const std::string& mateId(isFirstBreakend ? svId.mateId : svId.localId);

    // get REF
    std::string ref;
    get_standardized_region_seq(_referenceFilename,chrom,pos-1,pos-1,ref);

    assert(1 == ref.size());

    const bool isReverseInsertSeq(! (isFirstBreakend || (bpA.state != bpB.state)));
    std::string tmpString;
    const std::string* insertSeqPtr(&sv.insertSeq);
    if (isReverseInsertSeq)
    {
        tmpString = reverseCompCopyStr(sv.insertSeq);
        insertSeqPtr = &tmpString;
    }
    const std::string& insertSeq(*insertSeqPtr);

    // build alt:
    boost::format altFormat("%4%%3%%1%:%2%%3%%5%");
    {
        std::string altPrefix;
        std::string altSuffix;
        if     (bpA.state == SVBreakendState::RIGHT_OPEN)
        {
            altPrefix = ref + insertSeq;
        }
        else if (bpA.state == SVBreakendState::LEFT_OPEN)
        {
            altSuffix = insertSeq + ref;
        }
        else
        {
            assert(false && "Unexpected bpA.state");
        }


        char altSep('?');
        if     (bpB.state == SVBreakendState::RIGHT_OPEN)
        {
            altSep=']';
        }
        else if (bpB.state == SVBreakendState::LEFT_OPEN)
        {
            altSep='[';
        }
        else
        {
            assert(false && "Unexpected bpB.state");
        }

        altFormat % mateChrom % matePos % altSep % altPrefix % altSuffix;
    }

    // build INFO field
    infotags.push_back("SVTYPE=BND");
    infotags.push_back("MATEID="+mateId);
    infotags.push_back( str(boost::format("BND_PAIR_COUNT=%i") % bpA.getLocalPairCount()) );
    infotags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) );
    if (isImprecise)
    {
        infotags.push_back("IMPRECISE");
    }

    if (bpArange.size() > 1)
    {
        infotags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) ));
    }

    if (! isImprecise)
    {
        if (bpArange.size() > 1)
        {
            infotags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) ));
            std::string homref;
            get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref);
            infotags.push_back( str( boost::format("HOMSEQ=%s") % (homref) ));
        }
    }

    if (! insertSeq.empty())
    {
        infotags.push_back( str( boost::format("SVINSLEN=%i") % (insertSeq.size()) ));
        infotags.push_back( str( boost::format("SVINSSEQ=%s") % (insertSeq) ));
    }

    addSharedInfo(event, infotags);

    modifyInfo(event, infotags);
    modifyTranslocInfo(isFirstBreakend, infotags);

    modifySample(sv, sampletags);

#ifdef DEBUG_VCF
    addDebugInfo(bpA, bpB, isFirstBreakend, adata, infotags);
#endif

    // write out record:
    _os << chrom
        << '\t' << pos
        << '\t' << localId // ID
        << '\t' << ref // REF
        << '\t' << str( altFormat ) // ALT
        << '\t';
    writeQual();
    _os << '\t';
    writeFilter();
    _os << '\t';
    makeInfoField(infotags,_os); // INFO
    makeFormatSampleField(sampletags, _os); // FORMAT + SAMPLE
    _os << '\n';
}