Exemplo n.º 1
0
void
VcfWriterRnaSV::
modifySample(
    const SVCandidate& sv,
    SampleTag_t& sampletags) const
{
    const SVScoreInfo& baseInfo(getBaseInfo());
    const unsigned sampleCount(baseInfo.samples.size());

    std::vector<std::string> values(sampleCount);
    for (unsigned sampleIndex(0); sampleIndex<sampleCount; ++sampleIndex)
    {
        const SVSampleInfo& sampleInfo(baseInfo.samples[sampleIndex]);
        values[sampleIndex] = str(boost::format("%i,%i")
                                  % sampleInfo.ref.spanningPairCount
                                  % sampleInfo.alt.spanningPairCount);
    }
    sampletags.push_back(std::make_pair("PR", values));

    if (sv.isImprecise()) return;

    for (unsigned sampleIndex(0); sampleIndex<sampleCount; ++sampleIndex)
    {
        const SVSampleInfo& sampleInfo(baseInfo.samples[sampleIndex]);
        values[sampleIndex] =  str( boost::format("%i,%i")
                                    % sampleInfo.ref.splitReadCount
                                    % sampleInfo.alt.splitReadCount);
    }
    sampletags.push_back(std::make_pair("SR",values));
}
Exemplo n.º 2
0
static
float
largeNoiseSVPriorWeight(
    const SVCandidate& sv)
{
    static const int smallSize(5000);
    static const int largeSize(10000);
    static const LinearScaler<int> svSizeRamp(smallSize, largeSize);

    return svSizeRamp.getScale(sv.centerSize());
}
Exemplo n.º 3
0
/// when an sv is treated as 'small', we skip all paired-read evidence and rely on split reads only:
///
/// with further model improvements we can add pairs back into the small variant calls:
///
/// this function returns 1 for a variant which is "fully large" and 0 for a variant which is "fully small",
/// with intermediate values for sizes in between
///
static
float
getSpanningPairWeight(
    const SVCandidate& sv,
    const bool isSmallAssembler)
{
    static const int minSmallSize(300);
    static const int maxSmallSize(500);
    static const LinearScaler<int> svSizeRamp(minSmallSize, maxSmallSize);

    if (! isSmallAssembler) return 1.f;
    return svSizeRamp.getScale(sv.centerSize());
}
Exemplo n.º 4
0
void
VcfWriterSomaticSV::
modifySample(
    const SVCandidate& sv,
    SampleTag_t& sampletags) const
{
    const SVScoreInfo& baseInfo(getBaseInfo());

    std::vector<std::string> values(2);

    static const std::string pairTag("PR");
    values[0] = str( boost::format("%i,%i") % baseInfo.normal.ref.confidentSpanningPairCount % baseInfo.normal.alt.confidentSpanningPairCount);
    values[1] = str( boost::format("%i,%i") % baseInfo.tumor.ref.confidentSpanningPairCount % baseInfo.tumor.alt.confidentSpanningPairCount);
    sampletags.push_back(std::make_pair(pairTag,values));

    if (sv.isImprecise()) return;

    static const std::string srTag("SR");
    values[0] = str( boost::format("%i,%i") % baseInfo.normal.ref.confidentSplitReadCount % baseInfo.normal.alt.confidentSplitReadCount);
    values[1] = str( boost::format("%i,%i") % baseInfo.tumor.ref.confidentSplitReadCount % baseInfo.tumor.alt.confidentSplitReadCount);
    sampletags.push_back(std::make_pair(srTag,values));
}
Exemplo n.º 5
0
/// Write a single VCF record in which both breakends of an SV candidate are
/// reported on one line, either with full REF/ALT sequences (small indels) or
/// with a symbolic ALT allele built from the SV label.
///
/// \param sv the candidate to report
/// \param svId supplies the record ID (localId), the label used for SVTYPE and the
///             symbolic ALT allele, and the svType checked for inversions
/// \param isIndel when true the candidate is eligible for the small-variant (full
///                sequence) representation and SVLEN reflects the insert/deletion size
/// \param event forwarded to addSharedInfo() and modifyInfo()
///
/// The assembly data argument is unused. Nothing is written when the computed POS
/// is less than 1.
void
VcfWriterSV::
writeInvdel(
    const SVCandidate& sv,
    const SVId& svId,
    const SVCandidateAssemblyData& /*adata*/,
    const bool isIndel,
    const EventInfo& event)
{
    const bool isImprecise(sv.isImprecise());
    const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift());

    // order the breakends so that bpA is the one whose range begins first:
    const bool isBp1First(sv.bp1.interval.range.begin_pos()<=sv.bp2.interval.range.begin_pos());

    const SVBreakend& bpA(isBp1First ? sv.bp1 : sv.bp2);
    const SVBreakend& bpB(isBp1First ? sv.bp2 : sv.bp1);

    InfoTag_t infoTags;
    SampleTag_t sampleTags;

    // get CHROM
    // (taken from bp1 only -- presumably both breakends share a chromosome here; TODO confirm)
    const std::string& chrom(_header.chrom_data[sv.bp1.interval.tid].label);

    const known_pos_range2& bpArange(bpA.interval.range);
    const known_pos_range2& bpBrange(bpB.interval.range);

    // precise candidates are expected to have equally sized breakend ranges:
    if (! isImprecise)
    {
        assert(bpArange.size() == bpBrange.size());
    }

    // above this size all records use symbolic alleles (ie. <DEL>):
    static const unsigned maxNonSymbolicRecordSize(1000);

    // if the variant is a combination of simple insertion and deletions, and below
    // a large-event size threshold, it is classified as a small variant. In this case
    // we report the event using full REF and ALT sequences, plus a CIGAR string for
    // complex in/del combinations
    //
    bool isSmallVariant(false);
    if ((! isImprecise) && isIndel && (! sv.isUnknownSizeInsertion))
    {
        const unsigned deleteSize(bpBrange.begin_pos() - bpArange.begin_pos());
        const unsigned insertSize(sv.insertSeq.size());

        const bool isSmallDelete(deleteSize<=maxNonSymbolicRecordSize);
        const bool isSmallInsert(insertSize<=maxNonSymbolicRecordSize);

        isSmallVariant = (isSmallDelete && isSmallInsert);
    }

    // get POS and endPos
    //
    // imprecise candidates use the center of each breakend range; precise candidates
    // use the start of the bpA range, and either the start or the last position of the
    // bpB range depending on isBreakendRangeSameShift:
    pos_t pos(bpArange.center_pos()+1);
    pos_t endPos(bpBrange.center_pos());
    if (! isImprecise)
    {
        pos = bpArange.begin_pos()+1;
        if (isBreakendRangeSameShift)
        {
            endPos = bpBrange.begin_pos();
        }
        else
        {
            endPos = bpBrange.end_pos()-1;
        }
    }
    else
    {
        /// check against the rare case arising when CIEND is a subset of CIPOS:
        endPos=std::max(endPos,pos+1);
    }

    // records which would start before position 1 are silently dropped:
    if (pos<1) return;

    // get REF
    //
    // a single reference base at POS, extended through endPos for small variants:
    std::string ref;
    {
        const pos_t beginRefPos(pos-1);
        pos_t endRefPos(beginRefPos);
        if (isSmallVariant) endRefPos=endPos-1;

        get_standardized_region_seq(_referenceFilename, chrom, beginRefPos, endRefPos, ref);

        assert(static_cast<unsigned>(1+endRefPos-beginRefPos) == ref.size());
    }

    // build alt:
    //
    // small variants: leading reference base followed by the inserted sequence,
    // all others: symbolic allele made from the SV label
    std::string alt;
    if (isSmallVariant)
    {
        alt = ref[0] + sv.insertSeq;
    }
    else
    {
        alt = str( boost::format("<%s>") % svId.getLabel());
    }

    // build INFO field
    //
    // the label is split on ':' and its first field is reported as SVTYPE
    // (NOTE(review): words[0] is read without checking that the split produced any field)
    std::vector<std::string> words;
    split_string(svId.getLabel(),':',words);
    {
        // note that there's a reasonable argument for displaying these tags only when a
        // symbolic allele is used (by a strict reading of the vcf spec) -- we instead
        // print these fields for all variants for uniformity within the manta vcf:
        //
        infoTags.push_back( str(boost::format("END=%i") % endPos));
        infoTags.push_back( str(boost::format("SVTYPE=%s") % words[0]));
        const pos_t refLen(endPos-pos);
        pos_t svLen(refLen);

        // SVLEN is omitted for insertions of unknown size. For indels it is the insert
        // length when that exceeds the reference span, otherwise the negated reference
        // span; all other types report the (positive) reference span:
        if (! sv.isUnknownSizeInsertion)
        {
            if (isIndel)
            {
                const pos_t insertLen(static_cast<pos_t>(sv.insertSeq.size()));
                if ( insertLen > refLen )
                {
                    svLen = insertLen;
                }
                else
                {
                    svLen = -refLen;
                }
            }
            infoTags.push_back( str(boost::format("SVLEN=%i") % (svLen)));
        }
    }
    infoTags.push_back( str(boost::format("UPSTREAM_PAIR_COUNT=%i") % bpA.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("DOWNSTREAM_PAIR_COUNT=%i") % bpB.getLocalPairCount()) );
    infoTags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) );

    // small variants with an insert alignment also get a CIGAR tag:
    if (isSmallVariant)
    {
        if (! sv.insertAlignment.empty())
        {
            std::string cigar;
            apath_to_cigar(sv.insertAlignment,cigar);

            // add the 1M to signify the leading reference base:
            infoTags.push_back( str(boost::format("CIGAR=1M%s") % cigar));
        }
    }

    if (isImprecise)
    {
        infoTags.push_back("IMPRECISE");
    }

    // CIPOS: extent of the bpA range expressed as offsets from POS
    if (bpArange.size() > 1)
    {
        infoTags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) ));
    }

    // CIEND: extent of the bpB range expressed as offsets from END (not reported for small variants)
    if (! isSmallVariant)
    {
        if (bpBrange.size() > 1)
        {
            infoTags.push_back( str( boost::format("CIEND=%i,%i") % (bpBrange.begin_pos() - endPos) % ((bpBrange.end_pos()-1) - endPos) ));
        }
    }

    // for precise candidates a breakend range wider than one position is reported
    // as HOMLEN/HOMSEQ, with the sequence read from the reference:
    if (! isImprecise)
    {
        if (bpArange.size() > 1)
        {
            infoTags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) ));
            std::string homref;
            get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref);
            infoTags.push_back( str( boost::format("HOMSEQ=%s") % (homref) ));
        }
    }

    // symbolic records report a known, non-empty insert sequence via SVINSLEN/SVINSSEQ:
    if (! isSmallVariant)
    {
        if (! (sv.insertSeq.empty() || sv.isUnknownSizeInsertion))
        {
            infoTags.push_back( str( boost::format("SVINSLEN=%i") % (sv.insertSeq.size()) ));
            // the sequence is passed through reverseCompCopyStr() only when bp2 comes
            // first and both breakends have the same state:
            if (isBp1First || (bpA.state != bpB.state))
            {
                infoTags.push_back( str( boost::format("SVINSSEQ=%s") % (sv.insertSeq) ));
            }
            else
            {
                infoTags.push_back( str( boost::format("SVINSSEQ=%s") % reverseCompCopyStr(sv.insertSeq) ));
            }
        }
    }

    // insertions of unknown size report whichever flanking insert sequences are available:
    if (sv.isUnknownSizeInsertion)
    {
        if (! sv.unknownSizeInsertionLeftSeq.empty())
        {
            infoTags.push_back( str( boost::format("LEFT_SVINSSEQ=%s") % (sv.unknownSizeInsertionLeftSeq) ));
        }

        if (! sv.unknownSizeInsertionRightSeq.empty())
        {
            infoTags.push_back( str( boost::format("RIGHT_SVINSSEQ=%s") % (sv.unknownSizeInsertionRightSeq) ));
        }
    }

    // inversions are flagged INV3 or INV5 according to the state of bp1:
    if (svId.svType == EXTENDED_SV_TYPE::INVERSION)
    {
        if (sv.bp1.state == SVBreakendState::RIGHT_OPEN)
        {
            infoTags.push_back("INV3");
        }
        else if (sv.bp1.state == SVBreakendState::LEFT_OPEN)
        {
            infoTags.push_back("INV5");
        }
        else
        {
            assert(false && "Unexpected inversion configuration");
        }
    }

    addSharedInfo(event, infoTags);

    // hooks for adding further INFO and FORMAT/SAMPLE content:
    modifyInfo(event, infoTags);
    modifySample(sv, sampleTags);

    // write out record:
    _os << chrom
        << '\t' << pos
        << '\t' << svId.localId // ID
        << '\t' << ref // REF
        << '\t' << alt // ALT
        << '\t';
    writeQual();
    _os << '\t';
    writeFilter();
    _os << '\t';
    makeInfoField(infoTags,_os); // INFO
    makeFormatSampleField(sampleTags, _os); // FORMAT + SAMPLE
    _os << '\n';
}
Exemplo n.º 6
0
/// Write one breakend (SVTYPE=BND) VCF record for an SV candidate.
///
/// The record describes the candidate from the point of view of one breakend
/// (bpA) and names the other breakend (bpB) as its mate in the ALT allele and
/// via MATEID.
///
/// \param sv the candidate to report
/// \param svId supplies the record ID and mate ID (localId/mateId)
/// \param isFirstBreakend when true bp1 is the reported breakend and bp2 its mate,
///                        otherwise the roles (and the two IDs) are swapped
/// \param adata assembly data, consumed only by the DEBUG_VCF build
/// \param event forwarded to addSharedInfo() and modifyInfo()
///
/// The SVCandidateSetData argument is unused.
void
VcfWriterSV::
writeTransloc(
    const SVCandidate& sv,
    const SVId& svId,
    const bool isFirstBreakend,
    const SVCandidateSetData& /*svData*/,
    const SVCandidateAssemblyData& adata,
    const EventInfo& event)
{
    const bool isImprecise(sv.isImprecise());
    const bool isBreakendRangeSameShift(sv.isBreakendRangeSameShift());

    // bpA is the breakend this record is written for, bpB is its mate:
    const SVBreakend& bpA( isFirstBreakend ? sv.bp1 : sv.bp2);
    const SVBreakend& bpB( isFirstBreakend ? sv.bp2 : sv.bp1);

    InfoTag_t infotags;
    SampleTag_t sampletags;

    // get CHROM
    const std::string& chrom(_header.chrom_data[bpA.interval.tid].label);
    const std::string& mateChrom(_header.chrom_data[bpB.interval.tid].label);

    const known_pos_range2& bpArange(bpA.interval.range);
    const known_pos_range2& bpBrange(bpB.interval.range);

    // precise candidates are expected to have equally sized breakend ranges:
    if (! isImprecise)
    {
        assert(bpArange.size() == bpBrange.size());
    }

    // get POS
    //
    // imprecise candidates use the center of each breakend range; precise candidates
    // use the start of the bpA range, and either the start or the end of the bpB
    // range depending on isBreakendRangeSameShift:
    pos_t pos(bpArange.center_pos()+1);
    pos_t matePos(bpBrange.center_pos()+1);
    if (! isImprecise)
    {
        pos = bpArange.begin_pos()+1;
        if (isBreakendRangeSameShift)
        {
            matePos = bpBrange.begin_pos()+1;
        }
        else
        {
            matePos = bpBrange.end_pos();
        }
    }

    // get ID
    const std::string& localId(isFirstBreakend ? svId.localId : svId.mateId);
    const std::string& mateId(isFirstBreakend ? svId.mateId : svId.localId);

    // get REF (the single reference base at POS)
    std::string ref;
    get_standardized_region_seq(_referenceFilename,chrom,pos-1,pos-1,ref);

    assert(1 == ref.size());

    // the insert sequence is passed through reverseCompCopyStr() only when writing
    // the second breakend of a candidate whose breakends have the same state:
    const bool isReverseInsertSeq(! (isFirstBreakend || (bpA.state != bpB.state)));
    std::string tmpString;
    const std::string* insertSeqPtr(&sv.insertSeq);
    if (isReverseInsertSeq)
    {
        tmpString = reverseCompCopyStr(sv.insertSeq);
        insertSeqPtr = &tmpString;
    }
    const std::string& insertSeq(*insertSeqPtr);

    // build alt:
    //
    // the positional format expands to:
    //   altPrefix altSep mateChrom ':' matePos altSep altSuffix
    boost::format altFormat("%4%%3%%1%:%2%%3%%5%");
    {
        // the reference base and insert sequence go in front of the mate
        // description for a right-open bpA, and behind it for a left-open bpA:
        std::string altPrefix;
        std::string altSuffix;
        if     (bpA.state == SVBreakendState::RIGHT_OPEN)
        {
            altPrefix = ref + insertSeq;
        }
        else if (bpA.state == SVBreakendState::LEFT_OPEN)
        {
            altSuffix = insertSeq + ref;
        }
        else
        {
            assert(false && "Unexpected bpA.state");
        }


        // the bracket type is selected by the state of the mate breakend:
        char altSep('?');
        if     (bpB.state == SVBreakendState::RIGHT_OPEN)
        {
            altSep=']';
        }
        else if (bpB.state == SVBreakendState::LEFT_OPEN)
        {
            altSep='[';
        }
        else
        {
            assert(false && "Unexpected bpB.state");
        }

        altFormat % mateChrom % matePos % altSep % altPrefix % altSuffix;
    }

    // build INFO field
    infotags.push_back("SVTYPE=BND");
    infotags.push_back("MATEID="+mateId);
    infotags.push_back( str(boost::format("BND_PAIR_COUNT=%i") % bpA.getLocalPairCount()) );
    infotags.push_back( str(boost::format("PAIR_COUNT=%i") % bpA.getPairCount()) );
    if (isImprecise)
    {
        infotags.push_back("IMPRECISE");
    }

    // CIPOS: extent of the bpA range expressed as offsets from POS
    if (bpArange.size() > 1)
    {
        infotags.push_back( str( boost::format("CIPOS=%i,%i") % ((bpArange.begin_pos()+1) - pos) % (bpArange.end_pos() - pos) ));
    }

    // for precise candidates a breakend range wider than one position is reported
    // as HOMLEN/HOMSEQ, with the sequence read from the reference:
    if (! isImprecise)
    {
        if (bpArange.size() > 1)
        {
            infotags.push_back( str( boost::format("HOMLEN=%i") % (bpArange.size()-1) ));
            std::string homref;
            get_standardized_region_seq(_referenceFilename,chrom,bpArange.begin_pos()+1,bpArange.end_pos()-1,homref);
            infotags.push_back( str( boost::format("HOMSEQ=%s") % (homref) ));
        }
    }

    if (! insertSeq.empty())
    {
        infotags.push_back( str( boost::format("SVINSLEN=%i") % (insertSeq.size()) ));
        infotags.push_back( str( boost::format("SVINSSEQ=%s") % (insertSeq) ));
    }

    addSharedInfo(event, infotags);

    // hooks for adding further INFO and FORMAT/SAMPLE content:
    modifyInfo(event, infotags);
    modifyTranslocInfo(isFirstBreakend, infotags);

    modifySample(sv, sampletags);

#ifdef DEBUG_VCF
    addDebugInfo(bpA, bpB, isFirstBreakend, adata, infotags);
#else
    // adata is only consumed by the debug build; mark it used so that the
    // parameter can stay named without triggering an unused-parameter warning:
    static_cast<void>(adata);
#endif

    // write out record:
    _os << chrom
        << '\t' << pos
        << '\t' << localId // ID
        << '\t' << ref // REF
        << '\t' << str( altFormat ) // ALT
        << '\t';
    writeQual();
    _os << '\t';
    writeFilter();
    _os << '\t';
    makeInfoField(infotags,_os); // INFO
    makeFormatSampleField(sampletags, _os); // FORMAT + SAMPLE
    _os << '\n';
}