Esempio n. 1
0
int main()
{
    // Create an alignment between subject and query.
    seqan::Peptide subject =
            "MGLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE"
            "DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKH"
            "PGDFGADAQGAMNKALELFRKDMASNYK";
    seqan::Peptide query =
            "MSLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHFDLHPGSA"
            "QLRAHGSKVVAAVGDAVKSIDDIGGALSKLSELHAYILRVDPVNFKLLSHCLLVTLAARF"
            "PADFTAEAHAAWDKFLSVTEKYR";

    seqan::Align<seqan::Peptide> align;
    resize(rows(align), 2);
    setSource(row(align, 0), subject);
    setSource(row(align, 1), query);

    seqan::Blosum62 scoringScheme(-1, -12);
    globalAlignment(align, scoringScheme);

    // Compute the statistics of the alignment.
    seqan::AlignmentStats stats;
    int scoreVal = computeAlignmentStats(stats, align, scoringScheme);
    SEQAN_ASSERT_EQ(scoreVal, stats.alignmentScore);
    std::cout << align
              << "gap opens:           " << stats.numGapOpens << "\n"
              << "gap extensions:      " << stats.numGapExtensions << "\n"
              << "num matches:         " << stats.numMatches << "\n"
              << "num mismatches:      " << stats.numMismatches << "\n"
              << "num positive scores: " << stats.numPositiveScores << "\n"
              << "num negative scores: " << stats.numNegativeScores << "\n\n\n";

    // Clip alignment rows and compute score of this view.
    setClippedEndPosition(row(align, 0), 100);
    setClippedEndPosition(row(align, 1), 100);
    setClippedBeginPosition(row(align, 0), 5);
    setClippedBeginPosition(row(align, 1), 5);

    scoreVal = computeAlignmentStats(stats, align, scoringScheme);
    SEQAN_ASSERT_EQ(scoreVal, stats.alignmentScore);
    std::cout << "Clipping alignment to (5, 100)\n"
              << align
              << "gap opens:           " << stats.numGapOpens << "\n"
              << "gap extensions:      " << stats.numGapExtensions << "\n"
              << "num matches:         " << stats.numMatches << "\n"
              << "num mismatches:      " << stats.numMismatches << "\n"
              << "num positive scores: " << stats.numPositiveScores << "\n"
              << "num negative scores: " << stats.numNegativeScores << "\n";

    return 0;
}
Esempio n. 2
0
// Queries svIntervalTree with the position svPos and returns the interval that was found.
//
// Exactly one hit is expected; this is only checked through SEQAN_ASSERT_EQ.
GenomicInterval PositionMap::getGenomicInterval(int svPos) const
{
    seqan::String<GenomicInterval> hits;
    findIntervals(hits, svIntervalTree, svPos);

    SEQAN_ASSERT_EQ(length(hits), 1u);
    return hits[0];
}
Esempio n. 3
0
void VcfMaterializer::_appendToVariants(Variants & variants, seqan::VcfRecord const & vcfRecord)
{
    // Compute maximal length of alternative.
    unsigned altLength = 0;
    seqan::StringSet<seqan::CharString> alts;
    strSplit(alts, vcfRecord.alt, seqan::EqualsChar<','>());
    for (unsigned i = 0; i < length(alts); ++i)
        altLength = std::max(altLength, (unsigned)length(alts[i]));

    if (contains(vcfRecord.info, "SVTYPE"))  // Structural Variant
    {
        StructuralVariantRecord svRecord;
        svRecord.rId = vcfRecord.rID;
        svRecord.pos = vcfRecord.beginPos + 1;  // given with shift of -1
        svRecord.haplotype = 0;

        SEQAN_ASSERT_EQ(length(alts), 1u);

        if (contains(vcfRecord.info, "SVTYPE=INS"))  // Insertion
        {
            svRecord.kind = StructuralVariantRecord::INDEL;
            svRecord.size = getSVLen(vcfRecord.info);
            svRecord.seq = suffix(vcfRecord.alt, 1);
        }
        else if (contains(vcfRecord.info, "SVTYPE=DEL"))  // Deletion
        {
            svRecord.kind = StructuralVariantRecord::INDEL;
            svRecord.size = getSVLen(vcfRecord.info);
        }
        else if (contains(vcfRecord.info, "SVTYPE=INV"))  // Inversion
        {
            svRecord.kind = StructuralVariantRecord::INVERSION;
            svRecord.size = getSVLen(vcfRecord.info);
        }
        else if (contains(vcfRecord.info, "SVTYPE=DUP"))  // Duplication
        {
            svRecord.kind = StructuralVariantRecord::DUPLICATION;
            svRecord.size = getSVLen(vcfRecord.info);
            std::pair<seqan::CharString, int> pos = getTargetPos(vcfRecord.info);
            unsigned idx = 0;
            if (!getIdByName(idx, contigNamesCache(context(vcfFileIn)), pos.first))
                SEQAN_FAIL("Unknown sequence %s", toCString(pos.first));
            svRecord.targetRId = idx;
            svRecord.targetPos = pos.second - 1;
        }
        else if (contains(vcfRecord.info, "SVTYPE=BND"))  // Breakend (Must be Translocation)
        {
            SEQAN_FAIL("Unexpected 'SVTYPE=BND' at this place!");
        }
        else
        {
            SEQAN_FAIL("ERROR: Unknown SVTYPE!\n");
        }

        // Split the target variants.
        SEQAN_ASSERT_NOT(empty(vcfRecord.genotypeInfos));
        seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter =
                directionIterator(vcfRecord.genotypeInfos[0], seqan::Input());
        seqan::CharString buffer;
        svRecord.haplotype = 0;
        for (; !atEnd(inputIter); ++inputIter)
            if ((*inputIter == '|' || *inputIter == '/'))
            {
                if (!empty(buffer))
                {
                    unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u);
                    if (idx != 0u)  // if not == ref
                        appendValue(variants.svRecords, svRecord);
                }
                svRecord.haplotype++;
                clear(buffer);
            }
            else
            {
                appendValue(buffer, *inputIter);
            }
        if (!empty(buffer))
        {
            unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u);
            if (idx != 0u)  // if not == ref
                appendValue(variants.svRecords, svRecord);
        }
    }
    else if (length(vcfRecord.ref) == 1u && altLength == 1u)  // SNP
    {
        SnpRecord snpRecord;
        snpRecord.rId = vcfRecord.rID;
        snpRecord.pos = vcfRecord.beginPos;

        // Split the alternatives.
        seqan::StringSet<seqan::CharString> alternatives;
        strSplit(alternatives, vcfRecord.alt, seqan::EqualsChar<','>());

        // Split the target variants.
        SEQAN_ASSERT_NOT(empty(vcfRecord.genotypeInfos));
        seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter =
                directionIterator(vcfRecord.genotypeInfos[0], seqan::Input());
        seqan::CharString buffer;
        snpRecord.haplotype = 0;
        for (; !atEnd(inputIter); ++inputIter)
            if ((*inputIter == '|' || *inputIter == '/'))
            {
                if (!empty(buffer))
                {
                    unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer),
                                            (unsigned)length(alternatives));
                    if (idx != 0u)  // if not == ref
                    {
                        SEQAN_ASSERT_NOT(empty(alternatives[idx - 1]));
                        snpRecord.to = alternatives[idx - 1][0];
                        appendValue(variants.snps, snpRecord);
                    }
                }
                snpRecord.haplotype++;
                clear(buffer);
            }
            else
            {
                appendValue(buffer, *inputIter);
            }
        if (!empty(buffer))
        {
            unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer),
                                    (unsigned)length(alternatives));
            if (idx != 0u)  // if not == ref
            {
                SEQAN_ASSERT_NOT(empty(alternatives[idx - 1]));
                snpRecord.to = alternatives[idx - 1][0];
                appendValue(variants.snps, snpRecord);
            }
        }
    }
    else  // Small Indel
    {
        SmallIndelRecord smallIndel;
        smallIndel.rId = vcfRecord.rID;
        smallIndel.pos = vcfRecord.beginPos + 1;

        SEQAN_ASSERT_NOT(contains(vcfRecord.alt, ","));  // only one alternative
        SEQAN_ASSERT((length(vcfRecord.alt) == 1u) != (length(vcfRecord.ref) == 1u));  // XOR

        smallIndel.haplotype = 0;
        if (length(vcfRecord.ref) == 1u)  // insertion
        {
            smallIndel.seq = suffix(vcfRecord.alt, 1);
            smallIndel.size = length(smallIndel.seq);
        }
        else  // deletion
        {
            smallIndel.size = -(int)(length(vcfRecord.ref) - 1);
        }

        // Split the target variants.
        SEQAN_ASSERT_NOT(empty(vcfRecord.genotypeInfos));
        seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter =
                directionIterator(vcfRecord.genotypeInfos[0], seqan::Input());
        seqan::CharString buffer;
        smallIndel.haplotype = 0;
        for (; !atEnd(inputIter); ++inputIter)
            if ((*inputIter == '|' || *inputIter == '/'))
            {
                if (!empty(buffer))
                {
                    unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u);
                    if (idx != 0u)  // if not == ref
                        appendValue(variants.smallIndels, smallIndel);
                }
                smallIndel.haplotype++;
                clear(buffer);
            }
            else
            {
                appendValue(buffer, *inputIter);
            }
        if (!empty(buffer))
        {
            unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u);
            if (idx != 0u)  // if not == ref
                appendValue(variants.smallIndels, smallIndel);
        }
    }
}
Esempio n. 4
0
// Resets the position map and rebuilds refGapAnchors and smallVarGapAnchors from the given journal.
//
// The interval trees and svBreakpoints are reset/cleared.  The journal is then walked entry by entry while
// two gap iterators (one over the reference row, one over the small variant row) are advanced in lockstep;
// gaps are inserted into one of the two rows wherever the journal deviates from the original sequence.
void PositionMap::reinit(TJournalEntries const & journal)
{
    typedef seqan::Iterator<TGaps, seqan::Standard>::Type TGapsIter;
    typedef seqan::Iterator<TJournalEntries const, seqan::Standard>::Type TJournalEntriesIt;

    // Drop everything derived from a previous journal.
    // TODO(holtgrew): Better API support for IntervalTree?
    svIntervalTree = TIntervalTree();
    svIntervalTreeSTL = TIntervalTree();
    svBreakpoints.clear();
    clear(refGapAnchors);
    clear(smallVarGapAnchors);

    // Gaps objects without a source sequence that write into the two anchor strings.
    TGaps refGaps(seqan::Nothing(), refGapAnchors);
    TGapsIter itRef = begin(refGaps, seqan::Standard());
    TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
    TGapsIter itVar = begin(smallVarGaps, seqan::Standard());

    // NOTE(review): the first entry is dereferenced before the loop checks for the end of the journal --
    // confirm that the journal can never be empty here.
    TJournalEntriesIt it = begin(journal, seqan::Standard());
    SEQAN_ASSERT_NEQ(it->segmentSource, seqan::SOURCE_NULL);
    SEQAN_ASSERT_EQ(it->virtualPosition, 0u);

    // End of the previously seen original segment; the maximal value means "none seen yet".
    unsigned const noRefPos = seqan::maxValue<unsigned>();
    unsigned lastRefPos = noRefPos;

    TJournalEntriesIt const itEnd = end(journal, seqan::Standard());
    for (; it != itEnd; ++it)
    {
        SEQAN_ASSERT_NEQ(it->segmentSource, seqan::SOURCE_NULL);

        if (it->segmentSource != seqan::SOURCE_ORIGINAL)
        {
            // Entry that does not come from the original sequence: pad the reference row.
            insertGaps(itRef, it->length);
        }
        else
        {
            if (lastRefPos == noRefPos)
            {
                // First original segment; it may start behind the beginning of the reference.
                // NOTE(review): this case pads the reference row, whereas the skipped-reference case
                // below pads the variant row -- confirm that this asymmetry is intended.
                if (it->physicalPosition != 0)
                {
                    insertGaps(itRef, it->physicalPosition);
                    itRef += it->physicalPosition;
                    itVar += it->physicalPosition;
                }
            }
            else if (it->physicalPosition != lastRefPos)
            {
                // Reference positions were skipped since the previous original segment: pad the variant row.
                int skipped = it->physicalPosition - lastRefPos;
                insertGaps(itVar, skipped);
                itRef += skipped;
                itVar += skipped;
            }
            lastRefPos = it->physicalPosition + it->length;
        }

        // Both rows always move forward over the entry itself.
        itRef += it->length;
        itVar += it->length;
    }
}
Esempio n. 5
0
// Applies the structural variants of one haplotype to a contig and records the resulting coordinate mapping.
//
// Parameters:
//   seq                  Output sequence; the materialized haplotype is appended to it.
//   levelsLargeVariants  Output methylation levels; cleared if non-null and filled when methylation levels are
//                        simulated.
//   varInfos             Output; small variant infos with positions translated to `seq` are pushed back.
//   breakpoints          Output; pairs of (position in seq, variants.posToIdx(Variants::SV, i)) are pushed back.
//   positionMap          Its svIntervalTree and svIntervalTreeSTL are rebuilt from the collected intervals.
//   journal              Used with hostToVirtualPosition() to translate the positions of the SV records.
//   contig               Sequence from which the infixes are copied.
//   smallVarInfos        Small variant infos to translate; presumably sorted by position -- TODO confirm.
//   variants             Only variants.svRecords with haplotype == hId are processed.
//   levels               Input methylation levels belonging to `contig`; read when levels are simulated.
//   hId                  Id of the haplotype to materialize.
//
// Returns 0 on success and 1 if an SV record has a kind that is not handled by the switch below.
int VariantMaterializer::_materializeLargeVariants(
        seqan::Dna5String & seq,
        MethylationLevels * levelsLargeVariants,
        std::vector<SmallVarInfo> & varInfos,
        std::vector<std::pair<int, int> > & breakpoints,
        PositionMap & positionMap,
        TJournalEntries const & journal,
        seqan::Dna5String const & contig,
        std::vector<SmallVarInfo> const & smallVarInfos,
        Variants const & variants,
        MethylationLevels const * levels,
        int hId)
{
    // The level pointers must be set exactly when methylation level simulation is enabled.
    if (methSimOptions)
    {
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsLargeVariants != 0));
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
    }

    // We will record all intervals for the positionMap.svIntervalTree in this String.
    seqan::String<GenomicInterval> intervals;

    // Clear output methylation levels.
    if (levelsLargeVariants)
        levelsLargeVariants->clear();
    // Store variation points.  We reuse the fixVariationLevels() function from small indel/snp simulation and thus
    // have to store a bool that is always set to false.
    seqan::String<std::pair<int, bool> > varPoints;

    // Track last position from contig appended to seq so far.
    int lastPos = 0;
    if (verbosity >= 3)
        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

    // Pointer to the current small variant to write out translated to varInfo.
    std::vector<SmallVarInfo>::const_iterator itSmallVar = smallVarInfos.begin();

    // Number of bytes written out so far/current position in variant.
    unsigned currentPos = 0;

    for (unsigned i = 0; i < length(variants.svRecords); ++i)
    {
        if (variants.svRecords[i].haplotype != hId)  // Ignore all records of other haplotypes.
            continue;
        // We obtain a copy of the current SV record since we translate its positions below.
        StructuralVariantRecord svRecord = variants.svRecords[i];

        // Translate positions and lengths of SV record.
        if (verbosity >= 2)
            std::cerr << "  Translating SvRecord\n  " << svRecord << '\n';
        svRecord.pos = hostToVirtualPosition(journal, svRecord.pos);
        SEQAN_ASSERT_LT(svRecord.pos, (int)length(contig));
        // We do not need to adjust the sizes for insertions.
        if (svRecord.kind != StructuralVariantRecord::INDEL || svRecord.size < 0)
            svRecord.size = hostToVirtualPosition(journal, svRecord.pos + svRecord.size) -
                    hostToVirtualPosition(journal, svRecord.pos);
        // A target position of -1 marks "no target" and is left untouched.
        if (svRecord.targetPos != -1)
            svRecord.targetPos = hostToVirtualPosition(journal, svRecord.targetPos);
        if (verbosity >= 2)
            std::cerr << "  => " << svRecord << '\n';

        // Copy out small variant infos for interim chars.
        for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
        {
            int offset = (int)currentPos - lastPos;
            varInfos.push_back(*itSmallVar);
            varInfos.back().pos += offset;
        }

        // Copy from contig to seq with SVs.
        if (verbosity >= 3)
            std::cerr << "lastPos == " << lastPos << "\n";
        append(seq, infix(contig, lastPos, svRecord.pos));  // interim chars
        if (methSimOptions && methSimOptions->simulateMethylationLevels)
        {
            append(levelsLargeVariants->forward, infix(levels->forward, lastPos, svRecord.pos));
            append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, svRecord.pos));
            appendValue(varPoints, std::make_pair((int)length(seq), false));
        }
        // Record the interval of the interim characters unless it is empty.
        if (currentPos != length(seq))
            appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, svRecord.pos,
                                                   '+', GenomicInterval::NORMAL));
        currentPos = length(seq);
        if (verbosity >= 3)
            std::cerr << "append(seq, infix(contig, " << lastPos << ", " << svRecord.pos << ") " << __LINE__ << " (interim)\n";
        switch (svRecord.kind)
        {
            case StructuralVariantRecord::INDEL:
                {
                    if (svRecord.size > 0)  // insertion
                    {
                        SEQAN_ASSERT_EQ((int)length(svRecord.seq), svRecord.size);

                        // Simulate methylation levels for insertion.
                        MethylationLevels lvls;
                        if (methSimOptions && methSimOptions->simulateMethylationLevels)
                        {
                            MethylationLevelSimulator methSim(*rng, *methSimOptions);
                            methSim.run(lvls, svRecord.seq);
                        }

                        // Append novel sequence and methylation levels.
                        append(seq, svRecord.seq);
                        if (methSimOptions && methSimOptions->simulateMethylationLevels)
                        {
                            append(levelsLargeVariants->forward, lvls.forward);
                            append(levelsLargeVariants->reverse, lvls.reverse);
                            appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
                        }
                        // Inserted characters have no position in the contig, thus -1 for both coordinates.
                        if (currentPos != length(seq))
                            appendValue(intervals, GenomicInterval(currentPos, length(seq), -1, -1,
                                                                   '+', GenomicInterval::INSERTED));
                        if (verbosity >= 3)
                            std::cerr << "append(seq, svRecord.seq (length == " << length(svRecord.seq) << ") " << __LINE__ << " (insertion)\n";
                        lastPos = svRecord.pos;
                        SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                        // Copy out breakpoints.
                        breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                        breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                        currentPos = length(seq);
                    }
                    else  // deletion
                    {
                        // svRecord.size is not positive here, so this skips -svRecord.size characters of the contig.
                        lastPos = svRecord.pos - svRecord.size;
                        SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                        // Copy out breakpoint.
                        breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    }
                }
                break;
            case StructuralVariantRecord::INVERSION:
                {
                    unsigned oldLen = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after inversion
                        // Forward and reverse levels are swapped and reversed for the inverted segment.
                        append(levelsLargeVariants->forward, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                        reverse(infix(levelsLargeVariants->forward, oldLen, length(levelsLargeVariants->forward)));
                        append(levelsLargeVariants->reverse, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        reverse(infix(levelsLargeVariants->reverse, oldLen, length(levelsLargeVariants->reverse)));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '-', GenomicInterval::INVERTED));

                    // Copy out small variant infos for inversion.
                    // NOTE(review): lastPos still holds the value from before this SV record here (it is only
                    // updated further below) -- confirm that it, and not svRecord.pos, is the intended origin.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                    {
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos = currentPos + svRecord.size - (varInfos.back().pos - lastPos);
                    }

                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (inversion)\n";
                    // Turn the segment that was appended above into its reverse complement.
                    reverseComplement(infix(seq, oldLen, length(seq)));
                    lastPos = svRecord.pos + svRecord.size;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out breakpoints.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            case StructuralVariantRecord::TRANSLOCATION:
                {
                    SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                    // First append the characters between the translocated segment and the target position ...
                    append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
                                                               '+', GenomicInterval::NORMAL));
                    // ... then the translocated segment itself.
                    unsigned tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::NORMAL));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << " (translocation)\n"
                                  << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                    lastPos = svRecord.targetPos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out small variant infos for translocation, shift left to right and right to left but keep
                    // center intact.
                    // NOTE(review): the interim loop above already consumed every info with pos < svRecord.pos
                    // and neither itSmallVar nor svRecord.pos changed since, so this loop looks like it can
                    // never execute; lastPos has also been overwritten with targetPos already -- confirm.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
                    {
                        int offset = (int)currentPos - lastPos;
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += offset;

                        int bpLeft = svRecord.pos + svRecord.size;
                        int bpRight = svRecord.targetPos;
                        if (itSmallVar->pos < bpLeft)
                            varInfos.back().pos -= (svRecord.targetPos - svRecord.pos);
                        else if (itSmallVar->pos >= bpRight)
                            varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                    }

                    // Copy out breakpoints.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.targetPos - svRecord.pos - svRecord.size, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            case StructuralVariantRecord::DUPLICATION:
                {
                    // Appended in this order: duplicated segment, characters up to the target position,
                    // duplicated segment again.
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)  // first copy
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::DUPLICATED));
                    unsigned tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
                                                               '+', GenomicInterval::NORMAL));
                    tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)  // second copy
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    // NOTE(review): the first copy is recorded as DUPLICATED, this second copy as NORMAL -- confirm.
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::NORMAL));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (duplication)\n"
                                  << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << "\n"
                                  << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                    lastPos = svRecord.targetPos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Write out small variant infos for duplication.
                    // NOTE(review): lastPos was already set to targetPos above, so `offset` is relative to the
                    // target position; also, small variants located between the duplicated segment and the
                    // target position are not handled here but by a later loop -- confirm the resulting offsets.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                    {
                        int offset = (int)currentPos - lastPos;
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += offset;

                        // This condition repeats the loop condition and is thus always true here.
                        if (itSmallVar->pos < svRecord.pos + svRecord.size)
                        {
                            varInfos.push_back(*itSmallVar);
                            varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                        }
                    }

                    // Copy out breakpoints.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos + svRecord.targetPos - (svRecord.pos + svRecord.size), variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            default:
                // Unhandled kind of SV record; the output parameters keep what was written so far.
                return 1;
        }
    }
    if (verbosity >= 3)
        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ") "
                  << __LINE__ << " (last interim)\n";
    // Append the remainder of the contig behind the last SV.
    append(seq, infix(contig, lastPos, length(contig)));
    if (methSimOptions && methSimOptions->simulateMethylationLevels)
    {
        append(levelsLargeVariants->forward, infix(levels->forward, lastPos, length(contig)));
        append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, length(contig)));

        // There must be one forward and one reverse level for each character of seq.
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->forward));
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->reverse));

        fixVariationLevels(*levelsLargeVariants, *rng, seq, varPoints, *methSimOptions);
    }
    if (currentPos != length(seq))
        appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, length(contig),
                                               '+', GenomicInterval::NORMAL));

    // Copy out small variant infos for trailing characters.
    for (; itSmallVar != smallVarInfos.end(); ++itSmallVar)
    {
        int offset = (int)currentPos - lastPos;
        varInfos.push_back(*itSmallVar);
        varInfos.back().pos += offset;
    }

    // Build the interval trees of the positionMap.  The first tree is keyed by the sv* coordinates of the
    // intervals, the second one by their smallVar* coordinates.
    seqan::String<PositionMap::TInterval> svIntervals, svIntervalsSTL;
    for (unsigned i = 0; i < length(intervals); ++i)
        appendValue(svIntervals, PositionMap::TInterval(
                intervals[i].svBeginPos, intervals[i].svEndPos, intervals[i]));
    for (unsigned i = 0; i < length(intervals); ++i)
        if (intervals[i].smallVarBeginPos != -1)  // ignore insertions
            appendValue(svIntervalsSTL, PositionMap::TInterval(
                    intervals[i].smallVarBeginPos, intervals[i].smallVarEndPos, intervals[i]));
    createIntervalTree(positionMap.svIntervalTree, svIntervals);
    createIntervalTree(positionMap.svIntervalTreeSTL, svIntervalsSTL);

    return 0;
}
Esempio n. 6
0
// Apply the SNPs and small indels of one haplotype to a contig.
//
// The SNP and small indel arrays of `variants` are traversed in parallel; in each step the record whose getPos()
// compares smaller is handled.  Records are presumably sorted by position and non-overlapping -- this is only
// checked through assertions, which now run BEFORE the corresponding infix of `contig` is taken.
//
// Parameters:
//   seq                  Output sequence.  Contig and variant characters are appended; it is not cleared here.
//   journal              Re-initialized to length(contig); receives one recordInsertion()/recordErase() per
//                        applied small indel.
//   levelsSmallVariants  Output methylation levels for seq.  Cleared if non-null; only written to when
//                        methylation level simulation is enabled.
//   smallVarInfos        Receives one SmallVarInfo per SNP, per inserted character, and per deletion, with
//                        positions given in seq.  Entries are appended; the vector is not cleared here.
//   contig               The sequence the variants are applied to.
//   variants             Provides the `snps` and `smallIndels` records.
//   levels               Methylation levels of contig; only read when methylation level simulation is enabled.
//   hId                  Only records whose `haplotype` equals hId are applied, all others are skipped.
//
// Returns 0 in all cases.
int VariantMaterializer::_materializeSmallVariants(
        seqan::Dna5String & seq,
        TJournalEntries & journal,
        MethylationLevels * levelsSmallVariants,
        std::vector<SmallVarInfo> & smallVarInfos,
        seqan::Dna5String const & contig,
        Variants const & variants,
        MethylationLevels const * levels,
        int hId)
{
    if (methSimOptions)
    {
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsSmallVariants != 0));
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
    }
    // Whether methylation levels have to be carried along; evaluated once instead of at every use.
    bool const simulateLevels = methSimOptions && methSimOptions->simulateMethylationLevels;

    // Clear journal and output methylation levels.
    reinit(journal, length(contig));
    if (levelsSmallVariants)
        levelsSmallVariants->clear();
    // Store variation points with a flag whether it is a SNP (true) or a breakpoint (false).
    seqan::String<std::pair<int, bool> > varPoints;

    // For this, we have to iterate in parallel over SNP and small indel records.
    //
    // Current index in snp/small indel array.
    unsigned snpsIdx = 0;
    unsigned smallIndelIdx = 0;
    // Current SNP record, default to sentinel.
    SnpRecord snpRecord;
    snpRecord.rId = seqan::maxValue<int>();
    if (snpsIdx < length(variants.snps))
        snpRecord = variants.snps[snpsIdx++];
    // Current small indel record, default to sentinel.
    SmallIndelRecord smallIndelRecord;
    smallIndelRecord.rId = seqan::maxValue<int>();
    if (smallIndelIdx < length(variants.smallIndels))
        smallIndelRecord = variants.smallIndels[smallIndelIdx++];
    // Track last position from contig appended to seq so far.
    int lastPos = 0;
    if (verbosity >= 3)
        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

    // TODO(holtgrew): Extract contig building into their own functions.
    if (verbosity >= 2)
        std::cerr << "building output\n";
    // Loop until both record streams have reached their sentinel.
    while (snpRecord.rId != seqan::maxValue<int>() || smallIndelRecord.rId != seqan::maxValue<int>())
    {
        // TODO(holtgrew): Extract SNP and small indel handling into their own functions.
        if (snpRecord.getPos() < smallIndelRecord.getPos())  // process SNP records
        {
            if (snpRecord.haplotype == hId)  // Ignore all but the current contig.
            {
                if (verbosity >= 3)
                    std::cerr << "append(seq, infix(contig, " << lastPos << ", " << snpRecord.pos << ") " << __LINE__ << "\n";
                // Validate the interim range before it is used to build an infix.
                SEQAN_ASSERT_GEQ(snpRecord.pos, lastPos);
                // Append interim sequence and methylation levels.
                append(seq, infix(contig, lastPos, snpRecord.pos));
                if (simulateLevels)
                {
                    // The levels are copied including the SNP position itself; the SNP character is appended to
                    // seq below, so both have the same length again afterwards.
                    append(levelsSmallVariants->forward, infix(levels->forward, lastPos, snpRecord.pos + 1));
                    append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, snpRecord.pos + 1));
                    appendValue(varPoints, std::make_pair((int)length(seq), true));      // variation points before/after SNP
                    appendValue(varPoints, std::make_pair((int)length(seq) + 1, true));
                }

                if (verbosity >= 3)
                    std::cerr << "appendValue(seq, '" << snpRecord.to << "')\n";
                appendValue(seq, snpRecord.to);
                lastPos = snpRecord.pos + 1;
                if (verbosity >= 3)
                    std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

                // Register SNP as small variant info.
                smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::SNP, length(seq) - 1, 1));
            }

            // Advance to the next SNP record or switch to the sentinel.
            if (snpsIdx >= length(variants.snps))
                snpRecord.rId = seqan::maxValue<int>();
            else
                snpRecord = variants.snps[snpsIdx++];
        }
        else
        {
            if (smallIndelRecord.haplotype == hId)  // Ignore all but the current contig.
            {
                if (smallIndelRecord.size > 0)  // insertion
                {
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") "
                                  << __LINE__ << "\n";

                    // Simulate methylation levels for insertion.
                    MethylationLevels lvls;
                    if (simulateLevels)
                    {
                        MethylationLevelSimulator methSim(*rng, *methSimOptions);
                        methSim.run(lvls, smallIndelRecord.seq);
                    }

                    // Validate the interim range before it is used to build an infix.
                    SEQAN_ASSERT_GEQ(smallIndelRecord.pos, lastPos);
                    // Append interim sequence and methylation levels.
                    append(seq, infix(contig, lastPos, smallIndelRecord.pos));
                    if (simulateLevels)
                    {
                        append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos));
                        append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos));
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point before insertion
                    }

                    if (verbosity >= 3)
                        std::cerr << "append(seq, \"" << smallIndelRecord.seq << "\") " << __LINE__ << "\n";
                    // Register insertion as small variant info, one entry per inserted character.
                    for (unsigned i = 0; i < length(smallIndelRecord.seq); ++i)
                        smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::INS, length(seq) + i, 1));
                    // Append novel sequence and methylation levels.
                    append(seq, smallIndelRecord.seq);
                    if (simulateLevels)
                    {
                        append(levelsSmallVariants->forward, lvls.forward);
                        append(levelsSmallVariants->reverse, lvls.reverse);
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
                    }
                    // An insertion consumes no contig characters.
                    lastPos = smallIndelRecord.pos;
                    recordInsertion(journal, hostToVirtualPosition(journal, smallIndelRecord.pos),
                                    0, smallIndelRecord.size);
                    if (verbosity >= 3)
                        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
                }
                else  // deletion
                {
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") " << __LINE__ << "\n";
                    // Validate the interim range before it is used to build an infix (same check as in the SNP
                    // and insertion cases).
                    SEQAN_ASSERT_GEQ(smallIndelRecord.pos, lastPos);
                    // Append interim sequence and methylation levels.
                    append(seq, infix(contig, lastPos, smallIndelRecord.pos));  // interim chars
                    if (simulateLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point at deletion
                        append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos));
                        append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos));
                    }

                    // In this branch size is <= 0, so subtracting it moves lastPos forward past the deleted
                    // contig characters.
                    lastPos = smallIndelRecord.pos - smallIndelRecord.size;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));
                    recordErase(journal,
                                hostToVirtualPosition(journal, smallIndelRecord.pos),
                                hostToVirtualPosition(journal, smallIndelRecord.pos - smallIndelRecord.size));
                    if (verbosity >= 3)
                        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

                    // Register deletion as small variant info.
                    smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::DEL, length(seq), -smallIndelRecord.size));
                }
            }

            // Advance to the next small indel record or switch to the sentinel.
            if (smallIndelIdx >= length(variants.smallIndels))
                smallIndelRecord.rId = seqan::maxValue<int>();
            else
                smallIndelRecord = variants.smallIndels[smallIndelIdx++];
        }
    }
    // Insert remaining characters.
    if (verbosity >= 3)
        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ")\n";
    append(seq, infix(contig, lastPos, length(contig)));

    if (simulateLevels)
    {
        append(levelsSmallVariants->forward, infix(levels->forward, lastPos, length(contig)));
        append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, length(contig)));

        SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->forward));
        SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->reverse));

        fixVariationLevels(*levelsSmallVariants, *rng, seq, varPoints, *methSimOptions);
    }

    return 0;
}