void GenomicRegionCollection<T>::CreateTreeMap() {

  // Nothing to index for an empty collection; m_tree is left as it is.
  if (m_grv->size() == 0)
    return;

  // Make sure the regions are coordinate-sorted before their indices are recorded.
  if (!m_sorted)
    CoordinateSort();

  // Group the regions by chromosome. Each interval carries the index of its
  // region within m_grv as the third constructor argument.
  GenomicIntervalMap byChrom;
  const size_t regionCount = m_grv->size();
  for (size_t idx = 0; idx != regionCount; ++idx)
    byChrom[m_grv->at(idx).chr].push_back(
        GenomicInterval(m_grv->at(idx).pos1, m_grv->at(idx).pos2, idx));

  // Build one interval tree per chromosome: add a new entry to m_tree, or
  // overwrite the tree already stored under that chromosome key.
  GenomicIntervalMap::iterator bucket = byChrom.begin();
  while (bucket != byChrom.end()) {
    GenomicIntervalTreeMap::iterator existing = m_tree->find(bucket->first);
    if (existing == m_tree->end())
      m_tree->insert(std::pair<int, GenomicIntervalTree>(bucket->first, GenomicIntervalTree(bucket->second)));
    else
      existing->second = GenomicIntervalTree(bucket->second);
    ++bucket;
  }

}
Ejemplo n.º 2
0
// Parse a region string of the form "chr2:5000000-100000000" into a GenomicInterval.
//
// The first three characters of the chromosome field are skipped without being
// inspected (the expected format puts "chr" there); what follows up to ':' is
// the chromosome number, the text between ':' and the next '-' is the start
// coordinate, and the text after that '-' is the end coordinate. Each field is
// converted with stoi.
//
// On malformed input -- no ':', no '-' after the ':', a chromosome field shorter
// than three characters, or a field stoi cannot convert to int -- the error
// message is written to cerr and the process exits with status 1. On success the
// parsed triple is echoed to cerr before the interval is returned.
GenomicInterval parseInputInterval(string const & location) {
	const size_t colon = location.find(':');
	const size_t dash = (colon == string::npos) ? string::npos : location.find('-', colon);

	int chr = 0;
	int start_coord = 0;
	int end_coord = 0;

	// colon >= 3 guarantees the three-character prefix can be skipped without
	// running past the end of the chromosome field.
	bool parsed = (colon != string::npos) && (dash != string::npos) && (colon >= 3);
	if (parsed) {
		try {
			chr = stoi( location.substr(3, colon - 3) );
			// Length excludes the '-' separator itself.
			start_coord = stoi( location.substr(colon + 1, dash - colon - 1) );
			end_coord = stoi( location.substr(dash + 1) );
		}
		catch (...) {
			// stoi rejects empty/non-numeric fields and values that do not fit in
			// an int; report those the same way as a structurally bad string.
			parsed = false;
		}
	}

	if (!parsed) {
		cerr << "[ERROR] Can not parse the input interval. Expected format: chr2:5000000-100000000" << endl;
		exit(1);
	}

	cerr << chr << " " << start_coord << " " << end_coord << endl;

	return GenomicInterval(chr, start_coord, end_coord);
}
Ejemplo n.º 3
0
// Builds the sequence with large (structural) variants applied for haplotype hId.
//
// Walks variants.svRecords in order, skipping records whose haplotype differs from hId.  For each remaining
// record it appends to seq the stretch of contig since the previous record ("interim chars"), followed by the
// record-specific rearrangement (insertion, deletion, inversion, translocation, duplication).  Along the way it
//   - appends the matching methylation levels to *levelsLargeVariants (only when methSimOptions is set and
//     methSimOptions->simulateMethylationLevels is true),
//   - copies entries of smallVarInfos into varInfos with adjusted positions,
//   - appends (position in seq, variants.posToIdx(Variants::SV, i)) pairs to breakpoints,
//   - collects GenomicInterval records from which positionMap.svIntervalTree and
//     positionMap.svIntervalTreeSTL are created at the end.
//
// seq, varInfos and breakpoints are only appended to here; this function does not clear them.
// journal is used only through hostToVirtualPosition() to translate the SV record positions and sizes.
// methSimOptions, rng and verbosity are not declared in this function (presumably members of
// VariantMaterializer -- confirm).
//
// Returns 0 on success.  Returns 1 as soon as an SV record of an unhandled kind is met; in that case the
// outputs are left partially filled and the interval trees are not built.
int VariantMaterializer::_materializeLargeVariants(
        seqan::Dna5String & seq,
        MethylationLevels * levelsLargeVariants,
        std::vector<SmallVarInfo> & varInfos,
        std::vector<std::pair<int, int> > & breakpoints,
        PositionMap & positionMap,
        TJournalEntries const & journal,
        seqan::Dna5String const & contig,
        std::vector<SmallVarInfo> const & smallVarInfos,
        Variants const & variants,
        MethylationLevels const * levels,
        int hId)
{
    // With methylation options present, both level pointers must be non-null exactly when simulation of
    // methylation levels is enabled.
    if (methSimOptions)
    {
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsLargeVariants != 0));
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
    }

    // We will record all intervals for the positionMap.svIntervalTree in this String.
    seqan::String<GenomicInterval> intervals;

    // Clear output methylation levels.
    if (levelsLargeVariants)
        levelsLargeVariants->clear();
    // Store variation points.  We reuse the fixVariationLevels() function from small indel/snp simulation and thus
    // have to store a bool that is always set to false.
    seqan::String<std::pair<int, bool> > varPoints;

    // Track last position from contig appended to seq so far.
    int lastPos = 0;
    if (verbosity >= 3)
        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

    // Pointer to the current small variant to write out translated to varInfo.
    std::vector<SmallVarInfo>::const_iterator itSmallVar = smallVarInfos.begin();

    // Number of bytes written out so far/current position in variant.
    unsigned currentPos = 0;

    for (unsigned i = 0; i < length(variants.svRecords); ++i)
    {
        if (variants.svRecords[i].haplotype != hId)  // Skip records that belong to another haplotype.
            continue;
        // We obtain a copy of the current SV record since we translate its positions below.
        StructuralVariantRecord svRecord = variants.svRecords[i];

        // Translate positions and lengths of SV record.
        if (verbosity >= 2)
            std::cerr << "  Translating SvRecord\n  " << svRecord << '\n';
        svRecord.pos = hostToVirtualPosition(journal, svRecord.pos);
        SEQAN_ASSERT_LT(svRecord.pos, (int)length(contig));
        // We do not need to adjust the sizes for insertions.
        if (svRecord.kind != StructuralVariantRecord::INDEL || svRecord.size < 0)
            svRecord.size = hostToVirtualPosition(journal, svRecord.pos + svRecord.size) -
                    hostToVirtualPosition(journal, svRecord.pos);
        // A targetPos of -1 is left untranslated.
        if (svRecord.targetPos != -1)
            svRecord.targetPos = hostToVirtualPosition(journal, svRecord.targetPos);
        if (verbosity >= 2)
            std::cerr << "  => " << svRecord << '\n';

        // Copy out small variant infos for interim chars.
        for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
        {
            // currentPos is the length of seq before the interim stretch is appended and lastPos is where that
            // stretch starts in contig, so their difference shifts a contig position into seq.
            int offset = (int)currentPos - lastPos;
            varInfos.push_back(*itSmallVar);
            varInfos.back().pos += offset;
        }

        // Copy from contig to seq with SVs.
        if (verbosity >= 3)
            std::cerr << "lastPos == " << lastPos << "\n";
        append(seq, infix(contig, lastPos, svRecord.pos));  // interim chars
        if (methSimOptions && methSimOptions->simulateMethylationLevels)
        {
            append(levelsLargeVariants->forward, infix(levels->forward, lastPos, svRecord.pos));
            append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, svRecord.pos));
            appendValue(varPoints, std::make_pair((int)length(seq), false));
        }
        // Record the interim stretch (if non-empty): [currentPos, length(seq)) in seq was copied from
        // [lastPos, svRecord.pos) in contig.
        if (currentPos != length(seq))
            appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, svRecord.pos,
                                                   '+', GenomicInterval::NORMAL));
        currentPos = length(seq);
        if (verbosity >= 3)
            std::cerr << "append(seq, infix(contig, " << lastPos << ", " << svRecord.pos << ") " << __LINE__ << " (interim)\n";
        switch (svRecord.kind)
        {
            case StructuralVariantRecord::INDEL:
                {
                    if (svRecord.size > 0)  // insertion
                    {
                        SEQAN_ASSERT_EQ((int)length(svRecord.seq), svRecord.size);

                        // Simulate methylation levels for insertion.
                        MethylationLevels lvls;
                        if (methSimOptions && methSimOptions->simulateMethylationLevels)
                        {
                            MethylationLevelSimulator methSim(*rng, *methSimOptions);
                            methSim.run(lvls, svRecord.seq);
                        }

                        // Append novel sequence and methylation levels.
                        append(seq, svRecord.seq);
                        if (methSimOptions && methSimOptions->simulateMethylationLevels)
                        {
                            append(levelsLargeVariants->forward, lvls.forward);
                            append(levelsLargeVariants->reverse, lvls.reverse);
                            appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
                        }
                        // Inserted sequence has no counterpart in contig, hence the -1/-1 source positions.
                        if (currentPos != length(seq))
                            appendValue(intervals, GenomicInterval(currentPos, length(seq), -1, -1,
                                                                   '+', GenomicInterval::INSERTED));
                        if (verbosity >= 3)
                            std::cerr << "append(seq, svRecord.seq (length == " << length(svRecord.seq) << ") " << __LINE__ << " (insertion)\n";
                        // An insertion consumes nothing from contig; copying resumes at svRecord.pos.
                        lastPos = svRecord.pos;
                        SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                        // Copy out breakpoints.
                        breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                        breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                        currentPos = length(seq);
                    }
                    else  // deletion
                    {
                        // svRecord.size is <= 0 in this branch, so this moves lastPos forward by -size and the
                        // corresponding part of contig is never appended to seq.
                        lastPos = svRecord.pos - svRecord.size;
                        SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                        // Copy out breakpoint.
                        breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    }
                }
                break;
            case StructuralVariantRecord::INVERSION:
                {
                    // Append the segment as-is first; it is reverse-complemented in place further below.
                    unsigned oldLen = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point at end of inverted segment
                        // The forward levels of the inverted segment are taken from the reverse levels of the
                        // source (and vice versa); each appended part is then reversed in place.
                        append(levelsLargeVariants->forward, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                        reverse(infix(levelsLargeVariants->forward, oldLen, length(levelsLargeVariants->forward)));
                        append(levelsLargeVariants->reverse, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        reverse(infix(levelsLargeVariants->reverse, oldLen, length(levelsLargeVariants->reverse)));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '-', GenomicInterval::INVERTED));

                    // Copy out small variant infos for inversion.
                    // NOTE(review): lastPos still holds the contig start of the preceding interim stretch here (it
                    // is only updated below); confirm that this, rather than svRecord.pos, is the intended
                    // reference point for mirroring the position.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                    {
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos = currentPos + svRecord.size - (varInfos.back().pos - lastPos);
                    }

                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (inversion)\n";
                    reverseComplement(infix(seq, oldLen, length(seq)));
                    lastPos = svRecord.pos + svRecord.size;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out breakpoints.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            case StructuralVariantRecord::TRANSLOCATION:
                {
                    // The two contig pieces are written in swapped order: first [pos + size, targetPos), then
                    // [pos, pos + size).
                    SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                    append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
                                                               '+', GenomicInterval::NORMAL));
                    // Start (in seq) of the moved segment appended next.
                    unsigned tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::NORMAL));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << " (translocation)\n"
                                  << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                    lastPos = svRecord.targetPos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out small variant infos for translocation, shift left to right and right to left but keep
                    // center intact.
                    // NOTE(review): the interim loop at the top of this iteration already advanced itSmallVar past
                    // every entry with pos < svRecord.pos, and neither has changed since, so this loop body cannot
                    // execute as written.  lastPos has also already been set to targetPos above.  Small variants
                    // inside the translocated range are therefore picked up by a later interim/trailing loop instead
                    // -- confirm whether the loop bound should be svRecord.targetPos.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
                    {
                        int offset = (int)currentPos - lastPos;
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += offset;

                        int bpLeft = svRecord.pos + svRecord.size;
                        int bpRight = svRecord.targetPos;
                        if (itSmallVar->pos < bpLeft)
                            varInfos.back().pos -= (svRecord.targetPos - svRecord.pos);
                        else if (itSmallVar->pos >= bpRight)
                            varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                    }

                    // Copy out breakpoints.
                    // Three breakpoints: start of the rearranged region, the junction between the two swapped
                    // pieces, and the end of the rearranged region.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.targetPos - svRecord.pos - svRecord.size, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            case StructuralVariantRecord::DUPLICATION:
                {
                    // Written in three pieces: [pos, pos + size) (first copy), [pos + size, targetPos), and
                    // [pos, pos + size) again (second copy).
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)  // first copy
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::DUPLICATED));
                    unsigned tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
                                                               '+', GenomicInterval::NORMAL));
                    tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)  // second copy
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::NORMAL));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (duplication)\n"
                                  << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << "\n"
                                  << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                    lastPos = svRecord.targetPos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Write out small variant infos for duplication.
                    // Each small variant inside the duplicated segment is emitted twice, once per copy.
                    // NOTE(review): offset is computed from lastPos, which was already set to targetPos just
                    // above; confirm this is the intended reference.  The inner condition repeats the loop
                    // condition and is therefore always true.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                    {
                        int offset = (int)currentPos - lastPos;
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += offset;

                        if (itSmallVar->pos < svRecord.pos + svRecord.size)
                        {
                            varInfos.push_back(*itSmallVar);
                            varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                        }
                    }

                    // Copy out breakpoints.
                    // Four breakpoints: start of the first copy, end of the first copy (currentPos + size), end of
                    // the middle piece (currentPos + targetPos - pos), and end of the second copy.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos + svRecord.targetPos - (svRecord.pos + svRecord.size), variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            default:
                // Unhandled SV kind: abort with a non-zero status; the interval trees below are not built.
                return 1;
        }
    }
    // Append the remainder of contig after the last processed SV record.
    if (verbosity >= 3)
        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ") "
                  << __LINE__ << " (last interim)\n";
    append(seq, infix(contig, lastPos, length(contig)));
    if (methSimOptions && methSimOptions->simulateMethylationLevels)
    {
        append(levelsLargeVariants->forward, infix(levels->forward, lastPos, length(contig)));
        append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, length(contig)));

        // The level strings must have stayed in step with seq.
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->forward));
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->reverse));

        fixVariationLevels(*levelsLargeVariants, *rng, seq, varPoints, *methSimOptions);
    }
    if (currentPos != length(seq))
        appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, length(contig),
                                               '+', GenomicInterval::NORMAL));

    // Copy out small variant infos for trailing characters.
    for (; itSmallVar != smallVarInfos.end(); ++itSmallVar)
    {
        int offset = (int)currentPos - lastPos;
        varInfos.push_back(*itSmallVar);
        varInfos.back().pos += offset;
    }

    // Build the interval trees of the positionMap.
    // svIntervals is keyed by svBeginPos/svEndPos; svIntervalsSTL is keyed by smallVarBeginPos/smallVarEndPos
    // and leaves out intervals whose smallVarBeginPos is -1.
    seqan::String<PositionMap::TInterval> svIntervals, svIntervalsSTL;
    for (unsigned i = 0; i < length(intervals); ++i)
        appendValue(svIntervals, PositionMap::TInterval(
                intervals[i].svBeginPos, intervals[i].svEndPos, intervals[i]));
    for (unsigned i = 0; i < length(intervals); ++i)
        if (intervals[i].smallVarBeginPos != -1)  // ignore insertions
            appendValue(svIntervalsSTL, PositionMap::TInterval(
                    intervals[i].smallVarBeginPos, intervals[i].smallVarEndPos, intervals[i]));
    createIntervalTree(positionMap.svIntervalTree, svIntervals);
    createIntervalTree(positionMap.svIntervalTreeSTL, svIntervalsSTL);

    return 0;
}