void GenomicRegionCollection<T>::CreateTreeMap() {

  // Builds one interval tree per chromosome from the regions held in m_grv and
  // stores it in m_tree, replacing the tree already stored for that chromosome
  // if there is one.  Does nothing when the collection is empty.
  if (m_grv->size() == 0)
    return;

  // The regions are coordinate-sorted before the intervals are collected.
  if (!m_sorted)
    CoordinateSort();

  // Bucket the regions by chromosome.  Every interval carries the index of its
  // region within m_grv as payload.
  GenomicIntervalMap intervalsByChr;
  size_t idx = 0;
  while (idx < m_grv->size()) {
    GenomicInterval ival(m_grv->at(idx).pos1, m_grv->at(idx).pos2, idx);
    intervalsByChr[m_grv->at(idx).chr].push_back(ival);
    ++idx;
  }

  // Turn each chromosome's bucket into a tree: insert it when the chromosome is
  // new to m_tree, otherwise overwrite the tree that is already there.
  GenomicIntervalMap::iterator chrIt = intervalsByChr.begin();
  while (chrIt != intervalsByChr.end()) {
    GenomicIntervalTreeMap::iterator existing = m_tree->find(chrIt->first);
    if (existing == m_tree->end()) {
      m_tree->insert(std::pair<int, GenomicIntervalTree>(chrIt->first, GenomicIntervalTree(chrIt->second)));
    } else {
      existing->second = GenomicIntervalTree(chrIt->second);
    }
    ++chrIt;
  }
}
/**
 * Parses a region string such as "chr2:5000000-100000000" into a GenomicInterval.
 *
 * The first three characters of the chromosome field are skipped without being
 * inspected (they are expected to be "chr"); the rest of that field and the two
 * coordinates are converted with stoi, which ignores trailing non-digit
 * characters.  The parsed triple is echoed to cerr before returning.
 *
 * Any malformed input -- missing ':' or '-', a chromosome field of three
 * characters or fewer, a non-numeric or out-of-range number -- prints the
 * format error to cerr and terminates the process with exit code 1.
 *
 * @param location  Region string in the form chr<N>:<start>-<end>.
 * @return GenomicInterval(chr, start, end) built from the three parsed integers.
 */
GenomicInterval parseInputInterval(string const & location) {
    static char const * const kFormatError =
        "[ERROR] Can not parse the input interval. Expected format: chr2:5000000-100000000";
    // Number of leading characters dropped from the chromosome field.
    size_t const kPrefixLen = 3;

    size_t const colon = location.find(':');
    size_t const dash = (colon == string::npos) ? string::npos : location.find('-', colon);

    // The colon must lie past the prefix, otherwise there is no chromosome
    // number left to read (and the length computation below would wrap around).
    if (colon == string::npos || dash == string::npos || colon <= kPrefixLen) {
        cerr << kFormatError << endl;
        exit(1);
    }

    int chr = 0;
    int start_coord = 0;
    int end_coord = 0;
    try {
        chr = stoi(location.substr(kPrefixLen, colon - kPrefixLen));
        // Exactly the characters between ':' and '-', excluding both.
        start_coord = stoi(location.substr(colon + 1, dash - colon - 1));
        end_coord = stoi(location.substr(dash + 1));
    } catch (...) {
        // stoi reports non-numeric text and out-of-range values by throwing;
        // route both through the same error path as the structural checks
        // instead of letting the exception escape.
        cerr << kFormatError << endl;
        exit(1);
    }

    cerr << chr << " " << start_coord << " " << end_coord << endl;
    return GenomicInterval(chr, start_coord, end_coord);
}
/**
 * Applies the structural variants (SVs) of one haplotype to a contig and records the position mapping.
 *
 * Iterates over variants.svRecords in order and skips records whose haplotype differs from hId.  For each
 * remaining record, the stretch of contig between the previous variant and this one is appended to seq
 * unchanged, then the variant itself is written out (insertion, deletion, inversion, translocation or
 * duplication).  After the last record the rest of contig is appended.  Along the way the function collects
 * translated small variant infos, breakpoints and the intervals from which the two interval trees of
 * positionMap are built.
 *
 * @param seq                  Output sequence; this function only appends to it.
 * @param levelsLargeVariants  Output methylation levels; cleared first when non-null and written only when
 *                             methSimOptions->simulateMethylationLevels is set.
 * @param varInfos             Output; small variant infos with adjusted positions are appended.
 * @param breakpoints          Output; pairs of (position in seq, variants.posToIdx(Variants::SV, i)) are
 *                             appended for every materialized record.
 * @param positionMap          Output; svIntervalTree and svIntervalTreeSTL are built at the end.
 * @param journal              Passed to hostToVirtualPosition() to translate SV positions and sizes.
 * @param contig               Sequence the SVs are cut from; presumably it already contains the small
 *                             variants, since SV positions are translated through journal first -- TODO
 *                             confirm with caller.
 * @param smallVarInfos        Small variant infos, consumed front to back; presumably sorted by pos --
 *                             TODO confirm with caller.
 * @param variants             Provides the SV records and the posToIdx() mapping.
 * @param levels               Methylation levels belonging to contig; read only when methylation
 *                             simulation is enabled.
 * @param hId                  Haplotype id; only records with this haplotype are materialized.
 *
 * @return 0 on success; 1 if a record has a kind the switch does not handle, in which case the outputs
 *         have already been partially written.
 */
int VariantMaterializer::_materializeLargeVariants(
        seqan::Dna5String & seq,
        MethylationLevels * levelsLargeVariants,
        std::vector<SmallVarInfo> & varInfos,
        std::vector<std::pair<int, int> > & breakpoints,
        PositionMap & positionMap,
        TJournalEntries const & journal,
        seqan::Dna5String const & contig,
        std::vector<SmallVarInfo> const & smallVarInfos,
        Variants const & variants,
        MethylationLevels const * levels,
        int hId)
{
    // Both level pointers must be non-null exactly when methylation simulation is switched on; the code
    // below dereferences them under that flag without further checks.
    if (methSimOptions)
    {
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsLargeVariants != 0));
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
    }

    // We will record all intervals for the positionMap.svIntervalTree in this String.
    seqan::String<GenomicInterval> intervals;

    // Clear output methylation levels.
    if (levelsLargeVariants)
        levelsLargeVariants->clear();
    // Store variation points.  We reuse the fixVariationLevels() function from small indel/snp simulation and thus
    // have to store a bool that is always set to false.
    seqan::String<std::pair<int, bool> > varPoints;

    // Track last position from contig appended to seq so far.
    int lastPos = 0;
    if (verbosity >= 3)
        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

    // Pointer to the current small variant to write out translated to varInfo.
    std::vector<SmallVarInfo>::const_iterator itSmallVar = smallVarInfos.begin();

    // Number of characters written to seq so far, i.e. the current position in the output.
    unsigned currentPos = 0;

    for (unsigned i = 0; i < length(variants.svRecords); ++i)
    {
        if (variants.svRecords[i].haplotype != hId)  // Skip records that belong to another haplotype.
            continue;
        // We obtain a copy of the current SV record since we translate its positions below.
        StructuralVariantRecord svRecord = variants.svRecords[i];

        // Translate positions and lengths of SV record.
        if (verbosity >= 2)
            std::cerr << " Translating SvRecord\n " << svRecord << '\n';
        svRecord.pos = hostToVirtualPosition(journal, svRecord.pos);
        SEQAN_ASSERT_LT(svRecord.pos, (int)length(contig));
        // We do not need to adjust the sizes for insertions (INDEL records with a non-negative size).
        if (svRecord.kind != StructuralVariantRecord::INDEL || svRecord.size < 0)
            svRecord.size = hostToVirtualPosition(journal, svRecord.pos + svRecord.size) -
                    hostToVirtualPosition(journal, svRecord.pos);
        if (svRecord.targetPos != -1)
            svRecord.targetPos = hostToVirtualPosition(journal, svRecord.targetPos);
        if (verbosity >= 2)
            std::cerr << " => " << svRecord << '\n';

        // Copy out small variant infos for interim chars, shifted by the distance between the output
        // position and the contig position.
        for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
        {
            int offset = (int)currentPos - lastPos;
            varInfos.push_back(*itSmallVar);
            varInfos.back().pos += offset;
        }

        // Copy from contig to seq with SVs.
        if (verbosity >= 3)
            std::cerr << "lastPos == " << lastPos << "\n";
        append(seq, infix(contig, lastPos, svRecord.pos));  // interim chars
        if (methSimOptions && methSimOptions->simulateMethylationLevels)
        {
            append(levelsLargeVariants->forward, infix(levels->forward, lastPos, svRecord.pos));
            append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, svRecord.pos));
            appendValue(varPoints, std::make_pair((int)length(seq), false));
        }
        // Record the interim stretch as an interval unless it is empty.
        // NOTE(review): the six GenomicInterval arguments appear to be (begin in seq, end in seq, begin in
        // contig, end in contig, strand, kind), judging by the fields read when the trees are built below
        // -- confirm against the GenomicInterval declaration.
        if (currentPos != length(seq))
            appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, svRecord.pos,
                                                   '+', GenomicInterval::NORMAL));
        currentPos = length(seq);
        if (verbosity >= 3)
            std::cerr << "append(seq, infix(contig, " << lastPos << ", " << svRecord.pos << ") "
                      << __LINE__ << " (interim)\n";

        switch (svRecord.kind)
        {
            case StructuralVariantRecord::INDEL:
                {
                    if (svRecord.size > 0)  // insertion
                    {
                        SEQAN_ASSERT_EQ((int)length(svRecord.seq), svRecord.size);

                        // Simulate methylation levels for insertion.
                        MethylationLevels lvls;
                        if (methSimOptions && methSimOptions->simulateMethylationLevels)
                        {
                            MethylationLevelSimulator methSim(*rng, *methSimOptions);
                            methSim.run(lvls, svRecord.seq);
                        }

                        // Append novel sequence and methylation levels.
                        append(seq, svRecord.seq);
                        if (methSimOptions && methSimOptions->simulateMethylationLevels)
                        {
                            append(levelsLargeVariants->forward, lvls.forward);
                            append(levelsLargeVariants->reverse, lvls.reverse);
                            appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
                        }
                        // Inserted sequence has no counterpart in contig, hence the -1 coordinates.
                        if (currentPos != length(seq))
                            appendValue(intervals, GenomicInterval(currentPos, length(seq), -1, -1,
                                                                   '+', GenomicInterval::INSERTED));
                        if (verbosity >= 3)
                            std::cerr << "append(seq, svRecord.seq (length == " << length(svRecord.seq) << ") "
                                      << __LINE__ << " (insertion)\n";
                        // Nothing of contig was consumed; continue copying from the insertion point.
                        lastPos = svRecord.pos;
                        SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                        // Copy out breakpoints (start and end of the inserted sequence in seq).
                        breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                        breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                        currentPos = length(seq);
                    }
                    else  // deletion
                    {
                        // svRecord.size is not positive here, so this moves lastPos forward past the
                        // deleted characters; nothing is appended to seq.
                        lastPos = svRecord.pos - svRecord.size;
                        SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                        // Copy out breakpoint.
                        breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    }
                }
                break;
            case StructuralVariantRecord::INVERSION:
                {
                    // Append the segment as is; it is reverse-complemented in place further down.
                    unsigned oldLen = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after the inverted segment
                        // The strands swap roles: forward levels are taken from the reverse strand and
                        // vice versa, and each appended stretch is reversed.
                        append(levelsLargeVariants->forward, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                        reverse(infix(levelsLargeVariants->forward, oldLen, length(levelsLargeVariants->forward)));
                        append(levelsLargeVariants->reverse, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        reverse(infix(levelsLargeVariants->reverse, oldLen, length(levelsLargeVariants->reverse)));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos,
                                                               svRecord.pos + svRecord.size, '-',
                                                               GenomicInterval::INVERTED));

                    // Copy out small variant infos for inversion.
                    // NOTE(review): lastPos still holds the value from before this record at this point;
                    // the formula presumably mirrors the position within the inverted segment -- confirm.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                    {
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos = currentPos + svRecord.size - (varInfos.back().pos - lastPos);
                    }

                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", "
                                  << svRecord.pos + svRecord.size << ") " << __LINE__ << " (inversion)\n";
                    reverseComplement(infix(seq, oldLen, length(seq)));
                    lastPos = svRecord.pos + svRecord.size;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out breakpoints (start and end of the inverted segment in seq).
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            case StructuralVariantRecord::TRANSLOCATION:
                {
                    // The segment [pos, pos + size) and the stretch [pos + size, targetPos) swap places:
                    // first the stretch up to the target is written, then the moved segment.
                    SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                    append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                    }
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq),
                                                               svRecord.pos + svRecord.size,
                                                               svRecord.targetPos,
                                                               '+', GenomicInterval::NORMAL));
                    // Start of the moved segment within seq.
                    unsigned tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq),
                                                               svRecord.pos,
                                                               svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::NORMAL));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", "
                                  << svRecord.targetPos << ") " << __LINE__ << " (translocation)\n"
                                  << "append(seq, infix(contig, " << svRecord.pos << ", "
                                  << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                    lastPos = svRecord.targetPos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out small variant infos for translocation, shift left to right and right to left
                    // but keep center intact.
                    // NOTE(review): the loop condition is the same as that of the interim-character loop
                    // above, and neither itSmallVar nor svRecord.pos changes in between, so as written
                    // this body does not appear to execute -- confirm the intended upper bound.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
                    {
                        int offset = (int)currentPos - lastPos;
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += offset;

                        int bpLeft = svRecord.pos + svRecord.size;
                        int bpRight = svRecord.targetPos;
                        if (itSmallVar->pos < bpLeft)
                            varInfos.back().pos -= (svRecord.targetPos - svRecord.pos);
                        else if (itSmallVar->pos >= bpRight)
                            varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                    }

                    // Copy out breakpoints: start of the variant, junction between the two swapped
                    // parts, and end of the variant in seq.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.targetPos - svRecord.pos - svRecord.size, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            case StructuralVariantRecord::DUPLICATION:
                {
                    // Written in three parts: the segment [pos, pos + size), the stretch up to targetPos,
                    // and the segment once more.
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)  // first copy
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    // Only the first copy is tagged DUPLICATED; the other two parts are tagged NORMAL.
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq),
                                                               svRecord.pos,
                                                               svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::DUPLICATED));
                    unsigned tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq),
                                                               svRecord.pos + svRecord.size,
                                                               svRecord.targetPos,
                                                               '+', GenomicInterval::NORMAL));
                    tmpCurrentPos = length(seq);
                    append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)  // second copy
                    {
                        appendValue(varPoints, std::make_pair((int)length(seq), false));
                        append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                        append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    }
                    if (tmpCurrentPos != length(seq))
                        appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq),
                                                               svRecord.pos,
                                                               svRecord.pos + svRecord.size,
                                                               '+', GenomicInterval::NORMAL));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", "
                                  << svRecord.pos + svRecord.size << ") " << __LINE__ << " (duplication)\n"
                                  << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", "
                                  << svRecord.targetPos << ") " << __LINE__ << "\n"
                                  << "append(seq, infix(contig, " << svRecord.pos << ", "
                                  << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                    lastPos = svRecord.targetPos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Write out small variant infos for duplication: each small variant inside the
                    // duplicated segment is emitted twice.
                    // NOTE(review): offset is computed from lastPos, which was already advanced to
                    // svRecord.targetPos just above, and the inner condition repeats the loop condition
                    // so it always holds -- confirm both are intended.
                    for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                    {
                        int offset = (int)currentPos - lastPos;
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += offset;

                        if (itSmallVar->pos < svRecord.pos + svRecord.size)
                        {
                            varInfos.push_back(*itSmallVar);
                            varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                        }
                    }

                    // Copy out breakpoints: start of the variant, end of the first copy, start of the
                    // second copy, and end of the variant in seq.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos + svRecord.targetPos - (svRecord.pos + svRecord.size), variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                break;
            default:
                // Unhandled SV kind; give up and report failure.
                return 1;
        }
    }
    // Append whatever is left of contig after the last SV.
    if (verbosity >= 3)
        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ") "
                  << __LINE__ << " (last interim)\n";
    append(seq, infix(contig, lastPos, length(contig)));
    if (methSimOptions && methSimOptions->simulateMethylationLevels)
    {
        append(levelsLargeVariants->forward, infix(levels->forward, lastPos, length(contig)));
        append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, length(contig)));

        // Levels and sequence must line up character by character before the variation points are fixed.
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->forward));
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->reverse));

        fixVariationLevels(*levelsLargeVariants, *rng, seq, varPoints, *methSimOptions);
    }
    if (currentPos != length(seq))
        appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, length(contig),
                                               '+', GenomicInterval::NORMAL));

    // Copy out small variant infos for trailing characters.
    for (; itSmallVar != smallVarInfos.end(); ++itSmallVar)
    {
        int offset = (int)currentPos - lastPos;
        varInfos.push_back(*itSmallVar);
        varInfos.back().pos += offset;
    }

    // Build the interval trees of the positionMap: one keyed by the sv* coordinates of each interval and
    // one keyed by its smallVar* coordinates.
    seqan::String<PositionMap::TInterval> svIntervals, svIntervalsSTL;
    for (unsigned i = 0; i < length(intervals); ++i)
        appendValue(svIntervals, PositionMap::TInterval(
                intervals[i].svBeginPos, intervals[i].svEndPos, intervals[i]));
    for (unsigned i = 0; i < length(intervals); ++i)
        if (intervals[i].smallVarBeginPos != -1)  // ignore insertions
            appendValue(svIntervalsSTL, PositionMap::TInterval(
                    intervals[i].smallVarBeginPos, intervals[i].smallVarEndPos, intervals[i]));
    createIntervalTree(positionMap.svIntervalTree, svIntervals);
    createIntervalTree(positionMap.svIntervalTreeSTL, svIntervalsSTL);

    return 0;
}