int main() { // Create an alignment between subject and query. seqan::Peptide subject = "MGLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE" "DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKH" "PGDFGADAQGAMNKALELFRKDMASNYK"; seqan::Peptide query = "MSLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHFDLHPGSA" "QLRAHGSKVVAAVGDAVKSIDDIGGALSKLSELHAYILRVDPVNFKLLSHCLLVTLAARF" "PADFTAEAHAAWDKFLSVTEKYR"; seqan::Align<seqan::Peptide> align; resize(rows(align), 2); setSource(row(align, 0), subject); setSource(row(align, 1), query); seqan::Blosum62 scoringScheme(-1, -12); globalAlignment(align, scoringScheme); // Compute the statistics of the alignment. seqan::AlignmentStats stats; int scoreVal = computeAlignmentStats(stats, align, scoringScheme); SEQAN_ASSERT_EQ(scoreVal, stats.alignmentScore); std::cout << align << "gap opens: " << stats.numGapOpens << "\n" << "gap extensions: " << stats.numGapExtensions << "\n" << "num matches: " << stats.numMatches << "\n" << "num mismatches: " << stats.numMismatches << "\n" << "num positive scores: " << stats.numPositiveScores << "\n" << "num negative scores: " << stats.numNegativeScores << "\n\n\n"; // Clip alignment rows and compute score of this view. setClippedEndPosition(row(align, 0), 100); setClippedEndPosition(row(align, 1), 100); setClippedBeginPosition(row(align, 0), 5); setClippedBeginPosition(row(align, 1), 5); scoreVal = computeAlignmentStats(stats, align, scoringScheme); SEQAN_ASSERT_EQ(scoreVal, stats.alignmentScore); std::cout << "Clipping alignment to (5, 100)\n" << align << "gap opens: " << stats.numGapOpens << "\n" << "gap extensions: " << stats.numGapExtensions << "\n" << "num matches: " << stats.numMatches << "\n" << "num mismatches: " << stats.numMismatches << "\n" << "num positive scores: " << stats.numPositiveScores << "\n" << "num negative scores: " << stats.numNegativeScores << "\n"; return 0; }
// Look up svPos in svIntervalTree and return the interval found there.
//
// Exactly one hit is expected; this is only checked through SEQAN_ASSERT_EQ.
GenomicInterval PositionMap::getGenomicInterval(int svPos) const
{
    // Collect the intervals that the tree reports for the query position.
    seqan::String<GenomicInterval> hits;
    findIntervals(hits, svIntervalTree, svPos);

    SEQAN_ASSERT_EQ(length(hits), 1u);

    return hits[0];
}
void VcfMaterializer::_appendToVariants(Variants & variants, seqan::VcfRecord const & vcfRecord) { // Compute maximal length of alternative. unsigned altLength = 0; seqan::StringSet<seqan::CharString> alts; strSplit(alts, vcfRecord.alt, seqan::EqualsChar<','>()); for (unsigned i = 0; i < length(alts); ++i) altLength = std::max(altLength, (unsigned)length(alts[i])); if (contains(vcfRecord.info, "SVTYPE")) // Structural Variant { StructuralVariantRecord svRecord; svRecord.rId = vcfRecord.rID; svRecord.pos = vcfRecord.beginPos + 1; // given with shift of -1 svRecord.haplotype = 0; SEQAN_ASSERT_EQ(length(alts), 1u); if (contains(vcfRecord.info, "SVTYPE=INS")) // Insertion { svRecord.kind = StructuralVariantRecord::INDEL; svRecord.size = getSVLen(vcfRecord.info); svRecord.seq = suffix(vcfRecord.alt, 1); } else if (contains(vcfRecord.info, "SVTYPE=DEL")) // Deletion { svRecord.kind = StructuralVariantRecord::INDEL; svRecord.size = getSVLen(vcfRecord.info); } else if (contains(vcfRecord.info, "SVTYPE=INV")) // Inversion { svRecord.kind = StructuralVariantRecord::INVERSION; svRecord.size = getSVLen(vcfRecord.info); } else if (contains(vcfRecord.info, "SVTYPE=DUP")) // Duplication { svRecord.kind = StructuralVariantRecord::DUPLICATION; svRecord.size = getSVLen(vcfRecord.info); std::pair<seqan::CharString, int> pos = getTargetPos(vcfRecord.info); unsigned idx = 0; if (!getIdByName(idx, contigNamesCache(context(vcfFileIn)), pos.first)) SEQAN_FAIL("Unknown sequence %s", toCString(pos.first)); svRecord.targetRId = idx; svRecord.targetPos = pos.second - 1; } else if (contains(vcfRecord.info, "SVTYPE=BND")) // Breakend (Must be Translocation) { SEQAN_FAIL("Unexpected 'SVTYPE=BND' at this place!"); } else { SEQAN_FAIL("ERROR: Unknown SVTYPE!\n"); } // Split the target variants. 
SEQAN_ASSERT_NOT(empty(vcfRecord.genotypeInfos)); seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter = directionIterator(vcfRecord.genotypeInfos[0], seqan::Input()); seqan::CharString buffer; svRecord.haplotype = 0; for (; !atEnd(inputIter); ++inputIter) if ((*inputIter == '|' || *inputIter == '/')) { if (!empty(buffer)) { unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u); if (idx != 0u) // if not == ref appendValue(variants.svRecords, svRecord); } svRecord.haplotype++; clear(buffer); } else { appendValue(buffer, *inputIter); } if (!empty(buffer)) { unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u); if (idx != 0u) // if not == ref appendValue(variants.svRecords, svRecord); } } else if (length(vcfRecord.ref) == 1u && altLength == 1u) // SNP { SnpRecord snpRecord; snpRecord.rId = vcfRecord.rID; snpRecord.pos = vcfRecord.beginPos; // Split the alternatives. seqan::StringSet<seqan::CharString> alternatives; strSplit(alternatives, vcfRecord.alt, seqan::EqualsChar<','>()); // Split the target variants. 
SEQAN_ASSERT_NOT(empty(vcfRecord.genotypeInfos)); seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter = directionIterator(vcfRecord.genotypeInfos[0], seqan::Input()); seqan::CharString buffer; snpRecord.haplotype = 0; for (; !atEnd(inputIter); ++inputIter) if ((*inputIter == '|' || *inputIter == '/')) { if (!empty(buffer)) { unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), (unsigned)length(alternatives)); if (idx != 0u) // if not == ref { SEQAN_ASSERT_NOT(empty(alternatives[idx - 1])); snpRecord.to = alternatives[idx - 1][0]; appendValue(variants.snps, snpRecord); } } snpRecord.haplotype++; clear(buffer); } else { appendValue(buffer, *inputIter); } if (!empty(buffer)) { unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), (unsigned)length(alternatives)); if (idx != 0u) // if not == ref { SEQAN_ASSERT_NOT(empty(alternatives[idx - 1])); snpRecord.to = alternatives[idx - 1][0]; appendValue(variants.snps, snpRecord); } } } else // Small Indel { SmallIndelRecord smallIndel; smallIndel.rId = vcfRecord.rID; smallIndel.pos = vcfRecord.beginPos + 1; SEQAN_ASSERT_NOT(contains(vcfRecord.alt, ",")); // only one alternative SEQAN_ASSERT((length(vcfRecord.alt) == 1u) != (length(vcfRecord.ref) == 1u)); // XOR smallIndel.haplotype = 0; if (length(vcfRecord.ref) == 1u) // insertion { smallIndel.seq = suffix(vcfRecord.alt, 1); smallIndel.size = length(smallIndel.seq); } else // deletion { smallIndel.size = -(int)(length(vcfRecord.ref) - 1); } // Split the target variants. 
SEQAN_ASSERT_NOT(empty(vcfRecord.genotypeInfos)); seqan::DirectionIterator<seqan::CharString const, seqan::Input>::Type inputIter = directionIterator(vcfRecord.genotypeInfos[0], seqan::Input()); seqan::CharString buffer; smallIndel.haplotype = 0; for (; !atEnd(inputIter); ++inputIter) if ((*inputIter == '|' || *inputIter == '/')) { if (!empty(buffer)) { unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u); if (idx != 0u) // if not == ref appendValue(variants.smallIndels, smallIndel); } smallIndel.haplotype++; clear(buffer); } else { appendValue(buffer, *inputIter); } if (!empty(buffer)) { unsigned idx = std::min(seqan::lexicalCast<unsigned>(buffer), 1u); if (idx != 0u) // if not == ref appendValue(variants.smallIndels, smallIndel); } } }
// Reset this position map and rebuild the two gap anchor strings
// (refGapAnchors, smallVarGapAnchors) from the entries of journal.
//
// The journal is walked from its first entry.  Depending on the source of
// an entry and on how its physical position relates to the end of the
// previously seen original segment, gaps are inserted into one of the two
// gaps objects; both gap iterators are then advanced in lockstep.
void PositionMap::reinit(TJournalEntries const & journal)
{
    // Reset the interval trees, the breakpoints and the gap anchors.
    // TODO(holtgrew): Better API support for IntervalTree?
    svIntervalTree = TIntervalTree();
    svIntervalTreeSTL = TIntervalTree();
    svBreakpoints.clear();
    clear(refGapAnchors);
    clear(smallVarGapAnchors);

    typedef seqan::Iterator<TGaps, seqan::Standard>::Type TGapsIter;
    typedef seqan::Iterator<TJournalEntries const, seqan::Standard>::Type TJournalEntriesIt;

    // Gaps objects that write into the anchor strings, each with an iterator at its begin.
    TGaps refGaps(seqan::Nothing(), refGapAnchors);
    TGapsIter refIt = begin(refGaps, seqan::Standard());
    TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
    TGapsIter varIt = begin(smallVarGaps, seqan::Standard());

    TJournalEntriesIt entry = begin(journal, seqan::Standard());
    TJournalEntriesIt const entriesEnd = end(journal, seqan::Standard());
    SEQAN_ASSERT_NEQ(entry->segmentSource, seqan::SOURCE_NULL);
    SEQAN_ASSERT_EQ(entry->virtualPosition, 0u);

    // End position of the previous original segment; the maximal unsigned
    // value marks "no original segment seen yet".
    unsigned const NO_POSITION = seqan::maxValue<unsigned>();
    unsigned prevRefEnd = NO_POSITION;

    for (; entry != entriesEnd; ++entry)
    {
        SEQAN_ASSERT_NEQ(entry->segmentSource, seqan::SOURCE_NULL);

        if (entry->segmentSource != seqan::SOURCE_ORIGINAL)
        {
            // Segment does not come from the original: gaps go into the reference row.
            insertGaps(refIt, entry->length);
        }
        else
        {
            if (prevRefEnd == NO_POSITION)
            {
                // First original segment: a non-zero start position yields
                // leading gaps in the reference row.
                if (entry->physicalPosition != 0)
                {
                    insertGaps(refIt, entry->physicalPosition);
                    refIt += entry->physicalPosition;
                    varIt += entry->physicalPosition;
                }
            }
            else if (entry->physicalPosition != prevRefEnd)
            {
                // The segment does not start where the previous original one
                // ended: the difference becomes gaps in the small variant row.
                int len = entry->physicalPosition - prevRefEnd;
                insertGaps(varIt, len);
                refIt += len;
                varIt += len;
            }
            prevRefEnd = entry->physicalPosition + entry->length;
        }

        // In every case, both iterators move past the segment itself.
        refIt += entry->length;
        varIt += entry->length;
    }
}
// Apply the structural variants of one haplotype to a contig.
//
// Walks over variants.svRecords, skips records whose haplotype differs from
// hId, and appends to seq alternately (a) the untouched stretch of contig
// in front of the next SV and (b) the sequence produced by the SV itself
// (insertion, deletion, inversion, translocation, duplication).
//
// Besides seq, the function
//   - appends one GenomicInterval per non-empty appended stretch and finally
//     builds positionMap.svIntervalTree and positionMap.svIntervalTreeSTL,
//   - appends the positions in seq where SV segments begin/end to
//     breakpoints, paired with variants.posToIdx(Variants::SV, i),
//   - copies entries of smallVarInfos into varInfos with adjusted positions,
//   - if methylation level simulation is enabled, fills *levelsLargeVariants
//     from *levels in the same way as seq is filled from contig and calls
//     fixVariationLevels() at the end.
//
// SV positions are passed through hostToVirtualPosition(journal, ...) before
// they are used as positions in contig.
// NOTE(review): this suggests that contig already contains the small
// variants recorded in journal -- confirm against the caller.
//
// Returns 0 on success and 1 if a record has a kind that the switch below
// does not handle.
int VariantMaterializer::_materializeLargeVariants(
        seqan::Dna5String & seq,
        MethylationLevels * levelsLargeVariants,
        std::vector<SmallVarInfo> & varInfos,
        std::vector<std::pair<int, int> > & breakpoints,
        PositionMap & positionMap,
        TJournalEntries const & journal,
        seqan::Dna5String const & contig,
        std::vector<SmallVarInfo> const & smallVarInfos,
        Variants const & variants,
        MethylationLevels const * levels,
        int hId)
{
    // Both level pointers must be non-null exactly if level simulation is enabled.
    if (methSimOptions)
    {
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsLargeVariants != 0));
        SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
    }

    // We will record all intervals for the positionMap.svIntervalTree in this String.
    seqan::String<GenomicInterval> intervals;

    // Clear output methylation levels.
    if (levelsLargeVariants)
        levelsLargeVariants->clear();

    // Store variation points.  We reuse the fixVariationLevels() function from small indel/snp simulation and thus
    // have to store a bool that is always set to false.
    seqan::String<std::pair<int, bool> > varPoints;

    // Track last position from contig appended to seq so far.
    int lastPos = 0;
    if (verbosity >= 3)
        std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";

    // Iterator to the next small variant info that has not been copied to varInfos yet.
    std::vector<SmallVarInfo>::const_iterator itSmallVar = smallVarInfos.begin();

    // Value of length(seq) at the point up to which intervals have been recorded.
    unsigned currentPos = 0;

    for (unsigned i = 0; i < length(variants.svRecords); ++i)
    {
        if (variants.svRecords[i].haplotype != hId)  // Ignore all but the current haplotype.
            continue;
        // We obtain a copy of the current SV record since we translate its positions below.
        StructuralVariantRecord svRecord = variants.svRecords[i];

        // Translate positions and lengths of SV record.
        if (verbosity >= 2)
            std::cerr << " Translating SvRecord\n " << svRecord << '\n';
        svRecord.pos = hostToVirtualPosition(journal, svRecord.pos);
        SEQAN_ASSERT_LT(svRecord.pos, (int)length(contig));
        // We do not need to adjust the sizes for insertions.
        // NOTE(review): svRecord.pos has already been translated by the statement above and is passed to
        // hostToVirtualPosition() again here -- confirm that this is intended.
        if (svRecord.kind != StructuralVariantRecord::INDEL || svRecord.size < 0)
            svRecord.size = hostToVirtualPosition(journal, svRecord.pos + svRecord.size) -
                    hostToVirtualPosition(journal, svRecord.pos);
        // A targetPos of -1 is left untouched.
        if (svRecord.targetPos != -1)
            svRecord.targetPos = hostToVirtualPosition(journal, svRecord.targetPos);
        if (verbosity >= 2)
            std::cerr << " => " << svRecord << '\n';

        // Copy out small variant infos for interim chars, shifted from contig to seq coordinates.
        for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
        {
            int offset = (int)currentPos - lastPos;
            varInfos.push_back(*itSmallVar);
            varInfos.back().pos += offset;
        }

        // Copy from contig to seq with SVs.
        if (verbosity >= 3)
            std::cerr << "lastPos == " << lastPos << "\n";
        append(seq, infix(contig, lastPos, svRecord.pos));  // interim chars
        if (methSimOptions && methSimOptions->simulateMethylationLevels)
        {
            append(levelsLargeVariants->forward, infix(levels->forward, lastPos, svRecord.pos));
            append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, svRecord.pos));
            appendValue(varPoints, std::make_pair((int)length(seq), false));
        }
        // Record the interim stretch as interval unless it is empty.
        if (currentPos != length(seq))
            appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, svRecord.pos,
                                                   '+', GenomicInterval::NORMAL));
        currentPos = length(seq);
        if (verbosity >= 3)
            std::cerr << "append(seq, infix(contig, " << lastPos << ", " << svRecord.pos << ") "
                      << __LINE__ << " (interim)\n";

        switch (svRecord.kind)
        {
            case StructuralVariantRecord::INDEL:
            {
                if (svRecord.size > 0)  // insertion
                {
                    SEQAN_ASSERT_EQ((int)length(svRecord.seq), svRecord.size);

                    // Simulate methylation levels for insertion.
                    MethylationLevels lvls;
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        MethylationLevelSimulator methSim(*rng, *methSimOptions);
                        methSim.run(lvls, svRecord.seq);
                    }

                    // Append novel sequence and methylation levels.
                    append(seq, svRecord.seq);
                    if (methSimOptions && methSimOptions->simulateMethylationLevels)
                    {
                        append(levelsLargeVariants->forward, lvls.forward);
                        append(levelsLargeVariants->reverse, lvls.reverse);
                        appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
                    }
                    // The inserted stretch is recorded with -1 for both contig-side positions.
                    if (currentPos != length(seq))
                        appendValue(intervals, GenomicInterval(currentPos, length(seq), -1, -1,
                                                               '+', GenomicInterval::INSERTED));
                    if (verbosity >= 3)
                        std::cerr << "append(seq, svRecord.seq (length == " << length(svRecord.seq) << ") "
                                  << __LINE__ << " (insertion)\n";
                    // An insertion does not consume any characters of contig.
                    lastPos = svRecord.pos;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out breakpoints (begin and end of the inserted stretch in seq).
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                    breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                    currentPos = length(seq);
                }
                else  // deletion
                {
                    // svRecord.size is not positive here, so this moves lastPos forward by -svRecord.size.
                    lastPos = svRecord.pos - svRecord.size;
                    SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                    // Copy out breakpoint.
                    breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                }
            }
            break;

            case StructuralVariantRecord::INVERSION:
            {
                // Append the segment as is; it is reverse-complemented in place further below.
                unsigned oldLen = length(seq);
                append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                if (methSimOptions && methSimOptions->simulateMethylationLevels)
                {
                    appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after the inverted segment
                    // The output forward levels are the reversed input reverse levels and vice versa.
                    append(levelsLargeVariants->forward, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                    reverse(infix(levelsLargeVariants->forward, oldLen, length(levelsLargeVariants->forward)));
                    append(levelsLargeVariants->reverse, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                    reverse(infix(levelsLargeVariants->reverse, oldLen, length(levelsLargeVariants->reverse)));
                }
                if (currentPos != length(seq))
                    appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos,
                                                           svRecord.pos + svRecord.size, '-',
                                                           GenomicInterval::INVERTED));

                // Copy out small variant infos for inversion; their positions are mirrored within the segment.
                for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                {
                    varInfos.push_back(*itSmallVar);
                    varInfos.back().pos = currentPos + svRecord.size - (varInfos.back().pos - lastPos);
                }

                if (verbosity >= 3)
                    std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", "
                              << svRecord.pos + svRecord.size << ") " << __LINE__ << " (inversion)\n";
                reverseComplement(infix(seq, oldLen, length(seq)));
                lastPos = svRecord.pos + svRecord.size;
                SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                // Copy out breakpoints.
                breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                currentPos = length(seq);
            }
            break;

            case StructuralVariantRecord::TRANSLOCATION:
            {
                SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                // First append the stretch between the end of the segment and targetPos ...
                append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                if (methSimOptions && methSimOptions->simulateMethylationLevels)
                {
                    appendValue(varPoints, std::make_pair((int)length(seq), false));
                    append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                    append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                }
                if (currentPos != length(seq))
                    appendValue(intervals, GenomicInterval(currentPos, length(seq),
                                                           svRecord.pos + svRecord.size,
                                                           svRecord.targetPos, '+',
                                                           GenomicInterval::NORMAL));
                // ... then the segment itself, which thereby ends up behind that stretch.
                unsigned tmpCurrentPos = length(seq);
                append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                if (methSimOptions && methSimOptions->simulateMethylationLevels)
                {
                    appendValue(varPoints, std::make_pair((int)length(seq), false));
                    append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                    append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                }
                if (tmpCurrentPos != length(seq))
                    appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq),
                                                           svRecord.pos,
                                                           svRecord.pos + svRecord.size, '+',
                                                           GenomicInterval::NORMAL));
                if (verbosity >= 3)
                    std::cerr << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", "
                              << svRecord.targetPos << ") " << __LINE__ << " (translocation)\n"
                              << "append(seq, infix(contig, " << svRecord.pos << ", "
                              << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                lastPos = svRecord.targetPos;
                SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                // Copy out small variant infos for translocation, shift left to right and right to left but keep
                // center intact.
                // NOTE(review): the loop bound (pos < svRecord.pos) is the same as in the interim loop above,
                // which has already consumed all such entries, so this loop appears to copy nothing.  Also,
                // offset is computed after lastPos was set to targetPos.  Confirm the intended bound/offset.
                for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
                {
                    int offset = (int)currentPos - lastPos;
                    varInfos.push_back(*itSmallVar);
                    varInfos.back().pos += offset;

                    int bpLeft = svRecord.pos + svRecord.size;
                    int bpRight = svRecord.targetPos;
                    if (itSmallVar->pos < bpLeft)
                        varInfos.back().pos -= (svRecord.targetPos - svRecord.pos);
                    else if (itSmallVar->pos >= bpRight)
                        varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                }

                // Copy out breakpoints (begin, boundary between the two appended stretches, end).
                breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                breakpoints.push_back(std::make_pair(currentPos + svRecord.targetPos - svRecord.pos - svRecord.size, variants.posToIdx(Variants::SV, i)));
                breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                currentPos = length(seq);
            }
            break;

            case StructuralVariantRecord::DUPLICATION:
            {
                // Three stretches are appended: the segment, the stretch up to targetPos, the segment again.
                append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
                if (methSimOptions && methSimOptions->simulateMethylationLevels)  // first copy
                {
                    appendValue(varPoints, std::make_pair((int)length(seq), false));
                    append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                    append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                }
                if (currentPos != length(seq))
                    appendValue(intervals, GenomicInterval(currentPos, length(seq),
                                                           svRecord.pos,
                                                           svRecord.pos + svRecord.size, '+',
                                                           GenomicInterval::DUPLICATED));
                unsigned tmpCurrentPos = length(seq);
                append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
                if (methSimOptions && methSimOptions->simulateMethylationLevels)
                {
                    appendValue(varPoints, std::make_pair((int)length(seq), false));
                    append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
                    append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
                }
                if (tmpCurrentPos != length(seq))
                    appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq),
                                                           svRecord.pos + svRecord.size,
                                                           svRecord.targetPos, '+',
                                                           GenomicInterval::NORMAL));
                tmpCurrentPos = length(seq);
                append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
                if (methSimOptions && methSimOptions->simulateMethylationLevels)  // second copy
                {
                    appendValue(varPoints, std::make_pair((int)length(seq), false));
                    append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
                    append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
                }
                if (tmpCurrentPos != length(seq))
                    appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq),
                                                           svRecord.pos,
                                                           svRecord.pos + svRecord.size, '+',
                                                           GenomicInterval::NORMAL));
                if (verbosity >= 3)
                    std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", "
                              << svRecord.pos + svRecord.size << ") " << __LINE__ << " (duplication)\n"
                              << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", "
                              << svRecord.targetPos << ") " << __LINE__ << "\n"
                              << "append(seq, infix(contig, " << svRecord.pos << ", "
                              << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
                lastPos = svRecord.targetPos;
                SEQAN_ASSERT_LT(lastPos, (int)length(contig));

                // Write out small variant infos for duplication.
                // NOTE(review): the inner condition repeats the loop condition and is therefore always true,
                // and offset is computed after lastPos was set to targetPos -- confirm the intended logic.
                for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
                {
                    int offset = (int)currentPos - lastPos;
                    varInfos.push_back(*itSmallVar);
                    varInfos.back().pos += offset;

                    if (itSmallVar->pos < svRecord.pos + svRecord.size)
                    {
                        varInfos.push_back(*itSmallVar);
                        varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
                    }
                }

                // Copy out breakpoints (begin, end of the first copy, begin of the second copy, end).
                breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
                breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos, variants.posToIdx(Variants::SV, i)));
                breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos + svRecord.targetPos - (svRecord.pos + svRecord.size), variants.posToIdx(Variants::SV, i)));
                breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));

                currentPos = length(seq);
            }
            break;

            default:
                return 1;  // SV kind that is not handled above
        }
    }

    // Append the rest of the contig behind the last SV.
    if (verbosity >= 3)
        std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ") "
                  << __LINE__ << " (last interim)\n";
    append(seq, infix(contig, lastPos, length(contig)));
    if (methSimOptions && methSimOptions->simulateMethylationLevels)
    {
        append(levelsLargeVariants->forward, infix(levels->forward, lastPos, length(contig)));
        append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, length(contig)));

        // Sequence and both level strings must have the same length now.
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->forward));
        SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->reverse));

        fixVariationLevels(*levelsLargeVariants, *rng, seq, varPoints, *methSimOptions);
    }
    if (currentPos != length(seq))
        appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, length(contig),
                                               '+', GenomicInterval::NORMAL));

    // Copy out small variant infos for trailing characters.
    for (; itSmallVar != smallVarInfos.end(); ++itSmallVar)
    {
        int offset = (int)currentPos - lastPos;
        varInfos.push_back(*itSmallVar);
        varInfos.back().pos += offset;
    }

    // Build the interval trees of the positionMap: one keyed by the sv positions of the intervals and one
    // keyed by their small variant positions.
    seqan::String<PositionMap::TInterval> svIntervals, svIntervalsSTL;
    for (unsigned i = 0; i < length(intervals); ++i)
        appendValue(svIntervals, PositionMap::TInterval(
                intervals[i].svBeginPos, intervals[i].svEndPos, intervals[i]));
    for (unsigned i = 0; i < length(intervals); ++i)
        if (intervals[i].smallVarBeginPos != -1)  // ignore insertions
            appendValue(svIntervalsSTL, PositionMap::TInterval(
                    intervals[i].smallVarBeginPos, intervals[i].smallVarEndPos, intervals[i]));
    createIntervalTree(positionMap.svIntervalTree, svIntervals);
    createIntervalTree(positionMap.svIntervalTreeSTL, svIntervalsSTL);

    return 0;
}
int VariantMaterializer::_materializeSmallVariants( seqan::Dna5String & seq, TJournalEntries & journal, MethylationLevels * levelsSmallVariants, std::vector<SmallVarInfo> & smallVarInfos, seqan::Dna5String const & contig, Variants const & variants, MethylationLevels const * levels, int hId) { if (methSimOptions) { SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsSmallVariants != 0)); SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0)); } // Clear journal and output methylation levels. reinit(journal, length(contig)); if (levelsSmallVariants) levelsSmallVariants->clear(); // Store variation points with a flag whether it is a SNP (true) or a breakpoint (false). seqan::String<std::pair<int, bool> > varPoints; // Fors this, we have to iterate in parallel over SNP and small indel records. // // Current index in snp/small indel array. unsigned snpsIdx = 0; unsigned smallIndelIdx = 0; // Current SNP record, default to sentinel. SnpRecord snpRecord; snpRecord.rId = seqan::maxValue<int>(); if (snpsIdx < length(variants.snps)) snpRecord = variants.snps[snpsIdx++]; // Current small indel record, default to sentinel. SmallIndelRecord smallIndelRecord; smallIndelRecord.rId = seqan::maxValue<int>(); if (smallIndelIdx < length(variants.smallIndels)) smallIndelRecord = variants.smallIndels[smallIndelIdx++]; // Track last position from contig appended to seq so far. int lastPos = 0; if (verbosity >= 3) std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n"; // TODO(holtgrew): Extract contig building into their own functions. if (verbosity >= 2) std::cerr << "building output\n"; while (snpRecord.rId != seqan::maxValue<int>() || smallIndelRecord.rId != seqan::maxValue<int>()) { // TODO(holtgrew): Extract SNP and small indel handling into their own functions. if (snpRecord.getPos() < smallIndelRecord.getPos()) // process SNP records { if (snpRecord.haplotype == hId) // Ignore all but the current contig. 
{ if (verbosity >= 3) std::cerr << "append(seq, infix(contig, " << lastPos << ", " << snpRecord.pos << ") " << __LINE__ << "\n"; // Append interim sequence and methylation levels-> append(seq, infix(contig, lastPos, snpRecord.pos)); if (methSimOptions && methSimOptions->simulateMethylationLevels) { append(levelsSmallVariants->forward, infix(levels->forward, lastPos, snpRecord.pos + 1)); append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, snpRecord.pos + 1)); appendValue(varPoints, std::make_pair((int)length(seq), true)); // variation points before/after SNP appendValue(varPoints, std::make_pair((int)length(seq) + 1, true)); } SEQAN_ASSERT_GEQ(snpRecord.pos, lastPos); if (verbosity >= 3) std::cerr << "appendValue(seq, " << snpRecord.to << "')\n"; appendValue(seq, snpRecord.to); lastPos = snpRecord.pos + 1; if (verbosity >= 3) std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n"; // Register SNP as small variant info. smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::SNP, length(seq) - 1, 1)); } if (snpsIdx >= length(variants.snps)) snpRecord.rId = seqan::maxValue<int>(); else snpRecord = variants.snps[snpsIdx++]; } else { if (smallIndelRecord.haplotype == hId) // Ignore all but the current contig. { if (smallIndelRecord.size > 0) { if (verbosity >= 3) std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") " << __LINE__ << "\n"; // Simulate methylation levels for insertion. 
MethylationLevels lvls; if (methSimOptions && methSimOptions->simulateMethylationLevels) { MethylationLevelSimulator methSim(*rng, *methSimOptions); methSim.run(lvls, smallIndelRecord.seq); } // Append interim sequence and methylation levels-> append(seq, infix(contig, lastPos, smallIndelRecord.pos)); if (methSimOptions && methSimOptions->simulateMethylationLevels) { append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos)); append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos)); appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point before insertion } SEQAN_ASSERT_GEQ(smallIndelRecord.pos, lastPos); if (verbosity >= 3) std::cerr << "append(seq, \"" << smallIndelRecord.seq << "\") " << __LINE__ << "\n"; // Register insertion as small variant info. for (unsigned i = 0; i < length(smallIndelRecord.seq); ++i) smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::INS, length(seq) + i, 1)); // Append novel sequence and methylation levels-> append(seq, smallIndelRecord.seq); if (methSimOptions && methSimOptions->simulateMethylationLevels) { append(levelsSmallVariants->forward, lvls.forward); append(levelsSmallVariants->reverse, lvls.reverse); appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point after insertion } lastPos = smallIndelRecord.pos; recordInsertion(journal, hostToVirtualPosition(journal, smallIndelRecord.pos), 0, smallIndelRecord.size); if (verbosity >= 3) std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n"; } else // deletion { if (verbosity >= 3) std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") " << __LINE__ << "\n"; // Append interim sequence and methylation levels-> append(seq, infix(contig, lastPos, smallIndelRecord.pos)); // interim chars if (methSimOptions && methSimOptions->simulateMethylationLevels) { appendValue(varPoints, std::make_pair((int)length(seq), false)); // 
variation point at deletion append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos)); append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos)); } lastPos = smallIndelRecord.pos - smallIndelRecord.size; SEQAN_ASSERT_LT(lastPos, (int)length(contig)); recordErase(journal, hostToVirtualPosition(journal, smallIndelRecord.pos), hostToVirtualPosition(journal, smallIndelRecord.pos - smallIndelRecord.size)); if (verbosity >= 3) std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n"; // Register deletion as small variant info. smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::DEL, length(seq), -smallIndelRecord.size)); } } if (smallIndelIdx >= length(variants.smallIndels)) smallIndelRecord.rId = seqan::maxValue<int>(); else smallIndelRecord = variants.smallIndels[smallIndelIdx++]; } } // Insert remaining characters. if (verbosity >= 3) std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ")\n"; append(seq, infix(contig, lastPos, length(contig))); if (methSimOptions && methSimOptions->simulateMethylationLevels) { append(levelsSmallVariants->forward, infix(levels->forward, lastPos, length(contig))); append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, length(contig))); SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->forward)); SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->reverse)); fixVariationLevels(*levelsSmallVariants, *rng, seq, varPoints, *methSimOptions); } return 0; }