// Diagnostic visitor: for every scaffold edge leaving pVertex (in each edge
// direction), search the string graph for walks between the two endpoint
// vertices and print the walk whose end-to-start distance is closest to the
// distance estimate stored on the scaffold edge. Prints "N/A" when no walk
// is found.
//
// pVertex - the scaffold vertex whose edges are examined.
// Returns false unconditionally; this visitor only prints to stdout.
bool ScaffoldDistanceRefinementVisitor::visit(ScaffoldGraph* /*pGraph*/, ScaffoldVertex* pVertex)
{
    // Extra distance allowed beyond the estimate when searching
    const int SEARCH_SLACK = 1000;
    // Upper bound on the number of nodes explored by the search
    const size_t MAX_SEARCH_NODES = 100000;

    for(size_t dirIdx = 0; dirIdx < ED_COUNT; dirIdx++)
    {
        EdgeDir dir = EDGE_DIRECTIONS[dirIdx];
        ScaffoldEdgePtrVector edgeVec = pVertex->getEdges(dir);

        for(size_t i = 0; i < edgeVec.size(); ++i)
        {
            Vertex* pX = m_pStringGraph->getVertex(edgeVec[i]->getStartID());
            Vertex* pY = m_pStringGraph->getVertex(edgeVec[i]->getEndID());
            assert(pX != NULL && pY != NULL);

            const int est = edgeVec[i]->getDistance();

            // Argument order is (maxDistance, maxNodes, exhaustive, outWalks),
            // matching the other findWalks call sites. The previous code passed
            // "true, 100000", which implicitly converted to a node limit of 1.
            SGWalkVector walks;
            SGSearch::findWalks(pX, pY, edgeVec[i]->getDir(),
                                est + pY->getSeqLen() + SEARCH_SLACK,
                                MAX_SEARCH_NODES,
                                true,
                                walks);

            if(walks.size() > 0)
            {
                // Select the walk closest to the distance estimate. Seeding with
                // walk 0 guarantees a valid index; strict '<' keeps the first
                // walk among ties.
                size_t bestIdx = 0;
                int closest = abs(walks[0].getEndToStartDistance() - est);
                for(size_t j = 1; j < walks.size(); ++j)
                {
                    int diff = abs(walks[j].getEndToStartDistance() - est);
                    if(diff < closest)
                    {
                        closest = diff;
                        bestIdx = j;
                    }
                }

                printf("%s -> %s\t%d\t%d\t%s\n", pX->getID().c_str(),
                                                 pY->getID().c_str(),
                                                 est,
                                                 walks[bestIdx].getEndToStartDistance(),
                                                 walks[bestIdx].pathSignature().c_str());
            }
            else
            {
                printf("%s -> %s\t%d\tN/A\n", pX->getID().c_str(), pY->getID().c_str(), est);
            }
        }
    }
    return false;
}
// Attempt to resolve a scaffold link by finding a walk through the graph linking the two vertices bool ScaffoldRecord::graphResolve(const ResolveParams& params, const std::string& startID, const ScaffoldLink& link, std::string& outExtensionString) const { assert(params.pGraph != NULL); // Get the vertex to start the search from Vertex* pStartVertex = params.pGraph->getVertex(startID); Vertex* pEndVertex = params.pGraph->getVertex(link.endpointID); assert(pStartVertex != NULL && pEndVertex != NULL); int threshold = static_cast<int>(params.distanceFactor * link.stdDev); int maxDistance = link.distance + threshold; int maxExtensionDistance = maxDistance + pEndVertex->getSeqLen(); SGWalkVector walks; SGSearch::findWalks(pStartVertex, pEndVertex, link.getDir(), maxExtensionDistance, 10000, true, walks); int numWalksValid = 0; int numWalksClosest = 0; int selectedIdx = -1; int closestDist = std::numeric_limits<int>::max(); #ifdef DEBUGRESOLVE std::cout << "Attempting graph resolve of link " << startID << " -- " << link.endpointID << " expected distance: " << link.distance << " orientation: " << link.edgeData.getComp() << "\n"; #endif // Select the closest walk to the distance estimate for(size_t i = 0; i < walks.size(); ++i) { // Check that the orientation of the walk is the same as the expected link std::vector<EdgeComp> vertexOrientations = walks[i].getOrientationsToStart(); assert(walks[i].getLastEdge()->getEndID() == link.endpointID); if(vertexOrientations.back() != link.edgeData.getComp()) { #ifdef DEBUGRESOLVE std::cout << "SKIPPING WALK OF THE WRONG ORIENTATION\n"; #endif continue; } int walkDistance = walks[i].getEndToStartDistance(); int diff = abs(abs(link.distance - walkDistance)); if(diff <= threshold) { #ifdef DEBUGRESOLVE std::cout << " Walk distance: " << walkDistance << " diff: " << diff << " threshold: " << threshold << " close: " << closestDist << "\n"; #endif ++numWalksValid; if(diff < closestDist) { selectedIdx = i; closestDist = diff; 
numWalksClosest = 1; } else if(diff == closestDist) { numWalksClosest += 1; } } } // Choose the best path, if any, depending on the algorithm to use bool useWalk = false; if(numWalksValid > 0) { if(params.resolveMask & RESOLVE_GRAPH_BEST) { // If the unique flag is not set, or we only have 1 closest walk, select it if(!(params.resolveMask & RESOLVE_GRAPH_UNIQUE) || numWalksClosest == 1) useWalk = true; else if((params.resolveMask & RESOLVE_GRAPH_UNIQUE) && numWalksClosest > 1) params.pStats->graphWalkTooMany += 1; } else { if(numWalksValid == 1) useWalk = true; else if(numWalksValid > 1) params.pStats->graphWalkTooMany += 1; } } #ifdef DEBUGRESOLVE std::cout << " Num walks: " << walks.size() << " Num valid: " << numWalksValid << " Num closest: " << numWalksClosest << " using: " << useWalk << "\n"; #endif // Was an acceptable walk found? if(useWalk) { assert(selectedIdx != -1); outExtensionString = walks[selectedIdx].getString(SGWT_EXTENSION); params.pStats->graphWalkFound += 1; // Mark all vertices in the walk as visited VertexPtrVec vertexPtrVector = walks[selectedIdx].getVertices(); for(size_t i = 0; i < vertexPtrVector.size(); ++i) params.pSequenceCollection->setPlaced(vertexPtrVector[i]->getID()); return true; } else { if(numWalksValid == 0) params.pStats->graphWalkNoPath += 1; assert(outExtensionString.empty()); return false; } }
// Visit a vertex and look for variant walks leaving it in each edge direction
// that has more than one edge. When walks are found, the walk with the highest
// internal-vertex coverage is selected and every other walk is compared to it.
// If no walk is degenerate and every comparison passes the gap, total
// divergence and indel size limits (m_maxGapDivergence, m_maxTotalDivergence,
// m_maxIndelLength), all walks are written to m_outFile and the vertices that
// lie only on non-selected walks are colored GC_RED.
//
// pGraph  - unused.
// pVertex - the vertex to start the variant search from.
//
// Returns false immediately if pVertex, or the end vertex of any edge in a
// direction being examined, is already colored GC_RED. `found` is set as soon
// as any variant walk is found, even if the checks later reject the walks.
bool SGSmoothingVisitor::visit(StringGraph* pGraph, Vertex* pVertex)
{
    (void)pGraph;
    if(pVertex->getColor() == GC_RED)
        return false;

    bool found = false;
    for(size_t idx = 0; idx < ED_COUNT; idx++)
    {
        EdgeDir dir = EDGE_DIRECTIONS[idx];
        EdgePtrVec edges = pVertex->getEdges(dir);

        // Only directions with more than one edge are examined
        if(edges.size() <= 1)
            continue;

        // Do not touch this vertex if any neighbor has already been colored
        for(size_t i = 0; i < edges.size(); ++i)
        {
            if(edges[i]->getEnd()->getColor() == GC_RED)
                return false;
        }

        //std::cout << "Smoothing " << pVertex->getID() << "\n";
        // Limits passed to the variant walk search
        const int MAX_WALKS = 10;
        const int MAX_DISTANCE = 5000;

        // Any of these flags being set causes this direction to be skipped
        bool bIsDegenerate = false;
        bool bFailGapCheck = false;
        bool bFailDivergenceCheck = false;
        bool bFailIndelSizeCheck = false;

        SGWalkVector variantWalks;
        SGSearch::findVariantWalks(pVertex, dir, MAX_DISTANCE, MAX_WALKS, variantWalks);

        if(variantWalks.size() > 0)
        {
            found = true;
            size_t selectedIdx = -1;
            size_t selectedCoverage = 0;

            // Calculate the minimum amount overlapped on the start/end vertex.
            // This is used to properly extract the sequences from walks that represent the variation.
            int minOverlapX = std::numeric_limits<int>::max();
            int minOverlapY = std::numeric_limits<int>::max();
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                // A walk with at most one edge marks the whole set as degenerate
                if(variantWalks[i].getNumEdges() <= 1)
                    bIsDegenerate = true;

                // Calculate the walk coverage using the internal vertices of the walk.
                // The walk with the highest coverage will be retained
                size_t walkCoverage = 0;
                for(size_t j = 1; j < variantWalks[i].getNumVertices() - 1; ++j)
                    walkCoverage += variantWalks[i].getVertex(j)->getCoverage();

                // The "selectedCoverage == 0" clause ensures a walk is selected
                // even when every walk has zero internal coverage
                if(walkCoverage > selectedCoverage || selectedCoverage == 0)
                {
                    selectedIdx = i;
                    selectedCoverage = walkCoverage;
                }

                Edge* pFirstEdge = variantWalks[i].getFirstEdge();
                Edge* pLastEdge = variantWalks[i].getLastEdge();
                if((int)pFirstEdge->getMatchLength() < minOverlapX)
                    minOverlapX = pFirstEdge->getMatchLength();
                if((int)pLastEdge->getTwin()->getMatchLength() < minOverlapY)
                    minOverlapY = pLastEdge->getTwin()->getMatchLength();
            }

            // Calculate the strings for each walk that represent the region of variation
            StringVector walkStrings;
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                Vertex* pStartVertex = variantWalks[i].getStartVertex();
                Vertex* pLastVertex = variantWalks[i].getLastVertex();
                assert(pStartVertex != NULL && pLastVertex != NULL);

                std::string full = variantWalks[i].getString(SGWT_START_TO_END);
                int posStart = 0;
                int posEnd = 0;
                if(dir == ED_ANTISENSE)
                {
                    // pLast   -----------
                    // pStart          ------------
                    // full    --------------------
                    // out             ----
                    posStart = pLastVertex->getSeqLen() - minOverlapY;
                    posEnd = full.size() - (pStartVertex->getSeqLen() - minOverlapX);
                }
                else
                {
                    // pStart  --------------
                    // pLast             -----------
                    // full    ---------------------
                    // out               ----
                    posStart = pStartVertex->getSeqLen() - minOverlapX; // match start position
                    posEnd = full.size() - (pLastVertex->getSeqLen() - minOverlapY); // match end position
                }

                // An empty string is stored when the extracted region has no length
                std::string out;
                if(posEnd > posStart)
                    out = full.substr(posStart, posEnd - posStart);
                walkStrings.push_back(out);
            }

            assert(selectedIdx != (size_t)-1);
            SGWalk& selectedWalk = variantWalks[selectedIdx];
            assert(selectedWalk.isIndexed());

            // Check the divergence of the other walks to this walk
            // The vectors below are indexed by walk; the entry for the
            // selected walk is left at its default value.
            StringVector cigarStrings;
            std::vector<int> maxIndel;
            std::vector<double> gapPercent; // percentage of matching that is gaps
            std::vector<double> totalPercent; // percent of total alignment that is mismatch or gap
            cigarStrings.resize(variantWalks.size());
            gapPercent.resize(variantWalks.size());
            totalPercent.resize(variantWalks.size());
            maxIndel.resize(variantWalks.size());

            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                // We want to compute the total gap length, total mismatches and percent
                // divergence between the two paths.
                int matchLen = 0;
                int totalDiff = 0;
                int gapLength = 0;
                int maxGapLength = 0;

                // We have to handle the degenerate case where one internal string has zero length
                // this can happen when there is an isolated insertion/deletion and the walks are like:
                // x -> y -> z
                // x -> z
                if(walkStrings[selectedIdx].empty() || walkStrings[i].empty())
                {
                    // The whole length of the non-empty string counts as gap
                    matchLen = std::max(walkStrings[selectedIdx].size(), walkStrings[i].size());
                    totalDiff = matchLen;
                    gapLength = matchLen;
                }
                else
                {
                    AlnAln *aln_global;
                    aln_global = aln_stdaln(walkStrings[selectedIdx].c_str(), walkStrings[i].c_str(), &aln_param_blast, 1, 1);

                    // Calculate the alignment parameters
                    // outm is scanned to its terminator; a ' ' entry is counted as a difference
                    while(aln_global->outm[matchLen] != '\0')
                    {
                        if(aln_global->outm[matchLen] == ' ')
                            totalDiff += 1;
                        matchLen += 1;
                    }

                    // Build a CIGAR string; each cigar32 entry holds the operation
                    // in its low 4 bits and the length in the remaining bits.
                    // NOTE(review): "MID" only covers operation codes 0-2 - confirm
                    // aln_stdaln never emits other codes here.
                    std::stringstream cigarSS;
                    for (int j = 0; j != aln_global->n_cigar; ++j)
                    {
                        char cigarOp = "MID"[aln_global->cigar32[j]&0xf];
                        int cigarLen = aln_global->cigar32[j]>>4;
                        if(cigarOp == 'I' || cigarOp == 'D')
                        {
                            gapLength += cigarLen;
                            // NOTE(review): gapLength is cumulative, so maxGapLength ends up
                            // equal to the total gap length rather than the longest single
                            // indel (cigarLen) - confirm this is intended.
                            if(gapLength > maxGapLength)
                                maxGapLength = gapLength;
                        }
                        cigarSS << cigarLen;
                        cigarSS << cigarOp;
                    }
                    cigarStrings[i] = cigarSS.str();
                    aln_free_AlnAln(aln_global);
                }

                // NOTE(review): matchLen is 0 when both walk strings are empty, which
                // makes these divisions 0/0 - confirm that case cannot occur.
                double percentDiff = (double)totalDiff / matchLen;
                double percentGap = (double)gapLength / matchLen;
                if(percentDiff > m_maxTotalDivergence)
                    bFailDivergenceCheck = true;
                if(percentGap > m_maxGapDivergence)
                    bFailGapCheck = true;
                if(maxGapLength > m_maxIndelLength)
                    bFailIndelSizeCheck = true;
                gapPercent[i] = percentGap;
                totalPercent[i] = percentDiff;
                maxIndel[i] = maxGapLength;
            }

            // A single failing walk rejects the whole set for this direction
            if(bIsDegenerate || bFailGapCheck || bFailDivergenceCheck || bFailIndelSizeCheck)
                continue;

            // Write the selected path to the variants file as variant 0
            int variantIdx = 0;
            std::string selectedSequence = selectedWalk.getString(SGWT_START_TO_END);
            std::stringstream ss;
            ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
            writeFastaRecord(&m_outFile, ss.str(), selectedSequence);

            // The vertex set for each walk is not necessarily disjoint,
            // the selected walk may contain vertices that are part
            // of other paths. We handle this by coloring a vertex GC_RED
            // only when it is not also on the selected walk.
            for(size_t i = 0; i < variantWalks.size(); ++i)
            {
                if(i == selectedIdx)
                    continue;

                SGWalk& currWalk = variantWalks[i];
                // The end vertex of the last edge is excluded from coloring
                for(size_t j = 0; j < currWalk.getNumEdges() - 1; ++j)
                {
                    Edge* currEdge = currWalk.getEdge(j);
                    // If the vertex is also on the selected path, do not mark it
                    Vertex* currVertex = currEdge->getEnd();
                    if(!selectedWalk.containsVertex(currVertex->getID()))
                    {
                        currEdge->getEnd()->setColor(GC_RED);
                    }
                }

                // Write the variant to a file
                // The record name carries the gap divergence (IGD), total divergence
                // (ITD), max indel value (MID) and the CIGAR of the internal alignment
                std::string variantSequence = currWalk.getString(SGWT_START_TO_END);
                std::stringstream variantID;
                std::stringstream ss;
                ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++;
                ss << " IGD:" << (double)gapPercent[i] << " ITD:" << totalPercent[i] << " MID: " << maxIndel[i] << " InternalCigar:" << cigarStrings[i];
                writeFastaRecord(&m_outFile, ss.str(), variantSequence);
            }

            // Exactly two walks is counted as a simple bubble, anything else as complex
            if(variantWalks.size() == 2)
                m_simpleBubblesRemoved += 1;
            else
                m_complexBubblesRemoved += 1;
            ++m_numRemovedTotal;
        }
    }
// Returns true if the paired reads are a short-insert pair bool filterByGraph(StringGraph* pGraph, const BamTools::RefVector& referenceVector, BamTools::BamAlignment& record1, BamTools::BamAlignment& record2) { std::string vertexID1 = referenceVector[record1.RefID].RefName; std::string vertexID2 = referenceVector[record2.RefID].RefName; // Get the vertices for this pair using the mapped IDs Vertex* pX = pGraph->getVertex(vertexID1); Vertex* pY = pGraph->getVertex(vertexID2); // Ensure that the vertices are found assert(pX != NULL && pY != NULL); #ifdef DEBUG_CONNECT std::cout << "Finding path from " << vertexID1 << " to " << vertexID2 << "\n"; #endif EdgeDir walkDirectionXOut = ED_SENSE; EdgeDir walkDirectionYIn = ED_SENSE; // Flip walk directions if the alignment is to the reverse strand if(record1.IsReverseStrand()) walkDirectionXOut = !walkDirectionXOut; if(record2.IsReverseStrand()) walkDirectionYIn = !walkDirectionYIn; int fromX = walkDirectionXOut == ED_SENSE ? record1.Position : record1.GetEndPosition(); int toY = walkDirectionYIn == ED_SENSE ? record2.Position : record2.GetEndPosition(); // Calculate the amount of contig X that already covers the fragment // Using this number, we calculate how far we should search int coveredX = walkDirectionXOut == ED_SENSE ? 
pX->getSeqLen() - fromX : fromX; int maxWalkDistance = opt::maxDistance - coveredX; bool bShortInsertPair = false; if(pX == pY) { if(abs(record1.InsertSize) < opt::maxDistance) bShortInsertPair = true; } else { SGWalkVector walks; SGSearch::findWalks(pX, pY, walkDirectionXOut, maxWalkDistance, 10000, true, walks); if(!walks.empty()) { for(size_t i = 0; i < walks.size(); ++i) { std::string fragment = walks[i].getFragmentString(pX, pY, fromX, toY, walkDirectionXOut, walkDirectionYIn); if((int)fragment.size() < opt::maxDistance) { bShortInsertPair = true; //std::cout << "Found completing fragment (" << pX->getID() << " -> " << pY->getID() << ": " << fragment.size() << "\n"; break; } } } } return bShortInsertPair; }
// Run the bubble construction process HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplotypes) { PROFILE_FUNC("GraphCompare::buildVariantStringGraph") assert(!m_startingKmer.empty()); std::map<std::string, int> kmerCountMap; // We search until we find the first common vertex in each direction size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2; size_t MAX_ITERATIONS = 2000; size_t MAX_SIMULTANEOUS_BRANCHES = 40; size_t MAX_TOTAL_BRANCHES = 50; // Tracking stats size_t max_simul_branches_used = 0; size_t total_branches = 0; size_t iterations = 0; // Initialize the graph StringGraph* pGraph = new StringGraph; BuilderExtensionQueue queue; Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(m_startingKmer, m_startingKmer); pVertex->setColor(GC_BLACK); pGraph->addVertex(pVertex); // Add the vertex to the extension queue queue.push(BuilderExtensionNode(pVertex, ED_SENSE)); queue.push(BuilderExtensionNode(pVertex, ED_ANTISENSE)); std::vector<Vertex*> sense_join_vector; std::vector<Vertex*> antisense_join_vector; // Perform the extension. 
The while conditions are heuristics to avoid searching // the graph too much while(!queue.empty() && iterations++ < MAX_ITERATIONS && queue.size() < MAX_SIMULTANEOUS_BRANCHES && total_branches < MAX_TOTAL_BRANCHES) { if(queue.size() > max_simul_branches_used) max_simul_branches_used = queue.size(); BuilderExtensionNode curr = queue.front(); queue.pop(); // Calculate de Bruijn extensions for this node std::string vertStr = curr.pVertex->getSeq().toString(); AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction); std::string extensionsUsed; for(size_t i = 0; i < DNA_ALPHABET::size; ++i) { char b = DNA_ALPHABET::getBase(i); size_t count = extensionCounts.get(b); bool acceptExt = count >= m_parameters.minDBGCount; if(!acceptExt) continue; extensionsUsed.push_back(b); std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction); kmerCountMap[newStr] = count; // Create the new vertex and edge in the graph // Skip if the vertex already exists if(pGraph->getVertex(newStr) != NULL) continue; // Allocate the new vertex and add it to the graph Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr); pVertex->setColor(GC_BLACK); pGraph->addVertex(pVertex); // Add edges VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pVertex, curr.direction); // Check if this sequence is present in the FM-index of the target // If so, it is the join point of the de Bruijn graph and we extend no further. 
size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex); if(targetCount >= MIN_TARGET_COUNT) { if(curr.direction == ED_SENSE) sense_join_vector.push_back(pVertex); else antisense_join_vector.push_back(pVertex); } else { // Add the vertex to the extension queue queue.push(BuilderExtensionNode(pVertex, curr.direction)); } } // Update the total number of times we branches the search if(!extensionsUsed.empty()) total_branches += extensionsUsed.size() - 1; } // If the graph construction was successful, walk the graph // between the endpoints to make a string // Generate haplotypes between every pair of antisense/sense join vertices for(size_t i = 0; i < antisense_join_vector.size(); ++i) { for(size_t j = 0; j < sense_join_vector.size(); ++j) { SGWalkVector outWalks; SGSearch::findWalks(antisense_join_vector[i], sense_join_vector[j], ED_SENSE, 100000, // max distance to search 10000, // max nodes to search true, // exhaustive search outWalks); for(size_t k = 0; k < outWalks.size(); ++k) out_haplotypes.push_back(outWalks[k].getString(SGWT_START_TO_END)); } } delete pGraph; return HBRC_OK; }