void AssemblyGraph::determineGraphInfo() { m_shortestContig = std::numeric_limits<long long>::max(); m_longestContig = 0; int nodeCount = 0; long long totalLength = 0; std::vector<double> nodeReadDepths; QMapIterator<QString, DeBruijnNode*> i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); long long nodeLength = i.value()->getLength(); if (nodeLength < m_shortestContig) m_shortestContig = nodeLength; if (nodeLength > m_longestContig) m_longestContig = nodeLength; //Only add up the length for positive nodes if (i.value()->isPositiveNode()) { totalLength += nodeLength; ++nodeCount; } nodeReadDepths.push_back(i.value()->getReadDepth()); } //Count up the edges. Edges that are their own pairs will //not be counted, as these won't show up in single mode. int edgeCount = 0; for (size_t i = 0; i < m_deBruijnGraphEdges.size(); ++i) { DeBruijnEdge * edge = m_deBruijnGraphEdges[i]; if (edge != edge->getReverseComplement()) ++edgeCount; } edgeCount /= 2; m_nodeCount = nodeCount; m_edgeCount = edgeCount; m_totalLength = totalLength; m_meanReadDepth = getMeanDeBruijnGraphReadDepth(); std::sort(nodeReadDepths.begin(), nodeReadDepths.end()); double firstQuartileIndex = nodeReadDepths.size() / 4.0; double medianIndex = nodeReadDepths.size() / 2.0; double thirdQuartileIndex = nodeReadDepths.size() * 3.0 / 4.0; m_firstQuartileReadDepth = getValueUsingFractionalIndex(&nodeReadDepths, firstQuartileIndex); m_medianReadDepth = getValueUsingFractionalIndex(&nodeReadDepths, medianIndex); m_thirdQuartileReadDepth = getValueUsingFractionalIndex(&nodeReadDepths, thirdQuartileIndex); //Set the auto base pairs per segment int totalSegments = m_nodeCount * g_settings->meanSegmentsPerNode; g_settings->autoBasePairsPerSegment = m_totalLength / totalSegments; }
//Returns true if at least one of this node's edges has the given node as
//either its starting node or its ending node.  Returns false otherwise.
bool DeBruijnNode::isNodeConnected(DeBruijnNode * node) const
{
    std::vector<DeBruijnEdge *>::const_iterator it = m_edges.begin();
    while (it != m_edges.end())
    {
        DeBruijnEdge * candidate = *it;
        if (candidate->getStartingNode() == node)
            return true;
        if (candidate->getEndingNode() == node)
            return true;
        ++it;
    }
    return false;
}
//This function checks to see if the passed node leads away from //this node. If so, it returns the connecting edge. If not, //it returns a null pointer. DeBruijnEdge * DeBruijnNode::doesNodeLeadAway(DeBruijnNode * node) const { for (size_t i = 0; i < m_edges.size(); ++i) { DeBruijnEdge * edge = m_edges[i]; if (edge->getStartingNode() == this && edge->getEndingNode() == node) return edge; } return 0; }
std::vector<DeBruijnEdge *> DeBruijnNode::getLeavingEdges() const { std::vector<DeBruijnEdge *> returnVector; for (size_t i = 0; i < m_edges.size(); ++i) { DeBruijnEdge * edge = m_edges[i]; if (this == edge->getStartingNode()) returnVector.push_back(edge); } return returnVector; }
//If the node has an edge which leads to itself (creating a loop), this function //will return it. Otherwise, it returns 0. DeBruijnEdge * DeBruijnNode::getSelfLoopingEdge() const { for (size_t i = 0; i < m_edges.size(); ++i) { DeBruijnEdge * edge = m_edges[i]; if (edge->getStartingNode() == this && edge->getEndingNode() == this) return edge; } return 0; }
//This function checks whether this node has any path leading outward that //unambiguously leads to the given node. //It checks a number of steps as set by the contiguitySearchSteps setting. //If includeReverseComplement is true, then this function returns true if //all paths lead either to the node or its reverse complement node. bool DeBruijnNode::doesPathLeadOnlyToNode(DeBruijnNode * node, bool includeReverseComplement) { for (size_t i = 0; i < m_edges.size(); ++i) { DeBruijnEdge * edge = m_edges[i]; bool outgoingEdge = (this == edge->getStartingNode()); std::vector<DeBruijnNode *> pathSoFar; pathSoFar.push_back(this); if (edge->leadsOnlyToNode(outgoingEdge, g_settings->contiguitySearchSteps, node, pathSoFar, includeReverseComplement)) return true; } return false; }
//This function makes a double edge: in one direction for the given nodes //and the opposite direction for their reverse complements. It adds the //new edges to the vector here and to the nodes themselves. void AssemblyGraph::createDeBruijnEdge(QString node1Name, QString node2Name, int overlap) { QString node1Opposite = getOppositeNodeName(node1Name); QString node2Opposite = getOppositeNodeName(node2Name); //Quit if any of the nodes don't exist. if (!m_deBruijnGraphNodes.contains(node1Name) || !m_deBruijnGraphNodes.contains(node2Name) || !m_deBruijnGraphNodes.contains(node1Opposite) || !m_deBruijnGraphNodes.contains(node2Opposite)) return; DeBruijnNode * node1 = m_deBruijnGraphNodes[node1Name]; DeBruijnNode * node2 = m_deBruijnGraphNodes[node2Name]; DeBruijnNode * negNode1 = m_deBruijnGraphNodes[node1Opposite]; DeBruijnNode * negNode2 = m_deBruijnGraphNodes[node2Opposite]; //Quit if the edge already exists const std::vector<DeBruijnEdge *> * edges = node1->getEdgesPointer(); for (size_t i = 0; i < edges->size(); ++i) { if ((*edges)[i]->getStartingNode() == node1 && (*edges)[i]->getEndingNode() == node2) return; } //Usually, an edge has a different pair, but it is possible //for an edge to be its own pair. bool isOwnPair = (node1 == negNode2 && node2 == negNode1); DeBruijnEdge * forwardEdge = new DeBruijnEdge(node1, node2); DeBruijnEdge * backwardEdge; if (isOwnPair) backwardEdge = forwardEdge; else backwardEdge = new DeBruijnEdge(negNode2, negNode1); forwardEdge->setReverseComplement(backwardEdge); backwardEdge->setReverseComplement(forwardEdge); forwardEdge->setOverlap(overlap); backwardEdge->setOverlap(overlap); m_deBruijnGraphEdges.push_back(forwardEdge); if (!isOwnPair) m_deBruijnGraphEdges.push_back(backwardEdge); node1->addEdge(forwardEdge); node2->addEdge(forwardEdge); negNode1->addEdge(backwardEdge); negNode2->addEdge(backwardEdge); }
//This function determines the contiguity of nodes relative to this one. //It has two steps: // -First, for each edge leaving this node, all paths outward are found. // Any nodes in any path are MAYBE_CONTIGUOUS, and nodes in all of the // paths are CONTIGUOUS. // -Second, it is necessary to check in the opposite direction - for each // of the MAYBE_CONTIGUOUS nodes, do they have a path that unambiguously // leads to this node? If so, then they are CONTIGUOUS. void DeBruijnNode::determineContiguity() { upgradeContiguityStatus(STARTING); //A set is used to store all nodes found in the paths, as the nodes //that show up as MAYBE_CONTIGUOUS will have their paths checked //to this node. std::set<DeBruijnNode *> allCheckedNodes; //For each path leaving this node, find all possible paths //outward. Nodes in any of the paths for an edge are //MAYBE_CONTIGUOUS. Nodes in all of the paths for an edge //are CONTIGUOUS. for (size_t i = 0; i < m_edges.size(); ++i) { DeBruijnEdge * edge = m_edges[i]; bool outgoingEdge = (this == edge->getStartingNode()); std::vector< std::vector <DeBruijnNode *> > allPaths; edge->tracePaths(outgoingEdge, g_settings->contiguitySearchSteps, &allPaths, this); //Set all nodes in the paths as MAYBE_CONTIGUOUS for (size_t j = 0; j < allPaths.size(); ++j) { QApplication::processEvents(); for (size_t k = 0; k < allPaths[j].size(); ++k) { DeBruijnNode * node = allPaths[j][k]; node->upgradeContiguityStatus(MAYBE_CONTIGUOUS); allCheckedNodes.insert(node); } } //Set all common nodes as CONTIGUOUS_STRAND_SPECIFIC std::vector<DeBruijnNode *> commonNodesStrandSpecific = getNodesCommonToAllPaths(&allPaths, false); for (size_t j = 0; j < commonNodesStrandSpecific.size(); ++j) (commonNodesStrandSpecific[j])->upgradeContiguityStatus(CONTIGUOUS_STRAND_SPECIFIC); //Set all common nodes (when including reverse complement nodes) //as CONTIGUOUS_EITHER_STRAND std::vector<DeBruijnNode *> commonNodesEitherStrand = getNodesCommonToAllPaths(&allPaths, true); for (size_t j 
= 0; j < commonNodesEitherStrand.size(); ++j) { DeBruijnNode * node = commonNodesEitherStrand[j]; node->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); node->getReverseComplement()->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); } } //For each node that was checked, then we check to see if any //of its paths leads unambiuously back to the starting node (this node). for (std::set<DeBruijnNode *>::iterator i = allCheckedNodes.begin(); i != allCheckedNodes.end(); ++i) { QApplication::processEvents(); DeBruijnNode * node = *i; ContiguityStatus status = node->getContiguityStatus(); //First check without reverse complement target for //strand-specific contiguity. if (status != CONTIGUOUS_STRAND_SPECIFIC && node->doesPathLeadOnlyToNode(this, false)) node->upgradeContiguityStatus(CONTIGUOUS_STRAND_SPECIFIC); //Now check including the reverse complement target for //either strand contiguity. if (status != CONTIGUOUS_STRAND_SPECIFIC && status != CONTIGUOUS_EITHER_STRAND && node->doesPathLeadOnlyToNode(this, true)) { node->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); node->getReverseComplement()->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); } } }
void AssemblyGraph::autoDetermineAllEdgesExactOverlap() { int edgeCount = m_deBruijnGraphEdges.size(); if (edgeCount == 0) return; //Determine the overlap for each edge and produce a vector //that for (size_t i = 0; i < m_deBruijnGraphEdges.size(); ++i) m_deBruijnGraphEdges[i]->autoDetermineExactOverlap(); //The expectation here is that most overlaps will be //the same or from a small subset of possible sizes. //Edges with an overlap that do not match the most common //overlap(s) are suspected of having their overlap //misidentified. They are therefore rechecked using the //common ones. std::vector<int> overlapCounts = makeOverlapCountVector(); //Sort the overlaps in order of decreasing numbers of edges. //I.e. the first overlap size in the vector will be the most //common overlap, the second will be the second most common, //etc. std::vector<int> sortedOverlaps; int overlapsSoFar = 0; double fractionOverlapsFound = 0.0; while (fractionOverlapsFound < 1.0) { int mostCommonOverlap = 0; int mostCommonOverlapCount = 0; //Find the overlap size with the most instances. for (size_t i = 0; i < overlapCounts.size(); ++i) { if (overlapCounts[i] > mostCommonOverlapCount) { mostCommonOverlap = i; mostCommonOverlapCount = overlapCounts[i]; } } //Add that overlap to the common collection and remove it from the counts. sortedOverlaps.push_back(mostCommonOverlap); overlapsSoFar += mostCommonOverlapCount; fractionOverlapsFound = double(overlapsSoFar) / edgeCount; overlapCounts[mostCommonOverlap] = 0; } //For each edge, see if one of the more common overlaps also works. //If so, use that instead. for (size_t i = 0; i < m_deBruijnGraphEdges.size(); ++i) { DeBruijnEdge * edge = m_deBruijnGraphEdges[i]; for (size_t j = 0; j < sortedOverlaps.size(); ++j) { if (edge->getOverlap() == sortedOverlaps[j]) break; else if (edge->testExactOverlap(sortedOverlaps[j])) { edge->setOverlap(sortedOverlaps[j]); break; } } } }