//Returns the mean read depth of the graph's nodes, weighted by node length
//(the sum of length * read depth divided by the summed length).  When
//drawnNodesOnly is true, nodes that are not drawn are left out.  If the
//nodes considered have a combined length of zero, 0.0 is returned.
double AssemblyGraph::getMeanDeBruijnGraphReadDepth(bool drawnNodesOnly)
{
    long double readDepthSum = 0.0;
    long long totalLength = 0;

    QMapIterator<QString, DeBruijnNode*> i(m_deBruijnGraphNodes);
    while (i.hasNext())
    {
        i.next();
        DeBruijnNode * node = i.value();

        if (drawnNodesOnly && node->isNotDrawn())
            continue;

        totalLength += node->getLength();
        readDepthSum += node->getLength() * node->getReadDepth();
    }

    //Avoid dividing by zero when there is nothing to average.
    if (totalLength == 0)
        return 0.0;
    else
        return readDepthSum / totalLength;
}
//This function recursively labels all nodes as drawn that are within a //certain distance of this node. Whichever node called this will //definitely be drawn, so that one is excluded from the recursive call. void DeBruijnNode::labelNeighbouringNodesAsDrawn(int nodeDistance, DeBruijnNode * callingNode) { if (m_highestDistanceInNeighbourSearch > nodeDistance) return; m_highestDistanceInNeighbourSearch = nodeDistance; if (nodeDistance == 0) return; DeBruijnNode * otherNode; for (size_t i = 0; i < m_edges.size(); ++i) { otherNode = m_edges[i]->getOtherNode(this); if (otherNode == callingNode) continue; if (g_settings->doubleMode) otherNode->m_drawn = true; else //single mode { if (otherNode->isPositiveNode()) otherNode->m_drawn = true; else otherNode->getReverseComplement()->m_drawn = true; } otherNode->labelNeighbouringNodesAsDrawn(nodeDistance-1, this); } }
long AssemblyJobSsDeBruijn::combineTreesMidHelp(SeqNode* topNodeA, int kmerSize, SeqNode* branchA, SeqNode* branchB, int remainingKmer){ char nucs[] = {'A','T','C','G'}; long newNodeSum = 0; if (remainingKmer == 1){ for (int n = 0; n < 4; ++n){ DeBruijnNode* newNodeB = dynamic_cast<DeBruijnNode*>(branchB->getBranch(nucs[n])); if (newNodeB != 0){ DeBruijnNode* newNodeA = dynamic_cast<DeBruijnNode*>(branchA->getBranch(nucs[n])); if (newNodeA == 0){ newNodeA = new DeBruijnNode(branchA,nucs[n]); branchA->addBranch( newNodeA ); newNodeSum += 1; } newNodeA->addKmerScore( newNodeB->getKmerScore() ); newNodeSum += combineTreesBottomHelp(topNodeA, kmerSize, newNodeA, newNodeB); } } } else { for (int n = 0; n < 4; ++n){ SeqNode* newNodeB = branchB->getBranch(nucs[n]); if (newNodeB != 0){ SeqNode* newNodeA = branchA->getBranch(nucs[n]); if (newNodeA == 0){ newNodeA = new NucNode(branchA,nucs[n]); branchA->addBranch( newNodeA ); } newNodeSum += combineTreesMidHelp(topNodeA, kmerSize, newNodeA, newNodeB, remainingKmer-1); } } } return newNodeSum; }
// Walks the tree below node for stepsToBottom levels.  Each node reached at
// depth zero is cast to a DeBruijnNode and a new ScoredSeqMonoScore holding
// its kmer and kmer score is inserted into foundKmers.
// NOTE(review): the cast result is used without a null check, so the node at
// the bottom is assumed to always be a DeBruijnNode -- confirm.
void AssemblyJobSsDeBruijn::testTreeConstructionHelper(set<ScoredSeq*>* foundKmers, SeqNode* node, int stepsToBottom){
  if (stepsToBottom != 0){
    // still above the bottom: follow every branch that exists
    char nucList[] = {'A','T','C','G'};
    for (int n = 0; n < 4; ++n){
      SeqNode* childNode = node->getBranch( nucList[n] );
      if (childNode == 0){ continue; }
      testTreeConstructionHelper( foundKmers, childNode, stepsToBottom-1 );
    }
    return;
  }

  // bottom reached: record this node's kmer
  DeBruijnNode* bottomNode = dynamic_cast<DeBruijnNode*>(node);
  foundKmers->insert( new ScoredSeqMonoScore(bottomNode->getKmer(), bottomNode->getKmerScore()) );
}
//Builds the list of BLAST hits (m_hits) that the query has along the path.
//The path's nodes are visited in order; for each node, the query's hits on
//that node are sorted with BlastHit::compareTwoBlastHitPointers and a hit is
//kept only if:
//  -it lies within the path (on the first node it must not start before the
//   path's start position, on the last node it must not end after the
//   path's end position), and
//  -it begins later in the query than the previously kept hit.
BlastQueryPath::BlastQueryPath(Path path, BlastQuery * query) :
    m_path(path), m_query(query)
{
    QList<DeBruijnNode *> pathNodes = m_path.getNodes();

    //An empty path has no hits, and the query is never consulted for it.
    if (pathNodes.isEmpty())
        return;

    //The query's hits do not depend on the node being examined, so the list
    //is fetched once here instead of once per path node.
    QList< QSharedPointer<BlastHit> > queryHits = query->getHits();

    BlastHit * previousHit = 0;
    const int lastNodeIndex = pathNodes.size() - 1;

    for (int i = 0; i < pathNodes.size(); ++i)
    {
        DeBruijnNode * node = pathNodes[i];

        //Gather the hits that fall on this node (matched by node name).
        QList<BlastHit *> hitsThisNode;
        for (int j = 0; j < queryHits.size(); ++j)
        {
            BlastHit * hit = queryHits[j].data();
            if (hit->m_node->getName() == node->getName())
                hitsThisNode.push_back(hit);
        }

        std::sort(hitsThisNode.begin(), hitsThisNode.end(),
                  BlastHit::compareTwoBlastHitPointers);

        for (int j = 0; j < hitsThisNode.size(); ++j)
        {
            BlastHit * hit = hitsThisNode[j];

            //First check to make sure the hit is within the path.  Only the
            //first and last nodes of the path can be partially covered, so
            //only there are the start/end positions compared.
            if ( (i != 0 || hit->m_nodeStart >= m_path.getStartLocation().getPosition()) &&
                 (i != lastNodeIndex || hit->m_nodeEnd <= m_path.getEndLocation().getPosition()))
            {
                //Now make sure that the hit follows the previous hit in the
                //query.
                if (previousHit == 0 || hit->m_queryStart > previousHit->m_queryStart)
                {
                    m_hits.push_back(hit);
                    previousHit = hit;
                }
            }
        }
    }
}
void GraphicsItemEdge::calculateAndSetPath() { setControlPointLocations(); double edgeDistance = QLineF(m_startingLocation, m_endingLocation).length(); double extensionLength = g_settings->segmentLength; if (extensionLength > edgeDistance / 2.0) extensionLength = edgeDistance / 2.0; m_controlPoint1 = extendLine(m_beforeStartingLocation, m_startingLocation, extensionLength); m_controlPoint2 = extendLine(m_afterEndingLocation, m_endingLocation, extensionLength); //If this edge is connecting a node to itself, and that node //is made of only one line segment, then a special path is //required, otherwise the edge will be mostly hidden underneath //the node. DeBruijnNode * startingNode = m_deBruijnEdge->getStartingNode(); DeBruijnNode * endingNode = m_deBruijnEdge->getEndingNode(); if (startingNode == endingNode) { GraphicsItemNode * graphicsItemNode = startingNode->getGraphicsItemNode(); if (graphicsItemNode == 0) graphicsItemNode = startingNode->getReverseComplement()->getGraphicsItemNode(); if (graphicsItemNode != 0 && graphicsItemNode->m_linePoints.size() == 2) { makeSpecialPathConnectingNodeToSelf(); return; } } //If we are in single mode and the edge connects a node to its reverse //complement, then we need a special path to make it visible. if (startingNode == endingNode->getReverseComplement() && !g_settings->doubleMode) { makeSpecialPathConnectingNodeToReverseComplement(); return; } //Otherwise, the path is just a single cubic Bezier curve. QPainterPath path; path.moveTo(m_startingLocation); path.cubicTo(m_controlPoint1, m_controlPoint2, m_endingLocation); setPath(path); }
//Clears the scene and fills it with graphics items for the drawn nodes and
//edges of the graph.  Edge items are added to the scene before node items
//so that the nodes are drawn on top of the edges.
void AssemblyGraph::addGraphicsItemsToScene(MyGraphicsScene * scene)
{
    scene->clear();

    const double meanDrawnReadDepth = getMeanDeBruijnGraphReadDepth(true);

    //Step 1: create (but do not yet add) a GraphicsItemNode for each drawn
    //node, recording the node's read depth relative to the drawn mean.
    QMapIterator<QString, DeBruijnNode*> creationIt(m_deBruijnGraphNodes);
    while (creationIt.hasNext())
    {
        creationIt.next();
        DeBruijnNode * node = creationIt.value();
        if (!node->isDrawn())
            continue;

        //With a mean of zero the ratio is undefined, so 1.0 is used.
        if (meanDrawnReadDepth == 0)
            node->setReadDepthRelativeToMeanDrawnReadDepth(1.0);
        else
            node->setReadDepthRelativeToMeanDrawnReadDepth(node->getReadDepth() / meanDrawnReadDepth);

        GraphicsItemNode * nodeItem = new GraphicsItemNode(node, m_graphAttributes);
        node->setGraphicsItemNode(nodeItem);
        nodeItem->setFlag(QGraphicsItem::ItemIsSelectable);
        nodeItem->setFlag(QGraphicsItem::ItemIsMovable);
    }

    resetAllNodeColours();

    //Step 2: create the GraphicsItemEdge objects and add them to the scene
    //first, so they end up underneath the nodes.
    for (size_t edgeIndex = 0; edgeIndex < m_deBruijnGraphEdges.size(); ++edgeIndex)
    {
        if (!m_deBruijnGraphEdges[edgeIndex]->isDrawn())
            continue;

        GraphicsItemEdge * edgeItem = new GraphicsItemEdge(m_deBruijnGraphEdges[edgeIndex]);
        m_deBruijnGraphEdges[edgeIndex]->setGraphicsItemEdge(edgeItem);
        edgeItem->setFlag(QGraphicsItem::ItemIsSelectable);
        scene->addItem(edgeItem);
    }

    //Step 3: add the node items made in step 1, so they are drawn on top.
    QMapIterator<QString, DeBruijnNode*> additionIt(m_deBruijnGraphNodes);
    while (additionIt.hasNext())
    {
        additionIt.next();
        DeBruijnNode * node = additionIt.value();
        if (node->hasGraphicsItem())
            scene->addItem(node->getGraphicsItemNode());
    }
}
//This function differs from the above by including all reverse complement //nodes in the path search. std::vector<DeBruijnNode *> DeBruijnNode::getNodesCommonToAllPaths(std::vector< std::vector <DeBruijnNode *> > * paths, bool includeReverseComplements) const { std::vector<DeBruijnNode *> commonNodes; //If there are no paths, then return the empty vector. if (paths->size() == 0) return commonNodes; //If there is only one path in path, then they are all common nodes commonNodes = (*paths)[0]; if (paths->size() == 1) return commonNodes; //If there are two or more paths, it's necessary to find the intersection. for (size_t i = 1; i < paths->size(); ++i) { QApplication::processEvents(); std::vector <DeBruijnNode *> * path = &((*paths)[i]); //If we are including reverse complements in the search, //then it is necessary to build a new vector that includes //reverse complement nodes and then use that vector. std::vector <DeBruijnNode *> pathWithReverseComplements; if (includeReverseComplements) { for (size_t j = 0; j < path->size(); ++j) { DeBruijnNode * node = (*path)[j]; pathWithReverseComplements.push_back(node); pathWithReverseComplements.push_back(node->getReverseComplement()); } path = &pathWithReverseComplements; } //Combine the commonNodes vector with the path vector, //excluding any repeats. std::sort(commonNodes.begin(), commonNodes.end()); std::sort(path->begin(), path->end()); std::vector<DeBruijnNode *> newCommonNodes; std::set_intersection(commonNodes.begin(), commonNodes.end(), path->begin(), path->end(), std::back_inserter(newCommonNodes)); commonNodes = newCommonNodes; } return commonNodes; }
//Returns up to upstreamSequenceLength bases of the sequence that lies
//upstream of this node.  Each upstream node is tried in turn; if a node's
//own sequence is too short, the search continues recursively through that
//node's upstream nodes.  The first candidate of exactly the requested length
//is returned.  If none is found, the longest candidate seen is returned
//(possibly an empty sequence).
QByteArray DeBruijnNode::getUpstreamSequence(int upstreamSequenceLength) const
{
    QByteArray longestCandidate;

    std::vector<DeBruijnNode*> upstreamNodes = getUpstreamNodes();
    for (std::vector<DeBruijnNode*>::const_iterator it = upstreamNodes.begin();
         it != upstreamNodes.end(); ++it)
    {
        DeBruijnNode * upstreamNode = *it;
        QByteArray nodeSequence = upstreamNode->getSequence();

        //How many bases are still missing once this node's whole sequence
        //has been used (zero or negative means the node suffices).
        const int shortfall = upstreamSequenceLength - nodeSequence.length();

        QByteArray candidate;
        if (shortfall <= 0)
        {
            //The node has enough sequence: take its rightmost bases.
            candidate = nodeSequence.right(upstreamSequenceLength);
        }
        else
        {
            //Not enough: look further upstream for the missing bases and
            //put them in front of this node's sequence.
            candidate = upstreamNode->getUpstreamSequence(shortfall) + nodeSequence;
        }

        //A full-length candidate can be returned straight away.
        if (candidate.length() == upstreamSequenceLength)
            return candidate;

        //Otherwise remember it if it is the longest so far and move on to
        //the next upstream node.
        if (candidate.length() > longestCandidate.length())
            longestCandidate = candidate;
    }

    //No path gave enough upstream sequence; return the best that was found.
    return longestCandidate;
}
//The startingNodes and nodeDistance parameters are only used if the graph scope //is not WHOLE_GRAPH. void AssemblyGraph::buildOgdfGraphFromNodesAndEdges(std::vector<DeBruijnNode *> startingNodes, int nodeDistance) { if (g_settings->graphScope == WHOLE_GRAPH) { QMapIterator<QString, DeBruijnNode*> i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); //If double mode is off, only positive nodes are drawn. If it's //on, all nodes are drawn. if (i.value()->isPositiveNode() || g_settings->doubleMode) i.value()->setAsDrawn(); } } else //The scope is either around specified nodes or around nodes with BLAST hits { for (size_t i = 0; i < startingNodes.size(); ++i) { DeBruijnNode * node = startingNodes[i]; //If we are in single mode, make sure that each node is positive. if (!g_settings->doubleMode && node->isNegativeNode()) node = node->getReverseComplement(); node->setAsDrawn(); node->setAsSpecial(); node->labelNeighbouringNodesAsDrawn(nodeDistance, 0); } } //First loop through each node, adding it to OGDF if it is drawn. QMapIterator<QString, DeBruijnNode*> i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); if (i.value()->isDrawn()) i.value()->addToOgdfGraph(m_ogdfGraph); } //Then loop through each determining its drawn status and adding it //to OGDF if it is drawn. for (size_t i = 0; i < m_deBruijnGraphEdges.size(); ++i) { m_deBruijnGraphEdges[i]->determineIfDrawn(); if (m_deBruijnGraphEdges[i]->isDrawn()) m_deBruijnGraphEdges[i]->addToOgdfGraph(m_ogdfGraph); } }
//Links every positive node with the node of the opposite name, in both
//directions.  Positive nodes whose opposite is not in the graph are left
//as they are.
void AssemblyGraph::pointEachNodeToItsReverseComplement()
{
    QMapIterator<QString, DeBruijnNode*> i(m_deBruijnGraphNodes);
    while (i.hasNext())
    {
        i.next();
        DeBruijnNode * positiveNode = i.value();
        if (!positiveNode->isPositiveNode())
            continue;

        //QMap::value() is used for the lookup rather than operator[]:
        //operator[] on a non-const map inserts a default (null) entry for a
        //missing key, which would leave null node pointers in the map.
        DeBruijnNode * negativeNode =
                m_deBruijnGraphNodes.value(getOppositeNodeName(positiveNode->getName()));
        if (negativeNode != 0)
        {
            positiveNode->setReverseComplement(negativeNode);
            negativeNode->setReverseComplement(positiveNode);
        }
    }
}
//Loads nodes and edges from a FASTG file.
//Header lines start with '>' and have the form
//  >NODE[:NEIGHBOUR,NEIGHBOUR,...];
//where each node description is split on '_': field 1 gives the node name
//and field 5 the read depth, and a trailing single quote marks a negative
//node.  All other lines are sequence belonging to the most recent header.
//Throws "load error" if a header is malformed or if the graph has no nodes
//once loading has finished (which includes the file failing to open when
//the graph was empty beforehand).
void AssemblyGraph::buildDeBruijnGraphFromFastg(QString fullFileName)
{
    m_graphFileType = FASTG;

    QFile inputFile(fullFileName);
    if (inputFile.open(QIODevice::ReadOnly))
    {
        std::vector<QString> edgeStartingNodeNames;
        std::vector<QString> edgeEndingNodeNames;
        DeBruijnNode * node = 0;

        QTextStream in(&inputFile);
        while (!in.atEnd())
        {
            QApplication::processEvents();

            QString line = in.readLine();

            //If the line does not start with a '>', then this line is part
            //of the sequence for the last node.
            if (!line.startsWith(">"))
            {
                QByteArray sequenceLine = line.simplified().toLocal8Bit();
                if (node != 0)
                    node->appendToSequence(sequenceLine);
                continue;
            }

            //Otherwise we are beginning a new node.
            line.remove(0, 1); //Remove '>' from start
            if (line.endsWith(QLatin1Char(';')))
                line.chop(1); //Remove ';' from end (only if it is there)
            QStringList nodeDetails = line.split(":");

            QString thisNode = nodeDetails.at(0);

            //A single quote as the last character indicates a negative node.
            //endsWith is safe on an empty string, which then fails the
            //field count check below.
            bool negativeNode = thisNode.endsWith(QLatin1Char('\''));

            QStringList thisNodeDetails = thisNode.split("_");
            if (thisNodeDetails.size() < 6)
                throw "load error";

            QString nodeName = thisNodeDetails.at(1);
            nodeName += negativeNode ? "-" : "+";

            //It may be necessary to remove a single quote from the end of
            //the read depth of a negative node.
            QString nodeReadDepthString = thisNodeDetails.at(5);
            if (negativeNode && nodeReadDepthString.endsWith(QLatin1Char('\'')))
                nodeReadDepthString.chop(1);
            double nodeReadDepth = nodeReadDepthString.toDouble();

            //Make the node.  Its sequence is currently empty - it will be
            //added to on subsequent lines of the fastg file.
            node = new DeBruijnNode(nodeName, nodeReadDepth, "");
            m_deBruijnGraphNodes.insert(nodeName, node);

            //The second part of nodeDetails is a comma-delimited list of
            //edge nodes.  Edges aren't made right now (because the other
            //node might not yet exist), so they are saved into vectors and
            //made after all the nodes have been made.
            if (nodeDetails.size() == 1)
                continue;
            QStringList edgeNodes = nodeDetails.at(1).split(",");
            for (int i = 0; i < edgeNodes.size(); ++i)
            {
                QString edgeNode = edgeNodes.at(i);

                bool negativeEdgeNode = edgeNode.endsWith(QLatin1Char('\''));
                if (negativeEdgeNode)
                    edgeNode.chop(1);

                //An empty entry (e.g. from a stray comma) has too few
                //fields and is reported as a load error.
                QStringList edgeNodeDetails = edgeNode.split("_");
                if (edgeNodeDetails.size() < 2)
                    throw "load error";

                QString edgeNodeName = edgeNodeDetails.at(1);
                edgeNodeName += negativeEdgeNode ? "-" : "+";

                edgeStartingNodeNames.push_back(nodeName);
                edgeEndingNodeNames.push_back(edgeNodeName);
            }
        }

        inputFile.close();

        //If all went well, each node will have a reverse complement and
        //nothing more needs to be made.  However, some SPAdes fastg files
        //have, for some reason, negative nodes with no positive counterpart.
        //For that reason, we will now make any reverse complement nodes for
        //nodes that lack them.
        QMapIterator<QString, DeBruijnNode*> nodeIt(m_deBruijnGraphNodes);
        while (nodeIt.hasNext())
        {
            nodeIt.next();
            makeReverseComplementNodeIfNecessary(nodeIt.value());
        }
        pointEachNodeToItsReverseComplement();

        //Create all of the edges
        for (size_t i = 0; i < edgeStartingNodeNames.size(); ++i)
        {
            QString node1Name = edgeStartingNodeNames[i];
            QString node2Name = edgeEndingNodeNames[i];
            createDeBruijnEdge(node1Name, node2Name);
        }
    }

    autoDetermineAllEdgesExactOverlap();

    if (m_deBruijnGraphNodes.size() == 0)
        throw "load error";
}
void GraphicsItemEdge::setControlPointLocations() { DeBruijnNode * startingNode = m_deBruijnEdge->getStartingNode(); DeBruijnNode * endingNode = m_deBruijnEdge->getEndingNode(); if (startingNode->hasGraphicsItem()) { m_startingLocation = startingNode->getGraphicsItemNode()->getLast(); m_beforeStartingLocation = startingNode->getGraphicsItemNode()->getSecondLast(); } else if (startingNode->getReverseComplement()->hasGraphicsItem()) { m_startingLocation = startingNode->getReverseComplement()->getGraphicsItemNode()->getFirst(); m_beforeStartingLocation = startingNode->getReverseComplement()->getGraphicsItemNode()->getSecond(); } if (endingNode->hasGraphicsItem()) { m_endingLocation = endingNode->getGraphicsItemNode()->getFirst(); m_afterEndingLocation = endingNode->getGraphicsItemNode()->getSecond(); } else if (endingNode->getReverseComplement()->hasGraphicsItem()) { m_endingLocation = endingNode->getReverseComplement()->getGraphicsItemNode()->getLast(); m_afterEndingLocation = endingNode->getReverseComplement()->getGraphicsItemNode()->getSecondLast(); } }
// Chooses, for every node in the graph, at most one best 5p link and one
// best 3p link, then removes or completes the choices that are not mutual.
//
// Pass 1: each candidate link is scored with the lower of the two scores
// the linked nodes hold for it; the candidate with the highest such score
// wins.  Only scores above zero can win, otherwise the best link is null.
//
// Pass 2: if a node's best link does not point back at it, the link is made
// mutual when the other node has no best link in that direction, and is
// dropped from this node otherwise.
void AssemblyJobSsDeBruijn::findBestLinks(DeBruijnGraph* graph){
  char nucs[] = {'A','T','C','G'};

  // pass 1: pick the best-scoring link in each direction
  for (DeBruijnGraph::Iterator nodeIt = graph->begin(); nodeIt != graph->end(); ++nodeIt){
    DeBruijnNode* currentNode = *nodeIt;

    DeBruijnNode* top5pLink = 0;
    float top5pScore = 0;
    DeBruijnNode* top3pLink = 0;
    float top3pScore = 0;

    for (int n = 0; n < 4; ++n){
      char nuc = nucs[n];

      DeBruijnNode* cand5pLink = currentNode->get5pLink(nuc);
      if (cand5pLink != 0){
        // the link is only as good as the worse of its two scores
        float ownScore = currentNode->get5pLinkScore(nuc);
        float reciprocalScore = cand5pLink->get3pLinkScore(currentNode->get3pNuc());
        float candScore = (ownScore > reciprocalScore) ? reciprocalScore : ownScore;
        if (candScore > top5pScore){
          top5pScore = candScore;
          top5pLink = cand5pLink;
        }
      }

      DeBruijnNode* cand3pLink = currentNode->get3pLink(nuc);
      if (cand3pLink != 0){
        // the link is only as good as the worse of its two scores
        float ownScore = currentNode->get3pLinkScore(nuc);
        float reciprocalScore = cand3pLink->get5pLinkScore(currentNode->get5pNuc());
        float candScore = (ownScore > reciprocalScore) ? reciprocalScore : ownScore;
        if (candScore > top3pScore){
          top3pScore = candScore;
          top3pLink = cand3pLink;
        }
      }
    }

    currentNode->setBest5pLink(top5pLink);
    currentNode->setBest3pLink(top3pLink);
  }

  // pass 2: resolve best-edge conflicts
  for (DeBruijnGraph::Iterator nodeIt = graph->begin(); nodeIt != graph->end(); ++nodeIt){
    DeBruijnNode* currentNode = *nodeIt;

    DeBruijnNode* link5p = currentNode->best5pLink();
    if (link5p != 0){
      DeBruijnNode* reciprocal = link5p->best3pLink();
      if (reciprocal == 0){
        // the other node has made no choice: adopt this one
        link5p->setBest3pLink(currentNode);
      } else if (reciprocal != currentNode){
        // the other node prefers someone else: give the link up
        currentNode->setBest5pLink( 0 );
      }
    }

    // read only now, after the 5p side has been settled
    DeBruijnNode* link3p = currentNode->best3pLink();
    if (link3p != 0){
      DeBruijnNode* reciprocal = link3p->best5pLink();
      if (reciprocal == 0){
        link3p->setBest5pLink(currentNode);
      } else if (reciprocal != currentNode){
        currentNode->setBest3pLink( 0 );
      }
    }
  }
}
//This function determines the contiguity of nodes relative to this one. //It has two steps: // -First, for each edge leaving this node, all paths outward are found. // Any nodes in any path are MAYBE_CONTIGUOUS, and nodes in all of the // paths are CONTIGUOUS. // -Second, it is necessary to check in the opposite direction - for each // of the MAYBE_CONTIGUOUS nodes, do they have a path that unambiguously // leads to this node? If so, then they are CONTIGUOUS. void DeBruijnNode::determineContiguity() { upgradeContiguityStatus(STARTING); //A set is used to store all nodes found in the paths, as the nodes //that show up as MAYBE_CONTIGUOUS will have their paths checked //to this node. std::set<DeBruijnNode *> allCheckedNodes; //For each path leaving this node, find all possible paths //outward. Nodes in any of the paths for an edge are //MAYBE_CONTIGUOUS. Nodes in all of the paths for an edge //are CONTIGUOUS. for (size_t i = 0; i < m_edges.size(); ++i) { DeBruijnEdge * edge = m_edges[i]; bool outgoingEdge = (this == edge->getStartingNode()); std::vector< std::vector <DeBruijnNode *> > allPaths; edge->tracePaths(outgoingEdge, g_settings->contiguitySearchSteps, &allPaths, this); //Set all nodes in the paths as MAYBE_CONTIGUOUS for (size_t j = 0; j < allPaths.size(); ++j) { QApplication::processEvents(); for (size_t k = 0; k < allPaths[j].size(); ++k) { DeBruijnNode * node = allPaths[j][k]; node->upgradeContiguityStatus(MAYBE_CONTIGUOUS); allCheckedNodes.insert(node); } } //Set all common nodes as CONTIGUOUS_STRAND_SPECIFIC std::vector<DeBruijnNode *> commonNodesStrandSpecific = getNodesCommonToAllPaths(&allPaths, false); for (size_t j = 0; j < commonNodesStrandSpecific.size(); ++j) (commonNodesStrandSpecific[j])->upgradeContiguityStatus(CONTIGUOUS_STRAND_SPECIFIC); //Set all common nodes (when including reverse complement nodes) //as CONTIGUOUS_EITHER_STRAND std::vector<DeBruijnNode *> commonNodesEitherStrand = getNodesCommonToAllPaths(&allPaths, true); for (size_t j 
= 0; j < commonNodesEitherStrand.size(); ++j) { DeBruijnNode * node = commonNodesEitherStrand[j]; node->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); node->getReverseComplement()->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); } } //For each node that was checked, then we check to see if any //of its paths leads unambiuously back to the starting node (this node). for (std::set<DeBruijnNode *>::iterator i = allCheckedNodes.begin(); i != allCheckedNodes.end(); ++i) { QApplication::processEvents(); DeBruijnNode * node = *i; ContiguityStatus status = node->getContiguityStatus(); //First check without reverse complement target for //strand-specific contiguity. if (status != CONTIGUOUS_STRAND_SPECIFIC && node->doesPathLeadOnlyToNode(this, false)) node->upgradeContiguityStatus(CONTIGUOUS_STRAND_SPECIFIC); //Now check including the reverse complement target for //either strand contiguity. if (status != CONTIGUOUS_STRAND_SPECIFIC && status != CONTIGUOUS_EITHER_STRAND && node->doesPathLeadOnlyToNode(this, true)) { node->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); node->getReverseComplement()->upgradeContiguityStatus(CONTIGUOUS_EITHER_STRAND); } } }
// Builds a ScoredSeq from the chain of nodes that starts at node5p and is
// followed through best3pLink() until node3p is reached.  The sequence is
// the first node's kmer plus one nucleotide per further node, so its length
// is _kmerSize + numNodes - 1.  Each position's score is the highest kmer
// score among the nodes covering it.
// NOTE(review): the three arrays allocated here are not freed in this
// function; presumably the ScoredSeqShallow takes ownership of them --
// confirm against its constructor.
ScoredSeq* AssemblyJobSsDeBruijn::makeSeqFromPath(DeBruijnNode* node5p, DeBruijnNode* node3p, long numNodes){
  long seqLen = _kmerSize + numNodes - 1;

  char* cSeq = new char[seqLen + 1];
  cSeq[seqLen] = '\0';
  float* scores = new float[seqLen];
  float* links = new float[seqLen-1];

  // the first kmer: its nucleotides, and its score for every position and
  // every link that lies inside it
  node5p->getKmer(cSeq,0);
  float firstScore = node5p->getKmerScore();
  for (int n = 0; n < _kmerSize; ++n){ scores[n] = firstScore; }
  for (int n = 0; n < _kmerSize - 1; ++n){ links[n] = firstScore; }

  // every further node adds one nucleotide at the 3p end; pos5p and pos3p
  // are the first and last sequence positions covered by the current node
  long pos5p = 0;
  long pos3p = _kmerSize - 1;
  DeBruijnNode* currentNode = node5p;
  while (currentNode != node3p){
    ++pos5p;
    ++pos3p;
    DeBruijnNode* priorNode = currentNode;
    currentNode = currentNode->best3pLink();

    // the newly covered position and the link leading into it
    char newNuc = currentNode->get3pNuc();
    float kmerScore = currentNode->getKmerScore();
    cSeq[pos3p] = newNuc;
    scores[pos3p] = kmerScore;
    links[pos3p-1] = priorNode->get3pLinkScore( newNuc );

    // the link at the node's 5p edge keeps the higher of the two scores
    float link5pScore = currentNode->get5pLinkScore( cSeq[pos5p-1] );
    if (links[pos5p-1] < link5pScore){ links[pos5p-1] = link5pScore; }

    // positions already covered by earlier nodes keep their highest score
    for (long pos = pos5p; pos < pos3p; ++pos){
      if (scores[pos] < kmerScore){ scores[pos] = kmerScore; }
    }
  }

  ScoredSeq* novelSeq = new ScoredSeqShallow(true, cSeq, scores, links, seqLen);
  return novelSeq;
}
//Loads nodes and edges from a GFA file.  Lines are tab-delimited:
//  -"S" lines are sequences (nodes): part 1 is the name and part 2 the
//   sequence.  A positive and a negative node are made for each.  The read
//   depth is taken from a "KC:f:" attribute if there is one, otherwise it
//   is zero.
//  -"L" lines are links (edges): parts 1-4 give the two node names with
//   their +/- signs and part 5 the overlap as a cigar string.
//Other lines are ignored.  Throws "load error" if an S or L line has too
//few parts, or if the graph has no nodes once loading has finished.
void AssemblyGraph::buildDeBruijnGraphFromGfa(QString fullFileName)
{
    m_graphFileType = GFA;

    QFile inputFile(fullFileName);
    if (inputFile.open(QIODevice::ReadOnly))
    {
        std::vector<QString> edgeStartingNodeNames;
        std::vector<QString> edgeEndingNodeNames;
        std::vector<int> edgeOverlaps;

        QTextStream in(&inputFile);
        while (!in.atEnd())
        {
            QApplication::processEvents();
            QString line = in.readLine();

            //The separator is a literal tab, so a plain string split is
            //used; no regular expression is needed.
            QStringList lineParts = line.split("\t");

            if (lineParts.size() < 1)
                continue;

            //Lines beginning with "S" are sequence (node) lines
            if (lineParts.at(0) == "S")
            {
                if (lineParts.size() < 3)
                    throw "load error";

                QString nodeName = lineParts.at(1);
                QString posNodeName = nodeName + "+";
                QString negNodeName = nodeName + "-";

                QByteArray sequence = lineParts.at(2).toLocal8Bit();
                QByteArray revCompSequence = getReverseComplement(sequence);

                //If there is an attribute holding the read depth, we'll use
                //that.  If there isn't, then we'll use zero.  The attribute
                //must have at least one character after its "KC:f:" tag.
                double nodeReadDepth = 0.0;
                for (int i = 3; i < lineParts.size(); ++i)
                {
                    QString part = lineParts.at(i);
                    if (part.size() >= 6 && part.startsWith("KC:f:"))
                        nodeReadDepth = part.mid(5).toDouble();
                }

                DeBruijnNode * node = new DeBruijnNode(posNodeName, nodeReadDepth, sequence);
                DeBruijnNode * reverseComplementNode = new DeBruijnNode(negNodeName, nodeReadDepth, revCompSequence);
                node->setReverseComplement(reverseComplementNode);
                reverseComplementNode->setReverseComplement(node);
                m_deBruijnGraphNodes.insert(posNodeName, node);
                m_deBruijnGraphNodes.insert(negNodeName, reverseComplementNode);
            }

            //Lines beginning with "L" are link (edge) lines
            else if (lineParts.at(0) == "L")
            {
                //Edges aren't made now, in case their sequence hasn't yet
                //been specified.  Instead, we save the starting and ending
                //nodes and make the edges after we're done looking at the
                //file.
                if (lineParts.size() < 6)
                    throw "load error";

                //Parts 1 and 3 hold the node names and parts 2 and 4 hold
                //the corresponding +/-.
                QString startingNode = lineParts.at(1) + lineParts.at(2);
                QString endingNode = lineParts.at(3) + lineParts.at(4);
                edgeStartingNodeNames.push_back(startingNode);
                edgeEndingNodeNames.push_back(endingNode);

                //Part 5 holds the node overlap cigar string
                QString cigar = lineParts.at(5);
                edgeOverlaps.push_back(getLengthFromCigar(cigar));
            }
        }

        //The whole file has been read, so it can be closed before the
        //edges are built.
        inputFile.close();

        //Create all of the edges
        for (size_t i = 0; i < edgeStartingNodeNames.size(); ++i)
        {
            QString node1Name = edgeStartingNodeNames[i];
            QString node2Name = edgeEndingNodeNames[i];
            int overlap = edgeOverlaps[i];
            createDeBruijnEdge(node1Name, node2Name, overlap);
        }
    }

    if (m_deBruijnGraphNodes.size() == 0)
        throw "load error";
}
//Loads nodes and edges from a LastGraph file.  Lines are split on
//whitespace:
//  -The third field of the first line, if present, is stored as the kmer
//   size.
//  -"NODE" lines give a node's name, length and coverage; the two lines
//   that follow hold the node's sequence and its reverse complement
//   sequence.  A positive and a negative node are made for each.
//  -"ARC" lines give the two nodes joined by an edge.
//All edges are given an exact overlap of zero.  Throws "load error" if a
//NODE or ARC line has too few fields, or if the graph has no nodes once
//loading has finished.
void AssemblyGraph::buildDeBruijnGraphFromLastGraph(QString fullFileName)
{
    m_graphFileType = LAST_GRAPH;

    bool firstLine = true;
    QFile inputFile(fullFileName);
    if (inputFile.open(QIODevice::ReadOnly))
    {
        //The separator pattern is the same for every line, so it is built
        //once rather than for each split.
        const QRegExp whitespace("\\s+");

        QTextStream in(&inputFile);
        while (!in.atEnd())
        {
            QApplication::processEvents();
            QString line = in.readLine();

            if (firstLine)
            {
                QStringList firstLineParts = line.split(whitespace);
                if (firstLineParts.size() > 2)
                    m_kmer = firstLineParts[2].toInt();
                firstLine = false;
            }

            if (line.startsWith("NODE"))
            {
                QStringList nodeDetails = line.split(whitespace);

                if (nodeDetails.size() < 4)
                    throw "load error";

                QString nodeName = nodeDetails.at(1);
                QString posNodeName = nodeName + "+";
                QString negNodeName = nodeName + "-";

                //The read depth is the coverage column divided by the node
                //length; a non-positive length leaves the coverage as is.
                int nodeLength = nodeDetails.at(2).toInt();

                double nodeReadDepth;
                if (nodeLength > 0)
                    nodeReadDepth = double(nodeDetails.at(3).toInt()) / nodeLength; //IS THIS COLUMN ($COV_SHORT1) THE BEST ONE TO USE?
                else
                    nodeReadDepth = double(nodeDetails.at(3).toInt());

                //The next two lines of the file belong to this node.
                QByteArray sequence = in.readLine().toLocal8Bit();
                QByteArray revCompSequence = in.readLine().toLocal8Bit();

                DeBruijnNode * node = new DeBruijnNode(posNodeName, nodeReadDepth, sequence);
                DeBruijnNode * reverseComplementNode = new DeBruijnNode(negNodeName, nodeReadDepth, revCompSequence);
                node->setReverseComplement(reverseComplementNode);
                reverseComplementNode->setReverseComplement(node);
                m_deBruijnGraphNodes.insert(posNodeName, node);
                m_deBruijnGraphNodes.insert(negNodeName, reverseComplementNode);
            }
            else if (line.startsWith("ARC"))
            {
                QStringList arcDetails = line.split(whitespace);

                if (arcDetails.size() < 3)
                    throw "load error";

                QString node1Name = convertNormalNumberStringToBandageNodeName(arcDetails.at(1));
                QString node2Name = convertNormalNumberStringToBandageNodeName(arcDetails.at(2));

                createDeBruijnEdge(node1Name, node2Name);
            }
        }
        inputFile.close();

        setAllEdgesExactOverlap(0);
    }

    if (m_deBruijnGraphNodes.size() == 0)
        throw "load error";
}
// Builds the kmer tree (and the de Bruijn links between its bottom nodes)
// from the '+' strand of every sequence in seqs.
// Each nucleotide extends all of the partial kmers currently being built by
// one level of NucNodes; once a partial kmer is one nucleotide short of
// kmerSize, the next nucleotide completes it as a DeBruijnNode, which is
// scored with the worst score met along the kmer and linked to the
// DeBruijnNode completed at the previous position.  An 'N' abandons all
// partial kmers and breaks the chain of links.
// Returns the root of the tree paired with the number of DeBruijnNodes that
// were created.  For kmerSize <= 0 only the empty root is returned.
// NOTE(review): for kmerSize == 1 the loop below still places DeBruijnNodes
// two levels below the root, exactly as before -- confirm whether a kmer
// size of 1 is ever expected here.
pair<AssemblyJobSsDeBruijn::NucNode*,long> AssemblyJobSsDeBruijn::makeGraphTree(set<ScoredSeq*>* seqs, int kmerSize){
  NucNode* topNode = new NucNode(0, '\0');
  long numBaseNodes = 0;
  if (kmerSize > 0){

    // scratch space shared by all sequences (every slot is written before it
    // is read within a sequence): slot n holds the node, and the worst score
    // so far, of the partial kmer for which the current nuc is at the Nth
    // position; the elements earlier in the arrays represent kmers that come
    // later in the sequence.  Slot 0 is always used, so at least one slot is
    // allocated even when kmerSize-1 is zero.
    const int numSlots = (kmerSize > 1) ? kmerSize - 1 : 1;
    SeqNode** currentNodes = new SeqNode*[numSlots];
    float* worstScores = new float[numSlots];

    for (set<ScoredSeq*>::iterator seqIt = seqs->begin(); seqIt != seqs->end(); ++seqIt){
      ScoredSeq* seq = *seqIt;

      // for making the de Bruijn graph at the tips
      DeBruijnNode* finishedNode = 0;
      DeBruijnNode* priorNode = 0;

      // used to determine how far along the kmer construction has made it
      long maxCurrentNodeIndex = 0;
      bool createFinishedNode = false;

      char* nucs = seq->getSeq('+');
      float* scores = seq->getScores('+');
      float* links = seq->getLinks('+');
      long seqSize = seq->size();
      long seqSizeM1 = seqSize - 1;

      for (long pos = 0; pos < seqSize; ++pos){
        char nuc = nucs[pos];
        float nucScore = scores[pos];

        // will only be used up to the second-to-last position
        float linkScore;
        if (pos < seqSizeM1){ linkScore = links[pos]; }
        else { linkScore = 0; }

        // re-set the derivation of kmers if the nucleotide is an N
        if (nuc == 'N'){
          maxCurrentNodeIndex = 0;
          createFinishedNode = false;
          finishedNode = 0;
          priorNode = 0;
        } else {
          priorNode = finishedNode;

          if (createFinishedNode){
            // the longest partial kmer is completed by this nucleotide
            SeqNode* oldNode = currentNodes[maxCurrentNodeIndex];
            float worstScore = worstScores[maxCurrentNodeIndex];
            finishedNode = dynamic_cast<DeBruijnNode*>( oldNode->getBranch(nuc) );
            if (finishedNode == 0){
              finishedNode = new DeBruijnNode(oldNode, nuc);
              oldNode->addBranch(finishedNode);
              numBaseNodes++;
            }
            // add the kmer score to the node; the worst score possible!
            if ( nucScore < worstScore ){ worstScore = nucScore; }
            finishedNode->addKmerScore(worstScore);

            // add the 3p-directed link score: the worse of the link into
            // this position and the link kmerSize positions back
            if (priorNode != 0){
              float link3pScore = links[pos-1];
              float link5pScore = links[pos-kmerSize];
              float worseLinkScore = link3pScore;
              if (link5pScore < worseLinkScore){ worseLinkScore = link5pScore; }
              priorNode->add3pLink(finishedNode,worseLinkScore);
              finishedNode->add5pLink(priorNode,worseLinkScore);
            }
          }

          // extend every partial kmer by this nucleotide, longest first so
          // that slot n-1 is read before it is overwritten
          for (int n = maxCurrentNodeIndex; n >= 0; --n){
            SeqNode* oldNode;
            float worstScore;
            if (n==0){
              oldNode = topNode;
              worstScore = nucScore;
            } else {
              oldNode = currentNodes[n-1];
              worstScore = worstScores[n-1];
              if ( nucScore < worstScore ){ worstScore = nucScore; }
            }
            if ( linkScore < worstScore ){ worstScore = linkScore; }

            SeqNode* newNode = oldNode->getBranch(nuc);
            if (newNode == 0){
              newNode = new NucNode(oldNode,nuc);
              oldNode->addBranch( newNode );
            }
            currentNodes[n] = newNode;
            worstScores[n] = worstScore;
          }

          // until full-length kmers are being produced, grow the number of
          // partial kmers being tracked
          if (! createFinishedNode){
            if (maxCurrentNodeIndex < kmerSize-2){ maxCurrentNodeIndex++; }
            else { createFinishedNode = true; }
          }
        }
      }
      delete [] nucs;
      delete [] scores;
      delete [] links;
    }

    delete [] currentNodes;
    delete [] worstScores;
  }
  return pair<NucNode*,long>(topNode,numBaseNodes);
}