long AssemblyJobSsDeBruijn::combineTreesMidHelp(SeqNode* topNodeA, int kmerSize, SeqNode* branchA, SeqNode* branchB, int remainingKmer){ char nucs[] = {'A','T','C','G'}; long newNodeSum = 0; if (remainingKmer == 1){ for (int n = 0; n < 4; ++n){ DeBruijnNode* newNodeB = dynamic_cast<DeBruijnNode*>(branchB->getBranch(nucs[n])); if (newNodeB != 0){ DeBruijnNode* newNodeA = dynamic_cast<DeBruijnNode*>(branchA->getBranch(nucs[n])); if (newNodeA == 0){ newNodeA = new DeBruijnNode(branchA,nucs[n]); branchA->addBranch( newNodeA ); newNodeSum += 1; } newNodeA->addKmerScore( newNodeB->getKmerScore() ); newNodeSum += combineTreesBottomHelp(topNodeA, kmerSize, newNodeA, newNodeB); } } } else { for (int n = 0; n < 4; ++n){ SeqNode* newNodeB = branchB->getBranch(nucs[n]); if (newNodeB != 0){ SeqNode* newNodeA = branchA->getBranch(nucs[n]); if (newNodeA == 0){ newNodeA = new NucNode(branchA,nucs[n]); branchA->addBranch( newNodeA ); } newNodeSum += combineTreesMidHelp(topNodeA, kmerSize, newNodeA, newNodeB, remainingKmer-1); } } } return newNodeSum; }
// Builds a nucleotide tree of depth kmerSize from the '+' strand of every
// sequence in seqs and threads de Bruijn links between consecutive k-mers.
//
// Each path of kmerSize branches from the returned top node spells one k-mer;
// the node at the end of such a path is a DeBruijnNode, every node above it is
// a NucNode.  For every k-mer occurrence:
//   - the DeBruijnNode receives (via addKmerScore) the lowest value among the
//     k nucleotide scores and the k-1 link scores inside the k-mer;
//   - if the previous position of the same sequence also completed a k-mer, the
//     two DeBruijnNodes are linked (add3pLink / add5pLink) with the lower of
//     the link score entering the new k-mer's last nucleotide and the link
//     score leaving the previous k-mer's first nucleotide.
// An 'N' nucleotide restarts k-mer construction, so no k-mer spans an 'N' and
// no link is made across one.
//
// Parameters:
//   seqs     - sequences to index; the buffers obtained from getSeq/getScores/
//              getLinks are released here with delete[].
//   kmerSize - k-mer length; for kmerSize <= 0 only the empty top node is built.
//
// Returns the top node (allocated with new; nothing in this function frees it)
// paired with the number of DeBruijnNodes that were newly created.
//
// Changes relative to the earlier implementation:
//   - the per-sequence scratch arrays were variable-length stack arrays (a
//     compiler extension, not ISO C++); they are now heap arrays allocated once
//     and reused for every sequence;
//   - with kmerSize == 1 those arrays had zero length but slot 0 was written
//     and the DeBruijnNode ended up two levels below the top node.  Now a
//     1-mer's DeBruijnNode hangs directly below the top node.
//     NOTE(review): confirm callers expect this shape for kmerSize == 1.
pair<AssemblyJobSsDeBruijn::NucNode*,long> AssemblyJobSsDeBruijn::makeGraphTree(set<ScoredSeq*>* seqs, int kmerSize){
  NucNode* topNode = new NucNode(0, '\0');
  long numBaseNodes = 0;

  if (kmerSize > 0){
    // number of NucNode levels above the DeBruijnNode level
    const int prefixLen = kmerSize - 1;
    // never allocate a zero-length array, even when there are no prefix levels
    const int slotCount = (prefixLen > 0) ? prefixLen : 1;

    // currentNodes[i] is the node for which the most recent nucleotide sits at
    // depth i+1 of a k-mer; elements earlier in the array therefore belong to
    // k-mers that start later in the sequence.  worstScores[i] is the lowest
    // nucleotide/link score seen along that partial k-mer, including the link
    // that follows its last nucleotide.  Slots are always written before they
    // are read within a sequence, so the arrays can be shared across sequences.
    SeqNode** currentNodes = new SeqNode*[slotCount];
    float* worstScores = new float[slotCount];

    for (set<ScoredSeq*>::iterator seqIt = seqs->begin(); seqIt != seqs->end(); ++seqIt){
      ScoredSeq* seq = *seqIt;

      // for making the de Bruijn graph at the tips
      DeBruijnNode* finishedNode = 0;
      DeBruijnNode* priorNode = 0;

      // used to determine how far along the kmer construction has made it
      long maxCurrentNodeIndex = 0;
      // true once enough prefix levels exist for the next nucleotide to
      // complete a k-mer; a 1-mer needs no prefix at all
      bool createFinishedNode = (prefixLen == 0);

      char* nucs = seq->getSeq('+');
      float* scores = seq->getScores('+');
      float* links = seq->getLinks('+');
      long seqSize = seq->size();
      long seqSizeM1 = seqSize - 1;

      for (long pos = 0; pos < seqSize; ++pos){
        char nuc = nucs[pos];
        float nucScore = scores[pos];

        // link to the following nucleotide; the last position has none
        float linkScore;
        if (pos < seqSizeM1){ linkScore = links[pos]; }
        else { linkScore = 0; }

        if (nuc == 'N'){
          // re-set the derivation of kmers if the nucleotide is an N
          maxCurrentNodeIndex = 0;
          createFinishedNode = (prefixLen == 0);
          finishedNode = 0;
          priorNode = 0;
        } else {
          priorNode = finishedNode;

          if (createFinishedNode){
            // parent of the k-mer's last node and the worst score so far
            SeqNode* oldNode;
            float worstScore;
            if (prefixLen == 0){
              oldNode = topNode;
              worstScore = nucScore;
            } else {
              oldNode = currentNodes[maxCurrentNodeIndex];
              worstScore = worstScores[maxCurrentNodeIndex];
              // the kmer score is the worst score possible
              if ( nucScore < worstScore ){ worstScore = nucScore; }
            }

            finishedNode = dynamic_cast<DeBruijnNode*>( oldNode->getBranch(nuc) );
            if (finishedNode == 0){
              finishedNode = new DeBruijnNode(oldNode, nuc);
              oldNode->addBranch(finishedNode);
              numBaseNodes++;
            }
            finishedNode->addKmerScore(worstScore);

            // add the 3p-directed link score; priorNode != 0 means a k-mer was
            // also completed at pos-1, so pos >= kmerSize here
            if (priorNode != 0){
              float link3pScore = links[pos-1];
              float link5pScore = links[pos-kmerSize];
              float worseLinkScore = link3pScore;
              if (link5pScore < worseLinkScore){ worseLinkScore = link5pScore; }
              priorNode->add3pLink(finishedNode,worseLinkScore);
              finishedNode->add5pLink(priorNode,worseLinkScore);
            }
          }

          if (prefixLen > 0){
            // extend every partial k-mer by this nucleotide, deepest first so
            // that slot n-1 is read before it is overwritten
            for (int n = maxCurrentNodeIndex; n >= 0; --n){
              SeqNode* oldNode;
              float worstScore;
              if (n==0){
                oldNode = topNode;
                worstScore = nucScore;
              } else {
                oldNode = currentNodes[n-1];
                worstScore = worstScores[n-1];
                if ( nucScore < worstScore ){ worstScore = nucScore; }
              }
              if ( linkScore < worstScore ){ worstScore = linkScore; }

              SeqNode* newNode = oldNode->getBranch(nuc);
              if (newNode == 0){
                newNode = new NucNode(oldNode,nuc);
                oldNode->addBranch( newNode );
              }
              currentNodes[n] = newNode;
              worstScores[n] = worstScore;
            }

            // grow the window of partial k-mers until it holds kmerSize-1 levels
            if (! createFinishedNode){
              if (maxCurrentNodeIndex < kmerSize-2){ maxCurrentNodeIndex++; }
              else { createFinishedNode = true; }
            }
          }
        }
      }

      delete [] nucs;
      delete [] scores;
      delete [] links;
    }

    delete [] currentNodes;
    delete [] worstScores;
  }

  return pair<NucNode*,long>(topNode,numBaseNodes);
}