// Build the graph by expanding nodes on the frontier void StringGraphGenerator::buildGraph(FrontierQueue& queue) { while(!queue.empty()) { if(queue.size() > 200) break; GraphFrontier node = queue.front(); queue.pop(); if(node.pVertex->getColor() == EXPLORED_COLOR) continue; // node has been visited already // Search the FM-index for the current vertex SeqRecord record; record.id = node.pVertex->getID(); record.seq = node.pVertex->getSeq().toString(); OverlapBlockList blockList; assert(blockList.empty()); m_pOverlapper->overlapRead(record, m_minOverlap, &blockList); // Update the graph and the frontier queue with newly found vertices updateGraphAndQueue(node, queue, blockList); node.pVertex->setColor(EXPLORED_COLOR); } m_pGraph->setColors(GC_WHITE); }
// iterate through obList and determine the overlaps that are irreducible. This function is recursive. // The final overlap blocks corresponding to irreducible overlaps are written to pOBFinal. // Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first. // Invariant: each block corresponds to the same extension of the root sequence w. void OverlapAlgorithm::_processIrreducibleBlocksExactIterative(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& inList, OverlapBlockList* pOBFinal) const { if(inList.empty()) return; // We store the overlap blocks in groups of blocks that have the same right-extension. // When a branch is found, the groups are split based on the extension typedef std::list<OverlapBlockList> BlockGroups; BlockGroups blockGroups; blockGroups.push_back(inList); int numExtensions = 0; int numBranches = 0; while(!blockGroups.empty()) { // Perform one extenion round for each group. // If the top-level block has ended, push the result // to the final list and remove the group from processing BlockGroups::iterator groupIter = blockGroups.begin(); BlockGroups incomingGroups; // Branched blocks are placed here while(groupIter != blockGroups.end()) { OverlapBlockList& currList = *groupIter; bool bEraseGroup = false; // Count the extensions in the top level (longest) blocks first int topLen = currList.front().overlapLen; AlphaCount64 ext_count; OBLIter blockIter = currList.begin(); while(blockIter != currList.end() && blockIter->overlapLen == topLen) { ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT); ++blockIter; } // Three cases: // 1) The top level block has ended as it contains the extension $. Output TLB and end. // 2) There is a singular unique extension base for all the blocks. Update the blocks and continue. // 3) There are multiple extension bases, split the block group and continue. 
// If some block other than the TLB ended, it must be contained within the TLB and it is not output // or considered further. // Likewise if multiple distinct strings in the TLB ended, we only output the top one. The rest // must have the same sequence as the top one and are hence considered to be contained with the top element. if(ext_count.get('$') > 0) { // An irreducible overlap has been found. It is possible that there are two top level blocks // (one in the forward and reverse direction). Since we can't decide which one // contains the other at this point, we output hits to both. Under a fixed // length string assumption one will be contained within the other and removed later. OBLIter tlbIter = currList.begin(); while(tlbIter != currList.end() && tlbIter->overlapLen == topLen) { // Ensure the tlb is actually terminal and not a substring block AlphaCount64 test_count = tlbIter->getCanonicalExtCount(pBWT, pRevBWT); if(test_count.get('$') == 0) { std::cerr << "Error: substring read found during overlap computation.\n"; std::cerr << "Please run sga rmdup before sga overlap\n"; exit(EXIT_FAILURE); } // Perform the final right-update to make the block terminal OverlapBlock branched = *tlbIter; BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT)); pOBFinal->push_back(branched); #ifdef DEBUGOVERLAP std::cout << "[IE] TLB of length " << branched.overlapLen << " has ended\n"; std::cout << "[IE]\tBlock data: " << branched << "\n"; #endif ++tlbIter; } // Set the flag to erase this group, it is finished bEraseGroup = true; } else { // Count the extension for the rest of the blocks while(blockIter != currList.end()) { ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT); ++blockIter; } if(ext_count.hasUniqueDNAChar()) { // Update all the blocks using the unique extension character // This character is in the canonical representation wrt to the query char b = ext_count.getUniqueDNAChar(); updateOverlapBlockRangesRight(pBWT, 
pRevBWT, currList, b); numExtensions++; bEraseGroup = false; } else { for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx) { char b = ALPHABET[idx]; if(ext_count.get(b) > 0) { numBranches++; OverlapBlockList branched = currList; updateOverlapBlockRangesRight(pBWT, pRevBWT, branched, b); incomingGroups.push_back(branched); bEraseGroup = true; } } } } if(bEraseGroup) groupIter = blockGroups.erase(groupIter); else ++groupIter; } // Splice in the newly branched blocks, if any blockGroups.splice(blockGroups.end(), incomingGroups); } }
// Classify the blocks in obList as irreducible, transitive or substrings. The irreducible blocks are // put into pOBFinal. The remaining are discarded. // Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first. void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList* pOBFinal) const { if(activeList.empty()) return; // The activeList contains all the blocks that are not yet right terminal // Count the extensions in the top level (longest) blocks first bool all_eliminated = false; while(!activeList.empty() && !all_eliminated) { // The terminalBlock list contains all the blocks that became right-terminal // in the current extension round. OverlapBlockList terminalList; OverlapBlockList potentialContainedList; // Perform a single round of extension, any terminal blocks // are moved to the terminated list extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList); // Compare the blocks in the contained list against the other terminal and active blocks // If they are a substring match to any of these, discard them OverlapBlockList::iterator containedIter = potentialContainedList.begin(); for(; containedIter != potentialContainedList.end(); ++containedIter) { if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && !isBlockSubstring(*containedIter, activeList, m_errorRate)) { // Not a substring, move to terminal list terminalList.push_back(*containedIter); //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n"; } else { //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n"; } } // Using the terminated blocks, mark as eliminated any active blocks // that form a valid overlap to the terminal block. 
These are transitive edges // We do not compare two terminal blocks, we don't consider these overlaps to be // transitive OverlapBlockList::iterator terminalIter = terminalList.begin(); for(; terminalIter != terminalList.end(); ++terminalIter) { #ifdef DEBUGOVERLAP std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n"; #endif all_eliminated = true; OverlapBlockList::iterator activeIter = activeList.begin(); for(; activeIter != activeList.end(); ++activeIter) { if(activeIter->isEliminated) continue; // skip previously marked blocks // Two conditions must be met for a block to be transitive wrt terminal: // 1) It must have a strictly shorter overlap than the terminal block // 2) The error rate between the block and terminal must be less than the threshold double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter); if(activeIter->overlapLen < terminalIter->overlapLen && isErrorRateAcceptable(inferredErrorRate, m_errorRate)) { #ifdef DEBUGOVERLAP_2 std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n"; #endif activeIter->isEliminated = true; } else { all_eliminated = false; } } // Move this block to the final list if it has not been previously marked eliminated if(!terminalIter->isEliminated) { #ifdef DEBUGOVERLAP std::cout << "[II] Adding block " << *terminalIter << " to final list\n"; //std::cout << " extension: " << terminalIter->forwardHistory << "\n"; #endif pOBFinal->push_back(*terminalIter); } } } activeList.clear(); }
// Compute the inexact overlaps of the given read and append the resulting blocks to pOBOut.
// The suffix of the read is processed first, then the prefix. When m_bIrreducible is set,
// the blocks found for each end are reduced with computeIrreducibleBlocks before being output;
// otherwise they are output unchanged.
// If any of the searches reports failure, pOBOut is cleared and the returned result has
// searchAborted set to true and isSubstring set to false.
OverlapResult OverlapAlgorithm::overlapReadInexact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    OverlapBlockList workingBlocks;
    std::string seq = read.seq.toString();

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " suffix\n";
#endif

    // Match the suffix of seq to prefixes.
    // findOverlapBlocksInexact returns false if the maximum search time was exceeded. In this
    // case none of the subsequent steps is run and no overlaps are returned.
    bool searchOk = findOverlapBlocksInexact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &workingBlocks, pOBOut, result) &&
                    findOverlapBlocksInexact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &workingBlocks, pOBOut, result);

    if(searchOk)
    {
        if(!m_bIrreducible)
        {
            // Output every block that was found; the splice leaves the working list empty
            pOBOut->splice(pOBOut->end(), workingBlocks);
            assert(workingBlocks.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &workingBlocks, pOBOut);
            workingBlocks.clear();
        }
    }

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " prefix\n";
#endif

    // Match the prefix of seq to suffixes; skipped entirely once a search has failed
    searchOk = searchOk &&
               findOverlapBlocksInexact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &workingBlocks, pOBOut, result);
    searchOk = searchOk &&
               findOverlapBlocksInexact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &workingBlocks, pOBOut, result);

    if(searchOk)
    {
        if(!m_bIrreducible)
        {
            pOBOut->splice(pOBOut->end(), workingBlocks);
            assert(workingBlocks.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &workingBlocks, pOBOut);
            workingBlocks.clear();
        }
    }
    else
    {
        // A search was aborted: discard everything, including blocks already written to the output
        pOBOut->clear();
        result.isSubstring = false;
        result.searchAborted = true;
    }

    return result;
}