// Classify the blocks in obList as irreducible, transitive or substrings. The irreducible blocks are // put into pOBFinal. The remaining are discarded. // Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first. void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList* pOBFinal) const { if(activeList.empty()) return; // The activeList contains all the blocks that are not yet right terminal // Count the extensions in the top level (longest) blocks first bool all_eliminated = false; while(!activeList.empty() && !all_eliminated) { // The terminalBlock list contains all the blocks that became right-terminal // in the current extension round. OverlapBlockList terminalList; OverlapBlockList potentialContainedList; // Perform a single round of extension, any terminal blocks // are moved to the terminated list extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList); // Compare the blocks in the contained list against the other terminal and active blocks // If they are a substring match to any of these, discard them OverlapBlockList::iterator containedIter = potentialContainedList.begin(); for(; containedIter != potentialContainedList.end(); ++containedIter) { if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && !isBlockSubstring(*containedIter, activeList, m_errorRate)) { // Not a substring, move to terminal list terminalList.push_back(*containedIter); //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n"; } else { //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n"; } } // Using the terminated blocks, mark as eliminated any active blocks // that form a valid overlap to the terminal block. 
These are transitive edges // We do not compare two terminal blocks, we don't consider these overlaps to be // transitive OverlapBlockList::iterator terminalIter = terminalList.begin(); for(; terminalIter != terminalList.end(); ++terminalIter) { #ifdef DEBUGOVERLAP std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n"; #endif all_eliminated = true; OverlapBlockList::iterator activeIter = activeList.begin(); for(; activeIter != activeList.end(); ++activeIter) { if(activeIter->isEliminated) continue; // skip previously marked blocks // Two conditions must be met for a block to be transitive wrt terminal: // 1) It must have a strictly shorter overlap than the terminal block // 2) The error rate between the block and terminal must be less than the threshold double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter); if(activeIter->overlapLen < terminalIter->overlapLen && isErrorRateAcceptable(inferredErrorRate, m_errorRate)) { #ifdef DEBUGOVERLAP_2 std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n"; #endif activeIter->isEliminated = true; } else { all_eliminated = false; } } // Move this block to the final list if it has not been previously marked eliminated if(!terminalIter->isEliminated) { #ifdef DEBUGOVERLAP std::cout << "[II] Adding block " << *terminalIter << " to final list\n"; //std::cout << " extension: " << terminalIter->forwardHistory << "\n"; #endif pOBFinal->push_back(*terminalIter); } } } activeList.clear(); }
// Compute the inexact overlaps of a read against the indexed reads.
// The suffix of the read is matched first (against prefixes, in both
// orientations), then the prefix of the read (against suffixes). Blocks found
// are written to pOBOut, reduced to irreducible blocks when m_bIrreducible is set.
// If any search reports failure (the search routine returns false; per the
// original note this happens when the maximum search time is exceeded), no
// further searches are run, pOBOut is cleared and the result is flagged with
// searchAborted = true and isSubstring = false.
OverlapResult OverlapAlgorithm::overlapReadInexact(const SeqRecord& read, int minOverlap,
                                                   OverlapBlockList* pOBOut) const
{
    OverlapResult outcome;
    OverlapBlockList pendingBlocks;
    std::string readSeq = read.seq.toString();

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " suffix\n";
#endif

    // Suffix of the read against prefixes. The && short-circuit ensures that
    // once a search fails none of the later searches are attempted.
    bool searchOk = findOverlapBlocksInexact(readSeq, m_pBWT, m_pRevBWT, sufPreAF,
                                             minOverlap, &pendingBlocks, pOBOut, outcome);
    searchOk = searchOk && findOverlapBlocksInexact(complement(readSeq), m_pRevBWT, m_pBWT, prePreAF,
                                                    minOverlap, &pendingBlocks, pOBOut, outcome);

    if(searchOk)
    {
        if(!m_bIrreducible)
        {
            // Keep every block: move them all onto the output list
            pOBOut->splice(pOBOut->end(), pendingBlocks);
            assert(pendingBlocks.empty());
        }
        else
        {
            // Keep only the irreducible blocks, then reset the working list
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &pendingBlocks, pOBOut);
            pendingBlocks.clear();
        }
    }

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " prefix\n";
#endif

    // Prefix of the read against suffixes
    searchOk = searchOk && findOverlapBlocksInexact(reverseComplement(readSeq), m_pBWT, m_pRevBWT, sufSufAF,
                                                    minOverlap, &pendingBlocks, pOBOut, outcome);
    searchOk = searchOk && findOverlapBlocksInexact(reverse(readSeq), m_pRevBWT, m_pBWT, preSufAF,
                                                    minOverlap, &pendingBlocks, pOBOut, outcome);

    if(searchOk)
    {
        if(!m_bIrreducible)
        {
            pOBOut->splice(pOBOut->end(), pendingBlocks);
            assert(pendingBlocks.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &pendingBlocks, pOBOut);
            pendingBlocks.clear();
        }
    }
    else
    {
        // A search was aborted: report no overlaps at all for this read
        pOBOut->clear();
        outcome.isSubstring = false;
        outcome.searchAborted = true;
    }

    return outcome;
}