// Classify the blocks in obList as irreducible, transitive or substrings. The irreducible blocks are // put into pOBFinal. The remaining are discarded. // Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first. void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList* pOBFinal) const { if(activeList.empty()) return; // The activeList contains all the blocks that are not yet right terminal // Count the extensions in the top level (longest) blocks first bool all_eliminated = false; while(!activeList.empty() && !all_eliminated) { // The terminalBlock list contains all the blocks that became right-terminal // in the current extension round. OverlapBlockList terminalList; OverlapBlockList potentialContainedList; // Perform a single round of extension, any terminal blocks // are moved to the terminated list extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList); // Compare the blocks in the contained list against the other terminal and active blocks // If they are a substring match to any of these, discard them OverlapBlockList::iterator containedIter = potentialContainedList.begin(); for(; containedIter != potentialContainedList.end(); ++containedIter) { if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && !isBlockSubstring(*containedIter, activeList, m_errorRate)) { // Not a substring, move to terminal list terminalList.push_back(*containedIter); //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n"; } else { //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n"; } } // Using the terminated blocks, mark as eliminated any active blocks // that form a valid overlap to the terminal block. These are transitive edges // We do not compare two terminal blocks, we don't consider these overlaps to be // transitive OverlapBlockList::iterator terminalIter = terminalList.begin(); for(; terminalIter != terminalList.end(); ++terminalIter) { #ifdef DEBUGOVERLAP std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n"; #endif all_eliminated = true; OverlapBlockList::iterator activeIter = activeList.begin(); for(; activeIter != activeList.end(); ++activeIter) { if(activeIter->isEliminated) continue; // skip previously marked blocks // Two conditions must be met for a block to be transitive wrt terminal: // 1) It must have a strictly shorter overlap than the terminal block // 2) The error rate between the block and terminal must be less than the threshold double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter); if(activeIter->overlapLen < terminalIter->overlapLen && isErrorRateAcceptable(inferredErrorRate, m_errorRate)) { #ifdef DEBUGOVERLAP_2 std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n"; #endif activeIter->isEliminated = true; } else { all_eliminated = false; } } // Move this block to the final list if it has not been previously marked eliminated if(!terminalIter->isEliminated) { #ifdef DEBUGOVERLAP std::cout << "[II] Adding block " << *terminalIter << " to final list\n"; //std::cout << " extension: " << terminalIter->forwardHistory << "\n"; #endif pOBFinal->push_back(*terminalIter); } } } activeList.clear(); }
// Extend all the blocks in activeList by one base to the right // Move all right-terminal blocks to the termainl list. If a block // is terminal and potentially contained by another block, add it to // containedList void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList& terminalList, OverlapBlockList& /*containedList*/) const { OverlapBlockList::iterator iter = activeList.begin(); OverlapBlockList::iterator next; while(iter != activeList.end()) { next = iter; ++next; // Check if block is terminal AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT); if(ext_count.get('$') > 0) { // Only consider this block to be terminal irreducible if it has at least one extension // or else it is a substring block if(iter->forwardHistory.size() > 0) { OverlapBlock branched = *iter; BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT)); terminalList.push_back(branched); #ifdef DEBUGOVERLAP_2 std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n"; #endif } } int curr_extension = iter->forwardHistory.size(); // Perform the right extensions // Best case, there is only a single extension character // Handle this case specially so we don't need to copy the potentially // large OverlapBlock structure and its full history if(ext_count.hasUniqueDNAChar()) { // Get the extension character with respect to the queried sequence char canonical_base = ext_count.getUniqueDNAChar(); // Flip the base into the frame of reference for the block char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; // Update the block using the base in its frame of reference BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT)); // Add the base to the history in the frame of reference of the query read // This is so the history is consistent when comparing between blocks from different strands iter->forwardHistory.add(curr_extension, canonical_base); } else { for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx) { char canonical_base = ALPHABET[idx]; char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; if(ext_count.get(canonical_base) == 0) continue; // Branch the sequence. This involves copying the entire history which can be large // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link // structure but branches are infrequent enough to not have a large impact OverlapBlock branched = *iter; BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT)); assert(branched.ranges.isValid()); // Add the base in the canonical frame branched.forwardHistory.add(curr_extension, canonical_base); // Insert the new block after the iterator activeList.insert(iter, branched); } // Remove the original block, which has been superceded by the branches activeList.erase(iter); } iter = next; // this skips the newly-inserted blocks } }
// Seeded blockwise BWT alignment of prefix-suffix for reads // Each alignment is given a seed region and a block region // The seed region is the terminal portion of w where maxDiff + 1 seeds are created // at least 1 of these seeds must align exactly for there to be an alignment with // at most maxDiff differences between the prefix/suffix. Only alignments within the // range [block_start, block_end] are output. The block_end coordinate is inclusive. bool OverlapAlgorithm::findOverlapBlocksInexact(const std::string& w, const BWT* pBWT, const BWT* pRevBWT, const AlignFlags& af, int minOverlap, OverlapBlockList* pOverlapList, OverlapBlockList* pContainList, OverlapResult& result) const { int len = w.length(); int overlap_region_left = len - minOverlap; SearchSeedVector* pCurrVector = new SearchSeedVector; SearchSeedVector* pNextVector = new SearchSeedVector; OverlapBlockList workingList; SearchSeedVector::iterator iter; // Create and extend the initial seeds int actual_seed_length = m_seedLength; int actual_seed_stride = m_seedStride; if(actual_seed_length == 0) { // Calculate a seed length and stride that will guarantee all overlaps // with error rate m_errorRate will be found calculateSeedParameters(w, minOverlap, actual_seed_length, actual_seed_stride); } assert(actual_seed_stride != 0); createSearchSeeds(w, pBWT, pRevBWT, actual_seed_length, actual_seed_stride, pCurrVector); extendSeedsExactRight(w, pBWT, pRevBWT, ED_RIGHT, pCurrVector, pNextVector); pCurrVector->clear(); pCurrVector->swap(*pNextVector); assert(pNextVector->empty()); int num_steps = 0; // Perform the inexact extensions bool fail = false; while(!pCurrVector->empty()) { if(m_maxSeeds != -1 && (int)pCurrVector->size() > m_maxSeeds) { fail = true; break; } iter = pCurrVector->begin(); while(iter != pCurrVector->end()) { SearchSeed& align = *iter; // If the current aligned region is right-terminal // and the overlap is greater than minOverlap, try to find overlaps // or containments if(align.right_index == len - 1) { double align_error = align.calcErrorRate(); // Check for overlaps if(align.left_index <= overlap_region_left && isErrorRateAcceptable(align_error, m_errorRate)) { int overlapLen = len - align.left_index; BWTIntervalPair probe = align.ranges; BWTAlgorithms::updateBothL(probe, '$', pBWT); // The probe interval contains the range of proper prefixes if(probe.interval[1].isValid()) { assert(probe.interval[1].lower > 0); OverlapBlock nBlock(probe, align.ranges, overlapLen, align.z, af, align.historyLink->getHistoryVector()); workingList.push_back(nBlock); } } // Check for containments // If the seed is left-terminal and there are [ACGT] left/right extensions of the sequence // this read must be a substring of another read if(align.left_index == 0) { AlphaCount64 left_ext = BWTAlgorithms::getExtCount(align.ranges.interval[0], pBWT); AlphaCount64 right_ext = BWTAlgorithms::getExtCount(align.ranges.interval[1], pRevBWT); if(left_ext.hasDNAChar() || right_ext.hasDNAChar()) result.isSubstring = true; } } // Extend the seed to the right/left if(align.dir == ED_RIGHT) extendSeedInexactRight(align, w, pBWT, pRevBWT, pNextVector); else extendSeedInexactLeft(align, w, pBWT, pRevBWT, pNextVector); ++iter; //pCurrVector->erase(iter++); } pCurrVector->clear(); assert(pCurrVector->empty()); pCurrVector->swap(*pNextVector); // Remove identical seeds after we have performed seed_len steps // as there now might be redundant seeds if(num_steps % actual_seed_stride == 0) { std::sort(pCurrVector->begin(), pCurrVector->end(), SearchSeed::compareLeftRange); SearchSeedVector::iterator end_iter = std::unique(pCurrVector->begin(), pCurrVector->end(), SearchSeed::equalLeftRange); pCurrVector->resize(end_iter - pCurrVector->begin()); } ++num_steps; } if(!fail) { // parse the working list to remove any submaximal overlap blocks // these blocks correspond to reads that have multiple valid overlaps. // we only keep the longest removeSubMaximalBlocks(&workingList, pBWT, pRevBWT); OverlapBlockList containedWorkingList; partitionBlockList(len, &workingList, pOverlapList, &containedWorkingList); // Terminate the contained blocks terminateContainedBlocks(containedWorkingList); // Move the contained blocks to the final contained list pContainList->splice(pContainList->end(), containedWorkingList); } delete pCurrVector; delete pNextVector; return !fail; }