// Extend all the blocks in activeList by one base to the right // Move all right-terminal blocks to the termainl list. If a block // is terminal and potentially contained by another block, add it to // containedList void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList& terminalList, OverlapBlockList& /*containedList*/) const { OverlapBlockList::iterator iter = activeList.begin(); OverlapBlockList::iterator next; while(iter != activeList.end()) { next = iter; ++next; // Check if block is terminal AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT); if(ext_count.get('$') > 0) { // Only consider this block to be terminal irreducible if it has at least one extension // or else it is a substring block if(iter->forwardHistory.size() > 0) { OverlapBlock branched = *iter; BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT)); terminalList.push_back(branched); #ifdef DEBUGOVERLAP_2 std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n"; #endif } } int curr_extension = iter->forwardHistory.size(); // Perform the right extensions // Best case, there is only a single extension character // Handle this case specially so we don't need to copy the potentially // large OverlapBlock structure and its full history if(ext_count.hasUniqueDNAChar()) { // Get the extension character with respect to the queried sequence char canonical_base = ext_count.getUniqueDNAChar(); // Flip the base into the frame of reference for the block char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; // Update the block using the base in its frame of reference BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT)); // Add the base to the history in the frame of reference of the query read // This is so the history is consistent when comparing between blocks from different strands iter->forwardHistory.add(curr_extension, canonical_base); } else { for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx) { char canonical_base = ALPHABET[idx]; char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; if(ext_count.get(canonical_base) == 0) continue; // Branch the sequence. This involves copying the entire history which can be large // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link // structure but branches are infrequent enough to not have a large impact OverlapBlock branched = *iter; BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT)); assert(branched.ranges.isValid()); // Add the base in the canonical frame branched.forwardHistory.add(curr_extension, canonical_base); // Insert the new block after the iterator activeList.insert(iter, branched); } // Remove the original block, which has been superceded by the branches activeList.erase(iter); } iter = next; // this skips the newly-inserted blocks } }
// Construct the set of blocks describing irreducible overlaps with READ // and write the blocks to pOBOut OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const { OverlapResult result; // The complete set of overlap blocks are collected in obWorkingList // The filtered set (containing only irreducible overlaps) are placed into pOBOut // by calculateIrreducibleHits OverlapBlockList obWorkingList; std::string seq = read.seq.toString(); // We store the various overlap blocks using a number of lists, one for the containments // in the forward and reverse index and one for each set of overlap blocks OverlapBlockList oblFwdContain; OverlapBlockList oblRevContain; OverlapBlockList oblSuffixFwd; OverlapBlockList oblSuffixRev; OverlapBlockList oblPrefixFwd; OverlapBlockList oblPrefixRev; // Match the suffix of seq to prefixes findOverlapBlocksExact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &oblSuffixFwd, &oblFwdContain, result); if (!m_noReverse) { findOverlapBlocksExact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &oblSuffixRev, &oblRevContain, result); } // Match the prefix of seq to suffixes if (!m_noReverse) { findOverlapBlocksExact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &oblPrefixFwd, &oblFwdContain, result); } findOverlapBlocksExact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &oblPrefixRev, &oblRevContain, result); // Remove submaximal blocks for each block list including fully contained blocks // Copy the containment blocks into the prefix/suffix lists oblSuffixFwd.insert(oblSuffixFwd.end(), oblFwdContain.begin(), oblFwdContain.end()); oblPrefixFwd.insert(oblPrefixFwd.end(), oblFwdContain.begin(), oblFwdContain.end()); oblSuffixRev.insert(oblSuffixRev.end(), oblRevContain.begin(), oblRevContain.end()); oblPrefixRev.insert(oblPrefixRev.end(), oblRevContain.begin(), oblRevContain.end()); // Perform the submaximal filter removeSubMaximalBlocks(&oblSuffixFwd, m_pBWT, m_pRevBWT); removeSubMaximalBlocks(&oblPrefixFwd, m_pBWT, m_pRevBWT); removeSubMaximalBlocks(&oblSuffixRev, m_pRevBWT, m_pBWT); removeSubMaximalBlocks(&oblPrefixRev, m_pRevBWT, m_pBWT); // Remove the contain blocks from the suffix/prefix lists removeContainmentBlocks(seq.length(), &oblSuffixFwd); removeContainmentBlocks(seq.length(), &oblPrefixFwd); removeContainmentBlocks(seq.length(), &oblSuffixRev); removeContainmentBlocks(seq.length(), &oblPrefixRev); // Join the suffix and prefix lists oblSuffixFwd.splice(oblSuffixFwd.end(), oblSuffixRev); oblPrefixFwd.splice(oblPrefixFwd.end(), oblPrefixRev); // Move the containments to the output list pOBOut->splice(pOBOut->end(), oblFwdContain); pOBOut->splice(pOBOut->end(), oblRevContain); // Filter out transitive overlap blocks if requested if(m_bIrreducible) { computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblSuffixFwd, pOBOut); computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblPrefixFwd, pOBOut); } else { pOBOut->splice(pOBOut->end(), oblSuffixFwd); pOBOut->splice(pOBOut->end(), oblPrefixFwd); } return result; }