Ejemplo n.º 1
0
// Extend all the blocks in activeList by one base to the right
// Move all right-terminal blocks to the termainl list. If a block 
// is terminal and potentially contained by another block, add it to 
// containedList
void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, 
                                               OverlapBlockList& activeList, 
                                               OverlapBlockList& terminalList,
                                               OverlapBlockList& /*containedList*/) const
{
    OverlapBlockList::iterator iter = activeList.begin();
    OverlapBlockList::iterator next;
    while(iter != activeList.end())
    {
        next = iter;
        ++next;

        // Check if block is terminal
        AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT);
        if(ext_count.get('$') > 0)
        {
            // Only consider this block to be terminal irreducible if it has at least one extension
            // or else it is a substring block
            if(iter->forwardHistory.size() > 0)
            {
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                terminalList.push_back(branched);
#ifdef DEBUGOVERLAP_2            
                std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n";
#endif
            }
        }

        int curr_extension = iter->forwardHistory.size();

        // Perform the right extensions
        
        // Best case, there is only a single extension character
        // Handle this case specially so we don't need to copy the potentially
        // large OverlapBlock structure and its full history
        if(ext_count.hasUniqueDNAChar())
        {
            // Get the extension character with respect to the queried sequence
            char canonical_base = ext_count.getUniqueDNAChar();

            // Flip the base into the frame of reference for the block
            char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;

            // Update the block using the base in its frame of reference
            BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT));

            // Add the base to the history in the frame of reference of the query read
            // This is so the history is consistent when comparing between blocks from different strands
            iter->forwardHistory.add(curr_extension, canonical_base);
        }
        else
        {
            for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
            {
                char canonical_base = ALPHABET[idx];
                char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
                if(ext_count.get(canonical_base) == 0)
                    continue;

                // Branch the sequence. This involves copying the entire history which can be large
                // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link
                // structure but branches are infrequent enough to not have a large impact
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT));
                assert(branched.ranges.isValid());

                // Add the base in the canonical frame
                branched.forwardHistory.add(curr_extension, canonical_base);

                // Insert the new block after the iterator
                activeList.insert(iter, branched);
            }

            // Remove the original block, which has been superceded by the branches
            activeList.erase(iter);
        }

        iter = next; // this skips the newly-inserted blocks
    }
} 
Ejemplo n.º 2
0
// Find the exact overlaps for READ and append the overlap blocks describing
// them to pOBOut.
//
// read       - the query record; read.seq is converted to a string and searched
// minOverlap - forwarded unchanged to every findOverlapBlocksExact call
// pOBOut     - output list, dereferenced unconditionally (must not be null).
//              Containment blocks are appended first, then the suffix blocks,
//              then the prefix blocks.
//
// When m_bIrreducible is set the suffix/prefix blocks go through
// computeIrreducibleBlocks before reaching pOBOut; otherwise they are moved
// to pOBOut as they are.
//
// Returns the OverlapResult that was handed to the findOverlapBlocksExact calls.
OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    std::string seq = read.seq.toString();
    const std::string::size_type seqLen = seq.length();

    // We store the various overlap blocks using a number of lists, one for the containments
    // in the forward and reverse index and one for each set of overlap blocks
    OverlapBlockList oblFwdContain;
    OverlapBlockList oblRevContain;

    OverlapBlockList oblSuffixFwd;
    OverlapBlockList oblSuffixRev;
    OverlapBlockList oblPrefixFwd;
    OverlapBlockList oblPrefixRev;

    // Match the suffix of seq to prefixes.
    // The complement(seq) search is skipped when m_noReverse is set.
    findOverlapBlocksExact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &oblSuffixFwd, &oblFwdContain, result);
    if(!m_noReverse)
    {
        findOverlapBlocksExact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &oblSuffixRev, &oblRevContain, result);
    }

    // Match the prefix of seq to suffixes.
    // The reverseComplement(seq) search is skipped when m_noReverse is set.
    if(!m_noReverse)
    {
        findOverlapBlocksExact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &oblPrefixFwd, &oblFwdContain, result);
    }
    findOverlapBlocksExact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &oblPrefixRev, &oblRevContain, result);

    // Remove submaximal blocks for each block list including fully contained blocks.
    // To do so, first copy the containment blocks into the prefix/suffix lists
    // (copies, because the containment lists themselves are moved to the output below)
    oblSuffixFwd.insert(oblSuffixFwd.end(), oblFwdContain.begin(), oblFwdContain.end());
    oblPrefixFwd.insert(oblPrefixFwd.end(), oblFwdContain.begin(), oblFwdContain.end());
    oblSuffixRev.insert(oblSuffixRev.end(), oblRevContain.begin(), oblRevContain.end());
    oblPrefixRev.insert(oblPrefixRev.end(), oblRevContain.begin(), oblRevContain.end());

    // Perform the submaximal filter; the reverse lists swap the two indices
    removeSubMaximalBlocks(&oblSuffixFwd, m_pBWT, m_pRevBWT);
    removeSubMaximalBlocks(&oblPrefixFwd, m_pBWT, m_pRevBWT);
    removeSubMaximalBlocks(&oblSuffixRev, m_pRevBWT, m_pBWT);
    removeSubMaximalBlocks(&oblPrefixRev, m_pRevBWT, m_pBWT);

    // Remove the containment blocks from the suffix/prefix lists again
    removeContainmentBlocks(seqLen, &oblSuffixFwd);
    removeContainmentBlocks(seqLen, &oblPrefixFwd);
    removeContainmentBlocks(seqLen, &oblSuffixRev);
    removeContainmentBlocks(seqLen, &oblPrefixRev);

    // Join the forward and reverse lists: suffix with suffix, prefix with prefix
    oblSuffixFwd.splice(oblSuffixFwd.end(), oblSuffixRev);
    oblPrefixFwd.splice(oblPrefixFwd.end(), oblPrefixRev);

    // Move the containments to the output list
    pOBOut->splice(pOBOut->end(), oblFwdContain);
    pOBOut->splice(pOBOut->end(), oblRevContain);

    // Filter out transitive overlap blocks if requested
    if(m_bIrreducible)
    {
        computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblSuffixFwd, pOBOut);
        computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblPrefixFwd, pOBOut);
    }
    else
    {
        pOBOut->splice(pOBOut->end(), oblSuffixFwd);
        pOBOut->splice(pOBOut->end(), oblPrefixFwd);
    }

    return result;
}