Ejemplo n.º 1
0
// Classify the blocks in obList as irreducible, transitive or substrings. The irreducible blocks are
// put into pOBFinal. The remaining are discarded.
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, 
                                                        OverlapBlockList& activeList, 
                                                        OverlapBlockList* pOBFinal) const
{
    if(activeList.empty())
        return;
    
    // The activeList contains all the blocks that are not yet right terminal
    // Count the extensions in the top level (longest) blocks first
    bool all_eliminated = false;
    while(!activeList.empty() && !all_eliminated)
    {
        // The terminalBlock list contains all the blocks that became right-terminal
        // in the current extension round.
        OverlapBlockList terminalList;
        OverlapBlockList potentialContainedList;

        // Perform a single round of extension, any terminal blocks
        // are moved to the terminated list
        extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList);

        // Compare the blocks in the contained list against the other terminal and active blocks
        // If they are a substring match to any of these, discard them
        OverlapBlockList::iterator containedIter = potentialContainedList.begin();
        for(; containedIter != potentialContainedList.end(); ++containedIter)
        {
           if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && 
              !isBlockSubstring(*containedIter, activeList, m_errorRate))
           {
                // Not a substring, move to terminal list
                terminalList.push_back(*containedIter);
                //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n";
           }
           else
           {
                //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n";
           }
        }

        // Using the terminated blocks, mark as eliminated any active blocks
        // that form a valid overlap to the terminal block. These are transitive edges
        // We do not compare two terminal blocks, we don't consider these overlaps to be
        // transitive
        OverlapBlockList::iterator terminalIter = terminalList.begin();
        for(; terminalIter != terminalList.end(); ++terminalIter)
        {
#ifdef DEBUGOVERLAP
            std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n";
#endif       
            all_eliminated = true;
            OverlapBlockList::iterator activeIter = activeList.begin();
            for(; activeIter != activeList.end(); ++activeIter)
            {
                if(activeIter->isEliminated)
                    continue; // skip previously marked blocks
                
                // Two conditions must be met for a block to be transitive wrt terminal:
                // 1) It must have a strictly shorter overlap than the terminal block
                // 2) The error rate between the block and terminal must be less than the threshold
                double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter);
                if(activeIter->overlapLen < terminalIter->overlapLen && 
                   isErrorRateAcceptable(inferredErrorRate, m_errorRate))
                {
#ifdef DEBUGOVERLAP_2                            
                    std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n";
#endif
                    activeIter->isEliminated = true;
                }
                else
                {
                    all_eliminated = false;
                }
            } 
            
            // Move this block to the final list if it has not been previously marked eliminated
            if(!terminalIter->isEliminated)
            {
#ifdef DEBUGOVERLAP
                std::cout << "[II] Adding block " << *terminalIter << " to final list\n";
                //std::cout << "  extension: " << terminalIter->forwardHistory << "\n";
#endif                
                pOBFinal->push_back(*terminalIter);
            }
        }
    }

    activeList.clear();
}
Ejemplo n.º 2
0
// Extend all the blocks in activeList by one base to the right
// Move all right-terminal blocks to the termainl list. If a block 
// is terminal and potentially contained by another block, add it to 
// containedList
void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, 
                                               OverlapBlockList& activeList, 
                                               OverlapBlockList& terminalList,
                                               OverlapBlockList& /*containedList*/) const
{
    OverlapBlockList::iterator iter = activeList.begin();
    OverlapBlockList::iterator next;
    while(iter != activeList.end())
    {
        next = iter;
        ++next;

        // Check if block is terminal
        AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT);
        if(ext_count.get('$') > 0)
        {
            // Only consider this block to be terminal irreducible if it has at least one extension
            // or else it is a substring block
            if(iter->forwardHistory.size() > 0)
            {
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                terminalList.push_back(branched);
#ifdef DEBUGOVERLAP_2            
                std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n";
#endif
            }
        }

        int curr_extension = iter->forwardHistory.size();

        // Perform the right extensions
        
        // Best case, there is only a single extension character
        // Handle this case specially so we don't need to copy the potentially
        // large OverlapBlock structure and its full history
        if(ext_count.hasUniqueDNAChar())
        {
            // Get the extension character with respect to the queried sequence
            char canonical_base = ext_count.getUniqueDNAChar();

            // Flip the base into the frame of reference for the block
            char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;

            // Update the block using the base in its frame of reference
            BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT));

            // Add the base to the history in the frame of reference of the query read
            // This is so the history is consistent when comparing between blocks from different strands
            iter->forwardHistory.add(curr_extension, canonical_base);
        }
        else
        {
            for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
            {
                char canonical_base = ALPHABET[idx];
                char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
                if(ext_count.get(canonical_base) == 0)
                    continue;

                // Branch the sequence. This involves copying the entire history which can be large
                // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link
                // structure but branches are infrequent enough to not have a large impact
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT));
                assert(branched.ranges.isValid());

                // Add the base in the canonical frame
                branched.forwardHistory.add(curr_extension, canonical_base);

                // Insert the new block after the iterator
                activeList.insert(iter, branched);
            }

            // Remove the original block, which has been superceded by the branches
            activeList.erase(iter);
        }

        iter = next; // this skips the newly-inserted blocks
    }
} 
Ejemplo n.º 3
0
// Seeded blockwise BWT alignment of prefix-suffix for reads
// Each alignment is given a seed region and a block region
// The seed region is the terminal portion of w where maxDiff + 1 seeds are created
// at least 1 of these seeds must align exactly for there to be an alignment with 
// at most maxDiff differences between the prefix/suffix. Only alignments within the
// range [block_start, block_end] are output. The block_end coordinate is inclusive.
bool OverlapAlgorithm::findOverlapBlocksInexact(const std::string& w, const BWT* pBWT, 
                                                const BWT* pRevBWT, const AlignFlags& af, int minOverlap,
                                                OverlapBlockList* pOverlapList, OverlapBlockList* pContainList, 
                                                OverlapResult& result) const
{
    int len = w.length();
    int overlap_region_left = len - minOverlap;
    SearchSeedVector* pCurrVector = new SearchSeedVector;
    SearchSeedVector* pNextVector = new SearchSeedVector;
    OverlapBlockList workingList;
    SearchSeedVector::iterator iter;

    // Create and extend the initial seeds
    int actual_seed_length = m_seedLength;
    int actual_seed_stride = m_seedStride;

    if(actual_seed_length == 0)
    {
        // Calculate a seed length and stride that will guarantee all overlaps
        // with error rate m_errorRate will be found
        calculateSeedParameters(w, minOverlap, actual_seed_length, actual_seed_stride);
    }

    assert(actual_seed_stride != 0);

    createSearchSeeds(w, pBWT, pRevBWT, actual_seed_length, actual_seed_stride, pCurrVector);
    extendSeedsExactRight(w, pBWT, pRevBWT, ED_RIGHT, pCurrVector, pNextVector);
    pCurrVector->clear();
    pCurrVector->swap(*pNextVector);
    assert(pNextVector->empty());

    int num_steps = 0;

    // Perform the inexact extensions
    bool fail = false;
    while(!pCurrVector->empty())
    {
        if(m_maxSeeds != -1 && (int)pCurrVector->size() > m_maxSeeds)
        {
            fail = true;
            break;
        }

        iter = pCurrVector->begin();
        while(iter != pCurrVector->end())
        {
            SearchSeed& align = *iter;

            // If the current aligned region is right-terminal
            // and the overlap is greater than minOverlap, try to find overlaps
            // or containments
            if(align.right_index == len - 1)
            {
                double align_error = align.calcErrorRate();

                // Check for overlaps
                if(align.left_index <= overlap_region_left && isErrorRateAcceptable(align_error, m_errorRate))
                {
                    int overlapLen = len - align.left_index;
                    BWTIntervalPair probe = align.ranges;
                    BWTAlgorithms::updateBothL(probe, '$', pBWT);
                    
                    // The probe interval contains the range of proper prefixes
                    if(probe.interval[1].isValid())
                    {
                        assert(probe.interval[1].lower > 0);
                        OverlapBlock nBlock(probe, align.ranges, overlapLen, align.z, af, align.historyLink->getHistoryVector());
                        workingList.push_back(nBlock);
                    }
                }

                // Check for containments
                // If the seed is left-terminal and there are [ACGT] left/right extensions of the sequence
                // this read must be a substring of another read
                if(align.left_index == 0)
                {
                    AlphaCount64 left_ext = BWTAlgorithms::getExtCount(align.ranges.interval[0], pBWT);
                    AlphaCount64 right_ext = BWTAlgorithms::getExtCount(align.ranges.interval[1], pRevBWT);
                    if(left_ext.hasDNAChar() || right_ext.hasDNAChar())
                        result.isSubstring = true;
                }
            }

            // Extend the seed to the right/left
            if(align.dir == ED_RIGHT)
                extendSeedInexactRight(align, w, pBWT, pRevBWT, pNextVector);
            else
                extendSeedInexactLeft(align, w, pBWT, pRevBWT, pNextVector);
            ++iter;
            //pCurrVector->erase(iter++);
        }
        pCurrVector->clear();
        assert(pCurrVector->empty());
        pCurrVector->swap(*pNextVector);

        // Remove identical seeds after we have performed seed_len steps
        // as there now might be redundant seeds
        if(num_steps % actual_seed_stride == 0)
        {
            std::sort(pCurrVector->begin(), pCurrVector->end(), SearchSeed::compareLeftRange);
            SearchSeedVector::iterator end_iter = std::unique(pCurrVector->begin(), pCurrVector->end(), 
                                                                   SearchSeed::equalLeftRange);
            pCurrVector->resize(end_iter - pCurrVector->begin());
        }
        ++num_steps;
    }

    if(!fail)
    {
        // parse the working list to remove any submaximal overlap blocks
        // these blocks correspond to reads that have multiple valid overlaps. 
        // we only keep the longest
        removeSubMaximalBlocks(&workingList, pBWT, pRevBWT);

        OverlapBlockList containedWorkingList;
        partitionBlockList(len, &workingList, pOverlapList, &containedWorkingList);
        
        // Terminate the contained blocks
        terminateContainedBlocks(containedWorkingList);
        
        // Move the contained blocks to the final contained list
        pContainList->splice(pContainList->end(), containedWorkingList);
    }

    delete pCurrVector;
    delete pNextVector;
    return !fail;
}