// Grow the graph by repeatedly taking a frontier entry off the queue,
// searching the FM-index for overlaps to its vertex and handing the hits
// to updateGraphAndQueue. Expansion stops when the queue is empty or when
// it holds more than 200 entries. All vertex colors are reset to GC_WHITE
// before returning.
void StringGraphGenerator::buildGraph(FrontierQueue& queue)
{
    while(!queue.empty() && queue.size() <= 200)
    {
        GraphFrontier frontier = queue.front();
        queue.pop();

        // Vertices that were already expanded are dropped silently
        if(frontier.pVertex->getColor() != EXPLORED_COLOR)
        {
            // Query the overlapper with the ID and sequence of this vertex
            SeqRecord query;
            query.id = frontier.pVertex->getID();
            query.seq = frontier.pVertex->getSeq().toString();

            OverlapBlockList overlaps;
            assert(overlaps.empty());
            m_pOverlapper->overlapRead(query, m_minOverlap, &overlaps);

            // Add the discovered vertices/edges and extend the frontier,
            // then mark this vertex so it is not expanded a second time
            updateGraphAndQueue(frontier, queue, overlaps);
            frontier.pVertex->setColor(EXPLORED_COLOR);
        }
    }

    m_pGraph->setColors(GC_WHITE);
}
// Return the graph vertex that represents the read in record, creating it
// if the graph does not contain it yet.
//
// The vertex ID is the canonical ID of the block, returned by
// alignReadDuplicate, that matches the read with zero differences and
// without the query being reversed. Such a block is required to exist
// (checked with an assert).
Vertex* StringGraphGenerator::addTerminalVertex(const SeqRecord& record)
{
    assert(m_pGraph != NULL);

    // Perform a full-length search for the sequence in the FM-index
    OverlapBlockList endBlockList;
    m_pOverlapper->alignReadDuplicate(record, &endBlockList);

    // Search the block list for the exact match to the end read. This must exist
    OverlapBlockList::iterator matchIter = endBlockList.begin();
    while(matchIter != endBlockList.end())
    {
        if(matchIter->numDiff == 0 && !matchIter->flags.isQueryRev())
            break; // this block corresponds to the actual sequence of endRead

        // Advance to the next candidate. Without this step the loop never
        // terminates when the first block is not the exact match.
        ++matchIter;
    }
    assert(matchIter != endBlockList.end());

    // Construct the canonical ID from the matching interval
    std::string endID = matchIter->toCanonicalID();

    // Reuse the vertex if it is already in the graph, otherwise allocate it
    // from the graph's vertex allocator and register it
    Vertex* pVertex = m_pGraph->getVertex(endID);
    if(pVertex == NULL)
    {
        pVertex = new(m_pGraph->getVertexAllocator()) Vertex(endID, record.seq.toString());
        m_pGraph->addVertex(pVertex);
    }
    return pVertex;
}
Example #3
0
// Run the cluster process. If the number of total nodes
// exceeds max, abort the search.
void ReadCluster::run(size_t max)
{
    while(!m_queue.empty())
    {
        if(m_queue.size() + m_outCluster.size() > max)
        {
            while(!m_queue.empty())
                m_queue.pop();
            m_outCluster.clear();
            return;
        }

        ClusterNode node = m_queue.front();
        m_queue.pop();

        // Add this node to the output
        m_outCluster.push_back(node);

        // Find overlaps for the current node
        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = node.sequence;
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);
        
        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            // Check if the reads in this block are part of the cluster already
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;
            if(m_usedIndex.count(canonicalIndex) == 0)
            {
                // This is a new node that isn't in the cluster. Add it.
                m_usedIndex.insert(canonicalIndex);

                ClusterNode newNode;
                newNode.sequence = iter->getFullString(node.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                m_queue.push(newNode);
            }
        }
    }
}
Example #4
0
// Substring test for terminalBlock against the members of blockList.
// Members that have both the same overlap length and the same number of
// right extensions as terminalBlock are skipped (same length, cannot be a
// substring). Returns true as soon as the error rate between terminalBlock
// and one of the remaining members is acceptable with respect to maxER,
// false if no member qualifies.
bool OverlapAlgorithm::isBlockSubstring(OverlapBlock& terminalBlock, const OverlapBlockList& blockList, double maxER) const
{
    const size_t terminalExtLen = terminalBlock.forwardHistory.size();

    OverlapBlockList::const_iterator candidate = blockList.begin();
    while(candidate != blockList.end())
    {
        if(terminalBlock.overlapLen != candidate->overlapLen ||
           terminalExtLen != candidate->forwardHistory.size())
        {
            // Lengths differ, compare the two blocks by error rate
            double blockER = calculateBlockErrorRate(terminalBlock, *candidate);
            if(isErrorRateAcceptable(blockER, maxER))
                return true;
        }
        ++candidate;
    }
    return false;
}
Example #5
0
// Update the overlap block list with a righthand extension to b, removing ranges that become invalid
void OverlapAlgorithm::updateOverlapBlockRangesRight(const BWT* pBWT, const BWT* pRevBWT, 
                                                     OverlapBlockList& obList, char canonical_base) const
{
    OverlapBlockList::iterator iter = obList.begin(); 
    while(iter != obList.end())
    {
        char relative_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
        BWTAlgorithms::updateBothR(iter->ranges, relative_base, iter->getExtensionBWT(pBWT, pRevBWT));
        // remove the block from the list if its no longer valid
        if(!iter->ranges.isValid())
        {
            iter = obList.erase(iter);
        }
        else
        {
            // Add the base to the extension history
            int currExtension = iter->forwardHistory.size();
            iter->forwardHistory.add(currExtension, canonical_base);
            ++iter;
        }
    }
}
// Add the overlaps found for the vertex of currNode to the graph and push
// the newly reachable, unexplored vertices onto the frontier queue.
//
// currNode  - frontier entry that was searched; only blocks whose edge
//             direction equals currNode.dir are used
// queue     - frontier queue that receives the follow-up entries
// blockList - overlap blocks found for the vertex of currNode
//
// A follow-up entry is only queued if its accumulated distance does not
// exceed m_maxDistance.
void StringGraphGenerator::updateGraphAndQueue(GraphFrontier& currNode, FrontierQueue& queue, OverlapBlockList& blockList)
{
    Vertex* pX = currNode.pVertex;

    // Note: the block list is not partitioned into containment and extension
    // blocks here (the partitionBlockList call is disabled), every block is examined.
    //partitionBlockList(pX->getSeqLen(), &blockList, &overlapList, &containList);

    // Process the overlap blocks, adding new vertices and edges where necessary
    for(OverlapBlockList::iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
    {
        if(iter->getEdgeDir() != currNode.dir)
            continue;

        std::string vertexID = iter->toCanonicalID();
        if(vertexID == pX->getID())
            continue; // skip self-edges

        // String used as the sequence of the overlapping vertex. It is computed
        // once and reused both for the overlap and for a newly created vertex.
        std::string vertexSeq = iter->getFullString(pX->getSeq().toString());
        Overlap o = iter->toOverlap(pX->getID(), vertexID, pX->getSeqLen(), vertexSeq.length());

/*
#if DEBUGGENERATE
        std::cout << "has overlap to: " << vertexID << " len: " << iter->overlapLen << " flags: " << iter->flags << "\n";
        std::cout << "Overlap string: " << iter->getOverlapString(pX->getSeq().toString()) << "\n";
#endif
*/      
        // Check if a vertex with endVertexID exists in the graph
        Vertex* pVertex = m_pGraph->getVertex(vertexID);
        if(pVertex == NULL)
        {

#if DEBUGGENERATE
            std::cout << "Vertex with ID: " << vertexID << " does not exist, creating\n";
            std::cout << "Vertex sequence: " << vertexSeq << "\n";
#endif
            // Generate the new vertex
            pVertex = new(m_pGraph->getVertexAllocator()) Vertex(vertexID, vertexSeq);
            pVertex->setColor(UNEXPLORED_COLOR);
            m_pGraph->addVertex(pVertex);
        }

        // Construct the found edge
        Edge* pXY = SGAlgorithms::createEdgesFromOverlap(m_pGraph, o, true);

        // If the endpoint vertex is unexplored, queue it
        if(pVertex->getColor() == UNEXPLORED_COLOR)
        {
            // The edge is dereferenced below, make a missing edge fail loudly
            assert(pXY != NULL);

            GraphFrontier node;
            node.pVertex = pVertex;
            node.dir = !pXY->getTwin()->getDir(); // continuation direction
            node.distance = currNode.distance + pXY->getSeqLen();
            if(node.distance <= m_maxDistance)
                queue.push(node);
        }
    }
}
Example #7
0
// Compute the cluster of reads connected by overlaps to the read in item.
//
// Returns a ClusterResult whose clusterNodes hold the members of the cluster.
// The result is empty when the read is already marked in m_pMarkedReads or
// when the atomic update of the bit for the lowest read index did not succeed
// (some other thread has claimed the same set of reads).
//
// The program is terminated with exit(1) if the read is reported to be a
// substring of another read.
ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
    // Calculate the intervals in the forward FM-index for this read
    const BWT* pBWT = m_pOverlapper->getBWT();

    // Check if this read is a substring
    OverlapBlockList tempBlockList;
    OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList);
    if(overlapResult.isSubstring)
    {
        std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
        exit(1);
    }

    // Find the interval in the fm-index containing the read
    std::string readString = item.read.seq.toString();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // The read must be present in the index
    assert(readInterval.isValid());

    // Check if this read has been used yet
    bool used = false;
    for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i)
    {
        if(m_pMarkedReads->test(i))
        {
            used = true;
            break;
        }
    }

    ClusterResult result;
    if(used)
        return result; // already part of a cluster, return nothing

    // Compute a new cluster around this read, starting from a seed node
    // that describes the read itself
    std::set<int64_t> usedIndex;
    ClusterNodeQueue queue;
    ClusterNode seedNode;
    seedNode.sequence = readString;
    seedNode.interval = readInterval;
    seedNode.isReverseInterval = false;
    usedIndex.insert(readInterval.lower);
    queue.push(seedNode);
    while(!queue.empty())
    {
        ClusterNode currNode = queue.front();
        queue.pop();

        // Update the result structure with this node's data
        result.clusterNodes.push_back(currNode);

        // Find the overlaps of the current node. Only the block list is needed,
        // the returned OverlapResult is not used.
        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = currNode.sequence;
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);
        //m_pOverlapper->buildForwardHistory(&blockList);
        
        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            // Check if the reads in this block are part of the cluster already
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;
            if(usedIndex.count(canonicalIndex) == 0)
            {
                usedIndex.insert(canonicalIndex);
                ClusterNode newNode;
                newNode.sequence = iter->getFullString(currNode.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                queue.push(newNode);
            }
        }
    }

    // If some work was performed, update the bitvector so other threads do not try to merge the same set of reads.
    // This uses compare-and-swap instructions to ensure the update is atomic. 
    // If some other thread has merged this set (and updated
    // the bitvector), we discard all the merged data.
    
    // As a given set of reads should all be merged together, we only need to make sure we atomically update
    // the bit for the read with the lowest index in the set.

    // Sort the intervals into ascending order and remove any duplicate intervals (which can occur
    // if the subgraph has a simple cycle)
    std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
    std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(),
                                                            result.clusterNodes.end(),
                                                            ClusterNode::equal);

    size_t oldSize = result.clusterNodes.size();
    result.clusterNodes.erase(newEnd, result.clusterNodes.end());
    size_t newSize = result.clusterNodes.size();
    if(oldSize != newSize)
        std::cout << "Warning: duplicate cluster nodes were found\n";

    // Check if the bit in the vector has already been set for the lowest read index
    // If it has some other thread has already output this set so we do nothing.
    // clusterNodes is never empty here, it contains at least the seed node.
    int64_t lowestIndex = result.clusterNodes.front().interval.lower;
    bool currentValue = m_pMarkedReads->test(lowestIndex);
    bool updateSuccess = false;

    if(currentValue == false)
    {
        // Attempt to update the bit vector with an atomic CAS. If this returns false
        // the bit was set by some other thread
        updateSuccess = m_pMarkedReads->updateCAS(lowestIndex, currentValue, true);
    }

    if(updateSuccess)
    {
        // We successfully atomically set the bit for the first read in this set
        // to true. We can safely update the rest of the bits and keep the merged sequences
        // for output.
        std::vector<ClusterNode>::const_iterator iter = result.clusterNodes.begin();
        for(; iter != result.clusterNodes.end(); ++iter)
        {
            for(int64_t i = iter->interval.lower; i <= iter->interval.upper; ++i)
            {
                if(i == lowestIndex) //already set
                    continue;
                currentValue = m_pMarkedReads->test(i);
                if(currentValue)
                {
                    // This value should not be true, emit a warning
                    std::cout << "Warning: Bit " << i << " was set outside of critical section\n";
                    std::cout << "Read: " << readString << "\n";
                }
                else
                {
                    m_pMarkedReads->updateCAS(i, currentValue, true);
                }
            }
        }
    }
    else
    {
        // Some other thread merged these reads already, discard the intermediate
        // data and return an empty result
        result.clusterNodes.clear();
    }
    return result;
}
Example #8
0
// Extend all the blocks in activeList by one base to the right
// Move all right-terminal blocks to the termainl list. If a block 
// is terminal and potentially contained by another block, add it to 
// containedList
void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, 
                                               OverlapBlockList& activeList, 
                                               OverlapBlockList& terminalList,
                                               OverlapBlockList& /*containedList*/) const
{
    OverlapBlockList::iterator iter = activeList.begin();
    OverlapBlockList::iterator next;
    while(iter != activeList.end())
    {
        next = iter;
        ++next;

        // Check if block is terminal
        AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT);
        if(ext_count.get('$') > 0)
        {
            // Only consider this block to be terminal irreducible if it has at least one extension
            // or else it is a substring block
            if(iter->forwardHistory.size() > 0)
            {
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                terminalList.push_back(branched);
#ifdef DEBUGOVERLAP_2            
                std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n";
#endif
            }
        }

        int curr_extension = iter->forwardHistory.size();

        // Perform the right extensions
        
        // Best case, there is only a single extension character
        // Handle this case specially so we don't need to copy the potentially
        // large OverlapBlock structure and its full history
        if(ext_count.hasUniqueDNAChar())
        {
            // Get the extension character with respect to the queried sequence
            char canonical_base = ext_count.getUniqueDNAChar();

            // Flip the base into the frame of reference for the block
            char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;

            // Update the block using the base in its frame of reference
            BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT));

            // Add the base to the history in the frame of reference of the query read
            // This is so the history is consistent when comparing between blocks from different strands
            iter->forwardHistory.add(curr_extension, canonical_base);
        }
        else
        {
            for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
            {
                char canonical_base = ALPHABET[idx];
                char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
                if(ext_count.get(canonical_base) == 0)
                    continue;

                // Branch the sequence. This involves copying the entire history which can be large
                // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link
                // structure but branches are infrequent enough to not have a large impact
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT));
                assert(branched.ranges.isValid());

                // Add the base in the canonical frame
                branched.forwardHistory.add(curr_extension, canonical_base);

                // Insert the new block after the iterator
                activeList.insert(iter, branched);
            }

            // Remove the original block, which has been superceded by the branches
            activeList.erase(iter);
        }

        iter = next; // this skips the newly-inserted blocks
    }
} 
Example #9
0
// Classify the blocks in activeList as irreducible, transitive or substrings. The irreducible blocks are
// appended to pOBFinal. The remaining are discarded.
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
//
// pBWT, pRevBWT - indices that are passed on to the extension step
// activeList    - blocks to classify; the list is empty when the function returns
// pOBFinal      - output list, blocks are only ever appended to it
void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, 
                                                        OverlapBlockList& activeList, 
                                                        OverlapBlockList* pOBFinal) const
{
    if(activeList.empty())
        return;
    
    // The activeList contains all the blocks that are not yet right terminal
    // Count the extensions in the top level (longest) blocks first
    // The loop stops when no active block is left or when the last terminal
    // block examined in a round left no active block un-eliminated.
    bool all_eliminated = false;
    while(!activeList.empty() && !all_eliminated)
    {
        // The terminalBlock list contains all the blocks that became right-terminal
        // in the current extension round.
        OverlapBlockList terminalList;
        OverlapBlockList potentialContainedList;

        // Perform a single round of extension, any terminal blocks
        // are moved to the terminated list
        // NOTE(review): the definition of extendActiveBlocksRight visible in this file
        // ignores its last parameter, so potentialContainedList appears to stay empty
        // and the loop over it below does nothing - confirm.
        extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList);

        // Compare the blocks in the contained list against the other terminal and active blocks
        // If they are a substring match to any of these, discard them
        OverlapBlockList::iterator containedIter = potentialContainedList.begin();
        for(; containedIter != potentialContainedList.end(); ++containedIter)
        {
           if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && 
              !isBlockSubstring(*containedIter, activeList, m_errorRate))
           {
                // Not a substring, move to terminal list
                terminalList.push_back(*containedIter);
                //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n";
           }
           else
           {
                //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n";
           }
        }

        // Using the terminated blocks, mark as eliminated any active blocks
        // that form a valid overlap to the terminal block. These are transitive edges
        // We do not compare two terminal blocks, we don't consider these overlaps to be
        // transitive
        OverlapBlockList::iterator terminalIter = terminalList.begin();
        for(; terminalIter != terminalList.end(); ++terminalIter)
        {
#ifdef DEBUGOVERLAP
            std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n";
#endif       
            // The flag is reset for every terminal block, so after this loop it only
            // reflects the scan made for the last terminal block of the round. If no
            // block became terminal in this round the flag is left untouched.
            all_eliminated = true;
            OverlapBlockList::iterator activeIter = activeList.begin();
            for(; activeIter != activeList.end(); ++activeIter)
            {
                // Blocks eliminated earlier are skipped without clearing all_eliminated
                if(activeIter->isEliminated)
                    continue; // skip previously marked blocks
                
                // Two conditions must be met for a block to be transitive wrt terminal:
                // 1) It must have a strictly shorter overlap than the terminal block
                // 2) The error rate between the block and terminal must be less than the threshold
                // The error rate is calculated before the length is checked.
                double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter);
                if(activeIter->overlapLen < terminalIter->overlapLen && 
                   isErrorRateAcceptable(inferredErrorRate, m_errorRate))
                {
#ifdef DEBUGOVERLAP_2                            
                    std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n";
#endif
                    // The block is only flagged, it is not removed from activeList here
                    activeIter->isEliminated = true;
                }
                else
                {
                    // At least one active block survives this terminal block
                    all_eliminated = false;
                }
            } 
            
            // Move this block to the final list if it has not been previously marked eliminated
            if(!terminalIter->isEliminated)
            {
#ifdef DEBUGOVERLAP
                std::cout << "[II] Adding block " << *terminalIter << " to final list\n";
                //std::cout << "  extension: " << terminalIter->forwardHistory << "\n";
#endif                
                pOBFinal->push_back(*terminalIter);
            }
        }
    }

    // Whatever is still active at this point is discarded
    activeList.clear();
}
Example #10
0
// Iterate through inList and determine the overlaps that are irreducible. This function is iterative:
// it keeps a worklist of block groups instead of recursing.
// The final overlap blocks corresponding to irreducible overlaps are appended to pOBFinal.
// inList is copied into the first group; the function then only works on that copy.
// The program is terminated with exit(EXIT_FAILURE) if a top-level block of a finished group
// turns out to have no '$' extension (substring read).
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
// Invariant: each block corresponds to the same extension of the root sequence w.
void OverlapAlgorithm::_processIrreducibleBlocksExactIterative(const BWT* pBWT, const BWT* pRevBWT, 
                                                               OverlapBlockList& inList, 
                                                               OverlapBlockList* pOBFinal) const
{
    if(inList.empty())
        return;
    
    // We store the overlap blocks in groups of blocks that have the same right-extension.
    // When a branch is found, the groups are split based on the extension
    typedef std::list<OverlapBlockList> BlockGroups;

    BlockGroups blockGroups;
    blockGroups.push_back(inList);
    // These two counters are only incremented below, their values are never read
    int numExtensions = 0;
    int numBranches = 0;
    while(!blockGroups.empty())
    {
        // Perform one extension round for each group.
        // If the top-level block has ended, push the result
        // to the final list and remove the group from processing
        BlockGroups::iterator groupIter = blockGroups.begin();
        BlockGroups incomingGroups; // Branched blocks are placed here

        while(groupIter != blockGroups.end())
        {
            OverlapBlockList& currList = *groupIter;
            bool bEraseGroup = false;

            // Count the extensions in the top level (longest) blocks first
            // front() assumes that the group is not empty
            int topLen = currList.front().overlapLen;
            AlphaCount64 ext_count;
            OBLIter blockIter = currList.begin();
            while(blockIter != currList.end() && blockIter->overlapLen == topLen)
            {
                ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT);
                ++blockIter;
            }
            // blockIter now refers to the first block that is shorter than the
            // top level (or to the end of the group)
            
            // Three cases:
            // 1) The top level block has ended as it contains the extension $. Output TLB and end.
            // 2) There is a singular unique extension base for all the blocks. Update the blocks and continue.
            // 3) There are multiple extension bases, split the block group and continue.
            // If some block other than the TLB ended, it must be contained within the TLB and it is not output
            // or considered further. 
            // Likewise if multiple distinct strings in the TLB ended, we only output the top one. The rest
            // must have the same sequence as the top one and are hence considered to be contained with the top element.
            if(ext_count.get('$') > 0)
            {
                // An irreducible overlap has been found. It is possible that there are two top level blocks
                // (one in the forward and reverse direction). Since we can't decide which one
                // contains the other at this point, we output hits to both. Under a fixed 
                // length string assumption one will be contained within the other and removed later.
                OBLIter tlbIter = currList.begin();
                while(tlbIter != currList.end() && tlbIter->overlapLen == topLen)
                {
                    // Ensure the tlb is actually terminal and not a substring block
                    // (ext_count is the sum over all top-level blocks, so an individual
                    // block may have no '$' extension)
                    AlphaCount64 test_count = tlbIter->getCanonicalExtCount(pBWT, pRevBWT);
                    if(test_count.get('$') == 0)
                    {
                        std::cerr << "Error: substring read found during overlap computation.\n";
                        std::cerr << "Please run sga rmdup before sga overlap\n";
                        exit(EXIT_FAILURE);
                    }
                    
                    // Perform the final right-update to make the block terminal
                    // The update is applied to a copy, the block in the group is not changed
                    OverlapBlock branched = *tlbIter;
                    BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                    pOBFinal->push_back(branched);
#ifdef DEBUGOVERLAP
                    std::cout << "[IE] TLB of length " << branched.overlapLen << " has ended\n";
                    std::cout << "[IE]\tBlock data: " << branched << "\n";
#endif             
                    ++tlbIter;
                } 

                // Set the flag to erase this group, it is finished
                bEraseGroup = true;
            }
            else
            {
                // Count the extension for the rest of the blocks
                while(blockIter != currList.end())
                {
                    ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT);
                    ++blockIter;
                }

                if(ext_count.hasUniqueDNAChar())
                {
                    // Update all the blocks using the unique extension character
                    // This character is in the canonical representation wrt to the query
                    char b = ext_count.getUniqueDNAChar();
                    updateOverlapBlockRangesRight(pBWT, pRevBWT, currList, b);
                    numExtensions++;
                    // The group stays in the worklist (the flag is already false here)
                    bEraseGroup = false;
                }
                else
                {
                    // Split the group: one copy of the whole group per extension base
                    // that occurs, each copy extended by that base
                    for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
                    {
                        char b = ALPHABET[idx];
                        if(ext_count.get(b) > 0)
                        {
                            numBranches++;
                            OverlapBlockList branched = currList;
                            updateOverlapBlockRangesRight(pBWT, pRevBWT, branched, b);
                            incomingGroups.push_back(branched);
                            // The original group is replaced by its branches
                            bEraseGroup = true;
                        }
                    }
                }
            }

            // erase returns the iterator following the removed group
            if(bEraseGroup)
                groupIter = blockGroups.erase(groupIter);
            else
                ++groupIter;
        }

        // Splice in the newly branched blocks, if any
        // They are processed starting with the next round
        blockGroups.splice(blockGroups.end(), incomingGroups);
    }
}
Example #11
0
// Apply the right extension by '$' to the ranges of every block in
// containedBlocks so that the interval ranges are consistent.
void OverlapAlgorithm::terminateContainedBlocks(OverlapBlockList& containedBlocks) const
{
    OverlapBlockList::iterator blockIter = containedBlocks.begin();
    while(blockIter != containedBlocks.end())
    {
        BWTAlgorithms::updateBothR(blockIter->ranges, '$', blockIter->getExtensionBWT(m_pBWT, m_pRevBWT));
        ++blockIter;
    }
}
Example #12
0
// Find the inexact overlaps of read and write the resulting blocks to pOBOut.
// Four searches are run, two for the suffix of the read and two for its
// prefix. After each pair the working list is either reduced to the
// irreducible blocks (m_bIrreducible) or spliced into pOBOut as is.
// As soon as one search reports failure the remaining steps are skipped,
// pOBOut is cleared and the result is returned with searchAborted set and
// isSubstring cleared.
OverlapResult OverlapAlgorithm::overlapReadInexact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    OverlapBlockList obWorkingList;
    std::string seq = read.seq.toString();

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " suffix\n";
#endif

    // Match the suffix of seq to prefixes.
    // findOverlapBlocksInexact returns false if the search was given up; the
    // && short-circuit makes sure no further search is started after that.
    bool ok = findOverlapBlocksInexact(seq, m_pBWT, m_pRevBWT, sufPreAF, 
                                       minOverlap, &obWorkingList, pOBOut, result);

    ok = ok && findOverlapBlocksInexact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, 
                                        minOverlap, &obWorkingList, pOBOut, result);

    if(ok)
    {
        if(!m_bIrreducible)
        {
            // Keep every block that was found
            pOBOut->splice(pOBOut->end(), obWorkingList);
            assert(obWorkingList.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &obWorkingList, pOBOut);
            obWorkingList.clear();
        }
    }

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " prefix\n";
#endif

    // Match the prefix of seq to suffixes
    ok = ok && findOverlapBlocksInexact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &obWorkingList, pOBOut, result);

    ok = ok && findOverlapBlocksInexact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &obWorkingList, pOBOut, result);

    if(ok)
    {
        if(!m_bIrreducible)
        {
            pOBOut->splice(pOBOut->end(), obWorkingList);
            assert(obWorkingList.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &obWorkingList, pOBOut);
            obWorkingList.clear();
        }

        return result;
    }

    // A search failed: report no overlaps at all
    pOBOut->clear();
    result.isSubstring = false;
    result.searchAborted = true;
    return result;
}
Example #13
0
// Seeded blockwise BWT alignment of prefix-suffix for reads
// Each alignment is given a seed region and a block region
// The seed region is the terminal portion of w where maxDiff + 1 seeds are created
// at least 1 of these seeds must align exactly for there to be an alignment with 
// at most maxDiff differences between the prefix/suffix. Only alignments within the
// range [block_start, block_end] are output. The block_end coordinate is inclusive.
bool OverlapAlgorithm::findOverlapBlocksInexact(const std::string& w, const BWT* pBWT, 
                                                const BWT* pRevBWT, const AlignFlags& af, int minOverlap,
                                                OverlapBlockList* pOverlapList, OverlapBlockList* pContainList, 
                                                OverlapResult& result) const
{
    int len = w.length();
    int overlap_region_left = len - minOverlap;
    SearchSeedVector* pCurrVector = new SearchSeedVector;
    SearchSeedVector* pNextVector = new SearchSeedVector;
    OverlapBlockList workingList;
    SearchSeedVector::iterator iter;

    // Create and extend the initial seeds
    int actual_seed_length = m_seedLength;
    int actual_seed_stride = m_seedStride;

    if(actual_seed_length == 0)
    {
        // Calculate a seed length and stride that will guarantee all overlaps
        // with error rate m_errorRate will be found
        calculateSeedParameters(w, minOverlap, actual_seed_length, actual_seed_stride);
    }

    assert(actual_seed_stride != 0);

    createSearchSeeds(w, pBWT, pRevBWT, actual_seed_length, actual_seed_stride, pCurrVector);
    extendSeedsExactRight(w, pBWT, pRevBWT, ED_RIGHT, pCurrVector, pNextVector);
    pCurrVector->clear();
    pCurrVector->swap(*pNextVector);
    assert(pNextVector->empty());

    int num_steps = 0;

    // Perform the inexact extensions
    bool fail = false;
    while(!pCurrVector->empty())
    {
        if(m_maxSeeds != -1 && (int)pCurrVector->size() > m_maxSeeds)
        {
            fail = true;
            break;
        }

        iter = pCurrVector->begin();
        while(iter != pCurrVector->end())
        {
            SearchSeed& align = *iter;

            // If the current aligned region is right-terminal
            // and the overlap is greater than minOverlap, try to find overlaps
            // or containments
            if(align.right_index == len - 1)
            {
                double align_error = align.calcErrorRate();

                // Check for overlaps
                if(align.left_index <= overlap_region_left && isErrorRateAcceptable(align_error, m_errorRate))
                {
                    int overlapLen = len - align.left_index;
                    BWTIntervalPair probe = align.ranges;
                    BWTAlgorithms::updateBothL(probe, '$', pBWT);
                    
                    // The probe interval contains the range of proper prefixes
                    if(probe.interval[1].isValid())
                    {
                        assert(probe.interval[1].lower > 0);
                        OverlapBlock nBlock(probe, align.ranges, overlapLen, align.z, af, align.historyLink->getHistoryVector());
                        workingList.push_back(nBlock);
                    }
                }

                // Check for containments
                // If the seed is left-terminal and there are [ACGT] left/right extensions of the sequence
                // this read must be a substring of another read
                if(align.left_index == 0)
                {
                    AlphaCount64 left_ext = BWTAlgorithms::getExtCount(align.ranges.interval[0], pBWT);
                    AlphaCount64 right_ext = BWTAlgorithms::getExtCount(align.ranges.interval[1], pRevBWT);
                    if(left_ext.hasDNAChar() || right_ext.hasDNAChar())
                        result.isSubstring = true;
                }
            }

            // Extend the seed to the right/left
            if(align.dir == ED_RIGHT)
                extendSeedInexactRight(align, w, pBWT, pRevBWT, pNextVector);
            else
                extendSeedInexactLeft(align, w, pBWT, pRevBWT, pNextVector);
            ++iter;
            //pCurrVector->erase(iter++);
        }
        pCurrVector->clear();
        assert(pCurrVector->empty());
        pCurrVector->swap(*pNextVector);

        // Remove identical seeds after we have performed seed_len steps
        // as there now might be redundant seeds
        if(num_steps % actual_seed_stride == 0)
        {
            std::sort(pCurrVector->begin(), pCurrVector->end(), SearchSeed::compareLeftRange);
            SearchSeedVector::iterator end_iter = std::unique(pCurrVector->begin(), pCurrVector->end(), 
                                                                   SearchSeed::equalLeftRange);
            pCurrVector->resize(end_iter - pCurrVector->begin());
        }
        ++num_steps;
    }

    if(!fail)
    {
        // parse the working list to remove any submaximal overlap blocks
        // these blocks correspond to reads that have multiple valid overlaps. 
        // we only keep the longest
        removeSubMaximalBlocks(&workingList, pBWT, pRevBWT);

        OverlapBlockList containedWorkingList;
        partitionBlockList(len, &workingList, pOverlapList, &containedWorkingList);
        
        // Terminate the contained blocks
        terminateContainedBlocks(containedWorkingList);
        
        // Move the contained blocks to the final contained list
        pContainList->splice(pContainList->end(), containedWorkingList);
    }

    delete pCurrVector;
    delete pNextVector;
    return !fail;
}
// Example #14
// 0
// Construct the set of blocks describing the exact overlaps with READ
// and append the blocks to pOBOut.
//
// Parameters:
//   read       - the read to search for; its sequence is taken from read.seq
//   minOverlap - minimum overlap length, forwarded to findOverlapBlocksExact
//   pOBOut     - output list. The containment blocks are appended first, then the
//                suffix overlap blocks, then the prefix overlap blocks. When
//                m_bIrreducible is set the suffix/prefix blocks are passed through
//                computeIrreducibleBlocks instead of being appended directly.
//
// Returns the OverlapResult that was passed to every findOverlapBlocksExact call.
OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    std::string seq = read.seq.toString();
    const std::string::size_type seqLen = seq.length();

    // We store the various overlap blocks using a number of lists, one for the containments
    // in the forward and reverse index and one for each set of overlap blocks
    OverlapBlockList oblFwdContain;
    OverlapBlockList oblRevContain;
    
    OverlapBlockList oblSuffixFwd;
    OverlapBlockList oblSuffixRev;
    OverlapBlockList oblPrefixFwd;
    OverlapBlockList oblPrefixRev;

    // Match the suffix of seq to prefixes
    findOverlapBlocksExact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &oblSuffixFwd, &oblFwdContain, result);

    // The complement and reverse-complement searches are skipped when m_noReverse is set
    if(!m_noReverse)
    {
        // Match the suffix of seq to prefixes (complement)
        findOverlapBlocksExact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &oblSuffixRev, &oblRevContain, result);

        // Match the prefix of seq to suffixes (reverse complement)
        findOverlapBlocksExact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &oblPrefixFwd, &oblFwdContain, result);
    }

    // Match the prefix of seq to suffixes
    findOverlapBlocksExact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &oblPrefixRev, &oblRevContain, result);

    // Remove submaximal blocks for each block list including fully contained blocks
    // Copy (not move) the containment blocks into the prefix/suffix lists; the
    // original containment lists are still needed for the output below
    oblSuffixFwd.insert(oblSuffixFwd.end(), oblFwdContain.begin(), oblFwdContain.end());
    oblPrefixFwd.insert(oblPrefixFwd.end(), oblFwdContain.begin(), oblFwdContain.end());
    oblSuffixRev.insert(oblSuffixRev.end(), oblRevContain.begin(), oblRevContain.end());
    oblPrefixRev.insert(oblPrefixRev.end(), oblRevContain.begin(), oblRevContain.end());
    
    // Perform the submaximal filter
    removeSubMaximalBlocks(&oblSuffixFwd, m_pBWT, m_pRevBWT);
    removeSubMaximalBlocks(&oblPrefixFwd, m_pBWT, m_pRevBWT);
    removeSubMaximalBlocks(&oblSuffixRev, m_pRevBWT, m_pBWT);
    removeSubMaximalBlocks(&oblPrefixRev, m_pRevBWT, m_pBWT);
    
    // Remove the contain blocks from the suffix/prefix lists
    removeContainmentBlocks(seqLen, &oblSuffixFwd);
    removeContainmentBlocks(seqLen, &oblPrefixFwd);
    removeContainmentBlocks(seqLen, &oblSuffixRev);
    removeContainmentBlocks(seqLen, &oblPrefixRev);

    // Join the suffix and prefix lists
    oblSuffixFwd.splice(oblSuffixFwd.end(), oblSuffixRev);
    oblPrefixFwd.splice(oblPrefixFwd.end(), oblPrefixRev);

    // Move the containments to the output list
    pOBOut->splice(pOBOut->end(), oblFwdContain);
    pOBOut->splice(pOBOut->end(), oblRevContain);

    // Filter out transitive overlap blocks if requested
    if(m_bIrreducible)
    {
        computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblSuffixFwd, pOBOut);
        computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblPrefixFwd, pOBOut);
    }
    else
    {
        pOBOut->splice(pOBOut->end(), oblSuffixFwd);
        pOBOut->splice(pOBOut->end(), oblPrefixFwd);
    }

    return result;
}