// Return the graph vertex that represents the sequence in record, creating
// it and adding it to m_pGraph if it is not present yet.
//
// The vertex ID is derived from the FM-index: a full-length duplicate search
// is run for the read and the block that matches it exactly (no differences,
// query not reversed) supplies the canonical ID.
//
// Precondition: m_pGraph is set and the read is present in the index, so
// that an exact forward match exists in the returned block list.
Vertex* StringGraphGenerator::addTerminalVertex(const SeqRecord& record)
{
    assert(m_pGraph != NULL);

    // Perform a full-length search for the sequence in the FM-index
    OverlapBlockList endBlockList;
    m_pOverlapper->alignReadDuplicate(record, &endBlockList);

    // Search the block list for the exact match to the end read. This must exist.
    // The iterator has to be advanced on every miss; otherwise the search would
    // never terminate when the first block is not the exact forward match.
    OverlapBlockList::iterator matchIter = endBlockList.begin();
    for(; matchIter != endBlockList.end(); ++matchIter)
    {
        if(matchIter->numDiff == 0 && !matchIter->flags.isQueryRev())
            break; // this block corresponds to the actual sequence of endRead
    }
    assert(matchIter != endBlockList.end());

    // Construct the canonical ID from the matching interval
    std::string endID = matchIter->toCanonicalID();

    // Reuse the vertex if the graph already has one with this ID
    Vertex* pVertex = m_pGraph->getVertex(endID);
    if(pVertex == NULL)
    {
        pVertex = new(m_pGraph->getVertexAllocator()) Vertex(endID, record.seq.toString());
        m_pGraph->addVertex(pVertex);
    }
    return pVertex;
}
// Example #2
// 0
// Run the cluster process. If the number of total nodes
// exceeds max, abort the search.
void ReadCluster::run(size_t max)
{
    while(!m_queue.empty())
    {
        if(m_queue.size() + m_outCluster.size() > max)
        {
            while(!m_queue.empty())
                m_queue.pop();
            m_outCluster.clear();
            return;
        }

        ClusterNode node = m_queue.front();
        m_queue.pop();

        // Add this node to the output
        m_outCluster.push_back(node);

        // Find overlaps for the current node
        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = node.sequence;
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);
        
        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            // Check if the reads in this block are part of the cluster already
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;
            if(m_usedIndex.count(canonicalIndex) == 0)
            {
                // This is a new node that isn't in the cluster. Add it.
                m_usedIndex.insert(canonicalIndex);

                ClusterNode newNode;
                newNode.sequence = iter->getFullString(node.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                m_queue.push(newNode);
            }
        }
    }
}
// Decide whether terminalBlock is a substring of some block in blockList.
// A candidate with the same overlap length and the same number of right
// extensions as terminalBlock is skipped, since an equally long block cannot
// contain it. For every other candidate the error rate between the two
// blocks is computed; the first one accepted under maxER yields true.
bool OverlapAlgorithm::isBlockSubstring(OverlapBlock& terminalBlock, const OverlapBlockList& blockList, double maxER) const
{
    const size_t terminalExtLen = terminalBlock.forwardHistory.size();

    for(OverlapBlockList::const_iterator candidate = blockList.begin(); candidate != blockList.end(); ++candidate)
    {
        // Same overlap length and same extension length: cannot be a substring
        const bool sameExtent = terminalBlock.overlapLen == candidate->overlapLen &&
                                terminalExtLen == candidate->forwardHistory.size();

        if(!sameExtent && isErrorRateAcceptable(calculateBlockErrorRate(terminalBlock, *candidate), maxER))
            return true;
    }

    // No candidate matched within the allowed error rate
    return false;
}
// Update the overlap block list with a righthand extension to b, removing ranges that become invalid
void OverlapAlgorithm::updateOverlapBlockRangesRight(const BWT* pBWT, const BWT* pRevBWT, 
                                                     OverlapBlockList& obList, char canonical_base) const
{
    OverlapBlockList::iterator iter = obList.begin(); 
    while(iter != obList.end())
    {
        char relative_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
        BWTAlgorithms::updateBothR(iter->ranges, relative_base, iter->getExtensionBWT(pBWT, pRevBWT));
        // remove the block from the list if its no longer valid
        if(!iter->ranges.isValid())
        {
            iter = obList.erase(iter);
        }
        else
        {
            // Add the base to the extension history
            int currExtension = iter->forwardHistory.size();
            iter->forwardHistory.add(currExtension, canonical_base);
            ++iter;
        }
    }
}
// Add the overlaps in blockList that leave currNode in its search direction
// to the graph, and enqueue the endpoints that still need to be explored.
//
// For every block whose edge direction equals currNode.dir (self-overlaps are
// skipped):
//   - the endpoint vertex is looked up by canonical ID and created, coloured
//     UNEXPLORED_COLOR, if the graph does not have it yet;
//   - the edge(s) described by the overlap are created in the graph;
//   - if the endpoint is still UNEXPLORED_COLOR, a frontier node for it is
//     pushed onto queue, provided its accumulated distance does not exceed
//     m_maxDistance.
//
// Containment blocks are not partitioned out here; every block in blockList
// that passes the direction and self-edge filters is processed.
void StringGraphGenerator::updateGraphAndQueue(GraphFrontier& currNode, FrontierQueue& queue, OverlapBlockList& blockList)
{
    Vertex* pX = currNode.pVertex;

    // Process the overlap blocks, adding new vertices and edges where necessary
    for(OverlapBlockList::iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
    {
        if(iter->getEdgeDir() != currNode.dir)
            continue;

        std::string vertexID = iter->toCanonicalID();
        if(vertexID == pX->getID())
            continue; // skip self-edges

        // Reconstruct the full sequence of the overlapping read once; it is needed
        // both for the overlap coordinates and for a newly created vertex.
        std::string vertexSeq = iter->getFullString(pX->getSeq().toString());
        Overlap o = iter->toOverlap(pX->getID(), vertexID, pX->getSeqLen(), vertexSeq.length());

        // Check if a vertex with vertexID exists in the graph
        Vertex* pVertex = m_pGraph->getVertex(vertexID);
        if(pVertex == NULL)
        {

#if DEBUGGENERATE
            std::cout << "Vertex with ID: " << vertexID << " does not exist, creating\n";
            std::cout << "Vertex sequence: " << vertexSeq << "\n";
#endif
            // Generate the new vertex
            pVertex = new(m_pGraph->getVertexAllocator()) Vertex(vertexID, vertexSeq);
            pVertex->setColor(UNEXPLORED_COLOR);
            m_pGraph->addVertex(pVertex);
        }

        // Construct the found edge
        Edge* pXY = SGAlgorithms::createEdgesFromOverlap(m_pGraph, o, true);

        // If the endpoint vertex is unexplored, queue it
        if(pVertex->getColor() == UNEXPLORED_COLOR)
        {
            GraphFrontier node;
            node.pVertex = pVertex;
            node.dir = !pXY->getTwin()->getDir(); // continuation direction
            node.distance = currNode.distance + pXY->getSeqLen();
            if(node.distance <= m_maxDistance)
                queue.push(node);
        }
    }
}
// Example #6
// 0
// Compute the cluster of reads connected to item.read through overlaps.
//
// Returns a ClusterResult whose clusterNodes holds the members of the cluster,
// or an empty result when the read already belongs to a cluster or when
// another thread claimed the cluster first.
//
// The process terminates with exit(1) if the read is reported to be a
// substring of another read.
ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
    // Calculate the intervals in the forward FM-index for this read
    const BWT* pBWT = m_pOverlapper->getBWT();

    // Check if this read is a substring
    OverlapBlockList tempBlockList;
    OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList);
    if(overlapResult.isSubstring)
    {
        std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
        exit(1);
    }

    // Find the interval in the fm-index containing the read
    std::string readString = item.read.seq.toString();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // The read must be present in the index
    assert(readInterval.isValid());

    // Check if this read has been used yet
    bool used = false;
    for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i)
    {
        if(m_pMarkedReads->test(i))
        {
            used = true;
            break;
        }
    }

    ClusterResult result;
    if(used)
        return result; // already part of a cluster, return nothing

    // Compute a new cluster around this read with a breadth-first search.
    // usedIndex holds the lower canonical index of every node seen so far.
    std::set<int64_t> usedIndex;
    ClusterNodeQueue queue;

    ClusterNode seedNode;
    seedNode.sequence = readString;
    seedNode.interval = readInterval;
    seedNode.isReverseInterval = false;
    usedIndex.insert(readInterval.lower);
    queue.push(seedNode);

    while(!queue.empty())
    {
        ClusterNode currNode = queue.front();
        queue.pop();

        // Record this node as a member of the cluster
        result.clusterNodes.push_back(currNode);

        // Find the overlaps for the current node
        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = currNode.sequence;
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);

        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            BWTInterval canonicalInterval = iter->getCanonicalInterval();

            // insert() reports through .second whether the index was new, which
            // tells us if the reads in this block are already part of the cluster
            if(usedIndex.insert(canonicalInterval.lower).second)
            {
                ClusterNode newNode;
                newNode.sequence = iter->getFullString(currNode.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                queue.push(newNode);
            }
        }
    }

    // If some work was performed, update the bitvector so other threads do not try to merge the same set of reads.
    // This uses compare-and-swap instructions to ensure the update is atomic.
    // If some other thread has merged this set (and updated
    // the bitvector), we discard all the merged data.

    // As a given set of reads should all be merged together, we only need to make sure we atomically update
    // the bit for the read with the lowest index in the set.

    // Sort the intervals into ascending order and remove any duplicate intervals (which can occur
    // if the subgraph has a simple cycle)
    std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
    std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(),
                                                            result.clusterNodes.end(),
                                                            ClusterNode::equal);

    size_t oldSize = result.clusterNodes.size();
    result.clusterNodes.erase(newEnd, result.clusterNodes.end());
    size_t newSize = result.clusterNodes.size();
    if(oldSize != newSize)
        std::cout << "Warning: duplicate cluster nodes were found\n";

    // Check if the bit in the vector has already been set for the lowest read index
    // If it has some other thread has already output this set so we do nothing.
    // clusterNodes is never empty here: it always contains at least the seed node.
    int64_t lowestIndex = result.clusterNodes.front().interval.lower;
    bool currentValue = m_pMarkedReads->test(lowestIndex);
    bool updateSuccess = false;

    if(currentValue == false)
    {
        // Attempt to update the bit vector with an atomic CAS. If this returns false
        // the bit was set by some other thread
        updateSuccess = m_pMarkedReads->updateCAS(lowestIndex, currentValue, true);
    }

    if(updateSuccess)
    {
        // We successfully atomically set the bit for the first read in this set
        // to true. We can safely update the rest of the bits and keep the merged sequences
        // for output.
        std::vector<ClusterNode>::const_iterator iter = result.clusterNodes.begin();
        for(; iter != result.clusterNodes.end(); ++iter)
        {
            for(int64_t i = iter->interval.lower; i <= iter->interval.upper; ++i)
            {
                if(i == lowestIndex) //already set
                    continue;
                currentValue = m_pMarkedReads->test(i);
                if(currentValue)
                {
                    // This value should not be true, emit a warning
                    std::cout << "Warning: Bit " << i << " was set outside of critical section\n";
                    std::cout << "Read: " << readString << "\n";
                }
                else
                {
                    m_pMarkedReads->updateCAS(i, currentValue, true);
                }
            }
        }
    }
    else
    {
        // Some other thread merged these reads already, discard the intermediate
        // data and return an empty cluster
        result.clusterNodes.clear();
    }
    return result;
}
// Extend all the blocks in activeList by one base to the right.
// For every block that is right-terminal (it has a '$' extension) and already
// carries at least one forward extension, a copy extended by '$' is appended
// to terminalList. The last parameter (containedList) is unnamed and unused
// by this implementation.
//
// After the call, activeList holds one block per possible DNA extension of each
// input block; blocks that have no DNA extension at all are removed from it.
void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, 
                                               OverlapBlockList& activeList, 
                                               OverlapBlockList& terminalList,
                                               OverlapBlockList& /*containedList*/) const
{
    OverlapBlockList::iterator iter = activeList.begin();
    OverlapBlockList::iterator next;
    while(iter != activeList.end())
    {
        // Remember the successor up front: the current element may be erased below
        next = iter;
        ++next;

        // Check if block is terminal
        AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT);
        if(ext_count.get('$') > 0)
        {
            // Only consider this block to be terminal irreducible if it has at least one extension
            // or else it is a substring block
            if(iter->forwardHistory.size() > 0)
            {
                // Work on a copy so the original block can still be extended by DNA bases below
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                terminalList.push_back(branched);
#ifdef DEBUGOVERLAP_2            
                std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n";
#endif
            }
        }

        // Position in the forward history at which the new base is recorded
        int curr_extension = iter->forwardHistory.size();

        // Perform the right extensions
        
        // Best case, there is only a single extension character
        // Handle this case specially so we don't need to copy the potentially
        // large OverlapBlock structure and its full history
        if(ext_count.hasUniqueDNAChar())
        {
            // Get the extension character with respect to the queried sequence
            char canonical_base = ext_count.getUniqueDNAChar();

            // Flip the base into the frame of reference for the block
            char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;

            // Update the block using the base in its frame of reference
            BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT));

            // Add the base to the history in the frame of reference of the query read
            // This is so the history is consistent when comparing between blocks from different strands
            iter->forwardHistory.add(curr_extension, canonical_base);
        }
        else
        {
            // Zero or several DNA extensions: create one branch per base that occurs.
            // With zero extensions nothing is inserted and the block is simply dropped.
            for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
            {
                char canonical_base = ALPHABET[idx];
                char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
                if(ext_count.get(canonical_base) == 0)
                    continue;

                // Branch the sequence. This involves copying the entire history which can be large
                // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link
                // structure but branches are infrequent enough to not have a large impact
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT));
                assert(branched.ranges.isValid());

                // Add the base in the canonical frame
                branched.forwardHistory.add(curr_extension, canonical_base);

                // Insert the new block immediately before the current one, so it
                // lies behind the saved 'next' position and is not visited this round
                activeList.insert(iter, branched);
            }

            // Remove the original block, which has been superseded by the branches
            activeList.erase(iter);
        }

        iter = next; // this skips the newly-inserted blocks
    }
} 
// Classify the blocks in activeList as irreducible, transitive or substrings. The irreducible blocks are
// put into pOBFinal. The remaining are discarded.
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
//
// The blocks are extended to the right one base per round. A round produces a list of blocks
// that became right-terminal; each of these marks as eliminated every active block that is
// strictly shorter and within the error-rate threshold, and is itself copied to pOBFinal unless
// it was marked eliminated in an earlier round. The search stops when activeList is empty or
// every remaining active block is eliminated. activeList is always empty on return.
void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, 
                                                        OverlapBlockList& activeList, 
                                                        OverlapBlockList* pOBFinal) const
{
    if(activeList.empty())
        return;
    
    // The activeList contains all the blocks that are not yet right terminal
    // Count the extensions in the top level (longest) blocks first
    bool all_eliminated = false;
    while(!activeList.empty() && !all_eliminated)
    {
        // The terminalBlock list contains all the blocks that became right-terminal
        // in the current extension round.
        OverlapBlockList terminalList;
        OverlapBlockList potentialContainedList;

        // Perform a single round of extension, any terminal blocks
        // are moved to the terminated list
        extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList);

        // Compare the blocks in the contained list against the other terminal and active blocks
        // If they are a substring match to any of these, discard them
        // NOTE(review): extendActiveBlocksRight leaves its contained-list argument unnamed and
        // unused, so potentialContainedList appears to stay empty and this loop does nothing - confirm
        OverlapBlockList::iterator containedIter = potentialContainedList.begin();
        for(; containedIter != potentialContainedList.end(); ++containedIter)
        {
           if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && 
              !isBlockSubstring(*containedIter, activeList, m_errorRate))
           {
                // Not a substring, move to terminal list
                terminalList.push_back(*containedIter);
                //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n";
           }
           else
           {
                //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n";
           }
        }

        // Using the terminated blocks, mark as eliminated any active blocks
        // that form a valid overlap to the terminal block. These are transitive edges
        // We do not compare two terminal blocks, we don't consider these overlaps to be
        // transitive
        OverlapBlockList::iterator terminalIter = terminalList.begin();
        for(; terminalIter != terminalList.end(); ++terminalIter)
        {
#ifdef DEBUGOVERLAP
            std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n";
#endif       
            // The flag is recomputed for every terminal block. Previously eliminated blocks are
            // skipped without touching it, so after the last terminal block it is true only if
            // no active block remains un-eliminated. If terminalList is empty the flag keeps
            // its previous value (false).
            all_eliminated = true;
            OverlapBlockList::iterator activeIter = activeList.begin();
            for(; activeIter != activeList.end(); ++activeIter)
            {
                if(activeIter->isEliminated)
                    continue; // skip previously marked blocks
                
                // Two conditions must be met for a block to be transitive wrt terminal:
                // 1) It must have a strictly shorter overlap than the terminal block
                // 2) The error rate between the block and terminal must be less than the threshold
                double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter);
                if(activeIter->overlapLen < terminalIter->overlapLen && 
                   isErrorRateAcceptable(inferredErrorRate, m_errorRate))
                {
#ifdef DEBUGOVERLAP_2                            
                    std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n";
#endif
                    activeIter->isEliminated = true;
                }
                else
                {
                    // At least one active block survives, another extension round is needed
                    all_eliminated = false;
                }
            } 
            
            // Move this block to the final list if it has not been previously marked eliminated
            // (the terminal block is a copy of an active block, so it carries that block's flag)
            if(!terminalIter->isEliminated)
            {
#ifdef DEBUGOVERLAP
                std::cout << "[II] Adding block " << *terminalIter << " to final list\n";
                //std::cout << "  extension: " << terminalIter->forwardHistory << "\n";
#endif                
                pOBFinal->push_back(*terminalIter);
            }
        }
    }

    // Whatever is left is either eliminated or exhausted; discard it
    activeList.clear();
}
// Apply the final right extension by '$' to every block in containedBlocks,
// using the member BWT pair, so that the interval ranges of the contained
// blocks are consistent.
void OverlapAlgorithm::terminateContainedBlocks(OverlapBlockList& containedBlocks) const
{
    OverlapBlockList::iterator blockIter = containedBlocks.begin();
    while(blockIter != containedBlocks.end())
    {
        BWTAlgorithms::updateBothR(blockIter->ranges, '$', blockIter->getExtensionBWT(m_pBWT, m_pRevBWT));
        ++blockIter;
    }
}
// Construct the set of blocks describing the exact overlaps with READ
// and write the blocks to pOBOut.
//
// read       - the query read
// minOverlap - minimum overlap length passed to the block search
// pOBOut     - output list; containment blocks are appended first, followed by
//              the overlap blocks. When m_bIrreducible is set only the blocks
//              kept by computeIrreducibleBlocks are appended, otherwise all
//              overlap blocks are.
//
// Returns the OverlapResult filled in by the findOverlapBlocksExact calls.
// The reverse-complement searches are skipped when m_noReverse is set.
OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    std::string seq = read.seq.toString();

    // We store the various overlap blocks using a number of lists, one for the containments
    // in the forward and reverse index and one for each set of overlap blocks
    OverlapBlockList oblFwdContain;
    OverlapBlockList oblRevContain;

    OverlapBlockList oblSuffixFwd;
    OverlapBlockList oblSuffixRev;
    OverlapBlockList oblPrefixFwd;
    OverlapBlockList oblPrefixRev;

    // Match the suffix of seq to prefixes
    findOverlapBlocksExact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &oblSuffixFwd, &oblFwdContain, result);
    if(!m_noReverse)
    {
        findOverlapBlocksExact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &oblSuffixRev, &oblRevContain, result);
    }

    // Match the prefix of seq to suffixes
    if(!m_noReverse)
    {
        findOverlapBlocksExact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &oblPrefixFwd, &oblFwdContain, result);
    }
    findOverlapBlocksExact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &oblPrefixRev, &oblRevContain, result);

    // Remove submaximal blocks for each block list including fully contained blocks
    // Copy the containment blocks into the prefix/suffix lists so the filter sees them
    oblSuffixFwd.insert(oblSuffixFwd.end(), oblFwdContain.begin(), oblFwdContain.end());
    oblPrefixFwd.insert(oblPrefixFwd.end(), oblFwdContain.begin(), oblFwdContain.end());
    oblSuffixRev.insert(oblSuffixRev.end(), oblRevContain.begin(), oblRevContain.end());
    oblPrefixRev.insert(oblPrefixRev.end(), oblRevContain.begin(), oblRevContain.end());

    // Perform the submaximal filter
    removeSubMaximalBlocks(&oblSuffixFwd, m_pBWT, m_pRevBWT);
    removeSubMaximalBlocks(&oblPrefixFwd, m_pBWT, m_pRevBWT);
    removeSubMaximalBlocks(&oblSuffixRev, m_pRevBWT, m_pBWT);
    removeSubMaximalBlocks(&oblPrefixRev, m_pRevBWT, m_pBWT);

    // Remove the contain blocks from the suffix/prefix lists again
    removeContainmentBlocks(seq.length(), &oblSuffixFwd);
    removeContainmentBlocks(seq.length(), &oblPrefixFwd);
    removeContainmentBlocks(seq.length(), &oblSuffixRev);
    removeContainmentBlocks(seq.length(), &oblPrefixRev);

    // Join the suffix and prefix lists (the reverse lists are emptied by the splice)
    oblSuffixFwd.splice(oblSuffixFwd.end(), oblSuffixRev);
    oblPrefixFwd.splice(oblPrefixFwd.end(), oblPrefixRev);

    // Move the containments to the output list
    pOBOut->splice(pOBOut->end(), oblFwdContain);
    pOBOut->splice(pOBOut->end(), oblRevContain);

    // Filter out transitive overlap blocks if requested
    if(m_bIrreducible)
    {
        computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblSuffixFwd, pOBOut);
        computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblPrefixFwd, pOBOut);
    }
    else
    {
        pOBOut->splice(pOBOut->end(), oblSuffixFwd);
        pOBOut->splice(pOBOut->end(), oblPrefixFwd);
    }

    return result;
}