Esempio n. 1
0
// Run the cluster process. If the number of total nodes
// exceeds max, abort the search.
void ReadCluster::run(size_t max)
{
    while(!m_queue.empty())
    {
        if(m_queue.size() + m_outCluster.size() > max)
        {
            while(!m_queue.empty())
                m_queue.pop();
            m_outCluster.clear();
            return;
        }

        ClusterNode node = m_queue.front();
        m_queue.pop();

        // Add this node to the output
        m_outCluster.push_back(node);

        // Find overlaps for the current node
        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = node.sequence;
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);
        
        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            // Check if the reads in this block are part of the cluster already
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;
            if(m_usedIndex.count(canonicalIndex) == 0)
            {
                // This is a new node that isn't in the cluster. Add it.
                m_usedIndex.insert(canonicalIndex);

                ClusterNode newNode;
                newNode.sequence = iter->getFullString(node.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                m_queue.push(newNode);
            }
        }
    }
}
Esempio n. 2
0
// Build the cluster of reads connected to item.read by overlaps of at least
// m_minOverlap and try to claim it for the calling thread.
//
// Returns a ClusterResult whose clusterNodes holds the sorted, de-duplicated
// members of the cluster. clusterNodes is empty when
//   - some index in the read's interval is already set in m_pMarkedReads, or
//   - the bit for the cluster's lowest interval index was already set, or the
//     compare-and-swap on it failed.
//
// Terminates the program with exit(1) if the overlapper reports that the read
// is a substring.
ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
    const BWT* pBWT = m_pOverlapper->getBWT();

    // Check if this read is a substring
    OverlapBlockList tempBlockList;
    OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList);
    if(overlapResult.isSubstring)
    {
        std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
        exit(1);
    }

    // Find the interval in the fm-index containing the read
    std::string readString = item.read.seq.toString();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // The read must be present in the index
    assert(readInterval.isValid());

    ClusterResult result;

    // If any index of the read's interval is already marked, the read belongs
    // to a cluster that has been handled; return the empty result.
    for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i)
    {
        if(m_pMarkedReads->test(i))
            return result;
    }

    // Seed a breadth-first expansion with the read itself. usedIndex records the
    // canonical interval lower bound of every node that has been queued.
    std::set<int64_t> usedIndex;
    ClusterNodeQueue queue;

    ClusterNode seed;
    seed.sequence = readString;
    seed.interval = readInterval;
    seed.isReverseInterval = false;
    usedIndex.insert(readInterval.lower);
    queue.push(seed);

    while(!queue.empty())
    {
        ClusterNode current = queue.front();
        queue.pop();

        // Record this node as a member of the cluster
        result.clusterNodes.push_back(current);

        // Find the reads overlapping the current node. Only the block list is
        // needed, so the returned OverlapResult is intentionally discarded.
        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = current.sequence;
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);

        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;

            // insert() reports whether the index was new, which saves a separate lookup
            if(!usedIndex.insert(canonicalIndex).second)
                continue; // already part of the cluster

            ClusterNode newNode;
            newNode.sequence = iter->getFullString(current.sequence);
            newNode.interval = canonicalInterval;
            newNode.isReverseInterval = iter->flags.isTargetRev();
            queue.push(newNode);
        }
    }

    // If some work was performed, update the bitvector so other threads do not try to merge the same set of reads.
    // This uses compare-and-swap instructions to ensure the update is atomic.
    // If some other thread has merged this set (and updated
    // the bitvector), we discard all the merged data.

    // As a given set of reads should all be merged together, we only need to make sure we atomically update
    // the bit for the read with the lowest index in the set.

    // Sort the intervals into ascending order and remove any duplicate intervals (which can occur
    // if the subgraph has a simple cycle)
    std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
    std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(),
                                                            result.clusterNodes.end(),
                                                            ClusterNode::equal);

    size_t oldSize = result.clusterNodes.size();
    result.clusterNodes.erase(newEnd, result.clusterNodes.end());
    size_t newSize = result.clusterNodes.size();
    if(oldSize != newSize)
        std::cout << "Warning: duplicate cluster nodes were found\n";

    // Check if the bit in the vector has already been set for the lowest read index.
    // If it has, some other thread has already output this set so we do nothing.
    // clusterNodes always contains at least the seed node, so front() is valid here.
    int64_t lowestIndex = result.clusterNodes.front().interval.lower;
    bool currentValue = m_pMarkedReads->test(lowestIndex);
    bool updateSuccess = false;

    if(!currentValue)
    {
        // Attempt to update the bit vector with an atomic CAS. If this returns false
        // the bit was set by some other thread
        updateSuccess = m_pMarkedReads->updateCAS(lowestIndex, currentValue, true);
    }

    if(!updateSuccess)
    {
        // Some other thread merged these reads already, discard the intermediate
        // data and return an empty result
        result.clusterNodes.clear();
        return result;
    }

    // We successfully atomically set the bit for the first read in this set
    // to true. We can safely update the rest of the bits and keep the merged sequences
    // for output.
    std::vector<ClusterNode>::const_iterator member = result.clusterNodes.begin();
    for(; member != result.clusterNodes.end(); ++member)
    {
        for(int64_t i = member->interval.lower; i <= member->interval.upper; ++i)
        {
            if(i == lowestIndex) // already set
                continue;

            currentValue = m_pMarkedReads->test(i);
            if(currentValue)
            {
                // This value should not be true, emit a warning
                std::cout << "Warning: Bit " << i << " was set outside of critical section\n";
                std::cout << "Read: " << readString << "\n";
            }
            else
            {
                m_pMarkedReads->updateCAS(i, currentValue, true);
            }
        }
    }
    return result;
}