// Return the graph vertex representing the terminal read `record`, creating
// and adding it to the graph if no vertex with its canonical ID exists yet.
//
// record: the read to look up in the FM-index.
// Returns a pointer to the (possibly pre-existing) vertex held by m_pGraph.
// Asserts that the graph exists and that the index contains an exact,
// forward-strand match for the read.
Vertex* StringGraphGenerator::addTerminalVertex(const SeqRecord& record)
{
    assert(m_pGraph != NULL);

    // Perform a full-length search for the sequence in the FM-index.
    // The vertex ID is derived from the block that matches the read exactly.
    OverlapBlockList endBlockList;
    m_pOverlapper->alignReadDuplicate(record, &endBlockList);

    // Search the block list for the exact match to the end read. This must exist.
    // The iterator has to advance on every miss; without the increment the
    // search never terminates unless the first block happens to be the match.
    OverlapBlockList::iterator matchIter = endBlockList.begin();
    for(; matchIter != endBlockList.end(); ++matchIter)
    {
        // A block with no differences on the forward strand of the query
        // corresponds to the actual sequence of the read
        if(matchIter->numDiff == 0 && !matchIter->flags.isQueryRev())
            break;
    }
    assert(matchIter != endBlockList.end());

    // Construct the canonical ID from the matching interval
    std::string endID = matchIter->toCanonicalID();

    // Reuse the vertex if it is already in the graph, otherwise create it
    Vertex* pVertex = m_pGraph->getVertex(endID);
    if(pVertex == NULL)
    {
        pVertex = new(m_pGraph->getVertexAllocator()) Vertex(endID, record.seq.toString());
        m_pGraph->addVertex(pVertex);
    }
    return pVertex;
}
// Run the cluster process. If the number of total nodes // exceeds max, abort the search. void ReadCluster::run(size_t max) { while(!m_queue.empty()) { if(m_queue.size() + m_outCluster.size() > max) { while(!m_queue.empty()) m_queue.pop(); m_outCluster.clear(); return; } ClusterNode node = m_queue.front(); m_queue.pop(); // Add this node to the output m_outCluster.push_back(node); // Find overlaps for the current node SeqRecord tempRecord; tempRecord.id = "cluster"; tempRecord.seq = node.sequence; OverlapBlockList blockList; m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList); // Parse each member of the block list and potentially expand the cluster for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter) { // Check if the reads in this block are part of the cluster already BWTInterval canonicalInterval = iter->getCanonicalInterval(); int64_t canonicalIndex = canonicalInterval.lower; if(m_usedIndex.count(canonicalIndex) == 0) { // This is a new node that isn't in the cluster. Add it. m_usedIndex.insert(canonicalIndex); ClusterNode newNode; newNode.sequence = iter->getFullString(node.sequence); newNode.interval = canonicalInterval; newNode.isReverseInterval = iter->flags.isTargetRev(); m_queue.push(newNode); } } } }
// Return true if the terminalBlock is a substring of any member of blockList bool OverlapAlgorithm::isBlockSubstring(OverlapBlock& terminalBlock, const OverlapBlockList& blockList, double maxER) const { OverlapBlockList::const_iterator iter = blockList.begin(); size_t right_extension_length = terminalBlock.forwardHistory.size(); for(; iter != blockList.end(); ++iter) { if(terminalBlock.overlapLen == iter->overlapLen && right_extension_length == iter->forwardHistory.size()) { continue; // same length, cannot be a substring } // Calculate error rate between blocks double er = calculateBlockErrorRate(terminalBlock, *iter); if(isErrorRateAcceptable(er, maxER)) return true; } return false; }
// Update the overlap block list with a righthand extension to b, removing ranges that become invalid void OverlapAlgorithm::updateOverlapBlockRangesRight(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& obList, char canonical_base) const { OverlapBlockList::iterator iter = obList.begin(); while(iter != obList.end()) { char relative_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; BWTAlgorithms::updateBothR(iter->ranges, relative_base, iter->getExtensionBWT(pBWT, pRevBWT)); // remove the block from the list if its no longer valid if(!iter->ranges.isValid()) { iter = obList.erase(iter); } else { // Add the base to the extension history int currExtension = iter->forwardHistory.size(); iter->forwardHistory.add(currExtension, canonical_base); ++iter; } } }
void StringGraphGenerator::updateGraphAndQueue(GraphFrontier& currNode, FrontierQueue& queue, OverlapBlockList& blockList) { // Partition the block list into containment blocks and extension (valid) blocks // We do not add containment edges to the graph so the containments are discarded OverlapBlockList containList; OverlapBlockList overlapList; Vertex* pX = currNode.pVertex; //partitionBlockList(pX->getSeqLen(), &blockList, &overlapList, &containList); // Process the overlap blocks, adding new vertices and edges where necessary for(OverlapBlockList::iterator iter = blockList.begin(); iter != blockList.end(); ++iter) { if(iter->getEdgeDir() != currNode.dir) continue; std::string vertexID = iter->toCanonicalID(); if(vertexID == pX->getID()) continue; // skip self-edges std::string vertexSeq = iter->getFullString(pX->getSeq().toString()); Overlap o = iter->toOverlap(pX->getID(), vertexID, pX->getSeqLen(), vertexSeq.length()); /* #if DEBUGGENERATE std::cout << "has overlap to: " << vertexID << " len: " << iter->overlapLen << " flags: " << iter->flags << "\n"; std::cout << "Overlap string: " << iter->getOverlapString(pX->getSeq().toString()) << "\n"; #endif */ // Check if a vertex with endVertexID exists in the graph Vertex* pVertex = m_pGraph->getVertex(vertexID); if(pVertex == NULL) { #if DEBUGGENERATE std::cout << "Vertex with ID: " << vertexID << " does not exist, creating\n"; std::cout << "Vertex sequence: " << vertexSeq << "\n"; #endif // Generate the new vertex vertexSeq = iter->getFullString(pX->getSeq().toString()); pVertex = new(m_pGraph->getVertexAllocator()) Vertex(vertexID, vertexSeq); pVertex->setColor(UNEXPLORED_COLOR); m_pGraph->addVertex(pVertex); } // Construct the found edge Edge* pXY = SGAlgorithms::createEdgesFromOverlap(m_pGraph, o, true); // If the endpoint vertex is unexplored, queue it if(pVertex->getColor() == UNEXPLORED_COLOR) { GraphFrontier node; node.pVertex = pVertex; node.dir = !pXY->getTwin()->getDir(); // continuation direction 
node.distance = currNode.distance + pXY->getSeqLen(); if(node.distance <= m_maxDistance) queue.push(node); } } }
// Build the cluster of reads reachable by overlaps from the read in `item`.
//
// The read is first checked for being a substring of another read (the
// program prints an error and exits in that case). If any index in the
// read's FM-index interval is already marked in m_pMarkedReads the read
// belongs to an existing cluster and an empty result is returned.
// Otherwise the cluster is expanded breadth-first through the overlapper,
// the resulting nodes are sorted and de-duplicated, and the bit of the
// lowest interval index is claimed with a compare-and-swap. Only if that
// claim succeeds are the remaining bits set and the nodes returned; if it
// fails the collected nodes are discarded.
//
// item: work item holding the seed read.
// Returns the cluster nodes, or an empty ClusterResult when the read was
// already used or the claim on the marker bit failed.
ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
    // Calculate the intervals in the forward FM-index for this read
    const BWT* pBWT = m_pOverlapper->getBWT();

    // Check if this read is a substring
    OverlapBlockList tempBlockList;
    OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList);
    if(overlapResult.isSubstring)
    {
        std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
        exit(1);
    }

    // Find the interval in the fm-index containing the read
    std::string readString = item.read.seq.toString();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // The read must be present in the index
    assert(readInterval.isValid());

    // Check if this read has been used yet
    bool used = false;
    for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i)
    {
        if(m_pMarkedReads->test(i))
        {
            used = true;
            break;
        }
    }

    ClusterResult result;
    if(used)
        return result; // already part of a cluster, return nothing

    // Compute a new cluster around this read, seeded with the read itself
    std::set<int64_t> usedIndex;
    ClusterNodeQueue queue;

    ClusterNode seedNode;
    seedNode.sequence = readString;
    seedNode.interval = readInterval;
    seedNode.isReverseInterval = false;
    usedIndex.insert(readInterval.lower);
    queue.push(seedNode);

    while(!queue.empty())
    {
        // Distinct name on purpose: the previous code re-declared `node`
        // and `result` inside this loop, shadowing the outer variables.
        ClusterNode currNode = queue.front();
        queue.pop();

        // Update the result structure with this node's data
        result.clusterNodes.push_back(currNode);

        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = currNode.sequence;

        // Only the block list is needed; the returned OverlapResult is not used
        OverlapBlockList blockList;
        m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);

        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            // Check if the reads in this block are part of the cluster already
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;
            if(usedIndex.count(canonicalIndex) == 0)
            {
                usedIndex.insert(canonicalIndex);

                ClusterNode newNode;
                newNode.sequence = iter->getFullString(currNode.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                queue.push(newNode);
            }
        }
    }

    // If some work was performed, update the bitvector so other threads do not
    // try to merge the same set of reads. This uses compare-and-swap
    // instructions to ensure the update is atomic. If some other thread has
    // merged this set (and updated the bitvector), we discard all the merged data.
    // As a given set of reads should all be merged together, we only need to
    // make sure we atomically update the bit for the read with the lowest
    // index in the set.

    // Sort the intervals into ascending order and remove any duplicate
    // intervals (which can occur if the subgraph has a simple cycle)
    std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
    std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(),
                                                            result.clusterNodes.end(),
                                                            ClusterNode::equal);

    size_t oldSize = result.clusterNodes.size();
    result.clusterNodes.erase(newEnd, result.clusterNodes.end());
    size_t newSize = result.clusterNodes.size();
    if(oldSize != newSize)
        std::cout << "Warning: duplicate cluster nodes were found\n";

    // Check if the bit in the vector has already been set for the lowest read index.
    // If it has, some other thread has already output this set so we do nothing.
    // clusterNodes always holds at least the seed node at this point.
    int64_t lowestIndex = result.clusterNodes.front().interval.lower;
    bool currentValue = m_pMarkedReads->test(lowestIndex);
    bool updateSuccess = false;

    if(currentValue == false)
    {
        // Attempt to update the bit vector with an atomic CAS. If this returns
        // false the bit was set by some other thread
        updateSuccess = m_pMarkedReads->updateCAS(lowestIndex, currentValue, true);
    }

    if(updateSuccess)
    {
        // We successfully atomically set the bit for the first read in this set
        // to true. We can safely update the rest of the bits and keep the
        // merged sequences for output.
        std::vector<ClusterNode>::const_iterator iter = result.clusterNodes.begin();
        for(; iter != result.clusterNodes.end(); ++iter)
        {
            for(int64_t i = iter->interval.lower; i <= iter->interval.upper; ++i)
            {
                if(i == lowestIndex) // already set
                    continue;

                currentValue = m_pMarkedReads->test(i);
                if(currentValue)
                {
                    // This value should not be true, emit a warning
                    std::cout << "Warning: Bit " << i << " was set outside of critical section\n";
                    std::cout << "Read: " << readString << "\n";
                }
                else
                {
                    m_pMarkedReads->updateCAS(i, currentValue, true);
                }
            }
        }
    }
    else
    {
        // Some other thread merged these reads already, discard the
        // intermediate data and return an empty result
        result.clusterNodes.clear();
    }
    return result;
}
// Extend all the blocks in activeList by one base to the right // Move all right-terminal blocks to the termainl list. If a block // is terminal and potentially contained by another block, add it to // containedList void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList& terminalList, OverlapBlockList& /*containedList*/) const { OverlapBlockList::iterator iter = activeList.begin(); OverlapBlockList::iterator next; while(iter != activeList.end()) { next = iter; ++next; // Check if block is terminal AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT); if(ext_count.get('$') > 0) { // Only consider this block to be terminal irreducible if it has at least one extension // or else it is a substring block if(iter->forwardHistory.size() > 0) { OverlapBlock branched = *iter; BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT)); terminalList.push_back(branched); #ifdef DEBUGOVERLAP_2 std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n"; #endif } } int curr_extension = iter->forwardHistory.size(); // Perform the right extensions // Best case, there is only a single extension character // Handle this case specially so we don't need to copy the potentially // large OverlapBlock structure and its full history if(ext_count.hasUniqueDNAChar()) { // Get the extension character with respect to the queried sequence char canonical_base = ext_count.getUniqueDNAChar(); // Flip the base into the frame of reference for the block char block_base = iter->flags.isQueryComp() ? 
complement(canonical_base) : canonical_base; // Update the block using the base in its frame of reference BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT)); // Add the base to the history in the frame of reference of the query read // This is so the history is consistent when comparing between blocks from different strands iter->forwardHistory.add(curr_extension, canonical_base); } else { for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx) { char canonical_base = ALPHABET[idx]; char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; if(ext_count.get(canonical_base) == 0) continue; // Branch the sequence. This involves copying the entire history which can be large // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link // structure but branches are infrequent enough to not have a large impact OverlapBlock branched = *iter; BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT)); assert(branched.ranges.isValid()); // Add the base in the canonical frame branched.forwardHistory.add(curr_extension, canonical_base); // Insert the new block after the iterator activeList.insert(iter, branched); } // Remove the original block, which has been superceded by the branches activeList.erase(iter); } iter = next; // this skips the newly-inserted blocks } }
// Classify the blocks in activeList as irreducible, transitive or substrings. The irreducible blocks are
// put into pOBFinal. The remaining are discarded.
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
//
// pBWT, pRevBWT: indices forwarded to extendActiveBlocksRight.
// activeList:    blocks that are not yet right-terminal; it is extended in place round by round
//                and is always empty when this function returns.
// pOBFinal:      output list; terminal blocks not marked eliminated are appended to it.
//
// The function repeats rounds of right extension until either no active blocks remain
// or the all_eliminated flag is set at the end of a round.
void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT,
                                                        OverlapBlockList& activeList,
                                                        OverlapBlockList* pOBFinal) const
{
    // Nothing to classify
    if(activeList.empty())
        return;

    // The activeList contains all the blocks that are not yet right terminal
    // Count the extensions in the top level (longest) blocks first
    bool all_eliminated = false;
    while(!activeList.empty() && !all_eliminated)
    {
        // The terminalBlock list contains all the blocks that became right-terminal
        // in the current extension round.
        OverlapBlockList terminalList;
        OverlapBlockList potentialContainedList;

        // Perform a single round of extension, any terminal blocks
        // are moved to the terminated list
        // NOTE(review): extendActiveBlocksRight leaves its containedList parameter unnamed
        // and unused, so potentialContainedList appears to stay empty after this call.
        extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList);

        // Compare the blocks in the contained list against the other terminal and active blocks
        // If they are a substring match to any of these, discard them
        OverlapBlockList::iterator containedIter = potentialContainedList.begin();
        for(; containedIter != potentialContainedList.end(); ++containedIter)
        {
            if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) &&
               !isBlockSubstring(*containedIter, activeList, m_errorRate))
            {
                // Not a substring, move to terminal list
                terminalList.push_back(*containedIter);
                //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n";
            }
            else
            {
                // Substring of another block: intentionally dropped
                //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n";
            }
        }

        // Using the terminated blocks, mark as eliminated any active blocks
        // that form a valid overlap to the terminal block. These are transitive edges
        // We do not compare two terminal blocks, we don't consider these overlaps to be
        // transitive
        OverlapBlockList::iterator terminalIter = terminalList.begin();
        for(; terminalIter != terminalList.end(); ++terminalIter)
        {
#ifdef DEBUGOVERLAP
            std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n";
#endif
            // The flag is reset for every terminal block, so after this loop it reflects
            // only the last terminal block processed. If terminalList is empty the flag
            // keeps its previous value.
            // NOTE(review): confirm that per-terminal-block reset is the intended semantics.
            all_eliminated = true;
            OverlapBlockList::iterator activeIter = activeList.begin();
            for(; activeIter != activeList.end(); ++activeIter)
            {
                if(activeIter->isEliminated)
                    continue; // skip previously marked blocks

                // Two conditions must be met for a block to be transitive wrt terminal:
                // 1) It must have a strictly shorter overlap than the terminal block
                // 2) The error rate between the block and terminal must be less than the threshold
                double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter);
                if(activeIter->overlapLen < terminalIter->overlapLen &&
                   isErrorRateAcceptable(inferredErrorRate, m_errorRate))
                {
#ifdef DEBUGOVERLAP_2
                    std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n";
#endif
                    activeIter->isEliminated = true;
                }
                else
                {
                    // At least one active block survives this terminal block,
                    // so another extension round is required
                    all_eliminated = false;
                }
            }

            // Move this block to the final list if it has not been previously marked eliminated
            // (terminal blocks are copies of active blocks, so they carry the isEliminated
            // value the active block had when it was copied)
            if(!terminalIter->isEliminated)
            {
#ifdef DEBUGOVERLAP
                std::cout << "[II] Adding block " << *terminalIter << " to final list\n";
                //std::cout << " extension: " << terminalIter->forwardHistory << "\n";
#endif
                pOBFinal->push_back(*terminalIter);
            }
        }
    }

    // Whatever is left in the active list is discarded
    activeList.clear();
}
// Calculate the single right extension to the '$' for each the contained blocks // so that the interval ranges are consistent void OverlapAlgorithm::terminateContainedBlocks(OverlapBlockList& containedBlocks) const { for(OverlapBlockList::iterator iter = containedBlocks.begin(); iter != containedBlocks.end(); ++iter) BWTAlgorithms::updateBothR(iter->ranges, '$', iter->getExtensionBWT(m_pBWT, m_pRevBWT)); }
// Construct the set of blocks describing irreducible overlaps with READ // and write the blocks to pOBOut OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const { OverlapResult result; // The complete set of overlap blocks are collected in obWorkingList // The filtered set (containing only irreducible overlaps) are placed into pOBOut // by calculateIrreducibleHits OverlapBlockList obWorkingList; std::string seq = read.seq.toString(); // We store the various overlap blocks using a number of lists, one for the containments // in the forward and reverse index and one for each set of overlap blocks OverlapBlockList oblFwdContain; OverlapBlockList oblRevContain; OverlapBlockList oblSuffixFwd; OverlapBlockList oblSuffixRev; OverlapBlockList oblPrefixFwd; OverlapBlockList oblPrefixRev; // Match the suffix of seq to prefixes findOverlapBlocksExact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &oblSuffixFwd, &oblFwdContain, result); if (!m_noReverse) { findOverlapBlocksExact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &oblSuffixRev, &oblRevContain, result); } // Match the prefix of seq to suffixes if (!m_noReverse) { findOverlapBlocksExact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &oblPrefixFwd, &oblFwdContain, result); } findOverlapBlocksExact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &oblPrefixRev, &oblRevContain, result); // Remove submaximal blocks for each block list including fully contained blocks // Copy the containment blocks into the prefix/suffix lists oblSuffixFwd.insert(oblSuffixFwd.end(), oblFwdContain.begin(), oblFwdContain.end()); oblPrefixFwd.insert(oblPrefixFwd.end(), oblFwdContain.begin(), oblFwdContain.end()); oblSuffixRev.insert(oblSuffixRev.end(), oblRevContain.begin(), oblRevContain.end()); oblPrefixRev.insert(oblPrefixRev.end(), oblRevContain.begin(), oblRevContain.end()); // Perform the submaximal filter 
removeSubMaximalBlocks(&oblSuffixFwd, m_pBWT, m_pRevBWT); removeSubMaximalBlocks(&oblPrefixFwd, m_pBWT, m_pRevBWT); removeSubMaximalBlocks(&oblSuffixRev, m_pRevBWT, m_pBWT); removeSubMaximalBlocks(&oblPrefixRev, m_pRevBWT, m_pBWT); // Remove the contain blocks from the suffix/prefix lists removeContainmentBlocks(seq.length(), &oblSuffixFwd); removeContainmentBlocks(seq.length(), &oblPrefixFwd); removeContainmentBlocks(seq.length(), &oblSuffixRev); removeContainmentBlocks(seq.length(), &oblPrefixRev); // Join the suffix and prefix lists oblSuffixFwd.splice(oblSuffixFwd.end(), oblSuffixRev); oblPrefixFwd.splice(oblPrefixFwd.end(), oblPrefixRev); // Move the containments to the output list pOBOut->splice(pOBOut->end(), oblFwdContain); pOBOut->splice(pOBOut->end(), oblRevContain); // Filter out transitive overlap blocks if requested if(m_bIrreducible) { computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblSuffixFwd, pOBOut); computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblPrefixFwd, pOBOut); } else { pOBOut->splice(pOBOut->end(), oblSuffixFwd); pOBOut->splice(pOBOut->end(), oblPrefixFwd); } return result; }