// Build the graph by expanding nodes on the frontier void StringGraphGenerator::buildGraph(FrontierQueue& queue) { while(!queue.empty()) { if(queue.size() > 200) break; GraphFrontier node = queue.front(); queue.pop(); if(node.pVertex->getColor() == EXPLORED_COLOR) continue; // node has been visited already // Search the FM-index for the current vertex SeqRecord record; record.id = node.pVertex->getID(); record.seq = node.pVertex->getSeq().toString(); OverlapBlockList blockList; assert(blockList.empty()); m_pOverlapper->overlapRead(record, m_minOverlap, &blockList); // Update the graph and the frontier queue with newly found vertices updateGraphAndQueue(node, queue, blockList); node.pVertex->setColor(EXPLORED_COLOR); } m_pGraph->setColors(GC_WHITE); }
// Return the graph vertex representing the terminal read `record`, creating it
// if it is not in the graph yet.
//
// The read is looked up with alignReadDuplicate(); the block that matches it
// exactly (numDiff == 0) in the forward orientation supplies the canonical
// vertex ID via toCanonicalID(). Such a block is required to exist; this is
// enforced with an assert only.
//
// record: the terminal read to add.
// Returns the existing or newly allocated vertex (owned by m_pGraph).
Vertex* StringGraphGenerator::addTerminalVertex(const SeqRecord& record)
{
    assert(m_pGraph != NULL);

    // Perform a full-length search for the sequence in the FM-index
    OverlapBlockList endBlockList;
    m_pOverlapper->alignReadDuplicate(record, &endBlockList);

    // Scan for the block that corresponds to the actual sequence of the read.
    // The iterator must advance on every pass: the previous version never
    // incremented it and looped forever unless the first block matched.
    OverlapBlockList::iterator matchIter = endBlockList.begin();
    for(; matchIter != endBlockList.end(); ++matchIter)
    {
        if(matchIter->numDiff == 0 && !matchIter->flags.isQueryRev())
            break;
    }
    assert(matchIter != endBlockList.end());

    // Construct the canonical ID from the matching block
    std::string endID = matchIter->toCanonicalID();

    Vertex* pVertex = m_pGraph->getVertex(endID);
    if(pVertex == NULL)
    {
        pVertex = new(m_pGraph->getVertexAllocator()) Vertex(endID, record.seq.toString());
        m_pGraph->addVertex(pVertex);
    }
    return pVertex;
}
// Run the cluster process. If the number of total nodes // exceeds max, abort the search. void ReadCluster::run(size_t max) { while(!m_queue.empty()) { if(m_queue.size() + m_outCluster.size() > max) { while(!m_queue.empty()) m_queue.pop(); m_outCluster.clear(); return; } ClusterNode node = m_queue.front(); m_queue.pop(); // Add this node to the output m_outCluster.push_back(node); // Find overlaps for the current node SeqRecord tempRecord; tempRecord.id = "cluster"; tempRecord.seq = node.sequence; OverlapBlockList blockList; m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList); // Parse each member of the block list and potentially expand the cluster for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter) { // Check if the reads in this block are part of the cluster already BWTInterval canonicalInterval = iter->getCanonicalInterval(); int64_t canonicalIndex = canonicalInterval.lower; if(m_usedIndex.count(canonicalIndex) == 0) { // This is a new node that isn't in the cluster. Add it. m_usedIndex.insert(canonicalIndex); ClusterNode newNode; newNode.sequence = iter->getFullString(node.sequence); newNode.interval = canonicalInterval; newNode.isReverseInterval = iter->flags.isTargetRev(); m_queue.push(newNode); } } } }
// Return true if the terminalBlock is a substring of any member of blockList bool OverlapAlgorithm::isBlockSubstring(OverlapBlock& terminalBlock, const OverlapBlockList& blockList, double maxER) const { OverlapBlockList::const_iterator iter = blockList.begin(); size_t right_extension_length = terminalBlock.forwardHistory.size(); for(; iter != blockList.end(); ++iter) { if(terminalBlock.overlapLen == iter->overlapLen && right_extension_length == iter->forwardHistory.size()) { continue; // same length, cannot be a substring } // Calculate error rate between blocks double er = calculateBlockErrorRate(terminalBlock, *iter); if(isErrorRateAcceptable(er, maxER)) return true; } return false; }
// Update the overlap block list with a righthand extension to b, removing ranges that become invalid void OverlapAlgorithm::updateOverlapBlockRangesRight(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& obList, char canonical_base) const { OverlapBlockList::iterator iter = obList.begin(); while(iter != obList.end()) { char relative_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base; BWTAlgorithms::updateBothR(iter->ranges, relative_base, iter->getExtensionBWT(pBWT, pRevBWT)); // remove the block from the list if its no longer valid if(!iter->ranges.isValid()) { iter = obList.erase(iter); } else { // Add the base to the extension history int currExtension = iter->forwardHistory.size(); iter->forwardHistory.add(currExtension, canonical_base); ++iter; } } }
void StringGraphGenerator::updateGraphAndQueue(GraphFrontier& currNode, FrontierQueue& queue, OverlapBlockList& blockList) { // Partition the block list into containment blocks and extension (valid) blocks // We do not add containment edges to the graph so the containments are discarded OverlapBlockList containList; OverlapBlockList overlapList; Vertex* pX = currNode.pVertex; //partitionBlockList(pX->getSeqLen(), &blockList, &overlapList, &containList); // Process the overlap blocks, adding new vertices and edges where necessary for(OverlapBlockList::iterator iter = blockList.begin(); iter != blockList.end(); ++iter) { if(iter->getEdgeDir() != currNode.dir) continue; std::string vertexID = iter->toCanonicalID(); if(vertexID == pX->getID()) continue; // skip self-edges std::string vertexSeq = iter->getFullString(pX->getSeq().toString()); Overlap o = iter->toOverlap(pX->getID(), vertexID, pX->getSeqLen(), vertexSeq.length()); /* #if DEBUGGENERATE std::cout << "has overlap to: " << vertexID << " len: " << iter->overlapLen << " flags: " << iter->flags << "\n"; std::cout << "Overlap string: " << iter->getOverlapString(pX->getSeq().toString()) << "\n"; #endif */ // Check if a vertex with endVertexID exists in the graph Vertex* pVertex = m_pGraph->getVertex(vertexID); if(pVertex == NULL) { #if DEBUGGENERATE std::cout << "Vertex with ID: " << vertexID << " does not exist, creating\n"; std::cout << "Vertex sequence: " << vertexSeq << "\n"; #endif // Generate the new vertex vertexSeq = iter->getFullString(pX->getSeq().toString()); pVertex = new(m_pGraph->getVertexAllocator()) Vertex(vertexID, vertexSeq); pVertex->setColor(UNEXPLORED_COLOR); m_pGraph->addVertex(pVertex); } // Construct the found edge Edge* pXY = SGAlgorithms::createEdgesFromOverlap(m_pGraph, o, true); // If the endpoint vertex is unexplored, queue it if(pVertex->getColor() == UNEXPLORED_COLOR) { GraphFrontier node; node.pVertex = pVertex; node.dir = !pXY->getTwin()->getDir(); // continuation direction 
node.distance = currNode.distance + pXY->getSeqLen(); if(node.distance <= m_maxDistance) queue.push(node); } } }
// Build the cluster of reads connected to the work item's read by overlaps.
//
// The read is located in the FM-index, then the overlap graph is explored
// breadth-first from it. The resulting node set is claimed by atomically
// setting the bit of its lowest interval index in m_pMarkedReads. If the claim
// succeeds, the bits of all other members are set as well and the cluster is
// returned. An empty result is returned when the read was already marked or
// when the claim fails because another thread has set the bit first.
//
// Terminates the program if the read is reported to be a substring read.
ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
    const BWT* pBWT = m_pOverlapper->getBWT();

    // Substring reads are not supported, they must be removed beforehand
    OverlapBlockList duplicateBlocks;
    OverlapResult duplicateCheck = m_pOverlapper->alignReadDuplicate(item.read, &duplicateBlocks);
    if(duplicateCheck.isSubstring)
    {
        std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
        exit(1);
    }

    // Locate the interval of the complete read in the FM-index
    std::string readString = item.read.seq.toString();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // The read must be present in the index
    assert(readInterval.isValid());

    ClusterResult result;

    // If any index of the read's interval is marked, the read already belongs
    // to a cluster and nothing is returned
    for(int64_t idx = readInterval.lower; idx <= readInterval.upper; ++idx)
    {
        if(m_pMarkedReads->test(idx))
            return result;
    }

    // Seed the breadth-first search with the read itself
    std::set<int64_t> seenIndices;
    ClusterNodeQueue pending;

    ClusterNode seed;
    seed.sequence = readString;
    seed.interval = readInterval;
    seed.isReverseInterval = false;
    seenIndices.insert(readInterval.lower);
    pending.push(seed);

    while(!pending.empty())
    {
        ClusterNode current = pending.front();
        pending.pop();

        // Every dequeued node becomes a member of the cluster
        result.clusterNodes.push_back(current);

        SeqRecord query;
        query.id = "cluster";
        query.seq = current.sequence;

        OverlapBlockList overlaps;
        m_pOverlapper->overlapRead(query, m_minOverlap, &overlaps);

        // Queue the reads of every block that has not been seen before
        OverlapBlockList::const_iterator blockIter = overlaps.begin();
        for(; blockIter != overlaps.end(); ++blockIter)
        {
            BWTInterval canonical = blockIter->getCanonicalInterval();
            int64_t key = canonical.lower;
            if(seenIndices.count(key) != 0)
                continue;

            seenIndices.insert(key);

            ClusterNode discovered;
            discovered.sequence = blockIter->getFullString(current.sequence);
            discovered.interval = canonical;
            discovered.isReverseInterval = blockIter->flags.isTargetRev();
            pending.push(discovered);
        }
    }

    // Claim the cluster so that other threads do not emit the same set of
    // reads. Only the bit of the lowest index in the set is updated with
    // compare-and-swap; whoever sets it owns the whole set.

    // Sort the nodes and drop duplicates (which can occur if the subgraph has
    // a simple cycle)
    std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
    const size_t sizeBefore = result.clusterNodes.size();
    std::vector<ClusterNode>::iterator uniqueEnd =
        std::unique(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::equal);
    result.clusterNodes.erase(uniqueEnd, result.clusterNodes.end());
    if(sizeBefore != result.clusterNodes.size())
        std::cout << "Warning: duplicate cluster nodes were found\n";

    // Try to take ownership through the lowest read index. If the bit is set
    // already, or the CAS fails, another thread got there first.
    int64_t lowestIndex = result.clusterNodes.front().interval.lower;
    bool claimed = false;
    if(!m_pMarkedReads->test(lowestIndex))
        claimed = m_pMarkedReads->updateCAS(lowestIndex, false, true);

    if(!claimed)
    {
        // Some other thread handled these reads, discard the collected data
        result.clusterNodes.clear();
        return result;
    }

    // The set is ours: mark the remaining indices of every member
    std::vector<ClusterNode>::const_iterator nodeIter = result.clusterNodes.begin();
    for(; nodeIter != result.clusterNodes.end(); ++nodeIter)
    {
        for(int64_t idx = nodeIter->interval.lower; idx <= nodeIter->interval.upper; ++idx)
        {
            if(idx == lowestIndex)
                continue; // set by the claim above

            if(m_pMarkedReads->test(idx))
            {
                // This bit is not expected to be set, emit a warning
                std::cout << "Warning: Bit " << idx << " was set outside of critical section\n";
                std::cout << "Read: " << readString << "\n";
            }
            else
            {
                m_pMarkedReads->updateCAS(idx, false, true);
            }
        }
    }

    return result;
}
// Extend all the blocks in activeList by one base to the right.
//
// For every block the canonical extension counts are computed first:
//  - If the block can be extended by '$' and already has at least one base of
//    forward history, a copy of it, updated with '$', is appended to
//    terminalList. The block itself stays in activeList for the DNA extensions.
//  - If exactly one DNA base extends the block, the block is updated in place.
//  - Otherwise the block is replaced by one copy per DNA base with a non-zero
//    count (by no copy at all if there is none).
//
// pBWT, pRevBWT: the index pair passed on to the block/BWT helpers.
// activeList:    blocks to extend; modified in place.
// terminalList:  receives the '$'-terminated copies.
// The last parameter (formerly containedList) is unnamed and never written to.
void OverlapAlgorithm::extendActiveBlocksRight(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList& terminalList, OverlapBlockList& /*containedList*/) const
{
    OverlapBlockList::iterator iter = activeList.begin();
    OverlapBlockList::iterator next;
    while(iter != activeList.end())
    {
        // Remember the successor now; iter may be erased further down
        next = iter;
        ++next;

        // Check if block is terminal
        AlphaCount64 ext_count = iter->getCanonicalExtCount(pBWT, pRevBWT);
        if(ext_count.get('$') > 0)
        {
            // Only consider this block to be terminal irreducible if it has at least one extension
            // or else it is a substring block
            if(iter->forwardHistory.size() > 0)
            {
                // Work on a copy so the original can still be extended below
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                terminalList.push_back(branched);
#ifdef DEBUGOVERLAP_2
                std::cout << "Block of length " << iter->overlapLen << " moved to terminal\n";
#endif
            }
        }

        // Position in the forward history at which the new base is recorded
        int curr_extension = iter->forwardHistory.size();

        // Perform the right extensions
        // Best case, there is only a single extension character
        // Handle this case specially so we don't need to copy the potentially
        // large OverlapBlock structure and its full history
        if(ext_count.hasUniqueDNAChar())
        {
            // Get the extension character with respect to the queried sequence
            char canonical_base = ext_count.getUniqueDNAChar();

            // Flip the base into the frame of reference for the block
            char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;

            // Update the block using the base in its frame of reference
            BWTAlgorithms::updateBothR(iter->ranges, block_base, iter->getExtensionBWT(pBWT, pRevBWT));

            // Add the base to the history in the frame of reference of the query read
            // This is so the history is consistent when comparing between blocks from different strands
            iter->forwardHistory.add(curr_extension, canonical_base);
        }
        else
        {
            for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
            {
                char canonical_base = ALPHABET[idx];
                char block_base = iter->flags.isQueryComp() ? complement(canonical_base) : canonical_base;
                if(ext_count.get(canonical_base) == 0)
                    continue;

                // Branch the sequence. This involves copying the entire history which can be large
                // if the input sequences are very long. This could be avoided by using the SearchHistoyNode/Link
                // structure but branches are infrequent enough to not have a large impact
                OverlapBlock branched = *iter;
                BWTAlgorithms::updateBothR(branched.ranges, block_base, branched.getExtensionBWT(pBWT, pRevBWT));
                assert(branched.ranges.isValid());

                // Add the base in the canonical frame
                branched.forwardHistory.add(curr_extension, canonical_base);

                // Insert the new block in front of the original block
                // (insert() places the element before the given iterator)
                activeList.insert(iter, branched);
            }

            // Remove the original block, which has been superseded by the branches
            activeList.erase(iter);
        }

        // The branches were inserted before iter and next was taken behind it,
        // so this skips the newly-inserted blocks
        iter = next;
    }
}
// Classify the blocks in activeList as irreducible, transitive or substrings.
// The irreducible blocks are appended to pOBFinal. The remaining are discarded.
//
// The blocks are extended to the right one base per round. Blocks that become
// terminal in a round are compared against the blocks that are still active:
// an active block with a strictly shorter overlap and an acceptable error rate
// with respect to the terminal block is marked as eliminated (transitive).
// The rounds stop when activeList is empty or all_eliminated is set.
//
// pBWT, pRevBWT: the index pair passed on to the extension helper.
// activeList:    the candidate blocks; always empty when the function returns.
// pOBFinal:      output list for the irreducible blocks.
//
// Invariant (from the original author): the blocks are ordered in descending
// order of the overlap size so that the longest overlap is first.
void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& activeList, OverlapBlockList* pOBFinal) const
{
    if(activeList.empty())
        return;

    // The activeList contains all the blocks that are not yet right terminal
    // Count the extensions in the top level (longest) blocks first
    bool all_eliminated = false;
    while(!activeList.empty() && !all_eliminated)
    {
        // The terminalBlock list contains all the blocks that became right-terminal
        // in the current extension round.
        OverlapBlockList terminalList;

        // NOTE(review): extendActiveBlocksRight leaves its last parameter unnamed
        // and unused, so this list appears to stay empty and the containment
        // loop below would not execute - confirm before relying on it.
        OverlapBlockList potentialContainedList;

        // Perform a single round of extension, any terminal blocks
        // are moved to the terminated list
        extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList);

        // Compare the blocks in the contained list against the other terminal and active blocks
        // If they are a substring match to any of these, discard them
        OverlapBlockList::iterator containedIter = potentialContainedList.begin();
        for(; containedIter != potentialContainedList.end(); ++containedIter)
        {
            if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && !isBlockSubstring(*containedIter, activeList, m_errorRate))
            {
                // Not a substring, move to terminal list
                terminalList.push_back(*containedIter);
                //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n";
            }
            else
            {
                //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n";
            }
        }

        // Using the terminated blocks, mark as eliminated any active blocks
        // that form a valid overlap to the terminal block. These are transitive edges
        // We do not compare two terminal blocks, we don't consider these overlaps to be
        // transitive
        OverlapBlockList::iterator terminalIter = terminalList.begin();
        for(; terminalIter != terminalList.end(); ++terminalIter)
        {
#ifdef DEBUGOVERLAP
            std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n";
#endif
            // The flag is reset for every terminal block, so after this loop it
            // reflects the last terminal block only. It remains true when no
            // active block fails the elimination test below (previously
            // eliminated blocks are skipped and do not clear it).
            all_eliminated = true;
            OverlapBlockList::iterator activeIter = activeList.begin();
            for(; activeIter != activeList.end(); ++activeIter)
            {
                if(activeIter->isEliminated)
                    continue; // skip previously marked blocks

                // Two conditions must be met for a block to be transitive wrt terminal:
                // 1) It must have a strictly shorter overlap than the terminal block
                // 2) The error rate between the block and terminal must be less than the threshold
                double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter);
                if(activeIter->overlapLen < terminalIter->overlapLen && isErrorRateAcceptable(inferredErrorRate, m_errorRate))
                {
#ifdef DEBUGOVERLAP_2
                    std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n";
#endif
                    activeIter->isEliminated = true;
                }
                else
                {
                    all_eliminated = false;
                }
            }

            // Move this block to the final list if it has not been previously marked eliminated
            // (terminal blocks are copies of active blocks and carry the flag with them)
            if(!terminalIter->isEliminated)
            {
#ifdef DEBUGOVERLAP
                std::cout << "[II] Adding block " << *terminalIter << " to final list\n";
                //std::cout << "  extension: " << terminalIter->forwardHistory << "\n";
#endif
                pOBFinal->push_back(*terminalIter);
            }
        }
    }

    // Whatever is still active at this point is discarded
    activeList.clear();
}
// Iterate through inList and determine the overlaps that are irreducible.
// This is the iterative implementation: instead of recursing on a branch, the
// blocks are kept in groups that are processed round by round.
// The final overlap blocks corresponding to irreducible overlaps are appended to pOBFinal.
//
// pBWT, pRevBWT: the index pair passed on to the block/BWT helpers.
// inList:        the candidate blocks; it is copied, the caller's list is not modified.
// pOBFinal:      output list for the irreducible blocks.
//
// Terminates the program if a top-level block turns out to be a substring block.
//
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
// Invariant: each block corresponds to the same extension of the root sequence w.
void OverlapAlgorithm::_processIrreducibleBlocksExactIterative(const BWT* pBWT, const BWT* pRevBWT, OverlapBlockList& inList, OverlapBlockList* pOBFinal) const
{
    if(inList.empty())
        return;

    // We store the overlap blocks in groups of blocks that have the same right-extension.
    // When a branch is found, the groups are split based on the extension
    typedef std::list<OverlapBlockList> BlockGroups;
    BlockGroups blockGroups;
    blockGroups.push_back(inList);

    // Statistics counters; they are incremented below but never read in this function
    int numExtensions = 0;
    int numBranches = 0;
    while(!blockGroups.empty())
    {
        // Perform one extension round for each group.
        // If the top-level block has ended, push the result
        // to the final list and remove the group from processing
        BlockGroups::iterator groupIter = blockGroups.begin();
        BlockGroups incomingGroups; // Branched blocks are placed here
        while(groupIter != blockGroups.end())
        {
            OverlapBlockList& currList = *groupIter;
            bool bEraseGroup = false;

            // Count the extensions in the top level (longest) blocks first
            int topLen = currList.front().overlapLen;
            AlphaCount64 ext_count;
            OBLIter blockIter = currList.begin();
            while(blockIter != currList.end() && blockIter->overlapLen == topLen)
            {
                ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT);
                ++blockIter;
            }

            // Three cases:
            // 1) The top level block has ended as it contains the extension $. Output TLB and end.
            // 2) There is a singular unique extension base for all the blocks. Update the blocks and continue.
            // 3) There are multiple extension bases, split the block group and continue.
            // If some block other than the TLB ended, it must be contained within the TLB and it is not output
            // or considered further.
            // Likewise if multiple distinct strings in the TLB ended, we only output the top one. The rest
            // must have the same sequence as the top one and are hence considered to be contained with the top element.
            // NOTE(review): if the counts contain neither '$' nor any DNA base, none of the
            // branches below changes the group and it is never erased - confirm that
            // this cannot happen for a consistent index.
            if(ext_count.get('$') > 0)
            {
                // An irreducible overlap has been found. It is possible that there are two top level blocks
                // (one in the forward and reverse direction). Since we can't decide which one
                // contains the other at this point, we output hits to both. Under a fixed
                // length string assumption one will be contained within the other and removed later.
                OBLIter tlbIter = currList.begin();
                while(tlbIter != currList.end() && tlbIter->overlapLen == topLen)
                {
                    // Ensure the tlb is actually terminal and not a substring block
                    AlphaCount64 test_count = tlbIter->getCanonicalExtCount(pBWT, pRevBWT);
                    if(test_count.get('$') == 0)
                    {
                        std::cerr << "Error: substring read found during overlap computation.\n";
                        std::cerr << "Please run sga rmdup before sga overlap\n";
                        exit(EXIT_FAILURE);
                    }

                    // Perform the final right-update to make the block terminal
                    OverlapBlock branched = *tlbIter;
                    BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                    pOBFinal->push_back(branched);
#ifdef DEBUGOVERLAP
                    std::cout << "[IE] TLB of length " << branched.overlapLen << " has ended\n";
                    std::cout << "[IE]\tBlock data: " << branched << "\n";
#endif
                    ++tlbIter;
                }

                // Set the flag to erase this group, it is finished
                bEraseGroup = true;
            }
            else
            {
                // Count the extension for the rest of the blocks
                // (blockIter still points at the first block shorter than topLen)
                while(blockIter != currList.end())
                {
                    ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT);
                    ++blockIter;
                }

                if(ext_count.hasUniqueDNAChar())
                {
                    // Update all the blocks using the unique extension character
                    // This character is in the canonical representation wrt to the query
                    char b = ext_count.getUniqueDNAChar();
                    updateOverlapBlockRangesRight(pBWT, pRevBWT, currList, b);
                    numExtensions++;
                    bEraseGroup = false;
                }
                else
                {
                    // Split: every base with a non-zero count gets its own copy of the
                    // group, extended by that base. The copies are parked in
                    // incomingGroups and the original group is dropped.
                    for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
                    {
                        char b = ALPHABET[idx];
                        if(ext_count.get(b) > 0)
                        {
                            numBranches++;
                            OverlapBlockList branched = currList;
                            updateOverlapBlockRangesRight(pBWT, pRevBWT, branched, b);
                            incomingGroups.push_back(branched);
                            bEraseGroup = true;
                        }
                    }
                }
            }

            // erase() returns the iterator following the removed group
            if(bEraseGroup)
                groupIter = blockGroups.erase(groupIter);
            else
                ++groupIter;
        }

        // Splice in the newly branched blocks, if any
        blockGroups.splice(blockGroups.end(), incomingGroups);
    }
}
// Calculate the single right extension to the '$' for each the contained blocks // so that the interval ranges are consistent void OverlapAlgorithm::terminateContainedBlocks(OverlapBlockList& containedBlocks) const { for(OverlapBlockList::iterator iter = containedBlocks.begin(); iter != containedBlocks.end(); ++iter) BWTAlgorithms::updateBothR(iter->ranges, '$', iter->getExtensionBWT(m_pBWT, m_pRevBWT)); }
// Compute the inexact overlaps of `read` and write the blocks to pOBOut.
//
// Two passes are made: the suffix of the read against prefixes, then the prefix
// of the read against suffixes. Each pass runs two searches and, if both
// complete, moves its blocks to pOBOut - reduced to the irreducible ones when
// m_bIrreducible is set. As soon as one search reports failure, the remaining
// searches are skipped, pOBOut is cleared and the result is flagged with
// searchAborted.
OverlapResult OverlapAlgorithm::overlapReadInexact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    OverlapBlockList pending;
    std::string seq = read.seq.toString();

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " suffix\n";
#endif

    // Pass 1: match the suffix of seq to prefixes.
    // The second search is only started if the first one completed.
    bool ok = findOverlapBlocksInexact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &pending, pOBOut, result) &&
              findOverlapBlocksInexact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &pending, pOBOut, result);

    if(ok)
    {
        if(!m_bIrreducible)
        {
            pOBOut->splice(pOBOut->end(), pending);
            assert(pending.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &pending, pOBOut);
            pending.clear();
        }
    }

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " prefix\n";
#endif

    // Pass 2: match the prefix of seq to suffixes, unless pass 1 failed
    ok = ok &&
         findOverlapBlocksInexact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &pending, pOBOut, result) &&
         findOverlapBlocksInexact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &pending, pOBOut, result);

    if(ok)
    {
        if(!m_bIrreducible)
        {
            pOBOut->splice(pOBOut->end(), pending);
            assert(pending.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &pending, pOBOut);
            pending.clear();
        }
    }
    else
    {
        // A search failed: report no overlaps at all
        pOBOut->clear();
        result.isSubstring = false;
        result.searchAborted = true;
    }

    return result;
}
// Seeded blockwise BWT alignment of prefix-suffix for reads // Each alignment is given a seed region and a block region // The seed region is the terminal portion of w where maxDiff + 1 seeds are created // at least 1 of these seeds must align exactly for there to be an alignment with // at most maxDiff differences between the prefix/suffix. Only alignments within the // range [block_start, block_end] are output. The block_end coordinate is inclusive. bool OverlapAlgorithm::findOverlapBlocksInexact(const std::string& w, const BWT* pBWT, const BWT* pRevBWT, const AlignFlags& af, int minOverlap, OverlapBlockList* pOverlapList, OverlapBlockList* pContainList, OverlapResult& result) const { int len = w.length(); int overlap_region_left = len - minOverlap; SearchSeedVector* pCurrVector = new SearchSeedVector; SearchSeedVector* pNextVector = new SearchSeedVector; OverlapBlockList workingList; SearchSeedVector::iterator iter; // Create and extend the initial seeds int actual_seed_length = m_seedLength; int actual_seed_stride = m_seedStride; if(actual_seed_length == 0) { // Calculate a seed length and stride that will guarantee all overlaps // with error rate m_errorRate will be found calculateSeedParameters(w, minOverlap, actual_seed_length, actual_seed_stride); } assert(actual_seed_stride != 0); createSearchSeeds(w, pBWT, pRevBWT, actual_seed_length, actual_seed_stride, pCurrVector); extendSeedsExactRight(w, pBWT, pRevBWT, ED_RIGHT, pCurrVector, pNextVector); pCurrVector->clear(); pCurrVector->swap(*pNextVector); assert(pNextVector->empty()); int num_steps = 0; // Perform the inexact extensions bool fail = false; while(!pCurrVector->empty()) { if(m_maxSeeds != -1 && (int)pCurrVector->size() > m_maxSeeds) { fail = true; break; } iter = pCurrVector->begin(); while(iter != pCurrVector->end()) { SearchSeed& align = *iter; // If the current aligned region is right-terminal // and the overlap is greater than minOverlap, try to find overlaps // or containments 
if(align.right_index == len - 1) { double align_error = align.calcErrorRate(); // Check for overlaps if(align.left_index <= overlap_region_left && isErrorRateAcceptable(align_error, m_errorRate)) { int overlapLen = len - align.left_index; BWTIntervalPair probe = align.ranges; BWTAlgorithms::updateBothL(probe, '$', pBWT); // The probe interval contains the range of proper prefixes if(probe.interval[1].isValid()) { assert(probe.interval[1].lower > 0); OverlapBlock nBlock(probe, align.ranges, overlapLen, align.z, af, align.historyLink->getHistoryVector()); workingList.push_back(nBlock); } } // Check for containments // If the seed is left-terminal and there are [ACGT] left/right extensions of the sequence // this read must be a substring of another read if(align.left_index == 0) { AlphaCount64 left_ext = BWTAlgorithms::getExtCount(align.ranges.interval[0], pBWT); AlphaCount64 right_ext = BWTAlgorithms::getExtCount(align.ranges.interval[1], pRevBWT); if(left_ext.hasDNAChar() || right_ext.hasDNAChar()) result.isSubstring = true; } } // Extend the seed to the right/left if(align.dir == ED_RIGHT) extendSeedInexactRight(align, w, pBWT, pRevBWT, pNextVector); else extendSeedInexactLeft(align, w, pBWT, pRevBWT, pNextVector); ++iter; //pCurrVector->erase(iter++); } pCurrVector->clear(); assert(pCurrVector->empty()); pCurrVector->swap(*pNextVector); // Remove identical seeds after we have performed seed_len steps // as there now might be redundant seeds if(num_steps % actual_seed_stride == 0) { std::sort(pCurrVector->begin(), pCurrVector->end(), SearchSeed::compareLeftRange); SearchSeedVector::iterator end_iter = std::unique(pCurrVector->begin(), pCurrVector->end(), SearchSeed::equalLeftRange); pCurrVector->resize(end_iter - pCurrVector->begin()); } ++num_steps; } if(!fail) { // parse the working list to remove any submaximal overlap blocks // these blocks correspond to reads that have multiple valid overlaps. 
// we only keep the longest removeSubMaximalBlocks(&workingList, pBWT, pRevBWT); OverlapBlockList containedWorkingList; partitionBlockList(len, &workingList, pOverlapList, &containedWorkingList); // Terminate the contained blocks terminateContainedBlocks(containedWorkingList); // Move the contained blocks to the final contained list pContainList->splice(pContainList->end(), containedWorkingList); } delete pCurrVector; delete pNextVector; return !fail; }
// Construct the set of blocks describing irreducible overlaps with READ // and write the blocks to pOBOut OverlapResult OverlapAlgorithm::overlapReadExact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const { OverlapResult result; // The complete set of overlap blocks are collected in obWorkingList // The filtered set (containing only irreducible overlaps) are placed into pOBOut // by calculateIrreducibleHits OverlapBlockList obWorkingList; std::string seq = read.seq.toString(); // We store the various overlap blocks using a number of lists, one for the containments // in the forward and reverse index and one for each set of overlap blocks OverlapBlockList oblFwdContain; OverlapBlockList oblRevContain; OverlapBlockList oblSuffixFwd; OverlapBlockList oblSuffixRev; OverlapBlockList oblPrefixFwd; OverlapBlockList oblPrefixRev; // Match the suffix of seq to prefixes findOverlapBlocksExact(seq, m_pBWT, m_pRevBWT, sufPreAF, minOverlap, &oblSuffixFwd, &oblFwdContain, result); if (!m_noReverse) { findOverlapBlocksExact(complement(seq), m_pRevBWT, m_pBWT, prePreAF, minOverlap, &oblSuffixRev, &oblRevContain, result); } // Match the prefix of seq to suffixes if (!m_noReverse) { findOverlapBlocksExact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF, minOverlap, &oblPrefixFwd, &oblFwdContain, result); } findOverlapBlocksExact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF, minOverlap, &oblPrefixRev, &oblRevContain, result); // Remove submaximal blocks for each block list including fully contained blocks // Copy the containment blocks into the prefix/suffix lists oblSuffixFwd.insert(oblSuffixFwd.end(), oblFwdContain.begin(), oblFwdContain.end()); oblPrefixFwd.insert(oblPrefixFwd.end(), oblFwdContain.begin(), oblFwdContain.end()); oblSuffixRev.insert(oblSuffixRev.end(), oblRevContain.begin(), oblRevContain.end()); oblPrefixRev.insert(oblPrefixRev.end(), oblRevContain.begin(), oblRevContain.end()); // Perform the submaximal filter 
removeSubMaximalBlocks(&oblSuffixFwd, m_pBWT, m_pRevBWT); removeSubMaximalBlocks(&oblPrefixFwd, m_pBWT, m_pRevBWT); removeSubMaximalBlocks(&oblSuffixRev, m_pRevBWT, m_pBWT); removeSubMaximalBlocks(&oblPrefixRev, m_pRevBWT, m_pBWT); // Remove the contain blocks from the suffix/prefix lists removeContainmentBlocks(seq.length(), &oblSuffixFwd); removeContainmentBlocks(seq.length(), &oblPrefixFwd); removeContainmentBlocks(seq.length(), &oblSuffixRev); removeContainmentBlocks(seq.length(), &oblPrefixRev); // Join the suffix and prefix lists oblSuffixFwd.splice(oblSuffixFwd.end(), oblSuffixRev); oblPrefixFwd.splice(oblPrefixFwd.end(), oblPrefixRev); // Move the containments to the output list pOBOut->splice(pOBOut->end(), oblFwdContain); pOBOut->splice(pOBOut->end(), oblRevContain); // Filter out transitive overlap blocks if requested if(m_bIrreducible) { computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblSuffixFwd, pOBOut); computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &oblPrefixFwd, pOBOut); } else { pOBOut->splice(pOBOut->end(), oblSuffixFwd); pOBOut->splice(pOBOut->end(), oblPrefixFwd); } return result; }