// Find the interval pair corresponding to w using a cached intervals for short substrings BWTIntervalPair BWTAlgorithms::findIntervalPairWithCache(const BWT* pBWT, const BWT* pRevBWT, const BWTIntervalCache* pFwdCache, const BWTIntervalCache* pRevCache, const std::string& w) { size_t cacheLen = pFwdCache->getCachedLength(); if(w.size() < cacheLen) return findIntervalPair(pBWT, pRevBWT, w); // Compute the fwd and reverse interval using the cache for the last k bases BWTIntervalPair ip; int len = w.size(); int j = len - cacheLen; std::string ss = w.substr(j); std::string r_ss = reverse(ss); assert(ss.size() == cacheLen); ip.interval[0] = pFwdCache->lookup(ss.c_str()); ip.interval[1] = pRevCache->lookup(r_ss.c_str()); // Extend the interval to the full length of w as normal j -= 1; for(;j >= 0; --j) { updateBothL(ip, w[j], pBWT); if(!ip.isValid()) return ip; } return ip; }
// Find the intervals in pBWT/pRevBWT corresponding to w // If w does not exist in the BWT, the interval // coordinates [l, u] will be such that l > u BWTIntervalPair BWTAlgorithms::findIntervalPair(const BWT* pBWT, const BWT* pRevBWT, const std::string& w) { BWTIntervalPair intervals; int len = w.size(); int j = len - 1; char curr = w[j]; initIntervalPair(intervals, curr, pBWT, pRevBWT); --j; for(;j >= 0; --j) { curr = w[j]; updateBothL(intervals, curr, pBWT); if(!intervals.isValid()) return intervals; } return intervals; }
// Calculate the 1-base de Bruijn graph extensions of str // The includes the reverse complement AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensions(const std::string str, const BWT* pBWT, const BWT* pRevBWT, EdgeDir direction, const BWTIntervalCache* pFwdCache, const BWTIntervalCache* pRevCache) { size_t k = str.size(); size_t p = k - 1; std::string pmer; // In the sense direction, we extend from the 3' end if(direction == ED_SENSE) pmer = str.substr(1, p); else pmer = str.substr(0, p); assert(pmer.length() == p); std::string rc_pmer = reverseComplement(pmer); // Get the interval for the p-mer and its reverse complement BWTIntervalPair ip; BWTIntervalPair rc_ip; // If pointers to interval caches are available, use them // to speed up the initial calculation if(pFwdCache != NULL && pRevCache != NULL) { ip = BWTAlgorithms::findIntervalPairWithCache(pBWT, pRevBWT, pFwdCache, pRevCache, pmer); rc_ip = BWTAlgorithms::findIntervalPairWithCache(pBWT, pRevBWT, pFwdCache, pRevCache, rc_pmer); } else { ip = BWTAlgorithms::findIntervalPair(pBWT, pRevBWT, pmer); rc_ip = BWTAlgorithms::findIntervalPair(pBWT, pRevBWT, rc_pmer); } assert(ip.isValid() || rc_ip.isValid()); // Get the extension bases AlphaCount64 extensions; AlphaCount64 rc_extensions; // Calculate the interval to use to find the extensions. If extending in the sense // direction this is the reverse interval/reverse bwt for the forward bwt and the forward // interval for the reverse BWT. Vice-versa for anti-sense size_t fwdIdx; if(direction == ED_SENSE) fwdIdx = 1; else fwdIdx = 0; size_t revIdx = 1 - fwdIdx; const BWT* bwts[2]; bwts[0] = pBWT; bwts[1] = pRevBWT; if(ip.interval[fwdIdx].isValid()) extensions += BWTAlgorithms::getExtCount(ip.interval[fwdIdx], bwts[fwdIdx]); if(rc_ip.interval[revIdx].isValid()) rc_extensions = BWTAlgorithms::getExtCount(rc_ip.interval[revIdx], bwts[revIdx]); // Switch the reverse-complement extensions to the same strand as the str rc_extensions.complement(); extensions += rc_extensions; return extensions; }
// Calculate the ranges in pBWT that contain a prefix of at least minOverlap basepairs that // overlaps with a suffix of w. The ranges are added to the pOBList void OverlapAlgorithm::findOverlapBlocksExact(const std::string& w, const BWT* pBWT, const BWT* pRevBWT, const AlignFlags& af, int minOverlap, OverlapBlockList* pOverlapList, OverlapBlockList* pContainList, OverlapResult& result) const { // The algorithm is as follows: // We perform a backwards search using the FM-index for the string w. // As we perform the search we collect the intervals // of the significant prefixes (len >= minOverlap) that overlap w. BWTIntervalPair ranges; size_t l = w.length(); int start = l - 1; BWTAlgorithms::initIntervalPair(ranges, w[start], pBWT, pRevBWT); // Collect the OverlapBlocks for(size_t i = start - 1; i >= 1; --i) { // Compute the range of the suffix w[i, l] BWTAlgorithms::updateBothL(ranges, w[i], pBWT); int overlapLen = l - i; if(overlapLen >= minOverlap) { // Calculate which of the prefixes that match w[i, l] are terminal // These are the proper prefixes (they are the start of a read) BWTIntervalPair probe = ranges; BWTAlgorithms::updateBothL(probe, '$', pBWT); // The probe interval contains the range of proper prefixes if(probe.interval[1].isValid()) { assert(probe.interval[1].lower > 0); pOverlapList->push_back(OverlapBlock(probe, ranges, overlapLen, 0, af)); } } } // Determine if this sequence is contained and should not be processed further BWTAlgorithms::updateBothL(ranges, w[0], pBWT); // Ranges now holds the interval for the full-length read // To handle containments, we output the overlapBlock to the final overlap block list // and it will be processed later // Two possible containment cases: // 1) This read is a substring of some other read // 2) This read is identical to some other read // Case 1 is indicated by the existance of a non-$ left or right hand extension // In this case we return no alignments for the string AlphaCount64 left_ext = BWTAlgorithms::getExtCount(ranges.interval[0], pBWT); AlphaCount64 right_ext = BWTAlgorithms::getExtCount(ranges.interval[1], pRevBWT); if(left_ext.hasDNAChar() || right_ext.hasDNAChar()) { result.isSubstring = true; } else { BWTIntervalPair probe = ranges; BWTAlgorithms::updateBothL(probe, '$', pBWT); if(probe.isValid()) { // terminate the contained block and add it to the contained list BWTAlgorithms::updateBothR(probe, '$', pRevBWT); assert(probe.isValid()); pContainList->push_back(OverlapBlock(probe, ranges, w.length(), 0, af)); } } //OverlapBlockList containedWorkingList; //partitionBlockList(w.length(), &workingList, pOverlapList, &containedWorkingList); // Terminate the contained blocks //terminateContainedBlocks(containedWorkingList); // Move the contained blocks to the final contained list //pContainList->splice(pContainList->end(), containedWorkingList); return; }