Ejemplo n.º 1
0
		// Collect quasi-alignment hits for a single read using the k-mer hash
		// (rmi_->khash) and suffix array (rmi_->SA) of the index held by rmi_.
		//
		// Outline:
		//   1. Scan the read left-to-right for the first k-mer (containing no
		//      N/n and not a homopolymer) that is found in the hash in either
		//      the forward or the reverse-complement orientation.
		//   2. If a forward seed was recorded, keep walking the read, extending
		//      each hashed k-mer with saSearcher.extendSearchNaive and skipping
		//      ahead using the match length / saSearcher.lce.
		//   3. If the rc evidence is at least as strong as the fwd evidence
		//      (rcHit >= fwdHit), do the same walk over the read in reverse.
		//   4. With strictCheck, score every recorded k-mer on both strands and
		//      discard the SA intervals of the strictly lower-scoring strand.
		//   5. Convert the surviving SA intervals into QuasiAlignment entries.
		//
		// read        - the read sequence to map.
		// hits        - output; quasi-alignments are appended to this vector.
		// saSearcher  - provides extendSearchNaive() and lce().
		// mateStatus  - stored in hits created here and forwarded to
		//               rapmap::hit_manager::collectHitsSimpleSA.
		// strictCheck - enables the per-k-mer strand scoring of step 4.
		//
		// Returns false if step 1 found no seed k-mer; otherwise true. Note that
		// true is returned even when no entry ends up being appended to `hits`.
		bool operator()(std::string& read,
				std::vector<rapmap::utils::QuasiAlignment>& hits,
				SASearcher& saSearcher,
				rapmap::utils::MateStatus mateStatus,
				bool strictCheck = false) {

        using QuasiAlignment = rapmap::utils::QuasiAlignment;

        auto& txpStarts = rmi_->txpOffsets;
        auto& SA = rmi_->SA;
        auto& khash = rmi_->khash;
        // Step used to advance the query when a k-mer is not in the hash.
        uint32_t sampFactor{1};

        auto readLen = read.length();
        auto maxDist = 1.5 * readLen;
        auto k = rapmap::utils::my_mer::k();
        auto readStartIt = read.begin();
        auto readEndIt = read.end();

        // [rb, re) delimits the k-mer currently being examined.
        auto rb = read.begin();
        auto re = rb + k;
        int lbLeftFwd = 0, ubLeftFwd = 0;
        int lbLeftRC = 0, ubLeftRC = 0;
        int lbRightFwd = 0, ubRightFwd = 0;
        int lbRightRC = 0, ubRightRC = 0;
        int matchedLen = 0;

        // Counters of k-mers seen in the hash in each orientation.
        uint32_t fwdHit{0};
        uint32_t rcHit{0};

        bool foundHit = false;
        // Scratch k-mers; reassigned many times below.
        rapmap::utils::my_mer mer;
        rapmap::utils::my_mer rcMer;

        // The numeric values matter: they are summed to score each strand.
        enum HitStatus { ABSENT = -1, UNTESTED = 0, PRESENT = 1 };
        // Record if k-mers are hits in the
        // fwd direction, rc direction or both
        struct KmerDirScore {
            KmerDirScore(rapmap::utils::my_mer kmerIn, HitStatus fwdScoreIn, HitStatus rcScoreIn) :
                kmer(kmerIn), fwdScore(fwdScoreIn), rcScore(rcScoreIn) {}
            KmerDirScore() : fwdScore(UNTESTED), rcScore(UNTESTED) {}
            rapmap::utils::my_mer kmer;
            HitStatus fwdScore{UNTESTED};
            HitStatus rcScore{UNTESTED};
        };

        // This allows implementing our heuristic for comparing
        // forward and reverse-complement strand matches
        std::vector<KmerDirScore> kmerScores;

        using rapmap::utils::SAIntervalHit;

        std::vector<SAIntervalHit> fwdSAInts;
        std::vector<SAIntervalHit> rcSAInts;

        // SA intervals at least this wide are not recorded as hits.
        int maxInterval{1000};

        // The number of bases that a new query position (to which
        // we skipped) should overlap the previous extension. A
        // value of 0 means no overlap (the new search begins at the next
        // base) while a value of (k - 1) means that k-1 bases (one less than
        // the k-mer size) must overlap.
        int skipOverlap = k-1;
        // Number of nucleotides to skip when encountering a homopolymer k-mer.
        int homoPolymerSkip = k/2;

        // Find a hit within the read
        // While we haven't fallen off the end
        // NOTE(review): with the strict '<' the k-mer ending exactly at the
        // end of the read is never tried as the initial seed (so a read of
        // length k always returns false) -- confirm whether this is intended.
        while (re < read.end()) {

            // Get the k-mer at the current start position.
            // And make sure that it's valid (contains no Ns).
            auto pos = std::distance(readStartIt, rb);
            auto invalidPos = read.find_first_of("nN", pos);
            if (invalidPos < pos + k) {
                // Restart just past the offending base.
                rb = read.begin() + invalidPos + 1;
                re = rb + k;
                continue;
            }

            // If the next k-bases are valid, get the k-mer and
            // reverse complement k-mer
            mer = rapmap::utils::my_mer(read.c_str() + pos);
            if (mer.is_homopolymer()) { rb += homoPolymerSkip; re += homoPolymerSkip; continue; }
            rcMer = mer.get_reverse_complement();

            // See if we can find this k-mer in the hash
            auto merIt = khash.find(mer.get_bits(0, 2*k));
            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));

            // If we can find the k-mer in the hash, get its SA interval
            if (merIt != khash.end()) {
                int lb = merIt->second.begin;
                int ub = merIt->second.end;

                // lb must be 1 *less* than the current lb
                auto lbRestart = std::max(static_cast<int>(0), lb-1);
                // Extend the SA interval using the read sequence as far as
                // possible
                std::tie(lbLeftFwd, ubLeftFwd, matchedLen) =
                    saSearcher.extendSearchNaive(lbRestart, ub, k, rb, readEndIt);

                // If the SA interval is valid, and not too wide, then record
                // the hit.
                int diff = ubLeftFwd - lbLeftFwd;
                if (ubLeftFwd > lbLeftFwd and diff < maxInterval) {
                    auto queryStart = std::distance(read.begin(), rb);
                    fwdSAInts.emplace_back(lbLeftFwd, ubLeftFwd, matchedLen, queryStart, false);
                    if (strictCheck) {
                        ++fwdHit;
                        // If we also match this k-mer in the rc direction
                        if (rcMerIt != khash.end()) {
                            ++rcHit;
                            kmerScores.emplace_back(mer, PRESENT, PRESENT);
                        } else { // Otherwise it doesn't match in the rc direction
                            kmerScores.emplace_back(mer, PRESENT, ABSENT);
                        }

                        // If we didn't end the match b/c we exhausted the query
                        // test the mismatching k-mer in the other strand
                        // TODO: check for 'N'?
                        if (rb + matchedLen < readEndIt){
                            // The k-mer whose last base is the first base
                            // past the match.
                            auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
                            kmerScores.emplace_back(mer , ABSENT, UNTESTED);
                        }
                    } else { // no strict check
                        ++fwdHit;
                        if (rcMerIt != khash.end()) { ++rcHit; }
                    }
                }
            }

            // See if the reverse complement k-mer is in the hash
            if (rcMerIt != khash.end()) {
                lbLeftRC = rcMerIt->second.begin;
                ubLeftRC = rcMerIt->second.end;
                if (ubLeftRC > lbLeftRC) {
                    // The original k-mer didn't match in the forward direction
                    // (otherwise the rc hit was already counted above).
                    if (!fwdHit) {
                        ++rcHit;
                        if (strictCheck) {
                            kmerScores.emplace_back(rcMer, ABSENT, PRESENT);
                        }
                    }
                }
            }

            // If we had a hit with either k-mer then we can
            // break out of this loop to look for the next informative position
            if (fwdHit + rcHit > 0) {
                foundHit = true;
                break;
            }
            ++rb; ++re;
        }

        // If we went the entire length of the read without finding a hit
        // then we can bail.
        if (!foundHit) { return false; }

        // Set once the k-mer at the very end of the read has been scheduled;
        // the walk stops after that k-mer is processed.
        bool lastSearch{false};
        // If we had a hit on the forward strand
        if (fwdHit) {

            // The length of this match
            int32_t matchLen = fwdSAInts.front().len;

            // The iterator to where this match began
            rb = read.begin() + fwdSAInts.front().queryPos;

            // [lb, ub) is the suffix array interval for the MMP (maximum mappable prefix)
            // of the k-mer we found.  The NIP (next informative position) in the sequence
            // is the position after the LCE (longest common extension) of
            // T[SA[lb]:] and T[SA[ub-1]:]
            auto remainingLength = std::distance(rb + matchLen, readEndIt);
            int32_t lce = saSearcher.lce(
                    lbLeftFwd, ubLeftFwd-1, matchLen, remainingLength);
            auto fwdSkip = std::max(matchLen - skipOverlap, lce - skipOverlap);

            // Never start a k-mer past the last position where one still fits.
            size_t nextInformativePosition = std::min(
                    std::max(0, static_cast<int>(readLen)- static_cast<int>(k)),
                    static_cast<int>(std::distance(readStartIt, rb) + fwdSkip)
                    );

            rb = read.begin() + nextInformativePosition;
            re = rb + k;

            size_t invalidPos{0};
            while (re <= readEndIt) {
                // The offset into the string
                auto pos = std::distance(readStartIt, rb);

                // The position of the first N in the k-mer (if there is one)
                // If we have already verified there are no Ns in the remainder
                // of the string (invalidPos is std::string::npos) then we can
                // skip this test.
                if (invalidPos != std::string::npos) {
                    invalidPos = read.find_first_of("nN", pos);
                }

                // If the first N is within k bases, then this k-mer is invalid
                if (invalidPos < pos + k) {
                    // A valid k-mer can't start until after the 'N'
                    nextInformativePosition = invalidPos + 1;
                    rb = read.begin() + nextInformativePosition;
                    re = rb + k;
                    // Go to the next iteration of the while loop
                    continue;
                }

                // If the current end position is valid
                if (re <= readEndIt) {

                    mer = rapmap::utils::my_mer(read.c_str() + pos);
                    if (mer.is_homopolymer()) { rb += homoPolymerSkip; re = rb + k; continue; }
                    auto merIt = khash.find(mer.get_bits(0, 2*k));

                    if (merIt != khash.end()) {
                        if (strictCheck) {
                            ++fwdHit;
                            kmerScores.emplace_back(mer, PRESENT, UNTESTED);
                            auto rcMer = mer.get_reverse_complement();
                            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
                            if (rcMerIt != khash.end()) {
                                ++rcHit;
                                kmerScores.back().rcScore = PRESENT;
                            }
                        }

                        lbRightFwd = merIt->second.begin;
                        ubRightFwd = merIt->second.end;

                        // lb must be 1 *less* than the current lb
                        lbRightFwd = std::max(0, lbRightFwd - 1);
                        std::tie(lbRightFwd, ubRightFwd, matchedLen) =
                            saSearcher.extendSearchNaive(lbRightFwd, ubRightFwd,
                                    k, rb, readEndIt);

                        int diff = ubRightFwd - lbRightFwd;
                        if (ubRightFwd > lbRightFwd and diff < maxInterval) {
                            auto queryStart = std::distance(read.begin(), rb);
                            fwdSAInts.emplace_back(lbRightFwd, ubRightFwd, matchedLen, queryStart, false);
                            // If we didn't end the match b/c we exhausted the query
                            // test the mismatching k-mer in the other strand
                            // TODO: check for 'N'?
                            if (strictCheck and rb + matchedLen < readEndIt){
                                auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
                                mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
                                kmerScores.emplace_back(mer , ABSENT, UNTESTED);
                            }
                        }

                        if (lastSearch) { break; }
                        auto mismatchIt = rb + matchedLen;
                        if (mismatchIt < readEndIt) {
                            auto remainingDistance = std::distance(mismatchIt, readEndIt);
                            auto lce = saSearcher.lce(lbRightFwd, ubRightFwd-1, matchedLen, remainingDistance);

                            // Where we would jump if we just used the MMP
                            auto skipMatch = mismatchIt - skipOverlap;
                            // Where we would jump if we used the LCE
                            auto skipLCE = rb + lce - skipOverlap;
                            // Pick the larger of the two
                            rb = std::max(skipLCE, skipMatch);
                            // Clamp to the final k-mer of the read.
                            if (rb > (readEndIt - k)) {
                                rb = readEndIt - k;
                                lastSearch = true;
                            }
                            re = rb + k;
                        } else {
                            // The match ran to the end of the read; examine
                            // the final k-mer once and then stop.
                            lastSearch = true;
                            rb = readEndIt - k;
                            re = rb + k;
                        }

                    } else {
                        rb += sampFactor;
                        re = rb + k;
                    }
                }
            }
        }

        lastSearch = false;
        // Walk the read in reverse when the rc evidence is at least as strong
        // as the forward evidence.
        if (rcHit >= fwdHit) {
            size_t pos{read.length() - k};

            auto revReadEndIt = read.rend();

            // [revRB, revRE) delimits the current k-mer in reverse-iteration
            // order.
            auto revRB = read.rbegin();
            auto revRE = revRB + k;

            auto invalidPosIt = revRB;
            while (revRE <= revReadEndIt){

                revRE = revRB + k;
                if (revRE > revReadEndIt) { break; }

                // See if this k-mer would contain an N
                // only check if we don't yet know that there are no remaining
                // Ns
                if (invalidPosIt != revReadEndIt) {
                    invalidPosIt = std::find_if(revRB, revRE,
                                                 [](const char c) -> bool {
                                                     return c == 'n' or c == 'N';
                                                 });
                }

                // If we found an N before the end of the k-mer
                if (invalidPosIt < revRE) {
                    // Skip to the k-mer starting at the next position
                    // (i.e. right past the N)
                    revRB = invalidPosIt + 1;
                    continue;
                }

                // The distance from the beginning of the read to the
                // start of the k-mer
                pos = std::distance(revRE, revReadEndIt);

                // Get the k-mer and query it in the hash
                mer = rapmap::utils::my_mer(read.c_str() + pos);
                if (mer.is_homopolymer()) { revRB += homoPolymerSkip; revRE += homoPolymerSkip; continue; }
                rcMer = mer.get_reverse_complement();
                auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));

                // If we found the k-mer
                if (rcMerIt != khash.end()) {
                    if (strictCheck) {
                        ++rcHit;
                        kmerScores.emplace_back(mer, UNTESTED, PRESENT);
                        auto merIt = khash.find(mer.get_bits(0, 2*k));
                        if (merIt != khash.end()) {
                            ++fwdHit;
                            kmerScores.back().fwdScore = PRESENT;
                        }
                    }


                    lbRightRC = rcMerIt->second.begin;
                    ubRightRC = rcMerIt->second.end;

                    // lb must be 1 *less* than the current lb
                    // We can't move any further in the reverse complement direction
                    lbRightRC = std::max(0, lbRightRC - 1);
                    std::tie(lbRightRC, ubRightRC, matchedLen) =
                        saSearcher.extendSearchNaive(lbRightRC, ubRightRC, k,
                                revRB, revReadEndIt, true);

                    int diff = ubRightRC - lbRightRC;
                    if (ubRightRC > lbRightRC and diff < maxInterval) {
                        auto queryStart = std::distance(read.rbegin(), revRB);
                        rcSAInts.emplace_back(lbRightRC, ubRightRC, matchedLen, queryStart, true);
                        // If we didn't end the match b/c we exhausted the query
                        // test the mismatching k-mer in the other strand
                        // TODO: check for 'N'?
                        if (strictCheck and revRB + matchedLen < revReadEndIt){
                            auto kmerPos = std::distance(revRB + matchedLen, revReadEndIt);
                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
                            kmerScores.emplace_back(mer , UNTESTED, ABSENT);
                        }
                    }

                    if (lastSearch) { break; }
                    auto mismatchIt = revRB + matchedLen;
                    if (mismatchIt < revReadEndIt) {
                        auto remainingDistance =
                            std::distance(mismatchIt, revReadEndIt);
                        auto lce = saSearcher.lce(lbRightRC, ubRightRC-1,
                                                  matchedLen, remainingDistance);

                        // Where we would jump if we just used the MMP
                        auto skipMatch = mismatchIt - skipOverlap;
                        // Where we would jump if we used the lce
                        auto skipLCE = revRB + lce - skipOverlap;
                        // Choose the larger of the two
                        revRB = std::max(skipLCE, skipMatch);
                        // Clamp to the final k-mer in reverse order.
                        if (revRB > (revReadEndIt - k)) {
                            revRB = revReadEndIt - k;
                            lastSearch = true;
                        }
                        revRE = revRB + k;

                    } else {
                        lastSearch = true;
                        revRB = revReadEndIt - k;
                        revRE = revRB + k;
                    }

                } else {
                    revRB += sampFactor;
                    revRE = revRB + k;
                }
            }
        }


        if (strictCheck) {
            // The first two conditions shouldn't happen
            // but I'm just being paranoid here
            if (fwdHit > 0 and rcHit == 0) {
                rcSAInts.clear();
            } else if (rcHit > 0 and fwdHit == 0) {
                fwdSAInts.clear();
            } else {
                // Compute the score for the k-mers we need to
                // test in both the forward and rc directions.
                int32_t fwdScore{0};
                int32_t rcScore{0};
                // For every kmer score structure
                for (auto& kms : kmerScores) {
                    // If the forward k-mer is untested, then test it.
                    // The lookup must use the k-mer stored in this entry; the
                    // scratch variable `mer` only holds whichever k-mer was
                    // examined last.
                    if (kms.fwdScore == UNTESTED) {
                        auto merIt = khash.find(kms.kmer.get_bits(0, 2*k));
                        kms.fwdScore = (merIt != khash.end()) ? PRESENT : ABSENT;
                    }
                    // accumulate the score
                    fwdScore += kms.fwdScore;

                    // If the rc k-mer is untested, then test it
                    if (kms.rcScore == UNTESTED) {
                        rcMer = kms.kmer.get_reverse_complement();
                        auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
                        kms.rcScore = (rcMerIt != khash.end()) ? PRESENT : ABSENT;
                    }
                    // accumulate the score
                    rcScore += kms.rcScore;
                }
                // If the forward score is strictly greater
                // then get rid of the rc hits.
                if (fwdScore > rcScore) {
                    rcSAInts.clear();
                } else if (rcScore > fwdScore) {
                    // If the rc score is strictly greater
                    // get rid of the forward hits
                    fwdSAInts.clear();
                }
                // On a tie both sets of intervals are kept.
            }
        }

        /* VERY SIMPLE HEURISTIC */
        /*
           if (fwdHit > rcHit) {
           rcSAInts.clear();
           } else if (rcHit > fwdHit) {
           fwdSAInts.clear();
           }
           */

        auto fwdHitsStart = hits.size();
        // If we had > 1 forward hit
        if (fwdSAInts.size() > 1) {
            auto processedHits = rapmap::hit_manager::intersectSAHits(fwdSAInts, *rmi_);
            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
        } else if (fwdSAInts.size() == 1) { // only 1 hit!
            auto& saIntervalHit = fwdSAInts.front();
            auto initialSize = hits.size();
            // One candidate per suffix in the interval.
            for (int i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
                auto globalPos = SA[i];
                auto txpID = rmi_->transcriptAtPosition(globalPos);
                // the offset into this transcript
                auto pos = globalPos - txpStarts[txpID];
                hits.emplace_back(txpID, pos, true, readLen);
                hits.back().mateStatus = mateStatus;
            }
            // Now sort by transcript ID (then position) and eliminate
            // duplicates; the entry with the smallest position is kept for
            // each transcript.
            auto sortStartIt = hits.begin() + initialSize;
            auto sortEndIt = hits.end();
            std::sort(sortStartIt, sortEndIt,
                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    if (a.tid == b.tid) {
                    return a.pos < b.pos;
                    } else {
                    return a.tid < b.tid;
                    }
                    });
            auto newEnd = std::unique(hits.begin() + initialSize, hits.end(),
                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid == b.tid;
                    });
            hits.resize(std::distance(hits.begin(), newEnd));
        }
        auto fwdHitsEnd = hits.size();

        auto rcHitsStart = fwdHitsEnd;
        // If we had > 1 rc hit
        if (rcSAInts.size() > 1) {
            auto processedHits = rapmap::hit_manager::intersectSAHits(rcSAInts, *rmi_);
            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
        } else if (rcSAInts.size() == 1) { // only 1 hit!
            auto& saIntervalHit = rcSAInts.front();
            for (int i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
                auto globalPos = SA[i];
                auto txpID = rmi_->transcriptAtPosition(globalPos);
                // the offset into this transcript
                auto pos = globalPos - txpStarts[txpID];
                hits.emplace_back(txpID, pos, false, readLen);
                hits.back().mateStatus = mateStatus;
            }
            // Now sort by transcript ID (then position) and eliminate
            // duplicates
            auto sortStartIt = hits.begin() + rcHitsStart;
            auto sortEndIt = hits.end();
            std::sort(sortStartIt, sortEndIt,
                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    if (a.tid == b.tid) {
                    return a.pos < b.pos;
                    } else {
                    return a.tid < b.tid;
                    }
                    });
            auto newEnd = std::unique(sortStartIt, sortEndIt,
                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid == b.tid;
                    });
            hits.resize(std::distance(hits.begin(), newEnd));
        }
        auto rcHitsEnd = hits.size();

        // If we had both forward and RC hits, then merge them
        if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
            // Merge the forward and reverse hits
            // NOTE(review): inplace_merge requires both sub-ranges to be
            // ordered by tid; this holds for the single-interval paths above
            // and is assumed for collectHitsSimpleSA -- confirm.
            std::inplace_merge(hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd, hits.begin() + rcHitsEnd,
                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid < b.tid;
                    });
            // And get rid of duplicate transcript IDs
            auto newEnd = std::unique(hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid == b.tid;
                    });
            hits.resize(std::distance(hits.begin(), newEnd));
        }
        // Report whether a seed k-mer was found in the read (this does not
        // guarantee that any entry was appended to `hits`).
        return foundHit;
        }
Ejemplo n.º 2
0
    bool operator()(std::string& read,
            std::vector<rapmap::utils::QuasiAlignment>& hits,
            SASearcher& saSearcher,
            rapmap::utils::MateStatus mateStatus) {

        using QuasiAlignment = rapmap::utils::QuasiAlignment;
        using MateStatus = rapmap::utils::MateStatus;

        //auto& posIDs = rmi_->positionIDs;
        auto& rankDict = rmi_->rankDict;
        auto& txpStarts = rmi_->txpOffsets;
        auto& SA = rmi_->SA;
        auto& khash = rmi_->khash;
        auto& text = rmi_->seq;
        uint32_t sampFactor{1};
        auto salen = SA.size();

        auto readLen = read.length();
        auto maxDist = 1.5 * readLen;
        auto k = rapmap::utils::my_mer::k();
        auto readStartIt = read.begin();
        auto readEndIt = read.end();

        auto readRevStartIt = read.rbegin();
        auto readRevEndIt = read.rend();

        auto rb = read.begin();
        auto re = rb + k;
        int lbLeftFwd = 0, ubLeftFwd = 0;
        int lbLeftRC = 0, ubLeftRC = 0;
        int lbRightFwd = 0, ubRightFwd = 0;
        int lbRightRC = 0, ubRightRC = 0;
        int matchedLen;

        bool leftFwdHit = false;
        bool leftRCHit = false;
        bool leftHit = false;
        bool rightFwdHit = false;
        bool rightRCHit = false;
        bool rightHit = false;
        bool isRev = false;
        rapmap::utils::my_mer mer;
        rapmap::utils::my_mer rcMer;

        using rapmap::utils::SAIntervalHit;

        std::vector<SAIntervalHit> fwdSAInts;
        std::vector<SAIntervalHit> rcSAInts;

        std::vector<uint32_t> leftTxps, leftTxpsRC;
        std::vector<uint32_t> rightTxps, rightTxpsRC;
        int maxInterval{1000};

        // The number of bases that a new query position (to which
        // we skipped) should overlap the previous extension. A
        // value of 0 means no overlap (the new search begins at the next
        // base) while a value of (k - 1) means that k-1 bases (one less than
        // the k-mer size) must overlap.
        int skipOverlap = k-1;
        // Number of nucleotides to skip when encountering a homopolymer k-mer.
        int homoPolymerSkip = k/2;

        // Find a hit within the read
        // While we haven't fallen off the end
        while (re < read.end()) {

            // Get the k-mer at the current start position.
            // And make sure that it's valid (contains no Ns).
            auto pos = std::distance(readStartIt, rb);
            auto invalidPos = read.find_first_of("nN", pos);
            if (invalidPos <= pos + k) {
                rb = read.begin() + invalidPos + 1;
                re = rb + k;
                continue;
            }

            // If the next k-bases are valid, get the k-mer and
            // reverse complement k-mer
            mer = rapmap::utils::my_mer(read.c_str() + pos);
            if (mer.is_homopolymer()) { rb += homoPolymerSkip; re += homoPolymerSkip; continue; }
            rcMer = mer.get_reverse_complement();

            // See if we can find this k-mer in the hash
            auto merIt = khash.find(mer.get_bits(0, 2*k));
            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));

            // If we can find the k-mer in the hash, get its SA interval
            if (merIt != khash.end()) {
                int lb = merIt->second.begin;
                int ub = merIt->second.end;

                // lb must be 1 *less* then the current lb
                auto lbRestart = std::max(static_cast<int>(0), lb-1);
                // Extend the SA interval using the read sequence as far as
                // possible
                std::tie(lbLeftFwd, ubLeftFwd, matchedLen) =
                    saSearcher.extendSearchNaive(lbRestart, ub, k, rb, readEndIt);
                /*
                   if (read.substr(pos, matchedLen) == text.substr(SA[lbLeftFwd - 1], matchedLen)) {
                   std::cerr << "INTERVAL LOOKS WRONG!\n";
                   std::cerr << "Start interval - 2 = " << text.substr(SA[lbLeftFwd - 2], matchedLen) << '\n';
                   std::cerr << "Start interval - 1 = " << text.substr(SA[lbLeftFwd - 1], matchedLen) << '\n';
                   std::cerr << "Interval start     = " << text.substr(SA[lbLeftFwd], matchedLen) << '\n';
                   }

                   if (read.substr(pos, matchedLen) == text.substr(SA[ubLeftFwd], matchedLen)) {
                   std::cerr << "INTERVAL LOOKS WRONG!\n";
                   std::cerr << "kmer                = " << mer << '\n';
                   std::cerr << "read substring      = " << read.substr(pos, matchedLen) << '\n';
                   std::cerr << "Before interval end = " << text.substr(SA[ubLeftFwd - 1], matchedLen) << '\n';
                   std::cerr << "Interval end        = " << text.substr(SA[ubLeftFwd], matchedLen) << '\n';
                   std::cerr << "Interval end + 1    = " << text.substr(SA[ubLeftFwd + 1], matchedLen) << '\n';
                   std::exit(1);
                   }
                //matchedLen = k;
                //lbLeftFwd = lb; ubLeftFwd = ub;
                */

                // If the SA interval is valid, and not too wide, then record
                // the hit.
                int diff = ubLeftFwd - lbLeftFwd;
                if (ubLeftFwd > lbLeftFwd and diff < maxInterval) {
                    auto queryStart = std::distance(read.begin(), rb);
                    fwdSAInts.emplace_back(lbLeftFwd, ubLeftFwd, matchedLen, queryStart, false);
                    leftFwdHit = true;
                }
            }

            // See if the reverse complement k-mer is in the hash
            if (rcMerIt != khash.end()) {
                lbLeftRC = rcMerIt->second.begin;
                ubLeftRC = rcMerIt->second.end;
                int diff = ubLeftRC - lbLeftRC;
                if (ubLeftRC > lbLeftRC and diff < maxInterval) {
                    /* Don't actually push here since we can't extend
                    auto queryStart = std::distance(read.begin(), rb);
                    rcSAInts.emplace_back(lbLeftRC, ubLeftRC, k, queryStart, true);
                    */
                    leftRCHit = true;
                }
            }

            // If we had a hit with either k-mer then we can
            // break out of this loop to look for the next informative position
            if (leftFwdHit or leftRCHit) {
                leftHit = true;
                break;
            }
            ++rb; ++re;
        }

        // If we went the entire length of the read without finding a hit
        // then we can bail.
        if (!leftHit) { return false; }

        // If we had a hit on the forward strand
        if (leftFwdHit) {

            // The length of this match
            auto matchLen = fwdSAInts.front().len;
            // The iterator to where this match began
            rb = read.begin() + fwdSAInts.front().queryPos;

            // [lb, ub) is the suffix array interval for the MMP (maximum mappable prefix)
            // of the k-mer we found.  The NIP (next informative position) in the sequence
            // is the position after the LCE (longest common extension) of
            // T[SA[lb]:] and T[SA[ub-1]:]
            auto remainingLength = std::distance(rb + matchLen, readEndIt);
            auto lce = saSearcher.lce(lbLeftFwd, ubLeftFwd-1, matchLen, remainingLength);
            //auto fwdSkip = matchLen + 1 - skipOverlap;//lce - skipOverlap;
            auto fwdSkip = std::max(matchLen + 1 - skipOverlap, lce - skipOverlap);

            size_t nextInformativePosition = std::min(
                    std::max(0, static_cast<int>(readLen)- static_cast<int>(k)),
                    static_cast<int>(std::distance(readStartIt, rb) + fwdSkip)
                    );

            rb = read.begin() + nextInformativePosition;
            re = rb + k;

            size_t invalidPos{0};
            while (re <= readEndIt) {
                // The offset into the string
                auto pos = std::distance(readStartIt, rb);

                // The position of the first N in the k-mer (if there is one)
                // If we have already verified there are no Ns in the remainder
                // of the string (invalidPos is std::string::npos) then we can
                // skip this test.
                if (invalidPos != std::string::npos) {
                    invalidPos = read.find_first_of("nN", pos);
                }

                // If the first N is within k bases, then this k-mer is invalid
                if (invalidPos < pos + k) {
                    // A valid k-mer can't start until after the 'N'
                    nextInformativePosition = invalidPos + 1;
                    rb = read.begin() + nextInformativePosition;
                    re = rb + k;
                    // Go to the next iteration of the while loop
                    continue;
                }

                // If the current end position is valid
                if (re <= readEndIt) {

                    mer = rapmap::utils::my_mer(read.c_str() + pos);
                    if (mer.is_homopolymer()) { rb += homoPolymerSkip; re = rb + k; continue; }
                    auto merIt = khash.find(mer.get_bits(0, 2*k));

                    if (merIt != khash.end()) {
                        /*
                        if (!leftRCHit) {
                            auto rcMer = mer.get_reverse_complement();
                            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
                            if (rcMerIt != khash.end()) { leftRCHit = true; }
                        }
                        */

                        lbRightFwd = merIt->second.begin;
                        ubRightFwd = merIt->second.end;

                        // lb must be 1 *less* than the current lb
                        lbRightFwd = std::max(0, lbRightFwd - 1);
                        std::tie(lbRightFwd, ubRightFwd, matchedLen) =
                            saSearcher.extendSearchNaive(lbRightFwd, ubRightFwd,
                                    k, rb, readEndIt);

                        int diff = ubRightFwd - lbRightFwd;
                        if (ubRightFwd > lbRightFwd and diff < maxInterval) {
                            auto queryStart = std::distance(read.begin(), rb);
                            fwdSAInts.emplace_back(lbRightFwd, ubRightFwd, matchedLen, queryStart, false);
                            rightFwdHit = true;
                        }

                        //fwdSkip = matchedLen - skipOverlap;

                        auto mismatchIt = rb + matchedLen;
                        if ((mismatchIt + 1 - skipOverlap) <= readEndIt) {
                            auto remainingDistance = std::distance(mismatchIt, readEndIt);
                            auto lce = saSearcher.lce(lbRightFwd, ubRightFwd-1, matchedLen, remainingDistance);
                            //rb += lce - skipOverlap;
                            //rb += std::max(static_cast<uint32_t>(fwdSkip), lce - skipOverlap);

                            // Where we would jump if we just used the MMP
                            auto skipMatch = mismatchIt + 1 - skipOverlap;
                            // Where we would jump if we used the LCE
                            auto skipLCE = rb + lce - skipOverlap;
                            // Pick the larger of the two
                            rb = std::max(skipLCE, skipMatch);
                            re = rb + k;
                        } else {
                            rb = mismatchIt + 1 - skipOverlap;
                            re = rb + k;
                        }

                    } else {
                        rb += sampFactor;
                        re = rb + k;
                    }
                }
            }
        }

        if (leftRCHit) {
            size_t pos{read.length() - k};

            auto revReadEndIt = read.rend();

            auto revRB = read.rbegin();
            auto revRE = revRB + k;

            auto invalidPosIt = revRB;
            while (revRE <= revReadEndIt){

                revRE = revRB + k;
                if (revRE > revReadEndIt) { break; }

                // See if this k-mer would contain an N
                // only check if we don't yet know that there are no remaining
                // Ns
                if (invalidPosIt != revReadEndIt) {
                    invalidPosIt = std::find_if(revRB, revRE,
                                                 [](const char c) -> bool {
                                                     return c == 'n' or c == 'N';
                                                 });
                }

                // If we found an N before the end of the k-mer
                if (invalidPosIt < revRE) {
                    // Skip to the k-mer starting at the next position
                    // (i.e. right past the N)
                    revRB = invalidPosIt + 1;
                    continue;
                }

                // The distance from the beginning of the read to the
                // start of the k-mer
                pos = std::distance(revRE, revReadEndIt);

                // Get the k-mer and query it in the hash
                mer = rapmap::utils::my_mer(read.c_str() + pos);
                if (mer.is_homopolymer()) { revRB += homoPolymerSkip; revRE += homoPolymerSkip; continue; }
                rcMer = mer.get_reverse_complement();
                auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));

                // If we found the k-mer
                if (rcMerIt != khash.end()) {
                    lbRightRC = rcMerIt->second.begin;
                    ubRightRC = rcMerIt->second.end;

                    // lb must be 1 *less* than the current lb
                    // We can't move any further in the reverse complement direction
                    lbRightRC = std::max(0, lbRightRC - 1);
                    std::tie(lbRightRC, ubRightRC, matchedLen) =
                        saSearcher.extendSearchNaive(lbRightRC, ubRightRC, k,
                                revRB, revReadEndIt, true);

                    int diff = ubRightRC - lbRightRC;
                    if (ubRightRC > lbRightRC and diff < maxInterval) {
                        auto queryStart = std::distance(revRB + matchedLen, revReadEndIt);
                        rcSAInts.emplace_back(lbRightRC, ubRightRC, matchedLen, queryStart, true);
                        rightRCHit = true;
                    }

                    //auto fwdSkip = matchedLen - skipOverlap;
                    //auto fwdSkip = matchedLen + 1;
                    auto mismatchIt = revRB + matchedLen;

                    if ((mismatchIt + 1 - skipOverlap) <= revReadEndIt) {
                    //if (revRE <= revReadEndIt) {
                        auto remainingDistance = std::distance(mismatchIt, revReadEndIt);
                        auto lce = saSearcher.lce(lbRightRC, ubRightRC-1, matchedLen, remainingDistance);

                        // Where we would jump if we just used the MMP
                        auto skipMatch = mismatchIt + 1 - skipOverlap;
                        // Where we would jump if we used the lce
                        auto skipLCE = revRB + lce - skipOverlap;
                        // Choose the larger of the two
                        revRB = std::max(skipLCE, skipMatch);
                        revRE = revRB + k;
                    } else {
                        revRB = mismatchIt + 1 - skipOverlap;
                        revRE = revRB + k;
                    }

                } else {
                    revRB += sampFactor;
                    revRE = revRB + k;
                }
            }
        }

        auto fwdHitsStart = hits.size();
        // If we had > 1 forward hit
        if (fwdSAInts.size() > 1) {
                auto processedHits = rapmap::hit_manager::intersectSAHits(fwdSAInts, *rmi_);
                rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
        } else if (fwdSAInts.size() == 1) { // only 1 hit!
                auto& saIntervalHit = fwdSAInts.front();
                auto initialSize = hits.size();
                for (int i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
                        auto globalPos = SA[i];
                        //auto txpID = posIDs[globalPos];
            			//auto txpID = rankDict.Rank(globalPos, 1);
		            	auto txpID = rmi_->transcriptAtPosition(globalPos);
                        // the offset into this transcript
                        auto pos = globalPos - txpStarts[txpID];
                        hits.emplace_back(txpID, pos, true, readLen);
                        hits.back().mateStatus = mateStatus;
                }
                // Now sort by transcript ID (then position) and eliminate
                // duplicates
                auto sortStartIt = hits.begin() + initialSize;
                auto sortEndIt = hits.end();
                std::sort(sortStartIt, sortEndIt,
                                [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                                if (a.tid == b.tid) {
                                return a.pos < b.pos;
                                } else {
                                return a.tid < b.tid;
                                }
                                });
                auto newEnd = std::unique(hits.begin() + initialSize, hits.end(),
                                [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                                return a.tid == b.tid;
                                });
                hits.resize(std::distance(hits.begin(), newEnd));
        }
        auto fwdHitsEnd = hits.size();

        auto rcHitsStart = fwdHitsEnd;
        // If we had > 1 rc hit
        if (rcSAInts.size() > 1) {
            auto processedHits = rapmap::hit_manager::intersectSAHits(rcSAInts, *rmi_);
            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
        } else if (rcSAInts.size() == 1) { // only 1 hit!
            auto& saIntervalHit = rcSAInts.front();
            auto initialSize = hits.size();
            for (int i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
                auto globalPos = SA[i];
                //auto txpID = posIDs[globalPos];
		//auto txpID = rankDict.Rank(globalPos, 1);
		auto txpID = rmi_->transcriptAtPosition(globalPos);
                // the offset into this transcript
                auto pos = globalPos - txpStarts[txpID];
                hits.emplace_back(txpID, pos, false, readLen);
                hits.back().mateStatus = mateStatus;
            }
            // Now sort by transcript ID (then position) and eliminate
            // duplicates
            auto sortStartIt = hits.begin() + rcHitsStart;
            auto sortEndIt = hits.end();
            std::sort(sortStartIt, sortEndIt,
                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    if (a.tid == b.tid) {
                    return a.pos < b.pos;
                    } else {
                    return a.tid < b.tid;
                    }
                    });
            auto newEnd = std::unique(sortStartIt, sortEndIt,
                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid == b.tid;
                    });
            hits.resize(std::distance(hits.begin(), newEnd));
        }
        auto rcHitsEnd = hits.size();

        // If we had both forward and RC hits, then merge them
        if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
            // Merge the forward and reverse hits
            std::inplace_merge(hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd, hits.begin() + rcHitsEnd,
                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid < b.tid;
                    });
            // And get rid of duplicate transcript IDs
            auto newEnd = std::unique(hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
                    return a.tid == b.tid;
                    });
            hits.resize(std::distance(hits.begin(), newEnd));
        }
        // Return true if we had any valid hits and false otherwise.
        return (rcHitsEnd > fwdHitsStart);
    }