Ejemplo n.º 1
0
// Find the interval pair corresponding to w, using cached intervals for the
// trailing cached-length substring of w.
// The forward and reverse intervals of the last cacheLen characters are read
// from pFwdCache/pRevCache, then the remaining characters are prepended one
// at a time. Strings shorter than the cached length are handled by
// findIntervalPair instead. The search stops as soon as the pair becomes invalid.
BWTIntervalPair BWTAlgorithms::findIntervalPairWithCache(const BWT* pBWT,
                                                         const BWT* pRevBWT,
                                                         const BWTIntervalCache* pFwdCache,
                                                         const BWTIntervalCache* pRevCache,
                                                         const std::string& w)
{
    const size_t cachedLength = pFwdCache->getCachedLength();

    // Too short for the cache to apply: fall back to the plain search
    if(w.size() < cachedLength)
        return findIntervalPair(pBWT, pRevBWT, w);

    // Split off the suffix whose intervals are looked up in the caches
    const int suffixStart = w.size() - cachedLength;
    std::string suffix = w.substr(suffixStart);
    std::string revSuffix = reverse(suffix);
    assert(suffix.size() == cachedLength);

    BWTIntervalPair result;
    result.interval[0] = pFwdCache->lookup(suffix.c_str());
    result.interval[1] = pRevCache->lookup(revSuffix.c_str());

    // Extend leftwards over the rest of w, right-to-left
    int pos = suffixStart - 1;
    while(pos >= 0)
    {
        updateBothL(result, w[pos], pBWT);
        if(!result.isValid())
            break;
        --pos;
    }
    return result;
}
Ejemplo n.º 2
0
// Find the intervals in pBWT/pRevBWT corresponding to w.
// The search is seeded with the last character of w and the remaining
// characters are prepended right-to-left.
// If w does not exist in the BWT, the interval
// coordinates [l, u] will be such that l > u.
// Precondition: w must not be empty.
BWTIntervalPair BWTAlgorithms::findIntervalPair(const BWT* pBWT, const BWT* pRevBWT, const std::string& w)
{
    // An empty query has no last character to seed the search with; without
    // this check the index below would be -1 and w[j] would read out of bounds.
    assert(!w.empty());

    BWTIntervalPair intervals;
    int j = static_cast<int>(w.size()) - 1;
    initIntervalPair(intervals, w[j], pBWT, pRevBWT);

    // Prepend the remaining characters, stopping once the pair is no longer valid
    for(--j; j >= 0; --j)
    {
        updateBothL(intervals, w[j], pBWT);
        if(!intervals.isValid())
            break;
    }
    return intervals;
}
Ejemplo n.º 3
0
// Calculate the 1-base de Bruijn graph extensions of str in the given direction.
// This includes the extensions of the reverse complement, which are
// complemented back onto the strand of str before being added to the result.
// If both pFwdCache and pRevCache are non-NULL they are used to speed up the
// initial interval lookup; otherwise the intervals are computed directly.
AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensions(const std::string str,
                                                        const BWT* pBWT,
                                                        const BWT* pRevBWT,
                                                        EdgeDir direction,
                                                        const BWTIntervalCache* pFwdCache,
                                                        const BWTIntervalCache* pRevCache)
{
    const bool isSense = (direction == ED_SENSE);
    const size_t pmerLen = str.size() - 1;

    // In the sense direction we extend from the 3' end, so the p-mer drops the
    // first base of str; in the antisense direction it drops the last base
    std::string pmer;
    pmer = str.substr(isSense ? 1 : 0, pmerLen);
    assert(pmer.length() == pmerLen);
    std::string rc_pmer = reverseComplement(pmer);

    // Intervals for the p-mer and its reverse complement
    BWTIntervalPair ip;
    BWTIntervalPair rc_ip;
    if(pFwdCache == NULL || pRevCache == NULL)
    {
        // No usable cache: search from scratch
        ip = BWTAlgorithms::findIntervalPair(pBWT, pRevBWT, pmer);
        rc_ip = BWTAlgorithms::findIntervalPair(pBWT, pRevBWT, rc_pmer);
    }
    else
    {
        // Use the cached intervals to speed up the initial calculation
        ip = BWTAlgorithms::findIntervalPairWithCache(pBWT, pRevBWT, pFwdCache, pRevCache, pmer);
        rc_ip = BWTAlgorithms::findIntervalPairWithCache(pBWT, pRevBWT, pFwdCache, pRevCache, rc_pmer);
    }

    assert(ip.isValid() || rc_ip.isValid());

    // Select which interval/BWT to query for the extensions. For a sense
    // extension the p-mer uses its reverse interval with the reverse BWT and
    // the reverse complement uses its forward interval with the forward BWT.
    // The roles are swapped for antisense.
    const size_t fwdIdx = isSense ? 1 : 0;
    const size_t revIdx = 1 - fwdIdx;
    const BWT* bwts[2] = { pBWT, pRevBWT };

    AlphaCount64 extensions;
    if(ip.interval[fwdIdx].isValid())
        extensions += BWTAlgorithms::getExtCount(ip.interval[fwdIdx], bwts[fwdIdx]);

    AlphaCount64 rc_extensions;
    if(rc_ip.interval[revIdx].isValid())
        rc_extensions = BWTAlgorithms::getExtCount(rc_ip.interval[revIdx], bwts[revIdx]);

    // Switch the reverse-complement extensions to the same strand as str
    rc_extensions.complement();
    extensions += rc_extensions;
    return extensions;
}
Ejemplo n.º 4
0
// Calculate the ranges in pBWT that contain a prefix of at least minOverlap basepairs that
// overlaps with a suffix of w. The ranges are appended to pOverlapList.
// After the full-length interval of w is computed, containment is checked:
//  - if w has a non-$ left or right extension, result.isSubstring is set to true;
//  - otherwise, if a $-terminated interval for w exists, a block covering the
//    whole of w is appended to pContainList.
// An empty w produces no blocks and leaves result untouched.
void OverlapAlgorithm::findOverlapBlocksExact(const std::string& w, const BWT* pBWT,
                                              const BWT* pRevBWT, const AlignFlags& af, int minOverlap,
                                              OverlapBlockList* pOverlapList, OverlapBlockList* pContainList, 
                                              OverlapResult& result) const
{
    // The algorithm is as follows:
    // We perform a backwards search using the FM-index for the string w.
    // As we perform the search we collect the intervals 
    // of the significant prefixes (len >= minOverlap) that overlap w.
    const int readLen = static_cast<int>(w.length());

    // Nothing to search for; also avoids indexing w[-1] below
    if(readLen == 0)
        return;

    BWTIntervalPair ranges;
    BWTAlgorithms::initIntervalPair(ranges, w[readLen - 1], pBWT, pRevBWT);
    
    // Collect the OverlapBlocks.
    // The index is signed on purpose: with an unsigned index a read of length 1
    // would start at (size_t)-1 and walk far outside of w.
    for(int i = readLen - 2; i >= 1; --i)
    {
        // Compute the range of the suffix w[i, l]
        BWTAlgorithms::updateBothL(ranges, w[i], pBWT);
        const int overlapLen = readLen - i;
        if(overlapLen >= minOverlap)
        {
            // Calculate which of the prefixes that match w[i, l] are terminal
            // These are the proper prefixes (they are the start of a read)
            BWTIntervalPair probe = ranges;
            BWTAlgorithms::updateBothL(probe, '$', pBWT);
            
            // The probe interval contains the range of proper prefixes
            if(probe.interval[1].isValid())
            {
                assert(probe.interval[1].lower > 0);
                pOverlapList->push_back(OverlapBlock(probe, ranges, overlapLen, 0, af));
            }
        }
    }

    // Determine if this sequence is contained and should not be processed further.
    // For a single-base read w[0] already seeded the search, so it must not be
    // prepended a second time.
    if(readLen > 1)
        BWTAlgorithms::updateBothL(ranges, w[0], pBWT);

    // Ranges now holds the interval for the full-length read
    // To handle containments, we output the overlapBlock to the final overlap block list
    // and it will be processed later
    // Two possible containment cases:
    // 1) This read is a substring of some other read
    // 2) This read is identical to some other read
    
    // Case 1 is indicated by the existence of a non-$ left or right hand extension
    // In this case we return no alignments for the string
    AlphaCount64 left_ext = BWTAlgorithms::getExtCount(ranges.interval[0], pBWT);
    AlphaCount64 right_ext = BWTAlgorithms::getExtCount(ranges.interval[1], pRevBWT);
    if(left_ext.hasDNAChar() || right_ext.hasDNAChar())
    {
        result.isSubstring = true;
    }
    else
    {
        BWTIntervalPair probe = ranges;
        BWTAlgorithms::updateBothL(probe, '$', pBWT);
        if(probe.isValid())
        {
            // terminate the contained block and add it to the contained list
            BWTAlgorithms::updateBothR(probe, '$', pRevBWT);
            assert(probe.isValid());
            pContainList->push_back(OverlapBlock(probe, ranges, w.length(), 0, af));
        }
    }

    //OverlapBlockList containedWorkingList;
    //partitionBlockList(w.length(), &workingList, pOverlapList, &containedWorkingList);
    
    // Terminate the contained blocks
    //terminateContainedBlocks(containedWorkingList);
    
    // Move the contained blocks to the final contained list
    //pContainList->splice(pContainList->end(), containedWorkingList);
}