// Expand the graph outward from the vertices waiting on the frontier.
//   queue - frontier entries still to be expanded. Entries are popped here and
//           the queue is handed to updateGraphAndQueue together with the
//           overlaps found for the popped vertex.
// The loop ends when the frontier is empty or holds more than 200 entries.
// Vertices already coloured EXPLORED_COLOR are popped and ignored. Before
// returning, the colours of the whole graph are reset to GC_WHITE.
void StringGraphGenerator::buildGraph(FrontierQueue& queue)
{
    // An oversized frontier (more than 200 pending entries) ends the expansion early
    while(!queue.empty() && queue.size() <= 200)
    {
        GraphFrontier current = queue.front();
        queue.pop();

        // Only vertices that have not been explored yet are expanded
        if(current.pVertex->getColor() != EXPLORED_COLOR)
        {
            // Describe the current vertex as a sequence record for the overlapper
            SeqRecord query;
            query.id = current.pVertex->getID();
            query.seq = current.pVertex->getSeq().toString();

            // Collect the overlap blocks reported for this vertex
            OverlapBlockList overlaps;
            assert(overlaps.empty());
            m_pOverlapper->overlapRead(query, m_minOverlap, &overlaps);

            // Feed the overlaps back into the graph and the frontier, then
            // mark the vertex so it is not expanded a second time
            updateGraphAndQueue(current, queue, overlaps);
            current.pVertex->setColor(EXPLORED_COLOR);
        }
    }

    m_pGraph->setColors(GC_WHITE);
}
// Example no. 2
// 0
// iterate through obList and determine the overlaps that are irreducible. This function is recursive.
// The final overlap blocks corresponding to irreducible overlaps are written to pOBFinal.
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
// Invariant: each block corresponds to the same extension of the root sequence w.
void OverlapAlgorithm::_processIrreducibleBlocksExactIterative(const BWT* pBWT, const BWT* pRevBWT, 
                                                               OverlapBlockList& inList, 
                                                               OverlapBlockList* pOBFinal) const
{
    if(inList.empty())
        return;
    
    // We store the overlap blocks in groups of blocks that have the same right-extension.
    // When a branch is found, the groups are split based on the extension
    typedef std::list<OverlapBlockList> BlockGroups;

    BlockGroups blockGroups;
    blockGroups.push_back(inList);
    int numExtensions = 0;
    int numBranches = 0;
    while(!blockGroups.empty())
    {
        // Perform one extenion round for each group.
        // If the top-level block has ended, push the result
        // to the final list and remove the group from processing
        BlockGroups::iterator groupIter = blockGroups.begin();
        BlockGroups incomingGroups; // Branched blocks are placed here

        while(groupIter != blockGroups.end())
        {
            OverlapBlockList& currList = *groupIter;
            bool bEraseGroup = false;

            // Count the extensions in the top level (longest) blocks first
            int topLen = currList.front().overlapLen;
            AlphaCount64 ext_count;
            OBLIter blockIter = currList.begin();
            while(blockIter != currList.end() && blockIter->overlapLen == topLen)
            {
                ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT);
                ++blockIter;
            }
            
            // Three cases:
            // 1) The top level block has ended as it contains the extension $. Output TLB and end.
            // 2) There is a singular unique extension base for all the blocks. Update the blocks and continue.
            // 3) There are multiple extension bases, split the block group and continue.
            // If some block other than the TLB ended, it must be contained within the TLB and it is not output
            // or considered further. 
            // Likewise if multiple distinct strings in the TLB ended, we only output the top one. The rest
            // must have the same sequence as the top one and are hence considered to be contained with the top element.
            if(ext_count.get('$') > 0)
            {
                // An irreducible overlap has been found. It is possible that there are two top level blocks
                // (one in the forward and reverse direction). Since we can't decide which one
                // contains the other at this point, we output hits to both. Under a fixed 
                // length string assumption one will be contained within the other and removed later.
                OBLIter tlbIter = currList.begin();
                while(tlbIter != currList.end() && tlbIter->overlapLen == topLen)
                {
                    // Ensure the tlb is actually terminal and not a substring block
                    AlphaCount64 test_count = tlbIter->getCanonicalExtCount(pBWT, pRevBWT);
                    if(test_count.get('$') == 0)
                    {
                        std::cerr << "Error: substring read found during overlap computation.\n";
                        std::cerr << "Please run sga rmdup before sga overlap\n";
                        exit(EXIT_FAILURE);
                    }
                    
                    // Perform the final right-update to make the block terminal
                    OverlapBlock branched = *tlbIter;
                    BWTAlgorithms::updateBothR(branched.ranges, '$', branched.getExtensionBWT(pBWT, pRevBWT));
                    pOBFinal->push_back(branched);
#ifdef DEBUGOVERLAP
                    std::cout << "[IE] TLB of length " << branched.overlapLen << " has ended\n";
                    std::cout << "[IE]\tBlock data: " << branched << "\n";
#endif             
                    ++tlbIter;
                } 

                // Set the flag to erase this group, it is finished
                bEraseGroup = true;
            }
            else
            {
                // Count the extension for the rest of the blocks
                while(blockIter != currList.end())
                {
                    ext_count += blockIter->getCanonicalExtCount(pBWT, pRevBWT);
                    ++blockIter;
                }

                if(ext_count.hasUniqueDNAChar())
                {
                    // Update all the blocks using the unique extension character
                    // This character is in the canonical representation wrt to the query
                    char b = ext_count.getUniqueDNAChar();
                    updateOverlapBlockRangesRight(pBWT, pRevBWT, currList, b);
                    numExtensions++;
                    bEraseGroup = false;
                }
                else
                {
                    for(size_t idx = 0; idx < DNA_ALPHABET_SIZE; ++idx)
                    {
                        char b = ALPHABET[idx];
                        if(ext_count.get(b) > 0)
                        {
                            numBranches++;
                            OverlapBlockList branched = currList;
                            updateOverlapBlockRangesRight(pBWT, pRevBWT, branched, b);
                            incomingGroups.push_back(branched);
                            bEraseGroup = true;
                        }
                    }
                }
            }

            if(bEraseGroup)
                groupIter = blockGroups.erase(groupIter);
            else
                ++groupIter;
        }

        // Splice in the newly branched blocks, if any
        blockGroups.splice(blockGroups.end(), incomingGroups);
    }
}
// Example no. 3
// 0
// Classify the blocks in obList as irreducible, transitive or substrings. The irreducible blocks are
// put into pOBFinal. The remaining are discarded.
// Invariant: the blocks are ordered in descending order of the overlap size so that the longest overlap is first.
void OverlapAlgorithm::_processIrreducibleBlocksInexact(const BWT* pBWT, const BWT* pRevBWT, 
                                                        OverlapBlockList& activeList, 
                                                        OverlapBlockList* pOBFinal) const
{
    if(activeList.empty())
        return;
    
    // The activeList contains all the blocks that are not yet right terminal
    // Count the extensions in the top level (longest) blocks first
    bool all_eliminated = false;
    while(!activeList.empty() && !all_eliminated)
    {
        // The terminalBlock list contains all the blocks that became right-terminal
        // in the current extension round.
        OverlapBlockList terminalList;
        OverlapBlockList potentialContainedList;

        // Perform a single round of extension, any terminal blocks
        // are moved to the terminated list
        extendActiveBlocksRight(pBWT, pRevBWT, activeList, terminalList, potentialContainedList);

        // Compare the blocks in the contained list against the other terminal and active blocks
        // If they are a substring match to any of these, discard them
        OverlapBlockList::iterator containedIter = potentialContainedList.begin();
        for(; containedIter != potentialContainedList.end(); ++containedIter)
        {
           if(!isBlockSubstring(*containedIter, terminalList, m_errorRate) && 
              !isBlockSubstring(*containedIter, activeList, m_errorRate))
           {
                // Not a substring, move to terminal list
                terminalList.push_back(*containedIter);
                //std::cout << "Contained block kept: " << containedIter->overlapLen << "\n";
           }
           else
           {
                //std::cout << "Contained block found and removed: " << containedIter->overlapLen << "\n";
           }
        }

        // Using the terminated blocks, mark as eliminated any active blocks
        // that form a valid overlap to the terminal block. These are transitive edges
        // We do not compare two terminal blocks, we don't consider these overlaps to be
        // transitive
        OverlapBlockList::iterator terminalIter = terminalList.begin();
        for(; terminalIter != terminalList.end(); ++terminalIter)
        {
#ifdef DEBUGOVERLAP
            std::cout << "[II] ***TLB of length " << terminalIter->overlapLen << " has ended\n";
#endif       
            all_eliminated = true;
            OverlapBlockList::iterator activeIter = activeList.begin();
            for(; activeIter != activeList.end(); ++activeIter)
            {
                if(activeIter->isEliminated)
                    continue; // skip previously marked blocks
                
                // Two conditions must be met for a block to be transitive wrt terminal:
                // 1) It must have a strictly shorter overlap than the terminal block
                // 2) The error rate between the block and terminal must be less than the threshold
                double inferredErrorRate = calculateBlockErrorRate(*terminalIter, *activeIter);
                if(activeIter->overlapLen < terminalIter->overlapLen && 
                   isErrorRateAcceptable(inferredErrorRate, m_errorRate))
                {
#ifdef DEBUGOVERLAP_2                            
                    std::cout << "Marking block of length " << activeIter->overlapLen << " as eliminated\n";
#endif
                    activeIter->isEliminated = true;
                }
                else
                {
                    all_eliminated = false;
                }
            } 
            
            // Move this block to the final list if it has not been previously marked eliminated
            if(!terminalIter->isEliminated)
            {
#ifdef DEBUGOVERLAP
                std::cout << "[II] Adding block " << *terminalIter << " to final list\n";
                //std::cout << "  extension: " << terminalIter->forwardHistory << "\n";
#endif                
                pOBFinal->push_back(*terminalIter);
            }
        }
    }

    activeList.clear();
}
// Example no. 4
// 0
// Compute the inexact overlaps of a read in two phases: first the suffix of
// the read against prefixes, then the prefix of the read against suffixes.
//   read       - the query read; its sequence and id are used.
//   minOverlap - minimum overlap length forwarded to findOverlapBlocksInexact.
//   pOBOut     - output list of overlap blocks.
// Returns the OverlapResult filled in by the searches. If any search reports
// failure (findOverlapBlocksInexact returns false when the maximum search time
// is exceeded), pOBOut is cleared, isSubstring is reset to false and
// searchAborted is set to true.
OverlapResult OverlapAlgorithm::overlapReadInexact(const SeqRecord& read, int minOverlap, OverlapBlockList* pOBOut) const
{
    OverlapResult result;
    OverlapBlockList pendingBlocks;
    std::string seq = read.seq.toString();

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " suffix\n";
#endif

    // Phase 1: match the suffix of seq to prefixes. The second search only
    // runs when the first one succeeded.
    bool searchOk = findOverlapBlocksInexact(seq, m_pBWT, m_pRevBWT, sufPreAF,
                                             minOverlap, &pendingBlocks, pOBOut, result)
                 && findOverlapBlocksInexact(complement(seq), m_pRevBWT, m_pBWT, prePreAF,
                                             minOverlap, &pendingBlocks, pOBOut, result);

    if(searchOk)
    {
        if(!m_bIrreducible)
        {
            // Keep every block found: hand the working list over to the output
            pOBOut->splice(pOBOut->end(), pendingBlocks);
            assert(pendingBlocks.empty());
        }
        else
        {
            // Keep only the irreducible blocks, then drop the working list
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &pendingBlocks, pOBOut);
            pendingBlocks.clear();
        }
    }

#ifdef DEBUGOVERLAP
    std::cout << "\n\n***Overlapping read " << read.id << " prefix\n";
#endif

    // Phase 2: match the prefix of seq to suffixes. Nothing is searched once
    // an earlier search has failed.
    searchOk = searchOk
            && findOverlapBlocksInexact(reverseComplement(seq), m_pBWT, m_pRevBWT, sufSufAF,
                                        minOverlap, &pendingBlocks, pOBOut, result)
            && findOverlapBlocksInexact(reverse(seq), m_pRevBWT, m_pBWT, preSufAF,
                                        minOverlap, &pendingBlocks, pOBOut, result);

    if(searchOk)
    {
        if(!m_bIrreducible)
        {
            pOBOut->splice(pOBOut->end(), pendingBlocks);
            assert(pendingBlocks.empty());
        }
        else
        {
            computeIrreducibleBlocks(m_pBWT, m_pRevBWT, &pendingBlocks, pOBOut);
            pendingBlocks.clear();
        }
    }
    else
    {
        // A search was aborted: report no overlaps at all
        pOBOut->clear();
        result.isSubstring = false;
        result.searchAborted = true;
    }

    return result;
}