Example #1
0
// Recursive traversal to extract all the strings needed for the above function
void _extractRankedPrefixes(const BWT* pBWT, BWTInterval interval, const std::string& curr, RankedPrefixVector* pOutput)
{
    AlphaCount64 extensions = BWTAlgorithms::getExtCount(interval, pBWT);

    for(size_t i = 0; i < 4; ++i)
    {
        char b = "ACGT"[i];

        if(extensions.get(b) > 0)
        {
            BWTInterval ni = interval;
            BWTAlgorithms::updateInterval(ni, b, pBWT);
            _extractRankedPrefixes(pBWT, ni, curr + b, pOutput);
        }

    }

    // If we have extended the prefix as far as possible, stop
    BWTAlgorithms::updateInterval(interval, '$', pBWT);
    for(int64_t i = interval.lower; i <= interval.upper; ++i)
    {
        // backwards search gives a reversed prefix, fix it
        RankedPrefix rp = { (size_t)i, reverse(curr) };
        pOutput->push_back(rp);
    }
}
Example #2
0
std::string get_valid_dbg_neighbors_coverage_and_ratio(const std::string& kmer,
                                                       const BWTIndexSet& index_set,
                                                       size_t min_coverage,
                                                       double min_ratio,
                                                       EdgeDir dir)
{
    std::string out;
    AlphaCount64 counts = 
        BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(kmer, 
                                                              index_set.pBWT,
                                                              dir,
                                                              index_set.pCache);
    
    if(!counts.hasDNAChar())
        return out; // no extensions

    char max_b = counts.getMaxDNABase();
    size_t max_c = counts.get(max_b);

    for(size_t j = 0; j < 4; ++j)
    {
        char b = "ACGT"[j];
        size_t c = counts.get(b);
        if(c >= min_coverage && (double)c / max_c >= min_ratio)
            out.push_back(b);
    }
    return out;    
}
// Write out the next BWT for the next cycle. This updates BCRVector
// and suffixSymbolCounts. Returns the number of symbols written to writeBWT
size_t BWTCA::outputPartialCycle(int cycle,
                                 const DNAEncodedStringVector* pReadSequences,
                                 BCRVector& bcrVector, 
                                 const DNAEncodedString& readBWT, 
                                 size_t total_read_symbols,
                                 DNAEncodedString& writeBWT, 
                                 AlphaCount64& suffixStartCounts)
{
    // We track the rank of each symbol as it is copied/inserted
    // into the new bwt
    AlphaCount64 rank;

    // Counters
    size_t num_copied = 0;
    size_t num_inserted = 0;
    size_t num_wrote = 0;

    for(size_t i = 0; i < bcrVector.size(); ++i)
    {
        BCRElem& ne = bcrVector[i];
        
        // Copy elements from the read bwt until we reach the target position
        while(num_copied + num_inserted < ne.position)
        {
            char c = readBWT.get(num_copied++);
            writeBWT.set(num_wrote++, c);
            rank.increment(c);
        }

        // Now insert the incoming symbol
        int rl = pReadSequences->at(ne.index).length();
        char c = '$';

        // If the cycle number is greater than the read length, we are
        // on the final iteration and we just add in the '$' characters
        if(cycle <= rl)
            c = pReadSequences->at(ne.index).get(rl - cycle);
        //std::cout << "Inserting " << c << " at position " << num_copied + num_inserted << "\n";
        writeBWT.set(num_wrote++, c);
        num_inserted += 1;

        // Update the nvector element
        ne.sym = c;

        // Record the rank of the inserted symbol
        ne.position = rank.get(c);

        // Update the rank and the number of suffixes that start with c
        rank.increment(c);
        suffixStartCounts.increment(c);
    }

    // Copy any remaining symbols in the bwt
    while(num_copied < total_read_symbols)
        writeBWT.set(num_wrote++, readBWT.get(num_copied++));

    return num_wrote;
}
Example #4
0
// Validate that the sampled occurrence array is correct.
// For every position of bwStr, the counts returned by get() are compared
// against a running count computed directly from the string. A mismatch
// triggers an assert; when asserts are compiled out no comparison is made.
void Occurrence::validate(const BWTString& bwStr) const
{
    const size_t len = bwStr.length();
    AlphaCount64 sum;
    for(size_t pos = 0; pos < len; ++pos)
    {
        sum.increment(bwStr.get(pos));
        AlphaCount64 calculated = get(bwStr, pos);

        // The alphabet index has its own name so it no longer shadows the
        // position index of the enclosing loop
        for(int a = 0; a < ALPHABET_SIZE; ++a)
            assert(calculated.get(ALPHABET[a]) == sum.get(ALPHABET[a]));

        // Avoid an unused-variable warning when asserts are disabled
        (void)calculated;
    }
}
// Convert the per-symbol relative positions stored in bcrVector into absolute
// positions by adding, for each element, the number of suffixes that start
// with a symbol less than the element's symbol.
void BWTCA::calculateAbsolutePositions(BCRVector& bcrVector, const AlphaCount64& suffixSymbolCounts)
{
    // Build the predecessor counts from the suffix symbol counts
    AlphaCount64 predCounts;
    for(int idx = 0; idx < BWT_ALPHABET::size; ++idx)
    {
        const char sym = RANK_ALPHABET[idx];
        const int64_t numLess = suffixSymbolCounts.getLessThan(sym);
        predCounts.set(sym, numLess);
    }

    // Shift every element by the predecessor count of its symbol
    const size_t numElems = bcrVector.size();
    for(size_t idx = 0; idx != numElems; ++idx)
    {
        BCRElem& elem = bcrVector[idx];
        elem.position += predCounts.get(elem.sym);
    }
}
Example #6
0
// Perform duplicate check
// Look up the intervals of the read and of its reverse complement in the BWT.
// A read with a DNA extension on either side of either strand is reported as
// a substring. Otherwise the lowest lexicographic index of the two strands is
// used as the canonical index, and the shared bit vector decides whether this
// is the first read seen with that index.
DuplicateCheckResult QCProcess::performDuplicateCheck(const SequenceWorkItem& workItem)
{
    assert(m_params.pSharedBV != NULL);

    std::string fwdSeq = workItem.read.seq.toString();
    std::string rcSeq = reverseComplement(fwdSeq);

    // Intervals of the sequence and of its reverse complement
    BWTIntervalPair fwdPair = BWTAlgorithms::findIntervalPair(m_params.pBWT, m_params.pRevBWT, fwdSeq);
    BWTIntervalPair rcPair = BWTAlgorithms::findIntervalPair(m_params.pBWT, m_params.pRevBWT, rcSeq);

    // Left/right extension counts for both strands
    AlphaCount64 fwdLeft = BWTAlgorithms::getExtCount(fwdPair.interval[0], m_params.pBWT);
    AlphaCount64 fwdRight = BWTAlgorithms::getExtCount(fwdPair.interval[1], m_params.pRevBWT);
    AlphaCount64 rcLeft = BWTAlgorithms::getExtCount(rcPair.interval[0], m_params.pBWT);
    AlphaCount64 rcRight = BWTAlgorithms::getExtCount(rcPair.interval[1], m_params.pRevBWT);

    // A non-$ extension in any direction means the read is a substring of another.
    // Substring reads are always removed so the bit vector is left alone
    const bool isSubstring = fwdLeft.hasDNAChar() || fwdRight.hasDNAChar() ||
                             rcLeft.hasDNAChar() || rcRight.hasDNAChar();
    if(isSubstring)
        return DCR_SUBSTRING;

    // Lexicographic intervals of the complete reads
    BWTAlgorithms::updateBothL(fwdPair, '$', m_params.pBWT);
    BWTAlgorithms::updateBothL(rcPair, '$', m_params.pBWT);

    // Canonical index: the lower of the two strand indices, where an invalid
    // interval contributes the largest representable value
    const int64_t NO_INDEX = std::numeric_limits<int64_t>::max();
    int64_t fwdIdx = NO_INDEX;
    if(fwdPair.interval[0].isValid())
        fwdIdx = fwdPair.interval[0].lower;

    int64_t rcIdx = NO_INDEX;
    if(rcPair.interval[0].isValid())
        rcIdx = rcPair.interval[0].lower;

    const int64_t canonicalIdx = std::min(fwdIdx, rcIdx);

    // Bit already set: this read is a duplicate
    if(m_params.pSharedBV->test(canonicalIdx))
        return DCR_FULL_LENGTH_DUPLICATE;

    // Attempt to atomically flip the bit from false to true. If the call fails
    // some other thread set the bit first and this read is the duplicate
    if(m_params.pSharedBV->updateCAS(canonicalIdx, false, true))
        return DCR_UNIQUE;

    return DCR_FULL_LENGTH_DUPLICATE;
}
Example #7
0
// Calculate the successors of this node in the implicit deBruijn graph.
// Returns one single-character string per DNA base that extends the last
// k-1 bases of the node, counting support from both strands. The vector is
// empty when there is no extension.
StringVector StringThreader::getDeBruijnExtensions(StringThreaderNode* pNode)
{
    WARN_ONCE("TODO: Refactor StringThreader to use new deBruijn code");

    // Get the last k-1 bases of the node
    std::string pmer = pNode->getSuffix(m_kmer - 1);
    std::string rc_pmer = reverseComplement(pmer);

    // Get an interval for the p-mer and its reverse complement
    BWTIntervalPair ip = BWTAlgorithms::findIntervalPair(m_pBWT, m_pRevBWT, pmer);
    BWTIntervalPair rc_ip = BWTAlgorithms::findIntervalPair(m_pBWT, m_pRevBWT, rc_pmer);

    // Get the extension bases. Each validity check guards the very interval
    // that is passed to getExtCount
    AlphaCount64 extensions;
    AlphaCount64 rc_extensions;
    if(ip.interval[1].isValid())
        extensions += BWTAlgorithms::getExtCount(ip.interval[1], m_pRevBWT);
    if(rc_ip.interval[0].isValid())
        rc_extensions = BWTAlgorithms::getExtCount(rc_ip.interval[0], m_pBWT);

    // Bring the reverse-complement counts onto the strand of the p-mer
    rc_extensions.complement();
    extensions += rc_extensions;

    // Emit one candidate per DNA symbol with a non-zero count. The caller
    // decides whether the result is a simple extension or a branch
    StringVector out;
    if(extensions.hasDNAChar())
    {
        for(int i = 0; i < DNA_ALPHABET::size; ++i)
        {
            char b = DNA_ALPHABET::getBase(i);
            if(extensions.get(b) > 0)
                out.push_back(std::string(1, b));
        }
    }

    return out;
}
Example #8
0
// Initialize the sampled counts from the bwt string bwStr.
// One AlphaCount64 is stored for every sampleRate-th position; the sample at
// position i holds the symbol counts of bwStr[0..i] inclusive.
// sampleRate must be positive.
void Occurrence::initialize(const BWTString& bwStr, int sampleRate)
{
    // A non-positive rate would divide by zero or wrap when made unsigned
    assert(sampleRate > 0);

    m_sampleRate = sampleRate;
    m_shift = calculateShiftValue(m_sampleRate);

    const size_t len = bwStr.length();
    const size_t rate = static_cast<size_t>(m_sampleRate);

    // ceil(len / rate), kept in size_t so long strings are not truncated
    const size_t num_samples = len / rate + ((len % rate == 0) ? 0 : 1);
    m_values.resize(num_samples);

    AlphaCount64 sum;
    for(size_t i = 0; i < len; ++i)
    {
        sum.increment(bwStr.get(i));
        if(i % rate == 0)
            m_values[i / rate] = sum;
    }
}
Example #9
0
// Check if sequence is composed of predominantely a single base
// Returns true if the sequence is not degenrate
bool QCProcess::performDegenerateCheck(const SequenceWorkItem& item)
{
    std::string w = item.read.seq.toString();
    AlphaCount64 bc;
    for(size_t i = 0; i < w.size(); ++i)
    {
        bc.increment(w[i]);
    }

    size_t maxCount = bc.getMaxCount();
    double prop = (double)maxCount / w.size();
    if(prop > m_params.degenProportion)
    {
        if(m_params.verbose > 0)
            std::cout << "Read " << w << " failed degenerate filter\n";
        return false;
    }
    return true;
}
Example #10
0
// Fill in the FM-index data structures
void SBWT::initializeFMIndex(int sampleRate)
{
    // initialize the occurance table
    m_occurrence.initialize(m_bwStr, sampleRate);

    // Calculate the C(a) array
    
    // Calculate the total number of occurances of each character in the BW str
    AlphaCount64 tmp;
    for(size_t i = 0; i < m_bwStr.length(); ++i)
    {
        tmp.increment(m_bwStr.get(i));
    }

    m_predCount.set('$', 0);
    m_predCount.set('A', tmp.get('$')); 
    m_predCount.set('C', m_predCount.get('A') + tmp.get('A'));
    m_predCount.set('G', m_predCount.get('C') + tmp.get('C'));
    m_predCount.set('T', m_predCount.get('G') + tmp.get('G'));
}
// Update N and the output BWT for the initial cycle, corresponding to the sentinel suffixes
// the symbolCounts vector is updated to hold the number of times each symbol has been inserted
// into the bwt
void BWTCA::outputInitialCycle(const DNAEncodedStringVector* pReadSequences, BCRVector& bcrVector, DNAEncodedString& bwt, AlphaCount64& suffixSymbolCounts)
{
    AlphaCount64 incomingSymbolCounts;

    size_t n = pReadSequences->size();
    size_t first_read_len = pReadSequences->at(0).length();
    for(size_t i = 0; i < n; ++i)
    {
        size_t rl =  pReadSequences->at(i).length();
        
        // Check that all reads are the same length
        if(rl != first_read_len)
        {
            std::cout << "Error: This implementation of BCR requires all reads to be the same length\n";
            exit(EXIT_FAILURE);
        }

        char c = pReadSequences->at(i).get(rl - 1);
        bwt.set(i, c);

        assert(rl > 1);

        // Load the elements of the N vector with the next symbol
        bcrVector[i].sym = c;
        bcrVector[i].index = i;

        // Set the relative position of the symbol that is being inserted
        bcrVector[i].position = incomingSymbolCounts.get(c);

        // Increment the count of the first base of the suffix of the
        // incoming strings. This is $ for the initial cycle
        suffixSymbolCounts.increment('$');

        // Update the inserted symbols
        incomingSymbolCounts.increment(c);
    }

    suffixSymbolCounts += incomingSymbolCounts;
}
Example #12
0
// Read exactly numBytes raw bytes from the file pFilename into pDest.
// The file is opened in binary mode because its content is copied verbatim
// into in-memory structures. On an open or read failure an error is printed
// and the process exits. pDest is not touched when numBytes is zero.
static void readRawBytesOrExit(const char* pFilename, char* pDest, size_t numBytes)
{
    std::ifstream reader(pFilename, std::ios::in | std::ios::binary);
    if(!reader.is_open())
    {
        std::cerr << "Error: could not open " << pFilename << " for reading\n";
        exit(EXIT_FAILURE);
    }

    if(numBytes == 0)
        return;

    reader.read(pDest, numBytes);
    if(!reader)
    {
        std::cerr << "Error: could not read " << numBytes << " bytes from " << pFilename << "\n";
        exit(EXIT_FAILURE);
    }
}

// Load the FM-index from the files described by an FMIndexBuilder constructed
// for filename: the compressed string, the small and large markers, the
// string/symbol totals, the predecessor counts and the decoder.
// Exits the process if any of the builder's files cannot be opened or read.
void FMIndex::loadSGABWT(const std::string& filename)
{
    FMIndexBuilder builder(filename, m_smallSampleRate, m_largeSampleRate);

    size_t n = 0;

    // Load the compressed string from the file
    n = builder.getNumStringBytes();
    m_string.resize(n);
    readRawBytesOrExit(builder.getStringFilename().c_str(),
                       n > 0 ? reinterpret_cast<char*>(&m_string[0]) : NULL,
                       n);

    // Load the small markers from the file
    n = builder.getNumSmallMarkers();
    m_smallMarkers.resize(n);
    readRawBytesOrExit(builder.getSmallMarkerFilename().c_str(),
                       n > 0 ? reinterpret_cast<char*>(&m_smallMarkers[0]) : NULL,
                       sizeof(SmallMarker) * n);

    // Load the large markers from the file
    n = builder.getNumLargeMarkers();
    m_largeMarkers.resize(n);
    readRawBytesOrExit(builder.getLargeMarkerFilename().c_str(),
                       n > 0 ? reinterpret_cast<char*>(&m_largeMarkers[0]) : NULL,
                       sizeof(LargeMarker) * n);

    m_numStrings = builder.getNumStrings();
    m_numSymbols = builder.getNumSymbols();

    // The per-symbol totals must add up to the total number of symbols
    AlphaCount64 totals = builder.getSymbolCounts();
    assert(totals.get('$') +
           totals.get('A') +
           totals.get('C') +
           totals.get('G') +
           totals.get('T') == m_numSymbols);

    // Predecessor counts: number of symbols sorting before each symbol
    m_predCount.set('$', 0);
    m_predCount.set('A', totals.get('$'));
    m_predCount.set('C', m_predCount.get('A') + totals.get('A'));
    m_predCount.set('G', m_predCount.get('C') + totals.get('C'));
    m_predCount.set('T', m_predCount.get('G') + totals.get('G'));
    assert(m_predCount.get('T') + totals.get('T') == m_numSymbols);

    m_decoder = builder.getDecoder();

    printInfo();
}
Example #13
0
// Count the de Bruijn extensions of the k-mer str in the given direction
// using a single BWT, combining evidence from both strands.
// pFwdCache is optional; when non-null it is used for the interval lookups.
// The returned counts are expressed on the strand of str.
AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(const std::string str,
                                                                   const BWT* pBWT,
                                                                   EdgeDir direction,
                                                                   const BWTIntervalCache* pFwdCache)
{
    const size_t k = str.size();
    const size_t p = k - 1;
    const bool isSense = (direction == ED_SENSE);

    // In the sense direction, we extend from the 3' end, so the p-mer is the
    // last k-1 bases; otherwise it is the first k-1 bases
    std::string pmer = isSense ? str.substr(1, p) : str.substr(0, p);
    assert(pmer.length() == p);
    std::string rc_pmer = reverseComplement(pmer);

    // With a single index only one of the p-mer / reverse complement can be
    // looked up directly: the reverse complement for SENSE, the p-mer for
    // ANTISENSE. The other one is resolved through explicit k-mer queries.
    AlphaCount64 extensions;
    AlphaCount64 rc_extensions;

    AlphaCount64& directEC = isSense ? rc_extensions : extensions;
    AlphaCount64& indirectEC = isSense ? extensions : rc_extensions;
    std::string& directStr = isSense ? rc_pmer : pmer;
    std::string& indirectStr = isSense ? pmer : rc_pmer;

    // Direct lookup, through the interval cache when one is available
    BWTInterval interval;
    if(pFwdCache)
        interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, directStr);
    else
        interval = BWTAlgorithms::findInterval(pBWT, directStr);

    if(interval.isValid())
        directEC = BWTAlgorithms::getExtCount(interval, pBWT);

    // Indirect lookup: query every k-mer formed by the indirect p-mer followed
    // by one symbol of the alphabet
    std::string query(k, 'A');
    int varIdx = query.size() - 1;
    query.replace(0, p, indirectStr);

    for(int symIdx = 0; symIdx < BWT_ALPHABET::size; ++symIdx)
    {
        const char sym = BWT_ALPHABET::getChar(symIdx);
        query[varIdx] = sym;

        if(pFwdCache)
            interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, query);
        else
            interval = BWTAlgorithms::findInterval(pBWT, query);

        // Each occurrence of the k-mer supports one extension by sym
        if(interval.isValid())
            indirectEC.add(sym, interval.size());
    }

    // Switch the reverse-complement extensions to the same strand as the str
    rc_extensions.complement();
    extensions += rc_extensions;
    return extensions;
}
// Run the bubble construction process
// Starting from m_startingKmer, a de Bruijn graph is grown breadth-first in
// both directions using the variant index. Growth along a path stops when the
// new k-mer occurs at least MIN_TARGET_COUNT times in the base index; such
// vertices are the join points. Every walk found between an antisense join
// vertex and a sense join vertex is appended to out_haplotypes as a string.
// The only return statement yields HBRC_OK, also when no haplotype was found.
HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplotypes)
{
    // NOTE(review): the profile label names a different function - confirm it is intended
    PROFILE_FUNC("GraphCompare::buildVariantStringGraph")
    assert(!m_startingKmer.empty());

    // NOTE(review): this map is filled in below but never read in this function
    std::map<std::string, int> kmerCountMap;

    // We search until we find the first common vertex in each direction
    size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2;
    size_t MAX_ITERATIONS = 2000;
    size_t MAX_SIMULTANEOUS_BRANCHES = 40;
    size_t MAX_TOTAL_BRANCHES = 50;

    // Tracking stats
    // NOTE(review): max_simul_branches_used is updated but never read in this function
    size_t max_simul_branches_used = 0;
    size_t total_branches = 0;
    size_t iterations = 0;

    // Initialize the graph
    // The graph is released by the delete at the end of the function
    StringGraph* pGraph = new StringGraph;
    BuilderExtensionQueue queue;

    // The seed vertex is allocated from the graph's vertex allocator
    Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(m_startingKmer, m_startingKmer);
    pVertex->setColor(GC_BLACK);
    pGraph->addVertex(pVertex);

    // Add the vertex to the extension queue
    // It is queued once per direction so the search grows both ways
    queue.push(BuilderExtensionNode(pVertex, ED_SENSE));
    queue.push(BuilderExtensionNode(pVertex, ED_ANTISENSE));

    // Join vertices found so far, kept separately per search direction
    std::vector<Vertex*> sense_join_vector;
    std::vector<Vertex*> antisense_join_vector;

    // Perform the extension. The while conditions are heuristics to avoid searching
    // the graph too much 
    while(!queue.empty() && iterations++ < MAX_ITERATIONS && queue.size() < MAX_SIMULTANEOUS_BRANCHES && total_branches < MAX_TOTAL_BRANCHES)
    {
        if(queue.size() > max_simul_branches_used)
            max_simul_branches_used = queue.size();

        BuilderExtensionNode curr = queue.front();
        queue.pop();

        // Calculate de Bruijn extensions for this node
        std::string vertStr = curr.pVertex->getSeq().toString();
        AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction);

        // Bases accepted for this node; its size drives the branch count below
        std::string extensionsUsed;
        for(size_t i = 0; i < DNA_ALPHABET::size; ++i)
        {
            char b = DNA_ALPHABET::getBase(i);
            size_t count = extensionCounts.get(b);
            // Extensions seen fewer than minDBGCount times are ignored
            bool acceptExt = count >= m_parameters.minDBGCount;
            if(!acceptExt)
                continue;

            extensionsUsed.push_back(b);
            std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction);
            kmerCountMap[newStr] = count;

            // Create the new vertex and edge in the graph
            // Skip if the vertex already exists
            // (the extension has already been recorded in extensionsUsed by this point)
            if(pGraph->getVertex(newStr) != NULL)
                continue;
            
            // Allocate the new vertex and add it to the graph
            // NOTE(review): this pVertex shadows the seed pVertex declared above
            Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr);
            pVertex->setColor(GC_BLACK);
            pGraph->addVertex(pVertex);

            // Add edges
            VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pVertex, curr.direction);
            
            // Check if this sequence is present in the FM-index of the target
            // If so, it is the join point of the de Bruijn graph and we extend no further.
            size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex);

            if(targetCount >= MIN_TARGET_COUNT)
            {
                if(curr.direction == ED_SENSE)
                    sense_join_vector.push_back(pVertex);
                else
                    antisense_join_vector.push_back(pVertex);
            }
            else
            {
                // Add the vertex to the extension queue
                queue.push(BuilderExtensionNode(pVertex, curr.direction));
            }
        }
        
        // Update the total number of times we branches the search
        // A node with a single accepted extension adds no branch
        if(!extensionsUsed.empty())
            total_branches += extensionsUsed.size() - 1;
    }

    // If the graph construction was successful, walk the graph
    // between the endpoints to make a string
    // Generate haplotypes between every pair of antisense/sense join vertices
    // (if either join vector is empty the loops below emit nothing)
    for(size_t i = 0; i < antisense_join_vector.size(); ++i) {
        for(size_t j = 0; j < sense_join_vector.size(); ++j) {
            SGWalkVector outWalks;
            SGSearch::findWalks(antisense_join_vector[i],
                                sense_join_vector[j],
                                ED_SENSE,
                                100000, // max distance to search
                                10000, // max nodes to search
                                true, // exhaustive search
                                outWalks);

            // Convert each walk to its string and hand it to the caller
            for(size_t k = 0; k < outWalks.size(); ++k)
                out_haplotypes.push_back(outWalks[k].getString(SGWT_START_TO_END));
        }
    }
    
    delete pGraph;
    return HBRC_OK;
}