// Write out the next BWT for the next cycle. This updates BCRVector
// and suffixSymbolCounts. Returns the number of symbols written to writeBWT
size_t BWTCA::outputPartialCycle(int cycle,
                                 const DNAEncodedStringVector* pReadSequences,
                                 BCRVector& bcrVector, 
                                 const DNAEncodedString& readBWT, 
                                 size_t total_read_symbols,
                                 DNAEncodedString& writeBWT, 
                                 AlphaCount64& suffixStartCounts)
{
    // We track the rank of each symbol as it is copied/inserted
    // into the new bwt
    AlphaCount64 rank;

    // Counters
    size_t num_copied = 0;
    size_t num_inserted = 0;
    size_t num_wrote = 0;

    for(size_t i = 0; i < bcrVector.size(); ++i)
    {
        BCRElem& ne = bcrVector[i];
        
        // Copy elements from the read bwt until we reach the target position
        while(num_copied + num_inserted < ne.position)
        {
            char c = readBWT.get(num_copied++);
            writeBWT.set(num_wrote++, c);
            rank.increment(c);
        }

        // Now insert the incoming symbol
        int rl = pReadSequences->at(ne.index).length();
        char c = '$';

        // If the cycle number is greater than the read length, we are
        // on the final iteration and we just add in the '$' characters
        if(cycle <= rl)
            c = pReadSequences->at(ne.index).get(rl - cycle);
        //std::cout << "Inserting " << c << " at position " << num_copied + num_inserted << "\n";
        writeBWT.set(num_wrote++, c);
        num_inserted += 1;

        // Update the nvector element
        ne.sym = c;

        // Record the rank of the inserted symbol
        ne.position = rank.get(c);

        // Update the rank and the number of suffixes that start with c
        rank.increment(c);
        suffixStartCounts.increment(c);
    }

    // Copy any remaining symbols in the bwt
    while(num_copied < total_read_symbols)
        writeBWT.set(num_wrote++, readBWT.get(num_copied++));

    return num_wrote;
}
Exemplo n.º 2
0
// Validate that the sampled occurrence array is correct.
//
// Walks bwStr once while keeping a running count of every symbol seen so
// far, and asserts that get(bwStr, i) reports the same count for each symbol
// in ALPHABET at every position i. The checks are plain assert()s, so
// nothing is verified when NDEBUG is defined.
void Occurrence::validate(const BWTString& bwStr) const
{
    size_t l = bwStr.length();
    AlphaCount64 sum;
    for(size_t i = 0; i < l; ++i)
    {
        // Counts are inclusive of position i: the symbol is added to the
        // running total before the comparison
        char currB = bwStr.get(i);
        sum.increment(currB);
        AlphaCount64 calculated = get(bwStr, i);

        // The symbol index has its own name so that it does not shadow the
        // position index i of the enclosing loop
        for(int s = 0; s < ALPHABET_SIZE; ++s)
            assert(calculated.get(ALPHABET[s]) == sum.get(ALPHABET[s]));
    }
}
// Write the BWT for the initial cycle, which corresponds to the sentinel
// suffixes, and initialize the BCR vector.
//
// For read i, the last symbol of the read is stored at bwt position i and
// bcrVector[i] is filled with that symbol, the read index, and the number of
// earlier reads that ended with the same symbol.
//
// suffixSymbolCounts is incremented by one '$' per read and then by the
// count of every symbol inserted into the bwt.
//
// Requirements visible in this function:
//  - pReadSequences must hold at least one read (element 0 is always read)
//  - all reads must have the same length, otherwise an error is printed and
//    the process exits with EXIT_FAILURE
//  - every read must be longer than one symbol (checked with assert)
//  - bcrVector is written through operator[] for every read; presumably the
//    caller has already sized it to pReadSequences->size() -- confirm
void BWTCA::outputInitialCycle(const DNAEncodedStringVector* pReadSequences, BCRVector& bcrVector, DNAEncodedString& bwt, AlphaCount64& suffixSymbolCounts)
{
    AlphaCount64 incomingSymbolCounts;

    size_t n = pReadSequences->size();
    size_t first_read_len = pReadSequences->at(0).length();
    for(size_t i = 0; i < n; ++i)
    {
        size_t rl =  pReadSequences->at(i).length();
        
        // Check that all reads are the same length
        if(rl != first_read_len)
        {
            std::cout << "Error: This implementation of BCR requires all reads to be the same length\n";
            exit(EXIT_FAILURE);
        }

        // Check the length precondition before it is relied upon: rl - 1
        // below would wrap around for an empty read
        assert(rl > 1);

        // The symbol preceding the sentinel suffix is the last base of the read
        char c = pReadSequences->at(i).get(rl - 1);
        bwt.set(i, c);

        // Load the elements of the N vector with the next symbol
        bcrVector[i].sym = c;
        bcrVector[i].index = i;

        // Set the relative position of the symbol that is being inserted:
        // the number of times c has been inserted before this read
        bcrVector[i].position = incomingSymbolCounts.get(c);

        // Increment the count of the first base of the suffix of the
        // incoming strings. This is $ for the initial cycle
        suffixSymbolCounts.increment('$');

        // Update the inserted symbols
        incomingSymbolCounts.increment(c);
    }

    // Account for the symbols that were just inserted into the bwt
    suffixSymbolCounts += incomingSymbolCounts;
}
Exemplo n.º 4
0
// Initialize the sampled occurrence counts from the bwt string bwStr.
//
// One AlphaCount64 is stored for every sampleRate-th position of bwStr.
// Sample k holds the cumulative symbol counts of bwStr[0 .. k * sampleRate],
// inclusive of the sampled position itself.
//
// sampleRate is used as a divisor and must therefore be non-zero.
// NOTE(review): m_shift is derived from the rate by calculateShiftValue,
// which suggests the rate is expected to be a power of two -- confirm.
void Occurrence::initialize(const BWTString& bwStr, int sampleRate)
{
    m_sampleRate = sampleRate;
    m_shift = calculateShiftValue(m_sampleRate);

    size_t l = bwStr.length();

    // Number of samples is ceil(l / m_sampleRate). It is kept in a size_t,
    // like the length it is derived from, so that it cannot be truncated
    // for very large strings.
    size_t num_samples = (l % m_sampleRate == 0) ? (l / m_sampleRate) : (l / m_sampleRate + 1);
    m_values.resize(num_samples);
    
    AlphaCount64 sum;
    for(size_t i = 0; i < l; ++i)
    {
        char currB = bwStr.get(i);
        sum.increment(currB);

        // Record the running totals at every sampled position
        if(i % m_sampleRate == 0)
            m_values[i / m_sampleRate] = sum;
    }
}
Exemplo n.º 5
0
// Check if sequence is composed of predominantely a single base
// Returns true if the sequence is not degenrate
bool QCProcess::performDegenerateCheck(const SequenceWorkItem& item)
{
    std::string w = item.read.seq.toString();
    AlphaCount64 bc;
    for(size_t i = 0; i < w.size(); ++i)
    {
        bc.increment(w[i]);
    }

    size_t maxCount = bc.getMaxCount();
    double prop = (double)maxCount / w.size();
    if(prop > m_params.degenProportion)
    {
        if(m_params.verbose > 0)
            std::cout << "Read " << w << " failed degenerate filter\n";
        return false;
    }
    return true;
}
Exemplo n.º 6
0
// Fill in the FM-index data structures
void SBWT::initializeFMIndex(int sampleRate)
{
    // initialize the occurance table
    m_occurrence.initialize(m_bwStr, sampleRate);

    // Calculate the C(a) array
    
    // Calculate the total number of occurances of each character in the BW str
    AlphaCount64 tmp;
    for(size_t i = 0; i < m_bwStr.length(); ++i)
    {
        tmp.increment(m_bwStr.get(i));
    }

    m_predCount.set('$', 0);
    m_predCount.set('A', tmp.get('$')); 
    m_predCount.set('C', m_predCount.get('A') + tmp.get('A'));
    m_predCount.set('G', m_predCount.get('C') + tmp.get('C'));
    m_predCount.set('T', m_predCount.get('G') + tmp.get('G'));
}