Example #1
0
void generate_errors_per_base(JSONWriter* pWriter, const BWTIndexSet& index_set)
{

    int n_samples = 100000;
    size_t k = 25;

    double max_error_rate = 0.95;
    size_t min_overlap = 50;
    
    std::vector<size_t> position_count;
    std::vector<size_t> error_count;

    Timer timer("test", true);
#if HAVE_OPENMP
        omp_set_num_threads(opt::numThreads);
        #pragma omp parallel for
#endif
    for(int i = 0; i < n_samples; ++i)
    {
        std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
        KmerOverlaps::retrieveMatches(s, k, min_overlap, max_error_rate, 2, index_set);
        //KmerOverlaps::approximateMatch(s, min_overlap, max_error_rate, 2, 200, index_set);

        MultipleAlignment ma = 
            KmerOverlaps::buildMultipleAlignment(s, k, min_overlap, max_error_rate, 2, index_set);

        // Skip when there is insufficient depth to classify errors
        size_t ma_rows = ma.getNumRows();
        if(ma_rows <= 1)
            continue;

        size_t ma_cols = ma.getNumColumns();
        size_t position = 0;
        for(size_t j = 0; j < ma_cols; ++j)
        {
            char s_symbol = ma.getSymbol(0, j);

            // Skip gaps
            if(s_symbol == '-' || s_symbol == '\0')
                continue;
            
            SymbolCountVector scv = ma.getSymbolCountVector(j);
            int s_symbol_count = 0;
            char max_symbol = 0;
            int max_count = 0;

            for(size_t k = 0; k < scv.size(); ++k)
            {
                if(scv[k].symbol == s_symbol)
                    s_symbol_count = scv[k].count;
                if(scv[k].count > max_count)
                {
                    max_count = scv[k].count;
                    max_symbol = scv[k].symbol;
                }
            }

            //printf("P: %zu S: %c M: %c MC: %d\n", position, s_symbol, max_symbol, max_count);

            // Call an error at this position if the consensus symbol differs from the read
            //    and the support for the read symbol is less than 4 and the consensus symbol
            //    is strongly supported.
            bool is_error = s_symbol != max_symbol && s_symbol_count < 4 && max_count >= 3;

#if HAVE_OPENMP
            #pragma omp critical
#endif
            {
                if(position >= position_count.size())
                {
                    position_count.resize(position+1);
                    error_count.resize(position+1);
                }

                position_count[position]++;
                error_count[position] += is_error;
            }
            position += 1;
        }
    }
    
    pWriter->String("ErrorsPerBase");
    pWriter->StartObject();
    
    pWriter->String("base_count");
    pWriter->StartArray();
    for(size_t i = 0; i < position_count.size(); ++i)
        pWriter->Int(position_count[i]);
    pWriter->EndArray();
    
    pWriter->String("error_count");
    pWriter->StartArray();
    for(size_t i = 0; i < position_count.size(); ++i)
        pWriter->Int(error_count[i]);
    pWriter->EndArray();

    pWriter->EndObject();
}