Esempio n. 1
0
// Build the occurrence-count distribution for strings of length kmerSize.
//
// For each of the sampleSize rounds, one string is obtained from
// sampleRandomString(pBWT, kmerSize), its number of occurrences in pBWT is
// looked up with countSequenceOccurrences, and that count is recorded.
//
// kmerSize:   length passed to sampleRandomString for every draw
// sampleSize: number of strings to draw
// pBWT:       index used both for drawing the strings and for counting them
// Returns the KmerDistribution holding one count per drawn string.
KmerDistribution BWTAlgorithms::sampleKmerCounts(size_t kmerSize, size_t sampleSize, const BWT* pBWT)
{
    KmerDistribution result;

    size_t drawn = 0;
    while(drawn != sampleSize)
    {
        const std::string kmer = sampleRandomString(pBWT, kmerSize);
        const int occurrences = countSequenceOccurrences(kmer, pBWT);
        result.add(occurrences);
        ++drawn;
    }

    return result;
}
Esempio n. 2
0
// Learn the k-mer count threshold used by the kmer corrector.
//
// Draws n_samples strings with BWTAlgorithms::sampleRandomString, counts the
// occurrences in pBWT of every k-mer (k = opt::kmerLength) of each string,
// and accumulates the counts in a KmerDistribution. The threshold is the
// value returned by KmerDistribution::findErrorBoundaryByRatio.
//
// pBWT: index the strings are drawn from and the k-mers are counted in.
// Returns the chosen threshold. Terminates the process with EXIT_FAILURE
// when findErrorBoundaryByRatio reports failure (-1). Prints progress to
// std::cout and diagnostics to std::cerr.
int learnKmerParameters(const BWT* pBWT)
{
    std::cout << "Learning kmer parameters\n";

    // Seed the C library generator from the wall clock.
    // NOTE(review): presumably sampleRandomString relies on rand() - confirm.
    srand(time(0));
    const size_t n_samples = 10000;

    // Accumulate the occurrence count of every k-mer of every sampled string.
    KmerDistribution kmerDistribution;
    const int k = opt::kmerLength;
    for(size_t i = 0; i < n_samples; ++i)
    {
        std::string s = BWTAlgorithms::sampleRandomString(pBWT);
        int n = s.size();
        // Number of k-mers in s; zero or negative when s is shorter than k,
        // in which case the inner loop does not execute.
        int nk = n - k + 1;
        for(int j = 0; j < nk; ++j)
        {
            std::string kmer = s.substr(j, k);
            int count = BWTAlgorithms::countSequenceOccurrences(kmer, pBWT);
            kmerDistribution.add(count);
        }
    }

    // Show the learned distribution to the user.
    kmerDistribution.print(75);

    const double ratio = 2.0;
    const int chosenThreshold = kmerDistribution.findErrorBoundaryByRatio(ratio);

    // Bail out before using the threshold: -1 is a failure sentinel, not a
    // count, so no further statistics should be derived from it.
    if(chosenThreshold == -1)
    {
        std::cerr << "[sga correct] Error k-mer threshold learning failed\n";
        std::cerr << "[sga correct] This can indicate the k-mer you choose is too high or your data has very low coverage\n";
        exit(EXIT_FAILURE);
    }

    const double cumulativeLEQ = kmerDistribution.getCumulativeProportionLEQ(chosenThreshold);
    const double proportionAbove = 1.0 - cumulativeLEQ;

    std::cout << "Chosen kmer threshold: " << chosenThreshold << "\n";
    std::cout << "Proportion of kmer density right of threshold: " << proportionAbove << "\n";
    if(cumulativeLEQ > 0.25)
    {
        // Report the proportion the message actually talks about (the part
        // above the threshold) and close the parenthesis.
        std::cerr << "[sga correct] Warning: Proportion of kmers greater than the chosen threshold is less than 0.75 (" << proportionAbove << ")\n";
        std::cerr << "[sga correct] This can indicate your chosen kmer size is too large or your data is too low coverage to reliably correct\n";
        std::cerr << "[sga correct] It is suggest to lower the kmer size and/or choose the threshold manually\n";
    }

    return chosenThreshold;
}
Esempio n. 3
0
// Write a "KmerDistribution" JSON object describing sampled k-mer coverage.
//
// Draws n_samples strings with BWTAlgorithms::sampleRandomString, counts the
// occurrences in index_set.pBWT of every k-mer of each string (k is fixed at
// 51 here), and accumulates the counts in a KmerDistribution. The emitted
// object has the form:
//   "KmerDistribution": { "k": <k>,
//                         "distribution": [ {"kmer-depth": d, "count": c}, ... ] }
//
// pWriter:   JSON writer receiving the output; must not be null.
// index_set: provides the pBWT used for sampling and counting.
void generate_kmer_coverage(JSONWriter* pWriter, const BWTIndexSet& index_set)
{
    pWriter->String("KmerDistribution");
    pWriter->StartObject();
    const size_t n_samples = 10000;
    const size_t k = 51;

    pWriter->String("k");
    pWriter->Int(static_cast<int>(k));

    KmerDistribution kmerDistribution;
    for(size_t i = 0; i < n_samples; ++i)
    {
        const std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);

        // A string shorter than k contains no k-mers. Checking this up front
        // keeps the subtraction below from wrapping around in unsigned
        // arithmetic.
        if(s.size() < k)
            continue;

        const size_t nk = s.size() - k + 1;
        for(size_t j = 0; j < nk; ++j)
        {
            const std::string kmer = s.substr(j, k);
            const int count = BWTAlgorithms::countSequenceOccurrences(kmer, index_set.pBWT);
            kmerDistribution.add(count);
        }
    }

    pWriter->String("distribution");
    pWriter->StartArray();

    // Limit the output to the cutoff KmerDistribution reports for 0.95.
    const int max = kmerDistribution.getCutoffForProportion(0.95f);
    const std::vector<int> count_vector = kmerDistribution.toCountVector(max);

    // Index 0 is intentionally not emitted; output starts at depth 1.
    for(size_t i = 1; i < count_vector.size(); ++i)
    {
        pWriter->StartObject();
        pWriter->String("kmer-depth");
        pWriter->Int(static_cast<int>(i));
        pWriter->String("count");
        pWriter->Int(count_vector[i]);
        pWriter->EndObject();
    }

    pWriter->EndArray();
    pWriter->EndObject();
}