// Build the occurrence-count distribution for strings of length kmerSize.
//
// Draws sampleSize strings with sampleRandomString(pBWT, kmerSize), looks up
// how many times each one occurs in pBWT, and records every count in the
// returned KmerDistribution.
KmerDistribution BWTAlgorithms::sampleKmerCounts(size_t kmerSize, size_t sampleSize, const BWT* pBWT)
{
    KmerDistribution sampled;

    // Count down instead of up; the loop body runs exactly sampleSize times.
    for(size_t remaining = sampleSize; remaining > 0; --remaining)
    {
        const std::string kmer = sampleRandomString(pBWT, kmerSize);
        const int occurrences = countSequenceOccurrences(kmer, pBWT);
        sampled.add(occurrences);
    }

    return sampled;
}
// Learn parameters of the kmer corrector int learnKmerParameters(const BWT* pBWT) { std::cout << "Learning kmer parameters\n"; srand(time(0)); size_t n_samples = 10000; // KmerDistribution kmerDistribution; int k = opt::kmerLength; for(size_t i = 0; i < n_samples; ++i) { std::string s = BWTAlgorithms::sampleRandomString(pBWT); int n = s.size(); int nk = n - k + 1; for(int j = 0; j < nk; ++j) { std::string kmer = s.substr(j, k); int count = BWTAlgorithms::countSequenceOccurrences(kmer, pBWT); kmerDistribution.add(count); } } // kmerDistribution.print(75); double ratio = 2.0f; int chosenThreshold = kmerDistribution.findErrorBoundaryByRatio(ratio); double cumulativeLEQ = kmerDistribution.getCumulativeProportionLEQ(chosenThreshold); if(chosenThreshold == -1) { std::cerr << "[sga correct] Error k-mer threshold learning failed\n"; std::cerr << "[sga correct] This can indicate the k-mer you choose is too high or your data has very low coverage\n"; exit(EXIT_FAILURE); } std::cout << "Chosen kmer threshold: " << chosenThreshold << "\n"; std::cout << "Proportion of kmer density right of threshold: " << 1.0f - cumulativeLEQ << "\n"; if(cumulativeLEQ > 0.25f) { std::cerr << "[sga correct] Warning: Proportion of kmers greater than the chosen threshold is less than 0.75 (" << cumulativeLEQ << "\n"; std::cerr << "[sga correct] This can indicate your chosen kmer size is too large or your data is too low coverage to reliably correct\n"; std::cerr << "[sga correct] It is suggest to lower the kmer size and/or choose the threshold manually\n"; } return chosenThreshold; }
// Write a "KmerDistribution" JSON object describing sampled 51-mer coverage.
//
// Samples n_samples strings with BWTAlgorithms::sampleRandomString, counts the
// occurrences in index_set.pBWT of every substring of length k of each sample,
// and emits:
//   "k"            - the k-mer length used
//   "distribution" - an array of {"kmer-depth": i, "count": count_vector[i]}
//                    objects for i >= 1 (index 0 is not written), where
//                    count_vector is toCountVector() up to the value returned
//                    by getCutoffForProportion(0.95f).
void generate_kmer_coverage(JSONWriter* pWriter, const BWTIndexSet& index_set)
{
    pWriter->String("KmerDistribution");
    pWriter->StartObject();

    const size_t n_samples = 10000;
    const size_t k = 51;

    pWriter->String("k");
    pWriter->Int(static_cast<int>(k));

    KmerDistribution kmerDistribution;
    for(size_t i = 0; i < n_samples; ++i)
    {
        std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);

        // A sample shorter than k contains no k-mers. Guard explicitly rather
        // than relying on unsigned wrap-around of (size - k + 1) being
        // narrowed back to a negative int.
        if(s.size() < k)
            continue;

        const size_t nk = s.size() - k + 1;
        for(size_t j = 0; j < nk; ++j)
        {
            std::string kmer = s.substr(j, k);
            int count = BWTAlgorithms::countSequenceOccurrences(kmer, index_set.pBWT);
            kmerDistribution.add(count);
        }
    }

    pWriter->String("distribution");
    pWriter->StartArray();
    int max = kmerDistribution.getCutoffForProportion(0.95f);
    std::vector<int> count_vector = kmerDistribution.toCountVector(max);

    // Start at 1: the entry at index 0 is intentionally not reported.
    for(size_t i = 1; i < count_vector.size(); ++i)
    {
        pWriter->StartObject();
        pWriter->String("kmer-depth");
        pWriter->Int(static_cast<int>(i));
        pWriter->String("count");
        pWriter->Int(count_vector[i]);
        pWriter->EndObject();
    }

    pWriter->EndArray();
    pWriter->EndObject();
}