/**
 * Pick a coverage cutoff for each histogram row and print a summary.
 *
 * Row 0 of `hists` is skipped (not informative). For every other row the sum
 * of its entries and the threshold returned by cleaning_pick_kmer_threshold()
 * are collected. Three lines are then written to `fh`:
 *
 *   sumcovgs=<row 1 sum>,<row 2 sum>,...
 *   cutoffs=<row 1 cutoff>,<row 2 cutoff>,...
 *   suggested_cutoff=<median of the cutoffs>
 *
 * Rows for which no threshold could be picked get a cutoff of 0 and are
 * reported in a single warning once printing is done.
 *
 * @param hist_distsize number of rows in `hists`; at least 2 are needed since
 *                      row 0 is ignored
 * @param hist_covgsize number of entries in each row of `hists`
 * @param hists         table of hist_distsize rows by hist_covgsize columns
 * @param fh            stream to print the summary to
 */
static void print_suggest_cutoff(size_t hist_distsize, size_t hist_covgsize,
                                 uint64_t (*hists)[hist_covgsize], FILE *fh)
{
  // Row 0 is never used, so with fewer than two rows there is nothing to
  // report. Bail out before declaring the arrays: a zero-length VLA is
  // undefined, and index 1 would be out of bounds below.
  if(hist_distsize < 2) {
    warn("Too few histograms to suggest a cutoff [%zu]", hist_distsize);
    return;
  }

  size_t i, dist, median, nthresh_failed = 0;
  size_t cutoffs[hist_distsize], sumcovgs[hist_distsize];

  // Variable length arrays cannot take an initialiser
  memset(cutoffs, 0, sizeof(cutoffs));
  memset(sumcovgs, 0, sizeof(sumcovgs));

  // Don't use dist[0] -- not informative
  for(dist = 1; dist < hist_distsize; dist++)
  {
    int t = cleaning_pick_kmer_threshold(hists[dist], hist_covgsize,
                                         NULL, NULL, NULL, NULL);

    // Negative return means no threshold could be picked: default to 0
    if(t < 0) {
      nthresh_failed++;
      t = 0;
    }
    cutoffs[dist] = (size_t)t;

    for(i = 0; i < hist_covgsize; i++)
      sumcovgs[dist] += hists[dist][i];
  }

  // Print per-row totals and cutoffs as comma separated lists
  fprintf(fh, "sumcovgs=%zu", sumcovgs[1]);
  for(i = 2; i < hist_distsize; i++)
    fprintf(fh, ",%zu", sumcovgs[i]);

  fprintf(fh, "\ncutoffs=%zu", cutoffs[1]);
  for(i = 2; i < hist_distsize; i++)
    fprintf(fh, ",%zu", cutoffs[i]);
  fprintf(fh, "\n");

  // Median over rows 1..hist_distsize-1 only
  median = gca_median_size(cutoffs+1, hist_distsize-1);
  fprintf(fh, "suggested_cutoff=%zu\n", median);

  if(nthresh_failed)
    warn("Threshold failed in %zu cases [default to 0]", nthresh_failed);
}
/** * Get coverage threshold for removing unitigs * * @param visited should be at least db_graph.ht.capcity bits long and initialised * to zero. On return, it will be 1 at each original kmer index * @param covgs_csv_path * @param lens_csv_path paths to files to write CSV histogram of unitigs coverages and lengths BEFORE ANY CLEANING. * If NULL these are ignored. * @return threshold to clean or -1 on error */ int cleaning_get_threshold(size_t num_threads, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, const dBGraph *db_graph) { // Estimate optimum cleaning threshold status("[cleaning] Calculating unitig stats with %zu threads...", num_threads); status("[cleaning] Using kmer gamma method"); // Get kmer coverages and unitig lengths UnitigCleaner cl; unitig_cleaner_alloc(&cl, num_threads, 0, 0, NULL, db_graph); supernodes_iterate(num_threads, visited, db_graph, unitig_get_covg, &cl); // Get kmer coverage only (faster) // KmerCleanerIterator kcls[nthreads]; // for(i = 0; i < nthreads; i++) // kcls[i] = (KmerCleanerIterator){.threadid = i, .nthreads = nthreads, .cl = &cl}; // util_run_threads(kcls, nthreads, sizeof(kcls[0]), nthreads, kmer_get_covg); // Wipe visited kmer memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.kmer_covgs_init, cl.unitig_covgs_init, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_init, cl.len_arrsize, db_graph->kmer_size); } // set threshold using histogram and genome size double alpha = 0, beta = 0, false_pos = 0, false_neg = 0; int threshold_est = cleaning_pick_kmer_threshold(cl.kmer_covgs_init, cl.covg_arrsize, &alpha, &beta, &false_pos, &false_neg); if(threshold_est < 0) warn("Cannot pick a cleaning threshold"); else { status("[cleaning] alpha=%f, beta=%f FP=%f FN=%f", alpha, beta, false_pos, false_neg); status("[cleaning] Recommended unitig cleaning threshold: < 
%i", threshold_est); } unitig_cleaner_dealloc(&cl); return threshold_est; }
/** * Get coverage threshold for removing supernodes * * @param visited should be at least db_graph.ht.capcity bits long and initialised * to zero. On return, it will be 1 at each original kmer index * @param covgs_csv_path * @param lens_csv_path paths to files to write CSV histogram of supernodes coverages and lengths BEFORE ANY CLEANING. * If NULL these are ignored. * @return threshold to clean or -1 on error */ int cleaning_get_threshold(size_t num_threads, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, const dBGraph *db_graph) { // Estimate optimum cleaning threshold status("[cleaning] Calculating supernode stats with %zu threads...", num_threads); status("[cleaning] Using kmer gamma method"); // Get kmer coverages and supernode lengths SupernodeCleaner cl; supernode_cleaner_alloc(&cl, num_threads, 0, 0, NULL, db_graph); supernodes_iterate(num_threads, visited, db_graph, supernode_get_covg, &cl); // Get kmer coverage only (faster) // KmerCleanerIterator kcls[nthreads]; // for(i = 0; i < nthreads; i++) // kcls[i] = (KmerCleanerIterator){.threadid = i, .nthreads = nthreads, .cl = &cl}; // util_run_threads(kcls, nthreads, sizeof(kcls[0]), nthreads, kmer_get_covg); // Wipe visited kmer memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.covg_hist_init, cl.mean_covg_hist_init, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_init, cl.len_arrsize, db_graph->kmer_size); } // set threshold using histogram and genome size int threshold_est = -1; double fdr = 0.001, alpha = 0, beta = 0; while(fdr < 1) { threshold_est = cleaning_pick_kmer_threshold(cl.covg_hist_init, cl.covg_arrsize, fdr, &alpha, &beta); if(threshold_est >= 0) break; fdr *= 10; } if(threshold_est < 0) warn("Cannot pick a cleaning threshold"); else status("[cleaning] FDR set to %f [alpha=%f, beta=%f]", fdr, alpha, beta); 
if(threshold_est >= 0) { status("[cleaning] Recommended supernode cleaning threshold: < %i", threshold_est); } supernode_cleaner_dealloc(&cl); return threshold_est; }