/** * Get coverage threshold for removing unitigs * * @param visited should be at least db_graph.ht.capcity bits long and initialised * to zero. On return, it will be 1 at each original kmer index * @param covgs_csv_path * @param lens_csv_path paths to files to write CSV histogram of unitigs coverages and lengths BEFORE ANY CLEANING. * If NULL these are ignored. * @return threshold to clean or -1 on error */ int cleaning_get_threshold(size_t num_threads, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, const dBGraph *db_graph) { // Estimate optimum cleaning threshold status("[cleaning] Calculating unitig stats with %zu threads...", num_threads); status("[cleaning] Using kmer gamma method"); // Get kmer coverages and unitig lengths UnitigCleaner cl; unitig_cleaner_alloc(&cl, num_threads, 0, 0, NULL, db_graph); supernodes_iterate(num_threads, visited, db_graph, unitig_get_covg, &cl); // Get kmer coverage only (faster) // KmerCleanerIterator kcls[nthreads]; // for(i = 0; i < nthreads; i++) // kcls[i] = (KmerCleanerIterator){.threadid = i, .nthreads = nthreads, .cl = &cl}; // util_run_threads(kcls, nthreads, sizeof(kcls[0]), nthreads, kmer_get_covg); // Wipe visited kmer memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.kmer_covgs_init, cl.unitig_covgs_init, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_init, cl.len_arrsize, db_graph->kmer_size); } // set threshold using histogram and genome size double alpha = 0, beta = 0, false_pos = 0, false_neg = 0; int threshold_est = cleaning_pick_kmer_threshold(cl.kmer_covgs_init, cl.covg_arrsize, &alpha, &beta, &false_pos, &false_neg); if(threshold_est < 0) warn("Cannot pick a cleaning threshold"); else { status("[cleaning] alpha=%f, beta=%f FP=%f FN=%f", alpha, beta, false_pos, false_neg); status("[cleaning] Recommended unitig cleaning threshold: < 
%i", threshold_est); } unitig_cleaner_dealloc(&cl); return threshold_est; }
/**
 * Remove unitigs with coverage < `covg_threshold` and tips shorter than
 * `min_keep_tip`.
 *
 * Kmers to retain are marked in `keep` by running `unitig_mark` over the
 * graph, then every kmer lacking that mark is pruned. Progress and summary
 * counts are reported through status().
 *
 * Returns early without touching the graph or the bitsets if the graph holds
 * no kmers, or if both `covg_threshold` and `min_keep_tip` are 0 (a warning
 * is printed in the latter case).
 *
 * @param num_threads    Number of threads to use
 * @param covg_threshold Remove unitigs with mean covg < `covg_threshold`.
 *                       Ignored if 0.
 * @param min_keep_tip   Remove tips with length < `min_keep_tip`.
 *                       Ignored if 0.
 * @param covgs_csv_path Path to write CSV of the cleaner's `*_clean` coverage
 *                       histograms. Ignored if NULL.
 * @param lens_csv_path  Path to write CSV of the cleaner's `len_hist_clean`
 *                       unitig length histogram. Ignored if NULL.
 * @param visited        scratch bitset, see below
 * @param keep           scratch bitset, see below
 * @param db_graph       graph to clean; must have exactly one colour and at
 *                       least one edge colour (asserted)
 *
 * `visited`, `keep` should each be at least db_graph->ht.capacity bits long
 * and initialised to zero. Both are wiped back to zero before a normal return.
 **/
void clean_graph(size_t num_threads,
                 size_t covg_threshold, size_t min_keep_tip,
                 const char *covgs_csv_path, const char *lens_csv_path,
                 uint8_t *visited, uint8_t *keep,
                 dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1);
  ctx_assert(db_graph->num_edge_cols > 0);

  // Remember the starting size so the removed fraction can be reported
  size_t nkmers_before = db_graph->ht.num_kmers;

  if(nkmers_before == 0) return;

  if(!covg_threshold && !min_keep_tip) {
    warn("[cleaning] No cleaning specified");
    return;
  }

  if(covg_threshold > 0) {
    status("[cleaning] Removing unitigs with coverage < %zu...", covg_threshold);
    status("[cleaning] Using kmer gamma method");
  }

  if(min_keep_tip > 0) {
    status("[cleaning] Removing tips shorter than %zu...", min_keep_tip);
  }

  status("[cleaning] using %zu threads", num_threads);

  // Pass 1: flag every kmer we want to retain in `keep`
  UnitigCleaner cleaner;
  unitig_cleaner_alloc(&cleaner, num_threads, covg_threshold, min_keep_tip,
                       keep, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, unitig_mark, &cleaner);

  // Report what is about to be removed: unitig counts...
  char low_unitigs_str[50], tips_str[50], both_unitigs_str[50];
  ulong_to_str(cleaner.num_low_covg_snodes, low_unitigs_str);
  ulong_to_str(cleaner.num_tips, tips_str);
  ulong_to_str(cleaner.num_tip_and_low_snodes, both_unitigs_str);

  // ...and the kmer counts that go with them
  char low_kmers_str[50], tip_kmers_str[50], both_kmers_str[50];
  ulong_to_str(cleaner.num_low_covg_snode_kmers, low_kmers_str);
  ulong_to_str(cleaner.num_tip_kmers, tip_kmers_str);
  ulong_to_str(cleaner.num_tip_and_low_snode_kmers, both_kmers_str);

  status("[cleaning] Removing %s low coverage unitigs [%s kmer%s], "
         "%s unitig tips [%s kmer%s] "
         "and %s of both [%s kmer%s]",
         low_unitigs_str, low_kmers_str,
         util_plural_str(cleaner.num_low_covg_snode_kmers),
         tips_str, tip_kmers_str,
         util_plural_str(cleaner.num_tip_kmers),
         both_unitigs_str, both_kmers_str,
         util_plural_str(cleaner.num_tip_and_low_snode_kmers));

  // Pass 2: drop every kmer that was not flagged in `keep`
  prune_nodes_lacking_flag(num_threads, keep, db_graph);

  // Hand both scratch bitsets back zeroed
  size_t bitset_nbytes = roundup_bits2bytes(db_graph->ht.capacity);
  memset(visited, 0, bitset_nbytes);
  memset(keep, 0, bitset_nbytes);

  // Summarise the outcome. nkmers_before is non-zero here (checked above),
  // so the percentage is well defined.
  size_t nkmers_after = db_graph->ht.num_kmers;
  size_t nkmers_removed = nkmers_before - nkmers_after;
  double pct_removed = (100.0*nkmers_removed)/nkmers_before;

  char after_str[100], removed_str[100];
  ulong_to_str(nkmers_after, after_str);
  ulong_to_str(nkmers_removed, removed_str);
  status("[cleaning] Remaining kmers: %s removed: %s (%.1f%%)",
         after_str, removed_str, pct_removed);

  if(covgs_csv_path) {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.kmer_covgs_clean,
                                  cleaner.unitig_covg_clean,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path) {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_clean,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  unitig_cleaner_dealloc(&cleaner);
}
/** * Get coverage threshold for removing supernodes * * @param visited should be at least db_graph.ht.capcity bits long and initialised * to zero. On return, it will be 1 at each original kmer index * @param covgs_csv_path * @param lens_csv_path paths to files to write CSV histogram of supernodes coverages and lengths BEFORE ANY CLEANING. * If NULL these are ignored. * @return threshold to clean or -1 on error */ int cleaning_get_threshold(size_t num_threads, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, const dBGraph *db_graph) { // Estimate optimum cleaning threshold status("[cleaning] Calculating supernode stats with %zu threads...", num_threads); status("[cleaning] Using kmer gamma method"); // Get kmer coverages and supernode lengths SupernodeCleaner cl; supernode_cleaner_alloc(&cl, num_threads, 0, 0, NULL, db_graph); supernodes_iterate(num_threads, visited, db_graph, supernode_get_covg, &cl); // Get kmer coverage only (faster) // KmerCleanerIterator kcls[nthreads]; // for(i = 0; i < nthreads; i++) // kcls[i] = (KmerCleanerIterator){.threadid = i, .nthreads = nthreads, .cl = &cl}; // util_run_threads(kcls, nthreads, sizeof(kcls[0]), nthreads, kmer_get_covg); // Wipe visited kmer memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.covg_hist_init, cl.mean_covg_hist_init, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_init, cl.len_arrsize, db_graph->kmer_size); } // set threshold using histogram and genome size int threshold_est = -1; double fdr = 0.001, alpha = 0, beta = 0; while(fdr < 1) { threshold_est = cleaning_pick_kmer_threshold(cl.covg_hist_init, cl.covg_arrsize, fdr, &alpha, &beta); if(threshold_est >= 0) break; fdr *= 10; } if(threshold_est < 0) warn("Cannot pick a cleaning threshold"); else status("[cleaning] FDR set to %f [alpha=%f, beta=%f]", fdr, alpha, beta); 
if(threshold_est >= 0) { status("[cleaning] Recommended supernode cleaning threshold: < %i", threshold_est); } supernode_cleaner_dealloc(&cl); return threshold_est; }