/** * Get coverage threshold for removing unitigs * * @param visited should be at least db_graph.ht.capcity bits long and initialised * to zero. On return, it will be 1 at each original kmer index * @param covgs_csv_path * @param lens_csv_path paths to files to write CSV histogram of unitigs coverages and lengths BEFORE ANY CLEANING. * If NULL these are ignored. * @return threshold to clean or -1 on error */ int cleaning_get_threshold(size_t num_threads, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, const dBGraph *db_graph) { // Estimate optimum cleaning threshold status("[cleaning] Calculating unitig stats with %zu threads...", num_threads); status("[cleaning] Using kmer gamma method"); // Get kmer coverages and unitig lengths UnitigCleaner cl; unitig_cleaner_alloc(&cl, num_threads, 0, 0, NULL, db_graph); supernodes_iterate(num_threads, visited, db_graph, unitig_get_covg, &cl); // Get kmer coverage only (faster) // KmerCleanerIterator kcls[nthreads]; // for(i = 0; i < nthreads; i++) // kcls[i] = (KmerCleanerIterator){.threadid = i, .nthreads = nthreads, .cl = &cl}; // util_run_threads(kcls, nthreads, sizeof(kcls[0]), nthreads, kmer_get_covg); // Wipe visited kmer memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.kmer_covgs_init, cl.unitig_covgs_init, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_init, cl.len_arrsize, db_graph->kmer_size); } // set threshold using histogram and genome size double alpha = 0, beta = 0, false_pos = 0, false_neg = 0; int threshold_est = cleaning_pick_kmer_threshold(cl.kmer_covgs_init, cl.covg_arrsize, &alpha, &beta, &false_pos, &false_neg); if(threshold_est < 0) warn("Cannot pick a cleaning threshold"); else { status("[cleaning] alpha=%f, beta=%f FP=%f FN=%f", alpha, beta, false_pos, false_neg); status("[cleaning] Recommended unitig cleaning threshold: < 
%i", threshold_est); } unitig_cleaner_dealloc(&cl); return threshold_est; }
// Returns 0 on success, otherwise != 0 int ctx_unitigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; UnitigSyntax syntax = PRINT_FASTA; bool dot_use_points = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'F': cmd_check(!syntax, cmd); syntax = PRINT_FASTA; break; case 'g': cmd_check(!syntax, cmd); syntax = PRINT_GFA; break; case 'd': cmd_check(!syntax, cmd); syntax = PRINT_DOT; break; case 'P': cmd_check(!dot_use_points, cmd); dot_use_points = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" unitigs -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } if(dot_use_points && syntax == PRINT_FASTA) cmd_print_usage("--point is only for use with --dot"); // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage(NULL); size_t i, num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; if(dot_use_points && syntax != PRINT_DOT) cmd_print_usage("--points only valid with --graphviz / --dot"); ctx_assert(num_gfiles > 0); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + 1; if(syntax != PRINT_FASTA) bits_per_kmer += sizeof(UnitigEnd) * 8; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); status("Output in %s format to %s\n", syntax_strs[syntax], futil_outpath_str(out_path)); // // Open output file // // Print to stdout unless --out <out> is specified FILE *fout = futil_fopen_create(out_path, "w"); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES); UnitigPrinter printer; unitig_printer_init(&printer, &db_graph, nthreads, syntax, fout); if(syntax == PRINT_DOT || syntax == PRINT_GFA) unitig_graph_alloc(&printer.ugraph, &db_graph); // Load graphs GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = false}; for(i = 0; i < num_gfiles; i++) { file_filter_flatten(&gfiles[i].fltr, 0); graph_load(&gfiles[i], gprefs, NULL); 
graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); switch(syntax) { case PRINT_FASTA: status("Printing unitgs in FASTA using %zu threads", nthreads); supernodes_iterate(nthreads, printer.visited, &db_graph, print_unitig_fasta, &printer); break; case PRINT_GFA: print_gfa_syntax(&printer); break; case PRINT_DOT: print_dot_syntax(&printer, dot_use_points); break; default: die("Invalid print syntax: %i", syntax); } char num_unitigs_str[50]; ulong_to_str(printer.num_unitigs, num_unitigs_str); status("Dumped %s unitigs\n", num_unitigs_str); fclose(fout); unitig_printer_destroy(&printer); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
/**
 * Clean a single-colour graph: drop unitigs whose coverage is below
 * `covg_threshold` and tips shorter than `min_keep_tip`.
 *
 * Nothing is done if the graph holds no kmers; a warning is printed and
 * nothing is done if both thresholds are zero.
 *
 * @param num_threads    Number of threads to use
 * @param covg_threshold Remove unitigs with coverage < `covg_threshold`.
 *                       Ignored if 0.
 * @param min_keep_tip   Remove tips with length < `min_keep_tip`.
 *                       Ignored if 0.
 * @param covgs_csv_path Path to write CSV of the coverage histogram recorded
 *                       in the cleaner's `*_clean` arrays. Skipped if NULL.
 * @param lens_csv_path  Path to write CSV of the unitig length histogram
 *                       recorded in `len_hist_clean`. Skipped if NULL.
 * @param visited        Scratch bitset, see below
 * @param keep           Scratch bitset, see below
 * @param db_graph       Graph to clean; must have exactly one colour and at
 *                       least one edge colour (asserted)
 *
 * `visited` and `keep` should each be at least db_graph->ht.capacity bits long
 * and initialised to zero. They are used while marking / pruning and are both
 * zeroed again before a cleaning run returns.
 **/
void clean_graph(size_t num_threads,
                 size_t covg_threshold, size_t min_keep_tip,
                 const char *covgs_csv_path, const char *lens_csv_path,
                 uint8_t *visited, uint8_t *keep,
                 dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1);
  ctx_assert(db_graph->num_edge_cols > 0);

  // Remember the starting size for the summary; an empty graph needs no work
  const size_t nkmers_before = db_graph->ht.num_kmers;
  if(nkmers_before == 0) return;

  if(!covg_threshold && !min_keep_tip)
  {
    warn("[cleaning] No cleaning specified");
    return;
  }

  if(covg_threshold > 0)
  {
    status("[cleaning] Removing unitigs with coverage < %zu...", covg_threshold);
    status("[cleaning] Using kmer gamma method");
  }

  if(min_keep_tip > 0)
  {
    status("[cleaning] Removing tips shorter than %zu...", min_keep_tip);
  }

  status("[cleaning] using %zu threads", num_threads);

  // Pass 1: set a bit in `keep` for every kmer that survives cleaning
  UnitigCleaner cleaner;
  unitig_cleaner_alloc(&cleaner, num_threads, covg_threshold, min_keep_tip,
                       keep, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, unitig_mark, &cleaner);

  // Report what is about to be removed: unitig counts...
  char low_covg_str[50], tips_str[50], both_str[50];
  ulong_to_str(cleaner.num_low_covg_snodes, low_covg_str);
  ulong_to_str(cleaner.num_tips, tips_str);
  ulong_to_str(cleaner.num_tip_and_low_snodes, both_str);

  // ...and the number of kmers they contain
  char low_covg_kmers_str[50], tip_kmers_str[50], both_kmers_str[50];
  ulong_to_str(cleaner.num_low_covg_snode_kmers, low_covg_kmers_str);
  ulong_to_str(cleaner.num_tip_kmers, tip_kmers_str);
  ulong_to_str(cleaner.num_tip_and_low_snode_kmers, both_kmers_str);

  status("[cleaning] Removing %s low coverage unitigs [%s kmer%s], "
         "%s unitig tips [%s kmer%s] "
         "and %s of both [%s kmer%s]",
         low_covg_str, low_covg_kmers_str,
         util_plural_str(cleaner.num_low_covg_snode_kmers),
         tips_str, tip_kmers_str,
         util_plural_str(cleaner.num_tip_kmers),
         both_str, both_kmers_str,
         util_plural_str(cleaner.num_tip_and_low_snode_kmers));

  // Pass 2: drop every kmer whose `keep` bit is not set
  prune_nodes_lacking_flag(num_threads, keep, db_graph);

  // Return both scratch bitsets to the caller zeroed
  const size_t bitset_nbytes = roundup_bits2bytes(db_graph->ht.capacity);
  memset(visited, 0, bitset_nbytes);
  memset(keep, 0, bitset_nbytes);

  // Summary; nkmers_before is non-zero here so the percentage is well defined
  const size_t nkmers_after = db_graph->ht.num_kmers;
  const size_t nkmers_removed = nkmers_before - nkmers_after;
  const double pct_removed = (100.0 * nkmers_removed) / nkmers_before;

  char nkmers_after_str[100], nkmers_removed_str[100];
  ulong_to_str(nkmers_after, nkmers_after_str);
  ulong_to_str(nkmers_removed, nkmers_removed_str);

  status("[cleaning] Remaining kmers: %s removed: %s (%.1f%%)",
         nkmers_after_str, nkmers_removed_str, pct_removed);

  // Optional dumps of the post-cleaning histograms
  if(covgs_csv_path != NULL)
  {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.kmer_covgs_clean,
                                  cleaner.unitig_covg_clean,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path != NULL)
  {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_clean,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  unitig_cleaner_dealloc(&cleaner);
}
/** * Get coverage threshold for removing supernodes * * @param visited should be at least db_graph.ht.capcity bits long and initialised * to zero. On return, it will be 1 at each original kmer index * @param covgs_csv_path * @param lens_csv_path paths to files to write CSV histogram of supernodes coverages and lengths BEFORE ANY CLEANING. * If NULL these are ignored. * @return threshold to clean or -1 on error */ int cleaning_get_threshold(size_t num_threads, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, const dBGraph *db_graph) { // Estimate optimum cleaning threshold status("[cleaning] Calculating supernode stats with %zu threads...", num_threads); status("[cleaning] Using kmer gamma method"); // Get kmer coverages and supernode lengths SupernodeCleaner cl; supernode_cleaner_alloc(&cl, num_threads, 0, 0, NULL, db_graph); supernodes_iterate(num_threads, visited, db_graph, supernode_get_covg, &cl); // Get kmer coverage only (faster) // KmerCleanerIterator kcls[nthreads]; // for(i = 0; i < nthreads; i++) // kcls[i] = (KmerCleanerIterator){.threadid = i, .nthreads = nthreads, .cl = &cl}; // util_run_threads(kcls, nthreads, sizeof(kcls[0]), nthreads, kmer_get_covg); // Wipe visited kmer memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.covg_hist_init, cl.mean_covg_hist_init, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_init, cl.len_arrsize, db_graph->kmer_size); } // set threshold using histogram and genome size int threshold_est = -1; double fdr = 0.001, alpha = 0, beta = 0; while(fdr < 1) { threshold_est = cleaning_pick_kmer_threshold(cl.covg_hist_init, cl.covg_arrsize, fdr, &alpha, &beta); if(threshold_est >= 0) break; fdr *= 10; } if(threshold_est < 0) warn("Cannot pick a cleaning threshold"); else status("[cleaning] FDR set to %f [alpha=%f, beta=%f]", fdr, alpha, beta); 
if(threshold_est >= 0) { status("[cleaning] Recommended supernode cleaning threshold: < %i", threshold_est); } supernode_cleaner_dealloc(&cl); return threshold_est; }