// Print file filter description void file_filter_status(const FileFilter *fltr) { size_t i; pthread_mutex_lock(&ctx_biglock); timestamp(); message("[FileFilter] Loading file %s [%u colour%s]", file_filter_path(fltr), fltr->filencols, util_plural_str(fltr->filencols)); if(!file_filter_is_direct(fltr)) { message(" with filter: %u->%u", file_filter_fromcol(fltr, 0), file_filter_intocol(fltr, 0)); for(i = 1; i < file_filter_num(fltr); i++) message(",%u->%u", file_filter_fromcol(fltr,i), file_filter_intocol(fltr,i)); } message("\n"); pthread_mutex_unlock(&ctx_biglock); }
int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; bool tip_cleaning = false, supernode_cleaning = false; size_t min_keep_tip = 0; Covg threshold = 0, fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(!tip_cleaning, cmd); min_keep_tip = cmd_uint32_nonzero(cmd, optarg); tip_cleaning = true; break; case 'S': cmd_check(!supernode_cleaning, cmd); if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg); supernode_cleaning = true; break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); // Default behaviour if(!tip_cleaning && !supernode_cleaning) { if(out_ctx_path != NULL) supernode_cleaning = tip_cleaning = true; // do both else warn("No cleaning being done: you did not specify --out <out.ctx>"); } bool doing_cleaning = (supernode_cleaning || tip_cleaning); if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { cmd_print_usage("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -s, --supernodes or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !supernode_cleaning) cmd_print_usage("-B, --fallback <T> without --supernodes"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(!doing_cleaning) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu colours", num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(tip_cleaning && min_keep_tip == 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol, intocol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && supernode_cleaning) { warn("%s:%zu already has supernode cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving supernode length distribution to: %s", step++, len_before_path); if(tip_cleaning) status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip); if(supernode_cleaning && threshold > 0) status("%zu. Cleaning supernodes with coverage < %u", step++, threshold); if(supernode_cleaning && threshold <= 0) status("%zu. Cleaning supernodes with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. Saving supernode length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8; size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8); bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) / (per_kmer_per_col_bits * kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // Edges is a special case size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded); db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges)); // Load graph into a single colour LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); outhdr.version = CTX_GRAPH_FILEFORMAT; outhdr.kmer_size = db_graph.kmer_size; outhdr.num_of_cols = ncols; outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64; graph_header_alloc(&outhdr, ncols); // Merge info into header size_t gcol = 0; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); intocol = file_filter_intocol(&gfiles[i].fltr, j); graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]); } } if(ncols > use_ncols) { graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats); } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, &stats); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: %s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path) { // Get coverage distribution and estimate cleaning threshold int est_threshold = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_threshold < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_threshold); // Use estimated threshold if threshold not set if(threshold <= 0) { if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); threshold = fallback_thresh; } else if(est_threshold >= 0) threshold = est_threshold; } } // Die if we failed to find suitable cleaning threshold if(supernode_cleaning && threshold <= 0) die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)"); if(doing_cleaning) { // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0) clean_graph(nthreads, threshold, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(doing_cleaning) { // Output graph file Edges *intersect_edges = NULL; bool kmers_loaded = true; size_t col, thresh; // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= supernode_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(supernode_cleaning) { thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold) : (uint32_t)threshold; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } if(!all_colours_loaded) { // We haven't loaded all the colours // intersect_edges are edges to mask with // resets graph edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); graph_files_merge(out_ctx_path, gfiles, num_gfiles, kmers_loaded, all_colours_loaded, intersect_edges, &outhdr, &db_graph); // Swap back if(!all_colours_loaded) db_graph.col_edges = intersect_edges; } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_sort(int argc, char **argv) { const char *out_path = NULL; struct MemArgs memargs = MEM_ARGS_INIT; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" sort -h` for help. Bad option: %s", argv[optind-1]); default: die("Bad option: [%c]: %s", c, cmd); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, out_path ? "r" : "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); size_t num_kmers, memory; // Reading from a stream if(gfile.num_of_kmers < 0) { if(!memargs.num_kmers_set) die("If reading from a stream, must give -n <num_kmers>"); num_kmers = memargs.num_kmers; } else num_kmers = gfile.num_of_kmers; // Open output path (if given) FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; size_t i; size_t ncols = gfile.hdr.num_of_cols; size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*ncols; memory = (sizeof(char*) + kmer_mem) * num_kmers; char mem_str[50]; bytes_to_str(memory, 1, mem_str); if(memory > memargs.mem_to_use) die("Require at least %s memory", mem_str); status("[memory] Total: %s", mem_str); char *mem = ctx_malloc(kmer_mem * num_kmers); char **kmers = ctx_malloc(num_kmers*sizeof(char*)); // Read in whole file // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem); if(nkread != num_kmers*kmer_mem) die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem); // check we are at the end of the file char tmpc; if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) { die("More kmers in file than believed (kmers: %zu ncols: %zu).", num_kmers, ncols); } status("Read %zu kmers with %zu colour%s", num_kmers, ncols, util_plural_str(ncols)); for(i = 0; i < num_kmers; i++) kmers[i] = mem + kmer_mem*i; sort_block(kmers, num_kmers); // Print if(out_path != NULL) { // saving to a different destination - write header graph_write_header(fout, &gfile.hdr); } else { // Directly manipulating gfile.fh here, using it to write later // Not doing any more reading if(fseek(gfile.fh, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); fout = gfile.fh; } for(i = 0; i < num_kmers; i++) if(fwrite(kmers[i], 1, kmer_mem, fout) != kmer_mem) die("Cannot write to file"); if(out_path) fclose(fout); graph_file_close(&gfile); ctx_free(kmers); ctx_free(mem); return EXIT_SUCCESS; }
int ctx_index(int argc, char **argv) { const char *out_path = NULL; size_t block_size = 0, block_kmers = 0; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'b': cmd_check(!block_kmers, cmd); block_kmers = cmd_size_nonzero(cmd, optarg); break; case 's': cmd_check(!block_size, cmd); block_size = cmd_size_nonzero(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); if(block_size && block_kmers) cmd_print_usage("Cannot use --block-kmers and --block-size together"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); // Open output file FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout; // Start size_t filencols = gfile.hdr.num_of_cols; size_t kmer_size = gfile.hdr.kmer_size; const char *path = file_filter_path(&gfile.fltr); size_t ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols; if(block_size) { block_kmers = block_size / kmer_mem; } else if(!block_size && !block_kmers) { block_size = 4 * ONE_MEGABYTE; block_kmers = block_size / kmer_mem; } // Update block-size block_size = block_kmers * kmer_mem; status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu", block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size); if(block_kmers == 0) die("Cannot set block_kmers to zero"); // Print header fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout); BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO; BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO; Covg *covgs = ctx_malloc(ncols * sizeof(Covg)); Edges *edges = ctx_malloc(ncols * sizeof(Edges)); char bkmerstr[MAX_KMER_SIZE+1]; size_t rem_block = block_size - kmer_mem; // block after first kmer char *tmp_mem = ctx_malloc(rem_block); // Read in file, print index size_t nblocks = 0; size_t bl_bytes = 0, bl_kmers = 0; size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0; while(1) { if(!graph_file_read(&gfile, &bkmer, covgs, edges)) { status("Read kmer failed"); break; } binary_kmer_to_str(bkmer, kmer_size, bkmerstr); if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer)) die("File is not sorted: %s [%s]", bkmerstr, path); // We've already read one kmer entry, read rest of block bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block); bl_kmers = 1 + bl_bytes / kmer_mem; fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n", bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr, bl_kmer_offset, bl_kmer_offset+bl_kmers); bl_byte_offset += bl_bytes; bl_kmer_offset += bl_kmers; nblocks++; if(bl_kmers < block_kmers) { status("last block %zu < %zu; %zu vs %zu", bl_kmers, block_kmers, bl_bytes, block_size); break; } prev_bkmer = bkmer; } ctx_free(covgs); ctx_free(edges); ctx_free(tmp_mem); // done char num_kmers_str[50], num_blocks_str[50]; char block_mem_str[50], block_kmers_str[50]; ulong_to_str(bl_kmer_offset, num_kmers_str); ulong_to_str(nblocks, num_blocks_str); bytes_to_str(block_size, 1, block_mem_str); ulong_to_str(block_kmers, block_kmers_str); status("Read %s kmers in %s block%s (block size %s / %s kmers)", num_kmers_str, num_blocks_str, util_plural_str(nblocks), block_mem_str, block_kmers_str); if(fout != stdout) status("Saved to %s", out_path); graph_file_close(&gfile); fclose(fout); return EXIT_SUCCESS; }
/** * Remove unitigs with coverage < `covg_threshold` and tips shorter than * `min_keep_tip`. * * @param num_threads Number of threads to use * @param covg_threshold Remove unitigs with mean covg < `covg_threshold`. * Ignored if 0. * @param min_keep_tip Remove tips with length < `min_keep_tip`. Ignored if 0. * @param covgs_csv_path Path to write CSV of kmer coverage histogram * @param lens_csv_path Path to write CSV of unitig length histogram * * `visited`, `keep` should each be at least db_graph.ht.capcity bits long * and initialised to zero. On return, * `visited` will be 1 at each original kmer index * `keep` will be 1 at each retained kmer index **/ void clean_graph(size_t num_threads, size_t covg_threshold, size_t min_keep_tip, const char *covgs_csv_path, const char *lens_csv_path, uint8_t *visited, uint8_t *keep, dBGraph *db_graph) { ctx_assert(db_graph->num_of_cols == 1); ctx_assert(db_graph->num_edge_cols > 0); size_t init_nkmers = db_graph->ht.num_kmers; if(db_graph->ht.num_kmers == 0) return; if(covg_threshold == 0 && min_keep_tip == 0) { warn("[cleaning] No cleaning specified"); return; } if(covg_threshold > 0) { status("[cleaning] Removing unitigs with coverage < %zu...", covg_threshold); status("[cleaning] Using kmer gamma method"); } if(min_keep_tip > 0) status("[cleaning] Removing tips shorter than %zu...", min_keep_tip); status("[cleaning] using %zu threads", num_threads); // Mark nodes to keep UnitigCleaner cl; unitig_cleaner_alloc(&cl, num_threads, covg_threshold, min_keep_tip, keep, db_graph); supernodes_iterate(num_threads, visited, db_graph, unitig_mark, &cl); // Print numbers of kmers that are being removed char num_snodes_str[50], num_tips_str[50], num_tip_snodes_str[50]; char num_snode_kmers_str[50], num_tip_kmers_str[50], num_tip_snode_kmers_str[50]; ulong_to_str(cl.num_low_covg_snodes, num_snodes_str); ulong_to_str(cl.num_tips, num_tips_str); ulong_to_str(cl.num_tip_and_low_snodes, num_tip_snodes_str); ulong_to_str(cl.num_low_covg_snode_kmers, num_snode_kmers_str); ulong_to_str(cl.num_tip_kmers, num_tip_kmers_str); ulong_to_str(cl.num_tip_and_low_snode_kmers, num_tip_snode_kmers_str); status("[cleaning] Removing %s low coverage unitigs [%s kmer%s], " "%s unitig tips [%s kmer%s] " "and %s of both [%s kmer%s]", num_snodes_str, num_snode_kmers_str, util_plural_str(cl.num_low_covg_snode_kmers), num_tips_str, num_tip_kmers_str, util_plural_str(cl.num_tip_kmers), num_tip_snodes_str, num_tip_snode_kmers_str, util_plural_str(cl.num_tip_and_low_snode_kmers)); // Remove nodes not marked to keep prune_nodes_lacking_flag(num_threads, keep, db_graph); // Wipe memory memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity)); memset(keep, 0, roundup_bits2bytes(db_graph->ht.capacity)); // Print status update char remain_nkmers_str[100], removed_nkmers_str[100]; size_t remain_nkmers = db_graph->ht.num_kmers; size_t removed_nkmers = init_nkmers - remain_nkmers; ulong_to_str(remain_nkmers, remain_nkmers_str); ulong_to_str(removed_nkmers, removed_nkmers_str); status("[cleaning] Remaining kmers: %s removed: %s (%.1f%%)", remain_nkmers_str, removed_nkmers_str, (100.0*removed_nkmers)/init_nkmers); if(covgs_csv_path != NULL) { cleaning_write_covg_histogram(covgs_csv_path, cl.kmer_covgs_clean, cl.unitig_covg_clean, cl.covg_arrsize); } if(lens_csv_path != NULL) { cleaning_write_len_histogram(lens_csv_path, cl.len_hist_clean, cl.len_arrsize, db_graph->kmer_size); } unitig_cleaner_dealloc(&cl); }
int ctx_join(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t use_ncols = 0; GraphFileReader tmp_gfile; GraphFileBuffer isec_gfiles_buf; gfile_buf_alloc(&isec_gfiles_buf, 8); // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 'i': graph_file_reset(&tmp_gfile); graph_file_open(&tmp_gfile, optarg); if(file_filter_into_ncols(&tmp_gfile.fltr) > 1) warn("Flattening intersection graph into colour 0: %s", optarg); file_filter_flatten(&tmp_gfile.fltr, 0); gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } GraphFileReader *igfiles = isec_gfiles_buf.b; size_t num_igfiles = isec_gfiles_buf.len; if(!out_path) cmd_print_usage("--out <out.ctx> required"); if(optind >= argc) cmd_print_usage("Please specify at least one input graph file"); // optind .. argend-1 are graphs to load size_t num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles); // Check all binaries are valid binaries with matching kmer size size_t i; size_t ctx_max_cols = 0; uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0; for(i = 0; i < num_gfiles; i++) { graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols); if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size); } ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr)); ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i])); ctx_sum_kmers += graph_file_nkmers(&gfiles[i]); } // Probe intersection graph files for(i = 0; i < num_igfiles; i++) { if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size); } uint64_t nkmers = graph_file_nkmers(&igfiles[i]); if(i == 0) min_intersect_num_kmers = nkmers; else if(nkmers < min_intersect_num_kmers) { // Put smallest intersection binary first SWAP(igfiles[i], igfiles[0]); min_intersect_num_kmers = nkmers; } } bool take_intersect = (num_igfiles > 0); // If we are taking an intersection, // all kmers intersection kmers will need to be loaded if(take_intersect) ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers; bool use_ncols_set = (use_ncols > 0); bool output_to_stdout = (strcmp(out_path,"-") == 0); // if(use_ncols == 0) use_ncols = 1; if(use_ncols_set) { if(use_ncols < ctx_max_cols && output_to_stdout) die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols); if(use_ncols > ctx_max_cols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols); use_ncols = ctx_max_cols; } } else { use_ncols = output_to_stdout ? ctx_max_cols : 1; } // Check out_path is writable futil_create_output(out_path); status("Output %zu cols; from %zu files; intersecting %zu graphs; ", ctx_max_cols, num_gfiles, num_igfiles); if(num_gfiles == 1 && num_igfiles == 0) { // Loading only one file with no intersection files // Don't need to store a graph in memory, can filter as stream // Don't actually store anything in the de Bruijn graph, but we need to // pass it, so mock one up dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0); graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL); graph_file_close(&gfiles[0]); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); if(!use_ncols_set) { // Maximise use_ncols size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer; use_ncols = MIN2(max_usencols, ctx_max_cols); bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; // Re-check memory used kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); } status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols)); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Create db_graph dBGraph db_graph; Edges *intersect_edges = NULL; size_t edge_cols = (use_ncols + take_intersect); db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // We allocate edges ourself since it's a special case db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges)); // Load intersection binaries char *intsct_gname_ptr = NULL; StrBuf intersect_gname; strbuf_alloc(&intersect_gname, 1024); if(take_intersect) { GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.boolean_covgs = true; // covg++ only for(i = 0; i < num_igfiles; i++) { graph_load(&igfiles[i], gprefs, NULL); // Update intersect header // note: intersection graphs all load exactly one colour into colour 0 graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname); gprefs.must_exist_in_graph = true; gprefs.must_exist_in_edges = db_graph.col_edges; } if(num_igfiles > 1) { // Remove nodes where covg != num_igfiles HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes, db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht); } status("Loaded intersection set\n"); intsct_gname_ptr = intersect_gname.b; for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]); // Reset graph info for(i = 0; i < db_graph.num_of_cols; i++) graph_info_init(&db_graph.ginfo[i]); // Zero covgs memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg)); // Use union edges we loaded to intersect new edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } bool kmers_loaded = take_intersect, colours_loaded = false; graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles, kmers_loaded, colours_loaded, intersect_edges, intsct_gname_ptr, &db_graph); if(take_intersect) db_graph.col_edges -= db_graph.ht.capacity; for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); strbuf_dealloc(&intersect_gname); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean uint32_t fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(min_keep_tip<0, cmd); min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1); break; case 'S': case 'U': cmd_check(unitig_min<0, cmd); unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1); break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); bool unitig_cleaning = (unitig_min != 0); bool tip_cleaning = (min_keep_tip != 0); bool doing_cleaning = (unitig_cleaning || tip_cleaning); // If you ever want to estimate cleaning threshold without outputting // a graph, change this to a warning if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); // warn("No cleaning being done: you did not specify --out <out.ctx>"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { warn("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -U, --unitigs or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !unitig_cleaning) warn("-B, --fallback <T> without --unitigs"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(out_ctx_path == NULL) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu colours", num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(min_keep_tip < 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && unitig_cleaning) { warn("%s:%zu already has unitig cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_before_path); if(min_keep_tip > 0) status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip); if(unitig_min > 0) status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min); if(unitig_min < 0) status("%zu. Cleaning unitigs with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8; size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8); bits_per_kmer = sizeof(BinaryKmer)*8 + per_col_bits * use_ncols + extra_edge_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - sizeof(BinaryKmer)*8*kmers_in_hash + extra_edge_bits*kmers_in_hash) / (per_col_bits*kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); // Extra edges required to hold union of kept edges Edges *edges_union = NULL; if(use_ncols < ncols) edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); // Load graph into a single colour GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); for(i = 0; i < num_gfiles; i++) graph_file_merge_header(&outhdr, &gfiles[i]); if(ncols > use_ncols) { db_graph.num_of_cols = db_graph.num_edge_cols = 1; SWAP(edges_union, db_graph.col_edges); graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL); SWAP(edges_union, db_graph.col_edges); db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols; } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, NULL); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: %s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Always estimate cleaning threshold // if(unitig_min <= 0 || covg_before_path || len_before_path) // { // Get coverage distribution and estimate cleaning threshold int est_min_covg = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_min_covg < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_min_covg); // Use estimated threshold if threshold not set if(unitig_min < 0) { if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); unitig_min = fallback_thresh; } else if(est_min_covg >= 0) unitig_min = est_min_covg; } // } // Die if we failed to find suitable cleaning threshold if(unitig_min < 0) die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)"); // Cleaning parameters should now be set (>0) or turned off (==0) ctx_assert(unitig_min >= 0); ctx_assert(min_keep_tip >= 0); if(unitig_min || min_keep_tip) { // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0) clean_graph(nthreads, unitig_min, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(out_ctx_path != NULL) { // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= unitig_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(unitig_cleaning) { size_t thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min) : (uint32_t)unitig_min; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); // kmers_loaded=true graph_writer_merge(out_ctx_path, gfiles, num_gfiles, true, all_colours_loaded, edges_union, &outhdr, &db_graph); } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); // TODO: report kmer coverage for each sample graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); ctx_free(edges_union); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }