int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; bool tip_cleaning = false, supernode_cleaning = false; size_t min_keep_tip = 0; Covg threshold = 0, fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(!tip_cleaning, cmd); min_keep_tip = cmd_uint32_nonzero(cmd, optarg); tip_cleaning = true; break; case 'S': cmd_check(!supernode_cleaning, cmd); if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg); supernode_cleaning = true; break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); // Default behaviour if(!tip_cleaning && !supernode_cleaning) { if(out_ctx_path != NULL) supernode_cleaning = tip_cleaning = true; // do both else warn("No cleaning being done: you did not specify --out <out.ctx>"); } bool doing_cleaning = (supernode_cleaning || tip_cleaning); if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { cmd_print_usage("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -s, --supernodes or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !supernode_cleaning) cmd_print_usage("-B, --fallback <T> without --supernodes"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(!doing_cleaning) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu colours", num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(tip_cleaning && min_keep_tip == 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol, intocol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && supernode_cleaning) { warn("%s:%zu already has supernode cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving supernode length distribution to: %s", step++, len_before_path); if(tip_cleaning) status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip); if(supernode_cleaning && threshold > 0) status("%zu. Cleaning supernodes with coverage < %u", step++, threshold); if(supernode_cleaning && threshold <= 0) status("%zu. Cleaning supernodes with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. Saving supernode length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8; size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8); bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) / (per_kmer_per_col_bits * kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // Edges is a special case size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded); db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges)); // Load graph into a single colour LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); outhdr.version = CTX_GRAPH_FILEFORMAT; outhdr.kmer_size = db_graph.kmer_size; outhdr.num_of_cols = ncols; outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64; graph_header_alloc(&outhdr, ncols); // Merge info into header size_t gcol = 0; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); intocol = file_filter_intocol(&gfiles[i].fltr, j); graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]); } } if(ncols > use_ncols) { graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats); } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, &stats); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: %s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path) { // Get coverage distribution and estimate cleaning threshold int est_threshold = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_threshold < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_threshold); // Use estimated threshold if threshold not set if(threshold <= 0) { if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); threshold = fallback_thresh; } else if(est_threshold >= 0) threshold = est_threshold; } } // Die if we failed to find suitable cleaning threshold if(supernode_cleaning && threshold <= 0) die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)"); if(doing_cleaning) { // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0) clean_graph(nthreads, threshold, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(doing_cleaning) { // Output graph file Edges *intersect_edges = NULL; bool kmers_loaded = true; size_t col, thresh; // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= supernode_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(supernode_cleaning) { thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold) : (uint32_t)threshold; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } if(!all_colours_loaded) { // We haven't loaded all the colours // intersect_edges are edges to mask with // resets graph edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); graph_files_merge(out_ctx_path, gfiles, num_gfiles, kmers_loaded, all_colours_loaded, intersect_edges, &outhdr, &db_graph); // Swap back if(!all_colours_loaded) db_graph.col_edges = intersect_edges; } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_pop_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; int32_t max_covg = -1; // max mean coverage to remove <=0 => ignore int32_t max_klen = -1; // max length (kmers) to remove <=0 => ignore int32_t max_kdiff = -1; // max diff between bubble branch lengths <0 => ignore // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'C': cmd_check(max_covg<0, cmd); max_covg = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_klen<0, cmd); max_klen = cmd_uint32(cmd, optarg); break; case 'D': cmd_check(max_kdiff<0, cmd); max_kdiff = cmd_uint32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" pop -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); bool reread_graph_to_filter = (num_gfiles == 1 && strcmp(file_filter_path(&gfiles[0].fltr),"-") != 0); if(reread_graph_to_filter) { file_filter_flatten(&gfiles[0].fltr, 0); ncols = 1; } // Check graphs are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8*ncols + sizeof(Edges)*8*ncols + 2; // 1 bit for visited, 1 for removed kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Check out_path is writable futil_create_output(out_path); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); size_t nkwords = roundup_bits2bytes(db_graph.ht.capacity); uint8_t *visited = ctx_calloc(1, nkwords); uint8_t *rmvbits = ctx_calloc(1, nkwords); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); PopBubblesPrefs prefs = {.max_rmv_covg = max_covg, .max_rmv_klen = max_klen, .max_rmv_kdiff = max_kdiff}; size_t npopped = 0; char npopped_str[50]; status("Popping bubbles..."); npopped = pop_bubbles(&db_graph, nthreads, prefs, visited, rmvbits); ulong_to_str(npopped, npopped_str); status("Popped %s bubbles", npopped_str); size_t nkmers0 = db_graph.ht.num_kmers; status("Removing nodes..."); for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i]; prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph); size_t nkmers1 = db_graph.ht.num_kmers; ctx_assert(nkmers1 <= nkmers0); char nkmers0str[50], nkmers1str[50], ndiffstr[50]; ulong_to_str(nkmers0, nkmers0str); ulong_to_str(nkmers1, nkmers1str); ulong_to_str(nkmers0-nkmers1, ndiffstr); status("Number of kmers %s -> %s (-%s)", nkmers0str, nkmers1str, ndiffstr); if(reread_graph_to_filter) { status("Streaming filtered file to: %s\n", out_path); GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, graph_paths[0]); graph_writer_stream_mkhdr(out_path, &gfile, &db_graph, db_graph.col_edges, NULL); graph_file_close(&gfile); } else { status("Saving to: %s\n", out_path); graph_writer_save_mkhdr(out_path, &db_graph, CTX_GRAPH_FILEFORMAT, NULL, 0, ncols); } ctx_free(visited); ctx_free(rmvbits); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
// Returns 0 on success, otherwise != 0 int ctx_unitigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; UnitigSyntax syntax = PRINT_FASTA; bool dot_use_points = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'F': cmd_check(!syntax, cmd); syntax = PRINT_FASTA; break; case 'g': cmd_check(!syntax, cmd); syntax = PRINT_GFA; break; case 'd': cmd_check(!syntax, cmd); syntax = PRINT_DOT; break; case 'P': cmd_check(!dot_use_points, cmd); dot_use_points = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" unitigs -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(dot_use_points && syntax == PRINT_FASTA) cmd_print_usage("--point is only for use with --dot"); // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage(NULL); size_t i, num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; if(dot_use_points && syntax != PRINT_DOT) cmd_print_usage("--points only valid with --graphviz / --dot"); ctx_assert(num_gfiles > 0); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + 1; if(syntax != PRINT_FASTA) bits_per_kmer += sizeof(UnitigEnd) * 8; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); status("Output in %s format to %s\n", syntax_strs[syntax], futil_outpath_str(out_path)); // // Open output file // // Print to stdout unless --out <out> is specified FILE *fout = futil_fopen_create(out_path, "w"); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES); UnitigPrinter printer; unitig_printer_init(&printer, &db_graph, nthreads, syntax, fout); if(syntax == PRINT_DOT || syntax == PRINT_GFA) unitig_graph_alloc(&printer.ugraph, &db_graph); // Load graphs GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = false}; for(i = 0; i < num_gfiles; i++) { file_filter_flatten(&gfiles[i].fltr, 0); graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); switch(syntax) { case PRINT_FASTA: status("Printing unitgs in FASTA using %zu threads", nthreads); supernodes_iterate(nthreads, printer.visited, &db_graph, print_unitig_fasta, &printer); break; case PRINT_GFA: print_gfa_syntax(&printer); break; case PRINT_DOT: print_dot_syntax(&printer, dot_use_points); break; default: die("Invalid print syntax: %i", syntax); } char num_unitigs_str[50]; ulong_to_str(printer.num_unitigs, num_unitigs_str); status("Dumped %s unitigs\n", num_unitigs_str); fclose(fout); unitig_printer_destroy(&printer); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean uint32_t fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(min_keep_tip<0, cmd); min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1); break; case 'S': case 'U': cmd_check(unitig_min<0, cmd); unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1); break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); bool unitig_cleaning = (unitig_min != 0); bool tip_cleaning = (min_keep_tip != 0); bool doing_cleaning = (unitig_cleaning || tip_cleaning); // If you ever want to estimate cleaning threshold without outputting // a graph, change this to a warning if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); // warn("No cleaning being done: you did not specify --out <out.ctx>"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { warn("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -U, --unitigs or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !unitig_cleaning) warn("-B, --fallback <T> without --unitigs"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(out_ctx_path == NULL) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu colours", num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(min_keep_tip < 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && unitig_cleaning) { warn("%s:%zu already has unitig cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_before_path); if(min_keep_tip > 0) status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip); if(unitig_min > 0) status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min); if(unitig_min < 0) status("%zu. Cleaning unitigs with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8; size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8); bits_per_kmer = sizeof(BinaryKmer)*8 + per_col_bits * use_ncols + extra_edge_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - sizeof(BinaryKmer)*8*kmers_in_hash + extra_edge_bits*kmers_in_hash) / (per_col_bits*kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); // Extra edges required to hold union of kept edges Edges *edges_union = NULL; if(use_ncols < ncols) edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); // Load graph into a single colour GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); for(i = 0; i < num_gfiles; i++) graph_file_merge_header(&outhdr, &gfiles[i]); if(ncols > use_ncols) { db_graph.num_of_cols = db_graph.num_edge_cols = 1; SWAP(edges_union, db_graph.col_edges); graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL); SWAP(edges_union, db_graph.col_edges); db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols; } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, NULL); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: %s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Always estimate cleaning threshold // if(unitig_min <= 0 || covg_before_path || len_before_path) // { // Get coverage distribution and estimate cleaning threshold int est_min_covg = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_min_covg < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_min_covg); // Use estimated threshold if threshold not set if(unitig_min < 0) { if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); unitig_min = fallback_thresh; } else if(est_min_covg >= 0) unitig_min = est_min_covg; } // } // Die if we failed to find suitable cleaning threshold if(unitig_min < 0) die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)"); // Cleaning parameters should now be set (>0) or turned off (==0) ctx_assert(unitig_min >= 0); ctx_assert(min_keep_tip >= 0); if(unitig_min || min_keep_tip) { // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0) clean_graph(nthreads, unitig_min, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(out_ctx_path != NULL) { // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= unitig_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(unitig_cleaning) { size_t thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min) : (uint32_t)unitig_min; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); // kmers_loaded=true graph_writer_merge(out_ctx_path, gfiles, num_gfiles, true, all_colours_loaded, edges_union, &outhdr, &db_graph); } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); // TODO: report kmer coverage for each sample graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); ctx_free(edges_union); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_reads(int argc, char **argv) { parse_args(argc, argv); // // Open input graphs // GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Will exit and remove output files on error inputs_attempt_open(); // // Calculate memory use // size_t kmers_in_hash, graph_mem, bits_per_kmer = sizeof(BinaryKmer)*8; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Set up graph // dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 0, kmers_in_hash, 0); // Load graphs LoadingStats gstats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .must_exist_in_graph = false, .empty_colours = true, .boolean_covgs = false}; for(i = 0; i < num_gfiles; i++) { file_filter_flatten(&gfiles[i].fltr, 0); graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); status("Printing reads that do %stouch the graph\n", inputs.b[0].invert ? "not " : ""); // // Filter reads using async io // LoadingStats seq_stats = LOAD_STATS_INIT_MACRO; for(i = 0; i < inputs.len; i++) { inputs.b[i].stats = &seq_stats; inputs.b[i].db_graph = &db_graph; } // Deal with a set of files at once size_t start, end; for(start = 0; start < inputs.len; start += MAX_IO_THREADS) { // Can have different numbers of inputs vs threads end = MIN2(inputs.len, start+MAX_IO_THREADS); asyncio_run_pool(files.b+start, end-start, filter_reads, NULL, nthreads, 0); } size_t total_reads_printed = 0; size_t total_reads = seq_stats.num_se_reads + seq_stats.num_pe_reads; for(i = 0; i < inputs.len; i++) total_reads_printed += inputs.b[i].num_of_reads_printed; for(i = 0; i < inputs.len; i++) { seqout_close(&inputs.b[i].seqout, false); asyncio_task_close(&files.b[i]); } aln_reads_buf_dealloc(&inputs); asyncio_buf_dealloc(&files); status("Total printed %zu / %zu (%.2f%%) reads\n", total_reads_printed, total_reads, total_reads ? (100.0 * total_reads_printed) / total_reads : 0.0); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_contigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t i, contig_limit = 0, colour = 0; bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R const char *conf_table_path = NULL; // save confidence table to here bool use_missing_info_check = true, seed_with_unused_paths = false; double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min // Read length and expected depth for calculating confidences size_t genome_size = 0; seq_file_t *tmp_seed_file = NULL; SeqFilePtrBuffer seed_buf; seq_file_ptr_buf_alloc(&seed_buf, 16); GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100], shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path,cmd); out_path = optarg; break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case '1': case 's': // --seed <in.fa> if((tmp_seed_file = seq_open(optarg)) == NULL) die("Cannot read --seed file: %s", optarg); seq_file_ptr_buf_add(&seed_buf, tmp_seed_file); break; case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break; case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break; case 'N': cmd_check(!contig_limit,cmd); contig_limit = cmd_uint32_nonzero(cmd, optarg); break; case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break; case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break; case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break; case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break; case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break; case 'C': cmd_check(min_cumul_confid < 0,cmd); min_cumul_confid = cmd_udouble(cmd,optarg); if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case 'T': cmd_check(min_step_confid < 0,cmd); min_step_confid = cmd_udouble(cmd,optarg); if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(cmd_no_reseed && cmd_reseed) cmd_print_usage("Cannot specify both -r and -R"); if(contig_limit && seed_with_unused_paths) cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths"); bool sample_with_replacement = cmd_reseed; // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(!seed_buf.len && !contig_limit && sample_with_replacement) { cmd_print_usage("Please specify one or more of: " "--no-reseed | --ncontigs | --seed <in.fa>"); } if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // char *ctx_path = argv[optind]; // // Open Graph file // // GraphFileReader gfile; // memset(&gfile, 0, sizeof(GraphFileReader)); // graph_file_open(&gfile, ctx_path); // Update colours in graph file - sample in 0, all others in 1 // never need more than two colours ncols = gpath_load_sample_pop(gfiles, num_gfiles, gpfiles.b, gpfiles.len, colour); // Check for compatibility between graph files and path files // pop_colour is colour 1 graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1); if(!genome_size) { char nk_str[50]; if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming"); genome_size = ctx_max_kmers; ulong_to_str(genome_size, nk_str); status("Taking number of kmers as genome size: %s", nk_str); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols + !sample_with_replacement; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // Load contig hist distribution from ctp files ZeroSizeBuffer contig_hist; memset(&contig_hist, 0, sizeof(contig_hist)); for(i = 0; i < gpfiles.len; i++) { gpath_reader_load_contig_hist(gpfiles.b[i].json, gpfiles.b[i].fltr.path.b, file_filter_fromcol(&gpfiles.b[i].fltr, 0), &contig_hist); } // Calculate confidences, only for one colour ContigConfidenceTable conf_table; conf_table_alloc(&conf_table, 1); conf_table_update_hist(&conf_table, 0, genome_size, contig_hist.b, contig_hist.len); if(conf_table_path != NULL) { conf_table_save(&conf_table, conf_table_path); } zsize_buf_dealloc(&contig_hist); // // Output file if printing // FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; // Allocate dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); uint8_t *visited = NULL; if(!sample_with_replacement) visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Load graph LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = true}; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &stats); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load path files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); AssembleContigStats assem_stats; assemble_contigs_stats_init(&assem_stats); assemble_contigs(nthreads, seed_buf.b, seed_buf.len, contig_limit, visited, use_missing_info_check, seed_with_unused_paths, min_step_confid, min_cumul_confid, fout, out_path, &assem_stats, &conf_table, &db_graph, 0); // Sample always loaded into colour zero if(fout && fout != stdout) fclose(fout); assemble_contigs_stats_print(&assem_stats); assemble_contigs_stats_destroy(&assem_stats); conf_table_dealloc(&conf_table); for(i = 0; i < seed_buf.len; i++) seq_close(seed_buf.b[i]); seq_file_ptr_buf_dealloc(&seed_buf); ctx_free(visited); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t max_allele_len = 0, max_flank_len = 0; bool remove_serial_bubbles = true; // List of haploid colours size_t *hapcols = NULL; int nhapcols = 0; char *hapcols_arg = NULL; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'H': cmd_check(!hapcols_arg, cmd); hapcols_arg = optarg; break; case 'A': cmd_check(!max_allele_len, cmd); max_allele_len = cmd_uint32_nonzero(cmd, optarg); break; case 'F': cmd_check(!max_flank_len, cmd); max_flank_len = cmd_uint32_nonzero(cmd, optarg); break; case 'S': cmd_check(remove_serial_bubbles,cmd); remove_serial_bubbles = false; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(max_allele_len == 0) max_allele_len = DEFAULT_MAX_ALLELE; if(max_flank_len == 0) max_flank_len = DEFAULT_MAX_FLANK; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1); // // Check haploid colours are valid // if(hapcols_arg != NULL) { if((nhapcols = range_get_num(hapcols_arg, ncols)) < 0) die("Invalid haploid colour list: %s", hapcols_arg); hapcols = ctx_calloc(nhapcols, sizeof(hapcols[0])); if(range_parse_array(hapcols_arg, hapcols, ncols) < 0) die("Invalid haploid colour list: %s", hapcols_arg); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, thread_mem; char thread_mem_str[100]; // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) + // visitedfw/rv(2bits/thread) bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0) + ncols + 2*nthreads; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Thread memory thread_mem = roundup_bits2bytes(kmers_in_hash) * 2; bytes_to_str(thread_mem * nthreads, 1, thread_mem_str); status("[memory] (of which threads: %zu x %zu = %s)\n", nthreads, thread_mem, thread_mem_str); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); size_t total_mem = graph_mem + thread_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(out_path, "w"); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // Create array of cJSON** from input files cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*)); for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json; // Now call variants BubbleCallingPrefs call_prefs = {.max_allele_len = max_allele_len, .max_flank_len = max_flank_len, .haploid_cols = hapcols, .nhaploid_cols = nhapcols, .remove_serial_bubbles = remove_serial_bubbles}; invoke_bubble_caller(nthreads, &call_prefs, gzout, out_path, hdrs, gpfiles.len, &db_graph); status(" saved to: %s\n", out_path); gzclose(gzout); ctx_free(hdrs); // Close input link files for(i = 0; i < gpfiles.len; i++) gpath_reader_close(&gpfiles.b[i]); gpfile_buf_dealloc(&gpfiles); ctx_free(hapcols); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? -1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }