int ctx_infer_edges(int argc, char **argv) { size_t num_of_threads = DEFAULT_NTHREADS; struct MemArgs memargs = MEM_ARGS_INIT; char *out_ctx_path = NULL; bool add_pop_edges = false, add_all_edges = false; // Arg parsing char cmd[100]; char shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_ctx_path,cmd); out_ctx_path = optarg; break; case 't': num_of_threads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'A': add_all_edges = true; break; case 'P': add_pop_edges = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" inferedges -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Default to adding all edges if(!add_pop_edges && !add_all_edges) add_all_edges = true; // Can only specify one of --pop --all if(add_pop_edges && add_all_edges) cmd_print_usage("Please specify only one of --all --pop"); // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); // // Open graph file // char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(strchr(graph_path,':') != NULL) cmd_print_usage("Cannot use ':' in input graph for `"CMD" inferedges`"); GraphFileReader file; memset(&file, 0, sizeof(file)); file_filter_open(&file.fltr, graph_path); // Use stat to detect if we are reading from a stream struct stat st; bool reading_stream = (stat(file.fltr.path.b, &st) != 0); // Mode r+ means open (not create) for update (read & write) graph_file_open2(&file, graph_path, reading_stream ? "r" : "r+", 0); if(!file_filter_is_direct(&file.fltr)) cmd_print_usage("Inferedges with filter not implemented - sorry"); bool editing_file = !(out_ctx_path || reading_stream); FILE *fout = NULL; // Editing input file or writing a new file if(!editing_file) fout = futil_fopen_create(out_ctx_path ? out_ctx_path : "-", "w"); // Print output status if(fout == stdout) status("Writing to STDOUT"); else if(fout != NULL) status("Writing to: %s", out_ctx_path); else status("Editing file in place: %s", graph_path); status("Inferring all missing %sedges", add_pop_edges ? "population " : ""); // // Decide on memory // const size_t ncols = file.hdr.num_of_cols; size_t kmers_in_hash, graph_mem, bits_per_kmer; // reading stream: all covgs + edges // reading file: one bit per kmer per colour for 'in colour' bits_per_kmer = sizeof(BinaryKmer)*8; if(reading_stream) { bits_per_kmer += ncols * 8 * (sizeof(Edges) + sizeof(Covg)); } else { bits_per_kmer += ncols; // in colour } kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, file.num_of_kmers, file.num_of_kmers, memargs.mem_to_use_set, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Allocate memory // int alloc_flags = reading_stream ? DBG_ALLOC_EDGES | DBG_ALLOC_COVGS : DBG_ALLOC_NODE_IN_COL; dBGraph db_graph; db_graph_alloc(&db_graph, file.hdr.kmer_size, ncols, reading_stream ? ncols : 1, kmers_in_hash, alloc_flags); LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // We need to load the graph for both --pop and --all since we need to check // if the next kmer is in each of the colours graph_load(&file, gprefs, &stats); if(add_pop_edges) status("Inferring edges from population...\n"); else status("Inferring all missing edges...\n"); size_t num_kmers_edited; if(reading_stream) { ctx_assert(fout != NULL); num_kmers_edited = infer_edges(num_of_threads, add_all_edges, &db_graph); graph_write_header(fout, &file.hdr); graph_write_all_kmers(fout, &db_graph); } else if(fout == NULL) { num_kmers_edited = inferedges_on_mmap(&db_graph, add_all_edges, &file); } else { num_kmers_edited = inferedges_on_file(&db_graph, add_all_edges, &file, fout); } if(fout != NULL && fout != stdout) fclose(fout); char modified_str[100], kmers_str[100]; ulong_to_str(num_kmers_edited, modified_str); ulong_to_str(db_graph.ht.num_kmers, kmers_str); double modified_rate = 0; if(db_graph.ht.num_kmers) modified_rate = (100.0 * num_kmers_edited) / db_graph.ht.num_kmers; status("%s of %s (%.2f%%) nodes modified\n", modified_str, kmers_str, modified_rate); if(editing_file) { // Close and re-open fclose(file.fh); file.fh = NULL; futil_update_timestamp(file.fltr.path.b); } graph_file_close(&file); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int main(int argc, char **argv) { cortex_init(); cmd_init(argc, argv); ctx_msg_out = NULL; ctx_tst_out = stdout; test_status("Tests running k=%i..%i...", get_min_kmer_size(), get_max_kmer_size()); test_status("[version] "VERSION_STATUS_STR"\n"); // Binary Kmer tests should work for all values of MAXK test_bkmer_functions(); test_hash_table(); #if MAX_KMER_SIZE == 31 // not kmer dependent test_util(); test_dna_functions(); test_binary_seq_functions(); // only written in k=31 test_db_node(); test_build_graph(); test_db_unitig(); test_subgraph(); test_cleaning(); test_paths(); // test_path_sets(); // TODO: replace with test_path_subset() test_graph_walker(); test_corrected_aln(); test_repeat_walker(); test_graph_crawler(); test_bubble_caller(); test_kmer_occur(); test_infer_edges_tests(); #endif cmd_destroy(); // Check we free'd all our memory size_t still_alloced = alloc_get_num_allocs() - alloc_get_num_frees(); TASSERT2(still_alloced == 0, "%zu not free'd", still_alloced); // Finished char num_test_str[100], num_passed_str[100]; size_t tests_num_passed = tests_num_run - tests_num_failed; ulong_to_str(tests_num_run, num_test_str); ulong_to_str(tests_num_passed, num_passed_str); test_status("Tests passed: %s / %s (%.1f%%)", num_passed_str, num_test_str, (100.0*tests_num_passed)/tests_num_run); if(tests_num_failed) test_status("%zu tests failed", tests_num_failed); else test_status("All tests passed."); cortex_destroy(); // Return 1 if any tests failed, 0 on success return tests_num_failed ? 1 : 0; }
int ctx_rmsubstr(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; size_t kmer_size = 0, nthreads = 0; const char *output_file = NULL; seq_format fmt = SEQ_FMT_FASTA; bool invert = false; // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!output_file, cmd); output_file = optarg; break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(fmt==SEQ_FMT_FASTA, cmd); fmt = cmd_parse_format(cmd, optarg); break; case 'v': cmd_check(!invert,cmd); invert = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" rmsubstr -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(!nthreads) nthreads = DEFAULT_NTHREADS; if(!kmer_size) kmer_size = DEFAULT_KMER; if(!(kmer_size&1)) cmd_print_usage("Kmer size must be odd"); if(kmer_size < MIN_KMER_SIZE) cmd_print_usage("Kmer size too small (recompile)"); if(kmer_size > MAX_KMER_SIZE) cmd_print_usage("Kmer size too large (recompile?)"); if(optind >= argc) cmd_print_usage("Please specify at least one input sequence file (.fq, .fq etc.)"); size_t i, num_seq_files = argc - optind; char **seq_paths = argv + optind; seq_file_t **seq_files = ctx_calloc(num_seq_files, sizeof(seq_file_t*)); for(i = 0; i < num_seq_files; i++) if((seq_files[i] = seq_open(seq_paths[i])) == NULL) die("Cannot read sequence file %s", seq_paths[i]); // Estimate number of bases // set to -1 if we cannot calc int64_t est_num_bases = seq_est_seq_bases(seq_files, num_seq_files); if(est_num_bases < 0) { warn("Cannot get file sizes, using pipes"); est_num_bases = memargs.num_kmers * IDEAL_OCCUPANCY; } status("[memory] Estimated number of bases: %li", (long)est_num_bases); // Use file sizes to decide on memory // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; size_t mem_to_use = memargs.mem_to_use; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(KONodeList) + sizeof(KOccur); // see kmer_occur.h if(mem_to_use < (size_t)est_num_bases) { warn("You probably need at least %zu bytes (> %zu)", (size_t)est_num_bases, memargs.mem_to_use); } else { mem_to_use -= est_num_bases; } kmers_in_hash = cmd_get_kmers_in_hash(mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, 0, est_num_bases, true, &graph_mem); // 1 byte per kmer for each base to load sequence files size_t total_mem = kmers_in_hash*bits_per_kmer/8 + est_num_bases; char memstr[50]; bytes_to_str(total_mem, 1, memstr); status("[memory] total mem with input: %s\n", memstr); cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Open output file // if(output_file == NULL) output_file = "-"; FILE *fout = futil_fopen_create(output_file, "w"); // // Set up memory // dBGraph db_graph; db_graph_alloc(&db_graph, kmer_size, 1, 0, kmers_in_hash, DBG_ALLOC_BKTLOCKS); // // Load reference sequence into a read buffer // ReadBuffer rbuf; read_buf_alloc(&rbuf, 1024); seq_load_all_reads(seq_files, num_seq_files, &rbuf); // Check for reads too short for(i = 0; i < rbuf.len && rbuf.b[i].seq.end >= kmer_size; i++) {} if(i < rbuf.len) warn("Reads shorter than kmer size (%zu) will not be filtered", kmer_size); KOGraph kograph = kograph_create(rbuf.b, rbuf.len, true, 0, nthreads, &db_graph); size_t num_reads = rbuf.len, num_reads_printed = 0, num_bad_reads = 0; // Loop over reads printing those that are not substrings int ret; for(i = 0; i < rbuf.len; i++) { ret = _is_substr(&rbuf, i, &kograph, &db_graph); if(ret == -1) num_bad_reads++; else if((ret && invert) || (!ret && !invert)) { seqout_print_read(&rbuf.b[i], fmt, fout); num_reads_printed++; } } char num_reads_str[100], num_reads_printed_str[100], num_bad_reads_str[100]; ulong_to_str(num_reads, num_reads_str); ulong_to_str(num_reads_printed, num_reads_printed_str); ulong_to_str(num_bad_reads, num_bad_reads_str); status("Printed %s / %s (%.1f%%) to %s", num_reads_printed_str, num_reads_str, !num_reads ? 0.0 : (100.0 * num_reads_printed) / num_reads, futil_outpath_str(output_file)); if(num_bad_reads > 0) { status("Bad reads: %s / %s (%.1f%%) - no kmer {ACGT} of length %zu", num_bad_reads_str, num_reads_str, (100.0 * num_bad_reads) / num_reads, kmer_size); } fclose(fout); kograph_dealloc(&kograph); // Free sequence memory for(i = 0; i < rbuf.len; i++) seq_read_dealloc(&rbuf.b[i]); read_buf_dealloc(&rbuf); ctx_free(seq_files); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean uint32_t fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(min_keep_tip<0, cmd); min_keep_tip = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1); break; case 'S': case 'U': cmd_check(unitig_min<0, cmd); unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1); break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); bool unitig_cleaning = (unitig_min != 0); bool tip_cleaning = (min_keep_tip != 0); bool doing_cleaning = (unitig_cleaning || tip_cleaning); // If you ever want to estimate cleaning threshold without outputting // a graph, change this to a warning if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); // warn("No cleaning being done: you did not specify --out <out.ctx>"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { warn("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -U, --unitigs or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !unitig_cleaning) warn("-B, --fallback <T> without --unitigs"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(out_ctx_path == NULL) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu colours", num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(min_keep_tip < 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && unitig_cleaning) { warn("%s:%zu already has unitig cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_before_path); if(min_keep_tip > 0) status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip); if(unitig_min > 0) status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min); if(unitig_min < 0) status("%zu. Cleaning unitigs with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8; size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8); bits_per_kmer = sizeof(BinaryKmer)*8 + per_col_bits * use_ncols + extra_edge_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - sizeof(BinaryKmer)*8*kmers_in_hash + extra_edge_bits*kmers_in_hash) / (per_col_bits*kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); // Extra edges required to hold union of kept edges Edges *edges_union = NULL; if(use_ncols < ncols) edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); // Load graph into a single colour GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); for(i = 0; i < num_gfiles; i++) graph_file_merge_header(&outhdr, &gfiles[i]); if(ncols > use_ncols) { db_graph.num_of_cols = db_graph.num_edge_cols = 1; SWAP(edges_union, db_graph.col_edges); graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL); SWAP(edges_union, db_graph.col_edges); db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols; } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, NULL); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: %s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Always estimate cleaning threshold // if(unitig_min <= 0 || covg_before_path || len_before_path) // { // Get coverage distribution and estimate cleaning threshold int est_min_covg = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_min_covg < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_min_covg); // Use estimated threshold if threshold not set if(unitig_min < 0) { if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); unitig_min = fallback_thresh; } else if(est_min_covg >= 0) unitig_min = est_min_covg; } // } // Die if we failed to find suitable cleaning threshold if(unitig_min < 0) die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)"); // Cleaning parameters should now be set (>0) or turned off (==0) ctx_assert(unitig_min >= 0); ctx_assert(min_keep_tip >= 0); if(unitig_min || min_keep_tip) { // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0) clean_graph(nthreads, unitig_min, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(out_ctx_path != NULL) { // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= unitig_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(unitig_cleaning) { size_t thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min) : (uint32_t)unitig_min; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); // kmers_loaded=true graph_writer_merge(out_ctx_path, gfiles, num_gfiles, true, all_colours_loaded, edges_union, &outhdr, &db_graph); } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); // TODO: report kmer coverage for each sample graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); ctx_free(edges_union); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_contigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t i, contig_limit = 0, colour = 0; bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R const char *conf_table_path = NULL; // save confidence table to here bool use_missing_info_check = true, seed_with_unused_paths = false; double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min // Read length and expected depth for calculating confidences size_t genome_size = 0; seq_file_t *tmp_seed_file = NULL; SeqFilePtrBuffer seed_buf; seq_file_ptr_buf_alloc(&seed_buf, 16); GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100], shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path,cmd); out_path = optarg; break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case '1': case 's': // --seed <in.fa> if((tmp_seed_file = seq_open(optarg)) == NULL) die("Cannot read --seed file: %s", optarg); seq_file_ptr_buf_add(&seed_buf, tmp_seed_file); break; case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break; case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break; case 'N': cmd_check(!contig_limit,cmd); contig_limit = cmd_uint32_nonzero(cmd, optarg); break; case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break; case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break; case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break; case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break; case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break; case 'C': cmd_check(min_cumul_confid < 0,cmd); min_cumul_confid = cmd_udouble(cmd,optarg); if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case 'T': cmd_check(min_step_confid < 0,cmd); min_step_confid = cmd_udouble(cmd,optarg); if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(cmd_no_reseed && cmd_reseed) cmd_print_usage("Cannot specify both -r and -R"); if(contig_limit && seed_with_unused_paths) cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths"); bool sample_with_replacement = cmd_reseed; // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(!seed_buf.len && !contig_limit && sample_with_replacement) { cmd_print_usage("Please specify one or more of: " "--no-reseed | --ncontigs | --seed <in.fa>"); } if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // char *ctx_path = argv[optind]; // // Open Graph file // // GraphFileReader gfile; // memset(&gfile, 0, sizeof(GraphFileReader)); // graph_file_open(&gfile, ctx_path); // Update colours in graph file - sample in 0, all others in 1 // never need more than two colours ncols = gpath_load_sample_pop(gfiles, num_gfiles, gpfiles.b, gpfiles.len, colour); // Check for compatibility between graph files and path files // pop_colour is colour 1 graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1); if(!genome_size) { char nk_str[50]; if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming"); genome_size = ctx_max_kmers; ulong_to_str(genome_size, nk_str); status("Taking number of kmers as genome size: %s", nk_str); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols + !sample_with_replacement; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // Load contig hist distribution from ctp files ZeroSizeBuffer contig_hist; memset(&contig_hist, 0, sizeof(contig_hist)); for(i = 0; i < gpfiles.len; i++) { gpath_reader_load_contig_hist(gpfiles.b[i].json, gpfiles.b[i].fltr.path.b, file_filter_fromcol(&gpfiles.b[i].fltr, 0), &contig_hist); } // Calculate confidences, only for one colour ContigConfidenceTable conf_table; conf_table_alloc(&conf_table, 1); conf_table_update_hist(&conf_table, 0, genome_size, contig_hist.b, contig_hist.len); if(conf_table_path != NULL) { conf_table_save(&conf_table, conf_table_path); } zsize_buf_dealloc(&contig_hist); // // Output file if printing // FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; // Allocate dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); uint8_t *visited = NULL; if(!sample_with_replacement) visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Load graph LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = true}; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &stats); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load path files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); AssembleContigStats assem_stats; assemble_contigs_stats_init(&assem_stats); assemble_contigs(nthreads, seed_buf.b, seed_buf.len, contig_limit, visited, use_missing_info_check, seed_with_unused_paths, min_step_confid, min_cumul_confid, fout, out_path, &assem_stats, &conf_table, &db_graph, 0); // Sample always loaded into colour zero if(fout && fout != stdout) fclose(fout); assemble_contigs_stats_print(&assem_stats); assemble_contigs_stats_destroy(&assem_stats); conf_table_dealloc(&conf_table); for(i = 0; i < seed_buf.len; i++) seq_close(seed_buf.b[i]); seq_file_ptr_buf_dealloc(&seed_buf); ctx_free(visited); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_view(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // TODO: // print_action actions[argc]; // bool read_kmers = false; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: cmd_print_usage("Programmer fail. Tell Isaac."); } } if(print_kmers) parse_kmers = 1; bool no_flags = (!print_info && !parse_kmers && !print_kmers); if(no_flags) { print_info = parse_kmers = 1; } if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)"); char *path = argv[optind]; size_t num_errors = 0, num_warnings = 0; GraphFileReader gfile; memset(&gfile, 0, sizeof(gfile)); int ret = graph_file_open(&gfile, path); if(ret == 0) die("Cannot open file: %s", path); if(print_info) { char fsize_str[50]; bytes_to_str((size_t)gfile.file_size, 0, fsize_str); printf("Loading file: %s\n", file_filter_path(&gfile.fltr)); printf("File size: %s\n", fsize_str); printf("----\n"); } size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_size = gfile.hdr.kmer_size; ctx_assert(ncols > 0); GraphFileHeader hdr; memset(&hdr, 0, sizeof(hdr)); graph_file_merge_header(&hdr, &gfile); uint64_t nkmers_read = 0, nkmers_loaded = 0; uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0; uint64_t *col_nkmers, *col_sum_covgs; col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0])); col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0])); // Print header if(print_info) print_header(&hdr, gfile.num_of_kmers); BinaryKmer bkmer; Covg covgs[ncols], keep_kmer; Edges edges[ncols]; bool direct_read = file_filter_is_direct(&gfile.fltr); if(parse_kmers || print_kmers) { if(print_info && print_kmers) printf("----\n"); for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++) { // If kmer has no covg in any samples -> don't load keep_kmer = 0; for(col = 0; col < ncols; col++) { col_nkmers[col] += (covgs[col] > 0); col_sum_covgs[col] += covgs[col]; keep_kmer |= covgs[col]; } if(!direct_read && !keep_kmer) continue; nkmers_loaded++; /* Kmer Checks */ // graph_file_read_reset() already checks for: // 1. oversized kmers // 2. kmers with covg 0 in all colours // 3. edges without coverage in a colour // Check for all-zeros (i.e. all As kmer: AAAAAA) uint64_t kmer_words_or = 0; for(i = 0; i < hdr.num_of_bitfields; i++) kmer_words_or |= bkmer.b[i]; if(kmer_words_or == 0) { if(num_all_zero_kmers == 1) { loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n", nkmers_read); } num_all_zero_kmers++; } // Check covg is 0 for all colours for(i = 0; i < ncols && covgs[i] == 0; i++); num_zero_covg_kmers += (i == ncols); // Print if(print_kmers) db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout); } } // check for various reading errors // if(errno != 0) // loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno)); int err = ferror(gfile.fh); if(err != 0) loading_error("occurred after file reading [%i]\n", err); char nstr[50]; if(print_kmers || parse_kmers) { // file_size is set to -1 if we are reading from a stream, // therefore won't be able to check number of kmers read if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) { loading_warning("Expected %zu kmers, read %zu\n", (size_t)gfile.num_of_kmers, (size_t)nkmers_read); } if(num_all_zero_kmers > 1) { loading_error("%s all-zero-kmers seen\n", ulong_to_str(num_all_zero_kmers, nstr)); } if(num_zero_covg_kmers > 0) { loading_warning("%s kmers have no coverage in any colour\n", ulong_to_str(num_zero_covg_kmers, nstr)); } } // Count warnings printed by graph_file_reader.c num_warnings += gfile.error_zero_covg; num_warnings += gfile.error_missing_covg; // Can only print these stats if we're read in the kmers if((print_kmers || parse_kmers) && print_info) { // print kmer coverage per sample printf("\n---- Per colour stats\n"); printf("num. kmers:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_nkmers[col], nstr)); printf("\n"); printf("sum coverage:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr)); printf("\n"); printf("kmer coverage:"); for(col = 0; col < ncols; col++) printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col])); printf("\n"); // Overall stats uint64_t sum_covgs = 0; double mean_kmer_covg = 0.0; for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col]; mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0; printf("\n---- Overall stats\n"); printf("Total kmers: %s\n", ulong_to_str(nkmers_loaded, nstr)); printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr)); printf("Mean coverage: %s\n", double_to_str(mean_kmer_covg, 2, nstr)); } if(print_info) { // Print memory stats uint64_t mem, capacity, num_buckets, req_capacity; uint8_t bucket_size; req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY); capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size); mem = ht_mem(bucket_size, num_buckets, sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8); char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100]; bytes_to_str(mem, 1, memstr); ulong_to_str(capacity, capacitystr); ulong_to_str(bucket_size, bucket_size_str); ulong_to_str(num_buckets, num_buckets_str); size_t mem_height = (size_t)__builtin_ctzl(num_buckets); printf("\n---- Memory\n"); printf("memory required: %s [capacity: %s]\n", memstr, capacitystr); printf(" bucket size: %s; number of buckets: %s\n", bucket_size_str, num_buckets_str); printf(" --kmer_size %zu --mem_height %zu --mem_width %i\n", kmer_size, mem_height, bucket_size); } if((print_kmers || parse_kmers) && print_info) { printf("\n----\n"); if(num_warnings > 0 || num_errors > 0) { printf("Warnings: %zu; Errors: %zu\n", (size_t)num_warnings, (size_t)num_errors); } if(num_errors == 0) printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n"); } ctx_free(col_nkmers); ctx_free(col_sum_covgs); // Close file (which zeros it) graph_file_close(&gfile); graph_header_dealloc(&hdr); return num_errors ? EXIT_FAILURE : EXIT_SUCCESS; }
void seq_parse_pe_sf(seq_file_t *sf1, seq_file_t *sf2, uint8_t ascii_fq_offset, read_t *r1, read_t *r2, void (*read_func)(read_t *_r1, read_t *_r2, uint8_t _qoffset1, uint8_t _qoffset2, void *_ptr), void *reader_ptr) { if(sf2 == NULL) { seq_parse_se_sf(sf1, ascii_fq_offset, r1, read_func, reader_ptr); return; } status("[seq] Parsing sequence files %s %s\n", futil_inpath_str(sf1->path), futil_inpath_str(sf2->path)); // Guess offset if needed uint8_t qoffset1 = ascii_fq_offset, qoffset2 = ascii_fq_offset; uint8_t qmin1 = ascii_fq_offset, qmin2 = ascii_fq_offset; uint8_t qmax1 = 126, qmax2 = 126; if(ascii_fq_offset == 0) { int fmt1, fmt2; if((fmt1 = guess_fastq_format(sf1)) != -1) { qmin1 = (uint8_t)FASTQ_MIN[fmt1]; qmax1 = (uint8_t)FASTQ_MAX[fmt1]; qoffset1 = (uint8_t)FASTQ_OFFSET[fmt1]; } if((fmt2 = guess_fastq_format(sf2)) != -1) { qmin2 = (uint8_t)FASTQ_MIN[fmt2]; qmax2 = (uint8_t)FASTQ_MAX[fmt2]; qoffset2 = (uint8_t)FASTQ_OFFSET[fmt2]; } } // warn_flags keeps track of which of the error msgs have been printed // (only print each error msg once per file) uint8_t warn_flags = 0; int success1, success2; size_t num_pe_pairs = 0; while(1) { success1 = seq_read_primary(sf1, r1); success2 = seq_read_primary(sf2, r2); if(success1 < 0) warn("input error: %s", sf1->path); if(success2 < 0) warn("input error: %s", sf2->path); if(!success1 != !success2) { warn("Different number of reads in pe files [%s; %s]\n", sf1->path, sf2->path); } if(success1 <= 0 || success2 <= 0) break; // PE // We don't care about read orientation at this point warn_flags = check_new_read(r1, qmin1, qmax1, sf1->path, warn_flags); warn_flags = check_new_read(r2, qmin2, qmax2, sf2->path, warn_flags); read_func(r1, r2, qoffset1, qoffset2, reader_ptr); num_pe_pairs++; } char num_pe_pairs_str[100]; ulong_to_str(num_pe_pairs, num_pe_pairs_str); status("[seq] Loaded %s read pairs (files: %s, %s)", num_pe_pairs_str, futil_inpath_str(sf1->path), futil_inpath_str(sf2->path)); }
void seq_parse_interleaved_sf(seq_file_t *sf, uint8_t ascii_fq_offset, read_t *r1, read_t *r2, void (*read_func)(read_t *_r1, read_t *_r2, uint8_t _qoffset1, uint8_t _qoffset2, void *_ptr), void *reader_ptr) { status("[seq] Reading a (possibly) interleaved file (expect both S.E. & P.E. reads)"); // Guess offset if needed uint8_t qoffset = ascii_fq_offset; uint8_t qmin = ascii_fq_offset, qmax = 126; int format; if(ascii_fq_offset == 0 && (format = guess_fastq_format(sf)) != -1) { qmin = (uint8_t)FASTQ_MIN[format]; qmax = (uint8_t)FASTQ_MAX[format]; qoffset = (uint8_t)FASTQ_OFFSET[format]; } read_t *r[2] = {r1,r2}; int ridx = 0, s; uint8_t warn_flags = 0; size_t num_se_reads = 0, num_pe_pairs = 0; while((s = seq_read_primary(sf, r[ridx])) > 0) { warn_flags = check_new_read(r[ridx], qmin, qmax, sf->path, warn_flags); if(ridx) { // ridx == 1 if(seq_read_names_cmp(r[0]->name.b, r[1]->name.b) == 0) { // Either read may be the first in the pair if from SAM/BAM int r0 = (r[1]->from_sam && seq_read_bam(r[1])->core.flag & BAM_FREAD1); read_func(r[r0], r[!r0], qoffset, qoffset, reader_ptr); num_pe_pairs++; ridx = 0; } else { read_func(r[0], NULL, qoffset, 0, reader_ptr); num_se_reads++; SWAP(r[0], r[1]); ridx = 1; } } else ridx = 1; } // Process last read if(ridx == 1) { read_func(r[0], NULL, qoffset, 0, reader_ptr); num_se_reads++; } if(s < 0) warn("Input error: %s\n", sf->path); char num_se_reads_str[100], num_pe_pairs_str[100]; ulong_to_str(num_pe_pairs, num_pe_pairs_str); ulong_to_str(num_se_reads, num_se_reads_str); status("[seq] Loaded %s reads and %s reads pairs (file: %s)", num_se_reads_str, num_pe_pairs_str, futil_inpath_str(sf->path)); }
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? -1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }