static void parse_cmdline_args(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq == SIZE_MAX,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len == SIZE_MAX,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len == SIZE_MAX,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'D': cmd_check(max_path_diff == SIZE_MAX, cmd); max_path_diff = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" calls2vcf -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = default_out_path; if(min_mapq == SIZE_MAX) min_mapq = DEFAULT_MIN_MAPQ; if(max_align_len == SIZE_MAX) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len == SIZE_MAX) max_allele_len = DEFAULT_MAX_ALLELE; if(max_path_diff == SIZE_MAX) max_path_diff = DEFAULT_MAX_PDIFF; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); input_path = argv[optind++]; ref_paths = argv + optind; num_ref_paths = argc - optind; }
// if one of the files is reading from stdin, sum_kmers_ptr is set to 0 // `max_cols_ptr` is used to return the most colours being loaded from a single file // returns the number of colours being loaded in total size_t graph_files_open(char **graph_paths, GraphFileReader *gfiles, size_t num_gfiles, size_t *max_kmers_ptr, size_t *sum_kmers_ptr) { size_t i, ctx_max_kmers = 0, ctx_sum_kmers = 0; bool ctx_uses_stdin = false; size_t ncols = 0; for(i = 0; i < num_gfiles; i++) { memset(&gfiles[i], 0, sizeof(GraphFileReader)); graph_file_open2(&gfiles[i], graph_paths[i], "r", ncols); if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size); } ncols = MAX2(ncols, file_filter_into_ncols(&gfiles[i].fltr)); ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i])); ctx_sum_kmers += graph_file_nkmers(&gfiles[i]); ctx_uses_stdin |= file_filter_isstdin(&gfiles[i].fltr); } if(ctx_uses_stdin) ctx_sum_kmers = SIZE_MAX; *max_kmers_ptr = ctx_max_kmers; *sum_kmers_ptr = ctx_sum_kmers; return ncols; }
static void flanks_sam_open() { if(!futil_path_has_extension(sam_path, ".bam") && !futil_path_has_extension(sam_path, ".sam")) { cmd_print_usage("Mapped flanks is not .sam or .bam file: %s", sam_path); } bool isbam = futil_path_has_extension(sam_path, ".bam"); samfh = sam_open(sam_path, isbam ? "rb" : "rs"); if(samfh == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_header = sam_hdr_read(samfh); bamentry = bam_init1(); }
int ctx_calls2vcf(int argc, char **argv) { parse_cmdline_args(argc, argv); size_t i; // These functions call die() on error gzFile gzin = futil_gzopen(input_path, "r"); nw_aligner_setup(); // Read file header cJSON *json = read_input_header(gzin); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, input_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) input_bubble_format = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) input_bubble_format = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(input_path), input_bubble_format ? "bubble" : "breakpoint"); if(input_bubble_format && sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); // Open flank file if it exists if(sam_path) flanks_sam_open(); // Open output file FILE *fout = futil_fopen_create(out_path, "w"); // Load reference genome read_buf_alloc(&chroms, 1024); genome = kh_init(ChromHash); seq_reader_load_ref_genome(ref_paths, num_ref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!input_bubble_format) brkpnt_check_refs_match(json, input_path); // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, input_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, input_path); num_samples = 0; if(!input_bubble_format) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } print_vcf_header(json, !input_bubble_format, fout); status("Reading %s call file with %zu samples", input_bubble_format ? 
"Bubble" : "Breakpoint", num_graph_samples); status("Writing a VCF with %zu samples", num_samples); parse_entries(gzin, fout); // Print stats char num_entries_read_str[50]; char num_vars_printed_str[50]; ulong_to_str(num_entries_read, num_entries_read_str); ulong_to_str(num_vars_printed, num_vars_printed_str); status("Read %s entries, printed %s vcf entries to: %s", num_entries_read_str, num_vars_printed_str, futil_outpath_str(out_path)); if(input_bubble_format) { char msg[200]; // Bubble caller specific print_stat(num_flank5p_unmapped, num_entries_read, "flank 5p unmapped"); sprintf(msg, "flank 5p low mapq (<%zu)", min_mapq); print_stat(num_flank5p_lowqual, num_entries_read, msg); print_stat(num_flank3p_not_found, num_entries_read, "flank 3p not found"); print_stat(num_flank3p_multihits, num_entries_read, "flank 3p multiple hits"); print_stat(num_flank3p_approx_match,num_entries_read, "flank 3p approx match used"); print_stat(num_flank3p_exact_match, num_entries_read, "flank 3p exact match"); } else { // Breakpoint caller specific print_stat(num_flanks_not_uniquely_mapped, num_entries_read, "flank pairs contain one flank not mapped uniquely"); print_stat(num_flanks_diff_chroms, num_entries_read, "flank pairs map to diff chroms"); print_stat(num_flanks_diff_strands, num_entries_read, "flank pairs map to diff strands"); } print_stat(num_flanks_too_far_apart, num_entries_read, "flank pairs too far apart"); print_stat(num_flanks_overlap_too_large, num_entries_read, "flank pairs overlap too much"); print_stat(num_entries_well_mapped, num_entries_read, "flank pairs map well"); status("Aligned %zu allele pairs and %zu flanks", num_nw_allele, num_nw_flank); // Finished - clean up cJSON_Delete(json); gzclose(gzin); fclose(fout); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); kh_destroy_ChromHash(genome); nw_aligner_destroy(); if(sam_path) flanks_sam_close(); // hide unused method warnings (void)kh_del_ChromHash; 
(void)kh_put_ChromHash; (void)kh_get_ChromHash; (void)kh_clear_ChromHash; (void)kh_destroy_ChromHash; (void)kh_init_ChromHash; return EXIT_SUCCESS; }
int ctx_index(int argc, char **argv) { const char *out_path = NULL; size_t block_size = 0, block_kmers = 0; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'b': cmd_check(!block_kmers, cmd); block_kmers = cmd_size_nonzero(cmd, optarg); break; case 's': cmd_check(!block_size, cmd); block_size = cmd_size_nonzero(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); if(block_size && block_kmers) cmd_print_usage("Cannot use --block-kmers and --block-size together"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); // Open output file FILE *fout = out_path ? 
futil_fopen_create(out_path, "w") : stdout; // Start size_t filencols = gfile.hdr.num_of_cols; size_t kmer_size = gfile.hdr.kmer_size; const char *path = file_filter_path(&gfile.fltr); size_t ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols; if(block_size) { block_kmers = block_size / kmer_mem; } else if(!block_size && !block_kmers) { block_size = 4 * ONE_MEGABYTE; block_kmers = block_size / kmer_mem; } // Update block-size block_size = block_kmers * kmer_mem; status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu", block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size); if(block_kmers == 0) die("Cannot set block_kmers to zero"); // Print header fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout); BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO; BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO; Covg *covgs = ctx_malloc(ncols * sizeof(Covg)); Edges *edges = ctx_malloc(ncols * sizeof(Edges)); char bkmerstr[MAX_KMER_SIZE+1]; size_t rem_block = block_size - kmer_mem; // block after first kmer char *tmp_mem = ctx_malloc(rem_block); // Read in file, print index size_t nblocks = 0; size_t bl_bytes = 0, bl_kmers = 0; size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0; while(1) { if(!graph_file_read(&gfile, &bkmer, covgs, edges)) { status("Read kmer failed"); break; } binary_kmer_to_str(bkmer, kmer_size, bkmerstr); if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer)) die("File is not sorted: %s [%s]", bkmerstr, path); // We've already read one kmer entry, read rest of block bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block); bl_kmers = 1 + bl_bytes / kmer_mem; fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n", bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr, bl_kmer_offset, bl_kmer_offset+bl_kmers); bl_byte_offset += bl_bytes; bl_kmer_offset += bl_kmers; nblocks++; if(bl_kmers < block_kmers) { status("last block %zu < %zu; %zu vs %zu", 
bl_kmers, block_kmers, bl_bytes, block_size); break; } prev_bkmer = bkmer; } ctx_free(covgs); ctx_free(edges); ctx_free(tmp_mem); // done char num_kmers_str[50], num_blocks_str[50]; char block_mem_str[50], block_kmers_str[50]; ulong_to_str(bl_kmer_offset, num_kmers_str); ulong_to_str(nblocks, num_blocks_str); bytes_to_str(block_size, 1, block_mem_str); ulong_to_str(block_kmers, block_kmers_str); status("Read %s kmers in %s block%s (block size %s / %s kmers)", num_kmers_str, num_blocks_str, util_plural_str(nblocks), block_mem_str, block_kmers_str); if(fout != stdout) status("Saved to %s", out_path); graph_file_close(&gfile); fclose(fout); return EXIT_SUCCESS; }
void cmd_parse(const char * str) { int name_len; name_len=0; char cmd_name[MAX_LEN_NAME]; int arg_len; arg_len=0; char cmd_arg[MAX_LEN_ARG]; while(*str && *str != ' ') { cmd_name[name_len] = *str; name_len++; str++; } cmd_name[name_len] = '\0'; while(*str && *str == ' ') str++; while(*str && *str != ' ') { cmd_arg[arg_len] = *str; arg_len++; str++; } cmd_arg[arg_len] = '\0'; if (console_streq(cmd_name, led_cmd.name)) { int arg_value; led_cfg_set(1); if(arg_len == 0) led_dat_set(!led_dat_get_state()); else if (arg_len == 1) { arg_value = atoi(cmd_arg); if(arg_value == 1 || arg_value == 0) led_dat_set(arg_value); else { console_print("Bad argument : Not 0 or 1.\r\n"); cmd_print_usage(led_cmd); } } else { console_print("Bad number of arguments\r\n"); cmd_print_usage(led_cmd); } console_prompt(); } else if (console_streq(cmd_name, gpio_cmd.name)) { gpio_output_set(9); gpio_activate(9); console_prompt(); } else { console_print("Unknown command : "); console_print(cmd_name); console_print("\r\n"); console_prompt(); } }
int ctx_exp_abc(int argc, char **argv) { size_t i, nthreads = 0, num_repeats = 0, max_AB_dist = 0; struct MemArgs memargs = MEM_ARGS_INIT; bool print_failed_contigs = false; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 'N': cmd_check(!num_repeats,cmd); num_repeats = cmd_uint32_nonzero(cmd, optarg); break; case 'M': cmd_check(!max_AB_dist,cmd); max_AB_dist = cmd_uint32_nonzero(cmd, optarg); break; case 'P': cmd_check(!print_failed_contigs,cmd); print_failed_contigs = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" exp_abc -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(num_repeats == 0) num_repeats = DEFAULT_NUM_REPEATS; if(max_AB_dist == 0) max_AB_dist = DEFAULT_MAX_AB_DIST; if(print_failed_contigs && nthreads != 1) { warn("--print forces nthreads to be one. 
soz."); nthreads = 1; } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, ctx_path); size_t ncols = file_filter_into_ncols(&gfile.fltr); // Check only loading one colour if(ncols > 1) die("Only implemented for one colour currently"); // Check graph + paths are compatible graphs_gpaths_compatible(&gfile, 1, gpfiles.b, gpfiles.len, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, gfile.num_of_kmers, gfile.num_of_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile.hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // Load the graph GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; graph_load(&gfile, gprefs, NULL); graph_file_close(&gfile); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); 
gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); status("\n"); status("Test 1: Priming region A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, true, nthreads, num_repeats, max_AB_dist, print_failed_contigs); status("\n"); status("Test 2: Trying to traverse A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, false, nthreads, num_repeats, max_AB_dist, print_failed_contigs); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_pop_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; int32_t max_covg = -1; // max mean coverage to remove <=0 => ignore int32_t max_klen = -1; // max length (kmers) to remove <=0 => ignore int32_t max_kdiff = -1; // max diff between bubble branch lengths <0 => ignore // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'C': cmd_check(max_covg<0, cmd); max_covg = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_klen<0, cmd); max_klen = cmd_uint32(cmd, optarg); break; case 'D': cmd_check(max_kdiff<0, cmd); max_kdiff = cmd_uint32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" pop -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); bool reread_graph_to_filter = (num_gfiles == 1 && strcmp(file_filter_path(&gfiles[0].fltr),"-") != 0); if(reread_graph_to_filter) { file_filter_flatten(&gfiles[0].fltr, 0); ncols = 1; } // Check graphs are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8*ncols + sizeof(Edges)*8*ncols + 2; // 1 bit for visited, 1 for removed kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Check out_path is writable futil_create_output(out_path); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); size_t nkwords = roundup_bits2bytes(db_graph.ht.capacity); uint8_t *visited = ctx_calloc(1, nkwords); uint8_t *rmvbits = ctx_calloc(1, nkwords); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); PopBubblesPrefs prefs = 
{.max_rmv_covg = max_covg, .max_rmv_klen = max_klen, .max_rmv_kdiff = max_kdiff}; size_t npopped = 0; char npopped_str[50]; status("Popping bubbles..."); npopped = pop_bubbles(&db_graph, nthreads, prefs, visited, rmvbits); ulong_to_str(npopped, npopped_str); status("Popped %s bubbles", npopped_str); size_t nkmers0 = db_graph.ht.num_kmers; status("Removing nodes..."); for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i]; prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph); size_t nkmers1 = db_graph.ht.num_kmers; ctx_assert(nkmers1 <= nkmers0); char nkmers0str[50], nkmers1str[50], ndiffstr[50]; ulong_to_str(nkmers0, nkmers0str); ulong_to_str(nkmers1, nkmers1str); ulong_to_str(nkmers0-nkmers1, ndiffstr); status("Number of kmers %s -> %s (-%s)", nkmers0str, nkmers1str, ndiffstr); if(reread_graph_to_filter) { status("Streaming filtered file to: %s\n", out_path); GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, graph_paths[0]); graph_writer_stream_mkhdr(out_path, &gfile, &db_graph, db_graph.col_edges, NULL); graph_file_close(&gfile); } else { status("Saving to: %s\n", out_path); graph_writer_save_mkhdr(out_path, &db_graph, CTX_GRAPH_FILEFORMAT, NULL, 0, ncols); } ctx_free(visited); ctx_free(rmvbits); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_contigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t i, contig_limit = 0, colour = 0; bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R const char *conf_table_path = NULL; // save confidence table to here bool use_missing_info_check = true, seed_with_unused_paths = false; double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min // Read length and expected depth for calculating confidences size_t genome_size = 0; seq_file_t *tmp_seed_file = NULL; SeqFilePtrBuffer seed_buf; seq_file_ptr_buf_alloc(&seed_buf, 16); GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100], shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path,cmd); out_path = optarg; break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case '1': case 's': // --seed <in.fa> if((tmp_seed_file = seq_open(optarg)) == NULL) die("Cannot read --seed file: %s", optarg); seq_file_ptr_buf_add(&seed_buf, tmp_seed_file); break; case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break; case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break; case 'N': cmd_check(!contig_limit,cmd); contig_limit = cmd_uint32_nonzero(cmd, optarg); break; case 
'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break; case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break; case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break; case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break; case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break; case 'C': cmd_check(min_cumul_confid < 0,cmd); min_cumul_confid = cmd_udouble(cmd,optarg); if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case 'T': cmd_check(min_step_confid < 0,cmd); min_step_confid = cmd_udouble(cmd,optarg); if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(cmd_no_reseed && cmd_reseed) cmd_print_usage("Cannot specify both -r and -R"); if(contig_limit && seed_with_unused_paths) cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths"); bool sample_with_replacement = cmd_reseed; // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(!seed_buf.len && !contig_limit && sample_with_replacement) { cmd_print_usage("Please specify one or more of: " "--no-reseed | --ncontigs | --seed <in.fa>"); } if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // char *ctx_path = argv[optind]; // // Open Graph file // // GraphFileReader gfile; // memset(&gfile, 0, sizeof(GraphFileReader)); // graph_file_open(&gfile, ctx_path); // Update colours in graph file - sample in 0, all others in 1 // 
never need more than two colours ncols = gpath_load_sample_pop(gfiles, num_gfiles, gpfiles.b, gpfiles.len, colour); // Check for compatibility between graph files and path files // pop_colour is colour 1 graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1); if(!genome_size) { char nk_str[50]; if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming"); genome_size = ctx_max_kmers; ulong_to_str(genome_size, nk_str); status("Taking number of kmers as genome size: %s", nk_str); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols + !sample_with_replacement; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // Load contig hist distribution from ctp files ZeroSizeBuffer contig_hist; memset(&contig_hist, 0, sizeof(contig_hist)); for(i = 0; i < gpfiles.len; i++) { gpath_reader_load_contig_hist(gpfiles.b[i].json, gpfiles.b[i].fltr.path.b, file_filter_fromcol(&gpfiles.b[i].fltr, 0), &contig_hist); } // Calculate confidences, only for one colour ContigConfidenceTable conf_table; conf_table_alloc(&conf_table, 1); conf_table_update_hist(&conf_table, 0, genome_size, contig_hist.b, contig_hist.len); if(conf_table_path != NULL) { conf_table_save(&conf_table, conf_table_path); } 
zsize_buf_dealloc(&contig_hist); // // Output file if printing // FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; // Allocate dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); uint8_t *visited = NULL; if(!sample_with_replacement) visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Load graph LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = true}; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &stats); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load path files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); AssembleContigStats assem_stats; assemble_contigs_stats_init(&assem_stats); assemble_contigs(nthreads, seed_buf.b, seed_buf.len, contig_limit, visited, use_missing_info_check, seed_with_unused_paths, min_step_confid, min_cumul_confid, fout, out_path, &assem_stats, &conf_table, &db_graph, 0); // Sample always loaded into colour zero if(fout && fout != stdout) fclose(fout); assemble_contigs_stats_print(&assem_stats); assemble_contigs_stats_destroy(&assem_stats); conf_table_dealloc(&conf_table); for(i = 0; i < seed_buf.len; i++) seq_close(seed_buf.b[i]); seq_file_ptr_buf_dealloc(&seed_buf); ctx_free(visited); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_thread(int argc, char **argv) { struct ReadThreadCmdArgs args; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, false); GraphFileReader *gfile = &args.gfile; GPathFileBuffer *gpfiles = &args.gpfiles; CorrectAlnInputBuffer *inputs = &args.inputs; size_t i; if(args.zero_link_counts && gpfiles->len == 0) cmd_print_usage("-0,--zero-paths without -p,--paths <in.ctp> has no meaning"); // Check each path file only loads one colour gpaths_only_for_colour(gpfiles->b, gpfiles->len, 0); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, total_mem; size_t path_hash_mem, path_store_mem, path_mem; bool sep_path_list = (!args.use_new_paths && gpfiles->len > 0); bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + 2 * args.nthreads; // Have traversed // false -> don't use mem_to_use to decide how many kmers to store in hash // since we need some of that memory for storing paths kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, gfile->num_of_kmers, gfile->num_of_kmers, false, &graph_mem); // Paths memory size_t min_path_mem = 0; gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem); if(graph_mem + min_path_mem > args.memargs.mem_to_use) { char buf[50]; die("Require at least %s memory", bytes_to_str(graph_mem+min_path_mem, 1, buf)); } path_mem = args.memargs.mem_to_use - graph_mem; size_t pentry_hash_mem = sizeof(GPEntry)/0.7; size_t pentry_store_mem = sizeof(GPath) + 8 + // struct + sequence 1 + // in colour sizeof(uint8_t) + // counts sizeof(uint32_t); // kmer length size_t max_paths = path_mem / (pentry_store_mem + pentry_hash_mem); path_store_mem = max_paths * pentry_store_mem; path_hash_mem = max_paths * pentry_hash_mem; cmd_print_mem(path_hash_mem, "paths hash"); cmd_print_mem(path_store_mem, "paths store"); total_mem = graph_mem + path_mem; 
cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(args.out_ctp_path, "w"); status("Creating paths file: %s", futil_outpath_str(args.out_ctp_path)); // // Allocate memory // dBGraph db_graph; size_t kmer_size = gfile->hdr.kmer_size; db_graph_alloc(&db_graph, kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Split path memory 2:1 between store and hash // Create a path store that tracks path counts gpath_store_alloc(&db_graph.gpstore, db_graph.num_of_cols, db_graph.ht.capacity, 0, path_store_mem, true, sep_path_list); // Create path hash table for fast lookup gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, path_hash_mem); if(args.use_new_paths) { status("Using paths as they are added (risky)"); } else { status("Not using new paths as they are added (safe)"); } // // Start up workers to add paths to the graph // GenPathWorker *workers; workers = gen_paths_workers_alloc(args.nthreads, &db_graph); // Setup for loading graphs graph LoadingStats gstats; loading_stats_init(&gstats); // Path statistics LoadingStats *load_stats = gen_paths_get_stats(workers); CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers); // Load contig hist distribution for(i = 0; i < gpfiles->len; i++) { gpath_reader_load_contig_hist(gpfiles->b[i].json, gpfiles->b[i].fltr.path.b, file_filter_fromcol(&gpfiles->b[i].fltr, 0), &aln_stats->contig_histgrm); } GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // already loaded paths // Load graph, print stats, close file graph_load(gfile, gprefs, &gstats); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load existing paths for(i = 0; i < gpfiles->len; i++) gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // zero link counts of already loaded links if(args.zero_link_counts) { status("Zeroing link 
counts for loaded links"); gpath_set_zero_nseen(&db_graph.gpstore.gpset); } if(!args.use_new_paths) gpath_store_split_read_write(&db_graph.gpstore); // Deal with a set of files at once // Can have different numbers of inputs vs threads size_t start, end; for(start = 0; start < inputs->len; start += MAX_IO_THREADS) { end = MIN2(inputs->len, start+MAX_IO_THREADS); generate_paths(inputs->b+start, end-start, workers, args.nthreads); } // Print memory statistics gpath_hash_print_stats(&db_graph.gphash); gpath_store_print_stats(&db_graph.gpstore); correct_aln_dump_stats(aln_stats, load_stats, args.dump_seq_sizes, args.dump_frag_sizes, db_graph.ht.num_kmers); // Don't need GPathHash anymore gpath_hash_dealloc(&db_graph.gphash); cJSON **hdrs = ctx_malloc(gpfiles->len * sizeof(cJSON*)); for(i = 0; i < gpfiles->len; i++) hdrs[i] = gpfiles->b[i].json; size_t output_threads = MIN2(args.nthreads, MAX_IO_THREADS); // Generate a cJSON header for all inputs cJSON *thread_hdr = cJSON_CreateObject(); cJSON *inputs_hdr = cJSON_CreateArray(); cJSON_AddItemToObject(thread_hdr, "inputs", inputs_hdr); for(i = 0; i < inputs->len; i++) cJSON_AddItemToArray(inputs_hdr, correct_aln_input_json_hdr(&inputs->b[i])); // Write output file gpath_save(gzout, args.out_ctp_path, output_threads, true, "thread", thread_hdr, hdrs, gpfiles->len, &aln_stats->contig_histgrm, 1, &db_graph); gzclose(gzout); ctx_free(hdrs); // Optionally run path checks for debugging // gpath_checks_all_paths(&db_graph, args.nthreads); // ins_gap, err_gap no longer allocated after this line gen_paths_workers_dealloc(workers, args.nthreads); // Close and free input files etc. read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
// Returns 0 on success, otherwise != 0 int ctx_unitigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; UnitigSyntax syntax = PRINT_FASTA; bool dot_use_points = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'F': cmd_check(!syntax, cmd); syntax = PRINT_FASTA; break; case 'g': cmd_check(!syntax, cmd); syntax = PRINT_GFA; break; case 'd': cmd_check(!syntax, cmd); syntax = PRINT_DOT; break; case 'P': cmd_check(!dot_use_points, cmd); dot_use_points = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" unitigs -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } if(dot_use_points && syntax == PRINT_FASTA) cmd_print_usage("--point is only for use with --dot"); // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage(NULL); size_t i, num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; if(dot_use_points && syntax != PRINT_DOT) cmd_print_usage("--points only valid with --graphviz / --dot"); ctx_assert(num_gfiles > 0); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + 1; if(syntax != PRINT_FASTA) bits_per_kmer += sizeof(UnitigEnd) * 8; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); status("Output in %s format to %s\n", syntax_strs[syntax], futil_outpath_str(out_path)); // // Open output file // // Print to stdout unless --out <out> is specified FILE *fout = futil_fopen_create(out_path, "w"); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES); UnitigPrinter printer; unitig_printer_init(&printer, &db_graph, nthreads, syntax, fout); if(syntax == PRINT_DOT || syntax == PRINT_GFA) unitig_graph_alloc(&printer.ugraph, &db_graph); // Load graphs GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = false}; for(i = 0; i < num_gfiles; i++) { file_filter_flatten(&gfiles[i].fltr, 0); graph_load(&gfiles[i], gprefs, NULL); 
graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); switch(syntax) { case PRINT_FASTA: status("Printing unitgs in FASTA using %zu threads", nthreads); supernodes_iterate(nthreads, printer.visited, &db_graph, print_unitig_fasta, &printer); break; case PRINT_GFA: print_gfa_syntax(&printer); break; case PRINT_DOT: print_dot_syntax(&printer, dot_use_points); break; default: die("Invalid print syntax: %i", syntax); } char num_unitigs_str[50]; ulong_to_str(printer.num_unitigs, num_unitigs_str); status("Dumped %s unitigs\n", num_unitigs_str); fclose(fout); unitig_printer_destroy(&printer); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_correct(int argc, char **argv) { size_t i, j; struct ReadThreadCmdArgs args = READ_THREAD_CMD_ARGS_INIT; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, true); GraphFileReader *gfile = &args.gfile; PathFileBuffer *pfiles = &args.pfiles; CorrectAlnInputBuffer *inputs = &args.inputs; size_t ctx_total_cols = gfile->hdr.num_of_cols; size_t ctx_num_kmers = gfile->num_of_kmers; if(args.colour > ctx_total_cols) cmd_print_usage("-c %zu is too big [> %zu]", args.colour, ctx_total_cols); size_t ctp_usedcols = 0; for(i = 0; i < pfiles->len; i++) { if(!file_filter_iscolloaded(&pfiles->data[i].fltr, args.colour)) { cmd_print_usage("Path file doesn't load into colour %zu: %s", args.colour, pfiles->data[i].fltr.orig_path.buff); } ctp_usedcols = MAX2(ctp_usedcols, path_file_usedcols(&pfiles->data[i])); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of noreseed bits_per_kmer = sizeof(Edges)*8 + ctx_num_kmers + sizeof(uint64_t)*8; kmers_in_hash = cmd_get_kmers_in_hash2(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, ctx_num_kmers, ctx_num_kmers, false, &graph_mem); // Paths memory path_mem = path_files_mem_required(pfiles->data, pfiles->len, false, false, ctp_usedcols, 0); cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Check we can read all output files // // Open output files SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput)); bool output_files_exist = false; for(i = 0; i < inputs->len; i++) { CorrectAlnInput *input = &inputs->data[i]; input->crt_params.ctxcol = input->crt_params.ctpcol = args.colour; SeqOutput *output = &outputs[i]; seq_output_alloc(output); seq_output_set_paths(output, input->out_base, async_task_pe_output(&input->files)); input->output = 
output; // output check prints warnings and returns true if errors output_files_exist |= seq_output_files_exist_check(output); } // Abandon if some of the output files already exist if(output_files_exist) die("Output files already exist"); // Attempt to open all files for(i = 0; i < inputs->len && seq_output_open(&outputs[i]); i++) {} // Check if something went wrong - if so remove all output files if(i < inputs->len) { for(j = 0; j < i; j++) seq_output_delete(&outputs[i]); die("Couldn't open output files"); } // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ctx_total_cols, 1, kmers_in_hash); size_t bytes_per_col = roundup_bits2bytes(db_graph.ht.capacity); db_graph.col_edges = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); db_graph.node_in_cols = ctx_calloc(bytes_per_col * ctx_total_cols, 1); // Paths path_store_alloc(&db_graph.pstore, path_mem, false, db_graph.ht.capacity, ctp_usedcols); // // Load Graph and Path files // LoadingStats gstats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = true}; // Load graph, print stats, close file graph_load(gfile, gprefs, &gstats); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load path files (does nothing if num_fpiles == 0) paths_format_merge(pfiles->data, pfiles->len, false, false, args.num_of_threads, &db_graph); // // Run alignment // correct_reads(args.num_of_threads, MAX_IO_THREADS, inputs->data, inputs->len, &db_graph); // Close and free output files for(i = 0; i < inputs->len; i++) seq_output_dealloc(&outputs[i]); ctx_free(outputs); read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_infer_edges(int argc, char **argv) { size_t num_of_threads = DEFAULT_NTHREADS; struct MemArgs memargs = MEM_ARGS_INIT; char *out_ctx_path = NULL; bool add_pop_edges = false, add_all_edges = false; // Arg parsing char cmd[100]; char shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_ctx_path,cmd); out_ctx_path = optarg; break; case 't': num_of_threads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'A': add_all_edges = true; break; case 'P': add_pop_edges = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" inferedges -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Default to adding all edges if(!add_pop_edges && !add_all_edges) add_all_edges = true; // Can only specify one of --pop --all if(add_pop_edges && add_all_edges) cmd_print_usage("Please specify only one of --all --pop"); // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. 
What is this: '%s'", argv[optind]); // // Open graph file // char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(strchr(graph_path,':') != NULL) cmd_print_usage("Cannot use ':' in input graph for `"CMD" inferedges`"); GraphFileReader file; memset(&file, 0, sizeof(file)); file_filter_open(&file.fltr, graph_path); // Use stat to detect if we are reading from a stream struct stat st; bool reading_stream = (stat(file.fltr.path.b, &st) != 0); // Mode r+ means open (not create) for update (read & write) graph_file_open2(&file, graph_path, reading_stream ? "r" : "r+", 0); if(!file_filter_is_direct(&file.fltr)) cmd_print_usage("Inferedges with filter not implemented - sorry"); bool editing_file = !(out_ctx_path || reading_stream); FILE *fout = NULL; // Editing input file or writing a new file if(!editing_file) fout = futil_fopen_create(out_ctx_path ? out_ctx_path : "-", "w"); // Print output status if(fout == stdout) status("Writing to STDOUT"); else if(fout != NULL) status("Writing to: %s", out_ctx_path); else status("Editing file in place: %s", graph_path); status("Inferring all missing %sedges", add_pop_edges ? "population " : ""); // // Decide on memory // const size_t ncols = file.hdr.num_of_cols; size_t kmers_in_hash, graph_mem, bits_per_kmer; // reading stream: all covgs + edges // reading file: one bit per kmer per colour for 'in colour' bits_per_kmer = sizeof(BinaryKmer)*8; if(reading_stream) { bits_per_kmer += ncols * 8 * (sizeof(Edges) + sizeof(Covg)); } else { bits_per_kmer += ncols; // in colour } kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, file.num_of_kmers, file.num_of_kmers, memargs.mem_to_use_set, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Allocate memory // int alloc_flags = reading_stream ? 
DBG_ALLOC_EDGES | DBG_ALLOC_COVGS : DBG_ALLOC_NODE_IN_COL; dBGraph db_graph; db_graph_alloc(&db_graph, file.hdr.kmer_size, ncols, reading_stream ? ncols : 1, kmers_in_hash, alloc_flags); LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // We need to load the graph for both --pop and --all since we need to check // if the next kmer is in each of the colours graph_load(&file, gprefs, &stats); if(add_pop_edges) status("Inferring edges from population...\n"); else status("Inferring all missing edges...\n"); size_t num_kmers_edited; if(reading_stream) { ctx_assert(fout != NULL); num_kmers_edited = infer_edges(num_of_threads, add_all_edges, &db_graph); graph_write_header(fout, &file.hdr); graph_write_all_kmers(fout, &db_graph); } else if(fout == NULL) { num_kmers_edited = inferedges_on_mmap(&db_graph, add_all_edges, &file); } else { num_kmers_edited = inferedges_on_file(&db_graph, add_all_edges, &file, fout); } if(fout != NULL && fout != stdout) fclose(fout); char modified_str[100], kmers_str[100]; ulong_to_str(num_kmers_edited, modified_str); ulong_to_str(db_graph.ht.num_kmers, kmers_str); double modified_rate = 0; if(db_graph.ht.num_kmers) modified_rate = (100.0 * num_kmers_edited) / db_graph.ht.num_kmers; status("%s of %s (%.2f%%) nodes modified\n", modified_str, kmers_str, modified_rate); if(editing_file) { // Close and re-open fclose(file.fh); file.fh = NULL; futil_update_timestamp(file.fltr.path.b); } graph_file_close(&file); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
static void parse_args(int argc, char **argv) { seq_format fmt = SEQ_FMT_FASTQ; bool invert = false; size_t i; aln_reads_buf_alloc(&inputs, 8); asyncio_buf_alloc(&files, 8); AlignReadsData input; AsyncIOInput seqfiles; // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'F': cmd_check(fmt==SEQ_FMT_FASTQ, cmd); fmt = cmd_parse_format(cmd, optarg); break; case 'v': cmd_check(!invert,cmd); invert = true; break; case '1': case '2': case 'i': memset(&input, 0, sizeof(input)); memset(&seqfiles, 0, sizeof(seqfiles)); asyncio_task_parse(&seqfiles, c, optarg, 0, &input.out_base); aln_reads_buf_push(&inputs, &input, 1); asyncio_buf_push(&files, &seqfiles, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" reads -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } ctx_assert(inputs.len == files.len); // Defaults if(!nthreads) nthreads = DEFAULT_NTHREADS; if(inputs.len == 0) cmd_print_usage("Please specify at least one sequence file (-1, -2 or -i)"); if(optind >= argc) cmd_print_usage("Please specify input graph file(s)"); num_gfiles = (size_t)(argc - optind); gfile_paths = argv + optind; for(i = 0; i < inputs.len; i++) { inputs.b[i].invert = invert; inputs.b[i].fmt = fmt; files.b[i].ptr = &inputs.b[i]; } }
int ctx_links(int argc, char **argv) { size_t limit = 0; const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL; const char *thresh_path = NULL, *hist_path = NULL; size_t hist_distsize = 0, hist_covgsize = 0; size_t cutoff = 0; bool clean = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break; case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break; case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break; case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break; case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break; case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break; case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break; case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" links -h` for help. 
Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist"); if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist"); // Defaults if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST; if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG; if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments"); const char *ctp_path = argv[optind]; bool list = (csv_out_path != NULL); bool plot = (plot_out_path != NULL); bool save = (link_out_path != NULL); bool hist_covg = (thresh_path != NULL || hist_path != NULL); size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1); if(clean && !save) cmd_print_usage("Need to give --out <out.ctp.gz> with --clean"); if(!save && !list && !plot && !hist_covg) cmd_print_usage("Please specify one of --plot, --list or --clean"); if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0) cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!"); // Open input file FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL; FILE *thresh_fh = NULL, *hist_fh = NULL; gzFile link_gz = NULL; // Check file don't exist or that we can overwrite // Will ignore if path is null bool err = false; err |= futil_check_outfile(csv_out_path); err |= futil_check_outfile(plot_out_path); err |= futil_check_outfile(link_out_path); err |= futil_check_outfile(thresh_path); err |= futil_check_outfile(hist_path); if(err) die("Use -f,--force to overwrite files"); StrBuf link_tmp_path; strbuf_alloc(&link_tmp_path, 1024); GPathReader ctpin; memset(&ctpin, 0, sizeof(ctpin)); gpath_reader_open(&ctpin, ctp_path); size_t ncols = file_filter_into_ncols(&ctpin.fltr); size_t kmer_size = gpath_reader_get_kmer_size(&ctpin); cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1); if(ncols != 1) die("Can only clean a single colour at a time. 
Sorry."); uint64_t (*hists)[hist_covgsize] = NULL; if(hist_covg) { hists = ctx_calloc(hist_distsize, sizeof(hists[0])); } if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL) die("Cannot open file: %s", hist_path); if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL) die("Cannot open file: %s", thresh_path); if(limit) status("Limiting to the first %zu kmers", limit); if(clean) { timestamp(); message(" Cleaning coverage below %zu", cutoff); message("\n"); } if(save) { // Check we can find the fields we need cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); if(!nkmers_json || !nlinks_json || !nbytes_json) die("Cannot find required header entries"); // Create a random temporary file link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path); status("Saving output to: %s", link_out_path); status("Temporary output: %s", link_tmp_path.b); // Open output file if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL) die("Cannot open output link file: %s", link_out_path); // Need to open output file first so we can get absolute path // Update the header to include this command json_hdr_add_curr_cmd(newhdr, link_out_path); } if(list) { status("Listing to %s", csv_out_path); if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL) die("Cannot open output CSV file %s", csv_out_path); // Print csv header fprintf(list_fh, "SeqLen,Covg\n"); } if(plot) { status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path); if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL) die("Cannot open output .dot file %s", plot_out_path); } SizeBuffer countbuf, jposbuf; size_buf_alloc(&countbuf, 16); size_buf_alloc(&jposbuf, 
1024); StrBuf kmerbuf, juncsbuf, seqbuf, outbuf; strbuf_alloc(&kmerbuf, 1024); strbuf_alloc(&juncsbuf, 1024); strbuf_alloc(&seqbuf, 1024); strbuf_alloc(&outbuf, 1024); bool link_fw; size_t njuncs; size_t knum, nlinks, num_links_exp = 0; LinkTree ltree; ltree_alloc(<ree, kmer_size); LinkTreeStats tree_stats; memset(&tree_stats, 0, sizeof(tree_stats)); size_t init_num_links = 0, num_links = 0; for(knum = 0; !limit || knum < limit; knum++) { ltree_reset(<ree); if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break; ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu", kmerbuf.end, kmer_size); // status("kmer: %s", kmerbuf.b); for(nlinks = 0; gpath_reader_read_link(&ctpin, &link_fw, &njuncs, &countbuf, &juncsbuf, &seqbuf, &jposbuf); nlinks++) { ltree_add(<ree, link_fw, countbuf.b[0], jposbuf.b, juncsbuf.b, seqbuf.b); } if(nlinks != num_links_exp) warn("Links count mismatch %zu != %zu", nlinks, num_links_exp); if(hist_covg) { ltree_update_covg_hists(<ree, (uint64_t*)hists, hist_distsize, hist_covgsize); } if(clean) { ltree_clean(<ree, cutoff); } // Accumulate statistics ltree_get_stats(<ree, &tree_stats); num_links = tree_stats.num_links - init_num_links; init_num_links = tree_stats.num_links; if(list) { ltree_write_list(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end) die("Cannot write CSV file to: %s", csv_out_path); strbuf_reset(&outbuf); } if(save && num_links) { ltree_write_ctp(<ree, kmerbuf.b, num_links, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end) die("Cannot write ctp file to: %s", link_tmp_path.b); strbuf_reset(&outbuf); } if(plot && knum == plot_kmer_idx) { status("Plotting tree..."); ltree_write_dot(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end) die("Cannot write plot DOT file to: %s", plot_out_path); strbuf_reset(&outbuf); } } gpath_reader_close(&ctpin); cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON 
*nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links); status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links); status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes); if(save) { // Update JSON nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links; nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links; nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes; char *json_str = cJSON_Print(newhdr); if(gzputs(link_gz, json_str) != (int)strlen(json_str)) die("Cannot write ctp file to: %s", link_out_path); free(json_str); gzputs(link_gz, "\n\n"); gzputs(link_gz, ctp_explanation_comment); gzputs(link_gz, "\n"); fseek(link_tmp_fh, 0, SEEK_SET); char *tmp = ctx_malloc(4*ONE_MEGABYTE); size_t s; while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) { if(gzwrite(link_gz, tmp, s) != (int)s) die("Cannot write to output: %s", link_out_path); } ctx_free(tmp); gzclose(link_gz); fclose(link_tmp_fh); } // Write histogram to file if(hist_fh) { size_t i, j; fprintf(hist_fh, " "); for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j); fprintf(hist_fh, "\n"); for(i = 1; i < hist_distsize; i++) { fprintf(hist_fh, "dist.%02zu", i); for(j = 1; j < hist_covgsize; j++) { fprintf(hist_fh, ",%"PRIu64, hists[i][j]); } fprintf(hist_fh, "\n"); } } if(thresh_fh) { // Use median of first five cutoffs print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh); } if(hist_fh && hist_fh != stdout) fclose(hist_fh); if(list) { fclose(list_fh); } if(plot) { fclose(plot_fh); } ctx_free(hists); cJSON_Delete(newhdr); strbuf_dealloc(&link_tmp_path); 
ltree_dealloc(<ree); size_buf_dealloc(&countbuf); size_buf_dealloc(&jposbuf); strbuf_dealloc(&kmerbuf); strbuf_dealloc(&juncsbuf); strbuf_dealloc(&seqbuf); strbuf_dealloc(&outbuf); return EXIT_SUCCESS; }
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; GPathReader tmp_gpfile; CorrectAlnInputBuffer *inputs = &args->inputs; args->memargs = (struct MemArgs)MEM_ARGS_INIT; args->fmt = SEQ_FMT_FASTQ; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!args->out_ctp_path,cmd); args->out_ctp_path = optarg; break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&args->gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!args->nthreads, cmd); args->nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(args->fmt == SEQ_FMT_FASTQ, cmd); args->fmt = cmd_parse_format(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_push(inputs, &task, 1); asyncio_task_parse(&inputs->b[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? 
&tmp_path : NULL); if(correct_cmd) inputs->b[inputs->len-1].out_base = tmp_path; break; case 'M': if(!strcmp(optarg,"FF")) task.matedir = READPAIR_FF; else if(!strcmp(optarg,"FR")) task.matedir = READPAIR_FR; else if(!strcmp(optarg,"RF")) task.matedir = READPAIR_RF; else if(!strcmp(optarg,"RR")) task.matedir = READPAIR_RR; else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR"); used = 0; break; case 'O': fq_offset = cmd_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'l': task.crt_params.frag_len_min = cmd_uint32(cmd, optarg); used = 0; break; case 'L': task.crt_params.frag_len_max = cmd_uint32(cmd, optarg); used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'd': task.crt_params.gap_wiggle = cmd_udouble(cmd, optarg); used = 0; break; case 'D': task.crt_params.gap_variance = cmd_udouble(cmd, optarg); used = 0; break; case 'X': task.crt_params.max_context = cmd_uint32(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': cmd_check(!args->dump_seq_sizes, cmd); args->dump_seq_sizes = optarg; break; case 'G': cmd_check(!args->dump_frag_sizes, cmd); args->dump_frag_sizes = optarg; break; case 'u': args->use_new_paths = true; break; case 'x': gen_paths_print_contigs = true; break; case 'y': gen_paths_print_paths = true; break; case 'z': gen_paths_print_reads = true; break; case 'Z': cmd_check(!args->fq_zero, cmd); if(strlen(optarg) != 1) cmd_print_usage("--fq-zero <c> requires a single char"); args->fq_zero = optarg[0]; break; case 'P': cmd_check(!args->append_orig_seq,cmd); args->append_orig_seq = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // 
cmd_print_usage(NULL); die("`"CMD" thread/correct -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(args->nthreads == 0) args->nthreads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); // ctx_thread requires output file if(!correct_cmd && !args->out_ctp_path) cmd_print_usage("--out <out.ctp> is required"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path); if(!correct_cmd && file_filter_into_ncols(&gfile->fltr) > 1) die("Please specify a single colour e.g. %s:0", file_filter_path(&gfile->fltr)); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->gpfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.b[i].fltr, 0); if(!correct_cmd && file_filter_into_ncols(&args->gpfiles.b[i].fltr) > 1) { die("Please specify a single colour e.g. 
%s:0", file_filter_path(&args->gpfiles.b[i].fltr)); } path_max_usedcols = MAX2(path_max_usedcols, file_filter_into_ncols(&args->gpfiles.b[i].fltr)); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_gpaths_compatible(gfile, 1, args->gpfiles.b, args->gpfiles.len, -1); // if no paths loaded, set all max_context values to 1, since >1 kmer only // useful if can pickup paths if(args->gpfiles.len == 0) { for(i = 0; i < inputs->len; i++) inputs->b[i].crt_params.max_context = 1; } // Check frag_len_min < frag_len_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->b[i]; t->files.ptr = t; if(t->crt_params.frag_len_min > t->crt_params.frag_len_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.frag_len_min, t->crt_params.frag_len_max); } correct_aln_input_print(&inputs->b[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.frag_len_max); } futil_create_output(args->dump_seq_sizes); futil_create_output(args->dump_frag_sizes); }
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; int tmp_thresh; // 0 => no calling, -1 => auto CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; size_t dump_seq_n = 0, dump_mp_n = 0; // how many times are -g -G specified PathFileReader tmp_pfile; CorrectAlnInputBuffer *inputs = &args->inputs; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': if(args->out_ctp_path != NULL) cmd_print_usage(NULL); args->out_ctp_path = optarg; break; case 'p': tmp_pfile = INIT_PATH_READER; path_file_open(&tmp_pfile, optarg, true); pfile_buf_add(&args->pfiles, tmp_pfile); break; case 't': if(args->num_of_threads != 0) die("%s set twice", cmd); args->num_of_threads = cmd_parse_arg_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_parse_arg_uint32(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_add(inputs, task); asyncio_task_parse(&inputs->data[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? 
&tmp_path : NULL); if(correct_cmd) inputs->data[inputs->len-1].out_base = tmp_path; break; case 'f': task.matedir = READPAIR_FR; used = 0; break; case 'F': task.matedir = READPAIR_FF; used = 0; break; case 'r': task.matedir = READPAIR_RF; used = 0; break; case 'R': task.matedir = READPAIR_RR; used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'q': fq_offset = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': task.crt_params.ins_gap_min = cmd_parse_arg_uint32(cmd, optarg); used = 0; break; case 'G': task.crt_params.ins_gap_max = cmd_parse_arg_uint32(cmd, optarg); used = 0; break; case 'S': args->dump_seq_sizes = optarg; dump_seq_n++; break; case 'M': args->dump_mp_sizes = optarg; dump_mp_n++; break; case 'u': args->use_new_paths = true; break; case 'C': if(optarg == NULL || strcmp(optarg,"auto")) args->clean_threshold = -1; else if(parse_entire_int(optarg,&tmp_thresh) && tmp_thresh >= -1) { if(tmp_thresh != -1 && tmp_thresh < 2) warn("Ignoring --clean %u (too small < 2)", tmp_thresh); else if(tmp_thresh > 255) warn("Ignoring --clean %u (too big > 255)", tmp_thresh); else args->clean_threshold = tmp_thresh; } else die("Bad argument for %s <auto|N> where N > 1", cmd); args->clean_paths = (args->clean_threshold != 0); break; case 'X': gen_paths_print_contigs = true; break; case 'Y': gen_paths_print_paths = true; break; case 'Z': gen_paths_print_reads = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" thread -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } if(args->num_of_threads == 0) args->num_of_threads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); if(dump_seq_n > 1) die("Cannot specify --seq-gaps <out> more than once"); if(dump_mp_n > 1) die("Cannot specify --mp-gaps <out> more than once"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path, true); file_filter_update_intocol(&gfile->fltr, 0); if(!correct_cmd && graph_file_usedcols(gfile) > 1) die("Please specify a single colour e.g. %s:0", gfile->fltr.file_path.buff); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->pfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.data[i].fltr, 0); if(!correct_cmd && path_file_usedcols(&args->pfiles.data[i]) > 1) { die("Please specify a single colour e.g. %s:0", args->pfiles.data[i].fltr.file_path.buff); } path_max_usedcols = MAX2(path_max_usedcols, path_file_usedcols(&args->pfiles.data[i])); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_paths_compatible(gfile, 1, args->pfiles.data, args->pfiles.len); // Check ins_gap_min < ins_gap_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->data[i]; t->files.ptr = t; if(t->crt_params.ins_gap_min > t->crt_params.ins_gap_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.ins_gap_min, t->crt_params.ins_gap_max); } correct_aln_input_print(&inputs->data[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.ins_gap_max); } }
int ctx_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t max_allele_len = 0, max_flank_len = 0; bool remove_serial_bubbles = true; // List of haploid colours size_t *hapcols = NULL; int nhapcols = 0; char *hapcols_arg = NULL; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'H': cmd_check(!hapcols_arg, cmd); hapcols_arg = optarg; break; case 'A': cmd_check(!max_allele_len, cmd); max_allele_len = cmd_uint32_nonzero(cmd, optarg); break; case 'F': cmd_check(!max_flank_len, cmd); max_flank_len = cmd_uint32_nonzero(cmd, optarg); break; case 'S': cmd_check(remove_serial_bubbles,cmd); remove_serial_bubbles = false; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(max_allele_len == 0) max_allele_len = DEFAULT_MAX_ALLELE; if(max_flank_len == 0) max_flank_len = DEFAULT_MAX_FLANK; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1); // // Check haploid colours are valid // if(hapcols_arg != NULL) { if((nhapcols = range_get_num(hapcols_arg, ncols)) < 0) die("Invalid haploid colour list: %s", hapcols_arg); hapcols = ctx_calloc(nhapcols, sizeof(hapcols[0])); if(range_parse_array(hapcols_arg, hapcols, ncols) < 0) die("Invalid haploid colour list: %s", hapcols_arg); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, thread_mem; char thread_mem_str[100]; // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) + // visitedfw/rv(2bits/thread) bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + (gpfiles.len > 0 ? 
sizeof(GPath*)*8 : 0) + ncols + 2*nthreads; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Thread memory thread_mem = roundup_bits2bytes(kmers_in_hash) * 2; bytes_to_str(thread_mem * nthreads, 1, thread_mem_str); status("[memory] (of which threads: %zu x %zu = %s)\n", nthreads, thread_mem, thread_mem_str); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); size_t total_mem = graph_mem + thread_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(out_path, "w"); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // Create array of cJSON** from input files cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*)); for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json; // Now call variants BubbleCallingPrefs call_prefs = {.max_allele_len = max_allele_len, .max_flank_len = max_flank_len, .haploid_cols = hapcols, .nhaploid_cols = 
nhapcols, .remove_serial_bubbles = remove_serial_bubbles}; invoke_bubble_caller(nthreads, &call_prefs, gzout, out_path, hdrs, gpfiles.len, &db_graph); status(" saved to: %s\n", out_path); gzclose(gzout); ctx_free(hdrs); // Close input link files for(i = 0; i < gpfiles.len; i++) gpath_reader_close(&gpfiles.b[i]); gpfile_buf_dealloc(&gpfiles); ctx_free(hapcols); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_calls2vcf(int argc, char **argv) { const char *in_path = NULL, *out_path = NULL, *out_type = NULL; // Filtering parameters int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1; // Alignment parameters int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1; // ref paths char const*const* ref_paths = NULL; size_t nref_paths = 0; // flank file const char *sam_path = NULL; // // Things we figure out by looking at the input // bool isbubble = false; // samples in VCF, (0 for bubble, does not include ref in breakpoint calls) size_t i, kmer_size, num_samples; // // Reference genome // // Hash map of chromosome name -> sequence ChromHash *genome; ReadBuffer chroms; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len < 0,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); in_path = argv[optind++]; ref_paths = (char const*const*)argv + optind; nref_paths = argc - optind; // These functions call die() on error gzFile gzin = futil_gzopen(in_path, "r"); // Read call file header cJSON *json = json_hdr_load(gzin, in_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, in_path); db_graph_check_kmer_size(kmer_size, in_path); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(in_path), isbubble ? 
"bubble" : "breakpoint"); if(isbubble) { // bubble specific if(sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ; } else { // breakpoint specific if(min_mapq >= 0) cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls"); } // Open flank file if it exists htsFile *samfh = NULL; bam_hdr_t *bam_hdr = NULL; bam1_t *mflank = NULL; if(sam_path) { if((samfh = hts_open(sam_path, "r")) == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_hdr = sam_hdr_read(samfh); if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path); mflank = bam_init1(); } // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, in_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path); num_samples = 0; if(!isbubble) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } // // Open output file // if(!out_path) out_path = "-"; int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *vcffh = hts_open(out_path, modes_htslib[mode]); status("[calls2vcf] Reading %s call file with %zu samples", isbubble ? "Bubble" : "Breakpoint", num_graph_samples); status("[calls2vcf] %zu sample output to: %s format: %s", num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]); if(isbubble) status("[calls2vcf] min. 
MAPQ: %i", min_mapq); status("[calls2vcf] max alignment length: %i", max_align_len); status("[calls2vcf] max VCF allele length: %i", max_allele_len); status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i", nwmatch, nwmismatch, nwgapopen, nwgapextend); // Load reference genome read_buf_alloc(&chroms, 1024); genome = chrom_hash_init(); chrom_hash_load(ref_paths, nref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!isbubble) brkpnt_check_refs_match(json, genome, in_path); bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size, ref_paths, nref_paths, chroms.b, chroms.len); if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header"); AlignedCall *call = acall_init(); CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr); scoring_t *scoring = call_decomp_get_scoring(aligner); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, false, false, 0, 0, 0, 0); CallFileEntry centry; call_file_entry_alloc(¢ry); char kmer_str[50]; sprintf(kmer_str, ";K%zu", kmer_size); if(isbubble) { // Bubble calls DecompBubble *bubbles = decomp_bubble_init(); // Set scoring for aligning 3' flank scoring = decomp_bubble_get_scoring(bubbles); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, true, true, 0, 0, 0, 0); while(call_file_read(gzin, in_path, ¢ry)) { do { if(sam_read1(samfh, bam_hdr, mflank) < 0) die("We've run out of SAM entries!"); } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); // Align call strbuf_reset(&call->info); decomp_bubble_call(bubbles, genome, kmer_size, min_mapq, ¢ry, mflank, bam_hdr, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats)); decomp_bubble_cpy_stats(bub_stats, bubbles); print_bubble_stats(bub_stats); ctx_free(bub_stats); 
decomp_bubble_destroy(bubbles); } else { // Breakpoint calls DecompBreakpoint *breakpoints = decomp_brkpt_init(); while(call_file_read(gzin, in_path, ¢ry)) { strbuf_reset(&call->info); decomp_brkpt_call(breakpoints, genome, num_samples, ¢ry, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats)); decomp_brkpt_cpy_stats(brk_stats, breakpoints); print_breakpoint_stats(brk_stats); ctx_free(brk_stats); decomp_brkpt_destroy(breakpoints); } // Print stats DecomposeStats *astats = ctx_calloc(1, sizeof(*astats)); call_decomp_cpy_stats(astats, aligner); print_acall_stats(astats); ctx_free(astats); call_file_entry_dealloc(¢ry); call_decomp_destroy(aligner); acall_destroy(call); // Finished - clean up cJSON_Delete(json); gzclose(gzin); bcf_hdr_destroy(vcfhdr); hts_close(vcffh); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); chrom_hash_destroy(genome); if(sam_path) { hts_close(samfh); bam_hdr_destroy(bam_hdr); bam_destroy1(mflank); } return EXIT_SUCCESS; }
// Parse the command line of the graph-building command.
//
// Options are processed strictly in order: a sample name ('s') opens a new
// colour, read-loading preferences ('M', 'O', 'Q', 'H', 'p', 'P') update the
// pending `task`, and each sequence input ('1', '2', 'i') snapshots the
// current preferences into a task via add_task().  Existing graphs are
// queued with 'g' (load) and 'I' (intersect).
//
// Writes to names declared outside this function: nthreads, memargs,
// kmer_size, snamebuf, gfilebuf, gisecbuf.
//
// NOTE(review): only the option loop is visible here -- the text ends after
// the closing braces of the switch and the while loop, so whatever follows
// (e.g. checks on `pref_unused`) is not documented.
static void parse_args(int argc, char **argv)
{
  BuildGraphTask task;
  memset(&task, 0, sizeof(task));
  task.prefs = SEQ_LOADING_PREFS_INIT;
  task.stats = SEQ_LOADING_STATS_INIT;
  uint8_t fq_offset = 0;
  // Colour that the next input is loaded into; -1 until the first 's' or 'g'
  int intocolour = -1;
  GraphFileReader tmp_gfile;

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;
  // sample_named: a sample name has been given since the last 'g' option
  // pref_unused:  a preference option was given after the last input option
  bool sample_named = false, pref_unused = false;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_kmer_size(cmd, optarg); break;
      case 's':
        // New sample => next colour
        intocolour++;
        check_sample_name(optarg);
        sample_name_buf_add(&snamebuf, (SampleName){.colour = intocolour,
                                                    .name = optarg});
        sample_named = true;
        break;
      case '1':
      case '2':
      case 'i':
        // Sequence input: consumes the preferences set so far
        pref_unused = false;
        if(!sample_named)
          cmd_print_usage("Please give sample name first [-s,--sample <name>]");
        asyncio_task_parse(&task.files, c, optarg, fq_offset, NULL);
        task.prefs.colour = intocolour;
        add_task(&task);
        break;
      case 'M':
        // Mate pair orientation
        if(!strcmp(optarg,"FF")) task.prefs.matedir = READPAIR_FF;
        else if(!strcmp(optarg,"FR")) task.prefs.matedir = READPAIR_FR;
        else if(!strcmp(optarg,"RF")) task.prefs.matedir = READPAIR_RF;
        else if(!strcmp(optarg,"RR")) task.prefs.matedir = READPAIR_RR;
        else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR");
        pref_unused = true; break;
      case 'O': fq_offset = cmd_uint8(cmd, optarg); pref_unused = true; break;
      case 'Q': task.prefs.fq_cutoff = cmd_uint8(cmd, optarg); pref_unused = true; break;
      case 'H': task.prefs.hp_cutoff = cmd_uint8(cmd, optarg); pref_unused = true; break;
      case 'p': task.prefs.remove_pcr_dups = true; pref_unused = true; break;
      case 'P': task.prefs.remove_pcr_dups = false; pref_unused = true; break;
      case 'g':
        // Existing graph: loaded starting at the current colour (0 if none
        // yet).  intocolour then becomes the larger of its current value and
        // file_filter_into_ncols()-1 for this file.
        if(intocolour == -1) intocolour = 0;
        graph_file_reset(&tmp_gfile);
        graph_file_open2(&tmp_gfile, optarg, "r", true, intocolour);
        intocolour = MAX2((size_t)intocolour, file_filter_into_ncols(&tmp_gfile.fltr)-1);
        gfile_buf_push(&gfilebuf, &tmp_gfile, 1);
        // A new sample name is required before further sequence input
        sample_named = false;
        break;
      case 'I':
        // Intersection graph: always flattened into colour 0
        graph_file_reset(&tmp_gfile);
        graph_file_open(&tmp_gfile, optarg);
        if(file_filter_into_ncols(&tmp_gfile.fltr) > 1)
          warn("Flattening intersection graph into colour 0: %s", optarg);
        file_filter_flatten(&tmp_gfile.fltr, 0);
        gfile_buf_push(&gisecbuf, &tmp_gfile, 1);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" build -h` for help. Bad option: %s", argv[optind-1]);
      default: die("Bad option: %s", cmd);
    }
  }
int ctx_view(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // TODO: // print_action actions[argc]; // bool read_kmers = false; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: cmd_print_usage("Programmer fail. Tell Isaac."); } } if(print_kmers) parse_kmers = 1; bool no_flags = (!print_info && !parse_kmers && !print_kmers); if(no_flags) { print_info = parse_kmers = 1; } if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)"); char *path = argv[optind]; size_t num_errors = 0, num_warnings = 0; GraphFileReader gfile; memset(&gfile, 0, sizeof(gfile)); int ret = graph_file_open(&gfile, path); if(ret == 0) die("Cannot open file: %s", path); if(print_info) { char fsize_str[50]; bytes_to_str((size_t)gfile.file_size, 0, fsize_str); printf("Loading file: %s\n", file_filter_path(&gfile.fltr)); printf("File size: %s\n", fsize_str); printf("----\n"); } size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_size = gfile.hdr.kmer_size; ctx_assert(ncols > 0); GraphFileHeader hdr; memset(&hdr, 0, sizeof(hdr)); graph_file_merge_header(&hdr, &gfile); uint64_t nkmers_read = 0, nkmers_loaded = 0; uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0; uint64_t *col_nkmers, *col_sum_covgs; col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0])); col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0])); // Print header if(print_info) print_header(&hdr, gfile.num_of_kmers); BinaryKmer bkmer; Covg covgs[ncols], keep_kmer; Edges 
edges[ncols]; bool direct_read = file_filter_is_direct(&gfile.fltr); if(parse_kmers || print_kmers) { if(print_info && print_kmers) printf("----\n"); for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++) { // If kmer has no covg in any samples -> don't load keep_kmer = 0; for(col = 0; col < ncols; col++) { col_nkmers[col] += (covgs[col] > 0); col_sum_covgs[col] += covgs[col]; keep_kmer |= covgs[col]; } if(!direct_read && !keep_kmer) continue; nkmers_loaded++; /* Kmer Checks */ // graph_file_read_reset() already checks for: // 1. oversized kmers // 2. kmers with covg 0 in all colours // 3. edges without coverage in a colour // Check for all-zeros (i.e. all As kmer: AAAAAA) uint64_t kmer_words_or = 0; for(i = 0; i < hdr.num_of_bitfields; i++) kmer_words_or |= bkmer.b[i]; if(kmer_words_or == 0) { if(num_all_zero_kmers == 1) { loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n", nkmers_read); } num_all_zero_kmers++; } // Check covg is 0 for all colours for(i = 0; i < ncols && covgs[i] == 0; i++); num_zero_covg_kmers += (i == ncols); // Print if(print_kmers) db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout); } } // check for various reading errors // if(errno != 0) // loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno)); int err = ferror(gfile.fh); if(err != 0) loading_error("occurred after file reading [%i]\n", err); char nstr[50]; if(print_kmers || parse_kmers) { // file_size is set to -1 if we are reading from a stream, // therefore won't be able to check number of kmers read if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) { loading_warning("Expected %zu kmers, read %zu\n", (size_t)gfile.num_of_kmers, (size_t)nkmers_read); } if(num_all_zero_kmers > 1) { loading_error("%s all-zero-kmers seen\n", ulong_to_str(num_all_zero_kmers, nstr)); } if(num_zero_covg_kmers > 0) { loading_warning("%s kmers have no coverage in any colour\n", ulong_to_str(num_zero_covg_kmers, nstr)); } } 
// Count warnings printed by graph_file_reader.c num_warnings += gfile.error_zero_covg; num_warnings += gfile.error_missing_covg; // Can only print these stats if we're read in the kmers if((print_kmers || parse_kmers) && print_info) { // print kmer coverage per sample printf("\n---- Per colour stats\n"); printf("num. kmers:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_nkmers[col], nstr)); printf("\n"); printf("sum coverage:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr)); printf("\n"); printf("kmer coverage:"); for(col = 0; col < ncols; col++) printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col])); printf("\n"); // Overall stats uint64_t sum_covgs = 0; double mean_kmer_covg = 0.0; for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col]; mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0; printf("\n---- Overall stats\n"); printf("Total kmers: %s\n", ulong_to_str(nkmers_loaded, nstr)); printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr)); printf("Mean coverage: %s\n", double_to_str(mean_kmer_covg, 2, nstr)); } if(print_info) { // Print memory stats uint64_t mem, capacity, num_buckets, req_capacity; uint8_t bucket_size; req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY); capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size); mem = ht_mem(bucket_size, num_buckets, sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8); char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100]; bytes_to_str(mem, 1, memstr); ulong_to_str(capacity, capacitystr); ulong_to_str(bucket_size, bucket_size_str); ulong_to_str(num_buckets, num_buckets_str); size_t mem_height = (size_t)__builtin_ctzl(num_buckets); printf("\n---- Memory\n"); printf("memory required: %s [capacity: %s]\n", memstr, capacitystr); printf(" bucket size: %s; number of buckets: %s\n", bucket_size_str, num_buckets_str); printf(" --kmer_size %zu --mem_height 
%zu --mem_width %i\n", kmer_size, mem_height, bucket_size); } if((print_kmers || parse_kmers) && print_info) { printf("\n----\n"); if(num_warnings > 0 || num_errors > 0) { printf("Warnings: %zu; Errors: %zu\n", (size_t)num_warnings, (size_t)num_errors); } if(num_errors == 0) printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n"); } ctx_free(col_nkmers); ctx_free(col_sum_covgs); // Close file (which zeros it) graph_file_close(&gfile); graph_header_dealloc(&hdr); return num_errors ? EXIT_FAILURE : EXIT_SUCCESS; }
int ctx_sort(int argc, char **argv) { const char *out_path = NULL; struct MemArgs memargs = MEM_ARGS_INIT; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" sort -h` for help. Bad option: %s", argv[optind-1]); default: die("Bad option: [%c]: %s", c, cmd); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, out_path ? "r" : "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); size_t num_kmers, memory; // Reading from a stream if(gfile.num_of_kmers < 0) { if(!memargs.num_kmers_set) die("If reading from a stream, must give -n <num_kmers>"); num_kmers = memargs.num_kmers; } else num_kmers = gfile.num_of_kmers; // Open output path (if given) FILE *fout = out_path ? 
futil_fopen_create(out_path, "w") : NULL; size_t i; size_t ncols = gfile.hdr.num_of_cols; size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*ncols; memory = (sizeof(char*) + kmer_mem) * num_kmers; char mem_str[50]; bytes_to_str(memory, 1, mem_str); if(memory > memargs.mem_to_use) die("Require at least %s memory", mem_str); status("[memory] Total: %s", mem_str); char *mem = ctx_malloc(kmer_mem * num_kmers); char **kmers = ctx_malloc(num_kmers*sizeof(char*)); // Read in whole file // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem); if(nkread != num_kmers*kmer_mem) die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem); // check we are at the end of the file char tmpc; if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) { die("More kmers in file than believed (kmers: %zu ncols: %zu).", num_kmers, ncols); } status("Read %zu kmers with %zu colour%s", num_kmers, ncols, util_plural_str(ncols)); for(i = 0; i < num_kmers; i++) kmers[i] = mem + kmer_mem*i; sort_block(kmers, num_kmers); // Print if(out_path != NULL) { // saving to a different destination - write header graph_write_header(fout, &gfile.hdr); } else { // Directly manipulating gfile.fh here, using it to write later // Not doing any more reading if(fseek(gfile.fh, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); fout = gfile.fh; } for(i = 0; i < num_kmers; i++) if(fwrite(kmers[i], 1, kmer_mem, fout) != kmer_mem) die("Cannot write to file"); if(out_path) fclose(fout); graph_file_close(&gfile); ctx_free(kmers); ctx_free(mem); return EXIT_SUCCESS; }
int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean uint32_t fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(min_keep_tip<0, cmd); min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1); break; case 'S': case 'U': cmd_check(unitig_min<0, cmd); unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1); break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" clean -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); bool unitig_cleaning = (unitig_min != 0); bool tip_cleaning = (min_keep_tip != 0); bool doing_cleaning = (unitig_cleaning || tip_cleaning); // If you ever want to estimate cleaning threshold without outputting // a graph, change this to a warning if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); // warn("No cleaning being done: you did not specify --out <out.ctx>"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { warn("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -U, --unitigs or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !unitig_cleaning) warn("-B, --fallback <T> without --unitigs"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(out_ctx_path == NULL) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu colours", 
num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(min_keep_tip < 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && unitig_cleaning) { warn("%s:%zu already has unitig cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_before_path); if(min_keep_tip > 0) status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip); if(unitig_min > 0) status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min); if(unitig_min < 0) status("%zu. Cleaning unitigs with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. Saving unitig length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8; size_t extra_edge_bits = (all_colours_loaded ? 
0 : sizeof(Edges) * 8); bits_per_kmer = sizeof(BinaryKmer)*8 + per_col_bits * use_ncols + extra_edge_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - sizeof(BinaryKmer)*8*kmers_in_hash + extra_edge_bits*kmers_in_hash) / (per_col_bits*kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); // Extra edges required to hold union of kept edges Edges *edges_union = NULL; if(use_ncols < ncols) edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); // Load graph into a single colour GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); for(i = 0; i < num_gfiles; i++) graph_file_merge_header(&outhdr, &gfiles[i]); if(ncols > use_ncols) { db_graph.num_of_cols = db_graph.num_edge_cols = 1; SWAP(edges_union, db_graph.col_edges); graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL); SWAP(edges_union, db_graph.col_edges); db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols; } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, NULL); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: 
%s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Always estimate cleaning threshold // if(unitig_min <= 0 || covg_before_path || len_before_path) // { // Get coverage distribution and estimate cleaning threshold int est_min_covg = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_min_covg < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_min_covg); // Use estimated threshold if threshold not set if(unitig_min < 0) { if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); unitig_min = fallback_thresh; } else if(est_min_covg >= 0) unitig_min = est_min_covg; } // } // Die if we failed to find suitable cleaning threshold if(unitig_min < 0) die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)"); // Cleaning parameters should now be set (>0) or turned off (==0) ctx_assert(unitig_min >= 0); ctx_assert(min_keep_tip >= 0); if(unitig_min || min_keep_tip) { // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0) clean_graph(nthreads, unitig_min, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(out_ctx_path != NULL) { // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= unitig_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(unitig_cleaning) { size_t thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? 
MAX2(thresh, (uint32_t)unitig_min) : (uint32_t)unitig_min; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); // kmers_loaded=true graph_writer_merge(out_ctx_path, gfiles, num_gfiles, true, all_colours_loaded, edges_union, &outhdr, &db_graph); } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); // TODO: report kmer coverage for each sample graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); ctx_free(edges_union); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_rmsubstr(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; size_t kmer_size = 0, nthreads = 0; const char *output_file = NULL; seq_format fmt = SEQ_FMT_FASTA; bool invert = false; // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!output_file, cmd); output_file = optarg; break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(fmt==SEQ_FMT_FASTA, cmd); fmt = cmd_parse_format(cmd, optarg); break; case 'v': cmd_check(!invert,cmd); invert = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" rmsubstr -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(!nthreads) nthreads = DEFAULT_NTHREADS; if(!kmer_size) kmer_size = DEFAULT_KMER; if(!(kmer_size&1)) cmd_print_usage("Kmer size must be odd"); if(kmer_size < MIN_KMER_SIZE) cmd_print_usage("Kmer size too small (recompile)"); if(kmer_size > MAX_KMER_SIZE) cmd_print_usage("Kmer size too large (recompile?)"); if(optind >= argc) cmd_print_usage("Please specify at least one input sequence file (.fq, .fq etc.)"); size_t i, num_seq_files = argc - optind; char **seq_paths = argv + optind; seq_file_t **seq_files = ctx_calloc(num_seq_files, sizeof(seq_file_t*)); for(i = 0; i < num_seq_files; i++) if((seq_files[i] = seq_open(seq_paths[i])) == NULL) die("Cannot read sequence file %s", seq_paths[i]); // Estimate number of bases // set to -1 if we cannot calc int64_t est_num_bases = seq_est_seq_bases(seq_files, num_seq_files); if(est_num_bases < 0) { warn("Cannot get file sizes, using pipes"); est_num_bases = memargs.num_kmers * IDEAL_OCCUPANCY; } status("[memory] Estimated number of bases: %li", (long)est_num_bases); // Use file sizes to decide on memory // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(KONodeList) + sizeof(KOccur) + // see kmer_occur.h 8; // 1 byte per kmer for each base to load sequence files kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, est_num_bases, est_num_bases, false, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // if(output_file == NULL) output_file = "-"; FILE *fout = futil_fopen_create(output_file, "w"); // // Set up memory // dBGraph db_graph; db_graph_alloc(&db_graph, kmer_size, 1, 0, kmers_in_hash, DBG_ALLOC_BKTLOCKS); // // Load reference sequence into a read buffer // ReadBuffer rbuf; read_buf_alloc(&rbuf, 1024); seq_load_all_reads(seq_files, num_seq_files, &rbuf); // Check 
for reads too short for(i = 0; i < rbuf.len && rbuf.b[i].seq.end >= kmer_size; i++) {} if(i < rbuf.len) warn("Reads shorter than kmer size (%zu) will not be filtered", kmer_size); KOGraph kograph = kograph_create(rbuf.b, rbuf.len, true, 0, nthreads, &db_graph); size_t num_reads = rbuf.len, num_reads_printed = 0, num_bad_reads = 0; // Loop over reads printing those that are not substrings int ret; for(i = 0; i < rbuf.len; i++) { ret = _is_substr(&rbuf, i, &kograph, &db_graph); if(ret == -1) num_bad_reads++; else if((ret && invert) || (!ret && !invert)) { seqout_print_read(&rbuf.b[i], fmt, fout); num_reads_printed++; } } char num_reads_str[100], num_reads_printed_str[100], num_bad_reads_str[100]; ulong_to_str(num_reads, num_reads_str); ulong_to_str(num_reads_printed, num_reads_printed_str); ulong_to_str(num_bad_reads, num_bad_reads_str); status("Printed %s / %s (%.1f%%) to %s", num_reads_printed_str, num_reads_str, !num_reads ? 0.0 : (100.0 * num_reads_printed) / num_reads, futil_outpath_str(output_file)); if(num_bad_reads > 0) { status("Bad reads: %s / %s (%.1f%%) - no kmer {ACGT} of length %zu", num_bad_reads_str, num_reads_str, (100.0 * num_bad_reads) / num_reads, kmer_size); } fclose(fout); kograph_dealloc(&kograph); // Free sequence memory for(i = 0; i < rbuf.len; i++) seq_read_dealloc(&rbuf.b[i]); read_buf_dealloc(&rbuf); ctx_free(seq_files); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int main(int argc, char **argv) { time_t start, end; time(&start); ctx_msg_out = stderr; cortex_init(); cmd_init(argc, argv); if(argc == 1) print_help(stderr, NULL); const CtxCmd *cmd = ctx_get_command(argv[1]); if(cmd == NULL) print_help(stderr, "Unrecognised command: %s", argv[1]); // Once we have set cmd_usage, we can call cmd_print_usage() from anywhere cmd_set_usage(cmd->usage); // If no arguments after command, print help if(argc == 2) cmd_print_usage(NULL); // Look for -q, --quiet argument, if given silence output int argi = 1; while(argi < argc && !(!strcmp(argv[argi],"-q") || !strcmp(argv[argi],"--quiet"))) argi++; if(argi < argc) { // Found -q, --quiet argument ctx_msg_out = NULL; // Remove argument for(--argc; argi < argc; argi++) argv[argi] = argv[argi+1]; } // Print status header cmd_print_status_header(); SWAP(argv[1],argv[0]); int ret = cmd->func(argc-1, argv+1); time(&end); cmd_destroy(); // Warn if more allocations than deallocations size_t still_alloced = alloc_get_num_allocs() - alloc_get_num_frees(); if(still_alloced) warn("%zu allocates not free'd.", still_alloced); char nallocs_str[50]; ulong_to_str(alloc_get_num_allocs(), nallocs_str); status("[memory] We made %s allocs", nallocs_str); status(ret == 0 ? "Done." : "Fail."); // Print time taken double diff = difftime(end,start); if(diff < 60) status("[time] %.2lf seconds\n", diff); else { char timestr[100]; seconds_to_str((size_t)diff, timestr); status("[time] %.2lf seconds (%s)\n", diff, timestr); } cortex_destroy(); return ret; }
static a_uint32_t * cmd_parse(char *cmd_str, int *cmd_index, int *cmd_index_sub) { int cmd_nr = 0; a_uint32_t *arg_val = ioctl_argp; char *tmp_str[CMDSTR_ARGS_MAX]; if (cmd_str == NULL) return NULL; memset(arg_val, 0, CMDSTR_ARGS_MAX * sizeof (a_uint32_t)); /* split string into array */ if ((tmp_str[cmd_nr] = (void *) strtok(cmd_str, " ")) == NULL) return NULL; /*handle help */ if (!strcasecmp(tmp_str[cmd_nr], "help")) { dprintf("input ? get help\n\n"); return NULL; } while (tmp_str[cmd_nr]) { if (++cmd_nr == 3) break; tmp_str[cmd_nr] = (void *) strtok(NULL, " "); } /*commond string lookup */ int cmd_depth = cmd_lookup(tmp_str, cmd_index, cmd_index_sub); if (*cmd_index == GCMD_DESC_NO_MATCH || *cmd_index_sub == GCMD_DESC_NO_MATCH) { dprintf("invalid or incomplete command.\n\n"); return NULL; } /*parse param */ cmd_nr = 0; if (cmd_depth == 2) { tmp_str[cmd_nr] = tmp_str[2]; cmd_nr++; } tmp_str[cmd_nr] = (void *) strtok(NULL, " "); while (tmp_str[cmd_nr]) { if (++cmd_nr == CMDSTR_ARGS_MAX) break; tmp_str[cmd_nr] = (void *) strtok(NULL, " "); } arg_val[0] = GCMD_SUB_API(*cmd_index, *cmd_index_sub); arg_val[1] = (a_uint32_t) ioctl_buf; int rtn_code; if (arg_val[0] < SW_API_MAX) { /*api command parse */ rtn_code = cmd_parse_api(tmp_str, arg_val); } else if (arg_val[0] > SW_API_MAX) { /*user command parse */ rtn_code = cmd_parse_sw(tmp_str, arg_val); } else { rtn_code = SW_BAD_PARAM; } if(rtn_code != SW_OK) { cmd_print_error(rtn_code); if(rtn_code == SW_BAD_PARAM) cmd_print_usage(*cmd_index, *cmd_index_sub); return NULL; } return arg_val; }
int ctx_join(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t use_ncols = 0; GraphFileReader tmp_gfile; GraphFileBuffer isec_gfiles_buf; gfile_buf_alloc(&isec_gfiles_buf, 8); // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 'i': graph_file_reset(&tmp_gfile); graph_file_open(&tmp_gfile, optarg); if(file_filter_into_ncols(&tmp_gfile.fltr) > 1) warn("Flattening intersection graph into colour 0: %s", optarg); file_filter_flatten(&tmp_gfile.fltr, 0); gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } GraphFileReader *igfiles = isec_gfiles_buf.b; size_t num_igfiles = isec_gfiles_buf.len; if(!out_path) cmd_print_usage("--out <out.ctx> required"); if(optind >= argc) cmd_print_usage("Please specify at least one input graph file"); // optind .. 
argend-1 are graphs to load size_t num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles); // Check all binaries are valid binaries with matching kmer size size_t i; size_t ctx_max_cols = 0; uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0; for(i = 0; i < num_gfiles; i++) { graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols); if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size); } ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr)); ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i])); ctx_sum_kmers += graph_file_nkmers(&gfiles[i]); } // Probe intersection graph files for(i = 0; i < num_igfiles; i++) { if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size); } uint64_t nkmers = graph_file_nkmers(&igfiles[i]); if(i == 0) min_intersect_num_kmers = nkmers; else if(nkmers < min_intersect_num_kmers) { // Put smallest intersection binary first SWAP(igfiles[i], igfiles[0]); min_intersect_num_kmers = nkmers; } } bool take_intersect = (num_igfiles > 0); // If we are taking an intersection, // all kmers intersection kmers will need to be loaded if(take_intersect) ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers; bool use_ncols_set = (use_ncols > 0); bool output_to_stdout = (strcmp(out_path,"-") == 0); // if(use_ncols == 0) use_ncols = 1; if(use_ncols_set) { if(use_ncols < ctx_max_cols && output_to_stdout) die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols); if(use_ncols > ctx_max_cols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ctx_max_cols, 
util_plural_str(ctx_max_cols), use_ncols); use_ncols = ctx_max_cols; } } else { use_ncols = output_to_stdout ? ctx_max_cols : 1; } // Check out_path is writable futil_create_output(out_path); status("Output %zu cols; from %zu files; intersecting %zu graphs; ", ctx_max_cols, num_gfiles, num_igfiles); if(num_gfiles == 1 && num_igfiles == 0) { // Loading only one file with no intersection files // Don't need to store a graph in memory, can filter as stream // Don't actually store anything in the de Bruijn graph, but we need to // pass it, so mock one up dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0); graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL); graph_file_close(&gfiles[0]); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); if(!use_ncols_set) { // Maximise use_ncols size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer; use_ncols = MIN2(max_usencols, ctx_max_cols); bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; // Re-check memory used kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); } status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols)); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Create db_graph dBGraph db_graph; Edges *intersect_edges = NULL; size_t edge_cols = (use_ncols + take_intersect); db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, 
use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // We allocate edges ourself since it's a special case db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges)); // Load intersection binaries char *intsct_gname_ptr = NULL; StrBuf intersect_gname; strbuf_alloc(&intersect_gname, 1024); if(take_intersect) { GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.boolean_covgs = true; // covg++ only for(i = 0; i < num_igfiles; i++) { graph_load(&igfiles[i], gprefs, NULL); // Update intersect header // note: intersection graphs all load exactly one colour into colour 0 graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname); gprefs.must_exist_in_graph = true; gprefs.must_exist_in_edges = db_graph.col_edges; } if(num_igfiles > 1) { // Remove nodes where covg != num_igfiles HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes, db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht); } status("Loaded intersection set\n"); intsct_gname_ptr = intersect_gname.b; for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]); // Reset graph info for(i = 0; i < db_graph.num_of_cols; i++) graph_info_init(&db_graph.ginfo[i]); // Zero covgs memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg)); // Use union edges we loaded to intersect new edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } bool kmers_loaded = take_intersect, colours_loaded = false; graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles, kmers_loaded, colours_loaded, intersect_edges, intsct_gname_ptr, &db_graph); if(take_intersect) db_graph.col_edges -= db_graph.ht.capacity; for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); strbuf_dealloc(&intersect_gname); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_clean(int argc, char **argv) { size_t nthreads = 0, use_ncols = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_ctx_path = NULL; bool tip_cleaning = false, supernode_cleaning = false; size_t min_keep_tip = 0; Covg threshold = 0, fallback_thresh = 0; const char *len_before_path = NULL, *len_after_path = NULL; const char *covg_before_path = NULL, *covg_after_path = NULL; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': if(out_ctx_path != NULL) cmd_print_usage(NULL); out_ctx_path = optarg; break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'T': cmd_check(!tip_cleaning, cmd); min_keep_tip = cmd_uint32_nonzero(cmd, optarg); tip_cleaning = true; break; case 'S': cmd_check(!supernode_cleaning, cmd); if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg); supernode_cleaning = true; break; case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break; case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break; case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break; case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break; case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); 
die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Please give input graph files"); // Default behaviour if(!tip_cleaning && !supernode_cleaning) { if(out_ctx_path != NULL) supernode_cleaning = tip_cleaning = true; // do both else warn("No cleaning being done: you did not specify --out <out.ctx>"); } bool doing_cleaning = (supernode_cleaning || tip_cleaning); if(doing_cleaning && out_ctx_path == NULL) { cmd_print_usage("Please specify --out <out.ctx> for cleaned graph"); } if(!doing_cleaning && (covg_after_path || len_after_path)) { cmd_print_usage("You gave --len-after <out> / --covg-after <out> without " "any cleaning (set -s, --supernodes or -t, --tips)"); } if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 && !futil_get_force() && futil_file_exists(out_ctx_path)) { cmd_print_usage("Output file already exists: %s", out_ctx_path); } if(fallback_thresh && !supernode_cleaning) cmd_print_usage("-B, --fallback <T> without --supernodes"); // Use remaining args as graph files char **gfile_paths = argv + optind; size_t i, j, num_gfiles = (size_t)(argc - optind); // Open graph files GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(gfile_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); size_t kmer_size = gfiles[0].hdr.kmer_size; // default to one colour for now if(use_ncols == 0) use_ncols = 1; // Flatten if we don't have to remember colours / output a graph if(!doing_cleaning) { ncols = use_ncols = 1; for(i = 0; i < num_gfiles; i++) file_filter_flatten(&gfiles[i].fltr, 0); } if(ncols < use_ncols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ncols, util_plural_str(ncols), use_ncols); use_ncols = ncols; } char max_kmers_str[100]; ulong_to_str(ctx_max_kmers, max_kmers_str); status("%zu input graph%s, max kmers: %s, using %zu 
colours", num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols); // If no arguments given we default to removing tips < 2*kmer_size if(tip_cleaning && min_keep_tip == 0) min_keep_tip = 2 * kmer_size; // Warn if any graph files already cleaned size_t fromcol, intocol; ErrorCleaning *cleaning; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning; if(cleaning->cleaned_snodes && supernode_cleaning) { warn("%s:%zu already has supernode cleaning with threshold: <%zu", file_filter_path(&gfiles[i].fltr), fromcol, (size_t)cleaning->clean_snodes_thresh); } if(cleaning->cleaned_tips && tip_cleaning) { warn("%s:%zu already has had tip cleaned", file_filter_path(&gfiles[i].fltr), fromcol); } } } // Print steps size_t step = 0; status("Actions:\n"); if(covg_before_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path); if(len_before_path != NULL) status("%zu. Saving supernode length distribution to: %s", step++, len_before_path); if(tip_cleaning) status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip); if(supernode_cleaning && threshold > 0) status("%zu. Cleaning supernodes with coverage < %u", step++, threshold); if(supernode_cleaning && threshold <= 0) status("%zu. Cleaning supernodes with auto-detected threshold", step++); if(covg_after_path != NULL) status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path); if(len_after_path != NULL) status("%zu. 
Saving supernode length distribution to: %s", step++, len_after_path); // // Decide memory usage // bool all_colours_loaded = (ncols <= use_ncols); bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers; size_t kmers_in_hash, bits_per_kmer, graph_mem; size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8; size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8); bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, use_mem_limit, &graph_mem); // Maximise the number of colours we load to fill the mem size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) / (per_kmer_per_col_bits * kmers_in_hash); use_ncols = MIN2(max_usencols, ncols); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Check output files are writable // futil_create_output(out_ctx_path); // Does nothing if arg is NULL futil_create_output(covg_before_path); futil_create_output(covg_after_path); futil_create_output(len_before_path); futil_create_output(len_after_path); // Create db_graph // Load as many colours as possible // Use an extra set of edge to take intersections dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // Edges is a special case size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded); db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges)); // Load graph into a single colour LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // Construct cleaned graph header GraphFileHeader outhdr; memset(&outhdr, 0, sizeof(GraphFileHeader)); 
outhdr.version = CTX_GRAPH_FILEFORMAT; outhdr.kmer_size = db_graph.kmer_size; outhdr.num_of_cols = ncols; outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64; graph_header_alloc(&outhdr, ncols); // Merge info into header size_t gcol = 0; for(i = 0; i < num_gfiles; i++) { for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) { fromcol = file_filter_fromcol(&gfiles[i].fltr, j); intocol = file_filter_intocol(&gfiles[i].fltr, j); graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]); } } if(ncols > use_ncols) { graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats); } else { for(i = 0; i < num_gfiles; i++) graph_load(&gfiles[i], gprefs, &stats); } char num_kmers_str[100]; ulong_to_str(db_graph.ht.num_kmers, num_kmers_str); status("Total kmers loaded: %s\n", num_kmers_str); size_t initial_nkmers = db_graph.ht.num_kmers; hash_table_print_stats(&db_graph.ht); uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path) { // Get coverage distribution and estimate cleaning threshold int est_threshold = cleaning_get_threshold(nthreads, covg_before_path, len_before_path, visited, &db_graph); if(est_threshold < 0) status("Cannot find recommended cleaning threshold"); else status("Recommended cleaning threshold is: %i", est_threshold); // Use estimated threshold if threshold not set if(threshold <= 0) { if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) { status("Using fallback threshold: %i", fallback_thresh); threshold = fallback_thresh; } else if(est_threshold >= 0) threshold = est_threshold; } } // Die if we failed to find suitable cleaning threshold if(supernode_cleaning && threshold <= 0) die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)"); if(doing_cleaning) { // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0) 
clean_graph(nthreads, threshold, min_keep_tip, covg_after_path, len_after_path, visited, keep, &db_graph); } ctx_free(visited); ctx_free(keep); if(doing_cleaning) { // Output graph file Edges *intersect_edges = NULL; bool kmers_loaded = true; size_t col, thresh; // Set output header ginfo cleaned for(col = 0; col < ncols; col++) { cleaning = &outhdr.ginfo[col].cleaning; cleaning->cleaned_snodes |= supernode_cleaning; cleaning->cleaned_tips |= tip_cleaning; // if(tip_cleaning) { // strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean"); // } if(supernode_cleaning) { thresh = cleaning->clean_snodes_thresh; thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold) : (uint32_t)threshold; cleaning->clean_snodes_thresh = thresh; // char name_append[200]; // sprintf(name_append, ".supclean%zu", thresh); // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append); } } if(!all_colours_loaded) { // We haven't loaded all the colours // intersect_edges are edges to mask with // resets graph edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } // Print stats on removed kmers size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers; double removed_pct = (100.0 * removed_nkmers) / initial_nkmers; char removed_str[100], init_str[100]; ulong_to_str(removed_nkmers, removed_str); ulong_to_str(initial_nkmers, init_str); status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct); graph_files_merge(out_ctx_path, gfiles, num_gfiles, kmers_loaded, all_colours_loaded, intersect_edges, &outhdr, &db_graph); // Swap back if(!all_colours_loaded) db_graph.col_edges = intersect_edges; } ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht)); graph_header_dealloc(&outhdr); for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? 
-1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = 
graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / 
exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }