int ctx_join(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t use_ncols = 0; GraphFileReader tmp_gfile; GraphFileBuffer isec_gfiles_buf; gfile_buf_alloc(&isec_gfiles_buf, 8); // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 'i': graph_file_reset(&tmp_gfile); graph_file_open(&tmp_gfile, optarg); if(file_filter_into_ncols(&tmp_gfile.fltr) > 1) warn("Flattening intersection graph into colour 0: %s", optarg); file_filter_flatten(&tmp_gfile.fltr, 0); gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } GraphFileReader *igfiles = isec_gfiles_buf.b; size_t num_igfiles = isec_gfiles_buf.len; if(!out_path) cmd_print_usage("--out <out.ctx> required"); if(optind >= argc) cmd_print_usage("Please specify at least one input graph file"); // optind .. argend-1 are graphs to load size_t num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles); // Check all binaries are valid binaries with matching kmer size size_t i; size_t ctx_max_cols = 0; uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0; for(i = 0; i < num_gfiles; i++) { graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols); if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size); } ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr)); ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i])); ctx_sum_kmers += graph_file_nkmers(&gfiles[i]); } // Probe intersection graph files for(i = 0; i < num_igfiles; i++) { if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size); } uint64_t nkmers = graph_file_nkmers(&igfiles[i]); if(i == 0) min_intersect_num_kmers = nkmers; else if(nkmers < min_intersect_num_kmers) { // Put smallest intersection binary first SWAP(igfiles[i], igfiles[0]); min_intersect_num_kmers = nkmers; } } bool take_intersect = (num_igfiles > 0); // If we are taking an intersection, // all kmers intersection kmers will need to be loaded if(take_intersect) ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers; bool use_ncols_set = (use_ncols > 0); bool output_to_stdout = (strcmp(out_path,"-") == 0); // if(use_ncols == 0) use_ncols = 1; if(use_ncols_set) { if(use_ncols < ctx_max_cols && output_to_stdout) die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols); if(use_ncols > ctx_max_cols) { warn("I only need %zu colour%s ('--ncols %zu' ignored)", ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols); use_ncols = ctx_max_cols; } } else { use_ncols = output_to_stdout ? ctx_max_cols : 1; } // Check out_path is writable futil_create_output(out_path); status("Output %zu cols; from %zu files; intersecting %zu graphs; ", ctx_max_cols, num_gfiles, num_igfiles); if(num_gfiles == 1 && num_igfiles == 0) { // Loading only one file with no intersection files // Don't need to store a graph in memory, can filter as stream // Don't actually store anything in the de Bruijn graph, but we need to // pass it, so mock one up dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0); graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL); graph_file_close(&gfiles[0]); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); if(!use_ncols_set) { // Maximise use_ncols size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer; use_ncols = MIN2(max_usencols, ctx_max_cols); bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; // Re-check memory used kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); } status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols)); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Create db_graph dBGraph db_graph; Edges *intersect_edges = NULL; size_t edge_cols = (use_ncols + take_intersect); db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // We allocate edges ourself since it's a special case db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges)); // Load intersection binaries char *intsct_gname_ptr = NULL; StrBuf intersect_gname; strbuf_alloc(&intersect_gname, 1024); if(take_intersect) { GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.boolean_covgs = true; // covg++ only for(i = 0; i < num_igfiles; i++) { graph_load(&igfiles[i], gprefs, NULL); // Update intersect header // note: intersection graphs all load exactly one colour into colour 0 graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname); gprefs.must_exist_in_graph = true; gprefs.must_exist_in_edges = db_graph.col_edges; } if(num_igfiles > 1) { // Remove nodes where covg != num_igfiles HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes, db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht); } status("Loaded intersection set\n"); intsct_gname_ptr = intersect_gname.b; for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]); // Reset graph info for(i = 0; i < db_graph.num_of_cols; i++) graph_info_init(&db_graph.ginfo[i]); // Zero covgs memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg)); // Use union edges we loaded to intersect new edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } bool kmers_loaded = take_intersect, colours_loaded = false; graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles, kmers_loaded, colours_loaded, intersect_edges, intsct_gname_ptr, &db_graph); if(take_intersect) db_graph.col_edges -= db_graph.ht.capacity; for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); strbuf_dealloc(&intersect_gname); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_pop_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; int32_t max_covg = -1; // max mean coverage to remove <=0 => ignore int32_t max_klen = -1; // max length (kmers) to remove <=0 => ignore int32_t max_kdiff = -1; // max diff between bubble branch lengths <0 => ignore // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'C': cmd_check(max_covg<0, cmd); max_covg = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_klen<0, cmd); max_klen = cmd_uint32(cmd, optarg); break; case 'D': cmd_check(max_kdiff<0, cmd); max_kdiff = cmd_uint32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" pop -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); bool reread_graph_to_filter = (num_gfiles == 1 && strcmp(file_filter_path(&gfiles[0].fltr),"-") != 0); if(reread_graph_to_filter) { file_filter_flatten(&gfiles[0].fltr, 0); ncols = 1; } // Check graphs are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8*ncols + sizeof(Edges)*8*ncols + 2; // 1 bit for visited, 1 for removed kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Check out_path is writable futil_create_output(out_path); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, ncols, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS); size_t nkwords = roundup_bits2bytes(db_graph.ht.capacity); uint8_t *visited = ctx_calloc(1, nkwords); uint8_t *rmvbits = ctx_calloc(1, nkwords); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); PopBubblesPrefs prefs = {.max_rmv_covg = max_covg, .max_rmv_klen = max_klen, .max_rmv_kdiff = max_kdiff}; size_t npopped = 0; char npopped_str[50]; status("Popping bubbles..."); npopped = pop_bubbles(&db_graph, nthreads, prefs, visited, rmvbits); ulong_to_str(npopped, npopped_str); status("Popped %s bubbles", npopped_str); size_t nkmers0 = db_graph.ht.num_kmers; status("Removing nodes..."); for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i]; prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph); size_t nkmers1 = db_graph.ht.num_kmers; ctx_assert(nkmers1 <= nkmers0); char nkmers0str[50], nkmers1str[50], ndiffstr[50]; ulong_to_str(nkmers0, nkmers0str); ulong_to_str(nkmers1, nkmers1str); ulong_to_str(nkmers0-nkmers1, ndiffstr); status("Number of kmers %s -> %s (-%s)", nkmers0str, nkmers1str, ndiffstr); if(reread_graph_to_filter) { status("Streaming filtered file to: %s\n", out_path); GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, graph_paths[0]); graph_writer_stream_mkhdr(out_path, &gfile, &db_graph, db_graph.col_edges, NULL); graph_file_close(&gfile); } else { status("Saving to: %s\n", out_path); graph_writer_save_mkhdr(out_path, &db_graph, CTX_GRAPH_FILEFORMAT, NULL, 0, ncols); } ctx_free(visited); ctx_free(rmvbits); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
static void parse_args(int argc, char **argv) { BuildGraphTask task; memset(&task, 0, sizeof(task)); task.prefs = SEQ_LOADING_PREFS_INIT; task.stats = SEQ_LOADING_STATS_INIT; uint8_t fq_offset = 0; int intocolour = -1; GraphFileReader tmp_gfile; // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; bool sample_named = false, pref_unused = false; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_kmer_size(cmd, optarg); break; case 's': intocolour++; check_sample_name(optarg); sample_name_buf_add(&snamebuf, (SampleName){.colour = intocolour, .name = optarg}); sample_named = true; break; case '1': case '2': case 'i': pref_unused = false; if(!sample_named) cmd_print_usage("Please give sample name first [-s,--sample <name>]"); asyncio_task_parse(&task.files, c, optarg, fq_offset, NULL); task.prefs.colour = intocolour; add_task(&task); break; case 'M': if(!strcmp(optarg,"FF")) task.prefs.matedir = READPAIR_FF; else if(!strcmp(optarg,"FR")) task.prefs.matedir = READPAIR_FR; else if(!strcmp(optarg,"RF")) task.prefs.matedir = READPAIR_RF; else if(!strcmp(optarg,"RR")) task.prefs.matedir = READPAIR_RR; else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR"); pref_unused = true; break; case 'O': fq_offset = cmd_uint8(cmd, optarg); pref_unused = true; break; case 'Q': task.prefs.fq_cutoff = cmd_uint8(cmd, optarg); pref_unused = true; break; case 'H': task.prefs.hp_cutoff = cmd_uint8(cmd, optarg); pref_unused = true; break; case 'p': task.prefs.remove_pcr_dups = true; pref_unused = true; break; case 'P': task.prefs.remove_pcr_dups = false; pref_unused = true; break; case 'g': if(intocolour == -1) intocolour = 0; graph_file_reset(&tmp_gfile); graph_file_open2(&tmp_gfile, optarg, "r", true, intocolour); intocolour = MAX2((size_t)intocolour, file_filter_into_ncols(&tmp_gfile.fltr)-1); gfile_buf_push(&gfilebuf, &tmp_gfile, 1); sample_named = false; break; case 'I': graph_file_reset(&tmp_gfile); graph_file_open(&tmp_gfile, optarg); if(file_filter_into_ncols(&tmp_gfile.fltr) > 1) warn("Flattening intersection graph into colour 0: %s", optarg); file_filter_flatten(&tmp_gfile.fltr, 0); gfile_buf_push(&gisecbuf, &tmp_gfile, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" build -h` for help. Bad option: %s", argv[optind-1]); default: die("Bad option: %s", cmd); } }
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; GPathReader tmp_gpfile; CorrectAlnInputBuffer *inputs = &args->inputs; args->memargs = (struct MemArgs)MEM_ARGS_INIT; args->fmt = SEQ_FMT_FASTQ; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!args->out_ctp_path,cmd); args->out_ctp_path = optarg; break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&args->gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!args->nthreads, cmd); args->nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(args->fmt == SEQ_FMT_FASTQ, cmd); args->fmt = cmd_parse_format(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_push(inputs, &task, 1); asyncio_task_parse(&inputs->b[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? &tmp_path : NULL); if(correct_cmd) inputs->b[inputs->len-1].out_base = tmp_path; break; case 'M': if(!strcmp(optarg,"FF")) task.matedir = READPAIR_FF; else if(!strcmp(optarg,"FR")) task.matedir = READPAIR_FR; else if(!strcmp(optarg,"RF")) task.matedir = READPAIR_RF; else if(!strcmp(optarg,"RR")) task.matedir = READPAIR_RR; else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR"); used = 0; break; case 'O': fq_offset = cmd_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'l': task.crt_params.frag_len_min = cmd_uint32(cmd, optarg); used = 0; break; case 'L': task.crt_params.frag_len_max = cmd_uint32(cmd, optarg); used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'd': task.crt_params.gap_wiggle = cmd_udouble(cmd, optarg); used = 0; break; case 'D': task.crt_params.gap_variance = cmd_udouble(cmd, optarg); used = 0; break; case 'X': task.crt_params.max_context = cmd_uint32(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': cmd_check(!args->dump_seq_sizes, cmd); args->dump_seq_sizes = optarg; break; case 'G': cmd_check(!args->dump_frag_sizes, cmd); args->dump_frag_sizes = optarg; break; case 'u': args->use_new_paths = true; break; case 'x': gen_paths_print_contigs = true; break; case 'y': gen_paths_print_paths = true; break; case 'z': gen_paths_print_reads = true; break; case 'Z': cmd_check(!args->fq_zero, cmd); if(strlen(optarg) != 1) cmd_print_usage("--fq-zero <c> requires a single char"); args->fq_zero = optarg[0]; break; case 'P': cmd_check(!args->append_orig_seq,cmd); args->append_orig_seq = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" thread/correct -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(args->nthreads == 0) args->nthreads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); // ctx_thread requires output file if(!correct_cmd && !args->out_ctp_path) cmd_print_usage("--out <out.ctp> is required"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path); if(!correct_cmd && file_filter_into_ncols(&gfile->fltr) > 1) die("Please specify a single colour e.g. %s:0", file_filter_path(&gfile->fltr)); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->gpfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.b[i].fltr, 0); if(!correct_cmd && file_filter_into_ncols(&args->gpfiles.b[i].fltr) > 1) { die("Please specify a single colour e.g. %s:0", file_filter_path(&args->gpfiles.b[i].fltr)); } path_max_usedcols = MAX2(path_max_usedcols, file_filter_into_ncols(&args->gpfiles.b[i].fltr)); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_gpaths_compatible(gfile, 1, args->gpfiles.b, args->gpfiles.len, -1); // if no paths loaded, set all max_context values to 1, since >1 kmer only // useful if can pickup paths if(args->gpfiles.len == 0) { for(i = 0; i < inputs->len; i++) inputs->b[i].crt_params.max_context = 1; } // Check frag_len_min < frag_len_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->b[i]; t->files.ptr = t; if(t->crt_params.frag_len_min > t->crt_params.frag_len_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.frag_len_min, t->crt_params.frag_len_max); } correct_aln_input_print(&inputs->b[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.frag_len_max); } futil_create_output(args->dump_seq_sizes); futil_create_output(args->dump_frag_sizes); }
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; int tmp_thresh; // 0 => no calling, -1 => auto CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; size_t dump_seq_n = 0, dump_mp_n = 0; // how many times are -g -G specified PathFileReader tmp_pfile; CorrectAlnInputBuffer *inputs = &args->inputs; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': if(args->out_ctp_path != NULL) cmd_print_usage(NULL); args->out_ctp_path = optarg; break; case 'p': tmp_pfile = INIT_PATH_READER; path_file_open(&tmp_pfile, optarg, true); pfile_buf_add(&args->pfiles, tmp_pfile); break; case 't': if(args->num_of_threads != 0) die("%s set twice", cmd); args->num_of_threads = cmd_parse_arg_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_parse_arg_uint32(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_add(inputs, task); asyncio_task_parse(&inputs->data[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? &tmp_path : NULL); if(correct_cmd) inputs->data[inputs->len-1].out_base = tmp_path; break; case 'f': task.matedir = READPAIR_FR; used = 0; break; case 'F': task.matedir = READPAIR_FF; used = 0; break; case 'r': task.matedir = READPAIR_RF; used = 0; break; case 'R': task.matedir = READPAIR_RR; used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'q': fq_offset = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': task.crt_params.ins_gap_min = cmd_parse_arg_uint32(cmd, optarg); used = 0; break; case 'G': task.crt_params.ins_gap_max = cmd_parse_arg_uint32(cmd, optarg); used = 0; break; case 'S': args->dump_seq_sizes = optarg; dump_seq_n++; break; case 'M': args->dump_mp_sizes = optarg; dump_mp_n++; break; case 'u': args->use_new_paths = true; break; case 'C': if(optarg == NULL || strcmp(optarg,"auto")) args->clean_threshold = -1; else if(parse_entire_int(optarg,&tmp_thresh) && tmp_thresh >= -1) { if(tmp_thresh != -1 && tmp_thresh < 2) warn("Ignoring --clean %u (too small < 2)", tmp_thresh); else if(tmp_thresh > 255) warn("Ignoring --clean %u (too big > 255)", tmp_thresh); else args->clean_threshold = tmp_thresh; } else die("Bad argument for %s <auto|N> where N > 1", cmd); args->clean_paths = (args->clean_threshold != 0); break; case 'X': gen_paths_print_contigs = true; break; case 'Y': gen_paths_print_paths = true; break; case 'Z': gen_paths_print_reads = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" thread -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(args->num_of_threads == 0) args->num_of_threads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); if(dump_seq_n > 1) die("Cannot specify --seq-gaps <out> more than once"); if(dump_mp_n > 1) die("Cannot specify --mp-gaps <out> more than once"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path, true); file_filter_update_intocol(&gfile->fltr, 0); if(!correct_cmd && graph_file_usedcols(gfile) > 1) die("Please specify a single colour e.g. %s:0", gfile->fltr.file_path.buff); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->pfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.data[i].fltr, 0); if(!correct_cmd && path_file_usedcols(&args->pfiles.data[i]) > 1) { die("Please specify a single colour e.g. %s:0", args->pfiles.data[i].fltr.file_path.buff); } path_max_usedcols = MAX2(path_max_usedcols, path_file_usedcols(&args->pfiles.data[i])); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_paths_compatible(gfile, 1, args->pfiles.data, args->pfiles.len); // Check ins_gap_min < ins_gap_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->data[i]; t->files.ptr = t; if(t->crt_params.ins_gap_min > t->crt_params.ins_gap_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.ins_gap_min, t->crt_params.ins_gap_max); } correct_aln_input_print(&inputs->data[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.ins_gap_max); } }
int ctx_view(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // TODO: // print_action actions[argc]; // bool read_kmers = false; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: cmd_print_usage("Programmer fail. Tell Isaac."); } } if(print_kmers) parse_kmers = 1; bool no_flags = (!print_info && !parse_kmers && !print_kmers); if(no_flags) { print_info = parse_kmers = 1; } if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)"); char *path = argv[optind]; size_t num_errors = 0, num_warnings = 0; GraphFileReader gfile; memset(&gfile, 0, sizeof(gfile)); int ret = graph_file_open(&gfile, path); if(ret == 0) die("Cannot open file: %s", path); if(print_info) { char fsize_str[50]; bytes_to_str((size_t)gfile.file_size, 0, fsize_str); printf("Loading file: %s\n", file_filter_path(&gfile.fltr)); printf("File size: %s\n", fsize_str); printf("----\n"); } size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_size = gfile.hdr.kmer_size; ctx_assert(ncols > 0); GraphFileHeader hdr; memset(&hdr, 0, sizeof(hdr)); graph_file_merge_header(&hdr, &gfile); uint64_t nkmers_read = 0, nkmers_loaded = 0; uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0; uint64_t *col_nkmers, *col_sum_covgs; col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0])); col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0])); // Print header if(print_info) print_header(&hdr, gfile.num_of_kmers); BinaryKmer bkmer; Covg covgs[ncols], keep_kmer; Edges edges[ncols]; bool direct_read = file_filter_is_direct(&gfile.fltr); if(parse_kmers || print_kmers) { if(print_info && print_kmers) printf("----\n"); for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++) { // If kmer has no covg in any samples -> don't load keep_kmer = 0; for(col = 0; col < ncols; col++) { col_nkmers[col] += (covgs[col] > 0); col_sum_covgs[col] += covgs[col]; keep_kmer |= covgs[col]; } if(!direct_read && !keep_kmer) continue; nkmers_loaded++; /* Kmer Checks */ // graph_file_read_reset() already checks for: // 1. oversized kmers // 2. kmers with covg 0 in all colours // 3. edges without coverage in a colour // Check for all-zeros (i.e. all As kmer: AAAAAA) uint64_t kmer_words_or = 0; for(i = 0; i < hdr.num_of_bitfields; i++) kmer_words_or |= bkmer.b[i]; if(kmer_words_or == 0) { if(num_all_zero_kmers == 1) { loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n", nkmers_read); } num_all_zero_kmers++; } // Check covg is 0 for all colours for(i = 0; i < ncols && covgs[i] == 0; i++); num_zero_covg_kmers += (i == ncols); // Print if(print_kmers) db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout); } } // check for various reading errors // if(errno != 0) // loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno)); int err = ferror(gfile.fh); if(err != 0) loading_error("occurred after file reading [%i]\n", err); char nstr[50]; if(print_kmers || parse_kmers) { // file_size is set to -1 if we are reading from a stream, // therefore won't be able to check number of kmers read if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) { loading_warning("Expected %zu kmers, read %zu\n", (size_t)gfile.num_of_kmers, (size_t)nkmers_read); } if(num_all_zero_kmers > 1) { loading_error("%s all-zero-kmers seen\n", ulong_to_str(num_all_zero_kmers, nstr)); } if(num_zero_covg_kmers > 0) { loading_warning("%s kmers have no coverage in any colour\n", ulong_to_str(num_zero_covg_kmers, nstr)); } } // Count warnings printed by graph_file_reader.c num_warnings += gfile.error_zero_covg; num_warnings += gfile.error_missing_covg; // Can only print these stats if we're read in the kmers if((print_kmers || parse_kmers) && print_info) { // print kmer coverage per sample printf("\n---- Per colour stats\n"); printf("num. kmers:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_nkmers[col], nstr)); printf("\n"); printf("sum coverage:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr)); printf("\n"); printf("kmer coverage:"); for(col = 0; col < ncols; col++) printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col])); printf("\n"); // Overall stats uint64_t sum_covgs = 0; double mean_kmer_covg = 0.0; for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col]; mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0; printf("\n---- Overall stats\n"); printf("Total kmers: %s\n", ulong_to_str(nkmers_loaded, nstr)); printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr)); printf("Mean coverage: %s\n", double_to_str(mean_kmer_covg, 2, nstr)); } if(print_info) { // Print memory stats uint64_t mem, capacity, num_buckets, req_capacity; uint8_t bucket_size; req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY); capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size); mem = ht_mem(bucket_size, num_buckets, sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8); char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100]; bytes_to_str(mem, 1, memstr); ulong_to_str(capacity, capacitystr); ulong_to_str(bucket_size, bucket_size_str); ulong_to_str(num_buckets, num_buckets_str); size_t mem_height = (size_t)__builtin_ctzl(num_buckets); printf("\n---- Memory\n"); printf("memory required: %s [capacity: %s]\n", memstr, capacitystr); printf(" bucket size: %s; number of buckets: %s\n", bucket_size_str, num_buckets_str); printf(" --kmer_size %zu --mem_height %zu --mem_width %i\n", kmer_size, mem_height, bucket_size); } if((print_kmers || parse_kmers) && print_info) { printf("\n----\n"); if(num_warnings > 0 || num_errors > 0) { printf("Warnings: %zu; Errors: %zu\n", (size_t)num_warnings, (size_t)num_errors); } if(num_errors == 0) printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n"); } ctx_free(col_nkmers); ctx_free(col_sum_covgs); // Close file (which zeros it) graph_file_close(&gfile); graph_header_dealloc(&hdr); return num_errors ? EXIT_FAILURE : EXIT_SUCCESS; }
int ctx_exp_abc(int argc, char **argv) { size_t i, nthreads = 0, num_repeats = 0, max_AB_dist = 0; struct MemArgs memargs = MEM_ARGS_INIT; bool print_failed_contigs = false; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 'N': cmd_check(!num_repeats,cmd); num_repeats = cmd_uint32_nonzero(cmd, optarg); break; case 'M': cmd_check(!max_AB_dist,cmd); max_AB_dist = cmd_uint32_nonzero(cmd, optarg); break; case 'P': cmd_check(!print_failed_contigs,cmd); print_failed_contigs = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" exp_abc -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(num_repeats == 0) num_repeats = DEFAULT_NUM_REPEATS; if(max_AB_dist == 0) max_AB_dist = DEFAULT_MAX_AB_DIST; if(print_failed_contigs && nthreads != 1) { warn("--print forces nthreads to be one. soz."); nthreads = 1; } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, ctx_path); size_t ncols = file_filter_into_ncols(&gfile.fltr); // Check only loading one colour if(ncols > 1) die("Only implemented for one colour currently"); // Check graph + paths are compatible graphs_gpaths_compatible(&gfile, 1, gpfiles.b, gpfiles.len, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, gfile.num_of_kmers, gfile.num_of_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile.hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // Load the graph GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; graph_load(&gfile, gprefs, NULL); graph_file_close(&gfile); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); status("\n"); status("Test 1: Priming region A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, true, nthreads, num_repeats, max_AB_dist, print_failed_contigs); status("\n"); status("Test 2: Trying to traverse A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, false, nthreads, num_repeats, max_AB_dist, print_failed_contigs); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }