int ctx_thread(int argc, char **argv) { struct ReadThreadCmdArgs args; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, false); GraphFileReader *gfile = &args.gfile; GPathFileBuffer *gpfiles = &args.gpfiles; CorrectAlnInputBuffer *inputs = &args.inputs; size_t i; if(args.zero_link_counts && gpfiles->len == 0) cmd_print_usage("-0,--zero-paths without -p,--paths <in.ctp> has no meaning"); // Check each path file only loads one colour gpaths_only_for_colour(gpfiles->b, gpfiles->len, 0); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, total_mem; size_t path_hash_mem, path_store_mem, path_mem; bool sep_path_list = (!args.use_new_paths && gpfiles->len > 0); bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + 2 * args.nthreads; // Have traversed // false -> don't use mem_to_use to decide how many kmers to store in hash // since we need some of that memory for storing paths kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, gfile->num_of_kmers, gfile->num_of_kmers, false, &graph_mem); // Paths memory size_t min_path_mem = 0; gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem); if(graph_mem + min_path_mem > args.memargs.mem_to_use) { char buf[50]; die("Require at least %s memory", bytes_to_str(graph_mem+min_path_mem, 1, buf)); } path_mem = args.memargs.mem_to_use - graph_mem; size_t pentry_hash_mem = sizeof(GPEntry)/0.7; size_t pentry_store_mem = sizeof(GPath) + 8 + // struct + sequence 1 + // in colour sizeof(uint8_t) + // counts sizeof(uint32_t); // kmer length size_t max_paths = path_mem / (pentry_store_mem + pentry_hash_mem); path_store_mem = max_paths * pentry_store_mem; path_hash_mem = max_paths * pentry_hash_mem; cmd_print_mem(path_hash_mem, "paths hash"); cmd_print_mem(path_store_mem, "paths store"); total_mem = graph_mem + path_mem; cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(args.out_ctp_path, "w"); status("Creating paths file: %s", futil_outpath_str(args.out_ctp_path)); // // Allocate memory // dBGraph db_graph; size_t kmer_size = gfile->hdr.kmer_size; db_graph_alloc(&db_graph, kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Split path memory 2:1 between store and hash // Create a path store that tracks path counts gpath_store_alloc(&db_graph.gpstore, db_graph.num_of_cols, db_graph.ht.capacity, 0, path_store_mem, true, sep_path_list); // Create path hash table for fast lookup gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, path_hash_mem); if(args.use_new_paths) { status("Using paths as they are added (risky)"); } else { status("Not using new paths as they are added (safe)"); } // // Start up workers to add paths to the graph // GenPathWorker *workers; workers = gen_paths_workers_alloc(args.nthreads, &db_graph); // Setup for loading graphs graph LoadingStats gstats; loading_stats_init(&gstats); // Path statistics LoadingStats *load_stats = gen_paths_get_stats(workers); CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers); // Load contig hist distribution for(i = 0; i < gpfiles->len; i++) { gpath_reader_load_contig_hist(gpfiles->b[i].json, gpfiles->b[i].fltr.path.b, file_filter_fromcol(&gpfiles->b[i].fltr, 0), &aln_stats->contig_histgrm); } GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // already loaded paths // Load graph, print stats, close file graph_load(gfile, gprefs, &gstats); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load existing paths for(i = 0; i < gpfiles->len; i++) gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // zero link counts of already loaded links if(args.zero_link_counts) { status("Zeroing link counts for loaded links"); gpath_set_zero_nseen(&db_graph.gpstore.gpset); } if(!args.use_new_paths) gpath_store_split_read_write(&db_graph.gpstore); // Deal with a set of files at once // Can have different numbers of inputs vs threads size_t start, end; for(start = 0; start < inputs->len; start += MAX_IO_THREADS) { end = MIN2(inputs->len, start+MAX_IO_THREADS); generate_paths(inputs->b+start, end-start, workers, args.nthreads); } // Print memory statistics gpath_hash_print_stats(&db_graph.gphash); gpath_store_print_stats(&db_graph.gpstore); correct_aln_dump_stats(aln_stats, load_stats, args.dump_seq_sizes, args.dump_frag_sizes, db_graph.ht.num_kmers); // Don't need GPathHash anymore gpath_hash_dealloc(&db_graph.gphash); cJSON **hdrs = ctx_malloc(gpfiles->len * sizeof(cJSON*)); for(i = 0; i < gpfiles->len; i++) hdrs[i] = gpfiles->b[i].json; size_t output_threads = MIN2(args.nthreads, MAX_IO_THREADS); // Generate a cJSON header for all inputs cJSON *thread_hdr = cJSON_CreateObject(); cJSON *inputs_hdr = cJSON_CreateArray(); cJSON_AddItemToObject(thread_hdr, "inputs", inputs_hdr); for(i = 0; i < inputs->len; i++) cJSON_AddItemToArray(inputs_hdr, correct_aln_input_json_hdr(&inputs->b[i])); // Write output file gpath_save(gzout, args.out_ctp_path, output_threads, true, "thread", thread_hdr, hdrs, gpfiles->len, &aln_stats->contig_histgrm, 1, &db_graph); gzclose(gzout); ctx_free(hdrs); // Optionally run path checks for debugging // gpath_checks_all_paths(&db_graph, args.nthreads); // ins_gap, err_gap no longer allocated after this line gen_paths_workers_dealloc(workers, args.nthreads); // Close and free input files etc. read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_correct(int argc, char **argv) { size_t i; struct ReadThreadCmdArgs args; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, true); GraphFileReader *gfile = &args.gfile; GPathFileBuffer *gpfiles = &args.gpfiles; CorrectAlnInputBuffer *inputs = &args.inputs; // Update colours in graph file - sample in 0, all others in 1 size_t ncols = gpath_load_sample_pop(gfile, 1, gpfiles->b, gpfiles->len, args.colour); // Check for compatibility between graph files and link files graphs_gpaths_compatible(gfile, 1, gpfiles->b, gpfiles->len, 1); int64_t ctx_num_kmers = gfile->num_of_kmers; // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of noreseed bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + (gpfiles->len > 0 ? sizeof(GPath*)*8 : 0) + ncols; // in colour kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, ctx_num_kmers, ctx_num_kmers, false, &graph_mem); // Paths memory size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem, false, kmers_in_hash, false); cmd_print_mem(path_mem, "paths"); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Check we can write all output files // // Open output files SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput)); bool err_occurred = false; for(i = 0; i < inputs->len && !err_occurred; i++) { CorrectAlnInput *input = &inputs->b[i]; // We loaded target colour into colour zero input->crt_params.ctxcol = input->crt_params.ctpcol = 0; bool is_pe = asyncio_task_is_pe(&input->files); err_occurred = !seqout_open(&outputs[i], input->out_base, args.fmt, is_pe); input->output = &outputs[i]; } // Abandon if some of the output files already exist if(err_occurred) { for(i = 0; i < inputs->len; i++) seqout_close(&outputs[i], true); die("Error creating output files"); } // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Create a path store that does not tracks path counts gpath_reader_alloc_gpstore(gpfiles->b, gpfiles->len, path_mem, false, &db_graph); // // Load Graph and link files // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; // Load graph, print stats, close file graph_load(gfile, gprefs, NULL); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load link files for(i = 0; i < gpfiles->len; i++) { gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles->b[i]); } // // Run alignment // correct_reads(inputs->b, inputs->len, args.dump_seq_sizes, args.dump_frag_sizes, args.fq_zero, args.append_orig_seq, args.nthreads, &db_graph); // Close and free output files for(i = 0; i < inputs->len; i++) seqout_close(&outputs[i], false); ctx_free(outputs); // Closes input files read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_correct(int argc, char **argv) { size_t i, j; struct ReadThreadCmdArgs args = READ_THREAD_CMD_ARGS_INIT; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, true); GraphFileReader *gfile = &args.gfile; PathFileBuffer *pfiles = &args.pfiles; CorrectAlnInputBuffer *inputs = &args.inputs; size_t ctx_total_cols = gfile->hdr.num_of_cols; size_t ctx_num_kmers = gfile->num_of_kmers; if(args.colour > ctx_total_cols) cmd_print_usage("-c %zu is too big [> %zu]", args.colour, ctx_total_cols); size_t ctp_usedcols = 0; for(i = 0; i < pfiles->len; i++) { if(!file_filter_iscolloaded(&pfiles->data[i].fltr, args.colour)) { cmd_print_usage("Path file doesn't load into colour %zu: %s", args.colour, pfiles->data[i].fltr.orig_path.buff); } ctp_usedcols = MAX2(ctp_usedcols, path_file_usedcols(&pfiles->data[i])); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of noreseed bits_per_kmer = sizeof(Edges)*8 + ctx_num_kmers + sizeof(uint64_t)*8; kmers_in_hash = cmd_get_kmers_in_hash2(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, ctx_num_kmers, ctx_num_kmers, false, &graph_mem); // Paths memory path_mem = path_files_mem_required(pfiles->data, pfiles->len, false, false, ctp_usedcols, 0); cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Check we can read all output files // // Open output files SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput)); bool output_files_exist = false; for(i = 0; i < inputs->len; i++) { CorrectAlnInput *input = &inputs->data[i]; input->crt_params.ctxcol = input->crt_params.ctpcol = args.colour; SeqOutput *output = &outputs[i]; seq_output_alloc(output); seq_output_set_paths(output, input->out_base, async_task_pe_output(&input->files)); input->output = output; // output check prints warnings and returns true if errors output_files_exist |= seq_output_files_exist_check(output); } // Abandon if some of the output files already exist if(output_files_exist) die("Output files already exist"); // Attempt to open all files for(i = 0; i < inputs->len && seq_output_open(&outputs[i]); i++) {} // Check if something went wrong - if so remove all output files if(i < inputs->len) { for(j = 0; j < i; j++) seq_output_delete(&outputs[i]); die("Couldn't open output files"); } // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ctx_total_cols, 1, kmers_in_hash); size_t bytes_per_col = roundup_bits2bytes(db_graph.ht.capacity); db_graph.col_edges = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); db_graph.node_in_cols = ctx_calloc(bytes_per_col * ctx_total_cols, 1); // Paths path_store_alloc(&db_graph.pstore, path_mem, false, db_graph.ht.capacity, ctp_usedcols); // // Load Graph and Path files // LoadingStats gstats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = true}; // Load graph, print stats, close file graph_load(gfile, gprefs, &gstats); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load path files (does nothing if num_fpiles == 0) paths_format_merge(pfiles->data, pfiles->len, false, false, args.num_of_threads, &db_graph); // // Run alignment // correct_reads(args.num_of_threads, MAX_IO_THREADS, inputs->data, inputs->len, &db_graph); // Close and free output files for(i = 0; i < inputs->len; i++) seq_output_dealloc(&outputs[i]); ctx_free(outputs); read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_contigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t i, contig_limit = 0, colour = 0; bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R const char *conf_table_path = NULL; // save confidence table to here bool use_missing_info_check = true, seed_with_unused_paths = false; double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min // Read length and expected depth for calculating confidences size_t genome_size = 0; seq_file_t *tmp_seed_file = NULL; SeqFilePtrBuffer seed_buf; seq_file_ptr_buf_alloc(&seed_buf, 16); GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100], shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path,cmd); out_path = optarg; break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case '1': case 's': // --seed <in.fa> if((tmp_seed_file = seq_open(optarg)) == NULL) die("Cannot read --seed file: %s", optarg); seq_file_ptr_buf_add(&seed_buf, tmp_seed_file); break; case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break; case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break; case 'N': cmd_check(!contig_limit,cmd); contig_limit = cmd_uint32_nonzero(cmd, optarg); break; case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break; case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break; case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break; case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break; case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break; case 'C': cmd_check(min_cumul_confid < 0,cmd); min_cumul_confid = cmd_udouble(cmd,optarg); if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case 'T': cmd_check(min_step_confid < 0,cmd); min_step_confid = cmd_udouble(cmd,optarg); if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(cmd_no_reseed && cmd_reseed) cmd_print_usage("Cannot specify both -r and -R"); if(contig_limit && seed_with_unused_paths) cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths"); bool sample_with_replacement = cmd_reseed; // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(!seed_buf.len && !contig_limit && sample_with_replacement) { cmd_print_usage("Please specify one or more of: " "--no-reseed | --ncontigs | --seed <in.fa>"); } if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // char *ctx_path = argv[optind]; // // Open Graph file // // GraphFileReader gfile; // memset(&gfile, 0, sizeof(GraphFileReader)); // graph_file_open(&gfile, ctx_path); // Update colours in graph file - sample in 0, all others in 1 // never need more than two colours ncols = gpath_load_sample_pop(gfiles, num_gfiles, gpfiles.b, gpfiles.len, colour); // Check for compatibility between graph files and path files // pop_colour is colour 1 graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1); if(!genome_size) { char nk_str[50]; if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming"); genome_size = ctx_max_kmers; ulong_to_str(genome_size, nk_str); status("Taking number of kmers as genome size: %s", nk_str); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols + !sample_with_replacement; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // Load contig hist distribution from ctp files ZeroSizeBuffer contig_hist; memset(&contig_hist, 0, sizeof(contig_hist)); for(i = 0; i < gpfiles.len; i++) { gpath_reader_load_contig_hist(gpfiles.b[i].json, gpfiles.b[i].fltr.path.b, file_filter_fromcol(&gpfiles.b[i].fltr, 0), &contig_hist); } // Calculate confidences, only for one colour ContigConfidenceTable conf_table; conf_table_alloc(&conf_table, 1); conf_table_update_hist(&conf_table, 0, genome_size, contig_hist.b, contig_hist.len); if(conf_table_path != NULL) { conf_table_save(&conf_table, conf_table_path); } zsize_buf_dealloc(&contig_hist); // // Output file if printing // FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; // Allocate dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); uint8_t *visited = NULL; if(!sample_with_replacement) visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Load graph LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = true}; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &stats); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load path files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); AssembleContigStats assem_stats; assemble_contigs_stats_init(&assem_stats); assemble_contigs(nthreads, seed_buf.b, seed_buf.len, contig_limit, visited, use_missing_info_check, seed_with_unused_paths, min_step_confid, min_cumul_confid, fout, out_path, &assem_stats, &conf_table, &db_graph, 0); // Sample always loaded into colour zero if(fout && fout != stdout) fclose(fout); assemble_contigs_stats_print(&assem_stats); assemble_contigs_stats_destroy(&assem_stats); conf_table_dealloc(&conf_table); for(i = 0; i < seed_buf.len; i++) seq_close(seed_buf.b[i]); seq_file_ptr_buf_dealloc(&seed_buf); ctx_free(visited); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t max_allele_len = 0, max_flank_len = 0; bool remove_serial_bubbles = true; // List of haploid colours size_t *hapcols = NULL; int nhapcols = 0; char *hapcols_arg = NULL; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'H': cmd_check(!hapcols_arg, cmd); hapcols_arg = optarg; break; case 'A': cmd_check(!max_allele_len, cmd); max_allele_len = cmd_uint32_nonzero(cmd, optarg); break; case 'F': cmd_check(!max_flank_len, cmd); max_flank_len = cmd_uint32_nonzero(cmd, optarg); break; case 'S': cmd_check(remove_serial_bubbles,cmd); remove_serial_bubbles = false; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(max_allele_len == 0) max_allele_len = DEFAULT_MAX_ALLELE; if(max_flank_len == 0) max_flank_len = DEFAULT_MAX_FLANK; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1); // // Check haploid colours are valid // if(hapcols_arg != NULL) { if((nhapcols = range_get_num(hapcols_arg, ncols)) < 0) die("Invalid haploid colour list: %s", hapcols_arg); hapcols = ctx_calloc(nhapcols, sizeof(hapcols[0])); if(range_parse_array(hapcols_arg, hapcols, ncols) < 0) die("Invalid haploid colour list: %s", hapcols_arg); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, thread_mem; char thread_mem_str[100]; // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) + // visitedfw/rv(2bits/thread) bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0) + ncols + 2*nthreads; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Thread memory thread_mem = roundup_bits2bytes(kmers_in_hash) * 2; bytes_to_str(thread_mem * nthreads, 1, thread_mem_str); status("[memory] (of which threads: %zu x %zu = %s)\n", nthreads, thread_mem, thread_mem_str); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); size_t total_mem = graph_mem + thread_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(out_path, "w"); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // Create array of cJSON** from input files cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*)); for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json; // Now call variants BubbleCallingPrefs call_prefs = {.max_allele_len = max_allele_len, .max_flank_len = max_flank_len, .haploid_cols = hapcols, .nhaploid_cols = nhapcols, .remove_serial_bubbles = remove_serial_bubbles}; invoke_bubble_caller(nthreads, &call_prefs, gzout, out_path, hdrs, gpfiles.len, &db_graph); status(" saved to: %s\n", out_path); gzclose(gzout); ctx_free(hdrs); // Close input link files for(i = 0; i < gpfiles.len; i++) gpath_reader_close(&gpfiles.b[i]); gpfile_buf_dealloc(&gpfiles); ctx_free(hapcols); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_exp_abc(int argc, char **argv) { size_t i, nthreads = 0, num_repeats = 0, max_AB_dist = 0; struct MemArgs memargs = MEM_ARGS_INIT; bool print_failed_contigs = false; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 'N': cmd_check(!num_repeats,cmd); num_repeats = cmd_uint32_nonzero(cmd, optarg); break; case 'M': cmd_check(!max_AB_dist,cmd); max_AB_dist = cmd_uint32_nonzero(cmd, optarg); break; case 'P': cmd_check(!print_failed_contigs,cmd); print_failed_contigs = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" exp_abc -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(num_repeats == 0) num_repeats = DEFAULT_NUM_REPEATS; if(max_AB_dist == 0) max_AB_dist = DEFAULT_MAX_AB_DIST; if(print_failed_contigs && nthreads != 1) { warn("--print forces nthreads to be one. soz."); nthreads = 1; } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, ctx_path); size_t ncols = file_filter_into_ncols(&gfile.fltr); // Check only loading one colour if(ncols > 1) die("Only implemented for one colour currently"); // Check graph + paths are compatible graphs_gpaths_compatible(&gfile, 1, gpfiles.b, gpfiles.len, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, gfile.num_of_kmers, gfile.num_of_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile.hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // Load the graph GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; graph_load(&gfile, gprefs, NULL); graph_file_close(&gfile); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); status("\n"); status("Test 1: Priming region A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, true, nthreads, num_repeats, max_AB_dist, print_failed_contigs); status("\n"); status("Test 2: Trying to traverse A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, false, nthreads, num_repeats, max_AB_dist, print_failed_contigs); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }