int main(int argc, char **argv) { (void)argc; (void)argv; cortex_init(); cmd_init(argc, argv); if(argc != 3) die("usage: ./debug <in.ctp> <in.ctx>"); const char *out_path = argv[2]; GPathReader pfile; memset(&pfile, 0, sizeof(GPathReader)); gpath_reader_open(&pfile, argv[1], true); status("Got file with %zu colours", pfile.ncolours); size_t i, kmer_size = 7, ncols = 3; gpath_reader_check(&pfile, kmer_size, ncols); gzFile gzout = futil_gzopen_create(out_path, "w"); dBGraph db_graph; db_graph_alloc(&db_graph, kmer_size, ncols, 1, 1024, DBG_ALLOC_EDGES); // Create a path store that tracks path counts gpath_store_alloc(&db_graph.gpstore, db_graph.num_of_cols, db_graph.ht.capacity, ONE_MEGABYTE, true, false); // Create path hash table for fast lookup gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, ONE_MEGABYTE); // Set sample names for(i = 0; i < pfile.ncolours; i++) { const char *sample_name = gpath_reader_get_sample_name(&pfile, i); ctx_assert(sample_name != NULL); strbuf_set(&db_graph.ginfo[i].sample_name, sample_name); } // Load path files, add kmers that are missing gpath_reader_load(&pfile, GPATH_ADD_MISSING_KMERS, &db_graph); hash_table_print_stats(&db_graph.ht); // Write output file gpath_save(gzout, out_path, 1, true, NULL, NULL, &pfile.json, 1, &db_graph); gzclose(gzout); // Checks // gpath_checks_all_paths(&db_graph, 2); // use two threads gpath_checks_counts(&db_graph); // Clean up gpath_reader_close(&pfile); db_graph_dealloc(&db_graph); cortex_destroy(); return EXIT_SUCCESS; }
int ctx_links(int argc, char **argv) { size_t limit = 0; const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL; const char *thresh_path = NULL, *hist_path = NULL; size_t hist_distsize = 0, hist_covgsize = 0; size_t cutoff = 0; bool clean = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break; case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break; case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break; case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break; case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break; case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break; case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break; case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist"); if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist"); // Defaults if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST; if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG; if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments"); const char *ctp_path = argv[optind]; bool list = (csv_out_path != NULL); bool plot = (plot_out_path != NULL); bool save = (link_out_path != NULL); bool hist_covg = (thresh_path != NULL || hist_path != NULL); size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1); if(clean && !save) cmd_print_usage("Need to give --out <out.ctp.gz> with --clean"); if(!save && !list && !plot && !hist_covg) cmd_print_usage("Please specify one of --plot, --list or --clean"); if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0) cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!"); // Open input file FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL; FILE *thresh_fh = NULL, *hist_fh = NULL; gzFile link_gz = NULL; // Check file don't exist or that we can overwrite // Will ignore if path is null bool err = false; err |= futil_check_outfile(csv_out_path); err |= futil_check_outfile(plot_out_path); err |= futil_check_outfile(link_out_path); err |= futil_check_outfile(thresh_path); err |= futil_check_outfile(hist_path); if(err) die("Use -f,--force to overwrite files"); StrBuf link_tmp_path; strbuf_alloc(&link_tmp_path, 1024); GPathReader ctpin; memset(&ctpin, 0, sizeof(ctpin)); gpath_reader_open(&ctpin, ctp_path); size_t ncols = file_filter_into_ncols(&ctpin.fltr); size_t kmer_size = gpath_reader_get_kmer_size(&ctpin); cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1); if(ncols != 1) die("Can only clean a single colour at a time. Sorry."); uint64_t (*hists)[hist_covgsize] = NULL; if(hist_covg) { hists = ctx_calloc(hist_distsize, sizeof(hists[0])); } if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL) die("Cannot open file: %s", hist_path); if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL) die("Cannot open file: %s", thresh_path); if(limit) status("Limiting to the first %zu kmers", limit); if(clean) { timestamp(); message(" Cleaning coverage below %zu", cutoff); message("\n"); } if(save) { // Check we can find the fields we need cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); if(!nkmers_json || !nlinks_json || !nbytes_json) die("Cannot find required header entries"); // Create a random temporary file link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path); status("Saving output to: %s", link_out_path); status("Temporary output: %s", link_tmp_path.b); // Open output file if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL) die("Cannot open output link file: %s", link_out_path); // Need to open output file first so we can get absolute path // Update the header to include this command json_hdr_add_curr_cmd(newhdr, link_out_path); } if(list) { status("Listing to %s", csv_out_path); if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL) die("Cannot open output CSV file %s", csv_out_path); // Print csv header fprintf(list_fh, "SeqLen,Covg\n"); } if(plot) { status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path); if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL) die("Cannot open output .dot file %s", plot_out_path); } SizeBuffer countbuf, jposbuf; size_buf_alloc(&countbuf, 16); size_buf_alloc(&jposbuf, 1024); StrBuf kmerbuf, juncsbuf, seqbuf, outbuf; strbuf_alloc(&kmerbuf, 1024); strbuf_alloc(&juncsbuf, 1024); strbuf_alloc(&seqbuf, 1024); strbuf_alloc(&outbuf, 1024); bool link_fw; size_t njuncs; size_t knum, nlinks, num_links_exp = 0; LinkTree ltree; ltree_alloc(<ree, kmer_size); LinkTreeStats tree_stats; memset(&tree_stats, 0, sizeof(tree_stats)); size_t init_num_links = 0, num_links = 0; for(knum = 0; !limit || knum < limit; knum++) { ltree_reset(<ree); if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break; ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu", kmerbuf.end, kmer_size); // status("kmer: %s", kmerbuf.b); for(nlinks = 0; gpath_reader_read_link(&ctpin, &link_fw, &njuncs, &countbuf, &juncsbuf, &seqbuf, &jposbuf); nlinks++) { ltree_add(<ree, link_fw, countbuf.b[0], jposbuf.b, juncsbuf.b, seqbuf.b); } if(nlinks != num_links_exp) warn("Links count mismatch %zu != %zu", nlinks, num_links_exp); if(hist_covg) { ltree_update_covg_hists(<ree, (uint64_t*)hists, hist_distsize, hist_covgsize); } if(clean) { ltree_clean(<ree, cutoff); } // Accumulate statistics ltree_get_stats(<ree, &tree_stats); num_links = tree_stats.num_links - init_num_links; init_num_links = tree_stats.num_links; if(list) { ltree_write_list(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end) die("Cannot write CSV file to: %s", csv_out_path); strbuf_reset(&outbuf); } if(save && num_links) { ltree_write_ctp(<ree, kmerbuf.b, num_links, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end) die("Cannot write ctp file to: %s", link_tmp_path.b); strbuf_reset(&outbuf); } if(plot && knum == plot_kmer_idx) { status("Plotting tree..."); ltree_write_dot(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end) die("Cannot write plot DOT file to: %s", plot_out_path); strbuf_reset(&outbuf); } } gpath_reader_close(&ctpin); cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links); status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links); status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes); if(save) { // Update JSON nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links; nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links; nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes; char *json_str = cJSON_Print(newhdr); if(gzputs(link_gz, json_str) != (int)strlen(json_str)) die("Cannot write ctp file to: %s", link_out_path); free(json_str); gzputs(link_gz, "\n\n"); gzputs(link_gz, ctp_explanation_comment); gzputs(link_gz, "\n"); fseek(link_tmp_fh, 0, SEEK_SET); char *tmp = ctx_malloc(4*ONE_MEGABYTE); size_t s; while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) { if(gzwrite(link_gz, tmp, s) != (int)s) die("Cannot write to output: %s", link_out_path); } ctx_free(tmp); gzclose(link_gz); fclose(link_tmp_fh); } // Write histogram to file if(hist_fh) { size_t i, j; fprintf(hist_fh, " "); for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j); fprintf(hist_fh, "\n"); for(i = 1; i < hist_distsize; i++) { fprintf(hist_fh, "dist.%02zu", i); for(j = 1; j < hist_covgsize; j++) { fprintf(hist_fh, ",%"PRIu64, hists[i][j]); } fprintf(hist_fh, "\n"); } } if(thresh_fh) { // Use median of first five cutoffs print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh); } if(hist_fh && hist_fh != stdout) fclose(hist_fh); if(list) { fclose(list_fh); } if(plot) { fclose(plot_fh); } ctx_free(hists); cJSON_Delete(newhdr); strbuf_dealloc(&link_tmp_path); ltree_dealloc(<ree); size_buf_dealloc(&countbuf); size_buf_dealloc(&jposbuf); strbuf_dealloc(&kmerbuf); strbuf_dealloc(&juncsbuf); strbuf_dealloc(&seqbuf); strbuf_dealloc(&outbuf); return EXIT_SUCCESS; }
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; GPathReader tmp_gpfile; CorrectAlnInputBuffer *inputs = &args->inputs; args->memargs = (struct MemArgs)MEM_ARGS_INIT; args->fmt = SEQ_FMT_FASTQ; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!args->out_ctp_path,cmd); args->out_ctp_path = optarg; break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&args->gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!args->nthreads, cmd); args->nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(args->fmt == SEQ_FMT_FASTQ, cmd); args->fmt = cmd_parse_format(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_push(inputs, &task, 1); asyncio_task_parse(&inputs->b[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? &tmp_path : NULL); if(correct_cmd) inputs->b[inputs->len-1].out_base = tmp_path; break; case 'M': if(!strcmp(optarg,"FF")) task.matedir = READPAIR_FF; else if(!strcmp(optarg,"FR")) task.matedir = READPAIR_FR; else if(!strcmp(optarg,"RF")) task.matedir = READPAIR_RF; else if(!strcmp(optarg,"RR")) task.matedir = READPAIR_RR; else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR"); used = 0; break; case 'O': fq_offset = cmd_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'l': task.crt_params.frag_len_min = cmd_uint32(cmd, optarg); used = 0; break; case 'L': task.crt_params.frag_len_max = cmd_uint32(cmd, optarg); used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'd': task.crt_params.gap_wiggle = cmd_udouble(cmd, optarg); used = 0; break; case 'D': task.crt_params.gap_variance = cmd_udouble(cmd, optarg); used = 0; break; case 'X': task.crt_params.max_context = cmd_uint32(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': cmd_check(!args->dump_seq_sizes, cmd); args->dump_seq_sizes = optarg; break; case 'G': cmd_check(!args->dump_frag_sizes, cmd); args->dump_frag_sizes = optarg; break; case 'u': args->use_new_paths = true; break; case 'x': gen_paths_print_contigs = true; break; case 'y': gen_paths_print_paths = true; break; case 'z': gen_paths_print_reads = true; break; case 'Z': cmd_check(!args->fq_zero, cmd); if(strlen(optarg) != 1) cmd_print_usage("--fq-zero <c> requires a single char"); args->fq_zero = optarg[0]; break; case 'P': cmd_check(!args->append_orig_seq,cmd); args->append_orig_seq = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" thread/correct -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(args->nthreads == 0) args->nthreads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); // ctx_thread requires output file if(!correct_cmd && !args->out_ctp_path) cmd_print_usage("--out <out.ctp> is required"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path); if(!correct_cmd && file_filter_into_ncols(&gfile->fltr) > 1) die("Please specify a single colour e.g. %s:0", file_filter_path(&gfile->fltr)); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->gpfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.b[i].fltr, 0); if(!correct_cmd && file_filter_into_ncols(&args->gpfiles.b[i].fltr) > 1) { die("Please specify a single colour e.g. %s:0", file_filter_path(&args->gpfiles.b[i].fltr)); } path_max_usedcols = MAX2(path_max_usedcols, file_filter_into_ncols(&args->gpfiles.b[i].fltr)); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_gpaths_compatible(gfile, 1, args->gpfiles.b, args->gpfiles.len, -1); // if no paths loaded, set all max_context values to 1, since >1 kmer only // useful if can pickup paths if(args->gpfiles.len == 0) { for(i = 0; i < inputs->len; i++) inputs->b[i].crt_params.max_context = 1; } // Check frag_len_min < frag_len_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->b[i]; t->files.ptr = t; if(t->crt_params.frag_len_min > t->crt_params.frag_len_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.frag_len_min, t->crt_params.frag_len_max); } correct_aln_input_print(&inputs->b[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.frag_len_max); } futil_create_output(args->dump_seq_sizes); futil_create_output(args->dump_frag_sizes); }
int ctx_contigs(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t i, contig_limit = 0, colour = 0; bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R const char *conf_table_path = NULL; // save confidence table to here bool use_missing_info_check = true, seed_with_unused_paths = false; double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min // Read length and expected depth for calculating confidences size_t genome_size = 0; seq_file_t *tmp_seed_file = NULL; SeqFilePtrBuffer seed_buf; seq_file_ptr_buf_alloc(&seed_buf, 16); GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100], shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_path,cmd); out_path = optarg; break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case '1': case 's': // --seed <in.fa> if((tmp_seed_file = seq_open(optarg)) == NULL) die("Cannot read --seed file: %s", optarg); seq_file_ptr_buf_add(&seed_buf, tmp_seed_file); break; case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break; case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break; case 'N': cmd_check(!contig_limit,cmd); contig_limit = cmd_uint32_nonzero(cmd, optarg); break; case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break; case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break; case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break; case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break; case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break; case 'C': cmd_check(min_cumul_confid < 0,cmd); min_cumul_confid = cmd_udouble(cmd,optarg); if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case 'T': cmd_check(min_step_confid < 0,cmd); min_step_confid = cmd_udouble(cmd,optarg); if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(cmd_no_reseed && cmd_reseed) cmd_print_usage("Cannot specify both -r and -R"); if(contig_limit && seed_with_unused_paths) cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths"); bool sample_with_replacement = cmd_reseed; // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(!seed_buf.len && !contig_limit && sample_with_replacement) { cmd_print_usage("Please specify one or more of: " "--no-reseed | --ncontigs | --seed <in.fa>"); } if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // char *ctx_path = argv[optind]; // // Open Graph file // // GraphFileReader gfile; // memset(&gfile, 0, sizeof(GraphFileReader)); // graph_file_open(&gfile, ctx_path); // Update colours in graph file - sample in 0, all others in 1 // never need more than two colours ncols = gpath_load_sample_pop(gfiles, num_gfiles, gpfiles.b, gpfiles.len, colour); // Check for compatibility between graph files and path files // pop_colour is colour 1 graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1); if(!genome_size) { char nk_str[50]; if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming"); genome_size = ctx_max_kmers; ulong_to_str(genome_size, nk_str); status("Taking number of kmers as genome size: %s", nk_str); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols + !sample_with_replacement; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // Load contig hist distribution from ctp files ZeroSizeBuffer contig_hist; memset(&contig_hist, 0, sizeof(contig_hist)); for(i = 0; i < gpfiles.len; i++) { gpath_reader_load_contig_hist(gpfiles.b[i].json, gpfiles.b[i].fltr.path.b, file_filter_fromcol(&gpfiles.b[i].fltr, 0), &contig_hist); } // Calculate confidences, only for one colour ContigConfidenceTable conf_table; conf_table_alloc(&conf_table, 1); conf_table_update_hist(&conf_table, 0, genome_size, contig_hist.b, contig_hist.len); if(conf_table_path != NULL) { conf_table_save(&conf_table, conf_table_path); } zsize_buf_dealloc(&contig_hist); // // Output file if printing // FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; // Allocate dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); uint8_t *visited = NULL; if(!sample_with_replacement) visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1); // Load graph LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .empty_colours = true}; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &stats); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load path files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); AssembleContigStats assem_stats; assemble_contigs_stats_init(&assem_stats); assemble_contigs(nthreads, seed_buf.b, seed_buf.len, contig_limit, visited, use_missing_info_check, seed_with_unused_paths, min_step_confid, min_cumul_confid, fout, out_path, &assem_stats, &conf_table, &db_graph, 0); // Sample always loaded into colour zero if(fout && fout != stdout) fclose(fout); assemble_contigs_stats_print(&assem_stats); assemble_contigs_stats_destroy(&assem_stats); conf_table_dealloc(&conf_table); for(i = 0; i < seed_buf.len; i++) seq_close(seed_buf.b[i]); seq_file_ptr_buf_dealloc(&seed_buf); ctx_free(visited); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_bubbles(int argc, char **argv) { size_t nthreads = 0; struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t max_allele_len = 0, max_flank_len = 0; bool remove_serial_bubbles = true; // List of haploid colours size_t *hapcols = NULL; int nhapcols = 0; char *hapcols_arg = NULL; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'H': cmd_check(!hapcols_arg, cmd); hapcols_arg = optarg; break; case 'A': cmd_check(!max_allele_len, cmd); max_allele_len = cmd_uint32_nonzero(cmd, optarg); break; case 'F': cmd_check(!max_flank_len, cmd); max_flank_len = cmd_uint32_nonzero(cmd, optarg); break; case 'S': cmd_check(remove_serial_bubbles,cmd); remove_serial_bubbles = false; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(max_allele_len == 0) max_allele_len = DEFAULT_MAX_ALLELE; if(max_flank_len == 0) max_flank_len = DEFAULT_MAX_FLANK; if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)"); // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1); // // Check haploid colours are valid // if(hapcols_arg != NULL) { if((nhapcols = range_get_num(hapcols_arg, ncols)) < 0) die("Invalid haploid colour list: %s", hapcols_arg); hapcols = ctx_calloc(nhapcols, sizeof(hapcols[0])); if(range_parse_array(hapcols_arg, hapcols, ncols) < 0) die("Invalid haploid colour list: %s", hapcols_arg); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, thread_mem; char thread_mem_str[100]; // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) + // visitedfw/rv(2bits/thread) bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0) + ncols + 2*nthreads; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, false, &graph_mem); // Thread memory thread_mem = roundup_bits2bytes(kmers_in_hash) * 2; bytes_to_str(thread_mem * nthreads, 1, thread_mem_str); status("[memory] (of which threads: %zu x %zu = %s)\n", nthreads, thread_mem, thread_mem_str); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); size_t total_mem = graph_mem + thread_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(out_path, "w"); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // // Load graphs // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, NULL); graph_file_close(&gfiles[i]); gprefs.empty_colours = false; } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // Create array of cJSON** from input files cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*)); for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json; // Now call variants BubbleCallingPrefs call_prefs = {.max_allele_len = max_allele_len, .max_flank_len = max_flank_len, .haploid_cols = hapcols, .nhaploid_cols = nhapcols, .remove_serial_bubbles = remove_serial_bubbles}; invoke_bubble_caller(nthreads, &call_prefs, gzout, out_path, hdrs, gpfiles.len, &db_graph); status(" saved to: %s\n", out_path); gzclose(gzout); ctx_free(hdrs); // Close input link files for(i = 0; i < gpfiles.len; i++) gpath_reader_close(&gpfiles.b[i]); gpfile_buf_dealloc(&gpfiles); ctx_free(hapcols); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_exp_abc(int argc, char **argv) { size_t i, nthreads = 0, num_repeats = 0, max_AB_dist = 0; struct MemArgs memargs = MEM_ARGS_INIT; bool print_failed_contigs = false; GPathReader tmp_gpfile; GPathFileBuffer gpfiles; gpfile_buf_alloc(&gpfiles, 8); // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&gpfiles, &tmp_gpfile, 1); break; case 'N': cmd_check(!num_repeats,cmd); num_repeats = cmd_uint32_nonzero(cmd, optarg); break; case 'M': cmd_check(!max_AB_dist,cmd); max_AB_dist = cmd_uint32_nonzero(cmd, optarg); break; case 'P': cmd_check(!print_failed_contigs,cmd); print_failed_contigs = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" exp_abc -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(nthreads == 0) nthreads = DEFAULT_NTHREADS; if(num_repeats == 0) num_repeats = DEFAULT_NUM_REPEATS; if(max_AB_dist == 0) max_AB_dist = DEFAULT_MAX_AB_DIST; if(print_failed_contigs && nthreads != 1) { warn("--print forces nthreads to be one. soz."); nthreads = 1; } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open(&gfile, ctx_path); size_t ncols = file_filter_into_ncols(&gfile.fltr); // Check only loading one colour if(ncols > 1) die("Only implemented for one colour currently"); // Check graph + paths are compatible graphs_gpaths_compatible(&gfile, 1, gpfiles.b, gpfiles.len, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of kmer usage bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, gfile.num_of_kmers, gfile.num_of_kmers, false, &graph_mem); // Paths memory size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false, kmers_in_hash, false); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; cmd_print_mem(path_mem, "paths"); total_mem = graph_mem + path_mem; cmd_check_mem_limit(memargs.mem_to_use, total_mem); // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile.hdr.kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Paths gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph); // Load the graph GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; graph_load(&gfile, gprefs, NULL); graph_file_close(&gfile); hash_table_print_stats(&db_graph.ht); // Load link files for(i = 0; i < gpfiles.len; i++) { gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles.b[i]); } gpfile_buf_dealloc(&gpfiles); status("\n"); status("Test 1: Priming region A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, true, nthreads, num_repeats, max_AB_dist, print_failed_contigs); status("\n"); status("Test 2: Trying to traverse A->B (n: %zu max_AB_dist: %zu)", num_repeats, max_AB_dist); run_exp_abc(&db_graph, false, nthreads, num_repeats, max_AB_dist, print_failed_contigs); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }