/**
 * Allocate `graph` together with its per-colour arrays and path store, load
 * every string in `seqs` into colour 0, then generate paths from those same
 * strings using a single path-generation worker.
 *
 * @param graph        graph to allocate and populate
 * @param kmer_size    kmer size handed to db_graph_alloc()
 * @param ncols        number of colours; scales the edge, coverage and
 *                     node-in-colour arrays
 * @param seqs         array of `nseqs` NUL-terminated sequences (never
 *                     dereferenced when `nseqs` is 0)
 * @param nseqs        number of entries in `seqs`
 * @param path_params  alignment-correction parameters passed to the path
 *                     generator for each sequence
 */
void _construct_graph_with_paths(dBGraph *graph,
                                 size_t kmer_size, size_t ncols,
                                 char **seqs, size_t nseqs,
                                 CorrectAlnParam path_params)
{
  const size_t num_workers = 1;
  size_t seq_idx;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024);

  // Arrays hanging off the graph: bucket locks, then one edges / coverage /
  // in-colour slot per (hash entry, colour)
  graph->bktlocks
    = ctx_calloc(roundup_bits2bytes(graph->ht.num_of_buckets), 1);
  graph->col_edges
    = ctx_calloc(graph->ht.capacity * ncols, sizeof(Edges));
  graph->col_covgs
    = ctx_calloc(graph->ht.capacity * ncols, sizeof(Covg));
  graph->node_in_cols
    = ctx_calloc(roundup_bits2bytes(graph->ht.capacity) * ncols, 1);

  // Path store plus one lock bit per hash entry
  path_store_alloc(&graph->pstore, 1024, true, graph->ht.capacity, ncols);
  graph->pstore.kmer_locks
    = ctx_calloc(roundup_bits2bytes(graph->ht.capacity), 1);

  // First pass: add each sequence's kmers to colour 0
  for(seq_idx = 0; seq_idx < nseqs; seq_idx++) {
    build_graph_from_str_mt(graph, 0, seqs[seq_idx], strlen(seqs[seq_idx]));
  }

  // Report at least one colour in use, even if no sequence was loaded
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // Second pass: thread each sequence through the graph to create paths
  GenPathWorker *worker = gen_paths_workers_alloc(num_workers, graph, NULL);

  for(seq_idx = 0; seq_idx < nseqs; seq_idx++) {
    gen_paths_from_str_mt(worker, seqs[seq_idx], path_params);
  }

  gen_paths_workers_dealloc(worker, num_workers);
}
/**
 * Thread each of `seqs` through `graph` with a single worker and optionally
 * check how many paths / kmers-with-paths were added.
 *
 * @param graph       graph whose gpstore receives the new paths
 * @param seqs        array of `nseqs` NUL-terminated sequences
 * @param nseqs       number of entries in `seqs`
 * @param params      alignment-correction parameters stored in the task
 * @param exp_npaths  expected increase in gpstore.num_paths; negative skips
 *                    the check
 * @param exp_nkmers  expected increase in gpstore.num_kmers_with_paths;
 *                    negative skips the check
 */
void all_tests_add_paths_multi(dBGraph *graph,
                               const char **seqs, size_t nseqs,
                               CorrectAlnParam params,
                               int exp_npaths, int exp_nkmers)
{
  // Snapshot the counters so the checks below measure only what we add
  size_t npaths = graph->gpstore.num_paths;
  size_t nkmers = graph->gpstore.num_kmers_with_paths;

  const size_t num_workers = 1;
  size_t seq_idx;

  GenPathWorker *workers = gen_paths_workers_alloc(num_workers, graph);

  // Describe an input with no backing files: reads are injected directly
  AsyncIOInput no_files = {.file1 = NULL, .file2 = NULL,
                           .fq_offset = 0, .interleaved = false};

  CorrectAlnInput job = {.files = no_files,
                         .fq_cutoff = 0, .hp_cutoff = 0,
                         .matedir = READPAIR_FR,
                         .crt_params = params,
                         .out_base = NULL, .output = NULL};

  // Read container: only r1 is filled in, r2 stays reset
  AsyncIOData reads;
  asynciodata_alloc(&reads);
  seq_read_reset(&reads.r2);
  reads.fq_offset2 = 0;
  reads.fq_offset1 = 0;
  reads.ptr = NULL;

  // Feed the sequences to the worker one at a time
  for(seq_idx = 0; seq_idx < nseqs; seq_idx++)
  {
    seq_read_set(&reads.r1, seqs[seq_idx]);
    gen_paths_worker_seq(workers, &reads, &job);
  }

  asynciodata_dealloc(&reads);
  gen_paths_workers_dealloc(workers, num_workers);

  // Verify the number of paths added, if the caller asked for it
  if(exp_npaths >= 0)
  {
    TASSERT2(graph->gpstore.num_paths == npaths + (size_t)exp_npaths,
             "%zu %zu %zu", (size_t)graph->gpstore.num_paths,
             (size_t)npaths, (size_t)exp_npaths);
  }

  // Verify the number of kmers that gained paths, if the caller asked for it
  if(exp_nkmers >= 0)
  {
    TASSERT(graph->gpstore.num_kmers_with_paths == nkmers + (size_t)exp_nkmers);
  }
}

/**
 * Single-sequence convenience wrapper around all_tests_add_paths_multi().
 */
void all_tests_add_paths(dBGraph *graph,
                         const char *seq,
                         CorrectAlnParam params,
                         int exp_npaths, int exp_nkmers)
{
  all_tests_add_paths_multi(graph, &seq, 1, params, exp_npaths, exp_nkmers);
}

/**
 * Allocate `graph` (with edges, coverages, node-in-colour bits and bucket
 * locks), its path store and path hash, load `seqs` into colour 0, then add
 * paths for the same sequences without checking path counts.
 *
 * @param graph        graph to allocate and populate
 * @param kmer_size    kmer size handed to db_graph_alloc()
 * @param ncols        number of colours for the graph and path store
 * @param seqs         array of `nseqs` NUL-terminated sequences
 * @param nseqs        number of entries in `seqs`
 * @param path_params  alignment-correction parameters used when adding paths
 */
void all_tests_construct_graph(dBGraph *graph,
                               size_t kmer_size, size_t ncols,
                               const char **seqs, size_t nseqs,
                               CorrectAlnParam path_params)
{
  size_t seq_idx;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024,
                 DBG_ALLOC_EDGES | DBG_ALLOC_COVGS |
                 DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  // Storage for paths
  gpath_store_alloc(&graph->gpstore, ncols, graph->ht.capacity,
                    0, ONE_MEGABYTE, true, false);

  // Keep newly written links separate so they are not used to add new links
  gpath_store_split_read_write(&graph->gpstore);

  // Path hash table is allocated up front in case it is needed
  gpath_hash_alloc(&graph->gphash, &graph->gpstore, ONE_MEGABYTE);

  // Load every sequence into colour 0
  for(seq_idx = 0; seq_idx < nseqs; seq_idx++)
  {
    build_graph_from_str_mt(graph, 0, seqs[seq_idx], strlen(seqs[seq_idx]),
                            false);
  }

  gpath_store_merge_read_write(&graph->gpstore);

  // Report at least one colour in use, even if no sequence was loaded
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // -1, -1: add paths without asserting on how many were created
  all_tests_add_paths_multi(graph, seqs, nseqs, path_params, -1, -1);
}
int ctx_thread(int argc, char **argv) { struct ReadThreadCmdArgs args; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, false); GraphFileReader *gfile = &args.gfile; GPathFileBuffer *gpfiles = &args.gpfiles; CorrectAlnInputBuffer *inputs = &args.inputs; size_t i; if(args.zero_link_counts && gpfiles->len == 0) cmd_print_usage("-0,--zero-paths without -p,--paths <in.ctp> has no meaning"); // Check each path file only loads one colour gpaths_only_for_colour(gpfiles->b, gpfiles->len, 0); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, total_mem; size_t path_hash_mem, path_store_mem, path_mem; bool sep_path_list = (!args.use_new_paths && gpfiles->len > 0); bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 + 2 * args.nthreads; // Have traversed // false -> don't use mem_to_use to decide how many kmers to store in hash // since we need some of that memory for storing paths kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, gfile->num_of_kmers, gfile->num_of_kmers, false, &graph_mem); // Paths memory size_t min_path_mem = 0; gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem); if(graph_mem + min_path_mem > args.memargs.mem_to_use) { char buf[50]; die("Require at least %s memory", bytes_to_str(graph_mem+min_path_mem, 1, buf)); } path_mem = args.memargs.mem_to_use - graph_mem; size_t pentry_hash_mem = sizeof(GPEntry)/0.7; size_t pentry_store_mem = sizeof(GPath) + 8 + // struct + sequence 1 + // in colour sizeof(uint8_t) + // counts sizeof(uint32_t); // kmer length size_t max_paths = path_mem / (pentry_store_mem + pentry_hash_mem); path_store_mem = max_paths * pentry_store_mem; path_hash_mem = max_paths * pentry_hash_mem; cmd_print_mem(path_hash_mem, "paths hash"); cmd_print_mem(path_store_mem, "paths store"); total_mem = graph_mem + path_mem; 
cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Open output file // gzFile gzout = futil_gzopen_create(args.out_ctp_path, "w"); status("Creating paths file: %s", futil_outpath_str(args.out_ctp_path)); // // Allocate memory // dBGraph db_graph; size_t kmer_size = gfile->hdr.kmer_size; db_graph_alloc(&db_graph, kmer_size, 1, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Split path memory 2:1 between store and hash // Create a path store that tracks path counts gpath_store_alloc(&db_graph.gpstore, db_graph.num_of_cols, db_graph.ht.capacity, 0, path_store_mem, true, sep_path_list); // Create path hash table for fast lookup gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, path_hash_mem); if(args.use_new_paths) { status("Using paths as they are added (risky)"); } else { status("Not using new paths as they are added (safe)"); } // // Start up workers to add paths to the graph // GenPathWorker *workers; workers = gen_paths_workers_alloc(args.nthreads, &db_graph); // Setup for loading graphs graph LoadingStats gstats; loading_stats_init(&gstats); // Path statistics LoadingStats *load_stats = gen_paths_get_stats(workers); CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers); // Load contig hist distribution for(i = 0; i < gpfiles->len; i++) { gpath_reader_load_contig_hist(gpfiles->b[i].json, gpfiles->b[i].fltr.path.b, file_filter_fromcol(&gpfiles->b[i].fltr, 0), &aln_stats->contig_histgrm); } GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // already loaded paths // Load graph, print stats, close file graph_load(gfile, gprefs, &gstats); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load existing paths for(i = 0; i < gpfiles->len; i++) gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph); // zero link counts of already loaded links if(args.zero_link_counts) { status("Zeroing link 
counts for loaded links"); gpath_set_zero_nseen(&db_graph.gpstore.gpset); } if(!args.use_new_paths) gpath_store_split_read_write(&db_graph.gpstore); // Deal with a set of files at once // Can have different numbers of inputs vs threads size_t start, end; for(start = 0; start < inputs->len; start += MAX_IO_THREADS) { end = MIN2(inputs->len, start+MAX_IO_THREADS); generate_paths(inputs->b+start, end-start, workers, args.nthreads); } // Print memory statistics gpath_hash_print_stats(&db_graph.gphash); gpath_store_print_stats(&db_graph.gpstore); correct_aln_dump_stats(aln_stats, load_stats, args.dump_seq_sizes, args.dump_frag_sizes, db_graph.ht.num_kmers); // Don't need GPathHash anymore gpath_hash_dealloc(&db_graph.gphash); cJSON **hdrs = ctx_malloc(gpfiles->len * sizeof(cJSON*)); for(i = 0; i < gpfiles->len; i++) hdrs[i] = gpfiles->b[i].json; size_t output_threads = MIN2(args.nthreads, MAX_IO_THREADS); // Generate a cJSON header for all inputs cJSON *thread_hdr = cJSON_CreateObject(); cJSON *inputs_hdr = cJSON_CreateArray(); cJSON_AddItemToObject(thread_hdr, "inputs", inputs_hdr); for(i = 0; i < inputs->len; i++) cJSON_AddItemToArray(inputs_hdr, correct_aln_input_json_hdr(&inputs->b[i])); // Write output file gpath_save(gzout, args.out_ctp_path, output_threads, true, "thread", thread_hdr, hdrs, gpfiles->len, &aln_stats->contig_histgrm, 1, &db_graph); gzclose(gzout); ctx_free(hdrs); // Optionally run path checks for debugging // gpath_checks_all_paths(&db_graph, args.nthreads); // ins_gap, err_gap no longer allocated after this line gen_paths_workers_dealloc(workers, args.nthreads); // Close and free input files etc. read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
static void test_repeat_loop() { TASSERT(sizeof(FollowPath) == 20); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 11, ncols = 1; // Set up alignment correction params CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0, .ins_gap_min = 0, .ins_gap_max = 0, .one_way_gap_traverse = true, .use_end_check = true, .max_context = 10, .gap_variance = 0.1, .gap_wiggle = 5}; // Sequence with repeat char seq[] = "ATTTGGAACTCCGGA" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "CGTCAGGAGCTAACT"; char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT"; char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; // Allocate graph, but don't add any sequence _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params); GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL); GraphWalker gwlk; RepeatWalker rptwlk; graph_walker_alloc(&gwlk); rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12); dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // Construct graph but no paths build_graph_from_str_mt(&graph, 0, seq, strlen(seq)); TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers); // Find first node in sequence dBNode node0 = db_graph_find_str(&graph, seq); TASSERT(node0.key != HASH_NOT_FOUND); // 1) With no paths char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0); // 2) Add small paths - produces collapsed down seq with two copy repeat gen_paths_from_str_mt(gen_path_wrkr, p0, params); gen_paths_from_str_mt(gen_path_wrkr, p1, params); char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1); // 3) Add long paths gen_paths_from_str_mt(gen_path_wrkr, seq, params); test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq); 
graph_walker_dealloc(&gwlk); rpt_walker_dealloc(&rptwlk); db_node_buf_dealloc(&nbuf); gen_paths_workers_dealloc(gen_path_wrkr, 1); db_graph_dealloc(&graph); } void test_repeat_walker() { test_status("Testing repeat_walker.h"); test_repeat_loop(); }