static void gpath_save_thread(void *arg) { GPathSaver *wrkr = (GPathSaver*)arg; const dBGraph *db_graph = wrkr->db_graph; GPathSubset subset; StrBuf sbuf; gpath_subset_alloc(&subset); gpath_subset_init(&subset, &wrkr->db_graph->gpstore.gpset); strbuf_alloc(&sbuf, 2 * DEFAULT_IO_BUFSIZE); dBNodeBuffer nbuf; SizeBuffer jposbuf; db_node_buf_alloc(&nbuf, 1024); size_buf_alloc(&jposbuf, 256); HASH_ITERATE_PART(&db_graph->ht, wrkr->threadid, wrkr->nthreads, _gpath_gzsave_node, &sbuf, &subset, wrkr->save_seq ? &nbuf : NULL, wrkr->save_seq ? &jposbuf : NULL, wrkr->gzout, wrkr->outlock, db_graph); _gpath_save_flush(wrkr->gzout, &sbuf, wrkr->outlock); db_node_buf_dealloc(&nbuf); size_buf_dealloc(&jposbuf); gpath_subset_dealloc(&subset); strbuf_dealloc(&sbuf); }
static void run_exp_abc(const dBGraph *db_graph, bool prime_AB, size_t nthreads, size_t num_repeats, size_t max_AB_dist, bool print_failed_contigs) { ExpABCWorker *wrkrs = ctx_calloc(nthreads, sizeof(ExpABCWorker)); size_t i, j; if(max_AB_dist == 0) max_AB_dist = SIZE_MAX; for(i = 0; i < nthreads; i++) { wrkrs[i].colour = 0; wrkrs[i].nthreads = nthreads; wrkrs[i].db_graph = db_graph; wrkrs[i].prime_AB = prime_AB; wrkrs[i].num_limit = num_repeats / nthreads; wrkrs[i].max_AB_dist = max_AB_dist; wrkrs[i].print_failed_contigs = print_failed_contigs; db_node_buf_alloc(&wrkrs[i].nbuf, 1024); graph_walker_alloc(&wrkrs[i].gwlk, db_graph); rpt_walker_alloc(&wrkrs[i].rptwlk, db_graph->ht.capacity, 22); // 4MB } util_run_threads(wrkrs, nthreads, sizeof(ExpABCWorker), nthreads, run_exp_abc_thread); // Merge results size_t num_tests = 0, results[NUM_RESULT_VALUES] = {0}; size_t ab_fail_state[GRPHWLK_NUM_STATES] = {0}; size_t bc_fail_state[GRPHWLK_NUM_STATES] = {0}; for(i = 0; i < nthreads; i++) { num_tests += wrkrs[i].num_tests; for(j = 0; j < NUM_RESULT_VALUES; j++) results[j] += wrkrs[i].results[j]; for(j = 0; j < GRPHWLK_NUM_STATES; j++) ab_fail_state[j] += wrkrs[i].ab_fail_state[j]; for(j = 0; j < GRPHWLK_NUM_STATES; j++) bc_fail_state[j] += wrkrs[i].bc_fail_state[j]; db_node_buf_dealloc(&wrkrs[i].nbuf); graph_walker_dealloc(&wrkrs[i].gwlk); rpt_walker_dealloc(&wrkrs[i].rptwlk); } // Print results char nrunstr[50]; ulong_to_str(num_tests, nrunstr); status("Ran %s tests with %zu threads", nrunstr, nthreads); const char *titles[] = {"RES_ABC_SUCCESS", "RES_AB_WRONG", "RES_AB_FAILED", "RES_BC_WRONG", "RES_BC_FAILED", "RES_BC_OVERSHOT", "RES_LOST_IN_RPT", "RES_NO_TRAVERSAL"}; util_print_nums(titles, results, NUM_RESULT_VALUES, 30); status("AB_FAILED:"); graph_step_print_state_hist(ab_fail_state); status("BC_FAILED:"); graph_step_print_state_hist(bc_fail_state); ctx_free(wrkrs); }
/**
 * Initialise a GraphCache: allocate its four buffers (nodes, supernodes,
 * steps, paths) with an initial capacity of 1024 entries each, create the
 * supernode-id hash and remember which graph the cache belongs to.
 *
 * @param cache     cache to set up
 * @param db_graph  graph stored in cache->db_graph
 */
void graph_cache_alloc(GraphCache *cache, const dBGraph *db_graph)
{
  const size_t init_capacity = 1024;

  db_node_buf_alloc(&cache->node_buf, init_capacity);
  cache_snode_buf_alloc(&cache->snode_buf, init_capacity);
  cache_step_buf_alloc(&cache->step_buf, init_capacity);
  cache_path_buf_alloc(&cache->path_buf, init_capacity);

  cache->snode_hash = kh_init(SnodeIdHash);
  cache->db_graph = db_graph;
}
static void _check_node_paths(const char *kmer, const char **path_strs, size_t npaths, size_t colour, const dBGraph *graph) { TASSERT(strlen(kmer) == graph->kmer_size); const GPath *paths[npaths]; // corresponding to path_strs memset(paths, 0, sizeof(paths)); size_t i, num_paths_seen = 0; const GPathStore *gpstore = &graph->gpstore; dBNode node = db_graph_find_str(graph, kmer); const GPath *path = gpath_store_fetch_traverse(gpstore, node.key); dBNodeBuffer nbuf; SizeBuffer jposbuf; db_node_buf_alloc(&nbuf, 64); size_buf_alloc(&jposbuf, 64); #define MAX_SEQ 128 char seq[MAX_SEQ]; for(; path != NULL; path = path->next) { if(path->orient == node.orient && gpath_has_colour(path, gpstore->gpset.ncols, colour)) { TASSERT(num_paths_seen < npaths); db_node_buf_reset(&nbuf); gpath_fetch(node, path, &nbuf, &jposbuf, colour, graph); if(nbuf.len > MAX_SEQ) die("Too many nodes. Cannot continue. %zu", nbuf.len); db_nodes_to_str(nbuf.b, nbuf.len, graph, seq); TASSERT(strlen(seq) == graph->kmer_size + nbuf.len - 1); for(i = 0; i < npaths; i++) { if(strcmp(path_strs[i],seq) == 0) { TASSERT(paths[i] == NULL, "Duplicate paths: %s", seq); paths[i] = path; break; } } TASSERT2(i < npaths, "Path not found: %s", seq); num_paths_seen++; } } TASSERT(num_paths_seen == npaths); for(i = 0; i < npaths; i++) { TASSERT2(paths[i] != NULL, "path not in graph: %s", path_strs[i]); } db_node_buf_dealloc(&nbuf); size_buf_dealloc(&jposbuf); }
static void pull_out_supernodes(const char **seq, const char **ans, size_t n, const dBGraph *graph) { dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // 1. Check pulling out supernodes works for iterating over the graph uint64_t *visited; visited = ctx_calloc(roundup_bits2words64(graph->ht.capacity), 8); HASH_ITERATE(&graph->ht, supernode_from_kmer, &nbuf, visited, graph, ans, n); ctx_free(visited); // 2. Check pulling out supernodes works when we iterate over inputs size_t i, j, len; dBNode node; char tmpstr[SNODEBUF]; for(i = 0; i < n; i++) { len = strlen(seq[i]); for(j = 0; j+graph->kmer_size <= len; j++) { // Find node node = db_graph_find_str(graph, seq[i]+j); TASSERT(node.key != HASH_NOT_FOUND); // Fetch supernode db_node_buf_reset(&nbuf); supernode_find(node.key, &nbuf, graph); supernode_normalise(nbuf.b, nbuf.len, graph); // Compare TASSERT(nbuf.len < SNODEBUF); db_nodes_to_str(nbuf.b, nbuf.len, graph, tmpstr); if(strcmp(tmpstr, ans[i]) != 0) { test_status("Got: %s from ans[i]:%s\n", tmpstr, ans[i]); } TASSERT(strcmp(tmpstr, ans[i]) == 0); } } db_node_buf_dealloc(&nbuf); }
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, 
nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
/**
 * Allocate the two buffers held by a dBAlignment: the node buffer
 * (aln->nodes) and the int32 buffer (aln->rpos). Both start with room for
 * INIT_BUFLEN entries.
 *
 * @param aln  alignment whose buffers are allocated
 */
void db_alignment_alloc(dBAlignment *aln)
{
  const size_t init_len = INIT_BUFLEN;

  db_node_buf_alloc(&aln->nodes, init_len);
  int32_buf_alloc(&aln->rpos, init_len);
}
static void test_repeat_loop() { TASSERT(sizeof(FollowPath) == 20); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 11, ncols = 1; // Set up alignment correction params CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0, .ins_gap_min = 0, .ins_gap_max = 0, .one_way_gap_traverse = true, .use_end_check = true, .max_context = 10, .gap_variance = 0.1, .gap_wiggle = 5}; // Sequence with repeat char seq[] = "ATTTGGAACTCCGGA" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "CGTCAGGAGCTAACT"; char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT"; char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; // Allocate graph, but don't add any sequence _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params); GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL); GraphWalker gwlk; RepeatWalker rptwlk; graph_walker_alloc(&gwlk); rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12); dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // Construct graph but no paths build_graph_from_str_mt(&graph, 0, seq, strlen(seq)); TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers); // Find first node in sequence dBNode node0 = db_graph_find_str(&graph, seq); TASSERT(node0.key != HASH_NOT_FOUND); // 1) With no paths char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0); // 2) Add small paths - produces collapsed down seq with two copy repeat gen_paths_from_str_mt(gen_path_wrkr, p0, params); gen_paths_from_str_mt(gen_path_wrkr, p1, params); char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1); // 3) Add long paths gen_paths_from_str_mt(gen_path_wrkr, seq, params); test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq); 
graph_walker_dealloc(&gwlk); rpt_walker_dealloc(&rptwlk); db_node_buf_dealloc(&nbuf); gen_paths_workers_dealloc(gen_path_wrkr, 1); db_graph_dealloc(&graph); } void test_repeat_walker() { test_status("Testing repeat_walker.h"); test_repeat_loop(); }
// Load each sequence into a separate colour static void test_bubbles(dBGraph *graph, const char **seqs, size_t nseqs, const char *flank5p, const char *flank3p, const char **alleles, size_t nalleles) { db_graph_reset(graph); TASSERT(graph->num_of_cols >= nseqs); size_t i; for(i = 0; i < nseqs; i++) build_graph_from_str_mt(graph, i, seqs[i], strlen(seqs[i]), false); graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1); StrBuf sbuf; dBNodeBuffer nbuf; strbuf_alloc(&sbuf, 128); db_node_buf_alloc(&nbuf, 128); BubbleCallingPrefs prefs = {.max_allele_len = 100, .max_flank_len = 100, .haploid_cols = NULL, .nhaploid_cols = 0, .remove_serial_bubbles = true}; BubbleCaller *caller = bubble_callers_new(1, &prefs, NULL, graph); _call_bubble(caller, flank5p, flank3p, alleles, nalleles, &nbuf, &sbuf); strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); bubble_callers_destroy(caller, 1); } void test_bubble_caller() { test_status("Testing bubble calling..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // mutations: x const char *seqs0[] = {"AGGGATAAAACTCTGTACTGGATCTCCCT", "AGGGATAAAACTCTcTACTGGATCTCCCT"}; const char flank5p0[] = "AGGGATAAAACTCT"; const char flank3p0[] = "TACTGGATCTCCCT"; const char *alleles0[] = {"ATAAAACTCTGTACTGGATCT", "ATAAAACTCTcTACTGGATCT"}; test_bubbles(&graph, seqs0, 2, flank5p0, flank3p0, alleles0, 2); // mutations: x y const char *seqs1[] = {"CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA", "CCCGTAGGTAAGtGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA", "CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACtTTGGGACACGAGTTGATA"}; // forwards const char flank5p1a[] = "CCCGTAGGTAAG"; const char flank3p1a[] = "GCGTTAGTGCAAGGCCAC"; const char *alleles1a[] = {"CGTAGGTAAGGGCGTTAGTGC", "CGTAGGTAAGtGCGTTAGTGC"}; const char flank5p1b[] = "GCGTTAGTGCAAGGCCAC"; const char flank3p1b[] = "TTGGGACACGAGTTGATA"; 
const char *alleles1b[] = {"GCAAGGCCACATTGGGACACG", "GCAAGGCCACtTTGGGACACG"}; test_bubbles(&graph, seqs1, 3, flank5p1a, flank3p1a, alleles1a, 2); test_bubbles(&graph, seqs1, 3, flank5p1b, flank3p1b, alleles1b, 2); // reverse // mutations: y x // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCCCTTACCTACGGG // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCaCTTACCTACGGG // TATCAACTCGTGTCCCAAaGTGGCCTTGCACTAACGCCCTTACCTACGGG // const char flank5p1c[] = "GTGGCCTTGCACTAACGC"; const char flank3p1c[] = "CTTACCTACGGG"; const char *alleles1c[] = {"GCACTAACGCCCTTACCTACG", "GCACTAACGCaCTTACCTACG"}; const char flank5p1d[] = "TATCAACTCGTGTCCCAA"; const char flank3p1d[] = "GTGGCCTTGCACTAACGC"; const char *alleles1d[] = {"CGTGTCCCAATGTGGCCTTGC", "CGTGTCCCAAaGTGGCCTTGC"}; test_bubbles(&graph, seqs1, 3, flank5p1c, flank3p1c, alleles1c, 2); test_bubbles(&graph, seqs1, 3, flank5p1d, flank3p1d, alleles1d, 2); db_graph_dealloc(&graph); }
/**
 * Allocate and initialise `num_callers` breakpoint callers.
 *
 * All callers share one output mutex, one call-id counter and a single
 * PathRefRun allocation; each caller is given its own slice of that
 * allocation, plus private node buffers, kmer-run buffers and two graph
 * crawlers.
 *
 * @param num_callers    number of callers to create; must be > 0
 * @param gzout          output handle stored in every caller
 * @param min_ref_flank  stored as min_ref_nkmers
 * @param max_ref_flank  stored as max_ref_nkmers
 * @param kograph        stored in every caller
 * @param db_graph       graph stored in every caller and given to crawlers
 * @return array of num_callers callers; release with brkpt_callers_destroy()
 */
static BreakpointCaller* brkpt_callers_new(size_t num_callers,
                                           gzFile gzout,
                                           size_t min_ref_flank,
                                           size_t max_ref_flank,
                                           const KOGraph kograph,
                                           const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  const size_t ncols = db_graph->num_of_cols;

  BreakpointCaller *callers = ctx_malloc(num_callers * sizeof(BreakpointCaller));

  // Shared between all callers
  pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed");

  size_t *callid = ctx_calloc(1, sizeof(size_t));

  // Each colour in each caller can have a GraphCache path at once
  PathRefRun *path_ref_runs
    = ctx_calloc(num_callers*MAX_REFRUNS_PER_CALLER(ncols), sizeof(PathRefRun));

  size_t i;
  for(i = 0; i < num_callers; i++)
  {
    // Build in a temporary and copy across as one block
    BreakpointCaller tmp = {.threadid = i,
                            .nthreads = num_callers,
                            .kograph = kograph,
                            .db_graph = db_graph,
                            .gzout = gzout,
                            .out_lock = out_lock,
                            .callid = callid,
                            .allele_refs = path_ref_runs,
                            .flank5p_refs = path_ref_runs+MAX_REFRUNS_PER_ORIENT(ncols),
                            .min_ref_nkmers = min_ref_flank,
                            .max_ref_nkmers = max_ref_flank};

    memcpy(&callers[i], &tmp, sizeof(BreakpointCaller));

    // Advance to the next caller's slice; callers[0].allele_refs therefore
    // remains the base of the whole allocation
    path_ref_runs += MAX_REFRUNS_PER_CALLER(ncols);

    db_node_buf_alloc(&callers[i].allelebuf, 1024);
    db_node_buf_alloc(&callers[i].flank5pbuf, 1024);
    kmer_run_buf_alloc(&callers[i].koruns_5p, 128);
    kmer_run_buf_alloc(&callers[i].koruns_5p_ended, 128);
    kmer_run_buf_alloc(&callers[i].koruns_3p, 128);
    kmer_run_buf_alloc(&callers[i].koruns_3p_ended, 128);
    kmer_run_buf_alloc(&callers[i].allele_run_buf, 128);
    kmer_run_buf_alloc(&callers[i].flank5p_run_buf, 128);
    graph_crawler_alloc(&callers[i].crawlers[0], db_graph);
    graph_crawler_alloc(&callers[i].crawlers[1], db_graph);
  }

  return callers;
}

/**
 * Release everything allocated by brkpt_callers_new().
 *
 * @param callers      array returned by brkpt_callers_new()
 * @param num_callers  number of callers in the array; must be > 0 because the
 *                     shared resources are reached through callers[0]
 */
static void brkpt_callers_destroy(BreakpointCaller *callers, size_t num_callers)
{
  // callers[0] is read unconditionally below, so enforce the same
  // precondition as brkpt_callers_new()
  ctx_assert(num_callers > 0);

  size_t i;
  for(i = 0; i < num_callers; i++)
  {
    db_node_buf_dealloc(&callers[i].allelebuf);
    db_node_buf_dealloc(&callers[i].flank5pbuf);
    kmer_run_buf_dealloc(&callers[i].koruns_5p);
    kmer_run_buf_dealloc(&callers[i].koruns_5p_ended);
    kmer_run_buf_dealloc(&callers[i].koruns_3p);
    kmer_run_buf_dealloc(&callers[i].koruns_3p_ended);
    kmer_run_buf_dealloc(&callers[i].allele_run_buf);
    kmer_run_buf_dealloc(&callers[i].flank5p_run_buf);
    graph_crawler_dealloc(&callers[i].crawlers[0]);
    graph_crawler_dealloc(&callers[i].crawlers[1]);
  }

  // Shared resources: the lock, the call counter and the PathRefRun block
  // (whose base address is callers[0].allele_refs)
  pthread_mutex_destroy(callers[0].out_lock);
  ctx_free(callers[0].out_lock);
  ctx_free(callers[0].callid);
  ctx_free(callers[0].allele_refs);
  ctx_free(callers);
}
/**
 * Allocate and initialise `num_callers` bubble callers.
 *
 * The callers share one output mutex and one bubble counter; everything else
 * (haploid flags, node buffers, walkers, cache, step-pointer buffers and the
 * output string buffer) is private to each caller.
 *
 * @param num_callers  number of callers to create; must be > 0
 * @param prefs        calling preferences, copied into every caller
 * @param gzout        output handle stored in every caller
 * @param db_graph     graph stored in every caller
 * @return array of num_callers callers; release with bubble_callers_destroy()
 */
BubbleCaller* bubble_callers_new(size_t num_callers,
                                 BubbleCallingPrefs prefs,
                                 gzFile gzout,
                                 const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  // Max usage is 4 * max_allele_len * cols
  size_t c;
  size_t max_path_len = MAX2(prefs.max_flank_len, prefs.max_allele_len);

  BubbleCaller *callers = ctx_malloc(num_callers * sizeof(BubbleCaller));

  // Shared by all callers
  pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed");

  size_t *num_bubbles_ptr = ctx_calloc(1, sizeof(size_t));

  for(c = 0; c < num_callers; c++)
  {
    BubbleCaller *bc = &callers[c];

    // Build in a temporary and copy across as one block
    BubbleCaller tmp = {.threadid = c,
                        .nthreads = num_callers,
                        .haploid_seen = ctx_calloc(1+prefs.num_haploid, sizeof(bool)),
                        .num_bubbles_ptr = num_bubbles_ptr,
                        .prefs = prefs,
                        .db_graph = db_graph,
                        .gzout = gzout,
                        .out_lock = out_lock};

    memcpy(bc, &tmp, sizeof(BubbleCaller));

    // First two buffers don't actually need to grow
    db_node_buf_alloc(&bc->flank5p, prefs.max_flank_len);
    db_node_buf_alloc(&bc->pathbuf, max_path_len);

    graph_walker_alloc(&bc->wlk, db_graph);
    rpt_walker_alloc(&bc->rptwlk, db_graph->ht.capacity, 22); // 4MB

    graph_cache_alloc(&bc->cache, db_graph);
    cache_stepptr_buf_alloc(&bc->spp_forward, 1024);
    cache_stepptr_buf_alloc(&bc->spp_reverse, 1024);
    strbuf_alloc(&bc->output_buf, 2048);
  }

  return callers;
}

/**
 * Release everything allocated by bubble_callers_new().
 *
 * @param callers      array returned by bubble_callers_new()
 * @param num_callers  number of callers in the array; must be > 0
 */
void bubble_callers_destroy(BubbleCaller *callers, size_t num_callers)
{
  ctx_assert(num_callers > 0);

  size_t c;
  for(c = 0; c < num_callers; c++)
  {
    BubbleCaller *bc = &callers[c];

    ctx_free(bc->haploid_seen);

    db_node_buf_dealloc(&bc->flank5p);
    db_node_buf_dealloc(&bc->pathbuf);

    rpt_walker_dealloc(&bc->rptwlk);
    graph_walker_dealloc(&bc->wlk);

    graph_cache_dealloc(&bc->cache);
    cache_stepptr_buf_dealloc(&bc->spp_forward);
    cache_stepptr_buf_dealloc(&bc->spp_reverse);
    strbuf_dealloc(&bc->output_buf);
  }

  // The lock and the counter are shared, so free them once via callers[0]
  pthread_mutex_destroy(callers[0].out_lock);
  ctx_free(callers[0].out_lock);
  ctx_free(callers[0].num_bubbles_ptr);
  ctx_free(callers);
}