// Traverse from node0 -> node1 static void traverse_5pflank(BreakpointCaller *caller, GraphCrawler *crawler, dBNode node0, dBNode node1) { const dBGraph *db_graph = crawler->cache.db_graph; dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, num_next; BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, node0.key); num_next = db_graph_next_nodes(db_graph, bkmer0, node0.orient, db_node_edges(db_graph, node0.key, 0), next_nodes, next_nucs); // Find index of previous node for(i = 0; i < num_next && !db_nodes_are_equal(next_nodes[i],node1); i++) {} ctx_assert(i < num_next && db_nodes_are_equal(next_nodes[i],node1)); kmer_run_buf_reset(&caller->koruns_5p); kmer_run_buf_reset(&caller->koruns_5p_ended); kmer_run_buf_reset(&caller->flank5p_run_buf); // Go backwards to get 5p flank // NULL means loop from 0..(ncols-1) graph_crawler_fetch(crawler, node0, next_nodes, next_nucs, i, num_next, NULL, db_graph->num_of_cols, gcrawler_flank5p_stop_at_ref_covg, gcrawler_flank5p_finish_ref_covg, caller); }
// graph_crawler_fetch(): crawl from `node0` through `next_nodes[take_idx]`
// once per requested colour.
//
// Parameters, as used by the visible code:
//   crawler    - supplies the GraphCache, GraphWalker, RepeatWalker and the
//                `unicol_paths` / `col_paths` arrays that are written to
//   node0      - node the walk starts from
//   next_nodes - the `num_next` nodes following node0
//   take_idx   - index into next_nodes of the node to step to
//                (asserted < num_next, and asserted to differ from node0)
//   cols/ncols - colours to crawl; if `cols` is NULL colours 0..ncols-1 are used
//   jmpfunc    - forwarded to graph_crawler_load_path()
//   endfunc    - if non-NULL, called with (cache, pathid, arg) after each path
//   arg        - opaque pointer passed through to both callbacks
//
// For each colour in which both node0 and the chosen next node are present:
// counts how many of next_nodes have that colour (stopping once more than one
// is seen) to decide `is_fork`, sets up and starts the walker at node0, forces
// it onto the chosen node, loads the path, calls `endfunc`, finishes the
// walker, resets the repeat walker and appends {colour, pathid} to
// `unicol_paths`. Colours that fail the presence check get pathid -1.
// Either way the pathid is stored in crawler->col_paths[col].
//
// NOTE(review): the comment below mentions `next_base`, which is not a
// parameter of this signature - presumably left over from an earlier version.
// NOTE(review): `pathid` is an int (so it can hold -1) and receives the return
// value of graph_crawler_load_path(); confirm that conversion is intended.
// NOTE(review): this definition is cut off in the visible source - the code
// after the colour loop and the function's closing brace are not shown.
// `node1` should be the first node of a supernode // `node0` should be the previous node // `next_base` is the last base of `node1` // `jmpfunc` is called with each supernode traversed and if it returns true // we continue crawling, otherwise we stop // `endfunc` is a function called at the end of traversal void graph_crawler_fetch(GraphCrawler *crawler, dBNode node0, dBNode next_nodes[4], size_t take_idx, size_t num_next, uint32_t *cols, size_t ncols, bool (*jmpfunc)(GraphCache *_cache, GCacheStep *_step, void *_arg), void (*endfunc)(GraphCache *_cache, uint32_t _pathid, void *_arg), void *arg) { const dBGraph *db_graph = crawler->cache.db_graph; GraphCache *cache = &crawler->cache; GraphWalker *wlk = &crawler->wlk; RepeatWalker *rptwlk = &crawler->rptwlk; GCUniColPath *unipaths = crawler->unicol_paths; ctx_assert(take_idx < num_next); ctx_assert(!db_nodes_are_equal(node0, next_nodes[take_idx])); // Fetch all paths in all colours dBNode node1 = next_nodes[take_idx]; bool is_fork; size_t i, c, col, nedges_cols, num_unicol_paths = 0; int pathid; for(c = 0; c < ncols; c++) { col = (cols != NULL ? cols[c] : c); if(db_node_has_col(db_graph, node0.key, col) && db_node_has_col(db_graph, node1.key, col)) { // Determine if this fork is a fork in the current colour for(nedges_cols = 0, i = 0; i < num_next && nedges_cols <= 1; i++) nedges_cols += db_node_has_col(db_graph, next_nodes[i].key, col); is_fork = (nedges_cols > 1); graph_walker_setup(wlk, true, col, col, db_graph); graph_walker_start(wlk, node0); graph_walker_force(wlk, node1, is_fork); pathid = graph_crawler_load_path(cache, node1, wlk, rptwlk, jmpfunc, arg); if(endfunc != NULL) endfunc(cache, pathid, arg); graph_walker_finish(wlk); graph_crawler_reset_rpt_walker(rptwlk, cache, pathid); unipaths[num_unicol_paths++] = (GCUniColPath){.colour = col, .pathid = pathid}; } else pathid = -1; crawler->col_paths[col] = pathid; }
// Constructs a path of supernodes (SupernodePath) // `wlk` GraphWalker should be set to go at `node` // `rptwlk` RepeatWalker should be clear // `jmpfunc` is called with each supernode traversed and if it returns true // we continue crawling, otherwise we stop. If NULL assume always true // returns pathid in GraphCache uint32_t graph_crawler_load_path(GraphCache *cache, dBNode node, GraphWalker *wlk, RepeatWalker *rptwlk, bool (*jmpfunc)(GraphCache *_cache, GCacheStep *_step, void *_arg), void *arg) { size_t i; uint32_t stepid, pathid = graph_cache_new_path(cache); ctx_assert(db_nodes_are_equal(wlk->node, node)); for(i = 0; ; i++) { stepid = graph_cache_new_step(cache, node); GCacheStep *step = graph_cache_step(cache, stepid); GCacheSnode *snode = graph_cache_snode(cache, step->supernode); // Traverse to the end of the supernode walk_supernode_end(cache, snode, step->orient, wlk); if(jmpfunc != NULL && !jmpfunc(cache, step, arg)) break; // Find next node uint8_t num_edges; const dBNode *next_nodes; Nucleotide next_bases[4]; if(step->orient == FORWARD) { num_edges = snode->num_next; next_nodes = snode->next_nodes; binary_seq_unpack_byte(next_bases, snode->next_bases); } else { num_edges = snode->num_prev; next_nodes = snode->prev_nodes; binary_seq_unpack_byte(next_bases, snode->prev_bases); } // Traverse to next supernode if(!graph_walker_next_nodes(wlk, num_edges, next_nodes, next_bases) || !rpt_walker_attempt_traverse(rptwlk, wlk)) break; node = wlk->node; } return pathid; }
// Check we can walk along a set of nodes through the graph // If @allow_extend is true, traverse past the end of the buffer and add nodes static inline int confirm_seq(size_t startidx, bool allow_extend, GraphWalker *wlk, RepeatWalker *rpt, dBNodeBuffer *nbuf, size_t colour, const dBGraph *db_graph) { ctx_assert(startidx < nbuf->len); size_t i, init_len = nbuf->len; graph_walker_setup(wlk, true, colour, colour, db_graph); graph_walker_start(wlk, nbuf->b[startidx]); for(i = startidx+1; graph_walker_next(wlk); i++) { if(!rpt_walker_attempt_traverse(rpt, wlk)) { reset(wlk,rpt,nbuf); return CONFIRM_REPEAT; } if(i < init_len) { if(!db_nodes_are_equal(nbuf->b[i], wlk->node)) { reset(wlk,rpt,nbuf); return CONFIRM_WRONG; } } else { db_node_buf_add(nbuf, wlk->node); if(!allow_extend) { reset(wlk,rpt,nbuf); nbuf->len--; // Remove node we added return CONFIRM_OVERSHOT; } } } // printf("stopped %zu / %zu %zu\n", i, init_len, nbuf->len); reset(wlk,rpt,nbuf); return i < init_len ? CONFIRM_SHORT : CONFIRM_SUCCESS; }
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, 
nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
// Run the A->B->C check for a single node `node` (B), using the walker,
// repeat walker, node buffer, colour and settings held in `wrkr`.
//
// Steps:
//  1. Walk from B, collecting nodes until the walk stops or the buffer holds
//     wrkr->max_AB_dist nodes. This locates A.
//  2. Reverse complement the buffer so that it reads A..B, then walk A->B and
//     on past B. With wrkr->prime_AB the walker is primed along A..B;
//     otherwise confirm_seq() has to reproduce A..B itself.
//  3. The last node in the buffer is now C. Walk from B and record whether
//     that walk arrives at C.
//
// Returns one of the RES_* codes. Failed A->B / B->C walks are also counted
// in wrkr->ab_fail_state / wrkr->bc_fail_state, indexed by the walker's last
// step status, and printed when wrkr->print_failed_contigs is set.
static inline int test_statement_node(dBNode node, ExpABCWorker *wrkr)
{
  const dBGraph *db_graph = wrkr->db_graph;
  dBNodeBuffer *nbuf = &wrkr->nbuf;
  GraphWalker *wlk = &wrkr->gwlk;
  RepeatWalker *rpt = &wrkr->rptwlk;
  const size_t colour = wrkr->colour;
  const size_t max_nodes = wrkr->max_AB_dist;
  size_t b_idx;

  // Buffer starts out holding just B
  db_node_buf_reset(nbuf);
  db_node_buf_add(nbuf, node);

  // Walk from B to find A
  graph_walker_setup(wlk, true, colour, colour, db_graph);
  graph_walker_start(wlk, nbuf->b[0]);

  while(graph_walker_next(wlk) && nbuf->len < max_nodes)
  {
    if(!rpt_walker_attempt_traverse(rpt, wlk))
    {
      reset(wlk,rpt,nbuf);
      return RES_LOST_IN_RPT;
    }
    db_node_buf_add(nbuf, wlk->node);
  }

  reset(wlk,rpt,nbuf);

  // Could not leave B at all
  if(nbuf->len == 1) return RES_NO_TRAVERSAL;

  // Traverse A->B
  db_nodes_reverse_complement(nbuf->b, nbuf->len);
  b_idx = nbuf->len - 1;

  if(wrkr->prime_AB)
  {
    // Prime A->B without attempting to cross
    graph_walker_prime(wlk, nbuf->b, nbuf->len, nbuf->len, true);

    while(graph_walker_next(wlk))
    {
      if(!rpt_walker_attempt_traverse(rpt, wlk))
      {
        reset(wlk,rpt,nbuf);
        return RES_LOST_IN_RPT;
      }
      db_node_buf_add(nbuf, wlk->node);
    }
  }
  else
  {
    // Attempt to traverse A->B then extend past B
    const int ab_res = confirm_seq(0, true, wlk, rpt, nbuf, colour, db_graph);

    if(ab_res == CONFIRM_REPEAT) return RES_LOST_IN_RPT;

    if(ab_res == CONFIRM_OVERSHOT)
    {
      // Extension was allowed, so this should never be reported. If the
      // assertion returns, treat it the same as a wrong A->B walk.
      ctx_assert2(0,"Can't 'overshoot' when extending");
      return RES_AB_WRONG;
    }

    if(ab_res == CONFIRM_WRONG) return RES_AB_WRONG;

    if(ab_res == CONFIRM_SHORT)
    {
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, true, wrkr->prime_AB);
      wrkr->ab_fail_state[wlk->last_step.status]++;
      return RES_AB_FAILED;
    }
  }

  reset(wlk,rpt,nbuf);

  // Couldn't get past B
  if(nbuf->len == b_idx+1) return RES_NO_TRAVERSAL;

  // Last node is now C
  // Walk from B... record whether or not we reach C
  ctx_assert(db_nodes_are_equal(nbuf->b[b_idx], db_node_reverse(node)));

  const int bc_res = confirm_seq(b_idx, false, wlk, rpt, nbuf, colour, db_graph);

  switch(bc_res)
  {
    case CONFIRM_SUCCESS:
      return RES_ABC_SUCCESS;
    case CONFIRM_REPEAT:
      return RES_LOST_IN_RPT;
    case CONFIRM_OVERSHOT:
      return RES_BC_OVERSHOT;
    case CONFIRM_WRONG:
      return RES_BC_WRONG;
    case CONFIRM_SHORT:
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, false, wrkr->prime_AB);
      wrkr->bc_fail_state[wlk->last_step.status]++;
      return RES_BC_FAILED;
  }

  die("Shouldn't reach here: r=%i", bc_res);
  return -1;
}