// Reverse ordering and orientations void db_nodes_reverse_complement(dBNode *nlist, size_t len) { if(len == 0) return; size_t i, j; dBNode tmp; for(i = 0, j = len-1; i+1 < j; i++, j--) { tmp = nlist[i]; nlist[i] = db_node_reverse(nlist[j]); nlist[j] = db_node_reverse(tmp); } tmp = nlist[i]; nlist[i] = db_node_reverse(nlist[j]); nlist[j] = db_node_reverse(tmp); }
// Move the walker `wlk` to the far end of the cached supernode `snode`, which
// is being traversed in orientation `snorient`.
// Only the first and last nodes of the supernode are inspected. A supernode
// holding a single node needs no jump, so the walker is left as it is.
static inline void walk_supernode_end(const GraphCache *cache,
                                      const GCacheSnode *snode,
                                      Orientation snorient,
                                      GraphWalker *wlk)
{
  const dBNode *first = graph_cache_first_node(cache, snode);
  const dBNode *last = graph_cache_last_node(cache, snode);
  size_t num_nodes = last - first + 1;

  if(num_nodes <= 1) return;

  // The node we finish on depends on the direction of travel: the stored last
  // node when going forward, otherwise the stored first node reversed.
  dBNode lastnode;
  if(snorient == FORWARD)
    lastnode = *last;
  else
    lastnode = db_node_reverse(*first);

  graph_walker_jump_along_snode(wlk, lastnode, num_nodes);
}
// `fork_node` is a node with outdegree > 1 void find_bubbles(BubbleCaller *caller, dBNode fork_node) { graph_cache_reset(&caller->cache); const dBGraph *db_graph = caller->db_graph; GraphCache *cache = &caller->cache; GraphWalker *wlk = &caller->wlk; RepeatWalker *rptwlk = &caller->rptwlk; // char tmpstr[MAX_KMER_SIZE+3]; // db_node_to_str(db_graph, fork_node, tmpstr); // status("Calling from %s", tmpstr); dBNode nodes[4]; Nucleotide bases[4]; size_t i, num_next, num_edges_in_col; BinaryKmer fork_bkmer = db_node_get_bkmer(db_graph, fork_node.key); num_next = db_graph_next_nodes(db_graph, fork_bkmer, fork_node.orient, db_node_edges(db_graph, fork_node.key, 0), nodes, bases); // loop over alleles, then colours Colour colour, colours_loaded = db_graph->num_of_cols; bool node_has_col[4]; uint32_t pathid; for(colour = 0; colour < colours_loaded; colour++) { if(!db_node_has_col(db_graph, fork_node.key, colour)) continue; // Determine if this fork is a fork in the current colour num_edges_in_col = 0; for(i = 0; i < num_next; i++) { node_has_col[i] = (db_node_has_col(db_graph, nodes[i].key, colour) > 0); num_edges_in_col += node_has_col[i]; } graph_walker_setup(wlk, true, colour, colour, db_graph); for(i = 0; i < num_next; i++) { if(node_has_col[i]) { graph_walker_start(wlk, fork_node); graph_walker_force(wlk, nodes[i], num_edges_in_col > 1); pathid = graph_crawler_load_path_limit(cache, nodes[i], wlk, rptwlk, caller->prefs.max_allele_len); graph_walker_finish(wlk); graph_crawler_reset_rpt_walker(rptwlk, cache, pathid); } } } // Set up 5p flank caller->flank5p.b[0] = db_node_reverse(fork_node); caller->flank5p.len = 0; // set to one to signify we haven't fetched flank yet }
// Walk the graph remembering the last time we met the ref // When traversal fails, dump sequence up to last meeting with the ref static void follow_break(BreakpointCaller *caller, dBNode node) { size_t i, j, k, num_next; dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t nonref_idx[4], num_nonref_next = 0; const dBGraph *db_graph = caller->db_graph; BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key); Edges edges = db_node_get_edges(db_graph, node.key, 0); num_next = db_graph_next_nodes(db_graph, bkey, node.orient, edges, next_nodes, next_nucs); // Filter out next nodes in the reference for(i = 0; i < num_next; i++) { if(kograph_num(caller->kograph, next_nodes[i].key) == 0) { nonref_idx[num_nonref_next] = i; num_nonref_next++; } } // Abandon if all options are in ref or none are if(num_nonref_next == num_next || num_nonref_next == 0) return; // Follow all paths not in ref, in all colours GraphCrawler *fw_crawler = &caller->crawlers[node.orient]; GraphCrawler *rv_crawler = &caller->crawlers[!node.orient]; dBNodeBuffer *allelebuf = &caller->allelebuf, *flank5pbuf = &caller->flank5pbuf; GCMultiColPath *flank5p_multicolpath, *allele_multicolpath; KOccurRun *flank5p_runs, *flank3p_runs; size_t flank5p_pathid, allele_pathid; size_t num_flank5p_runs, num_flank3p_runs; // We fetch 5' flanks in all colours then merge matching paths // we stop fetching a single path if it stops tracking the reference // Alternatively, we could fetch the 5' flank in everyone and stop after a // given distance, then check for that set of paths how much it tracks the // reference. This has the advantage of scaling much better with number of // samples, but not so well as min_ref_nkmers increases (since we fetch // many flanks that can't be used) - I think this is less of a worry. 
// Loop over possible next nodes at this junction for(i = 0; i < num_nonref_next; i++) { size_t next_idx = nonref_idx[i]; // Go backwards to get 5p flank traverse_5pflank(caller, rv_crawler, db_node_reverse(next_nodes[next_idx]), db_node_reverse(node)); // Loop over the flanks we got for(j = 0; j < rv_crawler->num_paths; j++) { // Get 5p flank db_node_buf_reset(flank5pbuf); graph_crawler_get_path_nodes(rv_crawler, j, flank5pbuf); flank5p_multicolpath = &rv_crawler->multicol_paths[j]; flank5p_pathid = flank5p_multicolpath->pathid; // Fetch 3pflank ref position num_flank5p_runs = caller->flank5p_refs[flank5p_pathid].num_runs; flank5p_runs = fetch_ref_contact(&rv_crawler->cache, flank5p_pathid, caller->flank5p_refs, &caller->flank5p_run_buf); koruns_reverse(flank5p_runs, num_flank5p_runs, flank5pbuf->len); koruns_sort_by_qoffset(flank5p_runs, num_flank5p_runs); db_nodes_reverse_complement(flank5pbuf->data, flank5pbuf->len); if(num_flank5p_runs > 0) { // Reset caller kmer_run_buf_reset(&caller->koruns_3p); kmer_run_buf_reset(&caller->koruns_3p_ended); kmer_run_buf_reset(&caller->allele_run_buf); // functions gcrawler_path_stop_at_ref_covg(), // gcrawler_path_finish_ref_covg() // both fill koruns_3p, koruns_3p_ended and allele_run_buf // Only traverse in the colours we have a flank for graph_crawler_fetch(fw_crawler, node, next_nodes, next_nucs, next_idx, num_next, flank5p_multicolpath->cols, flank5p_multicolpath->num_cols, gcrawler_path_stop_at_ref_covg, gcrawler_path_finish_ref_covg, caller); // Assemble contigs - fetch forwards for each path for given 5p flank for(k = 0; k < fw_crawler->num_paths; k++) { // Fetch nodes db_node_buf_reset(allelebuf); graph_crawler_get_path_nodes(fw_crawler, k, allelebuf); ctx_assert(allelebuf->len > 0); allele_multicolpath = &fw_crawler->multicol_paths[k]; allele_pathid = allele_multicolpath->pathid; // Fetch 3pflank ref position num_flank3p_runs = caller->allele_refs[allele_pathid].num_runs; flank3p_runs = 
fetch_ref_contact(&fw_crawler->cache, allele_pathid, caller->allele_refs, &caller->allele_run_buf); process_contig(caller, allele_multicolpath->cols, allele_multicolpath->num_cols, flank5pbuf, allelebuf, flank5p_runs, num_flank5p_runs, flank3p_runs, num_flank3p_runs); } } } } }
// Run the A->B->C test around `node`, which plays the role of B.
//
// Steps:
//  1. Walk away from B for up to wrkr->max_AB_dist nodes to find A.
//  2. Reverse-complement the collected nodes so the buffer reads A..B, then
//     either prime the walker along A->B and keep walking (wrkr->prime_AB),
//     or re-walk from A with confirm_seq() and extend past B.
//  3. Walk again from B and check with confirm_seq() whether C is reached.
//
// Returns one of the RES_* codes: RES_LOST_IN_RPT, RES_NO_TRAVERSAL,
// RES_AB_WRONG, RES_AB_FAILED, RES_BC_OVERSHOT, RES_BC_WRONG, RES_BC_FAILED
// or RES_ABC_SUCCESS. Failure counters in `wrkr` are updated on
// CONFIRM_SHORT results.
static inline int test_statement_node(dBNode node, ExpABCWorker *wrkr)
{
  const dBGraph *db_graph = wrkr->db_graph;
  dBNodeBuffer *nbuf = &wrkr->nbuf;
  GraphWalker *wlk = &wrkr->gwlk;
  RepeatWalker *rpt = &wrkr->rptwlk;
  size_t b_idx, col = wrkr->colour;

  // Start the buffer with B alone
  db_node_buf_reset(nbuf);
  db_node_buf_add(nbuf, node);

  size_t walk_limit = wrkr->max_AB_dist;

  // Walk from B to find A
  graph_walker_setup(wlk, true, col, col, db_graph);
  graph_walker_start(wlk, nbuf->b[0]);

  // The walker is advanced before the length limit is tested
  while(graph_walker_next(wlk))
  {
    if(nbuf->len >= walk_limit) break;

    if(!rpt_walker_attempt_traverse(rpt, wlk)) {
      reset(wlk,rpt,nbuf);
      return RES_LOST_IN_RPT;
    }
    db_node_buf_add(nbuf, wlk->node);
  }

  reset(wlk,rpt,nbuf);

  // Still only B in the buffer: no step could be taken
  if(nbuf->len == 1) return RES_NO_TRAVERSAL;

  // Traverse A->B: flip the buffer so that B becomes the last entry
  db_nodes_reverse_complement(nbuf->b, nbuf->len);
  b_idx = nbuf->len - 1;

  if(wrkr->prime_AB)
  {
    // Prime A->B without attempting to cross
    graph_walker_prime(wlk, nbuf->b, nbuf->len, nbuf->len, true);

    while(graph_walker_next(wlk))
    {
      if(!rpt_walker_attempt_traverse(rpt, wlk)) {
        reset(wlk,rpt,nbuf);
        return RES_LOST_IN_RPT;
      }
      db_node_buf_add(nbuf, wlk->node);
    }
  }
  else
  {
    // Attempt to traverse A->B then extend past B
    int ab_res = confirm_seq(0, true, wlk, rpt, nbuf, col, db_graph);

    switch(ab_res)
    {
      case CONFIRM_REPEAT:
        return RES_LOST_IN_RPT;
      case CONFIRM_OVERSHOT:
        ctx_assert2(0,"Can't 'overshoot' when extending");
        /* fallthrough */
      case CONFIRM_WRONG:
        return RES_AB_WRONG;
      case CONFIRM_SHORT:
        if(wrkr->print_failed_contigs)
          print_failed(node, nbuf, db_graph, true, wrkr->prime_AB);
        wrkr->ab_fail_state[wlk->last_step.status]++;
        return RES_AB_FAILED;
    }
    // Any other result carries on to the B->C check below
  }

  reset(wlk,rpt,nbuf);

  // Couldn't get past B
  if(nbuf->len == b_idx+1) return RES_NO_TRAVERSAL;

  // Last node is now C
  // Walk from B... record whether or not we reach C
  ctx_assert(db_nodes_are_equal(nbuf->b[b_idx], db_node_reverse(node)));

  int bc_res = confirm_seq(b_idx, false, wlk, rpt, nbuf, col, db_graph);

  switch(bc_res)
  {
    case CONFIRM_REPEAT:
      return RES_LOST_IN_RPT;
    case CONFIRM_OVERSHOT:
      return RES_BC_OVERSHOT;
    case CONFIRM_WRONG:
      return RES_BC_WRONG;
    case CONFIRM_SHORT:
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, false, wrkr->prime_AB);
      wrkr->bc_fail_state[wlk->last_step.status]++;
      return RES_BC_FAILED;
    case CONFIRM_SUCCESS:
      return RES_ABC_SUCCESS;
  }

  die("Shouldn't reach here: r=%i", bc_res);
  return -1;
}