// Traverse from node0 -> node1
static void traverse_5pflank(BreakpointCaller *caller, GraphCrawler *crawler,
                             dBNode node0, dBNode node1)
{
  const dBGraph *db_graph = crawler->cache.db_graph;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, num_next;
  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, node0.key);

  num_next = db_graph_next_nodes(db_graph, bkmer0, node0.orient,
                                 db_node_edges(db_graph, node0.key, 0),
                                 next_nodes, next_nucs);

  // Find the index of node1 amongst the next nodes
  for(i = 0; i < num_next && !db_nodes_are_equal(next_nodes[i],node1); i++) {}
  ctx_assert(i < num_next && db_nodes_are_equal(next_nodes[i],node1));

  kmer_run_buf_reset(&caller->koruns_5p);
  kmer_run_buf_reset(&caller->koruns_5p_ended);
  kmer_run_buf_reset(&caller->flank5p_run_buf);

  // Go backwards to get 5p flank
  // NULL means loop over all colours 0..(ncols-1)
  graph_crawler_fetch(crawler, node0, next_nodes, next_nucs, i, num_next,
                      NULL, db_graph->num_of_cols,
                      gcrawler_flank5p_stop_at_ref_covg,
                      gcrawler_flank5p_finish_ref_covg,
                      caller);
}
// Orient supernode
// Once oriented, the supernode starts at the lowest possible kmer key,
// oriented FORWARD if possible
void supernode_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph)
{
  // Sort supernode into forward orientation
  ctx_assert(len > 0);

  if(len == 1) {
    nlist[0].orient = FORWARD;
    return;
  }

  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, nlist[0].key);
  BinaryKmer bkmer1 = db_node_get_bkmer(db_graph, nlist[len-1].key);

  // Check if closed cycle
  if(supernode_is_closed_cycle(nlist, len, bkmer0, bkmer1, db_graph))
  {
    // Find the lowest kmer to start from
    BinaryKmer lowest = bkmer0, tmp;
    size_t i, idx = 0;

    for(i = 1; i < len; i++) {
      tmp = db_node_get_bkmer(db_graph, nlist[i].key);
      if(binary_kmer_less_than(tmp, lowest)) {
        lowest = tmp;
        idx = i;
      }
    }

    // If already starting from the lowest kmer, no change needed
    if(idx > 0 || nlist[0].orient != FORWARD)
    {
      // a->b->c->d->e->f->a
      // if c is lowest and FORWARD: c->d->e->f->a->b (keep orientations)
      // if c is lowest and REVERSE: c->b->a->f->e->d (reverse orientations)
      if(nlist[idx].orient == FORWARD) {
        // Shift left by idx, without affecting orientations
        db_nodes_left_shift(nlist, len, idx);
      } else {
        db_nodes_reverse_complement(nlist, idx+1);
        db_nodes_reverse_complement(nlist+idx+1, len-idx-1);
      }
    }
  }
  else if(binary_kmer_less_than(bkmer1,bkmer0)) {
    db_nodes_reverse_complement(nlist, len);
  }
}
// Get bkey:orient string representation e.g. "AGAGTTTTATC:1".
// :0 means forward, :1 means reverse
// `str` must be at least kmer_size+3 chars long (kmer + ':' + digit + '\0')
// Returns length in bytes. Null terminates `str`.
size_t db_node_to_str(const dBGraph *db_graph, dBNode node, char *str)
{
  const size_t kmer_size = db_graph->kmer_size;
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  binary_kmer_to_str(bkmer, kmer_size, str);
  str[kmer_size] = ':';
  str[kmer_size+1] = '0' + node.orient;
  str[kmer_size+2] = '\0';
  return kmer_size + 2;
}
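A minimal usage sketch of the buffer contract above, assuming a populated dBGraph and a valid dBNode; example_print_node is a hypothetical helper, not part of the codebase:

// Illustrative sketch (not from the original source): print one node as "kmer:orient".
// Assumes <stdio.h> plus the headers declaring dBGraph, dBNode and MAX_KMER_SIZE.
static inline void example_print_node(const dBGraph *db_graph, dBNode node, FILE *out)
{
  char str[MAX_KMER_SIZE+3]; // kmer chars + ':' + orientation digit + '\0'
  size_t len = db_node_to_str(db_graph, node, str);
  fprintf(out, "%s (%zu bytes)\n", str, len); // e.g. "AGAGTTTTATC:1 (13 bytes)"
}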
// Print:
//   0: AAACCCAAATGCAAACCCAAATGCAAACCCA:1 TGGGTTTGCATTTGGGTTTGCATTTGGGTTT
//   1: CAAACCCAAATGCAAACCCAAATGCAAACCC:1 GGGTTTGCATTTGGGTTTGCATTTGGGTTTG
//   ...
void db_nodes_print_verbose(const dBNode *nodes, size_t num,
                            const dBGraph *db_graph, FILE *out)
{
  if(num == 0) return;

  const size_t kmer_size = db_graph->kmer_size;
  size_t i;
  BinaryKmer bkmer, bkey;
  char kmerstr[MAX_KMER_SIZE+1], keystr[MAX_KMER_SIZE+1];

  for(i = 0; i < num; i++) {
    bkmer = db_node_get_bkmer(db_graph, nodes[i].key);
    bkey = db_node_oriented_bkmer(db_graph, nodes[i]);
    binary_kmer_to_str(bkmer, kmer_size, kmerstr);
    binary_kmer_to_str(bkey, kmer_size, keystr);
    fprintf(out, "%3zu: %s:%i %s\n", i, kmerstr, (int)nodes[i].orient, keystr);
  }
}
// For every kmer in the graph, we run this function
static inline bool print_edges(hkey_t hkey, size_t threadid, void *arg)
{
  (void)threadid;
  UnitigPrinter *p = (UnitigPrinter*)arg;
  UnitigEnd uend = p->ugraph.unitig_ends[hkey];

  // Check if node is an end of a unitig
  if(uend.assigned)
  {
    BinaryKmer bkey = db_node_get_bkmer(p->db_graph, hkey);
    Edges edges = db_node_get_edges(p->db_graph, hkey, 0);
    if(uend.left)  { _print_edge(hkey, false, bkey, edges, uend, p); }
    if(uend.right) { _print_edge(hkey, true,  bkey, edges, uend, p); }
  }

  return false; // keep iterating
}
// Convert a path of nodes into its sequence string.
// `str` must be at least kmer_size+num chars long.
// Returns string length in bytes (excluding the '\0'). Null terminates `str`.
size_t db_nodes_to_str(const dBNode *nodes, size_t num,
                       const dBGraph *db_graph, char *str)
{
  if(num == 0) return 0;

  size_t i;
  size_t kmer_size = db_graph->kmer_size;
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, nodes[0].key);
  Nucleotide nuc;

  binary_kmer_to_str(bkmer, kmer_size, str);
  if(nodes[0].orient == REVERSE) dna_reverse_complement_str(str, kmer_size);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    str[kmer_size+i-1] = dna_nuc_to_char(nuc);
  }

  str[kmer_size+num-1] = '\0';

  return kmer_size+num-1;
}
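A hedged sketch of the typical buffer sizing when converting a filled dBNodeBuffer back to sequence; example_nodes_to_seq is illustrative only and assumes the dBNodeBuffer fields used in the test below:

// Illustrative sketch only. Assumes <stdlib.h> and a non-empty, already-filled
// dBNodeBuffer. The output needs kmer_size + nbuf->len bytes:
// kmer_size + nbuf->len - 1 sequence characters plus the terminating '\0'.
static inline char* example_nodes_to_seq(const dBNodeBuffer *nbuf, const dBGraph *db_graph)
{
  char *seq = malloc(db_graph->kmer_size + nbuf->len);
  if(seq != NULL) db_nodes_to_str(nbuf->b, nbuf->len, db_graph, seq);
  return seq; // caller frees; NULL on allocation failure
}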
// Edges restricted to this colour, only in one direction (node.orient)
Edges db_node_edges_in_col(dBNode node, size_t col, const dBGraph *db_graph)
{
  if(db_graph->node_in_cols == NULL && db_graph->col_covgs == NULL) {
    Edges edges = db_node_get_edges(db_graph, node.key, col);
    return edges_mask_orientation(edges, node.orient);
  }

  // Edges are merged into one colour
  ctx_assert(db_graph->num_edge_cols == 1);
  ctx_assert(db_graph->node_in_cols != NULL || db_graph->col_covgs != NULL);

  Edges edges = db_node_get_edges(db_graph, node.key, 0);

  // Check which next nodes are in the given colour
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  dBNode nodes[4];
  Nucleotide nucs[4];
  size_t i, n;

  n = db_graph_next_nodes(db_graph, bkmer, node.orient, edges, nodes, nucs);

  edges = 0;

  if(db_graph->node_in_cols != NULL) {
    for(i = 0; i < n; i++)
      if(db_node_has_col(db_graph, nodes[i].key, col))
        edges = edges_set_edge(edges, nucs[i], node.orient);
  }
  else if(db_graph->col_covgs != NULL) {
    for(i = 0; i < n; i++)
      if(db_node_col_covg(db_graph, nodes[i].key, col) > 0)
        edges = edges_set_edge(edges, nucs[i], node.orient);
  }
  else ctx_assert(0);

  return edges;
}
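As a sketch of how the colour-restricted edges are typically consumed (an assumed usage pattern mirroring the db_graph_next_nodes call inside the function above, not code from the original source):

// Illustrative sketch: enumerate out-neighbours of `node` restricted to one colour
// by feeding the colour-restricted edges back into db_graph_next_nodes.
// example_next_nodes_in_col is a hypothetical helper name.
static inline size_t example_next_nodes_in_col(const dBGraph *db_graph, dBNode node,
                                               size_t col,
                                               dBNode nodes[4], Nucleotide nucs[4])
{
  Edges edges = db_node_edges_in_col(node, col, db_graph);
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  return db_graph_next_nodes(db_graph, bkmer, node.orient, edges, nodes, nucs);
}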
static inline int infer_edges_node(hkey_t hkey, bool add_all_edges,
                                   Covg *tmp_covgs, const dBGraph *db_graph,
                                   size_t *num_nodes_modified)
{
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey);
  Edges *edges = &db_node_edges(db_graph, hkey, 0);
  size_t col;

  // Create coverages that are zero or one depending on whether the node has the colour
  if(db_graph->col_covgs == NULL) {
    for(col = 0; col < db_graph->num_of_cols; col++)
      tmp_covgs[col] = db_node_has_col(db_graph, hkey, col);
  }
  else tmp_covgs = &db_node_covg(db_graph, hkey, 0);

  (*num_nodes_modified)
    += (add_all_edges ? infer_all_edges(bkmer, edges, tmp_covgs, db_graph)
                      : infer_pop_edges(bkmer, edges, tmp_covgs, db_graph));

  return 0; // => keep iterating
}
void test_graph_crawler()
{
  test_status("Testing graph crawler...");

  // Construct a 3 colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;

  db_graph_alloc(&graph, kmer_size, ncols, 1, 2048,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  char graphseq[3][77] =
  // < X X X...............
  {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC",
   "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA",
   "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"};

  build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0]));
  build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1]));
  build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2]));

  // Crawl graph
  GraphCrawler crawler;
  graph_crawler_alloc(&crawler, &graph);

  dBNode node = db_graph_find_str(&graph, graphseq[0]);
  dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1);
  TASSERT(node.key != HASH_NOT_FOUND);
  TASSERT(next_node.key != HASH_NOT_FOUND);

  BinaryKmer bkey = db_node_get_bkmer(&graph, node.key);
  Edges edges = db_node_get_edges(&graph, node.key, 0);

  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, p, num_next, next_idx;

  num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  next_idx = 0;
  while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node))
    next_idx++;

  TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node));

  // Crawl in all colours
  graph_crawler_fetch(&crawler, node, next_nodes, next_nucs, next_idx, num_next,
                      NULL, graph.num_of_cols, NULL, NULL, NULL);

  TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths);

  // Fetch paths
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 16);
  StrBuf sbuf;
  strbuf_alloc(&sbuf, 128);

  for(p = 0; p < crawler.num_paths; p++) {
    db_node_buf_reset(&nbuf);
    graph_crawler_get_path_nodes(&crawler, p, &nbuf);
    strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size);
    sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b);
    for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {}
    TASSERT2(i < 3, "seq: %s", sbuf.b);
    TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end);
    TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len);
  }

  strbuf_dealloc(&sbuf);
  db_node_buf_dealloc(&nbuf);
  graph_crawler_dealloc(&crawler);
  db_graph_dealloc(&graph);
}
// Walk the graph, remembering the last time we met the ref.
// When traversal fails, dump sequence up to the last meeting with the ref.
static void follow_break(BreakpointCaller *caller, dBNode node)
{
  size_t i, j, k, num_next;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t nonref_idx[4], num_nonref_next = 0;
  const dBGraph *db_graph = caller->db_graph;

  BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key);
  Edges edges = db_node_get_edges(db_graph, node.key, 0);

  num_next = db_graph_next_nodes(db_graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  // Filter out next nodes that are in the reference
  for(i = 0; i < num_next; i++) {
    if(kograph_num(caller->kograph, next_nodes[i].key) == 0) {
      nonref_idx[num_nonref_next] = i;
      num_nonref_next++;
    }
  }

  // Abandon if all options are in the ref or none are
  if(num_nonref_next == num_next || num_nonref_next == 0) return;

  // Follow all paths not in ref, in all colours
  GraphCrawler *fw_crawler = &caller->crawlers[node.orient];
  GraphCrawler *rv_crawler = &caller->crawlers[!node.orient];
  dBNodeBuffer *allelebuf = &caller->allelebuf, *flank5pbuf = &caller->flank5pbuf;
  GCMultiColPath *flank5p_multicolpath, *allele_multicolpath;
  KOccurRun *flank5p_runs, *flank3p_runs;
  size_t flank5p_pathid, allele_pathid;
  size_t num_flank5p_runs, num_flank3p_runs;

  // We fetch 5' flanks in all colours, then merge matching paths.
  // We stop fetching a single path once it stops tracking the reference.
  // Alternatively, we could fetch the 5' flank in everyone, stop after a
  // given distance, then check how well that set of paths tracks the
  // reference. That has the advantage of scaling much better with the number
  // of samples, but not so well as min_ref_nkmers increases (since we fetch
  // many flanks that can't be used) - I think this is less of a worry.

  // Loop over possible next nodes at this junction
  for(i = 0; i < num_nonref_next; i++)
  {
    size_t next_idx = nonref_idx[i];

    // Go backwards to get 5p flank
    traverse_5pflank(caller, rv_crawler, db_node_reverse(next_nodes[next_idx]),
                     db_node_reverse(node));

    // Loop over the flanks we got
    for(j = 0; j < rv_crawler->num_paths; j++)
    {
      // Get 5p flank
      db_node_buf_reset(flank5pbuf);
      graph_crawler_get_path_nodes(rv_crawler, j, flank5pbuf);
      flank5p_multicolpath = &rv_crawler->multicol_paths[j];
      flank5p_pathid = flank5p_multicolpath->pathid;

      // Fetch 5p flank ref positions
      num_flank5p_runs = caller->flank5p_refs[flank5p_pathid].num_runs;
      flank5p_runs = fetch_ref_contact(&rv_crawler->cache, flank5p_pathid,
                                       caller->flank5p_refs,
                                       &caller->flank5p_run_buf);

      koruns_reverse(flank5p_runs, num_flank5p_runs, flank5pbuf->len);
      koruns_sort_by_qoffset(flank5p_runs, num_flank5p_runs);
      db_nodes_reverse_complement(flank5pbuf->data, flank5pbuf->len);

      if(num_flank5p_runs > 0)
      {
        // Reset caller
        kmer_run_buf_reset(&caller->koruns_3p);
        kmer_run_buf_reset(&caller->koruns_3p_ended);
        kmer_run_buf_reset(&caller->allele_run_buf);

        // The functions gcrawler_path_stop_at_ref_covg() and
        // gcrawler_path_finish_ref_covg() both fill koruns_3p,
        // koruns_3p_ended and allele_run_buf.
        // Only traverse in the colours we have a flank for
        graph_crawler_fetch(fw_crawler, node, next_nodes, next_nucs,
                            next_idx, num_next,
                            flank5p_multicolpath->cols,
                            flank5p_multicolpath->num_cols,
                            gcrawler_path_stop_at_ref_covg,
                            gcrawler_path_finish_ref_covg,
                            caller);

        // Assemble contigs - fetch forwards for each path for the given 5p flank
        for(k = 0; k < fw_crawler->num_paths; k++)
        {
          // Fetch nodes
          db_node_buf_reset(allelebuf);
          graph_crawler_get_path_nodes(fw_crawler, k, allelebuf);
          ctx_assert(allelebuf->len > 0);

          allele_multicolpath = &fw_crawler->multicol_paths[k];
          allele_pathid = allele_multicolpath->pathid;

          // Fetch 3p flank ref positions
          num_flank3p_runs = caller->allele_refs[allele_pathid].num_runs;
          flank3p_runs = fetch_ref_contact(&fw_crawler->cache, allele_pathid,
                                           caller->allele_refs,
                                           &caller->allele_run_buf);

          process_contig(caller,
                         allele_multicolpath->cols,
                         allele_multicolpath->num_cols,
                         flank5pbuf, allelebuf,
                         flank5p_runs, num_flank5p_runs,
                         flank3p_runs, num_flank3p_runs);
        }
      }
    }
  }
}
// `fork_node` is a node with outdegree > 1
void find_bubbles(BubbleCaller *caller, dBNode fork_node)
{
  graph_cache_reset(&caller->cache);

  const dBGraph *db_graph = caller->db_graph;
  GraphCache *cache = &caller->cache;
  GraphWalker *wlk = &caller->wlk;
  RepeatWalker *rptwlk = &caller->rptwlk;

  // char tmpstr[MAX_KMER_SIZE+3];
  // db_node_to_str(db_graph, fork_node, tmpstr);
  // status("Calling from %s", tmpstr);

  dBNode nodes[4];
  Nucleotide bases[4];
  size_t i, num_next, num_edges_in_col;
  BinaryKmer fork_bkmer = db_node_get_bkmer(db_graph, fork_node.key);

  num_next = db_graph_next_nodes(db_graph, fork_bkmer, fork_node.orient,
                                 db_node_edges(db_graph, fork_node.key, 0),
                                 nodes, bases);

  // Loop over colours, then the alleles branching off the fork
  Colour colour, colours_loaded = db_graph->num_of_cols;
  bool node_has_col[4];
  uint32_t pathid;

  for(colour = 0; colour < colours_loaded; colour++)
  {
    if(!db_node_has_col(db_graph, fork_node.key, colour)) continue;

    // Determine if this fork is a fork in the current colour
    num_edges_in_col = 0;
    for(i = 0; i < num_next; i++) {
      node_has_col[i] = (db_node_has_col(db_graph, nodes[i].key, colour) > 0);
      num_edges_in_col += node_has_col[i];
    }

    graph_walker_setup(wlk, true, colour, colour, db_graph);

    for(i = 0; i < num_next; i++) {
      if(node_has_col[i]) {
        graph_walker_start(wlk, fork_node);
        graph_walker_force(wlk, nodes[i], num_edges_in_col > 1);

        pathid = graph_crawler_load_path_limit(cache, nodes[i], wlk, rptwlk,
                                               caller->prefs.max_allele_len);

        graph_walker_finish(wlk);
        graph_crawler_reset_rpt_walker(rptwlk, cache, pathid);
      }
    }
  }

  // Set up 5p flank
  caller->flank5p.b[0] = db_node_reverse(fork_node);
  caller->flank5p.len = 0; // len == 0 signifies we haven't fetched the flank yet
}