// `node1` should be the first node of a supernode // `node0` should be the previous node // `next_base` is the last base of `node1` // `jmpfunc` is called with each supernode traversed and if it returns true // we continue crawling, otherwise we stop // `endfunc` is a function called at the end of traversal void graph_crawler_fetch(GraphCrawler *crawler, dBNode node0, dBNode next_nodes[4], size_t take_idx, size_t num_next, uint32_t *cols, size_t ncols, bool (*jmpfunc)(GraphCache *_cache, GCacheStep *_step, void *_arg), void (*endfunc)(GraphCache *_cache, uint32_t _pathid, void *_arg), void *arg) { const dBGraph *db_graph = crawler->cache.db_graph; GraphCache *cache = &crawler->cache; GraphWalker *wlk = &crawler->wlk; RepeatWalker *rptwlk = &crawler->rptwlk; GCUniColPath *unipaths = crawler->unicol_paths; ctx_assert(take_idx < num_next); ctx_assert(!db_nodes_are_equal(node0, next_nodes[take_idx])); // Fetch all paths in all colours dBNode node1 = next_nodes[take_idx]; bool is_fork; size_t i, c, col, nedges_cols, num_unicol_paths = 0; int pathid; for(c = 0; c < ncols; c++) { col = (cols != NULL ? cols[c] : c); if(db_node_has_col(db_graph, node0.key, col) && db_node_has_col(db_graph, node1.key, col)) { // Determine if this fork is a fork in the current colour for(nedges_cols = 0, i = 0; i < num_next && nedges_cols <= 1; i++) nedges_cols += db_node_has_col(db_graph, next_nodes[i].key, col); is_fork = (nedges_cols > 1); graph_walker_setup(wlk, true, col, col, db_graph); graph_walker_start(wlk, node0); graph_walker_force(wlk, node1, is_fork); pathid = graph_crawler_load_path(cache, node1, wlk, rptwlk, jmpfunc, arg); if(endfunc != NULL) endfunc(cache, pathid, arg); graph_walker_finish(wlk); graph_crawler_reset_rpt_walker(rptwlk, cache, pathid); unipaths[num_unicol_paths++] = (GCUniColPath){.colour = col, .pathid = pathid}; } else pathid = -1; crawler->col_paths[col] = pathid; }
// Return 1 if changed; 0 otherwise bool infer_pop_edges(const BinaryKmer node_bkey, Edges *edges, const Covg *covgs, const dBGraph *db_graph) { Edges uedges = 0, iedges = 0xf, add_edges, edge; size_t orient, nuc, col, kmer_size = db_graph->kmer_size; const size_t ncols = db_graph->num_of_cols; BinaryKmer bkey, bkmer; hkey_t next; Edges newedges[ncols]; // char tmp[MAX_KMER_SIZE+1]; // binary_kmer_to_str(node_bkey, db_graph->kmer_size, tmp); // status("Inferring %s", tmp); for(col = 0; col < ncols; col++) { uedges |= edges[col]; // union of edges iedges &= edges[col]; // intersection of edges newedges[col] = edges[col]; } add_edges = uedges & ~iedges; if(!add_edges) return 0; for(orient = 0; orient < 2; orient++) { bkmer = (orient == FORWARD ? binary_kmer_left_shift_one_base(node_bkey, kmer_size) : binary_kmer_right_shift_one_base(node_bkey)); for(nuc = 0; nuc < 4; nuc++) { edge = nuc_orient_to_edge(nuc, orient); if(add_edges & edge) { // get next bkmer, look up in graph if(orient == FORWARD) binary_kmer_set_last_nuc(&bkmer, nuc); else binary_kmer_set_first_nuc(&bkmer, dna_nuc_complement(nuc), kmer_size); bkey = bkmer_get_key(bkmer, kmer_size); next = hash_table_find(&db_graph->ht, bkey); ctx_assert(next != HASH_NOT_FOUND); for(col = 0; col < ncols; col++) if(covgs[col] > 0 && db_node_has_col(db_graph, next, col)) newedges[col] |= edge; } } } int cmp = memcmp(edges, newedges, sizeof(Edges)*ncols); memcpy(edges, newedges, sizeof(Edges)*ncols); return (cmp != 0); }
// Return 1 if changed; 0 otherwise bool infer_all_edges(const BinaryKmer node_bkey, Edges *edges, const Covg *covgs, const dBGraph *db_graph) { Edges iedges = 0xff, edge; size_t orient, nuc, col, kmer_size = db_graph->kmer_size; const size_t ncols = db_graph->num_of_cols; BinaryKmer bkey, bkmer; hkey_t next; Edges newedges[ncols]; memcpy(newedges, edges, ncols * sizeof(Edges)); // intersection of edges for(col = 0; col < ncols; col++) iedges &= edges[col]; for(orient = 0; orient < 2; orient++) { bkmer = (orient == FORWARD ? binary_kmer_left_shift_one_base(node_bkey, kmer_size) : binary_kmer_right_shift_one_base(node_bkey)); for(nuc = 0; nuc < 4; nuc++) { edge = nuc_orient_to_edge(nuc, orient); if(!(iedges & edge)) { // edges are missing from some samples if(orient == FORWARD) binary_kmer_set_last_nuc(&bkmer, nuc); else binary_kmer_set_first_nuc(&bkmer, dna_nuc_complement(nuc), kmer_size); bkey = bkmer_get_key(bkmer, kmer_size); next = hash_table_find(&db_graph->ht, bkey); if(next != HASH_NOT_FOUND) { for(col = 0; col < ncols; col++) { if(covgs[col] > 0 && db_node_has_col(db_graph, next, col)) { newedges[col] |= edge; } } } } } } // Check if we changed the edges int cmp = memcmp(edges, newedges, sizeof(Edges)*ncols); memcpy(edges, newedges, sizeof(Edges)*ncols); return (cmp != 0); }
// Run edge inference on a single node and bump `*num_nodes_modified` if any
// of the node's edges changed.
// `add_all_edges` selects infer_all_edges() over infer_pop_edges().
static inline void infer_edges_node(hkey_t hkey, bool add_all_edges,
                                    const dBGraph *db_graph,
                                    size_t *num_nodes_modified)
{
  const size_t ncols = db_graph->num_of_cols;
  Covg present[ncols];
  size_t c;

  // Per-colour pseudo-coverage: the result of the colour-membership test
  for(c = 0; c < ncols; c++)
    present[c] = db_node_has_col(db_graph, hkey, c);

  BinaryKmer bkmer = db_node_bkmer(db_graph, hkey);
  Edges *edges = &db_node_edges(db_graph, hkey, 0);

  bool changed;
  if(add_all_edges)
    changed = infer_all_edges(bkmer, edges, present, db_graph);
  else
    changed = infer_pop_edges(bkmer, edges, present, db_graph);

  if(changed) (*num_nodes_modified)++;
}
// OR `new_edge` into `edges[col]` for every colour in which both the current
// node (covgs[col] > 0) and the neighbour `next_hkey` are present.
// The neighbour's presence is read from its coverage when the graph stores
// per-colour coverages, otherwise from the colour-membership test.
static inline void _add_edge_to_colours(hkey_t next_hkey,
                                        const Covg *covgs, Edges *edges,
                                        Edges new_edge,
                                        const dBGraph *db_graph)
{
  const size_t ncols = db_graph->num_of_cols;
  const bool have_covgs = (db_graph->col_covgs != NULL);
  size_t c;

  for(c = 0; c < ncols; c++)
  {
    // Current node is not in this colour
    if(!(covgs[c] > 0)) continue;

    bool next_in_col;
    if(have_covgs)
      next_in_col = (db_node_covg(db_graph, next_hkey, c) != 0);
    else
      next_in_col = (db_node_has_col(db_graph, next_hkey, c) != 0);

    if(next_in_col) edges[c] |= new_edge;
  }
}
// Edges restricted to this colour, only in one direction (node.orient) Edges db_node_edges_in_col(dBNode node, size_t col, const dBGraph *db_graph) { if(db_graph->node_in_cols == NULL && db_graph->col_covgs == NULL) { Edges edges = db_node_get_edges(db_graph, node.key, col); return edges_mask_orientation(edges, node.orient); } // Edges are merged into one colour ctx_assert(db_graph->num_edge_cols == 1); ctx_assert(db_graph->node_in_cols != NULL || db_graph->col_covgs != NULL); Edges edges = db_node_get_edges(db_graph, node.key, 0); // Check which next nodes are in the given colour BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key); dBNode nodes[4]; Nucleotide nucs[4]; size_t i, n; n = db_graph_next_nodes(db_graph, bkmer, node.orient, edges, nodes, nucs); edges = 0; if(db_graph->node_in_cols != NULL) { for(i = 0; i < n; i++) if(db_node_has_col(db_graph, nodes[i].key, col)) edges = edges_set_edge(edges, nucs[i], node.orient); } else if(db_graph->col_covgs != NULL) { for(i = 0; i < n; i++) if(db_node_col_covg(db_graph, nodes[i].key, col) > 0) edges = edges_set_edge(edges, nucs[i], node.orient); } else ctx_assert(0); return edges; }
// Run edge inference on a single node and add one to `*num_nodes_modified`
// if any of the node's edges changed.
// `add_all_edges` selects infer_all_edges() over infer_pop_edges().
// `tmp_covgs` is scratch space with one entry per colour; it is only written
// when the graph does not store per-colour coverages.
// Always returns 0 (=> keep iterating)
static inline int infer_edges_node(hkey_t hkey, bool add_all_edges,
                                   Covg *tmp_covgs, const dBGraph *db_graph,
                                   size_t *num_nodes_modified)
{
  const Covg *covgs;

  if(db_graph->col_covgs != NULL)
  {
    // Use the node's stored coverages directly
    covgs = &db_node_covg(db_graph, hkey, 0);
  }
  else
  {
    // No coverages: fill the scratch array from the colour-membership test
    const size_t ncols = db_graph->num_of_cols;
    size_t c;
    for(c = 0; c < ncols; c++)
      tmp_covgs[c] = db_node_has_col(db_graph, hkey, c);
    covgs = tmp_covgs;
  }

  BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey);
  Edges *edges = &db_node_edges(db_graph, hkey, 0);

  bool changed;
  if(add_all_edges)
    changed = infer_all_edges(bkmer, edges, covgs, db_graph);
  else
    changed = infer_pop_edges(bkmer, edges, covgs, db_graph);

  if(changed) (*num_nodes_modified)++;

  return 0; // => keep iterating
}
// if colour is -1 aligns to all colours, otherwise aligns to given colour only // Returns number of kmers lost from the end static size_t db_alignment_from_read(dBAlignment *aln, const read_t *r, uint8_t qcutoff, uint8_t hp_cutoff, const dBGraph *db_graph, int colour) { size_t contig_start, contig_end = 0, search_start = 0; const size_t kmer_size = db_graph->kmer_size; BinaryKmer bkmer, tmp_key; Nucleotide nuc; hkey_t node; size_t i, offset, nxtbse; dBNodeBuffer *nodes = &aln->nodes; Int32Buffer *rpos = &aln->rpos; ctx_assert(nodes->len == rpos->len); size_t n = nodes->len, init_len = n; db_node_buf_capacity(nodes, n + r->seq.end); int32_buf_capacity(rpos, n + r->seq.end); while((contig_start = seq_contig_start(r, search_start, kmer_size, qcutoff, hp_cutoff)) < r->seq.end) { contig_end = seq_contig_end(r, contig_start, kmer_size, qcutoff, hp_cutoff, &search_start); const char *contig = r->seq.b + contig_start; size_t contig_len = contig_end - contig_start; bkmer = binary_kmer_from_str(contig, kmer_size); bkmer = binary_kmer_right_shift_one_base(bkmer); for(offset=contig_start, nxtbse=kmer_size-1; nxtbse < contig_len; nxtbse++,offset++) { nuc = dna_char_to_nuc(contig[nxtbse]); bkmer = binary_kmer_left_shift_add(bkmer, kmer_size, nuc); tmp_key = binary_kmer_get_key(bkmer, kmer_size); node = hash_table_find(&db_graph->ht, tmp_key); if(node != HASH_NOT_FOUND && (colour == -1 || db_node_has_col(db_graph, node, colour))) { nodes->b[n].key = node; nodes->b[n].orient = bkmer_get_orientation(bkmer, tmp_key); rpos->b[n] = offset; n++; } } } // Return number of bases from the last kmer found until read end size_t ret = (n == init_len ? r->seq.end /* No kmers found */ : r->seq.end - (rpos->b[n-1] + kmer_size)); nodes->len = rpos->len = n; // Check for sequence gaps for(i = init_len; i+1 < nodes->len; i++) { if(rpos->b[i]+1 < rpos->b[i+1]) { aln->seq_gaps = true; break; } } return ret; }
// `fork_node` is a node with outdegree > 1 void find_bubbles(BubbleCaller *caller, dBNode fork_node) { graph_cache_reset(&caller->cache); const dBGraph *db_graph = caller->db_graph; GraphCache *cache = &caller->cache; GraphWalker *wlk = &caller->wlk; RepeatWalker *rptwlk = &caller->rptwlk; // char tmpstr[MAX_KMER_SIZE+3]; // db_node_to_str(db_graph, fork_node, tmpstr); // status("Calling from %s", tmpstr); dBNode nodes[4]; Nucleotide bases[4]; size_t i, num_next, num_edges_in_col; BinaryKmer fork_bkmer = db_node_get_bkmer(db_graph, fork_node.key); num_next = db_graph_next_nodes(db_graph, fork_bkmer, fork_node.orient, db_node_edges(db_graph, fork_node.key, 0), nodes, bases); // loop over alleles, then colours Colour colour, colours_loaded = db_graph->num_of_cols; bool node_has_col[4]; uint32_t pathid; for(colour = 0; colour < colours_loaded; colour++) { if(!db_node_has_col(db_graph, fork_node.key, colour)) continue; // Determine if this fork is a fork in the current colour num_edges_in_col = 0; for(i = 0; i < num_next; i++) { node_has_col[i] = (db_node_has_col(db_graph, nodes[i].key, colour) > 0); num_edges_in_col += node_has_col[i]; } graph_walker_setup(wlk, true, colour, colour, db_graph); for(i = 0; i < num_next; i++) { if(node_has_col[i]) { graph_walker_start(wlk, fork_node); graph_walker_force(wlk, nodes[i], num_edges_in_col > 1); pathid = graph_crawler_load_path_limit(cache, nodes[i], wlk, rptwlk, caller->prefs.max_allele_len); graph_walker_finish(wlk); graph_crawler_reset_rpt_walker(rptwlk, cache, pathid); } } } // Set up 5p flank caller->flank5p.b[0] = db_node_reverse(fork_node); caller->flank5p.len = 0; // set to one to signify we haven't fetched flank yet }