// Orient supernode // Once oriented, supernode has lowest possible kmerkey at the beginning, // oriented FORWARDs if possible void supernode_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph) { // Sort supernode into forward orientation ctx_assert(len > 0); if(len == 1) { nlist[0].orient = FORWARD; return; } BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, nlist[0].key); BinaryKmer bkmer1 = db_node_get_bkmer(db_graph, nlist[len-1].key); // Check if closed cycle if(supernode_is_closed_cycle(nlist, len, bkmer0, bkmer1, db_graph)) { // find lowest kmer to start from BinaryKmer lowest = bkmer0, tmp; size_t i, idx = 0; for(i = 1; i < len; i++) { tmp = db_node_get_bkmer(db_graph, nlist[i].key); if(binary_kmer_less_than(tmp, lowest)) { lowest = tmp; idx = i; } } // If already starting from the lowest kmer no change needed if(idx > 0 || nlist[0].orient != FORWARD) { // a->b->c->d->e->f->a // if c is lowest and FORWARD: c->d->e->f->a->b (keep orientations) // if c is lowest and REVERSE: c->b->a->f->e->d (reverse orientations) if(nlist[idx].orient == FORWARD) { // Shift left by idx, without affecting orientations db_nodes_left_shift(nlist, len, idx); } else { db_nodes_reverse_complement(nlist, idx+1); db_nodes_reverse_complement(nlist+idx+1, len-idx-1); } } } else if(binary_kmer_less_than(bkmer1,bkmer0)) { db_nodes_reverse_complement(nlist, len); } }
static void test_kmer_occur_filter() { // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; size_t i; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // xyz------->>> y > < X // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA #define NUM_NODES 3 #define NUM_READS 3 const char *tmp[NUM_READS] = { "AACA", "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA", "TCTAGCATGTGTGTT"}; read_t reads[NUM_READS]; for(i = 0; i < NUM_READS; i++) { seq_read_alloc(&reads[i]); seq_read_set(&reads[i], tmp[i]); } KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph); TASSERT(kograph.nchroms == NUM_READS); TASSERT(kograph.koccurs != NULL); KOccurRunBuffer koruns, koruns_tmp, koruns_ended; korun_buf_alloc(&koruns, 16); korun_buf_alloc(&koruns_tmp, 16); korun_buf_alloc(&koruns_ended, 16); // Check CCCGACAGGGCAA starts at CCCGACAGGGC // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG dBNode nodes[NUM_NODES]; for(i = 0; i < NUM_NODES; i++) nodes[i] = db_graph_find_str(&graph, &"CCCGACAGGGCAA"[i]); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); // Checks TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len); TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last); // Test reverse db_nodes_reverse_complement(nodes, NUM_NODES); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, 1, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); kograph_filter_extend(&kograph, nodes+1, 1, true, 0, 1, &koruns, &koruns_tmp, 
&koruns_ended); kograph_filter_extend(&kograph, nodes+2, 1, true, 0, 2, &koruns, &koruns_tmp, &koruns_ended); // Print out for debugging // printf("koruns: "); // koruns_print(koruns.b, koruns.len, kmer_size, stdout); // printf("\nkoruns_ended: "); // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout); // printf("\n"); // Check results match: // koruns: chromid:1:17-5:-, chromid:1:37-47:+ // koruns_ended: chromid:1:34-24:- TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len); TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len); TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last); korun_buf_dealloc(&koruns); korun_buf_dealloc(&koruns_tmp); korun_buf_dealloc(&koruns_ended); for(i = 0; i < NUM_READS; i++) seq_read_dealloc(&reads[i]); kograph_dealloc(&kograph); db_graph_dealloc(&graph); }
// Potential bubble - filter ref and duplicate alleles static void print_bubble(BubbleCaller *caller, GCacheStep **steps, size_t num_paths) { const BubbleCallingPrefs prefs = caller->prefs; const dBGraph *db_graph = caller->db_graph; GCacheSnode *snode; size_t i; dBNodeBuffer *flank5p = &caller->flank5p; if(flank5p->len == 0) { // Haven't fetched 5p flank yet // flank5p[0] already contains the first node flank5p->len = 1; supernode_extend(flank5p, prefs.max_flank_len, db_graph); db_nodes_reverse_complement(flank5p->b, flank5p->len); } // // Print Bubble // // write to string buffer then flush to gzFile StrBuf *sbuf = &caller->output_buf; strbuf_reset(sbuf); // Temporary node buffer to use dBNodeBuffer *pathbuf = &caller->pathbuf; db_node_buf_reset(pathbuf); // Get bubble number (threadsafe num_bubbles_ptr++) size_t id = __sync_fetch_and_add((volatile size_t*)caller->num_bubbles_ptr, 1); // This can be set to anything without a '.' in it const char prefix[] = "call"; // 5p flank // strbuf_sprintf(sbuf, ">bubble.%s%zu.5pflank kmers=%zu\n", prefix, id, flank5p->len); strbuf_append_str(sbuf, ">bubble."); strbuf_append_str(sbuf, prefix); strbuf_append_ulong(sbuf, id); strbuf_append_str(sbuf, ".5pflank kmers="); strbuf_append_ulong(sbuf, flank5p->len); strbuf_append_char(sbuf, '\n'); branch_to_str(flank5p->b, flank5p->len, true, sbuf, db_graph); // 3p flank db_node_buf_reset(pathbuf); snode = graph_cache_snode(&caller->cache, steps[0]->supernode); graph_cache_snode_fetch_nodes(&caller->cache, snode, steps[0]->orient, pathbuf); // strbuf_sprintf(sbuf, ">bubble.%s%zu.3pflank kmers=%zu\n", prefix, id, pathbuf->len); strbuf_append_str(sbuf, ">bubble."); strbuf_append_str(sbuf, prefix); strbuf_append_ulong(sbuf, id); strbuf_append_str(sbuf, ".3pflank kmers="); strbuf_append_ulong(sbuf, pathbuf->len); strbuf_append_char(sbuf, '\n'); branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph); // Print alleles for(i = 0; i < num_paths; i++) { db_node_buf_reset(pathbuf); 
graph_cache_step_fetch_nodes(&caller->cache, steps[i], pathbuf); // strbuf_sprintf(sbuf, ">bubble.%s%zu.branch.%zu kmers=%zu\n", // prefix, id, i, pathbuf->len); strbuf_append_str(sbuf, ">bubble."); strbuf_append_str(sbuf, prefix); strbuf_append_ulong(sbuf, id); strbuf_append_str(sbuf, ".branch."); strbuf_append_ulong(sbuf, i); strbuf_append_str(sbuf, " kmers="); strbuf_append_ulong(sbuf, pathbuf->len); strbuf_append_char(sbuf, '\n'); branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph); } strbuf_append_char(sbuf, '\n'); ctx_assert(strlen(sbuf->b) == sbuf->end); // lock, print, unlock pthread_mutex_lock(caller->out_lock); gzwrite(caller->gzout, sbuf->b, sbuf->end); pthread_mutex_unlock(caller->out_lock); }
// Walk the graph remembering the last time we met the ref // When traversal fails, dump sequence up to last meeting with the ref static void follow_break(BreakpointCaller *caller, dBNode node) { size_t i, j, k, num_next; dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t nonref_idx[4], num_nonref_next = 0; const dBGraph *db_graph = caller->db_graph; BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key); Edges edges = db_node_get_edges(db_graph, node.key, 0); num_next = db_graph_next_nodes(db_graph, bkey, node.orient, edges, next_nodes, next_nucs); // Filter out next nodes in the reference for(i = 0; i < num_next; i++) { if(kograph_num(caller->kograph, next_nodes[i].key) == 0) { nonref_idx[num_nonref_next] = i; num_nonref_next++; } } // Abandon if all options are in ref or none are if(num_nonref_next == num_next || num_nonref_next == 0) return; // Follow all paths not in ref, in all colours GraphCrawler *fw_crawler = &caller->crawlers[node.orient]; GraphCrawler *rv_crawler = &caller->crawlers[!node.orient]; dBNodeBuffer *allelebuf = &caller->allelebuf, *flank5pbuf = &caller->flank5pbuf; GCMultiColPath *flank5p_multicolpath, *allele_multicolpath; KOccurRun *flank5p_runs, *flank3p_runs; size_t flank5p_pathid, allele_pathid; size_t num_flank5p_runs, num_flank3p_runs; // We fetch 5' flanks in all colours then merge matching paths // we stop fetching a single path if it stops tracking the reference // Alternatively, we could fetch the 5' flank in everyone and stop after a // given distance, then check for that set of paths how much it tracks the // reference. This has the advantage of scaling much better with number of // samples, but not so well as min_ref_nkmers increases (since we fetch // many flanks that can't be used) - I think this is less of a worry. 
// Loop over possible next nodes at this junction for(i = 0; i < num_nonref_next; i++) { size_t next_idx = nonref_idx[i]; // Go backwards to get 5p flank traverse_5pflank(caller, rv_crawler, db_node_reverse(next_nodes[next_idx]), db_node_reverse(node)); // Loop over the flanks we got for(j = 0; j < rv_crawler->num_paths; j++) { // Get 5p flank db_node_buf_reset(flank5pbuf); graph_crawler_get_path_nodes(rv_crawler, j, flank5pbuf); flank5p_multicolpath = &rv_crawler->multicol_paths[j]; flank5p_pathid = flank5p_multicolpath->pathid; // Fetch 3pflank ref position num_flank5p_runs = caller->flank5p_refs[flank5p_pathid].num_runs; flank5p_runs = fetch_ref_contact(&rv_crawler->cache, flank5p_pathid, caller->flank5p_refs, &caller->flank5p_run_buf); koruns_reverse(flank5p_runs, num_flank5p_runs, flank5pbuf->len); koruns_sort_by_qoffset(flank5p_runs, num_flank5p_runs); db_nodes_reverse_complement(flank5pbuf->data, flank5pbuf->len); if(num_flank5p_runs > 0) { // Reset caller kmer_run_buf_reset(&caller->koruns_3p); kmer_run_buf_reset(&caller->koruns_3p_ended); kmer_run_buf_reset(&caller->allele_run_buf); // functions gcrawler_path_stop_at_ref_covg(), // gcrawler_path_finish_ref_covg() // both fill koruns_3p, koruns_3p_ended and allele_run_buf // Only traverse in the colours we have a flank for graph_crawler_fetch(fw_crawler, node, next_nodes, next_nucs, next_idx, num_next, flank5p_multicolpath->cols, flank5p_multicolpath->num_cols, gcrawler_path_stop_at_ref_covg, gcrawler_path_finish_ref_covg, caller); // Assemble contigs - fetch forwards for each path for given 5p flank for(k = 0; k < fw_crawler->num_paths; k++) { // Fetch nodes db_node_buf_reset(allelebuf); graph_crawler_get_path_nodes(fw_crawler, k, allelebuf); ctx_assert(allelebuf->len > 0); allele_multicolpath = &fw_crawler->multicol_paths[k]; allele_pathid = allele_multicolpath->pathid; // Fetch 3pflank ref position num_flank3p_runs = caller->allele_refs[allele_pathid].num_runs; flank3p_runs = 
fetch_ref_contact(&fw_crawler->cache, allele_pathid, caller->allele_refs, &caller->allele_run_buf); process_contig(caller, allele_multicolpath->cols, allele_multicolpath->num_cols, flank5pbuf, allelebuf, flank5p_runs, num_flank5p_runs, flank3p_runs, num_flank3p_runs); } } } } }
// Test a single node B: walk from B to find A, traverse A->B and on past B to
// reach C, then walk again from B and record whether or not we reach C.
//
// node: the node B to test
// wrkr: worker holding the graph, walkers, node buffer and settings
//
// Returns one of the RES_* codes:
//   RES_LOST_IN_RPT   a repeat walker traversal attempt failed
//   RES_NO_TRAVERSAL  no nodes were added walking from B, or none past B
//   RES_AB_WRONG / RES_AB_FAILED                      A->B step (not priming)
//   RES_BC_OVERSHOT / RES_BC_WRONG / RES_BC_FAILED    B->C step
//   RES_ABC_SUCCESS   B->C confirmed
static inline int test_statement_node(dBNode node, ExpABCWorker *wrkr)
{
  const dBGraph *db_graph = wrkr->db_graph;
  dBNodeBuffer *nbuf = &wrkr->nbuf;
  GraphWalker *wlk = &wrkr->gwlk;
  RepeatWalker *rpt = &wrkr->rptwlk;
  const size_t col = wrkr->colour;
  const size_t walk_limit = wrkr->max_AB_dist;

  // Node buffer starts out holding only B
  db_node_buf_reset(nbuf);
  db_node_buf_add(nbuf, node);

  // Walk from B to find A
  graph_walker_setup(wlk, true, col, col, db_graph);
  graph_walker_start(wlk, nbuf->b[0]);

  for(;;)
  {
    // The walker is stepped before the length limit is checked
    if(!graph_walker_next(wlk) || nbuf->len >= walk_limit) break;

    if(!rpt_walker_attempt_traverse(rpt, wlk)) {
      reset(wlk,rpt,nbuf);
      return RES_LOST_IN_RPT;
    }
    db_node_buf_add(nbuf, wlk->node);
  }

  reset(wlk,rpt,nbuf);

  // Buffer still only holds B
  if(nbuf->len == 1) return RES_NO_TRAVERSAL;

  // Traverse A->B
  // After reversing, B is the last node in the buffer
  db_nodes_reverse_complement(nbuf->b, nbuf->len);
  const size_t b_idx = nbuf->len - 1;

  if(wrkr->prime_AB)
  {
    // Prime A->B without attempting to cross
    graph_walker_prime(wlk, nbuf->b, nbuf->len, nbuf->len, true);

    // Extend past B with no length limit
    while(graph_walker_next(wlk))
    {
      if(!rpt_walker_attempt_traverse(rpt, wlk)) {
        reset(wlk,rpt,nbuf);
        return RES_LOST_IN_RPT;
      }
      db_node_buf_add(nbuf, wlk->node);
    }
  }
  else
  {
    // Attempt to traverse A->B then extend past B
    int ab_res = confirm_seq(0, true, wlk, rpt, nbuf, col, db_graph);

    if(ab_res == CONFIRM_REPEAT) return RES_LOST_IN_RPT;

    if(ab_res == CONFIRM_OVERSHOT) {
      ctx_assert2(0,"Can't 'overshoot' when extending");
    }

    // If the assert above does not stop the program, an overshoot is
    // reported in the same way as a wrong result
    if(ab_res == CONFIRM_OVERSHOT || ab_res == CONFIRM_WRONG)
      return RES_AB_WRONG;

    if(ab_res == CONFIRM_SHORT) {
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, true, wrkr->prime_AB);
      wrkr->ab_fail_state[wlk->last_step.status]++;
      return RES_AB_FAILED;
    }
    // Any other result carries on below
  }

  reset(wlk,rpt,nbuf);

  if(nbuf->len == b_idx+1) return RES_NO_TRAVERSAL; // Couldn't get past B

  // Last node is now C
  // Walk from B... record whether or not we reach C
  ctx_assert(db_nodes_are_equal(nbuf->b[b_idx], db_node_reverse(node)));

  int bc_res = confirm_seq(b_idx, false, wlk, rpt, nbuf, col, db_graph);

  switch(bc_res)
  {
    case CONFIRM_REPEAT:
      return RES_LOST_IN_RPT;
    case CONFIRM_OVERSHOT:
      return RES_BC_OVERSHOT;
    case CONFIRM_WRONG:
      return RES_BC_WRONG;
    case CONFIRM_SHORT:
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, false, wrkr->prime_AB);
      wrkr->bc_fail_state[wlk->last_step.status]++;
      return RES_BC_FAILED;
    case CONFIRM_SUCCESS:
      return RES_ABC_SUCCESS;
    default:
      break;
  }

  die("Shouldn't reach here: r=%i", bc_res);
  return -1;
}