// `nbuf` and `sbuf` are temporary variables used by this function static void _call_bubble(BubbleCaller *caller, const char *flank5p, const char *flank3p, const char **alleles, size_t num_alleles, dBNodeBuffer *nbuf, StrBuf *sbuf) { const dBGraph *graph = caller->db_graph; const size_t kmer_size = graph->kmer_size; dBNode node5p = db_graph_find_str(graph, flank5p+strlen(flank5p)-kmer_size); dBNode node3p = db_graph_find_str(graph, flank3p); TASSERT(node5p.key != HASH_NOT_FOUND); TASSERT(node3p.key != HASH_NOT_FOUND); Edges edges5p = db_node_get_edges_union(graph, node5p.key); Edges edges3p = db_node_get_edges_union(graph, node3p.key); TASSERT(edges_get_outdegree(edges5p, node5p.orient) > 1); TASSERT(edges_get_indegree(edges3p, node3p.orient) > 1); find_bubbles(caller, node5p); GCacheUnitig *snode3p; Orientation snorient3p; GCacheStepPtrBuf *stepbuf; // Get 3p flank and orientation snode3p = graph_cache_find_unitig(&caller->cache, node3p); TASSERT(snode3p != NULL); snorient3p = gc_unitig_get_orient(&caller->cache, snode3p, node3p); find_bubbles_ending_with(caller, snode3p); stepbuf = (snorient3p == FORWARD ? &caller->spp_forward : &caller->spp_reverse); _check_alleles(&caller->cache, stepbuf, alleles, num_alleles, nbuf, sbuf); }
// Returns 1 if a read is a substring of ANY read in the list or a complete // match with a read before it in the list. Returns <= 0 otherwise. // 1 => is substr // 0 => not substr // -1 => not enough bases of ACGT static int _is_substr(const ReadBuffer *rbuf, size_t idx, const KOGraph *kograph, const dBGraph *db_graph) { const size_t kmer_size = db_graph->kmer_size; const read_t *r = &rbuf->b[idx], *r2; size_t contig_start; contig_start = seq_contig_start(r, 0, kmer_size, 0, 0); if(contig_start >= r->seq.end) return -1; // No kmers in this sequence dBNode node = db_graph_find_str(db_graph, r->seq.b+contig_start); ctx_assert(node.key != HASH_NOT_FOUND); // expect at least one hit (for this read!) ctx_assert(kograph_occurs(kograph, node.key)); KOccur *hit; for(hit = kograph_get(kograph, node.key); 1; hit++) { if(hit->chrom != idx) { r2 = &rbuf->b[hit->chrom]; // A read is a duplicate (i.e. return 1) if it is a substring of ANY // read in the list or a complete match with a read before it in the list. // That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end) // since identical strings have equal length if(hit->chrom < idx || r->seq.end < r2->seq.end) { if(hit->orient == node.orient) { // potential FORWARD match if(hit->offset >= contig_start && hit->offset + r->seq.end <= r2->seq.end && strncasecmp(r->seq.b, r2->seq.b+hit->offset-contig_start, r->seq.end) == 0) { return 1; } } else { // potential REVERSE match // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after // the first valid kmer size_t r1_rem = r->seq.end - (contig_start + kmer_size); size_t r2_rem = r2->seq.end - (hit->offset + kmer_size); if(r1_rem <= hit->offset && r2_rem >= contig_start && dna_revncasecmp(r->seq.b, r2->seq.b+hit->offset-r1_rem, r->seq.end) == 0) { return 1; } } } } if(!hit->next) break; } return 0; }
static void _check_node_paths(const char *kmer, const char **path_strs, size_t npaths, size_t colour, const dBGraph *graph) { TASSERT(strlen(kmer) == graph->kmer_size); const GPath *paths[npaths]; // corresponding to path_strs memset(paths, 0, sizeof(paths)); size_t i, num_paths_seen = 0; const GPathStore *gpstore = &graph->gpstore; dBNode node = db_graph_find_str(graph, kmer); const GPath *path = gpath_store_fetch_traverse(gpstore, node.key); dBNodeBuffer nbuf; SizeBuffer jposbuf; db_node_buf_alloc(&nbuf, 64); size_buf_alloc(&jposbuf, 64); #define MAX_SEQ 128 char seq[MAX_SEQ]; for(; path != NULL; path = path->next) { if(path->orient == node.orient && gpath_has_colour(path, gpstore->gpset.ncols, colour)) { TASSERT(num_paths_seen < npaths); db_node_buf_reset(&nbuf); gpath_fetch(node, path, &nbuf, &jposbuf, colour, graph); if(nbuf.len > MAX_SEQ) die("Too many nodes. Cannot continue. %zu", nbuf.len); db_nodes_to_str(nbuf.b, nbuf.len, graph, seq); TASSERT(strlen(seq) == graph->kmer_size + nbuf.len - 1); for(i = 0; i < npaths; i++) { if(strcmp(path_strs[i],seq) == 0) { TASSERT(paths[i] == NULL, "Duplicate paths: %s", seq); paths[i] = path; break; } } TASSERT2(i < npaths, "Path not found: %s", seq); num_paths_seen++; } } TASSERT(num_paths_seen == npaths); for(i = 0; i < npaths; i++) { TASSERT2(paths[i] != NULL, "path not in graph: %s", path_strs[i]); } db_node_buf_dealloc(&nbuf); size_buf_dealloc(&jposbuf); }
static void pull_out_supernodes(const char **seq, const char **ans, size_t n, const dBGraph *graph) { dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // 1. Check pulling out supernodes works for iterating over the graph uint64_t *visited; visited = ctx_calloc(roundup_bits2words64(graph->ht.capacity), 8); HASH_ITERATE(&graph->ht, supernode_from_kmer, &nbuf, visited, graph, ans, n); ctx_free(visited); // 2. Check pulling out supernodes works when we iterate over inputs size_t i, j, len; dBNode node; char tmpstr[SNODEBUF]; for(i = 0; i < n; i++) { len = strlen(seq[i]); for(j = 0; j+graph->kmer_size <= len; j++) { // Find node node = db_graph_find_str(graph, seq[i]+j); TASSERT(node.key != HASH_NOT_FOUND); // Fetch supernode db_node_buf_reset(&nbuf); supernode_find(node.key, &nbuf, graph); supernode_normalise(nbuf.b, nbuf.len, graph); // Compare TASSERT(nbuf.len < SNODEBUF); db_nodes_to_str(nbuf.b, nbuf.len, graph, tmpstr); if(strcmp(tmpstr, ans[i]) != 0) { test_status("Got: %s from ans[i]:%s\n", tmpstr, ans[i]); } TASSERT(strcmp(tmpstr, ans[i]) == 0); } } db_node_buf_dealloc(&nbuf); }
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
static void test_repeat_loop() { TASSERT(sizeof(FollowPath) == 20); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 11, ncols = 1; // Set up alignment correction params CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0, .ins_gap_min = 0, .ins_gap_max = 0, .one_way_gap_traverse = true, .use_end_check = true, .max_context = 10, .gap_variance = 0.1, .gap_wiggle = 5}; // Sequence with repeat char seq[] = "ATTTGGAACTCCGGA" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "CGTCAGGAGCTAACT"; char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT"; char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; // Allocate graph, but don't add any sequence _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params); GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL); GraphWalker gwlk; RepeatWalker rptwlk; graph_walker_alloc(&gwlk); rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12); dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // Construct graph but no paths build_graph_from_str_mt(&graph, 0, seq, strlen(seq)); TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers); // Find first node in sequence dBNode node0 = db_graph_find_str(&graph, seq); TASSERT(node0.key != HASH_NOT_FOUND); // 1) With no paths char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0); // 2) Add small paths - produces collapsed down seq with two copy repeat gen_paths_from_str_mt(gen_path_wrkr, p0, params); gen_paths_from_str_mt(gen_path_wrkr, p1, params); char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1); // 3) Add long paths gen_paths_from_str_mt(gen_path_wrkr, seq, params); test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq); graph_walker_dealloc(&gwlk); rpt_walker_dealloc(&rptwlk); db_node_buf_dealloc(&nbuf); gen_paths_workers_dealloc(gen_path_wrkr, 1); db_graph_dealloc(&graph); } void test_repeat_walker() { test_status("Testing repeat_walker.h"); test_repeat_loop(); }
static void test_kmer_occur_filter() { // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; size_t i; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // xyz------->>> y > < X // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA #define NUM_NODES 3 #define NUM_READS 3 const char *tmp[NUM_READS] = { "AACA", "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA", "TCTAGCATGTGTGTT"}; read_t reads[NUM_READS]; for(i = 0; i < NUM_READS; i++) { seq_read_alloc(&reads[i]); seq_read_set(&reads[i], tmp[i]); } KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph); TASSERT(kograph.nchroms == NUM_READS); TASSERT(kograph.koccurs != NULL); KOccurRunBuffer koruns, koruns_tmp, koruns_ended; korun_buf_alloc(&koruns, 16); korun_buf_alloc(&koruns_tmp, 16); korun_buf_alloc(&koruns_ended, 16); // Check CCCGACAGGGCAA starts at CCCGACAGGGC // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG dBNode nodes[NUM_NODES]; for(i = 0; i < NUM_NODES; i++) nodes[i] = db_graph_find_str(&graph, &"CCCGACAGGGCAA"[i]); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); // Checks TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len); TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last); // Test reverse db_nodes_reverse_complement(nodes, NUM_NODES); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, 1, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); kograph_filter_extend(&kograph, nodes+1, 1, true, 0, 1, &koruns, &koruns_tmp, &koruns_ended); kograph_filter_extend(&kograph, nodes+2, 1, true, 0, 2, &koruns, &koruns_tmp, &koruns_ended); // Print out for debugging // printf("koruns: "); // koruns_print(koruns.b, koruns.len, kmer_size, stdout); // printf("\nkoruns_ended: "); // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout); // printf("\n"); // Check results match: // koruns: chromid:1:17-5:-, chromid:1:37-47:+ // koruns_ended: chromid:1:34-24:- TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len); TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len); TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last); korun_buf_dealloc(&koruns); korun_buf_dealloc(&koruns_tmp); korun_buf_dealloc(&koruns_ended); for(i = 0; i < NUM_READS; i++) seq_read_dealloc(&reads[i]); kograph_dealloc(&kograph); db_graph_dealloc(&graph); }