static void _test_add_paths() { test_status("Testing adding paths in generate_paths.c and gpath_fetch()"); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 11, ncols = 1; db_graph_alloc(&graph, kmer_size, ncols, ncols, 1024, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS | DBG_ALLOC_BKTLOCKS | DBG_ALLOC_NODE_IN_COL); // Create a path store that tracks path counts gpath_store_alloc(&graph.gpstore, graph.num_of_cols, graph.ht.capacity, 0, ONE_MEGABYTE, true, false); // Create path hash table for fast lookup gpath_hash_alloc(&graph.gphash, &graph.gpstore, ONE_MEGABYTE); build_graph_from_str_mt(&graph, 0, seq0, strlen(seq0)); build_graph_from_str_mt(&graph, 0, seq1, strlen(seq1)); build_graph_from_str_mt(&graph, 0, seq2, strlen(seq2)); build_graph_from_str_mt(&graph, 0, seq3, strlen(seq3)); // Set up alignment correction params CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0, .frag_len_min = 0, .frag_len_max = 0, .one_way_gap_traverse = true, .use_end_check = true, .max_context = 10, .gap_variance = 0.1, .gap_wiggle = 5}; all_tests_add_paths(&graph, seq0, params, 5, 5); // path lens: 3+3+2+2+2 all_tests_add_paths(&graph, seq1, params, 5, 2); // path lens: 3+3+2+2+2 all_tests_add_paths(&graph, seq2, params, 3, 2); // path lens: 1+1+1 all_tests_add_paths(&graph, seq3, params, 2, 1); // path lens: 1+1 // Test path store gpath_checks_all_paths(&graph, 1); // use one thread // Test path content _check_node_paths(kmerA, kmerApaths, NPATHS_A, 0, &graph); _check_node_paths(kmerB, kmerBpaths, NPATHS_B, 0, &graph); _check_node_paths(kmerAB, kmerABpaths, NPATHS_AB, 0, &graph); _check_node_paths(kmerC, kmerCpaths, NPATHS_C, 0, &graph); _check_node_paths(kmerG, kmerGpaths, NPATHS_G, 0, &graph); _check_node_paths(kmerF, kmerFpaths, NPATHS_F, 0, &graph); _check_node_paths(kmerE, kmerEpaths, NPATHS_E, 0, &graph); _check_node_paths(kmerD, kmerDpaths, NPATHS_D, 0, &graph); _check_node_paths(kmerDEF,kmerDEFpaths,NPATHS_DEF,0, &graph); _check_node_paths(kmerDE, 
kmerDEpaths, NPATHS_DE, 0, &graph); db_graph_dealloc(&graph); } void test_paths() { _test_add_paths(); }
/**
 * Allocate `graph`, load `seqs` into colour 0 and then add each sequence as
 * paths using a single path-generation worker.
 *
 * @param graph        graph to allocate and fill
 * @param kmer_size    kmer size passed to db_graph_alloc()
 * @param ncols        number of colours (also used for the edge colours)
 * @param seqs         sequences to load; not read when nseqs == 0
 * @param nseqs        number of entries in seqs
 * @param path_params  parameters forwarded to gen_paths_from_str_mt()
 */
void _construct_graph_with_paths(dBGraph *graph,
                                 size_t kmer_size, size_t ncols,
                                 char **seqs, size_t nseqs,
                                 CorrectAlnParam path_params)
{
  size_t s;
  GenPathWorker *worker;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024);

  // Per-graph arrays: bucket locks, edges, coverage and node-in-colour bits
  graph->bktlocks
    = ctx_calloc(roundup_bits2bytes(graph->ht.num_of_buckets), 1);
  graph->col_edges
    = ctx_calloc(graph->ht.capacity * ncols, sizeof(Edges));
  graph->col_covgs
    = ctx_calloc(graph->ht.capacity * ncols, sizeof(Covg));
  graph->node_in_cols
    = ctx_calloc(roundup_bits2bytes(graph->ht.capacity) * ncols, 1);

  // Path store and its kmer locks
  path_store_alloc(&graph->pstore, 1024, true, graph->ht.capacity, ncols);
  graph->pstore.kmer_locks
    = ctx_calloc(roundup_bits2bytes(graph->ht.capacity), 1);

  // Load every sequence into colour 0
  for(s = 0; s < nseqs; s++) {
    char *str = seqs[s];
    build_graph_from_str_mt(graph, 0, str, strlen(str));
  }

  // Report at least one colour in use, even if nothing was loaded
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // Add paths for every sequence with one worker
  worker = gen_paths_workers_alloc(1, graph, NULL);

  for(s = 0; s < nseqs; s++) {
    gen_paths_from_str_mt(worker, seqs[s], path_params);
  }

  gen_paths_workers_dealloc(worker, 1);
}
void test_supernode() { test_status("testing supernode_find()..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 19, ncols = 1; db_graph_alloc(&graph, kmer_size, ncols, ncols, 1024, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS | DBG_ALLOC_BKTLOCKS); #define NSEQ 7 const char *seq[NSEQ] = {"AGAGAGAGAGAGAGAGAGAGAGAG", "AAAAAAAAAAAAAAAAAAAAAAAAAA", "ATATATATATATATATATATATATATAT", "CGTTCGCGCATGGCCCACG", "GAACCAATCGGTCGACTGT", "CCCCGCAAAGTCCACTTAGTGTAAGGTACAAATTCTGCAGAGTTGCTGGATCAGCGATAC", "TCAATCCGATAGCAACCCGGTCCAA""TCAATCCGATAGCAACCCGGTCCAA"}; const char *ans[NSEQ] = {"AGAGAGAGAGAGAGAGAGAG", // key AGAGAGAGAGAGAGAGAGA < CTCTCTCTCTCTCTCTCTC "AAAAAAAAAAAAAAAAAAA", "ATATATATATATATATATA", "CGTGGGCCATGCGCGAACG", "ACAGTCGACCGATTGGTTC", "CCCCGCAAAGTCCACTTAGTGTAAGGTACAAATTCTGCAGAGTTGCTGGATCAGCGATAC", "AACCCGGTCCAATCAATCCGATAGCAACCCGGTCCAATCAATC"}; // Load all seq into colour 0 size_t i; for(i = 0; i < NSEQ; i++) build_graph_from_str_mt(&graph, 0, seq[i], strlen(seq[i]), false); pull_out_supernodes(seq, ans, NSEQ, &graph); db_graph_dealloc(&graph); }
/**
 * Add each of `seqs` to `graph` as paths using a single worker, then
 * optionally check how many paths / kmers-with-paths were added.
 *
 * @param graph       graph whose path store (gpstore) receives the paths
 * @param seqs        sequences to add
 * @param nseqs       number of entries in seqs
 * @param params      correction parameters stored in the worker task
 * @param exp_npaths  expected increase in gpstore.num_paths; negative to
 *                    skip the check
 * @param exp_nkmers  expected increase in gpstore.num_kmers_with_paths;
 *                    negative to skip the check
 */
void all_tests_add_paths_multi(dBGraph *graph, const char **seqs, size_t nseqs,
                               CorrectAlnParam params,
                               int exp_npaths, int exp_nkmers)
{
  // Counts before adding anything, so the checks below compare increases
  size_t npaths = graph->gpstore.num_paths;
  size_t nkmers = graph->gpstore.num_kmers_with_paths;

  size_t s;
  size_t nworkers = 1;
  GenPathWorker *workers = gen_paths_workers_alloc(nworkers, graph);

  // Describe the (file-less) input and the task handed to the worker
  AsyncIOInput files = {
    .file1 = NULL,
    .file2 = NULL,
    .fq_offset = 0,
    .interleaved = false
  };
  CorrectAlnInput input = {
    .files = files,
    .fq_cutoff = 0,
    .hp_cutoff = 0,
    .matedir = READPAIR_FR,
    .crt_params = params,
    .out_base = NULL,
    .output = NULL
  };

  // Only the first read (r1) is set per sequence; r2 is reset once here
  AsyncIOData data;
  asynciodata_alloc(&data);
  seq_read_reset(&data.r2);
  data.fq_offset2 = 0;
  data.fq_offset1 = 0;
  data.ptr = NULL;

  // Add paths
  for(s = 0; s < nseqs; s++)
  {
    seq_read_set(&data.r1, seqs[s]);
    gen_paths_worker_seq(workers, &data, &input);
  }

  asynciodata_dealloc(&data);
  gen_paths_workers_dealloc(workers, nworkers);

  // Check we added the right number of paths
  if(exp_npaths >= 0) {
    TASSERT2(graph->gpstore.num_paths == npaths + (size_t)exp_npaths, "%zu %zu %zu", (size_t)graph->gpstore.num_paths, (size_t)npaths, (size_t)exp_npaths);
  }

  // ... and touched the right number of kmers
  if(exp_nkmers >= 0) {
    TASSERT(graph->gpstore.num_kmers_with_paths == nkmers + (size_t)exp_nkmers);
  }
}

/**
 * Single-sequence convenience wrapper around all_tests_add_paths_multi().
 */
void all_tests_add_paths(dBGraph *graph, const char *seq,
                         CorrectAlnParam params,
                         int exp_npaths, int exp_nkmers)
{
  all_tests_add_paths_multi(graph, &seq, 1, params, exp_npaths, exp_nkmers);
}

/**
 * Allocate `graph` with a path store and path hash table, load `seqs` into
 * colour 0, then add the same sequences as paths (without count checks).
 *
 * @param graph        graph to allocate and fill
 * @param kmer_size    kmer size passed to db_graph_alloc()
 * @param ncols        number of colours
 * @param seqs         sequences to load and to add as paths
 * @param nseqs        number of entries in seqs
 * @param path_params  correction parameters used when adding paths
 */
void all_tests_construct_graph(dBGraph *graph,
                               size_t kmer_size, size_t ncols,
                               const char **seqs, size_t nseqs,
                               CorrectAlnParam path_params)
{
  size_t s;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024,
                 DBG_ALLOC_EDGES | DBG_ALLOC_COVGS |
                 DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  // Path data
  gpath_store_alloc(&graph->gpstore, ncols, graph->ht.capacity,
                    0, ONE_MEGABYTE, true, false);

  // Don't use links to add new links
  gpath_store_split_read_write(&graph->gpstore);

  // Allocate path hash table just in case
  gpath_hash_alloc(&graph->gphash, &graph->gpstore, ONE_MEGABYTE);

  // Build graph
  for(s = 0; s < nseqs; s++) {
    const char *str = seqs[s];
    build_graph_from_str_mt(graph, 0, str, strlen(str), false);
  }

  gpath_store_merge_read_write(&graph->gpstore);

  // Report at least one colour in use, even if nothing was loaded
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  // -1, -1: skip the path / kmer count checks
  all_tests_add_paths_multi(graph, seqs, nseqs, path_params, -1, -1);
}
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, 
nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
static void test_repeat_loop() { TASSERT(sizeof(FollowPath) == 20); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 11, ncols = 1; // Set up alignment correction params CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0, .ins_gap_min = 0, .ins_gap_max = 0, .one_way_gap_traverse = true, .use_end_check = true, .max_context = 10, .gap_variance = 0.1, .gap_wiggle = 5}; // Sequence with repeat char seq[] = "ATTTGGAACTCCGGA" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "CGTCAGGAGCTAACT"; char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT"; char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; // Allocate graph, but don't add any sequence _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params); GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL); GraphWalker gwlk; RepeatWalker rptwlk; graph_walker_alloc(&gwlk); rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12); dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // Construct graph but no paths build_graph_from_str_mt(&graph, 0, seq, strlen(seq)); TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers); // Find first node in sequence dBNode node0 = db_graph_find_str(&graph, seq); TASSERT(node0.key != HASH_NOT_FOUND); // 1) With no paths char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0); // 2) Add small paths - produces collapsed down seq with two copy repeat gen_paths_from_str_mt(gen_path_wrkr, p0, params); gen_paths_from_str_mt(gen_path_wrkr, p1, params); char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1); // 3) Add long paths gen_paths_from_str_mt(gen_path_wrkr, seq, params); test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq); 
graph_walker_dealloc(&gwlk); rpt_walker_dealloc(&rptwlk); db_node_buf_dealloc(&nbuf); gen_paths_workers_dealloc(gen_path_wrkr, 1); db_graph_dealloc(&graph); } void test_repeat_walker() { test_status("Testing repeat_walker.h"); test_repeat_loop(); }
// Load each sequence into a separate colour static void test_bubbles(dBGraph *graph, const char **seqs, size_t nseqs, const char *flank5p, const char *flank3p, const char **alleles, size_t nalleles) { db_graph_reset(graph); TASSERT(graph->num_of_cols >= nseqs); size_t i; for(i = 0; i < nseqs; i++) build_graph_from_str_mt(graph, i, seqs[i], strlen(seqs[i]), false); graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1); StrBuf sbuf; dBNodeBuffer nbuf; strbuf_alloc(&sbuf, 128); db_node_buf_alloc(&nbuf, 128); BubbleCallingPrefs prefs = {.max_allele_len = 100, .max_flank_len = 100, .haploid_cols = NULL, .nhaploid_cols = 0, .remove_serial_bubbles = true}; BubbleCaller *caller = bubble_callers_new(1, &prefs, NULL, graph); _call_bubble(caller, flank5p, flank3p, alleles, nalleles, &nbuf, &sbuf); strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); bubble_callers_destroy(caller, 1); } void test_bubble_caller() { test_status("Testing bubble calling..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // mutations: x const char *seqs0[] = {"AGGGATAAAACTCTGTACTGGATCTCCCT", "AGGGATAAAACTCTcTACTGGATCTCCCT"}; const char flank5p0[] = "AGGGATAAAACTCT"; const char flank3p0[] = "TACTGGATCTCCCT"; const char *alleles0[] = {"ATAAAACTCTGTACTGGATCT", "ATAAAACTCTcTACTGGATCT"}; test_bubbles(&graph, seqs0, 2, flank5p0, flank3p0, alleles0, 2); // mutations: x y const char *seqs1[] = {"CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA", "CCCGTAGGTAAGtGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA", "CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACtTTGGGACACGAGTTGATA"}; // forwards const char flank5p1a[] = "CCCGTAGGTAAG"; const char flank3p1a[] = "GCGTTAGTGCAAGGCCAC"; const char *alleles1a[] = {"CGTAGGTAAGGGCGTTAGTGC", "CGTAGGTAAGtGCGTTAGTGC"}; const char flank5p1b[] = "GCGTTAGTGCAAGGCCAC"; const char flank3p1b[] = "TTGGGACACGAGTTGATA"; 
const char *alleles1b[] = {"GCAAGGCCACATTGGGACACG", "GCAAGGCCACtTTGGGACACG"}; test_bubbles(&graph, seqs1, 3, flank5p1a, flank3p1a, alleles1a, 2); test_bubbles(&graph, seqs1, 3, flank5p1b, flank3p1b, alleles1b, 2); // reverse // mutations: y x // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCCCTTACCTACGGG // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCaCTTACCTACGGG // TATCAACTCGTGTCCCAAaGTGGCCTTGCACTAACGCCCTTACCTACGGG // const char flank5p1c[] = "GTGGCCTTGCACTAACGC"; const char flank3p1c[] = "CTTACCTACGGG"; const char *alleles1c[] = {"GCACTAACGCCCTTACCTACG", "GCACTAACGCaCTTACCTACG"}; const char flank5p1d[] = "TATCAACTCGTGTCCCAA"; const char flank3p1d[] = "GTGGCCTTGCACTAACGC"; const char *alleles1d[] = {"CGTGTCCCAATGTGGCCTTGC", "CGTGTCCCAAaGTGGCCTTGC"}; test_bubbles(&graph, seqs1, 3, flank5p1c, flank3p1c, alleles1c, 2); test_bubbles(&graph, seqs1, 3, flank5p1d, flank3p1d, alleles1d, 2); db_graph_dealloc(&graph); }