// Walk the graph from node0, recording each visited node in nbuf, then check
// that exactly `expnkmers` nodes were visited and that they spell out `ans`.
// Before returning, the graph walker is finished and the repeat walker is
// cleared of the nodes that were visited.
static void test_walk(GraphWalker *gwlk, RepeatWalker *rptwlk,
                      dBNode node0, dBNodeBuffer *nbuf,
                      const dBGraph *graph,
                      size_t expnkmers, const char *ans)
{
  db_node_buf_reset(nbuf);
  graph_walker_init(gwlk, graph, 0, 0, node0);

  // The current node is always recorded; stop as soon as the graph walker
  // cannot advance or the repeat walker rejects the step
  for(;;)
  {
    db_node_buf_add(nbuf, gwlk->node);
    if(!graph_walker_next(gwlk)) break;
    if(!rpt_walker_attempt_traverse(rptwlk, gwlk)) break;
  }

  // Uncomment to debug:
  // db_nodes_print(nbuf->data, nbuf->len, graph, stdout);
  // printf("\n");
  // printf("%s\n", graph_step_str[gwlk->last_step.status]);

  TASSERT2(nbuf->len == expnkmers, "%zu / %zu", nbuf->len, expnkmers);

  // Compare the walked sequence against the expected answer
  char walked[nbuf->len+MAX_KMER_SIZE];
  db_nodes_to_str(nbuf->data, nbuf->len, graph, walked);
  TASSERT2(strcmp(walked,ans) == 0, "%s vs %s", walked, ans);

  graph_walker_finish(gwlk);
  rpt_walker_fast_clear(rptwlk, nbuf->data, nbuf->len);
}
// Check that num_to_str() writes non-finite values as "Inf" / "NaN".
static void test_util_num_to_str()
{
  char buf[100];

  // Infinity
  TASSERT2(strcmp(num_to_str(INFINITY, 2, buf),"Inf") == 0, "Got: %s", buf);

  // Not-a-number
  TASSERT2(strcmp(num_to_str(NAN, 2, buf),"NaN") == 0, "Got: %s", buf);
}
static void _check_node_paths(const char *kmer, const char **path_strs, size_t npaths, size_t colour, const dBGraph *graph) { TASSERT(strlen(kmer) == graph->kmer_size); const GPath *paths[npaths]; // corresponding to path_strs memset(paths, 0, sizeof(paths)); size_t i, num_paths_seen = 0; const GPathStore *gpstore = &graph->gpstore; dBNode node = db_graph_find_str(graph, kmer); const GPath *path = gpath_store_fetch_traverse(gpstore, node.key); dBNodeBuffer nbuf; SizeBuffer jposbuf; db_node_buf_alloc(&nbuf, 64); size_buf_alloc(&jposbuf, 64); #define MAX_SEQ 128 char seq[MAX_SEQ]; for(; path != NULL; path = path->next) { if(path->orient == node.orient && gpath_has_colour(path, gpstore->gpset.ncols, colour)) { TASSERT(num_paths_seen < npaths); db_node_buf_reset(&nbuf); gpath_fetch(node, path, &nbuf, &jposbuf, colour, graph); if(nbuf.len > MAX_SEQ) die("Too many nodes. Cannot continue. %zu", nbuf.len); db_nodes_to_str(nbuf.b, nbuf.len, graph, seq); TASSERT(strlen(seq) == graph->kmer_size + nbuf.len - 1); for(i = 0; i < npaths; i++) { if(strcmp(path_strs[i],seq) == 0) { TASSERT(paths[i] == NULL, "Duplicate paths: %s", seq); paths[i] = path; break; } } TASSERT2(i < npaths, "Path not found: %s", seq); num_paths_seen++; } } TASSERT(num_paths_seen == npaths); for(i = 0; i < npaths; i++) { TASSERT2(paths[i] != NULL, "path not in graph: %s", path_strs[i]); } db_node_buf_dealloc(&nbuf); size_buf_dealloc(&jposbuf); }
// Round-trip test: pack `seq` (of `len` bases) into binary form, copy it with
// binary_seq_cpy() using an offset of `shift`, unpack the copy and check that
// the result equals seq+shift.
// Requires shift <= len.
static void _manual_test_pack_cpy_unpack(const char *seq, size_t len,
                                         size_t shift)
{
  TASSERT(len >= shift);
  // NOTE(review): TASSERT appears to record the failure and carry on (main()
  // tallies failures at the end), so return here to stop len - shift wrapping
  // around and the loops below running off the end of the buffers
  if(len < shift) return;

  size_t i, nbytes = (len+3)/4, outlen = len - shift;

  // +1 so that none of the VLAs has zero length when len == 0
  Nucleotide bases[len+1], bases2[len+1];
  uint8_t packed[nbytes+1], packed2[nbytes+1];
  char seq2[len+1];

  // convert to bases
  for(i = 0; i < len; i++) bases[i] = dna_char_to_nuc(seq[i]);

  // bases -> packed
  binary_seq_pack(packed, bases, len);

  // shift cpy
  binary_seq_cpy(packed2, packed, shift, len);

  // packed -> bases
  binary_seq_unpack(packed2, bases2, outlen);

  // convert to char
  for(i = 0; i < outlen; i++) seq2[i] = dna_nuc_to_char(bases2[i]);
  seq2[outlen] = '\0';

  TASSERT2(strncmp(seq+shift, seq2, outlen) == 0, "in: %s\nout:%s\n", seq, seq2);
}
// Convert `seq` to binary form with binary_seq_from_str() and back again with
// binary_seq_to_str(), then check the round trip reproduces the input.
static void _binary_seq_str_test(const char *seq)
{
  size_t len = strlen(seq);
  char str[len+1];
  // +1 so the VLA never has zero length (undefined behaviour) when seq is ""
  uint8_t data[len+1];

  binary_seq_from_str(seq, len, data);
  binary_seq_to_str(data, len, str);
  TASSERT2(strcmp(seq, str) == 0, "1: '%s' vs '%s'", seq, str);
}
// Check that there are `num_alleles` steps and that the sequence of each step
// matches (ignoring case) one of the strings in `alleles`.
// nbuf and sbuf are scratch buffers which are overwritten.
static void _check_alleles(GraphCache *cache, GCacheStepPtrBuf *steps,
                           const char **alleles, size_t num_alleles,
                           dBNodeBuffer *nbuf, StrBuf *sbuf)
{
  TASSERT2(steps->len == num_alleles, "Number of alleles doesn't match");

  size_t step, hit;

  for(step = 0; step < steps->len; step++)
  {
    // Spell out the nodes of this step into sbuf
    db_node_buf_reset(nbuf);
    gc_step_fetch_nodes(cache, steps->b[step], nbuf);
    strbuf_ensure_capacity(sbuf, nbuf->len+MAX_KMER_SIZE+1);
    db_nodes_to_str(nbuf->b, nbuf->len, cache->db_graph, sbuf->b);

    // Linear search for a matching expected allele
    hit = 0;
    while(hit < num_alleles && strcasecmp(sbuf->b,alleles[hit]) != 0)
      hit++;

    TASSERT2(hit < num_alleles, "Couldn't find allele: %s", sbuf->b);
  }
}
// If `hkey` has not been visited yet: find the supernode containing it, mark
// all of its nodes as visited, normalise it and check that its sequence is
// one of the `n` expected strings in `ans`.
// nbuf is a scratch buffer which is overwritten.
static void supernode_from_kmer(hkey_t hkey, dBNodeBuffer *nbuf,
                                uint64_t *visited, const dBGraph *graph,
                                const char **ans, size_t n)
{
  size_t i;
  char tmpstr[SNODEBUF];

  if(bitset_get(visited, hkey)) return;

  db_node_buf_reset(nbuf);
  supernode_find(hkey, nbuf, graph);
  for(i = 0; i < nbuf->len; i++) bitset_set(visited, nbuf->b[i].key);

  supernode_normalise(nbuf->b, nbuf->len, graph);

  // The string takes kmer_size + len - 1 bases plus a NUL terminator, so the
  // limit must include the kmer size. The assertion may only record the
  // failure, so also skip the conversion rather than overflow tmpstr.
  size_t nchars = nbuf->len + graph->kmer_size;
  TASSERT(nchars <= SNODEBUF);
  if(nchars > SNODEBUF) return;

  db_nodes_to_str(nbuf->b, nbuf->len, graph, tmpstr);

  // Look for the supernode among the expected answers
  for(i = 0; i < n && strcmp(tmpstr,ans[i]) != 0; i++) {}

  TASSERT2(i < n, "Got: %s", tmpstr);
}
// Check the human readable sizes written by bytes_to_str() when asked for
// one decimal place. Cases are listed from smallest to largest.
static void test_util_bytes_to_str()
{
  test_status("Testing bytes_to_str()");

  char buf[100];

  // Single byte: 1 -> 1B
  TASSERT2(strcmp(bytes_to_str(1,1,buf),"1B") == 0, "Got: %s", buf);

  // Just under a kilobyte, written with a thousands separator: 1023 -> 1,023B
  TASSERT2(strcmp(bytes_to_str(1023,1,buf),"1,023B") == 0, "Got: %s", buf);

  // 1500KB -> 1.4MB
  TASSERT2(strcmp(bytes_to_str(1500000,1,buf),"1.4MB") == 0, "Got: %s", buf);

  // Excess decimal points are trimmed off: 14.0MB -> 14MB
  TASSERT2(strcmp(bytes_to_str(14688256,1,buf),"14MB") == 0, "Got: %s", buf);

  // 0.5GB -> 512MB
  TASSERT2(strcmp(bytes_to_str(536900000,1,buf),"512MB") == 0, "Got: %s", buf);

  // 1.9GB -> 1.9GB
  TASSERT2(strcmp(bytes_to_str(2040110000,1,buf),"1.9GB") == 0, "Got: %s", buf);

  // 1.99GB -> 2GB
  TASSERT2(strcmp(bytes_to_str(2140110000,1,buf),"2GB") == 0, "Got: %s", buf);
}
// Generate paths from each of the `nseqs` sequences in `seqs` and add them to
// the graph's path store, using a single worker.
// If exp_npaths / exp_nkmers are non-negative, check that the number of paths
// / number of kmers with paths grew by exactly that amount; pass a negative
// value to skip the corresponding check.
void all_tests_add_paths_multi(dBGraph *graph, const char **seqs, size_t nseqs,
                               CorrectAlnParam params,
                               int exp_npaths, int exp_nkmers)
{
  // Counts before adding anything, to compare against afterwards
  size_t paths_before = graph->gpstore.num_paths;
  size_t kmers_before = graph->gpstore.num_kmers_with_paths;

  size_t s, nworkers = 1;
  GenPathWorker *workers = gen_paths_workers_alloc(nworkers, graph);

  // Input description with no files attached
  AsyncIOInput io = {.file1 = NULL, .file2 = NULL,
                     .fq_offset = 0, .interleaved = false};

  CorrectAlnInput task = {.files = io, .fq_cutoff = 0, .hp_cutoff = 0,
                          .matedir = READPAIR_FR, .crt_params = params,
                          .out_base = NULL, .output = NULL};

  // Only the first read (r1) is used; the second read is kept empty
  AsyncIOData iodata;
  asynciodata_alloc(&iodata);
  seq_read_reset(&iodata.r2);
  iodata.fq_offset2 = 0;
  iodata.fq_offset1 = 0;
  iodata.ptr = NULL;

  // Feed each sequence to the worker in turn
  for(s = 0; s < nseqs; s++)
  {
    seq_read_set(&iodata.r1, seqs[s]);
    gen_paths_worker_seq(workers, &iodata, &task);
  }

  asynciodata_dealloc(&iodata);
  gen_paths_workers_dealloc(workers, nworkers);

  // Check we added the right number of paths
  if(exp_npaths >= 0)
  {
    TASSERT2(graph->gpstore.num_paths == paths_before + (size_t)exp_npaths,
             "%zu %zu %zu", (size_t)graph->gpstore.num_paths,
             (size_t)paths_before, (size_t)exp_npaths);
  }

  // Check the right number of kmers gained paths
  if(exp_nkmers >= 0)
  {
    TASSERT(graph->gpstore.num_kmers_with_paths ==
              kmers_before + (size_t)exp_nkmers);
  }
}

// Single sequence convenience wrapper around all_tests_add_paths_multi().
void all_tests_add_paths(dBGraph *graph, const char *seq,
                         CorrectAlnParam params,
                         int exp_npaths, int exp_nkmers)
{
  all_tests_add_paths_multi(graph, &seq, 1, params, exp_npaths, exp_nkmers);
}

// Allocate `graph` and its path store / path hash, load each of the `nseqs`
// sequences into colour 0, then generate paths from the same sequences using
// `path_params`. No checks are made on the number of paths added.
void all_tests_construct_graph(dBGraph *graph,
                               size_t kmer_size, size_t ncols,
                               const char **seqs, size_t nseqs,
                               CorrectAlnParam path_params)
{
  size_t s = 0;

  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024,
                 DBG_ALLOC_EDGES | DBG_ALLOC_COVGS |
                 DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  // Path data
  gpath_store_alloc(&graph->gpstore, ncols, graph->ht.capacity,
                    0, ONE_MEGABYTE, true, false);

  // Don't use links to add new links
  gpath_store_split_read_write(&graph->gpstore);

  // Allocate path hash table just in case
  gpath_hash_alloc(&graph->gphash, &graph->gpstore, ONE_MEGABYTE);

  // Build graph
  while(s < nseqs)
  {
    build_graph_from_str_mt(graph, 0, seqs[s], strlen(seqs[s]), false);
    s++;
  }

  gpath_store_merge_read_write(&graph->gpstore);

  // At least one colour counts as used, even if no sequence was loaded
  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  all_tests_add_paths_multi(graph, seqs, nseqs, path_params, -1, -1);
}
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, 
nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
int main(int argc, char **argv) { cortex_init(); cmd_init(argc, argv); ctx_msg_out = NULL; ctx_tst_out = stdout; test_status("Tests running k=%i..%i...", get_min_kmer_size(), get_max_kmer_size()); test_status("[version] "VERSION_STATUS_STR"\n"); // Binary Kmer tests should work for all values of MAXK test_bkmer_functions(); test_hash_table(); #if MAX_KMER_SIZE == 31 // not kmer dependent test_util(); test_dna_functions(); test_binary_seq_functions(); // only written in k=31 test_db_node(); test_build_graph(); test_supernode(); test_subgraph(); test_cleaning(); test_paths(); // test_path_sets(); // TODO: replace with test_path_subset() test_graph_walker(); test_corrected_aln(); test_repeat_walker(); test_graph_crawler(); test_bubble_caller(); test_kmer_occur(); test_infer_edges_tests(); #endif cmd_destroy(); // Check we free'd all our memory size_t still_alloced = alloc_get_num_allocs() - alloc_get_num_frees(); TASSERT2(still_alloced == 0, "%zu not free'd", still_alloced); // Finished char num_test_str[100], num_passed_str[100]; size_t tests_num_passed = tests_num_run - tests_num_failed; ulong_to_str(tests_num_run, num_test_str); ulong_to_str(tests_num_passed, num_passed_str); test_status("Tests passed: %s / %s (%.1f%%)", num_passed_str, num_test_str, (100.0*tests_num_passed)/tests_num_run); if(tests_num_failed) test_status("%zu tests failed", tests_num_failed); else test_status("All tests passed."); cortex_destroy(); // Return 1 if any tests failed, 0 on success return tests_num_failed ? 1 : 0; }
static void test_repeat_loop() { TASSERT(sizeof(FollowPath) == 20); // Construct 1 colour graph with kmer-size=11 dBGraph graph; size_t kmer_size = 11, ncols = 1; // Set up alignment correction params CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0, .ins_gap_min = 0, .ins_gap_max = 0, .one_way_gap_traverse = true, .use_end_check = true, .max_context = 10, .gap_variance = 0.1, .gap_wiggle = 5}; // Sequence with repeat char seq[] = "ATTTGGAACTCCGGA" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "GATAGGGCCAGT" "CGTCAGGAGCTAACT"; char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT"; char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; // Allocate graph, but don't add any sequence _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params); GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL); GraphWalker gwlk; RepeatWalker rptwlk; graph_walker_alloc(&gwlk); rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12); dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 1024); // Construct graph but no paths build_graph_from_str_mt(&graph, 0, seq, strlen(seq)); TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers); // Find first node in sequence dBNode node0 = db_graph_find_str(&graph, seq); TASSERT(node0.key != HASH_NOT_FOUND); // 1) With no paths char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0); // 2) Add small paths - produces collapsed down seq with two copy repeat gen_paths_from_str_mt(gen_path_wrkr, p0, params); gen_paths_from_str_mt(gen_path_wrkr, p1, params); char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT"; test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1); // 3) Add long paths gen_paths_from_str_mt(gen_path_wrkr, seq, params); test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq); 
graph_walker_dealloc(&gwlk); rpt_walker_dealloc(&rptwlk); db_node_buf_dealloc(&nbuf); gen_paths_workers_dealloc(gen_path_wrkr, 1); db_graph_dealloc(&graph); } void test_repeat_walker() { test_status("Testing repeat_walker.h"); test_repeat_loop(); }
static void test_kmer_occur_filter() { // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; size_t i; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // xyz------->>> y > < X // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA #define NUM_NODES 3 #define NUM_READS 3 const char *tmp[NUM_READS] = { "AACA", "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA", "TCTAGCATGTGTGTT"}; read_t reads[NUM_READS]; for(i = 0; i < NUM_READS; i++) { seq_read_alloc(&reads[i]); seq_read_set(&reads[i], tmp[i]); } KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph); TASSERT(kograph.nchroms == NUM_READS); TASSERT(kograph.koccurs != NULL); KOccurRunBuffer koruns, koruns_tmp, koruns_ended; korun_buf_alloc(&koruns, 16); korun_buf_alloc(&koruns_tmp, 16); korun_buf_alloc(&koruns_ended, 16); // Check CCCGACAGGGCAA starts at CCCGACAGGGC // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG dBNode nodes[NUM_NODES]; for(i = 0; i < NUM_NODES; i++) nodes[i] = db_graph_find_str(&graph, &"CCCGACAGGGCAA"[i]); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); // Checks TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len); TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last); // Test reverse db_nodes_reverse_complement(nodes, NUM_NODES); korun_buf_reset(&koruns); korun_buf_reset(&koruns_ended); kograph_filter_extend(&kograph, nodes, 1, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended); kograph_filter_extend(&kograph, nodes+1, 1, true, 0, 1, &koruns, &koruns_tmp, 
&koruns_ended); kograph_filter_extend(&kograph, nodes+2, 1, true, 0, 2, &koruns, &koruns_tmp, &koruns_ended); // Print out for debugging // printf("koruns: "); // koruns_print(koruns.b, koruns.len, kmer_size, stdout); // printf("\nkoruns_ended: "); // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout); // printf("\n"); // Check results match: // koruns: chromid:1:17-5:-, chromid:1:37-47:+ // koruns_ended: chromid:1:34-24:- TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len); TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len); TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom); TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first); TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last); korun_buf_dealloc(&koruns); korun_buf_dealloc(&koruns_tmp); korun_buf_dealloc(&koruns_ended); for(i = 0; i < NUM_READS; i++) seq_read_dealloc(&reads[i]); kograph_dealloc(&kograph); db_graph_dealloc(&graph); }