// Parse path and create FileFilter, calls die() with msg on error // fltr should be zero'd before call void file_filter_open(FileFilter *fltr, const char *path) { const char *path_start, *path_end; size_t path_len; // Duplicate input string and file path strbuf_set(&fltr->input, path); file_filter_deconstruct_path(path, &path_start, &path_end); path_len = path_end - path_start; strbuf_ensure_capacity(&fltr->path, path_len); memcpy(fltr->path.b, path_start, path_len); fltr->path.b[fltr->path.end = path_len] = '\0'; }
// Strip indels ('-') from allele and add to string buffer static inline void print_vcf_allele(const char *allele, size_t len, int8_t prev_base, int8_t next_base, StrBuf *sbuf) { size_t i; if(prev_base > 0) strbuf_append_char(sbuf, char_to_vcf_char(prev_base)); strbuf_ensure_capacity(sbuf, sbuf->end+len); for(i = 0; i < len; i++) { if(allele[i] != '-') sbuf->b[sbuf->end++] = char_to_vcf_char(allele[i]); } sbuf->b[sbuf->end] = 0; if(next_base > 0) strbuf_append_char(sbuf, char_to_vcf_char(next_base)); }
// Append a run of nodes to sbuf as a single line of text.
// If print_first_kmer is set, the oriented kmer of nodes[0] is written first
// and the per-node output starts from nodes[1]; otherwise it starts from
// nodes[0]. Each remaining node contributes its last nucleotide. The line is
// finished with '\n' and the buffer is left NUL terminated.
static void branch_to_str(const dBNode *nodes, size_t len,
                          bool print_first_kmer,
                          StrBuf *sbuf, const dBGraph *db_graph)
{
  const size_t kmer_size = db_graph->kmer_size;
  size_t idx = 0;
  BinaryKmer first_bkmer;
  Nucleotide last_nuc;

  if(print_first_kmer)
  {
    strbuf_ensure_capacity(sbuf, sbuf->end + kmer_size);
    first_bkmer = db_node_oriented_bkmer(db_graph, nodes[0]);
    binary_kmer_to_str(first_bkmer, kmer_size, sbuf->b+sbuf->end);
    sbuf->end += kmer_size;
    idx = 1; // nodes[0] is fully covered by the kmer just written
  }

  // Room for up to len bases, +1 for '\n'
  strbuf_ensure_capacity(sbuf, sbuf->end + len + 1);

  while(idx < len)
  {
    last_nuc = db_node_get_last_nuc(nodes[idx], db_graph);
    sbuf->b[sbuf->end] = dna_nuc_to_char(last_nuc);
    sbuf->end++;
    idx++;
  }

  sbuf->b[sbuf->end] = '\n';
  sbuf->end++;
  sbuf->b[sbuf->end] = '\0';
}
// Test helper: check that every step in `steps` spells out one of `alleles`.
//
// For each step, the step's nodes are fetched into nbuf, converted to a string
// in sbuf and compared (case-insensitively) against each of the expected
// alleles. A test assertion fails if the number of steps differs from
// num_alleles or if a step's sequence is not among the alleles.
//
// @nbuf and @sbuf are temporary buffers, overwritten on every iteration
static void _check_alleles(GraphCache *cache, GCacheStepPtrBuf *steps,
                           const char **alleles, size_t num_alleles,
                           dBNodeBuffer *nbuf, StrBuf *sbuf)
{
  TASSERT2(steps->len == num_alleles, "Number of alleles doesn't match");

  size_t i, j;

  for(i = 0; i < steps->len; i++)
  {
    db_node_buf_reset(nbuf);
    gc_step_fetch_nodes(cache, steps->b[i], nbuf);

    strbuf_ensure_capacity(sbuf, nbuf->len+MAX_KMER_SIZE+1);

    // db_nodes_to_str() writes directly into sbuf->b; store the length it
    // returns so that sbuf->end agrees with the buffer contents
    sbuf->end = db_nodes_to_str(nbuf->b, nbuf->len, cache->db_graph, sbuf->b);

    // Find this sequence among the expected alleles
    for(j = 0; j < num_alleles && strcasecmp(sbuf->b,alleles[j]); j++) {}

    TASSERT2(j < num_alleles, "Couldn't find allele: %s", sbuf->b);
  }
}
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, 
nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
/** * Print paths to a string buffer. Paths are sorted before being written. * * @param hkey All paths associated with hkey are written to the buffer * @param sbuf paths are written this string buffer * @param subset is a temp variable that is reused each time * @param nbuf temporary buffer, if not NULL, used to add seq=... to output * @param jposbuf temporary buffer, if not NULL, used to add juncpos=... to output */ void gpath_save_sbuf(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, dBNodeBuffer *nbuf, SizeBuffer *jposbuf, const dBGraph *db_graph) { ctx_assert(db_graph->num_of_cols == 1 || nbuf == NULL); ctx_assert(db_graph->num_of_cols == 1 || jposbuf == NULL); const GPathStore *gpstore = &db_graph->gpstore; const GPathSet *gpset = &gpstore->gpset; const size_t ncols = gpstore->gpset.ncols; GPath *first_gpath = gpath_store_fetch(gpstore, hkey); const GPath *gpath; size_t i, j, col; // Load and sort paths for given kmer gpath_subset_reset(subset); gpath_subset_load_llist(subset, first_gpath); gpath_subset_sort(subset); if(subset->list.len == 0) return; // Print "<kmer> <npaths>" BinaryKmer bkmer = db_graph->ht.table[hkey]; char bkstr[MAX_KMER_SIZE+1]; binary_kmer_to_str(bkmer, db_graph->kmer_size, bkstr); // strbuf_sprintf(sbuf, "%s %zu\n", bkstr, subset->list.len); strbuf_append_strn(sbuf, bkstr, db_graph->kmer_size); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, subset->list.len); strbuf_append_char(sbuf, '\n'); char orchar[2] = {0}; orchar[FORWARD] = 'F'; orchar[REVERSE] = 'R'; const uint8_t *nseenptr; for(i = 0; i < subset->list.len; i++) { gpath = subset->list.b[i]; nseenptr = gpath_set_get_nseen(gpset, gpath); // strbuf_sprintf(sbuf, "%c %zu %u %u", orchar[gpath->orient], klen, // gpath->num_juncs, (uint32_t)nseenptr[0]); strbuf_append_char(sbuf, orchar[gpath->orient]); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, gpath->num_juncs); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, nseenptr[0]); for(col = 1; col < ncols; col++) 
{ // strbuf_sprintf(sbuf, ",%u", (uint32_t)nseenptr[col]); strbuf_append_char(sbuf, ','); strbuf_append_ulong(sbuf, nseenptr[col]); } strbuf_append_char(sbuf, ' '); strbuf_ensure_capacity(sbuf, sbuf->end + gpath->num_juncs + 2); binary_seq_to_str(gpath->seq, gpath->num_juncs, sbuf->b+sbuf->end); sbuf->end += gpath->num_juncs; if(nbuf) { // Trace this path through the graph // First, find a colour this path is in for(col = 0; col < ncols && !gpath_has_colour(gpath, ncols, col); col++) {} if(col == ncols) die("path is not in any colours"); dBNode node = {.key = hkey, .orient = gpath->orient}; db_node_buf_reset(nbuf); if(jposbuf) size_buf_reset(jposbuf); // indices of junctions in nbuf gpath_fetch(node, gpath, nbuf, jposbuf, col, db_graph); strbuf_append_str(sbuf, " seq="); strbuf_ensure_capacity(sbuf, sbuf->end + db_graph->kmer_size + nbuf->len); sbuf->end += db_nodes_to_str(nbuf->b, nbuf->len, db_graph, sbuf->b+sbuf->end); if(jposbuf) { strbuf_append_str(sbuf, " juncpos="); strbuf_append_ulong(sbuf, jposbuf->b[0]); for(j = 1; j < jposbuf->len; j++) { strbuf_append_char(sbuf, ','); strbuf_append_ulong(sbuf, jposbuf->b[j]); } } } strbuf_append_char(sbuf, '\n'); } } // @subset is a temp variable that is reused each time // @sbuf is a temp variable that is reused each time static inline int _gpath_gzsave_node(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, dBNodeBuffer *nbuf, SizeBuffer *jposbuf, gzFile gzout, pthread_mutex_t *outlock, const dBGraph *db_graph) { gpath_save_sbuf(hkey, sbuf, subset, nbuf, jposbuf, db_graph); if(sbuf->end > DEFAULT_IO_BUFSIZE) _gpath_save_flush(gzout, sbuf, outlock); return 0; // => keep iterating }