// Get bkey:orient string representation e.g. "AGAGTTTTATC:1". // :0 means forward, :1 means reverse // `str` must be at least kmer_size+3 chars long // Returns length in bytes. Null terminates `str`. size_t db_node_to_str(const dBGraph *db_graph, dBNode node, char *str) { const size_t kmer_size = db_graph->kmer_size; BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key); binary_kmer_to_str(bkmer, kmer_size, str); str[kmer_size] = ':'; str[kmer_size+1] = '0' + node.orient; str[kmer_size+2] = '\0'; return kmer_size + 2; }
// Print a two-line report to stdout:
//   ><kmer>:<orient> <A->B|B->C> <prime_AB|walk_AB>
//   <nodes in nbuf, as written by db_nodes_print()>
// The kmer printed is the one db_node_get_bkey() returns for node.key.
static void print_failed(dBNode node, const dBNodeBuffer *nbuf,
                         const dBGraph *db_graph, bool is_AB, bool prime_AB)
{
  char keystr[MAX_KMER_SIZE+1];
  const char *step_label = is_AB ? "A->B" : "B->C";
  const char *mode_label = prime_AB ? "prime_AB" : "walk_AB";

  binary_kmer_to_str(db_node_get_bkey(db_graph, node.key),
                     db_graph->kmer_size, keystr);

  printf(">%s:%i %s %s\n", keystr, node.orient, step_label, mode_label);
  db_nodes_print(nbuf->b, nbuf->len, db_graph, stdout);
  fputc('\n', stdout);
}
// Print: // 0: AAACCCAAATGCAAACCCAAATGCAAACCCA:1 TGGGTTTGCATTTGGGTTTGCATTTGGGTTT // 1: CAAACCCAAATGCAAACCCAAATGCAAACCC:1 GGGTTTGCATTTGGGTTTGCATTTGGGTTTG // ... void db_nodes_print_verbose(const dBNode *nodes, size_t num, const dBGraph *db_graph, FILE *out) { if(num == 0) return; const size_t kmer_size = db_graph->kmer_size; size_t i; BinaryKmer bkmer, bkey; char kmerstr[MAX_KMER_SIZE+1], keystr[MAX_KMER_SIZE+1]; bkmer = db_node_get_bkmer(db_graph, nodes[0].key); bkey = db_node_oriented_bkmer(db_graph, nodes[0]); binary_kmer_to_str(bkmer, kmer_size, kmerstr); binary_kmer_to_str(bkey, kmer_size, keystr); fprintf(out, "%3zu: %s:%i %s\n", (size_t)0, kmerstr, (int)nodes[0].orient, keystr); for(i = 1; i < num; i++) { bkmer = db_node_get_bkmer(db_graph, nodes[i].key); bkey = db_node_oriented_bkmer(db_graph, nodes[i]); binary_kmer_to_str(bkmer, kmer_size, kmerstr); binary_kmer_to_str(bkey, kmer_size, keystr); fprintf(out, "%3zu: %s:%i %s\n", i, kmerstr, (int)nodes[i].orient, keystr); } }
// Write a run of nodes to a gzip stream as a single sequence: the first node's
// kmer in its orientation (db_node_oriented_bkmer), followed by one character
// per remaining node (dna_nuc_to_char of db_node_get_last_nuc).
// No newline is written. Writes nothing if num == 0.
void db_nodes_gzprint(const dBNode *nodes, size_t num,
                      const dBGraph *db_graph, gzFile out)
{
  // nodes[0] does not exist for an empty list; reading it would be out of bounds
  if(num == 0) return;

  size_t i, kmer_size = db_graph->kmer_size;
  Nucleotide nuc;
  BinaryKmer bkmer;
  char tmp[MAX_KMER_SIZE+1];

  bkmer = db_node_oriented_bkmer(db_graph, nodes[0]);
  binary_kmer_to_str(bkmer, kmer_size, tmp);
  gzputs(out, tmp);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    gzputc(out, dna_nuc_to_char(nuc));
  }
}
// Write a run of nodes to `out` as a single sequence: the first node's kmer in
// its orientation (db_node_oriented_bkmer), followed by one character per
// remaining node (dna_nuc_to_char of db_node_get_last_nuc).
// No newline is written. Writes nothing if num == 0.
void db_nodes_print(const dBNode *nodes, size_t num,
                    const dBGraph *db_graph, FILE *out)
{
  // nodes[0] does not exist for an empty list; reading it would be out of bounds
  if(num == 0) return;

  const size_t kmer_size = db_graph->kmer_size;
  size_t i;
  Nucleotide nuc;
  BinaryKmer bkmer;
  char tmp[MAX_KMER_SIZE+1];

  bkmer = db_node_oriented_bkmer(db_graph, nodes[0]);
  binary_kmer_to_str(bkmer, kmer_size, tmp);
  fputs(tmp, out);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    fputc(dna_nuc_to_char(nuc), out);
  }
}
// Returns number of bytes added size_t db_nodes_to_str(const dBNode *nodes, size_t num, const dBGraph *db_graph, char *str) { if(num == 0) return 0; size_t i; size_t kmer_size = db_graph->kmer_size; BinaryKmer bkmer = db_node_get_bkmer(db_graph, nodes[0].key); Nucleotide nuc; binary_kmer_to_str(bkmer, kmer_size, str); if(nodes[0].orient == REVERSE) dna_reverse_complement_str(str, kmer_size); for(i = 1; i < num; i++) { nuc = db_node_get_last_nuc(nodes[i], db_graph); str[kmer_size+i-1] = dna_nuc_to_char(nuc); } str[kmer_size+num-1] = '\0'; return kmer_size+num-1; }
// Append a branch to `sbuf` as one line of sequence ending in '\n'.
// If print_first_kmer is set, the first node contributes its whole kmer (in
// the node's orientation) and every later node contributes one character;
// otherwise every node, including the first, contributes one character.
// `sbuf` is left null terminated.
static void branch_to_str(const dBNode *nodes, size_t len, bool print_first_kmer,
                          StrBuf *sbuf, const dBGraph *db_graph)
{
  const size_t klen = db_graph->kmer_size;
  size_t pos = 0; // index of the first node printed as a single character

  if(print_first_kmer)
  {
    strbuf_ensure_capacity(sbuf, sbuf->end + klen);
    BinaryKmer first = db_node_oriented_bkmer(db_graph, nodes[0]);
    binary_kmer_to_str(first, klen, sbuf->b + sbuf->end);
    sbuf->end += klen;
    pos = 1;
  }

  // Room for up to one char per node, +1 for '\n'
  strbuf_ensure_capacity(sbuf, sbuf->end + len + 1);

  while(pos < len)
  {
    Nucleotide last = db_node_get_last_nuc(nodes[pos], db_graph);
    sbuf->b[sbuf->end++] = dna_nuc_to_char(last);
    pos++;
  }

  sbuf->b[sbuf->end++] = '\n';
  sbuf->b[sbuf->end] = '\0';
}
int ctx_index(int argc, char **argv) { const char *out_path = NULL; size_t block_size = 0, block_kmers = 0; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'b': cmd_check(!block_kmers, cmd); block_kmers = cmd_size_nonzero(cmd, optarg); break; case 's': cmd_check(!block_size, cmd); block_size = cmd_size_nonzero(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); if(block_size && block_kmers) cmd_print_usage("Cannot use --block-kmers and --block-size together"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); // Open output file FILE *fout = out_path ? 
futil_fopen_create(out_path, "w") : stdout; // Start size_t filencols = gfile.hdr.num_of_cols; size_t kmer_size = gfile.hdr.kmer_size; const char *path = file_filter_path(&gfile.fltr); size_t ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols; if(block_size) { block_kmers = block_size / kmer_mem; } else if(!block_size && !block_kmers) { block_size = 4 * ONE_MEGABYTE; block_kmers = block_size / kmer_mem; } // Update block-size block_size = block_kmers * kmer_mem; status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu", block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size); if(block_kmers == 0) die("Cannot set block_kmers to zero"); // Print header fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout); BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO; BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO; Covg *covgs = ctx_malloc(ncols * sizeof(Covg)); Edges *edges = ctx_malloc(ncols * sizeof(Edges)); char bkmerstr[MAX_KMER_SIZE+1]; size_t rem_block = block_size - kmer_mem; // block after first kmer char *tmp_mem = ctx_malloc(rem_block); // Read in file, print index size_t nblocks = 0; size_t bl_bytes = 0, bl_kmers = 0; size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0; while(1) { if(!graph_file_read(&gfile, &bkmer, covgs, edges)) { status("Read kmer failed"); break; } binary_kmer_to_str(bkmer, kmer_size, bkmerstr); if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer)) die("File is not sorted: %s [%s]", bkmerstr, path); // We've already read one kmer entry, read rest of block bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block); bl_kmers = 1 + bl_bytes / kmer_mem; fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n", bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr, bl_kmer_offset, bl_kmer_offset+bl_kmers); bl_byte_offset += bl_bytes; bl_kmer_offset += bl_kmers; nblocks++; if(bl_kmers < block_kmers) { status("last block %zu < %zu; %zu vs %zu", 
bl_kmers, block_kmers, bl_bytes, block_size); break; } prev_bkmer = bkmer; } ctx_free(covgs); ctx_free(edges); ctx_free(tmp_mem); // done char num_kmers_str[50], num_blocks_str[50]; char block_mem_str[50], block_kmers_str[50]; ulong_to_str(bl_kmer_offset, num_kmers_str); ulong_to_str(nblocks, num_blocks_str); bytes_to_str(block_size, 1, block_mem_str); ulong_to_str(block_kmers, block_kmers_str); status("Read %s kmers in %s block%s (block size %s / %s kmers)", num_kmers_str, num_blocks_str, util_plural_str(nblocks), block_mem_str, block_kmers_str); if(fout != stdout) status("Saved to %s", out_path); graph_file_close(&gfile); fclose(fout); return EXIT_SUCCESS; }
/** * Print paths to a string buffer. Paths are sorted before being written. * * @param hkey All paths associated with hkey are written to the buffer * @param sbuf paths are written this string buffer * @param subset is a temp variable that is reused each time * @param nbuf temporary buffer, if not NULL, used to add seq=... to output * @param jposbuf temporary buffer, if not NULL, used to add juncpos=... to output */ void gpath_save_sbuf(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, dBNodeBuffer *nbuf, SizeBuffer *jposbuf, const dBGraph *db_graph) { ctx_assert(db_graph->num_of_cols == 1 || nbuf == NULL); ctx_assert(db_graph->num_of_cols == 1 || jposbuf == NULL); const GPathStore *gpstore = &db_graph->gpstore; const GPathSet *gpset = &gpstore->gpset; const size_t ncols = gpstore->gpset.ncols; GPath *first_gpath = gpath_store_fetch(gpstore, hkey); const GPath *gpath; size_t i, j, col; // Load and sort paths for given kmer gpath_subset_reset(subset); gpath_subset_load_llist(subset, first_gpath); gpath_subset_sort(subset); if(subset->list.len == 0) return; // Print "<kmer> <npaths>" BinaryKmer bkmer = db_graph->ht.table[hkey]; char bkstr[MAX_KMER_SIZE+1]; binary_kmer_to_str(bkmer, db_graph->kmer_size, bkstr); // strbuf_sprintf(sbuf, "%s %zu\n", bkstr, subset->list.len); strbuf_append_strn(sbuf, bkstr, db_graph->kmer_size); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, subset->list.len); strbuf_append_char(sbuf, '\n'); char orchar[2] = {0}; orchar[FORWARD] = 'F'; orchar[REVERSE] = 'R'; const uint8_t *nseenptr; for(i = 0; i < subset->list.len; i++) { gpath = subset->list.b[i]; nseenptr = gpath_set_get_nseen(gpset, gpath); // strbuf_sprintf(sbuf, "%c %zu %u %u", orchar[gpath->orient], klen, // gpath->num_juncs, (uint32_t)nseenptr[0]); strbuf_append_char(sbuf, orchar[gpath->orient]); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, gpath->num_juncs); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, nseenptr[0]); for(col = 1; col < ncols; col++) 
{ // strbuf_sprintf(sbuf, ",%u", (uint32_t)nseenptr[col]); strbuf_append_char(sbuf, ','); strbuf_append_ulong(sbuf, nseenptr[col]); } strbuf_append_char(sbuf, ' '); strbuf_ensure_capacity(sbuf, sbuf->end + gpath->num_juncs + 2); binary_seq_to_str(gpath->seq, gpath->num_juncs, sbuf->b+sbuf->end); sbuf->end += gpath->num_juncs; if(nbuf) { // Trace this path through the graph // First, find a colour this path is in for(col = 0; col < ncols && !gpath_has_colour(gpath, ncols, col); col++) {} if(col == ncols) die("path is not in any colours"); dBNode node = {.key = hkey, .orient = gpath->orient}; db_node_buf_reset(nbuf); if(jposbuf) size_buf_reset(jposbuf); // indices of junctions in nbuf gpath_fetch(node, gpath, nbuf, jposbuf, col, db_graph); strbuf_append_str(sbuf, " seq="); strbuf_ensure_capacity(sbuf, sbuf->end + db_graph->kmer_size + nbuf->len); sbuf->end += db_nodes_to_str(nbuf->b, nbuf->len, db_graph, sbuf->b+sbuf->end); if(jposbuf) { strbuf_append_str(sbuf, " juncpos="); strbuf_append_ulong(sbuf, jposbuf->b[0]); for(j = 1; j < jposbuf->len; j++) { strbuf_append_char(sbuf, ','); strbuf_append_ulong(sbuf, jposbuf->b[j]); } } } strbuf_append_char(sbuf, '\n'); } } // @subset is a temp variable that is reused each time // @sbuf is a temp variable that is reused each time static inline int _gpath_gzsave_node(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, dBNodeBuffer *nbuf, SizeBuffer *jposbuf, gzFile gzout, pthread_mutex_t *outlock, const dBGraph *db_graph) { gpath_save_sbuf(hkey, sbuf, subset, nbuf, jposbuf, db_graph); if(sbuf->end > DEFAULT_IO_BUFSIZE) _gpath_save_flush(gzout, sbuf, outlock); return 0; // => keep iterating }