// Using file so can call fseek and don't need to load whole graph static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges, GraphFileReader *file) { ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols); ctx_assert(file_filter_is_direct(&file->fltr)); ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead"); ctx_assert(file->num_of_kmers >= 0); ctx_assert(file->file_size >= 0); status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]", file_filter_path(&file->fltr), (size_t)file->hdr_size, (size_t)file->file_size); if(fseek(file->fh, 0, SEEK_SET) != 0) die("fseek failed: %s", strerror(errno)); // Open memory mapped file void *mmap_ptr = mmap(NULL, file->file_size, PROT_WRITE, MAP_SHARED, fileno(file->fh), 0); if(mmap_ptr == MAP_FAILED) die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno)); const size_t ncols = file->hdr.num_of_cols; BinaryKmer bkmer; Edges edges[ncols]; Covg covgs[ncols]; bool updated; size_t i, num_kmers = file->num_of_kmers, num_kmers_edited = 0; size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols; char *ptr = (char*)mmap_ptr + file->hdr_size; for(i = 0; i < num_kmers; i++, ptr += filekmersize) { char *fh_covgs = ptr + sizeof(BinaryKmer); char *fh_edges = fh_covgs + sizeof(Covg)*ncols; memcpy(bkmer.b, ptr, sizeof(BinaryKmer)); memcpy(covgs, fh_covgs, ncols * sizeof(Covg)); memcpy(edges, fh_edges, ncols * sizeof(Edges)); updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph) : infer_pop_edges(bkmer, edges, covgs, db_graph)); if(updated) { memcpy(fh_covgs, covgs, ncols * sizeof(Covg)); memcpy(fh_edges, edges, ncols * sizeof(Edges)); num_kmers_edited++; } } if(munmap(mmap_ptr, file->file_size) == -1) die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno)); return num_kmers_edited; }
// Print file filter description void file_filter_status(const FileFilter *fltr) { size_t i; pthread_mutex_lock(&ctx_biglock); timestamp(); message("[FileFilter] Loading file %s [%u colour%s]", file_filter_path(fltr), fltr->filencols, util_plural_str(fltr->filencols)); if(!file_filter_is_direct(fltr)) { message(" with filter: %u->%u", file_filter_fromcol(fltr, 0), file_filter_intocol(fltr, 0)); for(i = 1; i < file_filter_num(fltr); i++) message(",%u->%u", file_filter_fromcol(fltr,i), file_filter_intocol(fltr,i)); } message("\n"); pthread_mutex_unlock(&ctx_biglock); }
// Using file so can call fseek and don't need to load whole graph static size_t inferedges_on_file(const dBGraph *db_graph, bool add_all_edges, GraphFileReader *file, FILE *fout) { ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols); ctx_assert(file_filter_is_direct(&file->fltr)); ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead"); ctx_assert(fout != NULL); ctx_assert(fileno(file->fh) != fileno(fout)); status("[inferedges] Processing file: %s", file_filter_path(&file->fltr)); // Print header graph_write_header(fout, &file->hdr); // Read the input file again if(fseek(file->fh, file->hdr_size, SEEK_SET) != 0) die("fseek failed: %s", strerror(errno)); const size_t ncols = file->hdr.num_of_cols; BinaryKmer bkmer; Edges edges[ncols]; Covg covgs[ncols]; size_t num_kmers_edited = 0; bool updated; while(graph_file_read_reset(file, ncols, &bkmer, covgs, edges)) { updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph) : infer_pop_edges(bkmer, edges, covgs, db_graph)); graph_write_kmer(fout, file->hdr.num_of_bitfields, file->hdr.num_of_cols, bkmer, covgs, edges); num_kmers_edited += updated; } return num_kmers_edited; }
int ctx_sort(int argc, char **argv) { const char *out_path = NULL; struct MemArgs memargs = MEM_ARGS_INIT; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" sort -h` for help. Bad option: %s", argv[optind-1]); default: die("Bad option: [%c]: %s", c, cmd); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, out_path ? "r" : "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); size_t num_kmers, memory; // Reading from a stream if(gfile.num_of_kmers < 0) { if(!memargs.num_kmers_set) die("If reading from a stream, must give -n <num_kmers>"); num_kmers = memargs.num_kmers; } else num_kmers = gfile.num_of_kmers; // Open output path (if given) FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL; size_t i; size_t ncols = gfile.hdr.num_of_cols; size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*ncols; memory = (sizeof(char*) + kmer_mem) * num_kmers; char mem_str[50]; bytes_to_str(memory, 1, mem_str); if(memory > memargs.mem_to_use) die("Require at least %s memory", mem_str); status("[memory] Total: %s", mem_str); char *mem = ctx_malloc(kmer_mem * num_kmers); char **kmers = ctx_malloc(num_kmers*sizeof(char*)); // Read in whole file // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem); if(nkread != num_kmers*kmer_mem) die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem); // check we are at the end of the file char tmpc; if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) { die("More kmers in file than believed (kmers: %zu ncols: %zu).", num_kmers, ncols); } status("Read %zu kmers with %zu colour%s", num_kmers, ncols, util_plural_str(ncols)); for(i = 0; i < num_kmers; i++) kmers[i] = mem + kmer_mem*i; sort_block(kmers, num_kmers); // Print if(out_path != NULL) { // saving to a different destination - write header graph_write_header(fout, &gfile.hdr); } else { // Directly manipulating gfile.fh here, using it to write later // Not doing any more reading if(fseek(gfile.fh, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed"); fout = gfile.fh; } for(i = 0; i < num_kmers; i++) if(fwrite(kmers[i], 1, kmer_mem, fout) != kmer_mem) die("Cannot write to file"); if(out_path) fclose(fout); graph_file_close(&gfile); ctx_free(kmers); ctx_free(mem); return EXIT_SUCCESS; }
int ctx_index(int argc, char **argv) { const char *out_path = NULL; size_t block_size = 0, block_kmers = 0; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'b': cmd_check(!block_kmers, cmd); block_kmers = cmd_size_nonzero(cmd, optarg); break; case 's': cmd_check(!block_size, cmd); block_size = cmd_size_nonzero(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)"); if(block_size && block_kmers) cmd_print_usage("Cannot use --block-kmers and --block-size together"); const char *ctx_path = argv[optind]; // // Open Graph file // GraphFileReader gfile; memset(&gfile, 0, sizeof(GraphFileReader)); graph_file_open2(&gfile, ctx_path, "r+", true, 0); if(!file_filter_is_direct(&gfile.fltr)) die("Cannot open graph file with a filter ('in.ctx:blah' syntax)"); // Open output file FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout; // Start size_t filencols = gfile.hdr.num_of_cols; size_t kmer_size = gfile.hdr.kmer_size; const char *path = file_filter_path(&gfile.fltr); size_t ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols; if(block_size) { block_kmers = block_size / kmer_mem; } else if(!block_size && !block_kmers) { block_size = 4 * ONE_MEGABYTE; block_kmers = block_size / kmer_mem; } // Update block-size block_size = block_kmers * kmer_mem; status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu", block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size); if(block_kmers == 0) die("Cannot set block_kmers to zero"); // Print header fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout); BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO; BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO; Covg *covgs = ctx_malloc(ncols * sizeof(Covg)); Edges *edges = ctx_malloc(ncols * sizeof(Edges)); char bkmerstr[MAX_KMER_SIZE+1]; size_t rem_block = block_size - kmer_mem; // block after first kmer char *tmp_mem = ctx_malloc(rem_block); // Read in file, print index size_t nblocks = 0; size_t bl_bytes = 0, bl_kmers = 0; size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0; while(1) { if(!graph_file_read(&gfile, &bkmer, covgs, edges)) { status("Read kmer failed"); break; } binary_kmer_to_str(bkmer, kmer_size, bkmerstr); if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer)) die("File is not sorted: %s [%s]", bkmerstr, path); // We've already read one kmer entry, read rest of block bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block); bl_kmers = 1 + bl_bytes / kmer_mem; fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n", bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr, bl_kmer_offset, bl_kmer_offset+bl_kmers); bl_byte_offset += bl_bytes; bl_kmer_offset += bl_kmers; nblocks++; if(bl_kmers < block_kmers) { status("last block %zu < %zu; %zu vs %zu", bl_kmers, block_kmers, bl_bytes, block_size); break; } prev_bkmer = bkmer; } ctx_free(covgs); ctx_free(edges); ctx_free(tmp_mem); // done char num_kmers_str[50], num_blocks_str[50]; char block_mem_str[50], block_kmers_str[50]; ulong_to_str(bl_kmer_offset, num_kmers_str); ulong_to_str(nblocks, num_blocks_str); bytes_to_str(block_size, 1, block_mem_str); ulong_to_str(block_kmers, block_kmers_str); status("Read %s kmers in %s block%s (block size %s / %s kmers)", num_kmers_str, num_blocks_str, util_plural_str(nblocks), block_mem_str, block_kmers_str); if(fout != stdout) status("Saved to %s", out_path); graph_file_close(&gfile); fclose(fout); return EXIT_SUCCESS; }
int ctx_infer_edges(int argc, char **argv) { size_t num_of_threads = DEFAULT_NTHREADS; struct MemArgs memargs = MEM_ARGS_INIT; char *out_ctx_path = NULL; bool add_pop_edges = false, add_all_edges = false; // Arg parsing char cmd[100]; char shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!out_ctx_path,cmd); out_ctx_path = optarg; break; case 't': num_of_threads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'A': add_all_edges = true; break; case 'P': add_pop_edges = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" inferedges -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Default to adding all edges if(!add_pop_edges && !add_all_edges) add_all_edges = true; // Can only specify one of --pop --all if(add_pop_edges && add_all_edges) cmd_print_usage("Please specify only one of --all --pop"); // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); // // Open graph file // char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(strchr(graph_path,':') != NULL) cmd_print_usage("Cannot use ':' in input graph for `"CMD" inferedges`"); GraphFileReader file; memset(&file, 0, sizeof(file)); file_filter_open(&file.fltr, graph_path); // Use stat to detect if we are reading from a stream struct stat st; bool reading_stream = (stat(file.fltr.path.b, &st) != 0); // Mode r+ means open (not create) for update (read & write) graph_file_open2(&file, graph_path, reading_stream ? "r" : "r+", 0); if(!file_filter_is_direct(&file.fltr)) cmd_print_usage("Inferedges with filter not implemented - sorry"); bool editing_file = !(out_ctx_path || reading_stream); FILE *fout = NULL; // Editing input file or writing a new file if(!editing_file) fout = futil_fopen_create(out_ctx_path ? out_ctx_path : "-", "w"); // Print output status if(fout == stdout) status("Writing to STDOUT"); else if(fout != NULL) status("Writing to: %s", out_ctx_path); else status("Editing file in place: %s", graph_path); status("Inferring all missing %sedges", add_pop_edges ? "population " : ""); // // Decide on memory // const size_t ncols = file.hdr.num_of_cols; size_t kmers_in_hash, graph_mem, bits_per_kmer; // reading stream: all covgs + edges // reading file: one bit per kmer per colour for 'in colour' bits_per_kmer = sizeof(BinaryKmer)*8; if(reading_stream) { bits_per_kmer += ncols * 8 * (sizeof(Edges) + sizeof(Covg)); } else { bits_per_kmer += ncols; // in colour } kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, file.num_of_kmers, file.num_of_kmers, memargs.mem_to_use_set, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Allocate memory // int alloc_flags = reading_stream ? DBG_ALLOC_EDGES | DBG_ALLOC_COVGS : DBG_ALLOC_NODE_IN_COL; dBGraph db_graph; db_graph_alloc(&db_graph, file.hdr.kmer_size, ncols, reading_stream ? ncols : 1, kmers_in_hash, alloc_flags); LoadingStats stats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = false}; // We need to load the graph for both --pop and --all since we need to check // if the next kmer is in each of the colours graph_load(&file, gprefs, &stats); if(add_pop_edges) status("Inferring edges from population...\n"); else status("Inferring all missing edges...\n"); size_t num_kmers_edited; if(reading_stream) { ctx_assert(fout != NULL); num_kmers_edited = infer_edges(num_of_threads, add_all_edges, &db_graph); graph_write_header(fout, &file.hdr); graph_write_all_kmers(fout, &db_graph); } else if(fout == NULL) { num_kmers_edited = inferedges_on_mmap(&db_graph, add_all_edges, &file); } else { num_kmers_edited = inferedges_on_file(&db_graph, add_all_edges, &file, fout); } if(fout != NULL && fout != stdout) fclose(fout); char modified_str[100], kmers_str[100]; ulong_to_str(num_kmers_edited, modified_str); ulong_to_str(db_graph.ht.num_kmers, kmers_str); double modified_rate = 0; if(db_graph.ht.num_kmers) modified_rate = (100.0 * num_kmers_edited) / db_graph.ht.num_kmers; status("%s of %s (%.2f%%) nodes modified\n", modified_str, kmers_str, modified_rate); if(editing_file) { // Close and re-open fclose(file.fh); file.fh = NULL; futil_update_timestamp(file.fltr.path.b); } graph_file_close(&file); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_view(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // TODO: // print_action actions[argc]; // bool read_kmers = false; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: cmd_print_usage("Programmer fail. Tell Isaac."); } } if(print_kmers) parse_kmers = 1; bool no_flags = (!print_info && !parse_kmers && !print_kmers); if(no_flags) { print_info = parse_kmers = 1; } if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)"); char *path = argv[optind]; size_t num_errors = 0, num_warnings = 0; GraphFileReader gfile; memset(&gfile, 0, sizeof(gfile)); int ret = graph_file_open(&gfile, path); if(ret == 0) die("Cannot open file: %s", path); if(print_info) { char fsize_str[50]; bytes_to_str((size_t)gfile.file_size, 0, fsize_str); printf("Loading file: %s\n", file_filter_path(&gfile.fltr)); printf("File size: %s\n", fsize_str); printf("----\n"); } size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr); size_t kmer_size = gfile.hdr.kmer_size; ctx_assert(ncols > 0); GraphFileHeader hdr; memset(&hdr, 0, sizeof(hdr)); graph_file_merge_header(&hdr, &gfile); uint64_t nkmers_read = 0, nkmers_loaded = 0; uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0; uint64_t *col_nkmers, *col_sum_covgs; col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0])); col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0])); // Print header if(print_info) print_header(&hdr, gfile.num_of_kmers); BinaryKmer bkmer; Covg covgs[ncols], keep_kmer; Edges edges[ncols]; bool direct_read = file_filter_is_direct(&gfile.fltr); if(parse_kmers || print_kmers) { if(print_info && print_kmers) printf("----\n"); for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++) { // If kmer has no covg in any samples -> don't load keep_kmer = 0; for(col = 0; col < ncols; col++) { col_nkmers[col] += (covgs[col] > 0); col_sum_covgs[col] += covgs[col]; keep_kmer |= covgs[col]; } if(!direct_read && !keep_kmer) continue; nkmers_loaded++; /* Kmer Checks */ // graph_file_read_reset() already checks for: // 1. oversized kmers // 2. kmers with covg 0 in all colours // 3. edges without coverage in a colour // Check for all-zeros (i.e. all As kmer: AAAAAA) uint64_t kmer_words_or = 0; for(i = 0; i < hdr.num_of_bitfields; i++) kmer_words_or |= bkmer.b[i]; if(kmer_words_or == 0) { if(num_all_zero_kmers == 1) { loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n", nkmers_read); } num_all_zero_kmers++; } // Check covg is 0 for all colours for(i = 0; i < ncols && covgs[i] == 0; i++); num_zero_covg_kmers += (i == ncols); // Print if(print_kmers) db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout); } } // check for various reading errors // if(errno != 0) // loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno)); int err = ferror(gfile.fh); if(err != 0) loading_error("occurred after file reading [%i]\n", err); char nstr[50]; if(print_kmers || parse_kmers) { // file_size is set to -1 if we are reading from a stream, // therefore won't be able to check number of kmers read if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) { loading_warning("Expected %zu kmers, read %zu\n", (size_t)gfile.num_of_kmers, (size_t)nkmers_read); } if(num_all_zero_kmers > 1) { loading_error("%s all-zero-kmers seen\n", ulong_to_str(num_all_zero_kmers, nstr)); } if(num_zero_covg_kmers > 0) { loading_warning("%s kmers have no coverage in any colour\n", ulong_to_str(num_zero_covg_kmers, nstr)); } } // Count warnings printed by graph_file_reader.c num_warnings += gfile.error_zero_covg; num_warnings += gfile.error_missing_covg; // Can only print these stats if we're read in the kmers if((print_kmers || parse_kmers) && print_info) { // print kmer coverage per sample printf("\n---- Per colour stats\n"); printf("num. kmers:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_nkmers[col], nstr)); printf("\n"); printf("sum coverage:"); for(col = 0; col < ncols; col++) printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr)); printf("\n"); printf("kmer coverage:"); for(col = 0; col < ncols; col++) printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col])); printf("\n"); // Overall stats uint64_t sum_covgs = 0; double mean_kmer_covg = 0.0; for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col]; mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0; printf("\n---- Overall stats\n"); printf("Total kmers: %s\n", ulong_to_str(nkmers_loaded, nstr)); printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr)); printf("Mean coverage: %s\n", double_to_str(mean_kmer_covg, 2, nstr)); } if(print_info) { // Print memory stats uint64_t mem, capacity, num_buckets, req_capacity; uint8_t bucket_size; req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY); capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size); mem = ht_mem(bucket_size, num_buckets, sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8); char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100]; bytes_to_str(mem, 1, memstr); ulong_to_str(capacity, capacitystr); ulong_to_str(bucket_size, bucket_size_str); ulong_to_str(num_buckets, num_buckets_str); size_t mem_height = (size_t)__builtin_ctzl(num_buckets); printf("\n---- Memory\n"); printf("memory required: %s [capacity: %s]\n", memstr, capacitystr); printf(" bucket size: %s; number of buckets: %s\n", bucket_size_str, num_buckets_str); printf(" --kmer_size %zu --mem_height %zu --mem_width %i\n", kmer_size, mem_height, bucket_size); } if((print_kmers || parse_kmers) && print_info) { printf("\n----\n"); if(num_warnings > 0 || num_errors > 0) { printf("Warnings: %zu; Errors: %zu\n", (size_t)num_warnings, (size_t)num_errors); } if(num_errors == 0) printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n"); } ctx_free(col_nkmers); ctx_free(col_sum_covgs); // Close file (which zeros it) graph_file_close(&gfile); graph_header_dealloc(&hdr); return num_errors ? EXIT_FAILURE : EXIT_SUCCESS; }