Esempio n. 1
0
// Using file so can call fseek and don't need to load whole graph
static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert(file_filter_is_direct(&file->fltr));
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(file->num_of_kmers >= 0);
  ctx_assert(file->file_size >= 0);

  status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]",
         file_filter_path(&file->fltr),
         (size_t)file->hdr_size, (size_t)file->file_size);

  if(fseek(file->fh, 0, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  // Open memory mapped file
  void *mmap_ptr = mmap(NULL, file->file_size, PROT_WRITE, MAP_SHARED,
                        fileno(file->fh), 0);

  if(mmap_ptr == MAP_FAILED)
    die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];

  bool updated;
  size_t i, num_kmers = file->num_of_kmers, num_kmers_edited = 0;
  size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols;

  char *ptr = (char*)mmap_ptr + file->hdr_size;

  for(i = 0; i < num_kmers; i++, ptr += filekmersize)
  {
    char *fh_covgs = ptr      + sizeof(BinaryKmer);
    char *fh_edges = fh_covgs + sizeof(Covg)*ncols;

    memcpy(bkmer.b, ptr,      sizeof(BinaryKmer));
    memcpy(covgs,   fh_covgs, ncols * sizeof(Covg));
    memcpy(edges,   fh_edges, ncols * sizeof(Edges));

    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    if(updated) {
      memcpy(fh_covgs, covgs, ncols * sizeof(Covg));
      memcpy(fh_edges, edges, ncols * sizeof(Edges));
      num_kmers_edited++;
    }
  }

  if(munmap(mmap_ptr, file->file_size) == -1)
    die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno));

  return num_kmers_edited;
}
Esempio n. 2
0
// Print file filter description
void file_filter_status(const FileFilter *fltr)
{
  size_t i;

  pthread_mutex_lock(&ctx_biglock);
  timestamp();
  message("[FileFilter] Loading file %s [%u colour%s]", file_filter_path(fltr),
          fltr->filencols, util_plural_str(fltr->filencols));

  if(!file_filter_is_direct(fltr))
  {
    message(" with filter: %u->%u", file_filter_fromcol(fltr, 0),
                                    file_filter_intocol(fltr, 0));

    for(i = 1; i < file_filter_num(fltr); i++)
      message(",%u->%u", file_filter_fromcol(fltr,i), file_filter_intocol(fltr,i));
  }
  message("\n");
  pthread_mutex_unlock(&ctx_biglock);
}
Esempio n. 3
0
// Using file so can call fseek and don't need to load whole graph
static size_t inferedges_on_file(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file, FILE *fout)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert(file_filter_is_direct(&file->fltr));
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(fout != NULL);
  ctx_assert(fileno(file->fh) != fileno(fout));

  status("[inferedges] Processing file: %s", file_filter_path(&file->fltr));

  // Print header
  graph_write_header(fout, &file->hdr);

  // Read the input file again
  if(fseek(file->fh, file->hdr_size, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];

  size_t num_kmers_edited = 0;
  bool updated;

  while(graph_file_read_reset(file, ncols, &bkmer, covgs, edges))
  {
    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    graph_write_kmer(fout, file->hdr.num_of_bitfields, file->hdr.num_of_cols,
                     bkmer, covgs, edges);

    num_kmers_edited += updated;
  }

  return num_kmers_edited;
}
Esempio n. 4
0
int ctx_sort(int argc, char **argv)
{
  const char *out_path = NULL;
  struct MemArgs memargs = MEM_ARGS_INIT;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" sort -h` for help. Bad option: %s", argv[optind-1]);
      default: die("Bad option: [%c]: %s", c, cmd);
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, out_path ? "r" : "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  size_t num_kmers, memory;

  // Reading from a stream
  if(gfile.num_of_kmers < 0) {
    if(!memargs.num_kmers_set)
      die("If reading from a stream, must give -n <num_kmers>");
    num_kmers = memargs.num_kmers;
  }
  else num_kmers = gfile.num_of_kmers;

  // Open output path (if given)
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL;

  size_t i;
  size_t ncols = gfile.hdr.num_of_cols;
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*ncols;

  memory = (sizeof(char*) + kmer_mem) * num_kmers;

  char mem_str[50];
  bytes_to_str(memory, 1, mem_str);

  if(memory > memargs.mem_to_use)
    die("Require at least %s memory", mem_str);

  status("[memory] Total: %s", mem_str);

  char *mem = ctx_malloc(kmer_mem * num_kmers);
  char **kmers = ctx_malloc(num_kmers*sizeof(char*));

  // Read in whole file
  // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed");
  size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem);

  if(nkread != num_kmers*kmer_mem)
    die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem);

  // check we are at the end of the file
  char tmpc;
  if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) {
    die("More kmers in file than believed (kmers: %zu ncols: %zu).",
        num_kmers, ncols);
  }

  status("Read %zu kmers with %zu colour%s", num_kmers,
         ncols, util_plural_str(ncols));

  for(i = 0; i < num_kmers; i++)
    kmers[i] = mem + kmer_mem*i;

  sort_block(kmers, num_kmers);

  // Print
  if(out_path != NULL) {
    // saving to a different destination - write header
    graph_write_header(fout, &gfile.hdr);
  }
  else {
    // Directly manipulating gfile.fh here, using it to write later
    // Not doing any more reading
    if(fseek(gfile.fh, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed");
    fout = gfile.fh;
  }

  for(i = 0; i < num_kmers; i++)
    if(fwrite(kmers[i], 1, kmer_mem, fout) != kmer_mem)
      die("Cannot write to file");

  if(out_path) fclose(fout);

  graph_file_close(&gfile);
  ctx_free(kmers);
  ctx_free(mem);

  return EXIT_SUCCESS;
}
Esempio n. 5
0
int ctx_index(int argc, char **argv)
{
  const char *out_path = NULL;
  size_t block_size = 0, block_kmers = 0;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'b':
        cmd_check(!block_kmers, cmd);
        block_kmers = cmd_size_nonzero(cmd, optarg);
        break;
      case 's':
        cmd_check(!block_size, cmd);
        block_size = cmd_size_nonzero(cmd, optarg);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  if(block_size && block_kmers)
    cmd_print_usage("Cannot use --block-kmers and --block-size together");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  // Open output file
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout;

  // Start
  size_t filencols = gfile.hdr.num_of_cols;
  size_t kmer_size = gfile.hdr.kmer_size;
  const char *path = file_filter_path(&gfile.fltr);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols;

  if(block_size) {
    block_kmers = block_size / kmer_mem;
  } else if(!block_size && !block_kmers) {
    block_size = 4 * ONE_MEGABYTE;
    block_kmers = block_size / kmer_mem;
  }

  // Update block-size
  block_size = block_kmers * kmer_mem;

  status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu",
         block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size);

  if(block_kmers == 0) die("Cannot set block_kmers to zero");

  // Print header
  fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout);

  BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO;
  BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO;
  Covg *covgs = ctx_malloc(ncols * sizeof(Covg));
  Edges *edges = ctx_malloc(ncols * sizeof(Edges));
  char bkmerstr[MAX_KMER_SIZE+1];

  size_t rem_block = block_size - kmer_mem; // block after first kmer
  char *tmp_mem = ctx_malloc(rem_block);

  // Read in file, print index
  size_t nblocks = 0;
  size_t bl_bytes = 0, bl_kmers = 0;
  size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0;

  while(1)
  {
    if(!graph_file_read(&gfile, &bkmer, covgs, edges)) {
      status("Read kmer failed"); break; }
    binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
    if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer))
      die("File is not sorted: %s [%s]", bkmerstr, path);
    // We've already read one kmer entry, read rest of block
    bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block);
    bl_kmers = 1 + bl_bytes / kmer_mem;
    fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n",
            bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr,
            bl_kmer_offset, bl_kmer_offset+bl_kmers);
    bl_byte_offset += bl_bytes;
    bl_kmer_offset += bl_kmers;
    nblocks++;
    if(bl_kmers < block_kmers) {
      status("last block %zu < %zu; %zu vs %zu",
             bl_kmers, block_kmers, bl_bytes, block_size);
      break;
    }
    prev_bkmer = bkmer;
  }

  ctx_free(covgs);
  ctx_free(edges);
  ctx_free(tmp_mem);

  // done
  char num_kmers_str[50], num_blocks_str[50];
  char block_mem_str[50], block_kmers_str[50];
  ulong_to_str(bl_kmer_offset, num_kmers_str);
  ulong_to_str(nblocks, num_blocks_str);
  bytes_to_str(block_size, 1, block_mem_str);
  ulong_to_str(block_kmers, block_kmers_str);

  status("Read %s kmers in %s block%s (block size %s / %s kmers)",
         num_kmers_str, num_blocks_str, util_plural_str(nblocks),
         block_mem_str, block_kmers_str);

  if(fout != stdout) status("Saved to %s", out_path);

  graph_file_close(&gfile);
  fclose(fout);

  return EXIT_SUCCESS;
}
Esempio n. 6
0
int ctx_infer_edges(int argc, char **argv)
{
  size_t num_of_threads = DEFAULT_NTHREADS;
  struct MemArgs memargs = MEM_ARGS_INIT;
  char *out_ctx_path = NULL;
  bool add_pop_edges = false, add_all_edges = false;

  // Arg parsing
  char cmd[100];
  char shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_ctx_path,cmd); out_ctx_path = optarg; break;
      case 't': num_of_threads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'A': add_all_edges = true; break;
      case 'P': add_pop_edges = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" inferedges -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Default to adding all edges
  if(!add_pop_edges && !add_all_edges) add_all_edges = true;

  // Can only specify one of --pop --all
  if(add_pop_edges && add_all_edges)
    cmd_print_usage("Please specify only one of --all --pop");

  // Check that optind+1 == argc
  if(optind+1 > argc)
    cmd_print_usage("Expected exactly one graph file");
  else if(optind+1 < argc)
    cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]);

  //
  // Open graph file
  //
  char *graph_path = argv[optind];
  status("Reading graph: %s", graph_path);

  if(strchr(graph_path,':') != NULL)
    cmd_print_usage("Cannot use ':' in input graph for `"CMD" inferedges`");

  GraphFileReader file;
  memset(&file, 0, sizeof(file));

  file_filter_open(&file.fltr, graph_path);

  // Use stat to detect if we are reading from a stream
  struct stat st;
  bool reading_stream = (stat(file.fltr.path.b, &st) != 0);

  // Mode r+ means open (not create) for update (read & write)
  graph_file_open2(&file, graph_path, reading_stream ? "r" : "r+", 0);

  if(!file_filter_is_direct(&file.fltr))
    cmd_print_usage("Inferedges with filter not implemented - sorry");

  bool editing_file = !(out_ctx_path || reading_stream);

  FILE *fout = NULL;

  // Editing input file or writing a new file
  if(!editing_file)
    fout = futil_fopen_create(out_ctx_path ? out_ctx_path : "-", "w");

  // Print output status
  if(fout == stdout) status("Writing to STDOUT");
  else if(fout != NULL) status("Writing to: %s", out_ctx_path);
  else status("Editing file in place: %s", graph_path);

  status("Inferring all missing %sedges", add_pop_edges ? "population " : "");

  //
  // Decide on memory
  //
  const size_t ncols = file.hdr.num_of_cols;
  size_t kmers_in_hash, graph_mem, bits_per_kmer;

  // reading stream: all covgs + edges
  // reading file: one bit per kmer per colour for 'in colour'
  bits_per_kmer = sizeof(BinaryKmer)*8;

  if(reading_stream) {
    bits_per_kmer += ncols * 8 * (sizeof(Edges) + sizeof(Covg));
  } else {
    bits_per_kmer += ncols; // in colour
  }

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        file.num_of_kmers, file.num_of_kmers,
                                        memargs.mem_to_use_set, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Allocate memory
  //
  int alloc_flags = reading_stream ? DBG_ALLOC_EDGES | DBG_ALLOC_COVGS
                                   : DBG_ALLOC_NODE_IN_COL;

  dBGraph db_graph;
  db_graph_alloc(&db_graph, file.hdr.kmer_size,
                 ncols, reading_stream ? ncols : 1,
                 kmers_in_hash, alloc_flags);

  LoadingStats stats = LOAD_STATS_INIT_MACRO;
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // We need to load the graph for both --pop and --all since we need to check
  // if the next kmer is in each of the colours
  graph_load(&file, gprefs, &stats);

  if(add_pop_edges) status("Inferring edges from population...\n");
  else status("Inferring all missing edges...\n");

  size_t num_kmers_edited;

  if(reading_stream)
  {
    ctx_assert(fout != NULL);
    num_kmers_edited = infer_edges(num_of_threads, add_all_edges, &db_graph);
    graph_write_header(fout, &file.hdr);
    graph_write_all_kmers(fout, &db_graph);
  }
  else if(fout == NULL) {
    num_kmers_edited = inferedges_on_mmap(&db_graph, add_all_edges, &file);
  } else {
    num_kmers_edited = inferedges_on_file(&db_graph, add_all_edges, &file, fout);
  }

  if(fout != NULL && fout != stdout) fclose(fout);

  char modified_str[100], kmers_str[100];
  ulong_to_str(num_kmers_edited, modified_str);
  ulong_to_str(db_graph.ht.num_kmers, kmers_str);

  double modified_rate = 0;
  if(db_graph.ht.num_kmers)
    modified_rate = (100.0 * num_kmers_edited) / db_graph.ht.num_kmers;

  status("%s of %s (%.2f%%) nodes modified\n",
         modified_str, kmers_str, modified_rate);

  if(editing_file)
  {
    // Close and re-open
    fclose(file.fh);
    file.fh = NULL;
    futil_update_timestamp(file.fltr.path.b);
  }

  graph_file_close(&file);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Esempio n. 7
0
int ctx_view(int argc, char **argv)
{
  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // TODO:
  // print_action actions[argc];
  // bool read_kmers = false;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default:
        cmd_print_usage("Programmer fail. Tell Isaac.");
    }
  }

  if(print_kmers) parse_kmers = 1;

  bool no_flags = (!print_info && !parse_kmers && !print_kmers);
  if(no_flags) { print_info = parse_kmers = 1; }

  if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)");

  char *path = argv[optind];
  size_t num_errors = 0, num_warnings = 0;

  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(gfile));
  int ret = graph_file_open(&gfile, path);
  if(ret == 0) die("Cannot open file: %s", path);

  if(print_info)
  {
    char fsize_str[50];
    bytes_to_str((size_t)gfile.file_size, 0, fsize_str);
    printf("Loading file: %s\n", file_filter_path(&gfile.fltr));
    printf("File size: %s\n", fsize_str);
    printf("----\n");
  }

  size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_size = gfile.hdr.kmer_size;
  ctx_assert(ncols > 0);

  GraphFileHeader hdr;
  memset(&hdr, 0, sizeof(hdr));
  graph_file_merge_header(&hdr, &gfile);

  uint64_t nkmers_read = 0, nkmers_loaded = 0;
  uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0;
  uint64_t *col_nkmers, *col_sum_covgs;
  col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0]));
  col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0]));

  // Print header
  if(print_info) print_header(&hdr, gfile.num_of_kmers);

  BinaryKmer bkmer;
  Covg covgs[ncols], keep_kmer;
  Edges edges[ncols];

  bool direct_read = file_filter_is_direct(&gfile.fltr);

  if(parse_kmers || print_kmers)
  {
    if(print_info && print_kmers) printf("----\n");

    for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++)
    {
      // If kmer has no covg in any samples -> don't load
      keep_kmer = 0;
      for(col = 0; col < ncols; col++) {
        col_nkmers[col] += (covgs[col] > 0);
        col_sum_covgs[col] += covgs[col];
        keep_kmer |= covgs[col];
      }

      if(!direct_read && !keep_kmer) continue;
      nkmers_loaded++;

      /* Kmer Checks */
      // graph_file_read_reset() already checks for:
      // 1. oversized kmers
      // 2. kmers with covg 0 in all colours
      // 3. edges without coverage in a colour

      // Check for all-zeros (i.e. all As kmer: AAAAAA)
      uint64_t kmer_words_or = 0;

      for(i = 0; i < hdr.num_of_bitfields; i++)
        kmer_words_or |= bkmer.b[i];

      if(kmer_words_or == 0)
      {
        if(num_all_zero_kmers == 1)
        {
          loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n",
                        nkmers_read);
        }

        num_all_zero_kmers++;
      }

      // Check covg is 0 for all colours
      for(i = 0; i < ncols && covgs[i] == 0; i++);
      num_zero_covg_kmers += (i == ncols);

      // Print
      if(print_kmers)
        db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout);
    }
  }

  // check for various reading errors
  // if(errno != 0)
  //   loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno));

  int err = ferror(gfile.fh);
  if(err != 0)
    loading_error("occurred after file reading [%i]\n", err);

  char nstr[50];

  if(print_kmers || parse_kmers)
  {
    // file_size is set to -1 if we are reading from a stream,
    // therefore won't be able to check number of kmers read
    if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) {
      loading_warning("Expected %zu kmers, read %zu\n",
                      (size_t)gfile.num_of_kmers, (size_t)nkmers_read);
    }

    if(num_all_zero_kmers > 1)
    {
      loading_error("%s all-zero-kmers seen\n",
                    ulong_to_str(num_all_zero_kmers, nstr));
    }

    if(num_zero_covg_kmers > 0)
    {
      loading_warning("%s kmers have no coverage in any colour\n",
                      ulong_to_str(num_zero_covg_kmers, nstr));
    }
  }

  // Count warnings printed by graph_file_reader.c
  num_warnings += gfile.error_zero_covg;
  num_warnings += gfile.error_missing_covg;

  // Can only print these stats if we're read in the kmers
  if((print_kmers || parse_kmers) && print_info)
  {
    // print kmer coverage per sample
    printf("\n---- Per colour stats\n");
    printf("num. kmers:");
    for(col = 0; col < ncols; col++)
      printf("\t%s", ulong_to_str(col_nkmers[col], nstr));
    printf("\n");
    printf("sum coverage:");
    for(col = 0; col < ncols; col++)
      printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr));
    printf("\n");
    printf("kmer coverage:");
    for(col = 0; col < ncols; col++)
      printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col]));
    printf("\n");

    // Overall stats
    uint64_t sum_covgs = 0;
    double mean_kmer_covg = 0.0;
    for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col];
    mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0;

    printf("\n---- Overall stats\n");
    printf("Total kmers:    %s\n", ulong_to_str(nkmers_loaded, nstr));
    printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr));
    printf("Mean coverage:  %s\n", double_to_str(mean_kmer_covg, 2, nstr));
  }

  if(print_info)
  {
    // Print memory stats
    uint64_t mem, capacity, num_buckets, req_capacity;
    uint8_t bucket_size;

    req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY);
    capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size);
    mem = ht_mem(bucket_size, num_buckets,
                 sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8);

    char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100];
    bytes_to_str(mem, 1, memstr);
    ulong_to_str(capacity, capacitystr);
    ulong_to_str(bucket_size, bucket_size_str);
    ulong_to_str(num_buckets, num_buckets_str);

    size_t mem_height = (size_t)__builtin_ctzl(num_buckets);

    printf("\n---- Memory\n");
    printf("memory required: %s [capacity: %s]\n", memstr, capacitystr);
    printf("  bucket size: %s; number of buckets: %s\n",
            bucket_size_str, num_buckets_str);
    printf("  --kmer_size %zu --mem_height %zu --mem_width %i\n",
           kmer_size, mem_height, bucket_size);
  }

  if((print_kmers || parse_kmers) && print_info)
  {
    printf("\n----\n");
    if(num_warnings > 0 || num_errors > 0) {
      printf("Warnings: %zu; Errors: %zu\n",
              (size_t)num_warnings, (size_t)num_errors);
    }
    if(num_errors == 0)
      printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n");
  }

  ctx_free(col_nkmers);
  ctx_free(col_sum_covgs);

  // Close file (which zeros it)
  graph_file_close(&gfile);
  graph_header_dealloc(&hdr);

  return num_errors ? EXIT_FAILURE : EXIT_SUCCESS;
}