Esempio n. 1
/**
 * Infer missing edges for every kmer record in a graph file, editing the file
 * in place through a shared memory mapping, so the graph records never have to
 * be copied into a separate whole-file buffer.
 *
 * On-disk record layout used below (per kmer):
 *   BinaryKmer | Covg[ncols] | Edges[ncols]
 *
 * @param db_graph      graph handed to infer_all_edges() / infer_pop_edges()
 * @param add_all_edges true: call infer_all_edges(); false: infer_pop_edges()
 * @param file          open graph file; must not be a terminal and must have
 *                      the same number of colours as db_graph. The mapping is
 *                      MAP_SHARED and writable, so the underlying descriptor
 *                      has to be open for both reading and writing.
 * @return number of kmer records that were modified and written back
 */
static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(file->num_of_kmers >= 0);
  ctx_assert(file->file_size >= 0);

  // The format string has three conversions, so the path must be passed too
  status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]",
         file->fltr.path.b,
         (size_t)file->hdr_size, (size_t)file->file_size);

  if(fseek(file->fh, 0, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  // Open memory mapped file.
  // Records are read from the mapping as well as written to it, so request
  // PROT_READ explicitly: PROT_WRITE on its own is not guaranteed to be readable
  void *mmap_ptr = mmap(NULL, file->file_size, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fileno(file->fh), 0);

  if(mmap_ptr == MAP_FAILED)
    die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];

  bool updated;
  size_t i, num_kmers = file->num_of_kmers, num_kmers_edited = 0;
  size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols;

  // First record starts straight after the header
  char *ptr = (char*)mmap_ptr + file->hdr_size;

  for(i = 0; i < num_kmers; i++, ptr += filekmersize)
  {
    char *fh_covgs = ptr      + sizeof(BinaryKmer);
    char *fh_edges = fh_covgs + sizeof(Covg)*ncols;

    // Copy out of the mapping: records are not necessarily aligned
    memcpy(bkmer.b, ptr,      sizeof(BinaryKmer));
    memcpy(covgs,   fh_covgs, ncols * sizeof(Covg));
    memcpy(edges,   fh_edges, ncols * sizeof(Edges));

    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    if(updated) {
      // Write the modified record back into the file and count it
      memcpy(fh_covgs, covgs, ncols * sizeof(Covg));
      memcpy(fh_edges, edges, ncols * sizeof(Edges));
      num_kmers_edited++;
    }
  }

  if(munmap(mmap_ptr, file->file_size) == -1)
    die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno));

  return num_kmers_edited;
}
Esempio n. 2
// Print file filter description: logs the file path and the file's colour
// count, followed by the from->into colour mapping of each filter entry
// (entry 0 first, then entries 1..file_filter_num(fltr)-1, comma separated).
// NOTE(review): this listing has no braces, and the deeper indentation of the
// " with filter" lines suggests an enclosing conditional that is not shown
// here -- confirm against the full source before relying on this block.
void file_filter_status(const FileFilter *fltr)
  size_t i;

  message("[FileFilter] Loading file %s [%u colour%s]", file_filter_path(fltr),
          fltr->filencols, util_plural_str(fltr->filencols));

    // First mapping is printed with the " with filter: " prefix
    message(" with filter: %u->%u", file_filter_fromcol(fltr, 0),
                                    file_filter_intocol(fltr, 0));

    // Remaining mappings are appended, each preceded by a comma
    for(i = 1; i < file_filter_num(fltr); i++)
      message(",%u->%u", file_filter_fromcol(fltr,i), file_filter_intocol(fltr,i));
Esempio n. 3
/**
 * Infer missing edges for every kmer record of a seekable graph file and
 * write the (possibly modified) records, preceded by the file's header, to
 * fout. Using the file directly means the whole graph need not be loaded.
 *
 * @param db_graph      graph handed to infer_all_edges() / infer_pop_edges()
 * @param add_all_edges true: call infer_all_edges(); false: infer_pop_edges()
 * @param file          open graph file; must not be a terminal and must have
 *                      the same number of colours as db_graph
 * @param fout          output stream; must be non-NULL and must not share a
 *                      file descriptor with the input file
 * @return number of kmer records for which the infer call reported a change
 */
static size_t inferedges_on_file(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file, FILE *fout)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(fout != NULL);
  ctx_assert(fileno(file->fh) != fileno(fout));

  status("[inferedges] Processing file: %s", file_filter_path(&file->fltr));

  // Print header
  graph_write_header(fout, &file->hdr);

  // Read the input file again, starting at the first record after the header
  if(fseek(file->fh, file->hdr_size, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];

  size_t num_kmers_edited = 0;
  bool updated;

  // Every record is written out, whether or not it was changed, so the
  // write and the counter update must both live inside the loop body
  while(graph_file_read_reset(file, ncols, &bkmer, covgs, edges))
  {
    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    graph_write_kmer(fout, file->hdr.num_of_bitfields, file->hdr.num_of_cols,
                     bkmer, covgs, edges);

    num_kmers_edited += updated;
  }

  return num_kmers_edited;
}
Esempio n. 4
// Entry point for the `clean` command.
// As shown, the function: parses command-line options with getopt_long(),
// opens the input graph files, sizes and allocates a dBGraph, loads the
// graphs, optionally estimates a coverage threshold with
// cleaning_get_threshold(), calls clean_graph(), and writes the result with
// graph_files_merge(). Returns EXIT_SUCCESS.
// NOTE(review): this listing appears to have lost lines and tokens during
// extraction (no braces, empty arguments such as `ulong_to_str(, ...)`).
// Review notes below mark the places where the visible text is incomplete;
// confirm every one of them against the full source.
int ctx_clean(int argc, char **argv)
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  bool tip_cleaning = false, supernode_cleaning = false;
  size_t min_keep_tip = 0;
  Covg threshold = 0, fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      // NOTE(review): no `break` is visible after the 'o' body, so as shown
      // it would fall through into 'm' -- confirm a `break;` was not lost.
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      // NOTE(review): likewise no `break` is visible after 'T' or 'S'; as
      // shown 'T' falls through into 'S' and then 'B' -- confirm.
      case 'T':
        cmd_check(!tip_cleaning, cmd);
        min_keep_tip = cmd_uint32_nonzero(cmd, optarg);
        tip_cleaning = true;
      case 'S':
        cmd_check(!supernode_cleaning, cmd);
        if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg);
        supernode_cleaning = true;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  // Default behaviour
  // NOTE(review): the warn() below reads like the alternative branch of the
  // `if(out_ctx_path != NULL)` test; an `else` is presumably missing.
  if(!tip_cleaning && !supernode_cleaning) {
    if(out_ctx_path != NULL)
      supernode_cleaning = tip_cleaning = true; // do both
      warn("No cleaning being done: you did not specify --out <out.ctx>");

  bool doing_cleaning = (supernode_cleaning || tip_cleaning);

  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    cmd_print_usage("You gave --len-after <out> / --covg-after <out> without "
                    "any cleaning (set -s, --supernodes or -t, --tips)");

  // Refuse to overwrite an existing output file unless force is set;
  // an output path of "-" is exempt from the check
  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
    cmd_print_usage("Output file already exists: %s", out_ctx_path);

  if(fallback_thresh && !supernode_cleaning)
    cmd_print_usage("-B, --fallback <T> without --supernodes");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  // NOTE(review): the condition guarding the flattening is not visible here.
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(tip_cleaning && min_keep_tip == 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol, intocol;
  ErrorCleaning *cleaning;

  // NOTE(review): the first warn() call below is cut off after `fromcol,`
  // (its final argument and closing parenthesis are not visible).
  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && supernode_cleaning) {
        warn("%s:%zu already has supernode cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);

  // Print steps
  // NOTE(review): the "Cleaning tips" status line is indented like a guarded
  // statement but its condition is not visible (presumably tip_cleaning).
  size_t step = 0;
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_before_path);
    status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip);
  if(supernode_cleaning && threshold > 0)
    status("%zu. Cleaning supernodes with coverage < %u", step++, threshold);
  if(supernode_cleaning && threshold <= 0)
    status("%zu. Cleaning supernodes with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_after_path);

  // Decide memory usage
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  // Per kmer: (kmer + coverage + edges) bits for each loaded colour, plus one
  // extra set of edge bits when not every colour is loaded
  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8;
  size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits;

  // NOTE(review): bits_per_kmer is not among the arguments of the call below
  // as shown -- an argument may have been lost in extraction.
  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  // i.e. (memory in bits - extra edge bits for all kmers) divided by the
  // per-colour bits for all kmers, capped at the number of colours available
  size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) /
                        (per_kmer_per_col_bits * kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Check output files are writable

  // Does nothing if arg is NULL

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // Edges is a special case
  // NOTE(review): the left operand of `*` is missing in the next line.
  size_t num_edges = * (use_ncols + !all_colours_loaded);
  db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges));

  // Load graph into a single colour
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  outhdr.version = CTX_GRAPH_FILEFORMAT;
  outhdr.kmer_size = db_graph.kmer_size;
  outhdr.num_of_cols = ncols;
  // 2 bits per base, rounded up to whole 64-bit words
  outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64;
  graph_header_alloc(&outhdr, ncols);

  // Merge info into header
  size_t gcol = 0;
  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      intocol = file_filter_intocol(&gfiles[i].fltr, j);
      graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]);

  // Not all colours fit: load the files flattened; otherwise load each file
  if(ncols > use_ncols) {
    graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats);
  } else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, &stats);

  // NOTE(review): the kmer-count / capacity expressions are missing from the
  // next five statements (empty argument or initialiser).
  char num_kmers_str[100];
  ulong_to_str(, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers =;

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(, 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(, 1);

  if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path)
    // Get coverage distribution and estimate cleaning threshold
    // NOTE(review): only three arguments are visible in this call; others may
    // have been lost in extraction.
    int est_threshold = cleaning_get_threshold(nthreads,
                                               visited, &db_graph);

    if(est_threshold < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_threshold);

    // Use estimated threshold if threshold not set
    // The fallback is used when it is set and the estimate is below it
    // (a negative estimate, i.e. no estimate found, also takes this path)
    if(threshold <= 0) {
      if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        threshold = fallback_thresh;
      else if(est_threshold >= 0) threshold = est_threshold;

  // Die if we failed to find suitable cleaning threshold
  if(supernode_cleaning && threshold <= 0)
    die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)");

  if(doing_cleaning) {
    // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0)
    clean_graph(nthreads, threshold, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);


    // Output graph file
    Edges *intersect_edges = NULL;
    bool kmers_loaded = true;
    size_t col, thresh;

    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= supernode_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      // NOTE(review): cleaned_snodes was OR-ed with supernode_cleaning just
      // above, so inside this branch the ternary below always selects the
      // MAX2() arm as shown -- confirm this is intended.
      if(supernode_cleaning) {
        thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold)
                                          : (uint32_t)threshold;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);

      // We haven't loaded all the colours
      // intersect_edges are edges to mask with
      // resets graph edges
      // NOTE(review): the guarding condition and the right-hand side of the
      // `+=` below are not visible.
      intersect_edges = db_graph.col_edges;
      db_graph.col_edges +=;

    // Print stats on removed kmers
    // NOTE(review): the subtrahend (kmer count after cleaning) is missing.
    size_t removed_nkmers = initial_nkmers -;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    graph_files_merge(out_ctx_path, gfiles, num_gfiles,
                      kmers_loaded, all_colours_loaded,
                      intersect_edges, &outhdr, &db_graph);

    // Swap back
      db_graph.col_edges = intersect_edges;

  // NOTE(review): both operands of this consistency check are truncated.
  ctx_check( == hash_table_count_kmers(&;


  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);

  // NOTE(review): no frees of gfiles/visited/keep or graph deallocation are
  // visible here; they may have been on lines lost in extraction.

  return EXIT_SUCCESS;
Esempio n. 5
// Entry point for the `index` command.
// As shown, the function: parses options, opens exactly one graph file, reads
// it in blocks of block_kmers records, and for each block prints one
// tab-separated line (block start offset, next block offset, first kmer, kmer
// index, next kmer index) to the --out file or stdout. Returns EXIT_SUCCESS.
// NOTE(review): this listing appears to have lost lines during extraction (no
// braces, no visible loop construct). Review notes below mark the gaps;
// confirm them against the full source.
int ctx_index(int argc, char **argv)
  const char *out_path = NULL;
  size_t block_size = 0, block_kmers = 0;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      // NOTE(review): no `break` is visible after the 'b' or 's' bodies; as
      // shown 'b' falls through into 's' and 's' into the error cases --
      // confirm `break;` lines were not lost.
      case 'b':
        cmd_check(!block_kmers, cmd);
        block_kmers = cmd_size_nonzero(cmd, optarg);
      case 's':
        cmd_check(!block_size, cmd);
        block_size = cmd_size_nonzero(cmd, optarg);
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  if(block_size && block_kmers)
    cmd_print_usage("Cannot use --block-kmers and --block-size together");

  const char *ctx_path = argv[optind];

  // Open Graph file
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, "r+", true, 0);

  // NOTE(review): the condition guarding this die() is not visible.
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  // Open output file
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout;

  // Start
  size_t filencols = gfile.hdr.num_of_cols;
  size_t kmer_size = gfile.hdr.kmer_size;
  const char *path = file_filter_path(&gfile.fltr);

  // Size of one on-disk record: the kmer plus edges and coverage per colour
  size_t ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols;

  // Derive block_kmers from --block-size, or default to 4MB blocks when
  // neither option was given
  if(block_size) {
    block_kmers = block_size / kmer_mem;
  } else if(!block_size && !block_kmers) {
    block_size = 4 * ONE_MEGABYTE;
    block_kmers = block_size / kmer_mem;

  // Update block-size
  // (rounds the block size down to a whole number of records)
  block_size = block_kmers * kmer_mem;

  status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu",
         block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size);

  if(block_kmers == 0) die("Cannot set block_kmers to zero");

  // Print header
  fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout);

  BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO;
  BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO;
  Covg *covgs = ctx_malloc(ncols * sizeof(Covg));
  Edges *edges = ctx_malloc(ncols * sizeof(Edges));
  char bkmerstr[MAX_KMER_SIZE+1];

  size_t rem_block = block_size - kmer_mem; // block after first kmer
  char *tmp_mem = ctx_malloc(rem_block);

  // Read in file, print index
  size_t nblocks = 0;
  size_t bl_bytes = 0, bl_kmers = 0;
  size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0;

  // NOTE(review): the statements below contain a `break` and are indented as
  // a loop body, but the loop construct itself is not visible. No update of
  // nblocks is visible either, although it is tested and reported.
    if(!graph_file_read(&gfile, &bkmer, covgs, edges)) {
      status("Read kmer failed"); break; }
    binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
    // First kmer of each block must be greater than that of the previous block
    if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer))
      die("File is not sorted: %s [%s]", bkmerstr, path);
    // We've already read one kmer entry, read rest of block
    bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block);
    // NOTE(review): bl_bytes already includes the first record (kmer_mem was
    // added on the line above), so the extra `1 +` here looks like it counts
    // that record twice -- confirm against gfr_fread_bytes() semantics.
    bl_kmers = 1 + bl_bytes / kmer_mem;
    fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n",
            bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr,
            bl_kmer_offset, bl_kmer_offset+bl_kmers);
    bl_byte_offset += bl_bytes;
    bl_kmer_offset += bl_kmers;
    if(bl_kmers < block_kmers) {
      status("last block %zu < %zu; %zu vs %zu",
             bl_kmers, block_kmers, bl_bytes, block_size);
    prev_bkmer = bkmer;


  // done
  char num_kmers_str[50], num_blocks_str[50];
  char block_mem_str[50], block_kmers_str[50];
  ulong_to_str(bl_kmer_offset, num_kmers_str);
  ulong_to_str(nblocks, num_blocks_str);
  bytes_to_str(block_size, 1, block_mem_str);
  ulong_to_str(block_kmers, block_kmers_str);

  status("Read %s kmers in %s block%s (block size %s / %s kmers)",
         num_kmers_str, num_blocks_str, util_plural_str(nblocks),
         block_mem_str, block_kmers_str);

  if(fout != stdout) status("Saved to %s", out_path);

  // NOTE(review): no closing of fout/gfile or freeing of covgs, edges and
  // tmp_mem is visible; those lines may have been lost in extraction.

  return EXIT_SUCCESS;
Esempio n. 6
// Entry point for the `pop` command.
// As shown, the function: parses options, opens the input graph files,
// allocates a dBGraph with edges and coverage, loads the graphs, calls
// pop_bubbles(), prunes nodes with prune_nodes_lacking_flag(), reports how
// many kmers were removed and writes the result to out_path (default "-").
// Returns EXIT_SUCCESS.
// NOTE(review): this listing appears to have lost lines and tokens during
// extraction (no braces, empty initialisers such as `size_t nkmers0 =;`).
// Review notes below mark the gaps; confirm them against the full source.
int ctx_pop_bubbles(int argc, char **argv)
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  int32_t max_covg  = -1; // max mean coverage to remove <=0 => ignore
  int32_t max_klen  = -1; // max length (kmers) to remove <=0 => ignore
  int32_t max_kdiff = -1; // max diff between bubble branch lengths <0 => ignore

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      // The three limits start at -1 ("unset"); cmd_check() rejects an option
      // that is given after its limit has already been set
      case 'C': cmd_check(max_covg<0,  cmd); max_covg  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_klen<0,  cmd); max_klen  = cmd_uint32(cmd, optarg); break;
      case 'D': cmd_check(max_kdiff<0, cmd); max_kdiff = cmd_uint32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" pop -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  // Open graph files
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  // With a single input that is not "-", the file can be opened a second
  // time later on, so it is loaded flattened into one colour here
  bool reread_graph_to_filter = (num_gfiles == 1 &&
                                 strcmp(file_filter_path(&gfiles[0].fltr),"-") != 0);

  if(reread_graph_to_filter) {
    file_filter_flatten(&gfiles[0].fltr, 0);
    ncols = 1;

  // Check graphs are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1);

  // Decide on memory
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(Covg)*8*ncols +
                  sizeof(Edges)*8*ncols +
                  2; // 1 bit for visited, 1 for removed

  // NOTE(review): bits_per_kmer is not among the arguments of the call below
  // as shown -- an argument may have been lost in extraction.
  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Check out_path is writable

  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, ncols,
                 kmers_in_hash,  DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  // NOTE(review): the argument of roundup_bits2bytes() is missing.
  size_t nkwords = roundup_bits2bytes(;
  uint8_t *visited = ctx_calloc(1, nkwords);
  uint8_t *rmvbits  = ctx_calloc(1, nkwords);

  // Load graphs
  // empty_colours is true for the first file only and cleared afterwards
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, NULL);
    gprefs.empty_colours = false;


  PopBubblesPrefs prefs = {.max_rmv_covg = max_covg,
                           .max_rmv_klen = max_klen,
                           .max_rmv_kdiff = max_kdiff};
  size_t npopped = 0;
  char npopped_str[50];

  status("Popping bubbles...");
  npopped = pop_bubbles(&db_graph, nthreads, prefs, visited, rmvbits);
  ulong_to_str(npopped, npopped_str);
  status("Popped %s bubbles", npopped_str);

  // NOTE(review): the initialisers of nkmers0 and nkmers1 (kmer counts before
  // and after pruning, judging by the status message below) are missing.
  size_t nkmers0 =;
  status("Removing nodes...");
  // Invert every byte of rmvbits before pruning -- presumably so that nodes
  // marked for removal become the ones "lacking the flag"; TODO confirm
  for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i];
  prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph);
  size_t nkmers1 =;

  ctx_assert(nkmers1 <= nkmers0);
  char nkmers0str[50], nkmers1str[50], ndiffstr[50];
  ulong_to_str(nkmers0, nkmers0str);
  ulong_to_str(nkmers1, nkmers1str);
  ulong_to_str(nkmers0-nkmers1, ndiffstr);
  status("Number of kmers %s -> %s (-%s)", nkmers0str, nkmers1str, ndiffstr);

  // NOTE(review): two alternative output paths appear back to back below
  // (streaming the re-opened input file vs. saving the in-memory graph); the
  // conditional choosing between them is not visible -- presumably
  // reread_graph_to_filter.
    status("Streaming filtered file to: %s\n", out_path);
    GraphFileReader gfile;
    memset(&gfile, 0, sizeof(GraphFileReader));
    graph_file_open(&gfile, graph_paths[0]);
    graph_writer_stream_mkhdr(out_path, &gfile, &db_graph,
                              db_graph.col_edges, NULL);
    status("Saving to: %s\n", out_path);
    graph_writer_save_mkhdr(out_path, &db_graph, CTX_GRAPH_FILEFORMAT, NULL,
                          0, ncols);

  // NOTE(review): no closing of the graph files or freeing of gfiles, visited
  // and rmvbits is visible; those lines may have been lost in extraction.


  return EXIT_SUCCESS;
Esempio n. 7
int ctx_clean(int argc, char **argv)
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean
  uint32_t fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(min_keep_tip<0, cmd);
        min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1);
      case 'S':
      case 'U':
        cmd_check(unitig_min<0, cmd);
        unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1);
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  bool unitig_cleaning = (unitig_min != 0);
  bool tip_cleaning = (min_keep_tip != 0);
  bool doing_cleaning = (unitig_cleaning || tip_cleaning);

  // If you ever want to estimate cleaning threshold without outputting
  // a graph, change this to a warning
  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
    // warn("No cleaning being done: you did not specify --out <out.ctx>");

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    warn("You gave --len-after <out> / --covg-after <out> without "
         "any cleaning (set -U, --unitigs or -t, --tips)");

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
    cmd_print_usage("Output file already exists: %s", out_ctx_path);

  if(fallback_thresh && !unitig_cleaning)
    warn("-B, --fallback <T> without --unitigs");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(out_ctx_path == NULL)
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(min_keep_tip < 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && unitig_cleaning) {
        warn("%s:%zu already has unitig cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);

  // Print steps
  size_t step = 0;
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_before_path);
  if(min_keep_tip > 0)
    status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip);
  if(unitig_min > 0)
    status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min);
  if(unitig_min < 0)
    status("%zu. Cleaning unitigs with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_after_path);

  // Decide memory usage
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8;
  size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  per_col_bits * use_ncols +

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 -
                         sizeof(BinaryKmer)*8*kmers_in_hash +
                         extra_edge_bits*kmers_in_hash) /
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Check output files are writable

  // Does nothing if arg is NULL

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  // Extra edges required to hold union of kept edges
  Edges *edges_union = NULL;
  if(use_ncols < ncols)
    edges_union = ctx_calloc(, sizeof(Edges));

  // Load graph into a single colour
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  for(i = 0; i < num_gfiles; i++)
    graph_file_merge_header(&outhdr, &gfiles[i]);

  if(ncols > use_ncols)
    db_graph.num_of_cols = db_graph.num_edge_cols = 1;
    SWAP(edges_union, db_graph.col_edges);
    graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL);
    SWAP(edges_union, db_graph.col_edges);
    db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols;
  else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, NULL);

  char num_kmers_str[100];
  ulong_to_str(, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers =;

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(, 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(, 1);

  // Always estimate cleaning threshold
  // if(unitig_min <= 0 || covg_before_path || len_before_path)
  // {
    // Get coverage distribution and estimate cleaning threshold
    int est_min_covg = cleaning_get_threshold(nthreads,
                                              visited, &db_graph);

    if(est_min_covg < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_min_covg);

    // Use estimated threshold if threshold not set
    if(unitig_min < 0) {
      if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        unitig_min = fallback_thresh;
      else if(est_min_covg >= 0) unitig_min = est_min_covg;
  // }

  // Die if we failed to find suitable cleaning threshold
  if(unitig_min < 0)
    die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)");

  // Cleaning parameters should now be set (>0) or turned off (==0)
  ctx_assert(unitig_min >= 0);
  ctx_assert(min_keep_tip >= 0);

  if(unitig_min || min_keep_tip)
    // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0)
    clean_graph(nthreads, unitig_min, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);


  if(out_ctx_path != NULL)
    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= unitig_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(unitig_cleaning) {
        size_t thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min)
                                          : (uint32_t)unitig_min;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers -;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    // kmers_loaded=true
    graph_writer_merge(out_ctx_path, gfiles, num_gfiles,
                      true, all_colours_loaded,
                      edges_union, &outhdr, &db_graph);

  ctx_check( == hash_table_count_kmers(&;

  // TODO: report kmer coverage for each sample


  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);


  return EXIT_SUCCESS;
Esempio n. 8
// Parse the command line for the thread/correct commands into `args`, open
// the graph file named by the single remaining argument, and validate the
// collected inputs.
//
//   args         receives the parsed settings (memargs and fmt are initialised
//                here; options fill nthreads, out_ctp_path, colour, gpfiles,
//                inputs, ...)
//   argc, argv   handed to getopt_long_only(); exactly one non-option argument
//                (the graph path) must remain after the options
//   longopts     long option table; the short option string is derived from it
//   correct_cmd  when false, --out is required and the graph / path files must
//                each resolve to a single colour; when true, each input also
//                records an output base path
//
// Errors are reported through die() / cmd_print_usage().
//
// NOTE(review): this listing appears to have lost lines (the function's
// braces, several `break;` statements and two argument lists are not
// visible). The notes below mark each spot; confirm against the original file.
void read_thread_args_parse(struct ReadThreadCmdArgs *args,
                            int argc, char **argv,
                            const struct option *longopts, bool correct_cmd)
  size_t i;
  // Running settings: per-input options below modify `task`, and each input
  // option ('1', '2', 'i') pushes the current `task` onto `inputs`
  CorrectAlnInput task = CORRECT_ALN_INPUT_INIT;
  // FASTQ offset passed to asyncio_task_parse() for inputs that follow -O
  uint8_t fq_offset = 0;
  GPathReader tmp_gpfile;

  CorrectAlnInputBuffer *inputs = &args->inputs;
  args->memargs = (struct MemArgs)MEM_ARGS_INIT;
  args->fmt = SEQ_FMT_FASTQ;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  // `used` is cleared by every option that changes `task` / fq_offset and set
  // again when an input is pushed; if it is still 0 after parsing, the last
  // such options were never applied to any input (checked further down)
  int used = 1, c;
  char *tmp_path;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    // `cmd` holds the option's name for use in error messages
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    // presumably cmd_check() reports an error naming `cmd` when its condition
    // is false, so the guards below reject a repeated option -- confirm
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!args->out_ctp_path,cmd); args->out_ctp_path = optarg; break;
      case 'p':
        // Open the path file named by optarg and append it to args->gpfiles
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&args->gpfiles, &tmp_gpfile, 1);
        // NOTE(review): no `break` is visible here; as listed, control falls
        // through into case 't' -- confirm
      case 't':
        cmd_check(!args->nthreads, cmd);
        args->nthreads = cmd_uint32_nonzero(cmd, optarg);
        // NOTE(review): no `break` is visible here either -- confirm
      case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break;
      case 'c': args->colour = cmd_uint32(cmd, optarg); break;
      case 'F':
        // Format may only be changed once from its SEQ_FMT_FASTQ default
        cmd_check(args->fmt == SEQ_FMT_FASTQ, cmd);
        args->fmt = cmd_parse_format(cmd, optarg);
        // NOTE(review): no `break` is visible before the input cases -- confirm
      case '1':
      case '2':
      case 'i':
        // New input: push the current `task` settings, then let
        // asyncio_task_parse() fill in the files of the entry just pushed
        used = 1;
        correct_aln_input_buf_push(inputs, &task, 1);
        asyncio_task_parse(&inputs->b[inputs->len-1].files, c, optarg,
                           fq_offset, correct_cmd ? &tmp_path : NULL);
        // tmp_path is only requested (and only read) when correct_cmd is set
        if(correct_cmd) inputs->b[inputs->len-1].out_base = tmp_path;
        // NOTE(review): no `break` is visible before case 'M' -- confirm
      case 'M':
        // Mate pair orientation for subsequent inputs
             if(!strcmp(optarg,"FF")) task.matedir = READPAIR_FF;
        else if(!strcmp(optarg,"FR")) task.matedir = READPAIR_FR;
        else if(!strcmp(optarg,"RF")) task.matedir = READPAIR_RF;
        else if(!strcmp(optarg,"RR")) task.matedir = READPAIR_RR;
        else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR");
        used = 0; break;
      // The options below adjust settings for the inputs that follow them,
      // hence `used = 0` on each
      case 'O': fq_offset = cmd_uint8(cmd, optarg); used = 0; break;
      case 'Q': task.fq_cutoff = cmd_uint8(cmd, optarg); used = 0; break;
      case 'H': task.hp_cutoff = cmd_uint8(cmd, optarg); used = 0; break;
      case 'l': task.crt_params.frag_len_min = cmd_uint32(cmd, optarg); used = 0; break;
      case 'L': task.crt_params.frag_len_max = cmd_uint32(cmd, optarg); used = 0; break;
      case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break;
      case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break;
      case 'd': task.crt_params.gap_wiggle = cmd_udouble(cmd, optarg); used = 0; break;
      case 'D': task.crt_params.gap_variance = cmd_udouble(cmd, optarg); used = 0; break;
      case 'X': task.crt_params.max_context = cmd_uint32(cmd, optarg); used = 0; break;
      case 'e': task.crt_params.use_end_check = true; used = 0; break;
      case 'E': task.crt_params.use_end_check = false; used = 0; break;
      case 'g': cmd_check(!args->dump_seq_sizes, cmd); args->dump_seq_sizes = optarg; break;
      case 'G': cmd_check(!args->dump_frag_sizes, cmd); args->dump_frag_sizes = optarg; break;
      case 'u': args->use_new_paths = true; break;
      // gen_paths_print_* are not declared in this function -- presumably
      // globals defined elsewhere
      case 'x': gen_paths_print_contigs = true; break;
      case 'y': gen_paths_print_paths = true; break;
      case 'z': gen_paths_print_reads = true; break;
      case 'Z':
        // --fq-zero takes exactly one character
        cmd_check(!args->fq_zero, cmd);
        if(strlen(optarg) != 1)
          cmd_print_usage("--fq-zero <c> requires a single char");
        args->fq_zero = optarg[0];
        // NOTE(review): no `break` is visible before case 'P' -- confirm
      case 'P': cmd_check(!args->append_orig_seq,cmd); args->append_orig_seq = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" thread/correct -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
      // NOTE(review): the closing braces of the switch and the while loop are
      // not visible in this listing

  // Fall back to the default thread count when -t was not given
  if(args->nthreads == 0) args->nthreads = DEFAULT_NTHREADS;

  // Check that optind+1 == argc
  if(optind+1 > argc)
    cmd_print_usage("Expected exactly one graph file");
  else if(optind+1 < argc)
    cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]);

  char *graph_path = argv[optind];
  status("Reading graph: %s", graph_path);

  // Per-input options given after the last input were never applied
  if(!used) cmd_print_usage("Ignored arguments after last --seq");

  // ctx_thread requires output file
  if(!correct_cmd && !args->out_ctp_path)
    cmd_print_usage("--out <out.ctp> is required");

  // Open graph graph file
  GraphFileReader *gfile = &args->gfile;
  graph_file_open(gfile, graph_path);

  if(!correct_cmd && file_filter_into_ncols(&gfile->fltr) > 1)
    die("Please specify a single colour e.g. %s:0", file_filter_path(&gfile->fltr));

  // Open path files
  // (the files were opened in case 'p' above; this loop validates them and
  //  accumulates a maximum into path_max_usedcols)
  size_t path_max_usedcols = 0;
  for(i = 0; i < args->gpfiles.len; i++) {
    // file_filter_update_intocol(&args->pfiles.b[i].fltr, 0);
    if(!correct_cmd && file_filter_into_ncols(&args->gpfiles.b[i].fltr) > 1) {
      // NOTE(review): the argument list of this die() call and of the MAX2()
      // on the next line is cut off in this listing, as are the closing
      // braces -- confirm against the original file
      die("Please specify a single colour e.g. %s:0",
    path_max_usedcols = MAX2(path_max_usedcols,
  args->path_max_usedcols = path_max_usedcols;

  // Check for compatibility between graph files and path files
  graphs_gpaths_compatible(gfile, 1, args->gpfiles.b, args->gpfiles.len, -1);

  // if no paths loaded, set all max_context values to 1, since >1 kmer only
  // useful if can pickup paths
  if(args->gpfiles.len == 0) {
    for(i = 0; i < inputs->len; i++)
      inputs->b[i].crt_params.max_context = 1;

  // Check frag_len_min < frag_len_max
  // Also gives each input's `files` a back-pointer to its own entry and
  // records the largest frag_len_max seen in args->max_gap_limit
  for(i = 0; i < inputs->len; i++)
    CorrectAlnInput *t = &inputs->b[i];
    t->files.ptr = t;
    if(t->crt_params.frag_len_min > t->crt_params.frag_len_max) {
      die("--min-ins %u is greater than --max-ins %u",
          t->crt_params.frag_len_min, t->crt_params.frag_len_max);
    args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.frag_len_max);

Esempio n. 9
// Command entry point: open a single graph file (.ctx) and, depending on the
// print_info / parse_kmers / print_kmers flags, print its header, read through
// its kmers while running sanity checks, and print per-colour, overall and
// memory statistics.
//
//   argc, argv  command line, handed to getopt_long_only(); exactly one
//               non-option argument (the graph path) must remain
//
// Returns EXIT_FAILURE when num_errors is non-zero, EXIT_SUCCESS otherwise.
//
// NOTE(review): `longopts`, `print_info`, `parse_kmers` and `print_kmers` are
// used but not declared in the visible body -- presumably file-scope. This
// listing also appears to have lost lines (braces, option cases, counter
// increments, the final close call); the notes below mark each spot. Confirm
// against the original file.
int ctx_view(int argc, char **argv)
  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // TODO:
  // print_action actions[argc];
  // bool read_kmers = false;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      // NOTE(review): no cases that set print_info / parse_kmers / print_kmers
      // are visible here -- either lost from the listing or set via `case 0`
      // flag options; confirm
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
        // NOTE(review): the next call reads like the body of a `default:`
        // label that is not visible in this listing -- confirm
        cmd_print_usage("Programmer fail. Tell Isaac.");

  // Printing kmers implies parsing them
  if(print_kmers) parse_kmers = 1;

  // With no flags given, default to printing info and parsing kmers
  bool no_flags = (!print_info && !parse_kmers && !print_kmers);
  if(no_flags) { print_info = parse_kmers = 1; }

  if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)");

  char *path = argv[optind];
  // NOTE(review): neither counter is incremented directly in the visible code
  // (only the two `num_warnings +=` lines near the end); presumably
  // loading_error() / loading_warning() are macros that bump them -- confirm
  size_t num_errors = 0, num_warnings = 0;

  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(gfile));
  int ret = graph_file_open(&gfile, path);
  if(ret == 0) die("Cannot open file: %s", path);

    // NOTE(review): the extra indentation suggests a guarding condition and
    // braces that are not visible in this listing
    char fsize_str[50];
    bytes_to_str((size_t)gfile.file_size, 0, fsize_str);
    printf("Loading file: %s\n", file_filter_path(&gfile.fltr));
    printf("File size: %s\n", fsize_str);

  size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_size = gfile.hdr.kmer_size;
  ctx_assert(ncols > 0);

  // Build the header that is printed from a zeroed header merged with this file
  GraphFileHeader hdr;
  memset(&hdr, 0, sizeof(hdr));
  graph_file_merge_header(&hdr, &gfile);

  uint64_t nkmers_read = 0, nkmers_loaded = 0;
  uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0;
  // Per-colour tallies: number of kmers with coverage > 0, and summed coverage
  uint64_t *col_nkmers, *col_sum_covgs;
  col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0]));
  col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0]));

  // Print header
  if(print_info) print_header(&hdr, gfile.num_of_kmers);

  // Buffers for one kmer record: the kmer plus one Covg / Edges per colour
  BinaryKmer bkmer;
  Covg covgs[ncols], keep_kmer;
  Edges edges[ncols];

  bool direct_read = file_filter_is_direct(&gfile.fltr);

  if(parse_kmers || print_kmers)
    if(print_info && print_kmers) printf("----\n");

    // Loop for as long as graph_file_read_reset() returns non-zero
    for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++)
      // If kmer has no covg in any samples -> don't load
      keep_kmer = 0;
      for(col = 0; col < ncols; col++) {
        col_nkmers[col] += (covgs[col] > 0);
        col_sum_covgs[col] += covgs[col];
        keep_kmer |= covgs[col];

      // Kmers with zero coverage everywhere are skipped unless direct_read
      if(!direct_read && !keep_kmer) continue;
      // NOTE(review): nkmers_loaded is reported below but no increment is
      // visible in this listing -- confirm

      /* Kmer Checks */
      // graph_file_read_reset() already checks for:
      // 1. oversized kmers
      // 2. kmers with covg 0 in all colours
      // 3. edges without coverage in a colour

      // Check for all-zeros (i.e. all As kmer: AAAAAA)
      uint64_t kmer_words_or = 0;

      for(i = 0; i < hdr.num_of_bitfields; i++)
        kmer_words_or |= bkmer.b[i];

      if(kmer_words_or == 0)
        // NOTE(review): the loading_error() argument list below is cut off and
        // no increment of num_all_zero_kmers is visible -- confirm
        if(num_all_zero_kmers == 1)
          loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n",


      // Check covg is 0 for all colours
      // (empty-bodied loop: i stops at the first colour with non-zero
      //  coverage, so i == ncols means every colour is zero)
      for(i = 0; i < ncols && covgs[i] == 0; i++);
      num_zero_covg_kmers += (i == ncols);

      // Print
      // NOTE(review): the indentation suggests a guard (presumably on
      // print_kmers) that is not visible in this listing
        db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout);

  // check for various reading errors
  // if(errno != 0)
  //   loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno));

  int err = ferror(gfile.fh);
  if(err != 0)
    loading_error("occurred after file reading [%i]\n", err);

  // Scratch buffer for number-to-string conversions
  char nstr[50];

  if(print_kmers || parse_kmers)
    // file_size is set to -1 if we are reading from a stream,
    // therefore won't be able to check number of kmers read
    if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) {
      loading_warning("Expected %zu kmers, read %zu\n",
                      (size_t)gfile.num_of_kmers, (size_t)nkmers_read);

    if(num_all_zero_kmers > 1)
      loading_error("%s all-zero-kmers seen\n",
                    ulong_to_str(num_all_zero_kmers, nstr));

    if(num_zero_covg_kmers > 0)
      loading_warning("%s kmers have no coverage in any colour\n",
                      ulong_to_str(num_zero_covg_kmers, nstr));

  // Count warnings printed by graph_file_reader.c
  num_warnings += gfile.error_zero_covg;
  num_warnings += gfile.error_missing_covg;

  // Can only print these stats if we're read in the kmers
  if((print_kmers || parse_kmers) && print_info)
    // print kmer coverage per sample
    printf("\n---- Per colour stats\n");
    printf("num. kmers:");
    for(col = 0; col < ncols; col++)
      printf("\t%s", ulong_to_str(col_nkmers[col], nstr));
    printf("sum coverage:");
    for(col = 0; col < ncols; col++)
      printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr));
    printf("kmer coverage:");
    for(col = 0; col < ncols; col++)
      printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col]));

    // Overall stats
    uint64_t sum_covgs = 0;
    double mean_kmer_covg = 0.0;
    for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col];
    // Guard against dividing by zero when no kmers were counted
    mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0;

    printf("\n---- Overall stats\n");
    printf("Total kmers:    %s\n", ulong_to_str(nkmers_loaded, nstr));
    printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr));
    printf("Mean coverage:  %s\n", double_to_str(mean_kmer_covg, 2, nstr));

    // Print memory stats
    uint64_t mem, capacity, num_buckets, req_capacity;
    uint8_t bucket_size;

    // Size a hash table for the file's kmer count at IDEAL_OCCUPANCY;
    // hash_table_cap() reports the bucket layout through its pointer arguments
    req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY);
    capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size);
    // Third argument is the number of bits per kmer entry: the BinaryKmer plus
    // one Covg and one Edges per colour
    mem = ht_mem(bucket_size, num_buckets,
                 sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8);

    char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100];
    bytes_to_str(mem, 1, memstr);
    ulong_to_str(capacity, capacitystr);
    ulong_to_str(bucket_size, bucket_size_str);
    ulong_to_str(num_buckets, num_buckets_str);

    // Count of trailing zero bits; equals log2(num_buckets) only if
    // num_buckets is a power of two -- assumed here, TODO confirm.
    // NOTE(review): __builtin_ctzl takes unsigned long, num_buckets is uint64_t
    size_t mem_height = (size_t)__builtin_ctzl(num_buckets);

    printf("\n---- Memory\n");
    printf("memory required: %s [capacity: %s]\n", memstr, capacitystr);
    printf("  bucket size: %s; number of buckets: %s\n",
            bucket_size_str, num_buckets_str);
    printf("  --kmer_size %zu --mem_height %zu --mem_width %i\n",
           kmer_size, mem_height, bucket_size);

  // Final verdict: counts are printed only if something was flagged
  if((print_kmers || parse_kmers) && print_info)
    if(num_warnings > 0 || num_errors > 0) {
      printf("Warnings: %zu; Errors: %zu\n",
              (size_t)num_warnings, (size_t)num_errors);
    if(num_errors == 0)
      printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n");


  // Close file (which zeros it)
  // NOTE(review): no close call (nor any release of col_nkmers /
  // col_sum_covgs) is visible in this listing -- confirm

  return num_errors ? EXIT_FAILURE : EXIT_SUCCESS;