Ejemplo n.º 1
0
// Opens a new temporary file named <base>.tmp.<rand>, where <rand> is a
// zero-padded number derived from rand(). Up to 100 candidate names are
// tried; the name that was used is left in `path`.
// Calls die() if every candidate already exists or the file cannot be opened.
// The file is unlinked straight after opening to hide it.
static FILE* create_tmp_file(StrBuf *path, const char *base)
{
  const size_t max_tries = 100;
  size_t ntries = 0;

  while(ntries < max_tries) {
    size_t suffix = rand() % 9999;
    strbuf_reset(path);
    strbuf_sprintf(path, "%s.tmp.%04zu", base, suffix);
    if(!futil_file_exists(path->b)) break; // found a name not yet in use
    ntries++;
  }

  // Loop ran to completion: every candidate name was already taken
  if(ntries == max_tries)
    die("Temporary files already exist (%zu tries): %s", max_tries, path->b);

  FILE *tmp_fh = futil_fopen_create(path->b, "r+");
  if(tmp_fh == NULL)
    die("Cannot write temporary file: %s [%s]", path->b, strerror(errno));

  unlink(path->b); // Immediately unlink to hide temp file
  return tmp_fh;
}
Ejemplo n.º 2
0
// Entry point for the `rmsubstr` command.
// Parses options, loads every read from the input sequence files into memory,
// builds a kmer-occurrence graph over them and then prints reads according to
// the result of _is_substr(): reads for which it returns 0 are printed (or,
// with the invert option, reads for which it returns non-zero). Reads for
// which it returns -1 are counted as bad and never printed.
// Returns EXIT_SUCCESS; failures are reported via die() / cmd_print_usage().
int ctx_rmsubstr(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  size_t kmer_size = 0, nthreads = 0;
  const char *output_file = NULL;
  seq_format fmt = SEQ_FMT_FASTA;
  bool invert = false;

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!output_file, cmd); output_file = optarg; break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_uint32(cmd, optarg); break;
      case 'F': cmd_check(fmt==SEQ_FMT_FASTA, cmd); fmt = cmd_parse_format(cmd, optarg); break;
      case 'v': cmd_check(!invert,cmd); invert = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        cmd_print_usage("`"CMD" rmsubstr -h` for help. Bad option: %s", argv[optind-1]);
        // no break: control reaches abort() below if cmd_print_usage() returns
      default: abort();
    }
  }

  // Defaults
  if(!nthreads) nthreads = DEFAULT_NTHREADS;
  if(!kmer_size) kmer_size = DEFAULT_KMER;

  if(!(kmer_size&1)) cmd_print_usage("Kmer size must be odd");
  if(kmer_size < MIN_KMER_SIZE) cmd_print_usage("Kmer size too small (recompile)");
  if(kmer_size > MAX_KMER_SIZE) cmd_print_usage("Kmer size too large (recompile?)");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input sequence file (.fa, .fq etc.)");

  // All remaining arguments are input sequence files
  size_t i, num_seq_files = argc - optind;
  char **seq_paths = argv + optind;
  seq_file_t **seq_files = ctx_calloc(num_seq_files, sizeof(seq_file_t*));

  for(i = 0; i < num_seq_files; i++)
    if((seq_files[i] = seq_open(seq_paths[i])) == NULL)
      die("Cannot read sequence file %s", seq_paths[i]);

  // Estimate number of bases
  // set to -1 if we cannot calc
  int64_t est_num_bases = seq_est_seq_bases(seq_files, num_seq_files);
  if(est_num_bases < 0) {
    warn("Cannot get file sizes, using pipes");
    est_num_bases = memargs.num_kmers * IDEAL_OCCUPANCY;
  }

  status("[memory] Estimated number of bases: %li", (long)est_num_bases);

  // Use file sizes to decide on memory

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  // Every term is in bits: sizeof() yields bytes, so it is scaled by 8
  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  (sizeof(KONodeList) + sizeof(KOccur))*8 + // see kmer_occur.h
                  8; // 1 byte per kmer for each base to load sequence files

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        est_num_bases, est_num_bases,
                                        false, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Open output file
  //
  if(output_file == NULL) output_file = "-";
  FILE *fout = futil_fopen_create(output_file, "w");

  //
  // Set up memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, kmer_size, 1, 0, kmers_in_hash, DBG_ALLOC_BKTLOCKS);

  //
  // Load reference sequence into a read buffer
  //
  ReadBuffer rbuf;
  read_buf_alloc(&rbuf, 1024);
  seq_load_all_reads(seq_files, num_seq_files, &rbuf);

  // Check for reads too short: stop at the first read shorter than kmer_size
  for(i = 0; i < rbuf.len && rbuf.b[i].seq.end >= kmer_size; i++) {}
  if(i < rbuf.len)
    warn("Reads shorter than kmer size (%zu) will not be filtered", kmer_size);

  KOGraph kograph = kograph_create(rbuf.b, rbuf.len, true, 0,
                                   nthreads, &db_graph);

  size_t num_reads = rbuf.len, num_reads_printed = 0, num_bad_reads = 0;

  // Loop over reads printing those that are not substrings
  // (or, when inverted, only those that are)
  int ret;
  for(i = 0; i < rbuf.len; i++) {
    ret = _is_substr(&rbuf, i, &kograph, &db_graph);
    if(ret == -1) num_bad_reads++;
    else if((ret && invert) || (!ret && !invert)) {
      seqout_print_read(&rbuf.b[i], fmt, fout);
      num_reads_printed++;
    }
  }

  char num_reads_str[100], num_reads_printed_str[100], num_bad_reads_str[100];
  ulong_to_str(num_reads, num_reads_str);
  ulong_to_str(num_reads_printed, num_reads_printed_str);
  ulong_to_str(num_bad_reads, num_bad_reads_str);

  // Guard the percentage against division by zero when no reads were loaded
  status("Printed %s / %s (%.1f%%) to %s",
         num_reads_printed_str, num_reads_str,
         !num_reads ? 0.0 : (100.0 * num_reads_printed) / num_reads,
         futil_outpath_str(output_file));

  // num_bad_reads > 0 implies num_reads > 0, so the division is safe
  if(num_bad_reads > 0) {
    status("Bad reads: %s / %s (%.1f%%) - no kmer {ACGT} of length %zu",
           num_bad_reads_str, num_reads_str,
           (100.0 * num_bad_reads) / num_reads,
           kmer_size);
  }

  fclose(fout);
  kograph_dealloc(&kograph);

  // Free sequence memory
  for(i = 0; i < rbuf.len; i++) seq_read_dealloc(&rbuf.b[i]);
  read_buf_dealloc(&rbuf);
  ctx_free(seq_files);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Ejemplo n.º 3
0
int ctx_sort(int argc, char **argv)
{
  const char *out_path = NULL;
  struct MemArgs memargs = MEM_ARGS_INIT;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" sort -h` for help. Bad option: %s", argv[optind-1]);
      default: die("Bad option: [%c]: %s", c, cmd);
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, out_path ? "r" : "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  size_t num_kmers, memory;

  // Reading from a stream
  if(gfile.num_of_kmers < 0) {
    if(!memargs.num_kmers_set)
      die("If reading from a stream, must give -n <num_kmers>");
    num_kmers = memargs.num_kmers;
  }
  else num_kmers = gfile.num_of_kmers;

  // Open output path (if given)
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL;

  size_t i;
  size_t ncols = gfile.hdr.num_of_cols;
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*ncols;

  memory = (sizeof(char*) + kmer_mem) * num_kmers;

  char mem_str[50];
  bytes_to_str(memory, 1, mem_str);

  if(memory > memargs.mem_to_use)
    die("Require at least %s memory", mem_str);

  status("[memory] Total: %s", mem_str);

  char *mem = ctx_malloc(kmer_mem * num_kmers);
  char **kmers = ctx_malloc(num_kmers*sizeof(char*));

  // Read in whole file
  // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed");
  size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem);

  if(nkread != num_kmers*kmer_mem)
    die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem);

  // check we are at the end of the file
  char tmpc;
  if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) {
    die("More kmers in file than believed (kmers: %zu ncols: %zu).",
        num_kmers, ncols);
  }

  status("Read %zu kmers with %zu colour%s", num_kmers,
         ncols, util_plural_str(ncols));

  for(i = 0; i < num_kmers; i++)
    kmers[i] = mem + kmer_mem*i;

  sort_block(kmers, num_kmers);

  // Print
  if(out_path != NULL) {
    // saving to a different destination - write header
    graph_write_header(fout, &gfile.hdr);
  }
  else {
    // Directly manipulating gfile.fh here, using it to write later
    // Not doing any more reading
    if(fseek(gfile.fh, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed");
    fout = gfile.fh;
  }

  for(i = 0; i < num_kmers; i++)
    if(fwrite(kmers[i], 1, kmer_mem, fout) != kmer_mem)
      die("Cannot write to file");

  if(out_path) fclose(fout);

  graph_file_close(&gfile);
  ctx_free(kmers);
  ctx_free(mem);

  return EXIT_SUCCESS;
}
Ejemplo n.º 4
0
// Entry point for the `unitigs` command.
// Loads one or more graph files into a single-colour graph and prints its
// unitigs in the selected syntax (FASTA by default, or GFA / DOT).
// Returns 0 (EXIT_SUCCESS) on success; failures are reported via die() /
// cmd_print_usage().
int ctx_unitigs(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  UnitigSyntax syntax = PRINT_FASTA;
  bool dot_use_points = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      // NOTE(review): `!syntax` only detects an earlier format option if
      // PRINT_FASTA is zero - confirm against the UnitigSyntax definition
      case 'F': cmd_check(!syntax, cmd); syntax = PRINT_FASTA; break;
      case 'g': cmd_check(!syntax, cmd); syntax = PRINT_GFA; break;
      case 'd': cmd_check(!syntax, cmd); syntax = PRINT_DOT; break;
      case 'P': cmd_check(!dot_use_points, cmd); dot_use_points = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" unitigs -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(dot_use_points && syntax == PRINT_FASTA)
    cmd_print_usage("--point is only for use with --dot");

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage(NULL);

  // All remaining arguments are input graph files
  size_t i, num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  if(dot_use_points && syntax != PRINT_DOT)
    cmd_print_usage("--points only valid with --graphviz / --dot");

  ctx_assert(num_gfiles > 0);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(gfile_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + 1;
  // Non-FASTA output also needs a UnitigEnd per kmer
  if(syntax != PRINT_FASTA) bits_per_kmer += sizeof(UnitigEnd) * 8;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  status("Output in %s format to %s\n", syntax_strs[syntax],
         futil_outpath_str(out_path));

  //
  // Open output file
  //

  // Print to stdout unless --out <out> is specified
  FILE *fout = futil_fopen_create(out_path, "w");

  //
  // Allocate memory
  //
  // kmer size is taken from the first graph file
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES);

  UnitigPrinter printer;
  unitig_printer_init(&printer, &db_graph, nthreads, syntax, fout);

  if(syntax == PRINT_DOT || syntax == PRINT_GFA)
    unitig_graph_alloc(&printer.ugraph, &db_graph);

  // Load graphs
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .empty_colours = false};

  // Each file is flattened with file_filter_flatten(.., 0) before loading,
  // then closed as soon as it has been loaded
  for(i = 0; i < num_gfiles; i++) {
    file_filter_flatten(&gfiles[i].fltr, 0);
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  switch(syntax)
  {
    case PRINT_FASTA:
      status("Printing unitigs in FASTA using %zu threads", nthreads);
      supernodes_iterate(nthreads, printer.visited, &db_graph,
                         print_unitig_fasta, &printer);
      break;
    case PRINT_GFA:
      print_gfa_syntax(&printer);
      break;
    case PRINT_DOT:
      print_dot_syntax(&printer, dot_use_points);
      break;
    default:
      die("Invalid print syntax: %i", syntax);
  }

  char num_unitigs_str[50];
  ulong_to_str(printer.num_unitigs, num_unitigs_str);
  status("Dumped %s unitigs\n", num_unitigs_str);

  // fclose flushes buffered output; a failure here means data was lost
  if(fclose(fout) != 0)
    die("Cannot close output file: %s", futil_outpath_str(out_path));

  unitig_printer_destroy(&printer);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Ejemplo n.º 5
0
// Entry point for the `calls2vcf` command.
// Reads a bubble or breakpoint call file (the format is taken from the
// "file_format" field of its JSON header), loads the reference genome,
// writes a VCF header and then converts entries with parse_entries().
// Finishes by printing summary statistics and releasing resources.
// Many names used here (input_path, out_path, sam_path, chroms, genome, the
// num_* counters, ...) are not declared in this function - presumably
// file-scope state set up by parse_cmdline_args(); confirm against the file.
// Returns EXIT_SUCCESS; failures are reported via die() / cmd_print_usage().
int ctx_calls2vcf(int argc, char **argv)
{
  parse_cmdline_args(argc, argv);
  size_t i;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(input_path, "r");

  nw_aligner_setup();

  // Read file header
  cJSON *json = read_input_header(gzin);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, input_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) input_bubble_format = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) input_bubble_format = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(input_path),
         input_bubble_format ? "bubble" : "breakpoint");

  if(input_bubble_format && sam_path == NULL)
    cmd_print_usage("Require -F <flanks.sam> with bubble file");

  // Open flank file if it exists
  if(sam_path) flanks_sam_open();

  // Open output file
  FILE *fout = futil_fopen_create(out_path, "w");

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = kh_init(ChromHash);
  seq_reader_load_ref_genome(ref_paths, num_ref_paths, &chroms, genome);

  // convert to upper case
  // toupper() requires a value representable as unsigned char (or EOF);
  // passing a negative plain char is undefined behaviour, hence the cast
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = (char)toupper((unsigned char)*s);

  if(!input_bubble_format) brkpnt_check_refs_match(json, input_path);

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, input_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, input_path);

  num_samples = 0;
  if(!input_bubble_format) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  print_vcf_header(json, !input_bubble_format, fout);
  status("Reading %s call file with %zu samples",
         input_bubble_format ? "Bubble" : "Breakpoint", num_graph_samples);
  status("Writing a VCF with %zu samples", num_samples);
  parse_entries(gzin, fout);

  // Print stats
  char num_entries_read_str[50];
  char num_vars_printed_str[50];
  ulong_to_str(num_entries_read, num_entries_read_str);
  ulong_to_str(num_vars_printed, num_vars_printed_str);

  status("Read %s entries, printed %s vcf entries to: %s",
         num_entries_read_str, num_vars_printed_str, futil_outpath_str(out_path));

  if(input_bubble_format) {
    char msg[200];
    // Bubble caller specific
    print_stat(num_flank5p_unmapped,    num_entries_read, "flank 5p unmapped");
    // Bounded write: cannot overrun msg whatever the value of min_mapq
    snprintf(msg, sizeof(msg), "flank 5p low mapq (<%zu)", min_mapq);
    print_stat(num_flank5p_lowqual,     num_entries_read, msg);
    print_stat(num_flank3p_not_found,   num_entries_read, "flank 3p not found");
    print_stat(num_flank3p_multihits,   num_entries_read, "flank 3p multiple hits");
    print_stat(num_flank3p_approx_match,num_entries_read, "flank 3p approx match used");
    print_stat(num_flank3p_exact_match, num_entries_read, "flank 3p exact match");
  } else {
    // Breakpoint caller specific
    print_stat(num_flanks_not_uniquely_mapped, num_entries_read, "flank pairs contain one flank not mapped uniquely");
    print_stat(num_flanks_diff_chroms,         num_entries_read, "flank pairs map to diff chroms");
    print_stat(num_flanks_diff_strands,        num_entries_read, "flank pairs map to diff strands");
  }
  // Statistics reported for both input formats
  print_stat(num_flanks_too_far_apart,       num_entries_read, "flank pairs too far apart");
  print_stat(num_flanks_overlap_too_large,   num_entries_read, "flank pairs overlap too much");
  print_stat(num_entries_well_mapped,        num_entries_read, "flank pairs map well");

  status("Aligned %zu allele pairs and %zu flanks", num_nw_allele, num_nw_flank);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);
  fclose(fout);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  kh_destroy_ChromHash(genome);
  nw_aligner_destroy();

  if(sam_path) flanks_sam_close();

  // hide unused method warnings
  (void)kh_del_ChromHash;
  (void)kh_put_ChromHash;
  (void)kh_get_ChromHash;
  (void)kh_clear_ChromHash;
  (void)kh_destroy_ChromHash;
  (void)kh_init_ChromHash;

  return EXIT_SUCCESS;
}
Ejemplo n.º 6
0
// Entry point for the `index` command.
// Walks a sorted graph file block by block and prints one tab-separated line
// per block: byte offset of the block, byte offset of the next block, the
// first kmer in the block, and the index of the first kmer in this block and
// in the next. Dies if the first kmers of successive blocks are not strictly
// increasing. Block length comes from the 's' option (bytes) or the 'b'
// option (kmers); with neither it defaults to 4 * ONE_MEGABYTE.
// Output goes to the 'o' option path, or stdout if none was given.
// Returns EXIT_SUCCESS; failures are reported via die() / cmd_print_usage().
int ctx_index(int argc, char **argv)
{
  const char *out_path = NULL;
  size_t block_size = 0, block_kmers = 0;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'b':
        cmd_check(!block_kmers, cmd);
        block_kmers = cmd_size_nonzero(cmd, optarg);
        break;
      case 's':
        cmd_check(!block_size, cmd);
        block_size = cmd_size_nonzero(cmd, optarg);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  if(block_size && block_kmers)
    cmd_print_usage("Cannot use --block-kmers and --block-size together");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  // Open output file
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout;

  // Start
  size_t filencols = gfile.hdr.num_of_cols;
  size_t kmer_size = gfile.hdr.kmer_size;
  const char *path = file_filter_path(&gfile.fltr);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);
  // Bytes per record in the file: kmer plus edges and coverage per colour
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols;

  if(block_size) {
    block_kmers = block_size / kmer_mem;
  } else if(!block_size && !block_kmers) {
    block_size = 4 * ONE_MEGABYTE;
    block_kmers = block_size / kmer_mem;
  }

  // Update block-size
  // (rounds it down to a whole number of kmer records)
  block_size = block_kmers * kmer_mem;

  status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu",
         block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size);

  // Must hold before rem_block is computed below, or it would underflow
  if(block_kmers == 0) die("Cannot set block_kmers to zero");

  // Print header
  fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout);

  BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO;
  BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO;
  Covg *covgs = ctx_malloc(ncols * sizeof(Covg));
  Edges *edges = ctx_malloc(ncols * sizeof(Edges));
  char bkmerstr[MAX_KMER_SIZE+1];

  size_t rem_block = block_size - kmer_mem; // block after first kmer
  char *tmp_mem = ctx_malloc(rem_block);

  // Read in file, print index
  size_t nblocks = 0;
  size_t bl_bytes = 0, bl_kmers = 0;
  size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0;

  while(1)
  {
    // The first kmer of each block is parsed; it is the block's index key
    if(!graph_file_read(&gfile, &bkmer, covgs, edges)) {
      status("Read kmer failed"); break; }
    binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
    if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer))
      die("File is not sorted: %s [%s]", bkmerstr, path);
    // We've already read one kmer entry, read rest of block
    bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block);
    // bl_bytes already includes the first kmer, so do not add one again
    bl_kmers = bl_bytes / kmer_mem;
    fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n",
            bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr,
            bl_kmer_offset, bl_kmer_offset+bl_kmers);
    bl_byte_offset += bl_bytes;
    bl_kmer_offset += bl_kmers;
    nblocks++;
    // A short block means the end of the file was reached
    if(bl_kmers < block_kmers) {
      status("last block %zu < %zu; %zu vs %zu",
             bl_kmers, block_kmers, bl_bytes, block_size);
      break;
    }
    prev_bkmer = bkmer;
  }

  ctx_free(covgs);
  ctx_free(edges);
  ctx_free(tmp_mem);

  // done
  char num_kmers_str[50], num_blocks_str[50];
  char block_mem_str[50], block_kmers_str[50];
  ulong_to_str(bl_kmer_offset, num_kmers_str);
  ulong_to_str(nblocks, num_blocks_str);
  bytes_to_str(block_size, 1, block_mem_str);
  ulong_to_str(block_kmers, block_kmers_str);

  status("Read %s kmers in %s block%s (block size %s / %s kmers)",
         num_kmers_str, num_blocks_str, util_plural_str(nblocks),
         block_mem_str, block_kmers_str);

  if(fout != stdout) status("Saved to %s", out_path);

  graph_file_close(&gfile);
  fclose(fout);

  return EXIT_SUCCESS;
}
Ejemplo n.º 7
0
int ctx_infer_edges(int argc, char **argv)
{
  size_t num_of_threads = DEFAULT_NTHREADS;
  struct MemArgs memargs = MEM_ARGS_INIT;
  char *out_ctx_path = NULL;
  bool add_pop_edges = false, add_all_edges = false;

  // Arg parsing
  char cmd[100];
  char shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_ctx_path,cmd); out_ctx_path = optarg; break;
      case 't': num_of_threads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'A': add_all_edges = true; break;
      case 'P': add_pop_edges = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" inferedges -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Default to adding all edges
  if(!add_pop_edges && !add_all_edges) add_all_edges = true;

  // Can only specify one of --pop --all
  if(add_pop_edges && add_all_edges)
    cmd_print_usage("Please specify only one of --all --pop");

  // Check that optind+1 == argc
  if(optind+1 > argc)
    cmd_print_usage("Expected exactly one graph file");
  else if(optind+1 < argc)
    cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]);

  //
  // Open graph file
  //
  char *graph_path = argv[optind];
  status("Reading graph: %s", graph_path);

  if(strchr(graph_path,':') != NULL)
    cmd_print_usage("Cannot use ':' in input graph for `"CMD" inferedges`");

  GraphFileReader file;
  memset(&file, 0, sizeof(file));

  file_filter_open(&file.fltr, graph_path);

  // Use stat to detect if we are reading from a stream
  struct stat st;
  bool reading_stream = (stat(file.fltr.path.b, &st) != 0);

  // Mode r+ means open (not create) for update (read & write)
  graph_file_open2(&file, graph_path, reading_stream ? "r" : "r+", 0);

  if(!file_filter_is_direct(&file.fltr))
    cmd_print_usage("Inferedges with filter not implemented - sorry");

  bool editing_file = !(out_ctx_path || reading_stream);

  FILE *fout = NULL;

  // Editing input file or writing a new file
  if(!editing_file)
    fout = futil_fopen_create(out_ctx_path ? out_ctx_path : "-", "w");

  // Print output status
  if(fout == stdout) status("Writing to STDOUT");
  else if(fout != NULL) status("Writing to: %s", out_ctx_path);
  else status("Editing file in place: %s", graph_path);

  status("Inferring all missing %sedges", add_pop_edges ? "population " : "");

  //
  // Decide on memory
  //
  const size_t ncols = file.hdr.num_of_cols;
  size_t kmers_in_hash, graph_mem, bits_per_kmer;

  // reading stream: all covgs + edges
  // reading file: one bit per kmer per colour for 'in colour'
  bits_per_kmer = sizeof(BinaryKmer)*8;

  if(reading_stream) {
    bits_per_kmer += ncols * 8 * (sizeof(Edges) + sizeof(Covg));
  } else {
    bits_per_kmer += ncols; // in colour
  }

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        file.num_of_kmers, file.num_of_kmers,
                                        memargs.mem_to_use_set, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Allocate memory
  //
  int alloc_flags = reading_stream ? DBG_ALLOC_EDGES | DBG_ALLOC_COVGS
                                   : DBG_ALLOC_NODE_IN_COL;

  dBGraph db_graph;
  db_graph_alloc(&db_graph, file.hdr.kmer_size,
                 ncols, reading_stream ? ncols : 1,
                 kmers_in_hash, alloc_flags);

  LoadingStats stats = LOAD_STATS_INIT_MACRO;
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // We need to load the graph for both --pop and --all since we need to check
  // if the next kmer is in each of the colours
  graph_load(&file, gprefs, &stats);

  if(add_pop_edges) status("Inferring edges from population...\n");
  else status("Inferring all missing edges...\n");

  size_t num_kmers_edited;

  if(reading_stream)
  {
    ctx_assert(fout != NULL);
    num_kmers_edited = infer_edges(num_of_threads, add_all_edges, &db_graph);
    graph_write_header(fout, &file.hdr);
    graph_write_all_kmers(fout, &db_graph);
  }
  else if(fout == NULL) {
    num_kmers_edited = inferedges_on_mmap(&db_graph, add_all_edges, &file);
  } else {
    num_kmers_edited = inferedges_on_file(&db_graph, add_all_edges, &file, fout);
  }

  if(fout != NULL && fout != stdout) fclose(fout);

  char modified_str[100], kmers_str[100];
  ulong_to_str(num_kmers_edited, modified_str);
  ulong_to_str(db_graph.ht.num_kmers, kmers_str);

  double modified_rate = 0;
  if(db_graph.ht.num_kmers)
    modified_rate = (100.0 * num_kmers_edited) / db_graph.ht.num_kmers;

  status("%s of %s (%.2f%%) nodes modified\n",
         modified_str, kmers_str, modified_rate);

  if(editing_file)
  {
    // Close and re-open
    fclose(file.fh);
    file.fh = NULL;
    futil_update_timestamp(file.fltr.path.b);
  }

  graph_file_close(&file);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Ejemplo n.º 8
0
/**
 * `links` command: read a single-colour link file and, depending on the
 * options given, any combination of:
 *   - list the links of every kmer as CSV ('l'),
 *   - plot the link tree of one kmer as a DOT file ('P'),
 *   - build a coverage histogram ('H') and/or print a suggested cleaning
 *     threshold ('T'),
 *   - write the links, optionally cleaned with a coverage cutoff ('c'), to a
 *     gzipped output file ('o').
 *
 * @param argc number of entries in argv
 * @param argv options followed by exactly one input link file path
 * @return EXIT_SUCCESS; failures are reported through die() and
 *         cmd_print_usage() instead of a return code
 */
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Histogram dimensions are only meaningful when a histogram is written
  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  // Index of the kmer whose tree gets plotted: the first kmer, or kmer
  // number limit-1 when a limit was given
  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Output handles; each stays NULL unless the matching option was given
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  // Open input file
  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  // Private copy of the input header; edited below before being written out
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  // hist_distsize rows of hist_covgsize counters, allocated only if needed
  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
    die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
    die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file. Links are staged here because the
    // header is written to the output first, yet its counts are only known
    // once every kmer has been processed.
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  // One iteration per kmer: rebuild the tree from that kmer's links, then
  // feed it to each requested output. Stops at end of input or at `limit`.
  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    // The histogram is updated before cleaning, so it sees the raw coverage
    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics. tree_stats.num_links is a running total, so the
    // change since the previous kmer is the link count of this kmer's tree.
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    // Kmers left without any links are not written to the output
    if(save && num_links)
    {
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  // Report header counts before -> after processing.
  // NOTE(review): link_out_path is NULL here when not saving; it is passed on
  // to json_hdr_get unchanged -- confirm that helper accepts a NULL path.
  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    // cJSON_Print returns NULL on failure; strlen(NULL) would be undefined
    char *json_str = cJSON_Print(newhdr);
    if(json_str == NULL)
      die("Cannot serialise header for: %s", link_out_path);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    // gzputs returns -1 on error
    if(gzputs(link_gz, "\n\n") < 0 ||
       gzputs(link_gz, ctp_explanation_comment) < 0 ||
       gzputs(link_gz, "\n") < 0)
      die("Cannot write ctp file to: %s", link_out_path);

    // Copy the staged links after the header, 4MB at a time
    if(fseek(link_tmp_fh, 0, SEEK_SET) != 0)
      die("Cannot rewind temporary file: %s", link_tmp_path.b);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    // fread returns 0 for both EOF and error; tell them apart
    if(ferror(link_tmp_fh))
      die("Cannot read temporary file: %s", link_tmp_path.b);
    ctx_free(tmp);

    // gzclose flushes pending compressed data, so a failure here means the
    // output file is incomplete
    if(gzclose(link_gz) != Z_OK)
      die("Cannot write to output: %s", link_out_path);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  // Close the remaining output streams, leaving stdout open
  if(hist_fh && hist_fh != stdout) fclose(hist_fh);
  if(thresh_fh && thresh_fh != stdout) fclose(thresh_fh);
  if(list_fh && list_fh != stdout) fclose(list_fh);
  if(plot_fh && plot_fh != stdout) fclose(plot_fh);

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}
Ejemplo n.º 9
0
/**
 * `contigs` command: load one or more graph files plus optional link files
 * ('p') into memory, build a contig confidence table, then assemble contigs
 * from the sample held in colour zero and optionally write them to a file
 * ('o').
 *
 * @param argc number of entries in argv
 * @param argv options followed by one or more input graph file paths
 * @return EXIT_SUCCESS; failures are reported through die() and
 *         cmd_print_usage() instead of a return code
 */
int ctx_contigs(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t i, contig_limit = 0, colour = 0;
  bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R
  const char *conf_table_path = NULL; // save confidence table to here
  bool use_missing_info_check = true, seed_with_unused_paths = false;
  double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min

  // Read length and expected depth for calculating confidences
  size_t genome_size = 0;

  // Seed files and link files are opened while the options are parsed
  seq_file_t *tmp_seed_file = NULL;
  SeqFilePtrBuffer seed_buf;
  seq_file_ptr_buf_alloc(&seed_buf, 16);

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100], shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_path,cmd); out_path = optarg; break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case '1': /* handled the same way as 's' */
      case 's': // --seed <in.fa>
        if((tmp_seed_file = seq_open(optarg)) == NULL)
          die("Cannot read --seed file: %s", optarg);
        seq_file_ptr_buf_add(&seed_buf, tmp_seed_file);
        break;
      case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break;
      case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break;
      case 'N':
        cmd_check(!contig_limit,cmd);
        contig_limit = cmd_uint32_nonzero(cmd, optarg);
        break;
      case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break;
      case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break;
      case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break;
      case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break;
      case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break;
      case 'C':
        cmd_check(min_cumul_confid < 0,cmd);
        min_cumul_confid = cmd_udouble(cmd,optarg);
        if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd);
        break;
      case 'T':
        cmd_check(min_step_confid < 0,cmd);
        min_step_confid = cmd_udouble(cmd,optarg);
        if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(cmd_no_reseed && cmd_reseed)
    cmd_print_usage("Cannot specify both -r and -R");

  if(contig_limit && seed_with_unused_paths)
    cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths");

  bool sample_with_replacement = cmd_reseed;

  // Defaults
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  // Reseeding with neither seed files nor a contig limit is rejected
  if(!seed_buf.len && !contig_limit && sample_with_replacement) {
    cmd_print_usage("Please specify one or more of: "
                    "--no-reseed | --ncontigs | --seed <in.fa>");
  }

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(graph_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  // Update colours in graph file - sample in 0, all others in 1
  // never need more than two colours
  ncols = gpath_load_sample_pop(gfiles, num_gfiles,
                                gpfiles.b, gpfiles.len, colour);

  // Check for compatibility between graph files and path files
  // pop_colour is colour 1
  graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1);

  if(!genome_size)
  {
    // No genome size given: fall back to the kmer count of the input graphs
    char nk_str[50];
    if(ctx_max_kmers == 0) die("Please pass --genome <G> if streaming");
    genome_size = ctx_max_kmers;
    ulong_to_str(genome_size, nk_str);
    status("Taking number of kmers as genome size: %s", nk_str);
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of kmer usage
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  ncols + !sample_with_replacement;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  // Paths memory: whatever is left after the graph, never below zero
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  // Load contig hist distribution from ctp files
  ZeroSizeBuffer contig_hist;
  memset(&contig_hist, 0, sizeof(contig_hist));

  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load_contig_hist(gpfiles.b[i].json,
                                  gpfiles.b[i].fltr.path.b,
                                  file_filter_fromcol(&gpfiles.b[i].fltr, 0),
                                  &contig_hist);
  }

  // Calculate confidences, only for one colour
  ContigConfidenceTable conf_table;
  conf_table_alloc(&conf_table, 1);
  conf_table_update_hist(&conf_table, 0, genome_size,
                         contig_hist.b, contig_hist.len);

  if(conf_table_path != NULL) {
    conf_table_save(&conf_table, conf_table_path);
  }

  zsize_buf_dealloc(&contig_hist);

  //
  // Output file if printing
  //
  // A failed open must not be mistaken for "no output requested" (fout NULL)
  FILE *fout = NULL;
  if(out_path != NULL && (fout = futil_fopen_create(out_path, "w")) == NULL)
    die("Cannot open output file: %s", out_path);

  // Allocate
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem,
                             false, &db_graph);

  // Bit per hash table slot, only allocated when not reseeding
  uint8_t *visited = NULL;

  if(!sample_with_replacement)
    visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  // Load graph
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .empty_colours = true};

  // empty_colours is true for the first file only
  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, &stats);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  // Load path files
  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles.b[i]);
  }
  gpfile_buf_dealloc(&gpfiles);

  AssembleContigStats assem_stats;
  assemble_contigs_stats_init(&assem_stats);

  assemble_contigs(nthreads, seed_buf.b, seed_buf.len,
                   contig_limit, visited,
                   use_missing_info_check, seed_with_unused_paths,
                   min_step_confid, min_cumul_confid,
                   fout, out_path, &assem_stats, &conf_table,
                   &db_graph, 0); // Sample always loaded into colour zero

  if(fout && fout != stdout) fclose(fout);

  assemble_contigs_stats_print(&assem_stats);
  assemble_contigs_stats_destroy(&assem_stats);

  conf_table_dealloc(&conf_table);

  for(i = 0; i < seed_buf.len; i++)
    seq_close(seed_buf.b[i]);

  seq_file_ptr_buf_dealloc(&seed_buf);

  ctx_free(visited);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}