Пример #1
0
// if one of the files is reading from stdin, sum_kmers_ptr is set to 0
// `max_cols_ptr` is used to return the most colours being loaded from a single file
// returns the number of colours being loaded in total
size_t graph_files_open(char **graph_paths,
                        GraphFileReader *gfiles, size_t num_gfiles,
                        size_t *max_kmers_ptr, size_t *sum_kmers_ptr)
{
  size_t i, ctx_max_kmers = 0, ctx_sum_kmers = 0;
  bool ctx_uses_stdin = false;
  size_t ncols = 0;

  for(i = 0; i < num_gfiles; i++)
  {
    memset(&gfiles[i], 0, sizeof(GraphFileReader));
    graph_file_open2(&gfiles[i], graph_paths[i], "r", ncols);

    if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                      gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size);
    }

    ncols = MAX2(ncols, file_filter_into_ncols(&gfiles[i].fltr));

    ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i]));
    ctx_sum_kmers += graph_file_nkmers(&gfiles[i]);
    ctx_uses_stdin |= file_filter_isstdin(&gfiles[i].fltr);
  }

  if(ctx_uses_stdin) ctx_sum_kmers = SIZE_MAX;

  *max_kmers_ptr = ctx_max_kmers;
  *sum_kmers_ptr = ctx_sum_kmers;

  return ncols;
}
Пример #2
0
int ctx_index(int argc, char **argv)
{
  const char *out_path = NULL;
  size_t block_size = 0, block_kmers = 0;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'b':
        cmd_check(!block_kmers, cmd);
        block_kmers = cmd_size_nonzero(cmd, optarg);
        break;
      case 's':
        cmd_check(!block_size, cmd);
        block_size = cmd_size_nonzero(cmd, optarg);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  if(block_size && block_kmers)
    cmd_print_usage("Cannot use --block-kmers and --block-size together");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  // Open output file
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout;

  // Start
  size_t filencols = gfile.hdr.num_of_cols;
  size_t kmer_size = gfile.hdr.kmer_size;
  const char *path = file_filter_path(&gfile.fltr);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols;

  if(block_size) {
    block_kmers = block_size / kmer_mem;
  } else if(!block_size && !block_kmers) {
    block_size = 4 * ONE_MEGABYTE;
    block_kmers = block_size / kmer_mem;
  }

  // Update block-size
  block_size = block_kmers * kmer_mem;

  status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu",
         block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size);

  if(block_kmers == 0) die("Cannot set block_kmers to zero");

  // Print header
  fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout);

  BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO;
  BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO;
  Covg *covgs = ctx_malloc(ncols * sizeof(Covg));
  Edges *edges = ctx_malloc(ncols * sizeof(Edges));
  char bkmerstr[MAX_KMER_SIZE+1];

  size_t rem_block = block_size - kmer_mem; // block after first kmer
  char *tmp_mem = ctx_malloc(rem_block);

  // Read in file, print index
  size_t nblocks = 0;
  size_t bl_bytes = 0, bl_kmers = 0;
  size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0;

  while(1)
  {
    if(!graph_file_read(&gfile, &bkmer, covgs, edges)) {
      status("Read kmer failed"); break; }
    binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
    if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer))
      die("File is not sorted: %s [%s]", bkmerstr, path);
    // We've already read one kmer entry, read rest of block
    bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block);
    bl_kmers = 1 + bl_bytes / kmer_mem;
    fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n",
            bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr,
            bl_kmer_offset, bl_kmer_offset+bl_kmers);
    bl_byte_offset += bl_bytes;
    bl_kmer_offset += bl_kmers;
    nblocks++;
    if(bl_kmers < block_kmers) {
      status("last block %zu < %zu; %zu vs %zu",
             bl_kmers, block_kmers, bl_bytes, block_size);
      break;
    }
    prev_bkmer = bkmer;
  }

  ctx_free(covgs);
  ctx_free(edges);
  ctx_free(tmp_mem);

  // done
  char num_kmers_str[50], num_blocks_str[50];
  char block_mem_str[50], block_kmers_str[50];
  ulong_to_str(bl_kmer_offset, num_kmers_str);
  ulong_to_str(nblocks, num_blocks_str);
  bytes_to_str(block_size, 1, block_mem_str);
  ulong_to_str(block_kmers, block_kmers_str);

  status("Read %s kmers in %s block%s (block size %s / %s kmers)",
         num_kmers_str, num_blocks_str, util_plural_str(nblocks),
         block_mem_str, block_kmers_str);

  if(fout != stdout) status("Saved to %s", out_path);

  graph_file_close(&gfile);
  fclose(fout);

  return EXIT_SUCCESS;
}
Пример #3
0
int ctx_join(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t use_ncols = 0;

  GraphFileReader tmp_gfile;
  GraphFileBuffer isec_gfiles_buf;
  gfile_buf_alloc(&isec_gfiles_buf, 8);

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 'i':
        graph_file_reset(&tmp_gfile);
        graph_file_open(&tmp_gfile, optarg);
        if(file_filter_into_ncols(&tmp_gfile.fltr) > 1)
          warn("Flattening intersection graph into colour 0: %s", optarg);
        file_filter_flatten(&tmp_gfile.fltr, 0);
        gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  GraphFileReader *igfiles = isec_gfiles_buf.b;
  size_t num_igfiles = isec_gfiles_buf.len;

  if(!out_path) cmd_print_usage("--out <out.ctx> required");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input graph file");

  // optind .. argend-1 are graphs to load
  size_t num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));

  status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles);

  // Check all binaries are valid binaries with matching kmer size
  size_t i;
  size_t ctx_max_cols = 0;
  uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  for(i = 0; i < num_gfiles; i++)
  {
    graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols);

    if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                      gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size);
    }

    ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr));
    ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i]));
    ctx_sum_kmers += graph_file_nkmers(&gfiles[i]);
  }

  // Probe intersection graph files
  for(i = 0; i < num_igfiles; i++)
  {
    if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                  gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size);
    }

    uint64_t nkmers = graph_file_nkmers(&igfiles[i]);

    if(i == 0) min_intersect_num_kmers = nkmers;
    else if(nkmers < min_intersect_num_kmers)
    {
      // Put smallest intersection binary first
      SWAP(igfiles[i], igfiles[0]);
      min_intersect_num_kmers = nkmers;
    }
  }

  bool take_intersect = (num_igfiles > 0);

  // If we are taking an intersection,
  // all kmers intersection kmers will need to be loaded
  if(take_intersect)
    ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers;

  bool use_ncols_set = (use_ncols > 0);
  bool output_to_stdout = (strcmp(out_path,"-") == 0);

  // if(use_ncols == 0) use_ncols = 1;
  if(use_ncols_set) {
    if(use_ncols < ctx_max_cols && output_to_stdout)
      die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols);
    if(use_ncols > ctx_max_cols) {
      warn("I only need %zu colour%s ('--ncols %zu' ignored)",
           ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols);
      use_ncols = ctx_max_cols;
    }
  }
  else {
    use_ncols = output_to_stdout ? ctx_max_cols : 1;
  }

  // Check out_path is writable
  futil_create_output(out_path);

  status("Output %zu cols; from %zu files; intersecting %zu graphs; ",
         ctx_max_cols, num_gfiles, num_igfiles);

  if(num_gfiles == 1 && num_igfiles == 0)
  {
    // Loading only one file with no intersection files
    // Don't need to store a graph in memory, can filter as stream
    // Don't actually store anything in the de Bruijn graph, but we need to
    // pass it, so mock one up
    dBGraph db_graph;
    db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size,
                   file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0);

    graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL);
    graph_file_close(&gfiles[0]);
    gfile_buf_dealloc(&isec_gfiles_buf);
    ctx_free(gfiles);

    db_graph_dealloc(&db_graph);

    return EXIT_SUCCESS;
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  if(!use_ncols_set)
  {
    // Maximise use_ncols
    size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer;

    use_ncols = MIN2(max_usencols, ctx_max_cols);
    bits_per_kmer = sizeof(BinaryKmer)*8 +
                    (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

    // Re-check memory used
    kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                          memargs.mem_to_use_set,
                                          memargs.num_kmers,
                                          memargs.num_kmers_set,
                                          bits_per_kmer,
                                          ctx_max_kmers, ctx_sum_kmers,
                                          true, &graph_mem);
  }

  status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols));

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Create db_graph
  dBGraph db_graph;
  Edges *intersect_edges = NULL;
  size_t edge_cols = (use_ncols + take_intersect);

  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // We allocate edges ourself since it's a special case
  db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges));

  // Load intersection binaries
  char *intsct_gname_ptr = NULL;
  StrBuf intersect_gname;
  strbuf_alloc(&intersect_gname, 1024);

  if(take_intersect)
  {
    GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
    gprefs.boolean_covgs = true; // covg++ only

    for(i = 0; i < num_igfiles; i++)
    {
      graph_load(&igfiles[i], gprefs, NULL);

      // Update intersect header
      // note: intersection graphs all load exactly one colour into colour 0
      graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname);

      gprefs.must_exist_in_graph = true;
      gprefs.must_exist_in_edges = db_graph.col_edges;
    }

    if(num_igfiles > 1)
    {
      // Remove nodes where covg != num_igfiles
      HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes,
                        db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht);
    }

    status("Loaded intersection set\n");
    intsct_gname_ptr = intersect_gname.b;

    for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]);

    // Reset graph info
    for(i = 0; i < db_graph.num_of_cols; i++)
      graph_info_init(&db_graph.ginfo[i]);

    // Zero covgs
    memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg));

    // Use union edges we loaded to intersect new edges
    intersect_edges = db_graph.col_edges;
    db_graph.col_edges += db_graph.ht.capacity;
  }

  bool kmers_loaded = take_intersect, colours_loaded = false;

  graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles,
                          kmers_loaded, colours_loaded, intersect_edges,
                          intsct_gname_ptr, &db_graph);

  if(take_intersect)
    db_graph.col_edges -= db_graph.ht.capacity;

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);

  strbuf_dealloc(&intersect_gname);
  gfile_buf_dealloc(&isec_gfiles_buf);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Пример #4
0
static void parse_args(int argc, char **argv)
{
  BuildGraphTask task;
  memset(&task, 0, sizeof(task));
  task.prefs = SEQ_LOADING_PREFS_INIT;
  task.stats = SEQ_LOADING_STATS_INIT;
  uint8_t fq_offset = 0;
  int intocolour = -1;
  GraphFileReader tmp_gfile;

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;
  bool sample_named = false, pref_unused = false;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_kmer_size(cmd, optarg); break;
      case 's':
        intocolour++;
        check_sample_name(optarg);
        sample_name_buf_add(&snamebuf, (SampleName){.colour = intocolour,
                                                    .name = optarg});
        sample_named = true;
        break;
      case '1':
      case '2':
      case 'i':
        pref_unused = false;
        if(!sample_named)
          cmd_print_usage("Please give sample name first [-s,--sample <name>]");
        asyncio_task_parse(&task.files, c, optarg, fq_offset, NULL);
        task.prefs.colour = intocolour;
        add_task(&task);
        break;
      case 'M':
             if(!strcmp(optarg,"FF")) task.prefs.matedir = READPAIR_FF;
        else if(!strcmp(optarg,"FR")) task.prefs.matedir = READPAIR_FR;
        else if(!strcmp(optarg,"RF")) task.prefs.matedir = READPAIR_RF;
        else if(!strcmp(optarg,"RR")) task.prefs.matedir = READPAIR_RR;
        else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR");
        pref_unused = true; break;
      case 'O': fq_offset = cmd_uint8(cmd, optarg); pref_unused = true; break;
      case 'Q': task.prefs.fq_cutoff = cmd_uint8(cmd, optarg); pref_unused = true; break;
      case 'H': task.prefs.hp_cutoff = cmd_uint8(cmd, optarg); pref_unused = true; break;
      case 'p': task.prefs.remove_pcr_dups = true; pref_unused = true; break;
      case 'P': task.prefs.remove_pcr_dups = false; pref_unused = true; break;
      case 'g':
        if(intocolour == -1) intocolour = 0;
        graph_file_reset(&tmp_gfile);
        graph_file_open2(&tmp_gfile, optarg, "r", true, intocolour);
        intocolour = MAX2((size_t)intocolour, file_filter_into_ncols(&tmp_gfile.fltr)-1);
        gfile_buf_push(&gfilebuf, &tmp_gfile, 1);
        sample_named = false;
        break;
      case 'I':
        graph_file_reset(&tmp_gfile);
        graph_file_open(&tmp_gfile, optarg);
        if(file_filter_into_ncols(&tmp_gfile.fltr) > 1)
          warn("Flattening intersection graph into colour 0: %s", optarg);
        file_filter_flatten(&tmp_gfile.fltr, 0);
        gfile_buf_push(&gisecbuf, &tmp_gfile, 1);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" build -h` for help. Bad option: %s", argv[optind-1]);
      default: die("Bad option: %s", cmd);
    }
  }
Пример #5
0
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Open input file
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
      die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
      die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);
    // status("kmer: %s", kmerbuf.b);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    if(save && num_links)
    {
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    char *json_str = cJSON_Print(newhdr);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    gzputs(link_gz, "\n\n");
    gzputs(link_gz, ctp_explanation_comment);
    gzputs(link_gz, "\n");

    fseek(link_tmp_fh, 0, SEEK_SET);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    ctx_free(tmp);

    gzclose(link_gz);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  if(hist_fh && hist_fh != stdout) fclose(hist_fh);

  if(list)
  {
    fclose(list_fh);
  }

  if(plot)
  {
    fclose(plot_fh);
  }

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}
Пример #6
0
void read_thread_args_parse(struct ReadThreadCmdArgs *args,
                            int argc, char **argv,
                            const struct option *longopts, bool correct_cmd)
{
  size_t i;
  CorrectAlnInput task = CORRECT_ALN_INPUT_INIT;
  uint8_t fq_offset = 0;
  GPathReader tmp_gpfile;

  CorrectAlnInputBuffer *inputs = &args->inputs;
  args->memargs = (struct MemArgs)MEM_ARGS_INIT;
  args->fmt = SEQ_FMT_FASTQ;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int used = 1, c;
  char *tmp_path;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!args->out_ctp_path,cmd); args->out_ctp_path = optarg; break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&args->gpfiles, &tmp_gpfile, 1);
        break;
      case 't':
        cmd_check(!args->nthreads, cmd);
        args->nthreads = cmd_uint32_nonzero(cmd, optarg);
        break;
      case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break;
      case 'c': args->colour = cmd_uint32(cmd, optarg); break;
      case 'F':
        cmd_check(args->fmt == SEQ_FMT_FASTQ, cmd);
        args->fmt = cmd_parse_format(cmd, optarg);
        break;
      case '1':
      case '2':
      case 'i':
        used = 1;
        correct_aln_input_buf_push(inputs, &task, 1);
        asyncio_task_parse(&inputs->b[inputs->len-1].files, c, optarg,
                           fq_offset, correct_cmd ? &tmp_path : NULL);
        if(correct_cmd) inputs->b[inputs->len-1].out_base = tmp_path;
        break;
      case 'M':
             if(!strcmp(optarg,"FF")) task.matedir = READPAIR_FF;
        else if(!strcmp(optarg,"FR")) task.matedir = READPAIR_FR;
        else if(!strcmp(optarg,"RF")) task.matedir = READPAIR_RF;
        else if(!strcmp(optarg,"RR")) task.matedir = READPAIR_RR;
        else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR");
        used = 0; break;
      case 'O': fq_offset = cmd_uint8(cmd, optarg); used = 0; break;
      case 'Q': task.fq_cutoff = cmd_uint8(cmd, optarg); used = 0; break;
      case 'H': task.hp_cutoff = cmd_uint8(cmd, optarg); used = 0; break;
      case 'l': task.crt_params.frag_len_min = cmd_uint32(cmd, optarg); used = 0; break;
      case 'L': task.crt_params.frag_len_max = cmd_uint32(cmd, optarg); used = 0; break;
      case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break;
      case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break;
      case 'd': task.crt_params.gap_wiggle = cmd_udouble(cmd, optarg); used = 0; break;
      case 'D': task.crt_params.gap_variance = cmd_udouble(cmd, optarg); used = 0; break;
      case 'X': task.crt_params.max_context = cmd_uint32(cmd, optarg); used = 0; break;
      case 'e': task.crt_params.use_end_check = true; used = 0; break;
      case 'E': task.crt_params.use_end_check = false; used = 0; break;
      case 'g': cmd_check(!args->dump_seq_sizes, cmd); args->dump_seq_sizes = optarg; break;
      case 'G': cmd_check(!args->dump_frag_sizes, cmd); args->dump_frag_sizes = optarg; break;
      case 'u': args->use_new_paths = true; break;
      case 'x': gen_paths_print_contigs = true; break;
      case 'y': gen_paths_print_paths = true; break;
      case 'z': gen_paths_print_reads = true; break;
      case 'Z':
        cmd_check(!args->fq_zero, cmd);
        if(strlen(optarg) != 1)
          cmd_print_usage("--fq-zero <c> requires a single char");
        args->fq_zero = optarg[0];
        break;
      case 'P': cmd_check(!args->append_orig_seq,cmd); args->append_orig_seq = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" thread/correct -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(args->nthreads == 0) args->nthreads = DEFAULT_NTHREADS;

  // Check that optind+1 == argc
  if(optind+1 > argc)
    cmd_print_usage("Expected exactly one graph file");
  else if(optind+1 < argc)
    cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]);

  char *graph_path = argv[optind];
  status("Reading graph: %s", graph_path);

  if(!used) cmd_print_usage("Ignored arguments after last --seq");

  // ctx_thread requires output file
  if(!correct_cmd && !args->out_ctp_path)
    cmd_print_usage("--out <out.ctp> is required");

  //
  // Open graph graph file
  //
  GraphFileReader *gfile = &args->gfile;
  graph_file_open(gfile, graph_path);

  if(!correct_cmd && file_filter_into_ncols(&gfile->fltr) > 1)
    die("Please specify a single colour e.g. %s:0", file_filter_path(&gfile->fltr));

  //
  // Open path files
  //
  size_t path_max_usedcols = 0;
  for(i = 0; i < args->gpfiles.len; i++) {
    // file_filter_update_intocol(&args->pfiles.b[i].fltr, 0);
    if(!correct_cmd && file_filter_into_ncols(&args->gpfiles.b[i].fltr) > 1) {
      die("Please specify a single colour e.g. %s:0",
          file_filter_path(&args->gpfiles.b[i].fltr));
    }
    path_max_usedcols = MAX2(path_max_usedcols,
                             file_filter_into_ncols(&args->gpfiles.b[i].fltr));
  }
  args->path_max_usedcols = path_max_usedcols;

  // Check for compatibility between graph files and path files
  graphs_gpaths_compatible(gfile, 1, args->gpfiles.b, args->gpfiles.len, -1);

  // if no paths loaded, set all max_context values to 1, since >1 kmer only
  // useful if can pickup paths
  if(args->gpfiles.len == 0) {
    for(i = 0; i < inputs->len; i++)
      inputs->b[i].crt_params.max_context = 1;
  }

  // Check frag_len_min < frag_len_max
  for(i = 0; i < inputs->len; i++)
  {
    CorrectAlnInput *t = &inputs->b[i];
    t->files.ptr = t;
    if(t->crt_params.frag_len_min > t->crt_params.frag_len_max) {
      die("--min-ins %u is greater than --max-ins %u",
          t->crt_params.frag_len_min, t->crt_params.frag_len_max);
    }
    correct_aln_input_print(&inputs->b[i]);
    args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.frag_len_max);
  }

  futil_create_output(args->dump_seq_sizes);
  futil_create_output(args->dump_frag_sizes);
}
Пример #7
0
int ctx_view(int argc, char **argv)
{
  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // TODO:
  // print_action actions[argc];
  // bool read_kmers = false;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        cmd_print_usage("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default:
        cmd_print_usage("Programmer fail. Tell Isaac.");
    }
  }

  if(print_kmers) parse_kmers = 1;

  bool no_flags = (!print_info && !parse_kmers && !print_kmers);
  if(no_flags) { print_info = parse_kmers = 1; }

  if(optind+1 != argc) cmd_print_usage("Require one input graph file (.ctx)");

  char *path = argv[optind];
  size_t num_errors = 0, num_warnings = 0;

  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(gfile));
  int ret = graph_file_open(&gfile, path);
  if(ret == 0) die("Cannot open file: %s", path);

  if(print_info)
  {
    char fsize_str[50];
    bytes_to_str((size_t)gfile.file_size, 0, fsize_str);
    printf("Loading file: %s\n", file_filter_path(&gfile.fltr));
    printf("File size: %s\n", fsize_str);
    printf("----\n");
  }

  size_t i, col, ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_size = gfile.hdr.kmer_size;
  ctx_assert(ncols > 0);

  GraphFileHeader hdr;
  memset(&hdr, 0, sizeof(hdr));
  graph_file_merge_header(&hdr, &gfile);

  uint64_t nkmers_read = 0, nkmers_loaded = 0;
  uint64_t num_all_zero_kmers = 0, num_zero_covg_kmers = 0;
  uint64_t *col_nkmers, *col_sum_covgs;
  col_nkmers = ctx_calloc(ncols, sizeof(col_nkmers[0]));
  col_sum_covgs = ctx_calloc(ncols, sizeof(col_sum_covgs[0]));

  // Print header
  if(print_info) print_header(&hdr, gfile.num_of_kmers);

  BinaryKmer bkmer;
  Covg covgs[ncols], keep_kmer;
  Edges edges[ncols];

  bool direct_read = file_filter_is_direct(&gfile.fltr);

  if(parse_kmers || print_kmers)
  {
    if(print_info && print_kmers) printf("----\n");

    for(; graph_file_read_reset(&gfile, &bkmer, covgs, edges); nkmers_read++)
    {
      // If kmer has no covg in any samples -> don't load
      keep_kmer = 0;
      for(col = 0; col < ncols; col++) {
        col_nkmers[col] += (covgs[col] > 0);
        col_sum_covgs[col] += covgs[col];
        keep_kmer |= covgs[col];
      }

      if(!direct_read && !keep_kmer) continue;
      nkmers_loaded++;

      /* Kmer Checks */
      // graph_file_read_reset() already checks for:
      // 1. oversized kmers
      // 2. kmers with covg 0 in all colours
      // 3. edges without coverage in a colour

      // Check for all-zeros (i.e. all As kmer: AAAAAA)
      uint64_t kmer_words_or = 0;

      for(i = 0; i < hdr.num_of_bitfields; i++)
        kmer_words_or |= bkmer.b[i];

      if(kmer_words_or == 0)
      {
        if(num_all_zero_kmers == 1)
        {
          loading_error("more than one all 'A's kmers seen [index: %"PRIu64"]\n",
                        nkmers_read);
        }

        num_all_zero_kmers++;
      }

      // Check covg is 0 for all colours
      for(i = 0; i < ncols && covgs[i] == 0; i++);
      num_zero_covg_kmers += (i == ncols);

      // Print
      if(print_kmers)
        db_graph_print_kmer2(bkmer, covgs, edges, ncols, kmer_size, stdout);
    }
  }

  // check for various reading errors
  // if(errno != 0)
  //   loading_error("errno set [%i]: %s\n", (int)errno, strerror(errno));

  int err = ferror(gfile.fh);
  if(err != 0)
    loading_error("occurred after file reading [%i]\n", err);

  char nstr[50];

  if(print_kmers || parse_kmers)
  {
    // file_size is set to -1 if we are reading from a stream,
    // therefore won't be able to check number of kmers read
    if(gfile.file_size != -1 && nkmers_read != (uint64_t)gfile.num_of_kmers) {
      loading_warning("Expected %zu kmers, read %zu\n",
                      (size_t)gfile.num_of_kmers, (size_t)nkmers_read);
    }

    if(num_all_zero_kmers > 1)
    {
      loading_error("%s all-zero-kmers seen\n",
                    ulong_to_str(num_all_zero_kmers, nstr));
    }

    if(num_zero_covg_kmers > 0)
    {
      loading_warning("%s kmers have no coverage in any colour\n",
                      ulong_to_str(num_zero_covg_kmers, nstr));
    }
  }

  // Count warnings printed by graph_file_reader.c
  num_warnings += gfile.error_zero_covg;
  num_warnings += gfile.error_missing_covg;

  // Can only print these stats if we're read in the kmers
  if((print_kmers || parse_kmers) && print_info)
  {
    // print kmer coverage per sample
    printf("\n---- Per colour stats\n");
    printf("num. kmers:");
    for(col = 0; col < ncols; col++)
      printf("\t%s", ulong_to_str(col_nkmers[col], nstr));
    printf("\n");
    printf("sum coverage:");
    for(col = 0; col < ncols; col++)
      printf("\t%s", ulong_to_str(col_sum_covgs[col], nstr));
    printf("\n");
    printf("kmer coverage:");
    for(col = 0; col < ncols; col++)
      printf("\t%.2f", safe_frac(col_sum_covgs[col], col_nkmers[col]));
    printf("\n");

    // Overall stats
    uint64_t sum_covgs = 0;
    double mean_kmer_covg = 0.0;
    for(col = 0; col < ncols; col++) sum_covgs += col_sum_covgs[col];
    mean_kmer_covg = nkmers_loaded ? (double)sum_covgs / nkmers_loaded : 0.0;

    printf("\n---- Overall stats\n");
    printf("Total kmers:    %s\n", ulong_to_str(nkmers_loaded, nstr));
    printf("Total coverage: %s\n", ulong_to_str(sum_covgs, nstr));
    printf("Mean coverage:  %s\n", double_to_str(mean_kmer_covg, 2, nstr));
  }

  if(print_info)
  {
    // Print memory stats
    uint64_t mem, capacity, num_buckets, req_capacity;
    uint8_t bucket_size;

    req_capacity = (size_t)(gfile.num_of_kmers / IDEAL_OCCUPANCY);
    capacity = hash_table_cap(req_capacity, &num_buckets, &bucket_size);
    mem = ht_mem(bucket_size, num_buckets,
                 sizeof(BinaryKmer)*8 + ncols*(sizeof(Covg)+sizeof(Edges))*8);

    char memstr[100], capacitystr[100], bucket_size_str[100], num_buckets_str[100];
    bytes_to_str(mem, 1, memstr);
    ulong_to_str(capacity, capacitystr);
    ulong_to_str(bucket_size, bucket_size_str);
    ulong_to_str(num_buckets, num_buckets_str);

    size_t mem_height = (size_t)__builtin_ctzl(num_buckets);

    printf("\n---- Memory\n");
    printf("memory required: %s [capacity: %s]\n", memstr, capacitystr);
    printf("  bucket size: %s; number of buckets: %s\n",
            bucket_size_str, num_buckets_str);
    printf("  --kmer_size %zu --mem_height %zu --mem_width %i\n",
           kmer_size, mem_height, bucket_size);
  }

  if((print_kmers || parse_kmers) && print_info)
  {
    printf("\n----\n");
    if(num_warnings > 0 || num_errors > 0) {
      printf("Warnings: %zu; Errors: %zu\n",
              (size_t)num_warnings, (size_t)num_errors);
    }
    if(num_errors == 0)
      printf(num_warnings ? "Graph may be ok\n" : "Graph is valid\n");
  }

  ctx_free(col_nkmers);
  ctx_free(col_sum_covgs);

  // Close file (which zeros it)
  graph_file_close(&gfile);
  graph_header_dealloc(&hdr);

  return num_errors ? EXIT_FAILURE : EXIT_SUCCESS;
}
Пример #8
0
int ctx_exp_abc(int argc, char **argv)
{
  size_t i, nthreads = 0, num_repeats = 0, max_AB_dist = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  bool print_failed_contigs = false;

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case 'N': cmd_check(!num_repeats,cmd); num_repeats = cmd_uint32_nonzero(cmd, optarg); break;
      case 'M': cmd_check(!max_AB_dist,cmd); max_AB_dist = cmd_uint32_nonzero(cmd, optarg); break;
      case 'P': cmd_check(!print_failed_contigs,cmd); print_failed_contigs = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" exp_abc -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;
  if(num_repeats == 0) num_repeats = DEFAULT_NUM_REPEATS;
  if(max_AB_dist == 0) max_AB_dist = DEFAULT_MAX_AB_DIST;

  if(print_failed_contigs && nthreads != 1) {
    warn("--print forces nthreads to be one. soz.");
    nthreads = 1;
  }

  if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open(&gfile, ctx_path);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);

  // Check only loading one colour
  if(ncols > 1) die("Only implemented for one colour currently");

  // Check graph + paths are compatible
  graphs_gpaths_compatible(&gfile, 1, gpfiles.b, gpfiles.len, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of kmer usage
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  ncols;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        gfile.num_of_kmers, gfile.num_of_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  //
  // Allocate memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile.hdr.kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph);

  // Load the graph
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  graph_load(&gfile, gprefs, NULL);
  graph_file_close(&gfile);

  hash_table_print_stats(&db_graph.ht);

  // Load link files
  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles.b[i]);
  }
  gpfile_buf_dealloc(&gpfiles);

  status("\n");
  status("Test 1: Priming region A->B (n: %zu max_AB_dist: %zu)",
         num_repeats, max_AB_dist);

  run_exp_abc(&db_graph, true, nthreads, num_repeats,
              max_AB_dist, print_failed_contigs);

  status("\n");
  status("Test 2: Trying to traverse A->B (n: %zu max_AB_dist: %zu)",
         num_repeats, max_AB_dist);

  run_exp_abc(&db_graph, false, nthreads, num_repeats,
              max_AB_dist, print_failed_contigs);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}