Example #1
0
// if one of the files is reading from stdin, sum_kmers_ptr is set to 0
// `max_cols_ptr` is used to return the most colours being loaded from a single file
// returns the number of colours being loaded in total
size_t graph_files_open(char **graph_paths,
                        GraphFileReader *gfiles, size_t num_gfiles,
                        size_t *max_kmers_ptr, size_t *sum_kmers_ptr)
{
  size_t i, ctx_max_kmers = 0, ctx_sum_kmers = 0;
  bool ctx_uses_stdin = false;
  size_t ncols = 0;

  for(i = 0; i < num_gfiles; i++)
  {
    memset(&gfiles[i], 0, sizeof(GraphFileReader));
    graph_file_open2(&gfiles[i], graph_paths[i], "r", ncols);

    if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                      gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size);
    }

    ncols = MAX2(ncols, file_filter_into_ncols(&gfiles[i].fltr));

    ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i]));
    ctx_sum_kmers += graph_file_nkmers(&gfiles[i]);
    ctx_uses_stdin |= file_filter_isstdin(&gfiles[i].fltr);
  }

  if(ctx_uses_stdin) ctx_sum_kmers = SIZE_MAX;

  *max_kmers_ptr = ctx_max_kmers;
  *sum_kmers_ptr = ctx_sum_kmers;

  return ncols;
}
Example #2
0
int ctx_join(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t use_ncols = 0;

  GraphFileReader tmp_gfile;
  GraphFileBuffer isec_gfiles_buf;
  gfile_buf_alloc(&isec_gfiles_buf, 8);

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 'i':
        graph_file_reset(&tmp_gfile);
        graph_file_open(&tmp_gfile, optarg);
        if(file_filter_into_ncols(&tmp_gfile.fltr) > 1)
          warn("Flattening intersection graph into colour 0: %s", optarg);
        file_filter_flatten(&tmp_gfile.fltr, 0);
        gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  GraphFileReader *igfiles = isec_gfiles_buf.b;
  size_t num_igfiles = isec_gfiles_buf.len;

  if(!out_path) cmd_print_usage("--out <out.ctx> required");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input graph file");

  // optind .. argend-1 are graphs to load
  size_t num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));

  status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles);

  // Check all binaries are valid binaries with matching kmer size
  size_t i;
  size_t ctx_max_cols = 0;
  uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  for(i = 0; i < num_gfiles; i++)
  {
    graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols);

    if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                      gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size);
    }

    ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr));
    ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i]));
    ctx_sum_kmers += graph_file_nkmers(&gfiles[i]);
  }

  // Probe intersection graph files
  for(i = 0; i < num_igfiles; i++)
  {
    if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                  gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size);
    }

    uint64_t nkmers = graph_file_nkmers(&igfiles[i]);

    if(i == 0) min_intersect_num_kmers = nkmers;
    else if(nkmers < min_intersect_num_kmers)
    {
      // Put smallest intersection binary first
      SWAP(igfiles[i], igfiles[0]);
      min_intersect_num_kmers = nkmers;
    }
  }

  bool take_intersect = (num_igfiles > 0);

  // If we are taking an intersection,
  // all kmers intersection kmers will need to be loaded
  if(take_intersect)
    ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers;

  bool use_ncols_set = (use_ncols > 0);
  bool output_to_stdout = (strcmp(out_path,"-") == 0);

  // if(use_ncols == 0) use_ncols = 1;
  if(use_ncols_set) {
    if(use_ncols < ctx_max_cols && output_to_stdout)
      die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols);
    if(use_ncols > ctx_max_cols) {
      warn("I only need %zu colour%s ('--ncols %zu' ignored)",
           ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols);
      use_ncols = ctx_max_cols;
    }
  }
  else {
    use_ncols = output_to_stdout ? ctx_max_cols : 1;
  }

  // Check out_path is writable
  futil_create_output(out_path);

  status("Output %zu cols; from %zu files; intersecting %zu graphs; ",
         ctx_max_cols, num_gfiles, num_igfiles);

  if(num_gfiles == 1 && num_igfiles == 0)
  {
    // Loading only one file with no intersection files
    // Don't need to store a graph in memory, can filter as stream
    // Don't actually store anything in the de Bruijn graph, but we need to
    // pass it, so mock one up
    dBGraph db_graph;
    db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size,
                   file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0);

    graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL);
    graph_file_close(&gfiles[0]);
    gfile_buf_dealloc(&isec_gfiles_buf);
    ctx_free(gfiles);

    db_graph_dealloc(&db_graph);

    return EXIT_SUCCESS;
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  if(!use_ncols_set)
  {
    // Maximise use_ncols
    size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer;

    use_ncols = MIN2(max_usencols, ctx_max_cols);
    bits_per_kmer = sizeof(BinaryKmer)*8 +
                    (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

    // Re-check memory used
    kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                          memargs.mem_to_use_set,
                                          memargs.num_kmers,
                                          memargs.num_kmers_set,
                                          bits_per_kmer,
                                          ctx_max_kmers, ctx_sum_kmers,
                                          true, &graph_mem);
  }

  status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols));

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Create db_graph
  dBGraph db_graph;
  Edges *intersect_edges = NULL;
  size_t edge_cols = (use_ncols + take_intersect);

  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // We allocate edges ourself since it's a special case
  db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges));

  // Load intersection binaries
  char *intsct_gname_ptr = NULL;
  StrBuf intersect_gname;
  strbuf_alloc(&intersect_gname, 1024);

  if(take_intersect)
  {
    GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
    gprefs.boolean_covgs = true; // covg++ only

    for(i = 0; i < num_igfiles; i++)
    {
      graph_load(&igfiles[i], gprefs, NULL);

      // Update intersect header
      // note: intersection graphs all load exactly one colour into colour 0
      graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname);

      gprefs.must_exist_in_graph = true;
      gprefs.must_exist_in_edges = db_graph.col_edges;
    }

    if(num_igfiles > 1)
    {
      // Remove nodes where covg != num_igfiles
      HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes,
                        db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht);
    }

    status("Loaded intersection set\n");
    intsct_gname_ptr = intersect_gname.b;

    for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]);

    // Reset graph info
    for(i = 0; i < db_graph.num_of_cols; i++)
      graph_info_init(&db_graph.ginfo[i]);

    // Zero covgs
    memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg));

    // Use union edges we loaded to intersect new edges
    intersect_edges = db_graph.col_edges;
    db_graph.col_edges += db_graph.ht.capacity;
  }

  bool kmers_loaded = take_intersect, colours_loaded = false;

  graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles,
                          kmers_loaded, colours_loaded, intersect_edges,
                          intsct_gname_ptr, &db_graph);

  if(take_intersect)
    db_graph.col_edges -= db_graph.ht.capacity;

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);

  strbuf_dealloc(&intersect_gname);
  gfile_buf_dealloc(&isec_gfiles_buf);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}