Esempio n. 1
0
// Creates <base>.tmp.<rand>
static FILE* create_tmp_file(StrBuf *path, const char *base)
{
  size_t i;
  const size_t attempt_limit = 100;
  FILE *fh;

  for(i = 0; i < attempt_limit; i++) {
    size_t r = rand() % 9999;
    strbuf_reset(path);
    strbuf_sprintf(path, "%s.tmp.%04zu", base, r);
    if(!futil_file_exists(path->b)) break;
  }
  if(i == attempt_limit)
    die("Temporary files already exist (%zu tries): %s", attempt_limit, path->b);

  if((fh = futil_fopen_create(path->b, "r+")) == NULL) {
    die("Cannot write temporary file: %s [%s]", path->b, strerror(errno));
  }

  unlink(path->b); // Immediately unlink to hide temp file
  return fh;
}
Esempio n. 2
0
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  bool tip_cleaning = false, supernode_cleaning = false;
  size_t min_keep_tip = 0;
  Covg threshold = 0, fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(!tip_cleaning, cmd);
        min_keep_tip = cmd_uint32_nonzero(cmd, optarg);
        tip_cleaning = true;
        break;
      case 'S':
        cmd_check(!supernode_cleaning, cmd);
        if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg);
        supernode_cleaning = true;
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  // Default behaviour
  if(!tip_cleaning && !supernode_cleaning) {
    if(out_ctx_path != NULL)
      supernode_cleaning = tip_cleaning = true; // do both
    else
      warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  bool doing_cleaning = (supernode_cleaning || tip_cleaning);

  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
  }

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    cmd_print_usage("You gave --len-after <out> / --covg-after <out> without "
                    "any cleaning (set -s, --supernodes or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !supernode_cleaning)
    cmd_print_usage("-B, --fallback <T> without --supernodes");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(!doing_cleaning)
  {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(tip_cleaning && min_keep_tip == 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol, intocol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && supernode_cleaning) {
        warn("%s:%zu already has supernode cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_before_path);
  if(tip_cleaning)
    status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip);
  if(supernode_cleaning && threshold > 0)
    status("%zu. Cleaning supernodes with coverage < %u", step++, threshold);
  if(supernode_cleaning && threshold <= 0)
    status("%zu. Cleaning supernodes with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8;
  size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) /
                        (per_kmer_per_col_bits * kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path);

  // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // Edges is a special case
  size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded);
  db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges));

  // Load graph into a single colour
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  outhdr.version = CTX_GRAPH_FILEFORMAT;
  outhdr.kmer_size = db_graph.kmer_size;
  outhdr.num_of_cols = ncols;
  outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64;
  graph_header_alloc(&outhdr, ncols);

  // Merge info into header
  size_t gcol = 0;
  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      intocol = file_filter_intocol(&gfiles[i].fltr, j);
      graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]);
    }
  }

  if(ncols > use_ncols) {
    graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats);
  } else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, &stats);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path)
  {
    // Get coverage distribution and estimate cleaning threshold
    int est_threshold = cleaning_get_threshold(nthreads,
                                               covg_before_path,
                                               len_before_path,
                                               visited, &db_graph);

    if(est_threshold < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_threshold);

    // Use estimated threshold if threshold not set
    if(threshold <= 0) {
      if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        threshold = fallback_thresh;
      }
      else if(est_threshold >= 0) threshold = est_threshold;
    }
  }

  // Die if we failed to find suitable cleaning threshold
  if(supernode_cleaning && threshold <= 0)
    die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)");

  if(doing_cleaning) {
    // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0)
    clean_graph(nthreads, threshold, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(doing_cleaning)
  {
    // Output graph file
    Edges *intersect_edges = NULL;
    bool kmers_loaded = true;
    size_t col, thresh;

    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
    {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= supernode_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(supernode_cleaning) {
        thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold)
                                          : (uint32_t)threshold;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    if(!all_colours_loaded)
    {
      // We haven't loaded all the colours
      // intersect_edges are edges to mask with
      // resets graph edges
      intersect_edges = db_graph.col_edges;
      db_graph.col_edges += db_graph.ht.capacity;
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    graph_files_merge(out_ctx_path, gfiles, num_gfiles,
                      kmers_loaded, all_colours_loaded,
                      intersect_edges, &outhdr, &db_graph);

    // Swap back
    if(!all_colours_loaded)
      db_graph.col_edges = intersect_edges;
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Esempio n. 3
0
static void dgen_parse_cmdline(int argc, char **argv)
{
  // DEV: could make it optional to write the HLA types at the internal nodes.
  // DEV: talk to Gil about generating his approximation to the coalescent 
  // with recombination - different trees as you track along the viral sequence.
  static struct option longopts[] =
  {
    {"prop_past_sampling", required_argument, NULL, 's'},
    {"consensus_fasta",    required_argument, NULL, 'c'},
    {"root_fasta",         required_argument, NULL, 'r'},
    {"write_newick_tree",  no_argument,       NULL, 'n'},
    {"write_tree_matrix",  no_argument,       NULL, 'g'},
    {"num_queries",        required_argument, NULL, 'q'},
    {"prop_query",         required_argument, NULL, 'f'},
    {NULL, 0, NULL, 0}
  };

  char shortopts[] = "s:j:c:r:ngq:f:";
  
  int optional;
  while((optional = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    switch (optional) {
      case 's': past_sampling = atof(optarg); break;
      case 'c': cons_path = optarg; break;
      case 'r': root_path = optarg; break;
      case 'n': write_newick_tree_to_file = true; break;
      case 'g': write_tree_to_file = true; break;
      case 'q': num_queries = atoi(optarg); break;
      case 'f': query_fraction = atof(optarg); break;
      default: die("Unknown option: %c", optional);
    }
  }

  if(argc - optind != 6) print_usage();
  
  json_parameters_path = argv[optind];
  printf("%s\n", json_parameters_path);
  N = atoi(argv[optind+1]);
  M = atoi(argv[optind+2]);
  
  if(M > N)
    die("Sample size larger than the infected population size: [M=%i, N=%i]", M, N);

  if(past_sampling < 0) past_sampling = (Decimal) M/N;
  if(past_sampling > 1) 
    die("Past sampling proportion is greater than 1: [M=%i, N=%i]", M, N);

  lambda = atof(argv[optind+3]);
  mu_tree = atof(argv[optind+4]);

  out_dir = argv[optind+5];

  if(!mkpath(out_dir, 0755)) die("Cannot create output dir: %s", out_dir);
  
  time_t rawtime;
  struct tm * timeinfo;

  time(&rawtime);
  timeinfo = localtime(&rawtime);
  char date[100];
  sprintf(date, "%d_%d_%d_%d_%d_%d", timeinfo->tm_year+1900, 
          timeinfo->tm_mon+1, timeinfo->tm_mday, timeinfo->tm_hour,
          timeinfo->tm_min, timeinfo->tm_sec);

  char output_data_path[PATH_MAX+1];
  char newick_data_path[PATH_MAX+1];
  char summary_path[PATH_MAX+1];
  char json_summary_path[PATH_MAX+1];
  char simulated_refs_path[PATH_MAX+1];
  char simulated_root_path[PATH_MAX+1];
  char simulated_queries_path[PATH_MAX+1];
  char hla_query_path[PATH_MAX+1];

  sprintf(output_data_path, "%s/%s_tree.txt", out_dir, date);
  sprintf(newick_data_path, "%s/%s_newick.tre", out_dir, date);
  sprintf(summary_path, "%s/%s_run_summary.txt", out_dir, date);
  sprintf(json_summary_path, "%s/%s_summary.json", out_dir, date);
  sprintf(simulated_refs_path, "%s/%s_simulated_birth_death_refs.fasta", out_dir, date);
  sprintf(simulated_root_path, "%s/%s_simulated_birth_death_root.fasta", out_dir, date);
  sprintf(simulated_queries_path, "%s/%s_simulated_birth_death_queries.fasta", out_dir, date);
  sprintf(hla_query_path, "%s/%s_simulated_hla_queries_birth_death.csv", out_dir, date);
  
  while(futil_file_exists(output_data_path))
  {
    printf("This filename already exists, wait for a second to change the folder name.\n");
    sleep(1);
    time(&rawtime);
    timeinfo = localtime(&rawtime);
    sprintf(date, "%d_%d_%d_%d_%d_%d\n", timeinfo->tm_year+1900, 
            timeinfo->tm_mon+1, timeinfo->tm_mday, timeinfo->tm_hour,
            timeinfo->tm_min, timeinfo->tm_sec);

    sprintf(output_data_path, "%s/%s_tree.txt", out_dir, date);
    sprintf(newick_data_path, "%s/%s_newick.tre", out_dir, date);
    sprintf(summary_path, "%s/%s_summary.txt", out_dir, date);
    sprintf(json_summary_path, "%s/%s_summary_dgen.json", out_dir, date);
    sprintf(simulated_refs_path, "%s/%s_simulated_birth_death_refs.fasta", out_dir, date);
    sprintf(simulated_root_path, "%s/%s_simulated_birth_death_root.fasta", out_dir, date);
    sprintf(simulated_queries_path, "%s/%s_simulated_birth_death_queries.fasta", out_dir, date);
    sprintf(hla_query_path, "%s/%s_simulated_hla_queries_birth_death.csv", out_dir, date);  
  }
  
  if(write_tree_to_file == true) printf("Writing tree information to: %s\n", output_data_path);
  if(write_newick_tree_to_file == true) printf("Writing Newick tree information to: %s\n", newick_data_path);
  printf("Writing summary information to: %s\n", summary_path);
  printf("Writing passed .json information to: %s\n", summary_path);
  printf("Writing simulated reference sequences to: %s\n", simulated_refs_path);
  printf("Writing simulated root sequence to: %s\n", simulated_root_path);
  printf("Writing simulated query sequences to: %s\n", simulated_queries_path);
  printf("Writing simulated HLA query data to: %s\n", hla_query_path);
  
  if(write_tree_to_file == true) tree_data_file = fopen(output_data_path, "w");
  if(write_newick_tree_to_file == true) newick_tree_data_file = fopen(newick_data_path, "w");
  simulated_refs_file = fopen(simulated_refs_path, "w");
  simulated_root_file = fopen(simulated_root_path, "w");
  simulated_queries_file = fopen(simulated_queries_path, "w");
  summary_file = fopen(summary_path, "w");
  json_summary_file = fopen(json_summary_path, "w");
  hla_query_file = fopen(hla_query_path, "w");

  if(tree_data_file == NULL && write_tree_to_file == true)
    die("Cannot open output file: %s", output_data_path);
  if(newick_tree_data_file == NULL && write_newick_tree_to_file == true)
    die("Cannot open output file: %s", newick_data_path);
  if(summary_file == NULL) die("Cannot open output file: %s", summary_path);
  if(json_summary_file == NULL) die("Cannot open output file: %s", json_summary_path);
  if(hla_query_file == NULL) die("Cannot open output file: %s", hla_query_path);  
  if(simulated_refs_file == NULL) die("Cannot open output file: %s", simulated_refs_path);
  if(simulated_root_file == NULL) die("Cannot open output file: %s", simulated_root_path);
  if(simulated_queries_file == NULL) die("Cannot open output file: %s", simulated_queries_path);

  // Add this information to the summary file.
  fprintf(summary_file, "Total lineages at present: %i\n"
                        "Sampled lineages at present: %i\n"
                        "Historical sampling proportion: "DECPRINT"\n"
                        "Birth rate in the genealogy (lambda): "DECPRINT"\n"
                        "Death rate in the genealogy (mu_tree): "DECPRINT"\n", 
          N, M, past_sampling, lambda, mu_tree);
}
Esempio n. 4
0
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean
  uint32_t fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(min_keep_tip<0, cmd);
        min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1);
        break;
      case 'S':
      case 'U':
        cmd_check(unitig_min<0, cmd);
        unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1);
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  bool unitig_cleaning = (unitig_min != 0);
  bool tip_cleaning = (min_keep_tip != 0);
  bool doing_cleaning = (unitig_cleaning || tip_cleaning);

  // If you ever want to estimate cleaning threshold without outputting
  // a graph, change this to a warning
  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
    // warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    warn("You gave --len-after <out> / --covg-after <out> without "
         "any cleaning (set -U, --unitigs or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !unitig_cleaning)
    warn("-B, --fallback <T> without --unitigs");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(out_ctx_path == NULL)
  {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(min_keep_tip < 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && unitig_cleaning) {
        warn("%s:%zu already has unitig cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_before_path);
  if(min_keep_tip > 0)
    status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip);
  if(unitig_min > 0)
    status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min);
  if(unitig_min < 0)
    status("%zu. Cleaning unitigs with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8;
  size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  per_col_bits * use_ncols +
                  extra_edge_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 -
                         sizeof(BinaryKmer)*8*kmers_in_hash +
                         extra_edge_bits*kmers_in_hash) /
                        (per_col_bits*kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path);

  // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  // Extra edges required to hold union of kept edges
  Edges *edges_union = NULL;
  if(use_ncols < ncols)
    edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges));

  // Load graph into a single colour
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  for(i = 0; i < num_gfiles; i++)
    graph_file_merge_header(&outhdr, &gfiles[i]);

  if(ncols > use_ncols)
  {
    db_graph.num_of_cols = db_graph.num_edge_cols = 1;
    SWAP(edges_union, db_graph.col_edges);
    graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL);
    SWAP(edges_union, db_graph.col_edges);
    db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols;
  }
  else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, NULL);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  // Always estimate cleaning threshold
  // if(unitig_min <= 0 || covg_before_path || len_before_path)
  // {
    // Get coverage distribution and estimate cleaning threshold
    int est_min_covg = cleaning_get_threshold(nthreads,
                                              covg_before_path,
                                              len_before_path,
                                              visited, &db_graph);

    if(est_min_covg < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_min_covg);

    // Use estimated threshold if threshold not set
    if(unitig_min < 0) {
      if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        unitig_min = fallback_thresh;
      }
      else if(est_min_covg >= 0) unitig_min = est_min_covg;
    }
  // }

  // Die if we failed to find suitable cleaning threshold
  if(unitig_min < 0)
    die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)");

  // Cleaning parameters should now be set (>0) or turned off (==0)
  ctx_assert(unitig_min >= 0);
  ctx_assert(min_keep_tip >= 0);

  if(unitig_min || min_keep_tip)
  {
    // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0)
    clean_graph(nthreads, unitig_min, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(out_ctx_path != NULL)
  {
    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
    {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= unitig_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(unitig_cleaning) {
        size_t thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min)
                                          : (uint32_t)unitig_min;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    // kmers_loaded=true
    graph_writer_merge(out_ctx_path, gfiles, num_gfiles,
                      true, all_colours_loaded,
                      edges_union, &outhdr, &db_graph);
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  // TODO: report kmer coverage for each sample

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  ctx_free(edges_union);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Esempio n. 5
0
// Check file don't exist or that we can overwrite it. Ignores if path is NULL
bool futil_check_outfile(const char *file)
{
  bool err = (file && !futil_get_force() && futil_file_exists(file));
  if(err) warn("File already exists: %s", file);
  return err;
}