Beispiel #1
0
/**
 * Dumps the hash table. It stores the size of the structs so we can 
 * validate when reading that the memory do corresponds to the structure
 * that we are writing. 
 * 
 */
void hash_table_dump_memory(char * filename, HashTable * hash){
	
	FILE * fp = fopen(filename, "wb");
	char * magic = MAGIC_TEXT;
	int size_ht = sizeof(HashTable);
	int size_e = sizeof(Element);
	int magic_size = strlen(magic);
	short version = HASH_VERSION;

	
	long long number_buckets = hash->number_buckets;
	int bucket_size = hash->bucket_size; 
	long long hash_size=number_buckets * bucket_size;	
	
	//Header stuff, this is enough information to prepare the hash table.  
	fwrite(magic, sizeof(char), magic_size, fp);
	fwrite(&version, sizeof(short), 1, fp);
	fwrite(&size_ht, sizeof(int), 1,fp);
	fwrite(&size_e, sizeof(int), 1,fp);
	fwrite(&hash->kmer_size, sizeof(short), 1, fp);
	fwrite(&number_buckets, sizeof(long long), 1, fp);
	fwrite(&bucket_size, sizeof(int), 1, fp);
	fwrite(&hash->max_rehash_tries, sizeof(int), 1, fp);
	fwrite(&hash->unique_kmers, sizeof(long long), 1, fp);
	
	
	//The actual data of the hash table. We are storing everything 
	fwrite(hash->table, size_e, hash_size, fp);
	fwrite(hash->next_element, sizeof(int), number_buckets, fp);
	fwrite(hash->collisions, sizeof(long long), hash->max_rehash_tries, fp);
	log_and_screen_printf("Hash dumped to : %s \n" , filename);
	hash_table_print_stats(hash);
	fclose(fp);
	
}
Beispiel #2
0
int main(int argc, char **argv)
{
  (void)argc; (void)argv;
  cortex_init();
  cmd_init(argc, argv);

  if(argc != 3) die("usage: ./debug <in.ctp> <in.ctx>");

  const char *out_path = argv[2];

  GPathReader pfile;
  memset(&pfile, 0, sizeof(GPathReader));
  gpath_reader_open(&pfile, argv[1], true);
  status("Got file with %zu colours", pfile.ncolours);

  size_t i, kmer_size = 7, ncols = 3;

  gpath_reader_check(&pfile, kmer_size, ncols);
  gzFile gzout = futil_gzopen_create(out_path, "w");

  dBGraph db_graph;
  db_graph_alloc(&db_graph, kmer_size, ncols, 1, 1024, DBG_ALLOC_EDGES);

  // Create a path store that tracks path counts
  gpath_store_alloc(&db_graph.gpstore,
                    db_graph.num_of_cols, db_graph.ht.capacity,
                    ONE_MEGABYTE, true, false);

  // Create path hash table for fast lookup
  gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, ONE_MEGABYTE);

  // Set sample names
  for(i = 0; i < pfile.ncolours; i++) {
    const char *sample_name = gpath_reader_get_sample_name(&pfile, i);
    ctx_assert(sample_name != NULL);
    strbuf_set(&db_graph.ginfo[i].sample_name, sample_name);
  }

  // Load path files, add kmers that are missing
  gpath_reader_load(&pfile, GPATH_ADD_MISSING_KMERS, &db_graph);

  hash_table_print_stats(&db_graph.ht);

  // Write output file
  gpath_save(gzout, out_path, 1, true, NULL, NULL, &pfile.json, 1, &db_graph);
  gzclose(gzout);

  // Checks
  // gpath_checks_all_paths(&db_graph, 2); // use two threads
  gpath_checks_counts(&db_graph);

  // Clean up
  gpath_reader_close(&pfile);
  db_graph_dealloc(&db_graph);
  cortex_destroy();

  return EXIT_SUCCESS;
}
Beispiel #3
0
static KmerHash * load_kmer_table(KmerStatsCmdLine cmd_line){
    //TODO: Use a special format that contains the memory requirements. 
    
     
    log_and_screen_printf("\nHash table from file: %s\n", cmd_line.reference_kmers);
    KmerHash * kmer_hash = hash_table_new(cmd_line.number_of_buckets_bits,
                              cmd_line.bucket_size, 25, cmd_line.kmer_size);
    boolean all_entries_are_unique = true;
    
    log_and_screen_printf("\nReading kmers file: %s\n", cmd_line.reference_kmers);		
    fflush(stdout);
    load_binary_from_filename_into_kmers_hash(cmd_line.reference_kmers, kmer_hash, KMER_HASH_REFERENCE_INDEX, all_entries_are_unique);
    
    log_and_screen_printf("\nRead of file complete. Total kmers: %'lld\n", hash_table_get_unique_kmers(kmer_hash));
    hash_table_print_stats(kmer_hash);
    return kmer_hash;

}   
Beispiel #4
0
static long long load_reads_coverage_table(KmerStatsCmdLine cmd_line,  KmerHash * kmer_hash){
    
    
    log_and_screen_printf("\nLoading sample from: %s\n", cmd_line.input_filename);
    long long loaded_kmers = 0;
    if(cmd_line.format == KMERS){
        //boolean all_entries_are_unique = false;
        //loaded_kmers = load_binary_from_filename_into_kmers_hash(cmd_line.input_filename, kmer_hash, KMER_HASH_SAMPLE_INDEX, all_entries_are_unique);
        loaded_kmers = load_kmers_binary_from_filename_update_coverage(cmd_line.input_filename, kmer_hash, KMER_HASH_SAMPLE_INDEX);
    }else{
        KmerFileReaderArgs fra;
        fra.bad_reads = 0;
        fra.colour = KMER_HASH_SAMPLE_INDEX;
        fra.fastq_ascii_offset = cmd_line.quality_score_offset;
        KmerFileReaderInnerArgs fria;
        fria.format = cmd_line.format;
        fria.kmer_size = kmer_hash->kmer_size;
        fria.max_read_length = 1000;
        fria.new_entry = true;
        fra.filename = cmd_line.input_filename;
        fra.quality_cut_off = cmd_line.quality_score_threshold;
        fra.inner_args = &fria;
        fra.insert = false;
        fra.max_read_length = 1000;
        fra.maximum_ocupancy = 75;
        fra.KmerHash = kmer_hash;
           //fp, seq, max_read_length, full_entry, &full_entry
        
        loaded_kmers = load_seq_into_kmers_hash(&fra);
        log_and_screen_printf("Loaded %'lld kmers (bad reads %'lld)", loaded_kmers, fra.bad_reads);
        hash_table_print_stats(kmer_hash);
    }
   
    
    
    return loaded_kmers;
}
Beispiel #5
0
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  bool tip_cleaning = false, supernode_cleaning = false;
  size_t min_keep_tip = 0;
  Covg threshold = 0, fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(!tip_cleaning, cmd);
        min_keep_tip = cmd_uint32_nonzero(cmd, optarg);
        tip_cleaning = true;
        break;
      case 'S':
        cmd_check(!supernode_cleaning, cmd);
        if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg);
        supernode_cleaning = true;
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  // Default behaviour
  if(!tip_cleaning && !supernode_cleaning) {
    if(out_ctx_path != NULL)
      supernode_cleaning = tip_cleaning = true; // do both
    else
      warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  bool doing_cleaning = (supernode_cleaning || tip_cleaning);

  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
  }

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    cmd_print_usage("You gave --len-after <out> / --covg-after <out> without "
                    "any cleaning (set -s, --supernodes or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !supernode_cleaning)
    cmd_print_usage("-B, --fallback <T> without --supernodes");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(!doing_cleaning)
  {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(tip_cleaning && min_keep_tip == 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol, intocol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && supernode_cleaning) {
        warn("%s:%zu already has supernode cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_before_path);
  if(tip_cleaning)
    status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip);
  if(supernode_cleaning && threshold > 0)
    status("%zu. Cleaning supernodes with coverage < %u", step++, threshold);
  if(supernode_cleaning && threshold <= 0)
    status("%zu. Cleaning supernodes with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8;
  size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) /
                        (per_kmer_per_col_bits * kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path);

  // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // Edges is a special case
  size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded);
  db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges));

  // Load graph into a single colour
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  outhdr.version = CTX_GRAPH_FILEFORMAT;
  outhdr.kmer_size = db_graph.kmer_size;
  outhdr.num_of_cols = ncols;
  outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64;
  graph_header_alloc(&outhdr, ncols);

  // Merge info into header
  size_t gcol = 0;
  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      intocol = file_filter_intocol(&gfiles[i].fltr, j);
      graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]);
    }
  }

  if(ncols > use_ncols) {
    graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats);
  } else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, &stats);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path)
  {
    // Get coverage distribution and estimate cleaning threshold
    int est_threshold = cleaning_get_threshold(nthreads,
                                               covg_before_path,
                                               len_before_path,
                                               visited, &db_graph);

    if(est_threshold < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_threshold);

    // Use estimated threshold if threshold not set
    if(threshold <= 0) {
      if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        threshold = fallback_thresh;
      }
      else if(est_threshold >= 0) threshold = est_threshold;
    }
  }

  // Die if we failed to find suitable cleaning threshold
  if(supernode_cleaning && threshold <= 0)
    die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)");

  if(doing_cleaning) {
    // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0)
    clean_graph(nthreads, threshold, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(doing_cleaning)
  {
    // Output graph file
    Edges *intersect_edges = NULL;
    bool kmers_loaded = true;
    size_t col, thresh;

    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
    {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= supernode_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(supernode_cleaning) {
        thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold)
                                          : (uint32_t)threshold;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    if(!all_colours_loaded)
    {
      // We haven't loaded all the colours
      // intersect_edges are edges to mask with
      // resets graph edges
      intersect_edges = db_graph.col_edges;
      db_graph.col_edges += db_graph.ht.capacity;
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    graph_files_merge(out_ctx_path, gfiles, num_gfiles,
                      kmers_loaded, all_colours_loaded,
                      intersect_edges, &outhdr, &db_graph);

    // Swap back
    if(!all_colours_loaded)
      db_graph.col_edges = intersect_edges;
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #6
0
// Returns 0 on success, otherwise != 0
int ctx_unitigs(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  UnitigSyntax syntax = PRINT_FASTA;
  bool dot_use_points = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'F': cmd_check(!syntax, cmd); syntax = PRINT_FASTA; break;
      case 'g': cmd_check(!syntax, cmd); syntax = PRINT_GFA; break;
      case 'd': cmd_check(!syntax, cmd); syntax = PRINT_DOT; break;
      case 'P': cmd_check(!dot_use_points, cmd); dot_use_points = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" unitigs -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(dot_use_points && syntax == PRINT_FASTA)
    cmd_print_usage("--point is only for use with --dot");

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage(NULL);

  size_t i, num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  if(dot_use_points && syntax != PRINT_DOT)
    cmd_print_usage("--points only valid with --graphviz / --dot");

  ctx_assert(num_gfiles > 0);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(gfile_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + 1;
  if(syntax != PRINT_FASTA) bits_per_kmer += sizeof(UnitigEnd) * 8;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  status("Output in %s format to %s\n", syntax_strs[syntax],
         futil_outpath_str(out_path));

  //
  // Open output file
  //

  // Print to stdout unless --out <out> is specified
  FILE *fout = futil_fopen_create(out_path, "w");

  //
  // Allocate memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES);

  UnitigPrinter printer;
  unitig_printer_init(&printer, &db_graph, nthreads, syntax, fout);

  if(syntax == PRINT_DOT || syntax == PRINT_GFA)
    unitig_graph_alloc(&printer.ugraph, &db_graph);

  // Load graphs
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .empty_colours = false};

  for(i = 0; i < num_gfiles; i++) {
    file_filter_flatten(&gfiles[i].fltr, 0);
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  switch(syntax)
  {
    case PRINT_FASTA:
      status("Printing unitgs in FASTA using %zu threads", nthreads);
      supernodes_iterate(nthreads, printer.visited, &db_graph,
                         print_unitig_fasta, &printer);
      break;
    case PRINT_GFA:
      print_gfa_syntax(&printer);
      break;
    case PRINT_DOT:
      print_dot_syntax(&printer, dot_use_points);
      break;
    default:
      die("Invalid print syntax: %i", syntax);
  }

  char num_unitigs_str[50];
  ulong_to_str(printer.num_unitigs, num_unitigs_str);
  status("Dumped %s unitigs\n", num_unitigs_str);

  fclose(fout);

  unitig_printer_destroy(&printer);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #7
0
int ctx_pop_bubbles(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  int32_t max_covg  = -1; // max mean coverage to remove <=0 => ignore
  int32_t max_klen  = -1; // max length (kmers) to remove <=0 => ignore
  int32_t max_kdiff = -1; // max diff between bubble branch lengths <0 => ignore

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'C': cmd_check(max_covg<0,  cmd); max_covg  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_klen<0,  cmd); max_klen  = cmd_uint32(cmd, optarg); break;
      case 'D': cmd_check(max_kdiff<0, cmd); max_kdiff = cmd_uint32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" pop -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  bool reread_graph_to_filter = (num_gfiles == 1 &&
                                 strcmp(file_filter_path(&gfiles[0].fltr),"-") != 0);

  if(reread_graph_to_filter) {
    file_filter_flatten(&gfiles[0].fltr, 0);
    ncols = 1;
  }

  // Check graphs are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(Covg)*8*ncols +
                  sizeof(Edges)*8*ncols +
                  2; // 1 bit for visited, 1 for removed

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Check out_path is writable
  futil_create_output(out_path);

  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, ncols,
                 kmers_in_hash,  DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  size_t nkwords = roundup_bits2bytes(db_graph.ht.capacity);
  uint8_t *visited = ctx_calloc(1, nkwords);
  uint8_t *rmvbits  = ctx_calloc(1, nkwords);

  //
  // Load graphs
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  PopBubblesPrefs prefs = {.max_rmv_covg = max_covg,
                           .max_rmv_klen = max_klen,
                           .max_rmv_kdiff = max_kdiff};
  size_t npopped = 0;
  char npopped_str[50];

  status("Popping bubbles...");
  npopped = pop_bubbles(&db_graph, nthreads, prefs, visited, rmvbits);
  ulong_to_str(npopped, npopped_str);
  status("Popped %s bubbles", npopped_str);

  size_t nkmers0 = db_graph.ht.num_kmers;
  status("Removing nodes...");
  for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i];
  prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph);
  size_t nkmers1 = db_graph.ht.num_kmers;

  ctx_assert(nkmers1 <= nkmers0);
  char nkmers0str[50], nkmers1str[50], ndiffstr[50];
  ulong_to_str(nkmers0, nkmers0str);
  ulong_to_str(nkmers1, nkmers1str);
  ulong_to_str(nkmers0-nkmers1, ndiffstr);
  status("Number of kmers %s -> %s (-%s)", nkmers0str, nkmers1str, ndiffstr);

  if(reread_graph_to_filter)
  {
    status("Streaming filtered file to: %s\n", out_path);
    GraphFileReader gfile;
    memset(&gfile, 0, sizeof(GraphFileReader));
    graph_file_open(&gfile, graph_paths[0]);
    graph_writer_stream_mkhdr(out_path, &gfile, &db_graph,
                              db_graph.col_edges, NULL);
    graph_file_close(&gfile);
  }
  else
  {
    status("Saving to: %s\n", out_path);
    graph_writer_save_mkhdr(out_path, &db_graph, CTX_GRAPH_FILEFORMAT, NULL,
                          0, ncols);
  }

  ctx_free(visited);
  ctx_free(rmvbits);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #8
0
HashTable *  hash_table_read_dumped_memory(char * filename ){
	
	HashTable * hash = calloc(1, sizeof(HashTable));
	
	FILE * fp = fopen(filename, "rb");
	int magic_size = strlen(MAGIC_TEXT);
	char * magic = calloc(magic_size, sizeof(char));
	int size_ht;
	int size_e = sizeof(Element);
	size_t readed; 
	short version;
	long long number_buckets;
	int bucket_size; 
	long long hash_size;	
	
	
	if(fp == NULL){
		exit_while_reading(fp, filename);
	}

	//Header stuff, this is enough information to prepare the hash table.  
	readed = fread(magic, sizeof(char), magic_size, fp);
	validate_read(readed, magic_size,  fp,  filename);
	if(strcmp(magic, MAGIC_TEXT) != 0){
		log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid magic number!\n");
		fclose(fp);
		exit(-1);
		
	}
	//#printf("%s\n", magic);
	
	readed = fread(&version, sizeof(short), 1, fp);
	validate_read(readed, 1,  fp,  filename);
	//#printf("%d\n", version);
	if(version != HASH_VERSION){
		log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid version number!\n");
		exit_while_reading(fp, filename);
		
	}
	readed = fread(&size_ht, sizeof(int), 1,fp);
	validate_read(readed, 1,  fp,  filename);
	//#printf("%d\n", size_ht);
	if(size_ht != sizeof(HashTable)){
		log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid size of hash table!\n");
		exit_while_reading(fp, filename);
		
	}
	
	readed = fread(&size_e, sizeof(int), 1,fp);
	validate_read(readed, 1,  fp,  filename);
	//#printf("%d\n", size_e);
	if(size_e != sizeof(Element)){
		log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid size of element!\n");
		exit_while_reading(fp, filename);
		
	}
		
	readed = fread(&hash->kmer_size, sizeof(short), 1, fp);
	//printf("kmer size %d\n", hash->kmer_size);
	validate_read(readed, 1,  fp,  filename);
	
	readed = fread(&number_buckets, sizeof(long long), 1, fp);
	validate_read(readed, 1,  fp,  filename);
	//printf("number of buckets %lld\n",number_buckets);
	
	readed = fread(&bucket_size, sizeof(int), 1, fp);
	validate_read(readed, 1,  fp,  filename);
	//printf("bucket size%d \n",bucket_size);
	
	readed = fread(&hash->max_rehash_tries, sizeof(int), 1, fp);
	validate_read(readed, 1,  fp,  filename);
	//printf("hash->max_rehash_tries %d\n", hash->max_rehash_tries);
	
	readed = fread(&hash->unique_kmers, sizeof(long long), 1, fp);
	validate_read(readed, 1,  fp,  filename);
//	printf("hash->unique_kmers %lld\n", hash->unique_kmers);
	
	hash->number_buckets = number_buckets ;
	hash->bucket_size = bucket_size; 
	
	hash_size=number_buckets * bucket_size;	
	//printf("Hash size %lld \n", hash_size);
	
	//Allocating the table according to the description of the file
	hash->table = calloc(hash_size, sizeof(Element));
	hash->next_element = calloc(number_buckets, sizeof(int));
	hash->collisions = calloc(number_buckets, sizeof(long long));
	
	if(hash->table == NULL){
		log_and_screen_printf( "Unable to create hash table\n ");
		exit_while_reading(fp, filename);
	}
	
	//Reading the actual data. 
	readed = fread(hash->table, size_e, hash_size, fp);
	validate_read(readed, hash_size,  fp,  filename);
	
	readed = fread(hash->next_element, sizeof(int), number_buckets, fp);
	validate_read(readed, number_buckets,  fp,  filename);
	
	
	readed = fread(hash->collisions, sizeof(long long), hash->max_rehash_tries, fp);
	validate_read(readed, hash->max_rehash_tries,  fp,  filename);
	
	fclose(fp);
	
	log_and_screen_printf("Hash readed from: %s \n" , filename);
	hash_table_print_stats(hash);
	
	return hash;
}
Beispiel #9
0
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean
  uint32_t fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(min_keep_tip<0, cmd);
        min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1);
        break;
      case 'S':
      case 'U':
        cmd_check(unitig_min<0, cmd);
        unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1);
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  bool unitig_cleaning = (unitig_min != 0);
  bool tip_cleaning = (min_keep_tip != 0);
  bool doing_cleaning = (unitig_cleaning || tip_cleaning);

  // If you ever want to estimate cleaning threshold without outputting
  // a graph, change this to a warning
  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
    // warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    warn("You gave --len-after <out> / --covg-after <out> without "
         "any cleaning (set -U, --unitigs or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !unitig_cleaning)
    warn("-B, --fallback <T> without --unitigs");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(out_ctx_path == NULL)
  {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(min_keep_tip < 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && unitig_cleaning) {
        warn("%s:%zu already has unitig cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_before_path);
  if(min_keep_tip > 0)
    status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip);
  if(unitig_min > 0)
    status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min);
  if(unitig_min < 0)
    status("%zu. Cleaning unitigs with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8;
  size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  per_col_bits * use_ncols +
                  extra_edge_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 -
                         sizeof(BinaryKmer)*8*kmers_in_hash +
                         extra_edge_bits*kmers_in_hash) /
                        (per_col_bits*kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path);

  // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  // Extra edges required to hold union of kept edges
  Edges *edges_union = NULL;
  if(use_ncols < ncols)
    edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges));

  // Load graph into a single colour
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  for(i = 0; i < num_gfiles; i++)
    graph_file_merge_header(&outhdr, &gfiles[i]);

  if(ncols > use_ncols)
  {
    db_graph.num_of_cols = db_graph.num_edge_cols = 1;
    SWAP(edges_union, db_graph.col_edges);
    graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL);
    SWAP(edges_union, db_graph.col_edges);
    db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols;
  }
  else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, NULL);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  // Always estimate cleaning threshold
  // if(unitig_min <= 0 || covg_before_path || len_before_path)
  // {
    // Get coverage distribution and estimate cleaning threshold
    int est_min_covg = cleaning_get_threshold(nthreads,
                                              covg_before_path,
                                              len_before_path,
                                              visited, &db_graph);

    if(est_min_covg < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_min_covg);

    // Use estimated threshold if threshold not set
    if(unitig_min < 0) {
      if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        unitig_min = fallback_thresh;
      }
      else if(est_min_covg >= 0) unitig_min = est_min_covg;
    }
  // }

  // Die if we failed to find suitable cleaning threshold
  if(unitig_min < 0)
    die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)");

  // Cleaning parameters should now be set (>0) or turned off (==0)
  ctx_assert(unitig_min >= 0);
  ctx_assert(min_keep_tip >= 0);

  if(unitig_min || min_keep_tip)
  {
    // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0)
    clean_graph(nthreads, unitig_min, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(out_ctx_path != NULL)
  {
    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
    {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= unitig_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(unitig_cleaning) {
        size_t thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min)
                                          : (uint32_t)unitig_min;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    // kmers_loaded=true
    graph_writer_merge(out_ctx_path, gfiles, num_gfiles,
                      true, all_colours_loaded,
                      edges_union, &outhdr, &db_graph);
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  // TODO: report kmer coverage for each sample

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  ctx_free(edges_union);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #10
0
int ctx_contigs(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t i, contig_limit = 0, colour = 0;
  bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R
  const char *conf_table_path = NULL; // save confidence table to here
  bool use_missing_info_check = true, seed_with_unused_paths = false;
  double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min

  // Read length and expected depth for calculating confidences
  size_t genome_size = 0;

  seq_file_t *tmp_seed_file = NULL;
  SeqFilePtrBuffer seed_buf;
  seq_file_ptr_buf_alloc(&seed_buf, 16);

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100], shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_path,cmd); out_path = optarg; break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case '1':
      case 's': // --seed <in.fa>
        if((tmp_seed_file = seq_open(optarg)) == NULL)
          die("Cannot read --seed file: %s", optarg);
        seq_file_ptr_buf_add(&seed_buf, tmp_seed_file);
        break;
      case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break;
      case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break;
      case 'N':
        cmd_check(!contig_limit,cmd);
        contig_limit = cmd_uint32_nonzero(cmd, optarg);
        break;
      case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break;
      case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break;
      case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break;
      case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break;
      case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break;
      case 'C':
        cmd_check(min_cumul_confid < 0,cmd);
        min_cumul_confid = cmd_udouble(cmd,optarg);
        if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd);
        break;
      case 'T':
        cmd_check(min_step_confid < 0,cmd);
        min_step_confid = cmd_udouble(cmd,optarg);
        if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(cmd_no_reseed && cmd_reseed)
    cmd_print_usage("Cannot specify both -r and -R");

  if(contig_limit && seed_with_unused_paths)
    cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths");

  bool sample_with_replacement = cmd_reseed;

  // Defaults
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(!seed_buf.len && !contig_limit && sample_with_replacement) {
    cmd_print_usage("Please specify one or more of: "
                    "--no-reseed | --ncontigs | --seed <in.fa>");
  }

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(graph_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  // char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  // GraphFileReader gfile;
  // memset(&gfile, 0, sizeof(GraphFileReader));
  // graph_file_open(&gfile, ctx_path);

  // Update colours in graph file - sample in 0, all others in 1
  // never need more than two colours
  ncols = gpath_load_sample_pop(gfiles, num_gfiles,
                                gpfiles.b, gpfiles.len, colour);

  // Check for compatibility between graph files and path files
  // pop_colour is colour 1
  graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1);

  if(!genome_size)
  {
    char nk_str[50];
    if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming");
    genome_size = ctx_max_kmers;
    ulong_to_str(genome_size, nk_str);
    status("Taking number of kmers as genome size: %s", nk_str);
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of kmer usage
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  ncols + !sample_with_replacement;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  // Load contig hist distribution from ctp files
  ZeroSizeBuffer contig_hist;
  memset(&contig_hist, 0, sizeof(contig_hist));

  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load_contig_hist(gpfiles.b[i].json,
                                  gpfiles.b[i].fltr.path.b,
                                  file_filter_fromcol(&gpfiles.b[i].fltr, 0),
                                  &contig_hist);
  }

  // Calculate confidences, only for one colour
  ContigConfidenceTable conf_table;
  conf_table_alloc(&conf_table, 1);
  conf_table_update_hist(&conf_table, 0, genome_size,
                         contig_hist.b, contig_hist.len);

  if(conf_table_path != NULL) {
    conf_table_save(&conf_table, conf_table_path);
  }

  zsize_buf_dealloc(&contig_hist);

  //
  // Output file if printing
  //
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL;

  // Allocate
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem,
                             false, &db_graph);

  uint8_t *visited = NULL;

  if(!sample_with_replacement)
    visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  // Load graph
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .empty_colours = true};

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, &stats);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  // Load path files
  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles.b[i]);
  }
  gpfile_buf_dealloc(&gpfiles);

  AssembleContigStats assem_stats;
  assemble_contigs_stats_init(&assem_stats);

  assemble_contigs(nthreads, seed_buf.b, seed_buf.len,
                   contig_limit, visited,
                   use_missing_info_check, seed_with_unused_paths,
                   min_step_confid, min_cumul_confid,
                   fout, out_path, &assem_stats, &conf_table,
                   &db_graph, 0); // Sample always loaded into colour zero

  if(fout && fout != stdout) fclose(fout);

  assemble_contigs_stats_print(&assem_stats);
  assemble_contigs_stats_destroy(&assem_stats);

  conf_table_dealloc(&conf_table);

  for(i = 0; i < seed_buf.len; i++)
    seq_close(seed_buf.b[i]);

  seq_file_ptr_buf_dealloc(&seed_buf);

  ctx_free(visited);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #11
0
int ctx_bubbles(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t max_allele_len = 0, max_flank_len = 0;
  bool remove_serial_bubbles = true;

  // List of haploid colours
  size_t *hapcols = NULL;
  int nhapcols = 0;
  char *hapcols_arg = NULL;

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'H': cmd_check(!hapcols_arg, cmd); hapcols_arg = optarg; break;
      case 'A': cmd_check(!max_allele_len, cmd); max_allele_len = cmd_uint32_nonzero(cmd, optarg); break;
      case 'F': cmd_check(!max_flank_len, cmd); max_flank_len = cmd_uint32_nonzero(cmd, optarg); break;
      case 'S': cmd_check(remove_serial_bubbles,cmd); remove_serial_bubbles = false; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;
  if(max_allele_len == 0) max_allele_len = DEFAULT_MAX_ALLELE;
  if(max_flank_len == 0) max_flank_len = DEFAULT_MAX_FLANK;

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  // Check graph + paths are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1);

  //
  // Check haploid colours are valid
  //
  if(hapcols_arg != NULL) {
    if((nhapcols = range_get_num(hapcols_arg, ncols)) < 0)
      die("Invalid haploid colour list: %s", hapcols_arg);

    hapcols = ctx_calloc(nhapcols, sizeof(hapcols[0]));
    if(range_parse_array(hapcols_arg, hapcols, ncols) < 0)
      die("Invalid haploid colour list: %s", hapcols_arg);
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, thread_mem;
  char thread_mem_str[100];

  // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) +
  // visitedfw/rv(2bits/thread)

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 +
                  (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0) +
                  ncols + 2*nthreads;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  // Thread memory
  thread_mem = roundup_bits2bytes(kmers_in_hash) * 2;
  bytes_to_str(thread_mem * nthreads, 1, thread_mem_str);
  status("[memory] (of which threads: %zu x %zu = %s)\n",
          nthreads, thread_mem, thread_mem_str);

  // Paths memory
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  size_t total_mem = graph_mem + thread_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  //
  // Open output file
  //
  gzFile gzout = futil_gzopen_create(out_path, "w");

  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph);

  //
  // Load graphs
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  // Load link files
  for(i = 0; i < gpfiles.len; i++)
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);

  // Create array of cJSON** from input files
  cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*));
  for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json;

  // Now call variants
  BubbleCallingPrefs call_prefs = {.max_allele_len = max_allele_len,
                                   .max_flank_len = max_flank_len,
                                   .haploid_cols = hapcols,
                                   .nhaploid_cols = nhapcols,
                                   .remove_serial_bubbles = remove_serial_bubbles};

  invoke_bubble_caller(nthreads, &call_prefs,
                       gzout, out_path,
                       hdrs, gpfiles.len,
                       &db_graph);

  status("  saved to: %s\n", out_path);
  gzclose(gzout);
  ctx_free(hdrs);

  // Close input link files
  for(i = 0; i < gpfiles.len; i++)
    gpath_reader_close(&gpfiles.b[i]);
  gpfile_buf_dealloc(&gpfiles);

  ctx_free(hapcols);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #12
0
int ctx_exp_abc(int argc, char **argv)
{
  size_t i, nthreads = 0, num_repeats = 0, max_AB_dist = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  bool print_failed_contigs = false;

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case 'N': cmd_check(!num_repeats,cmd); num_repeats = cmd_uint32_nonzero(cmd, optarg); break;
      case 'M': cmd_check(!max_AB_dist,cmd); max_AB_dist = cmd_uint32_nonzero(cmd, optarg); break;
      case 'P': cmd_check(!print_failed_contigs,cmd); print_failed_contigs = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" exp_abc -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;
  if(num_repeats == 0) num_repeats = DEFAULT_NUM_REPEATS;
  if(max_AB_dist == 0) max_AB_dist = DEFAULT_MAX_AB_DIST;

  if(print_failed_contigs && nthreads != 1) {
    warn("--print forces nthreads to be one. soz.");
    nthreads = 1;
  }

  if(optind+1 != argc) cmd_print_usage("Require exactly one input graph file (.ctx)");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open(&gfile, ctx_path);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);

  // Check only loading one colour
  if(ncols > 1) die("Only implemented for one colour currently");

  // Check graph + paths are compatible
  graphs_gpaths_compatible(&gfile, 1, gpfiles.b, gpfiles.len, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of kmer usage
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  ncols;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        gfile.num_of_kmers, gfile.num_of_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  //
  // Allocate memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile.hdr.kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph);

  // Load the graph
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  graph_load(&gfile, gprefs, NULL);
  graph_file_close(&gfile);

  hash_table_print_stats(&db_graph.ht);

  // Load link files
  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles.b[i]);
  }
  gpfile_buf_dealloc(&gpfiles);

  status("\n");
  status("Test 1: Priming region A->B (n: %zu max_AB_dist: %zu)",
         num_repeats, max_AB_dist);

  run_exp_abc(&db_graph, true, nthreads, num_repeats,
              max_AB_dist, print_failed_contigs);

  status("\n");
  status("Test 2: Trying to traverse A->B (n: %zu max_AB_dist: %zu)",
         num_repeats, max_AB_dist);

  run_exp_abc(&db_graph, false, nthreads, num_repeats,
              max_AB_dist, print_failed_contigs);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #13
0
int ctx_vcfcov(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL, *out_type = NULL;

  uint32_t max_allele_len = 0, max_gt_vars = 0;
  char *ref_path = NULL;
  bool low_mem = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;
  size_t i;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break;
      case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break;
      case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break;
      case 'M': cmd_check(!low_mem, cmd); low_mem = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)");
  if(optind+2 > argc) cmd_print_usage("Require VCF and graph files");

  if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN;
  if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS;

  status("[vcfcov] max allele length: %u; max number of variants: %u",
         max_allele_len, max_gt_vars);

  // open ref
  // index fasta with: samtools faidx ref.fa
  faidx_t *fai = fai_load(ref_path);
  if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path);

  // Open input VCF file
  const char *vcf_path = argv[optind++];
  htsFile *vcffh = hts_open(vcf_path, "r");
  if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path);
  bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh);
  if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path);

  // Test we can close and reopen files
  if(low_mem) {
    if((vcffh = hts_open(vcf_path, "r")) == NULL)
      die("Cannot re-open VCF file: %s", vcf_path);
    if((vcfhdr = bcf_hdr_read(vcffh)) == NULL)
      die("Cannot re-read VCF header: %s", vcf_path);
  }

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  // Check graph + paths are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols;
  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        low_mem ? -1 : (int64_t)ctx_max_kmers,
                                        ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Open output file
  //
  // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *outfh = hts_open(out_path, modes_htslib[mode]);
  status("[vcfcov] Output format: %s", hsmodes_htslib[mode]);


  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_COVGS);

  //
  // Set up tag names
  //

  // *R => ref, *A => alt
  sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage
  sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size);

  // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK>
  // - K29_kcov is empirical kmer coverage
  // - K29_nkmers is the number of kmers in the sample
  // - mean_read_length is the mean read length in bases
  char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20];
  sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage
  sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size);
  sprintf(sample_rlk_tag, "mean_read_length");

  //
  // Load kmers if we are using --low-mem
  //

  VcfCovStats st;
  memset(&st, 0, sizeof(st));
  VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag,
                       .kcov_alt_tag = kcov_alt_tag,
                       .max_allele_len = max_allele_len,
                       .max_gt_vars = max_gt_vars,
                       .load_kmers_only = false};

  if(low_mem)
  {
    status("[vcfcov] Loading kmers from VCF+ref");

    prefs.load_kmers_only = true;
    vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai,
                NULL, &prefs, &st, &db_graph);

    // Close files
    hts_close(vcffh);
    bcf_hdr_destroy(vcfhdr);

    // Re-open files
    if((vcffh = hts_open(vcf_path, "r")) == NULL)
      die("Cannot re-open VCF file: %s", vcf_path);
    if((vcfhdr = bcf_hdr_read(vcffh)) == NULL)
      die("Cannot re-read VCF header: %s", vcf_path);

    prefs.load_kmers_only = false;
  }

  //
  // Load graphs
  //
  GraphLoadingStats gstats;
  memset(&gstats, 0, sizeof(gstats));

  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.must_exist_in_graph = low_mem;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, &gstats);
    graph_file_close(&gfiles[i]);
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  //
  // Set up VCF header / graph matchup
  //
  size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t));

  // Add samples to vcf header
  bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr);
  bcf_hrec_t *hrec;
  int sid;
  char hdrstr[200];

  for(i = 0; i < db_graph.num_of_cols; i++) {
    char *sname = db_graph.ginfo[i].sample_name.b;
    if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) {
      bcf_hdr_add_sample(outhdr, sname);
      sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname);
    }
    samplehdrids[i] = sid;

    // Add SAMPLE field
    hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE");

    if(hrec == NULL) {
      sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname,
              sample_kcov_tag,
              gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0,
              sample_nk_tag, gstats.nkmers[i],
              sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length);
      bcf_hdr_append(outhdr, hdrstr);
    }
    else {
      // mean kcovg
      sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]);
      vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr);
      // num kmers
      sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]);
      vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr);
      // mean read length in kmers
      sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length);
      vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr);
    }

    status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]);
  }

  // Add genotype format fields
  // One field per alternative allele

  sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer,"
          "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n",
          kcov_ref_tag, db_graph.kmer_size);
  bcf_hdr_append(outhdr, hdrstr);
  sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer,"
          "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n",
          kcov_alt_tag, db_graph.kmer_size);
  bcf_hdr_append(outhdr, hdrstr);

  bcf_hdr_set_version(outhdr, "VCFv4.2");

  // Add command string to header
  vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd());

  if(bcf_hdr_write(outfh, outhdr) != 0)
    die("Cannot write header to: %s", futil_outpath_str(out_path));

  status("[vcfcov] Reading %s and adding coverage", vcf_path);

  // Reset stats and get coverage
  memset(&st, 0, sizeof(st));

  vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai,
              samplehdrids, &prefs, &st, &db_graph);

  // Print statistics
  char ns0[50], ns1[50];
  status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0));
  status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0));
  status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0));
  status("[vcfcov] ALTs used: %s / %s (%.2f%%)",
         ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len,
         ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)",
         max_gt_vars, db_graph.kmer_size,
         ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)",
         ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0);

  status("[vcfcov] Saved to: %s\n", out_path);

  ctx_free(samplehdrids);
  graph_loading_stats_destroy(&gstats);

  bcf_hdr_destroy(vcfhdr);
  bcf_hdr_destroy(outhdr);
  hts_close(vcffh);
  hts_close(outfh);
  fai_destroy(fai);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}