Esempio n. 1
0
// Merge temporary files, closes tmp files
void futil_merge_tmp_files(FILE **tmp_files, size_t num_files, FILE *fout)
{
  #define TMP_BUF_SIZE (32 * ONE_MEGABYTE)

  char *data = ctx_malloc(TMP_BUF_SIZE);
  size_t i, len;
  FILE *fh;

  for(i = 0; i < num_files; i++)
  {
    fh = tmp_files[i];
    if(fseek(fh, 0L, SEEK_SET) != 0) die("fseek error");

    while((len = fread(data, 1, TMP_BUF_SIZE, fh)) > 0)
      if(fwrite(data, 1, len, fout) != len)
        die("write error [%s]", strerror(errno));

    if(ferror(fh)) warn("fread error: %s", strerror(errno));
    fclose(fh);
  }

  ctx_free(data);

  #undef TMP_BUF_SIZE
}
Esempio n. 2
0
// Returns sorted array of hkey_t from the hash table
hkey_t* hash_table_sorted(const HashTable *htable)
{
  ctx_assert(sizeof(hkey_t) == sizeof(BinaryKmer*));
  ctx_assert(sizeof(hkey_t) == sizeof(BkmerPtrHkeyUnion));
  BkmerPtrHkeyUnion *kmers, *nxt, *end;
  nxt = kmers = ctx_malloc(sizeof(BkmerPtrHkeyUnion) * htable->num_kmers);
  end = kmers + htable->num_kmers;
  HASH_ITERATE(htable, _fetch_kmer_union, htable, &nxt);
  // Can sort ignoring that the top flag bit is set on all kmers
  qsort(kmers, htable->num_kmers, sizeof(BinaryKmer*), binary_kmers_qcmp_ptrs);
  for(nxt = kmers; nxt < end; nxt++) nxt->h = nxt->bptr - htable->table;
  return (hkey_t*)kmers;
}
Esempio n. 3
0
/**
 * Calculate cleaning threshold for supernodes from a given distribution
 * of supernode coverages
 * @param covgs histogram of supernode coverages
 */
size_t cleaning_pick_supernode_threshold(const uint64_t *covgs, size_t len,
                                         double seq_depth,
                                         const dBGraph *db_graph)
{
  ctx_assert(len > 5);
  ctx_assert(db_graph->ht.num_kmers > 0);

  size_t i, d1len = len-2, d2len = len-3, f1, f2;
  double *tmp = ctx_malloc((d1len+d2len) * sizeof(double));
  double *delta1 = tmp, *delta2 = tmp + d1len;

  // Get sequencing depth from coverage
  uint64_t covg_sum = 0, capacity = db_graph->ht.capacity * db_graph->num_of_cols;
  for(i = 0; i < capacity; i++) covg_sum += db_graph->col_covgs[i];
  double seq_depth_est = (double)covg_sum / db_graph->ht.num_kmers;

  status("[cleaning] Kmer depth before cleaning supernodes: %.2f", seq_depth_est);
  if(seq_depth <= 0) seq_depth = seq_depth_est;
  else status("[cleaning] Using sequence depth argument: %f", seq_depth);

  size_t fallback_thresh = (size_t)MAX2(1, (seq_depth+1)/2);

  // +1 to ensure covgs is never 0
  for(i = 0; i < d1len; i++) delta1[i] = (double)(covgs[i+1]+1) / (covgs[i+2]+1);

  d1len = i;
  d2len = d1len - 1;

  if(d1len <= 2) {
    status("[cleaning]  (using fallback1)\n");
    ctx_free(tmp);
    return fallback_thresh;
  }

  // d2len is d1len-1
  for(i = 0; i < d2len; i++) delta2[i] = delta1[i] / delta1[i+1];

  for(f1 = 0; f1 < d1len && delta1[f1] >= 1; f1++);
  for(f2 = 0; f2 < d2len && delta2[f2] > 1; f2++);

  ctx_free(tmp);

  if(f1 < d1len && f1 < (seq_depth*0.75))
  { status("[cleaning]   (using f1)"); return f1+1; }
  else if(f2 < d2len)
  { status("[cleaning]   (using f2)"); return f2+1; }
  else
  { status("[cleaning]   (using fallback1)"); return fallback_thresh+1; }
}
Esempio n. 4
0
void chrom_hash_load(char const*const* paths, size_t num_files,
                     ReadBuffer *chroms, ChromHash *genome)
{
  size_t i;

  seq_file_t **ref_files = ctx_malloc(num_files * sizeof(seq_file_t*));

  for(i = 0; i < num_files; i++)
    if((ref_files[i] = seq_open(paths[i])) == NULL)
      die("Cannot read sequence file: %s", paths[i]);

  chrom_hash_load2(ref_files, num_files, chroms, genome);

  ctx_free(ref_files);
}
Esempio n. 5
0
void hash_table_alloc(HashTable *ht, uint64_t req_capacity)
{
  uint64_t num_of_buckets, capacity;
  uint8_t bucket_size;

  capacity = hash_table_cap(req_capacity, &num_of_buckets, &bucket_size);
  uint_fast32_t hash_mask = (uint_fast32_t)(num_of_buckets - 1);

  size_t mem = capacity * sizeof(BinaryKmer) +
               num_of_buckets * sizeof(uint8_t[2]);

  char num_bkts_str[100], bkt_size_str[100], cap_str[100], mem_str[100];
  ulong_to_str(num_of_buckets, num_bkts_str);
  ulong_to_str(bucket_size, bkt_size_str);
  ulong_to_str(capacity, cap_str);
  bytes_to_str(mem, 1, mem_str);
  status("[hasht] Allocating table with %s entries, using %s", cap_str, mem_str);
  status("[hasht]  number of buckets: %s, bucket size: %s", num_bkts_str, bkt_size_str);

  // calloc is required for bucket_data to set the first element of each bucket
  // to the 0th pos
  BinaryKmer *table = ctx_malloc(capacity * sizeof(BinaryKmer));
  uint8_t (*const buckets)[2] = ctx_calloc(num_of_buckets, sizeof(uint8_t[2]));

  size_t i;
  for(i = 0; i < capacity; i++) table[i] = unset_bkmer;

  HashTable data = {
    .table = table,
    .num_of_buckets = num_of_buckets,
    .hash_mask = hash_mask,
    .bucket_size = bucket_size,
    .capacity = capacity,
    .buckets = buckets,
    .num_kmers = 0,
    .collisions = {0},
    .seed = rand()};

  memcpy(ht, &data, sizeof(data));
}

void hash_table_dealloc(HashTable *hash_table)
{
  ctx_free(hash_table->table);
  ctx_free(hash_table->buckets);
}
Esempio n. 6
0
// Merge temporary files, closes tmp files
void futil_merge_tmp_files(FILE **tmp_files, size_t num_files, FILE *fout)
{
  #define TMP_BUF_SIZE (1<<25) /* 32MB */

  char *data = ctx_malloc(TMP_BUF_SIZE);
  size_t i, len;
  FILE *tmp_file;

  for(i = 0; i < num_files; i++)
  {
    tmp_file = tmp_files[i];
    if(fseek(tmp_file, 0L, SEEK_SET) == -1) die("gzseek error");

    while((len = fread(data, 1, TMP_BUF_SIZE, tmp_file)) > 0)
      if(fwrite(data, 1, len, fout) != len)
        die("write error [%s]", strerror(errno));

    fclose(tmp_file);
  }

  ctx_free(data);

  #undef TMP_BUF_SIZE
}
Esempio n. 7
0
int ctx_thread(int argc, char **argv)
{
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, false);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;
  size_t i;

  if(args.zero_link_counts && gpfiles->len == 0)
    cmd_print_usage("-0,--zero-paths without -p,--paths <in.ctp> has no meaning");

  // Check each path file only loads one colour
  gpaths_only_for_colour(gpfiles->b, gpfiles->len, 0);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, total_mem;
  size_t path_hash_mem, path_store_mem, path_mem;
  bool sep_path_list = (!args.use_new_paths && gpfiles->len > 0);

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  2 * args.nthreads; // Have traversed

  // false -> don't use mem_to_use to decide how many kmers to store in hash
  // since we need some of that memory for storing paths
  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        gfile->num_of_kmers,
                                        gfile->num_of_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t min_path_mem = 0;
  gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem);

  if(graph_mem + min_path_mem > args.memargs.mem_to_use) {
    char buf[50];
    die("Require at least %s memory", bytes_to_str(graph_mem+min_path_mem, 1, buf));
  }

  path_mem = args.memargs.mem_to_use - graph_mem;
  size_t pentry_hash_mem = sizeof(GPEntry)/0.7;
  size_t pentry_store_mem = sizeof(GPath) + 8 + // struct + sequence
                            1 + // in colour
                            sizeof(uint8_t) + // counts
                            sizeof(uint32_t); // kmer length

  size_t max_paths = path_mem / (pentry_store_mem + pentry_hash_mem);
  path_store_mem = max_paths * pentry_store_mem;
  path_hash_mem = max_paths * pentry_hash_mem;
  cmd_print_mem(path_hash_mem, "paths hash");
  cmd_print_mem(path_store_mem, "paths store");

  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Open output file
  //
  gzFile gzout = futil_gzopen_create(args.out_ctp_path, "w");

  status("Creating paths file: %s", futil_outpath_str(args.out_ctp_path));

  //
  // Allocate memory
  //
  dBGraph db_graph;
  size_t kmer_size = gfile->hdr.kmer_size;
  db_graph_alloc(&db_graph, kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Split path memory 2:1 between store and hash
  // Create a path store that tracks path counts
  gpath_store_alloc(&db_graph.gpstore,
                    db_graph.num_of_cols, db_graph.ht.capacity,
                    0, path_store_mem, true, sep_path_list);

  // Create path hash table for fast lookup
  gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, path_hash_mem);

  if(args.use_new_paths) {
    status("Using paths as they are added (risky)");
  } else {
    status("Not using new paths as they are added (safe)");
  }

  //
  // Start up workers to add paths to the graph
  //
  GenPathWorker *workers;
  workers = gen_paths_workers_alloc(args.nthreads, &db_graph);

  // Setup for loading graphs graph
  LoadingStats gstats;
  loading_stats_init(&gstats);

  // Path statistics
  LoadingStats *load_stats = gen_paths_get_stats(workers);
  CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers);

  // Load contig hist distribution
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load_contig_hist(gpfiles->b[i].json,
                                  gpfiles->b[i].fltr.path.b,
                                  file_filter_fromcol(&gpfiles->b[i].fltr, 0),
                                  &aln_stats->contig_histgrm);
  }

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false}; // already loaded paths

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, &gstats);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load existing paths
  for(i = 0; i < gpfiles->len; i++)
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);

  // zero link counts of already loaded links
  if(args.zero_link_counts) {
    status("Zeroing link counts for loaded links");
    gpath_set_zero_nseen(&db_graph.gpstore.gpset);
  }

  if(!args.use_new_paths)
    gpath_store_split_read_write(&db_graph.gpstore);

  // Deal with a set of files at once
  // Can have different numbers of inputs vs threads
  size_t start, end;
  for(start = 0; start < inputs->len; start += MAX_IO_THREADS)
  {
    end = MIN2(inputs->len, start+MAX_IO_THREADS);
    generate_paths(inputs->b+start, end-start, workers, args.nthreads);
  }

  // Print memory statistics
  gpath_hash_print_stats(&db_graph.gphash);
  gpath_store_print_stats(&db_graph.gpstore);

  correct_aln_dump_stats(aln_stats, load_stats,
                         args.dump_seq_sizes,
                         args.dump_frag_sizes,
                         db_graph.ht.num_kmers);

  // Don't need GPathHash anymore
  gpath_hash_dealloc(&db_graph.gphash);

  cJSON **hdrs = ctx_malloc(gpfiles->len * sizeof(cJSON*));
  for(i = 0; i < gpfiles->len; i++) hdrs[i] = gpfiles->b[i].json;

  size_t output_threads = MIN2(args.nthreads, MAX_IO_THREADS);

  // Generate a cJSON header for all inputs
  cJSON *thread_hdr = cJSON_CreateObject();
  cJSON *inputs_hdr = cJSON_CreateArray();
  cJSON_AddItemToObject(thread_hdr, "inputs", inputs_hdr);
  for(i = 0; i < inputs->len; i++)
    cJSON_AddItemToArray(inputs_hdr, correct_aln_input_json_hdr(&inputs->b[i]));

  // Write output file
  gpath_save(gzout, args.out_ctp_path, output_threads, true,
             "thread", thread_hdr, hdrs, gpfiles->len,
             &aln_stats->contig_histgrm, 1,
             &db_graph);

  gzclose(gzout);
  ctx_free(hdrs);

  // Optionally run path checks for debugging
  // gpath_checks_all_paths(&db_graph, args.nthreads);

  // ins_gap, err_gap no longer allocated after this line
  gen_paths_workers_dealloc(workers, args.nthreads);

  // Close and free input files etc.
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Esempio n. 8
0
int ctx_sort(int argc, char **argv)
{
  const char *out_path = NULL;
  struct MemArgs memargs = MEM_ARGS_INIT;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" sort -h` for help. Bad option: %s", argv[optind-1]);
      default: die("Bad option: [%c]: %s", c, cmd);
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, out_path ? "r" : "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  size_t num_kmers, memory;

  // Reading from a stream
  if(gfile.num_of_kmers < 0) {
    if(!memargs.num_kmers_set)
      die("If reading from a stream, must give -n <num_kmers>");
    num_kmers = memargs.num_kmers;
  }
  else num_kmers = gfile.num_of_kmers;

  // Open output path (if given)
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL;

  size_t i;
  size_t ncols = gfile.hdr.num_of_cols;
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*ncols;

  memory = (sizeof(char*) + kmer_mem) * num_kmers;

  char mem_str[50];
  bytes_to_str(memory, 1, mem_str);

  if(memory > memargs.mem_to_use)
    die("Require at least %s memory", mem_str);

  status("[memory] Total: %s", mem_str);

  char *mem = ctx_malloc(kmer_mem * num_kmers);
  char **kmers = ctx_malloc(num_kmers*sizeof(char*));

  // Read in whole file
  // if(graph_file_fseek(gfile, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed");
  size_t nkread = gfr_fread_bytes(&gfile, mem, num_kmers*kmer_mem);

  if(nkread != num_kmers*kmer_mem)
    die("Could only read %zu bytes [<%zu]", nkread, num_kmers*kmer_mem);

  // check we are at the end of the file
  char tmpc;
  if(gfr_fread_bytes(&gfile, &tmpc, 1) != 0) {
    die("More kmers in file than believed (kmers: %zu ncols: %zu).",
        num_kmers, ncols);
  }

  status("Read %zu kmers with %zu colour%s", num_kmers,
         ncols, util_plural_str(ncols));

  for(i = 0; i < num_kmers; i++)
    kmers[i] = mem + kmer_mem*i;

  sort_block(kmers, num_kmers);

  // Print
  if(out_path != NULL) {
    // saving to a different destination - write header
    graph_write_header(fout, &gfile.hdr);
  }
  else {
    // Directly manipulating gfile.fh here, using it to write later
    // Not doing any more reading
    if(fseek(gfile.fh, gfile.hdr_size, SEEK_SET) != 0) die("fseek failed");
    fout = gfile.fh;
  }

  for(i = 0; i < num_kmers; i++)
    if(fwrite(kmers[i], 1, kmer_mem, fout) != kmer_mem)
      die("Cannot write to file");

  if(out_path) fclose(fout);

  graph_file_close(&gfile);
  ctx_free(kmers);
  ctx_free(mem);

  return EXIT_SUCCESS;
}
Esempio n. 9
0
int ctx_index(int argc, char **argv)
{
  const char *out_path = NULL;
  size_t block_size = 0, block_kmers = 0;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'b':
        cmd_check(!block_kmers, cmd);
        block_kmers = cmd_size_nonzero(cmd, optarg);
        break;
      case 's':
        cmd_check(!block_size, cmd);
        block_size = cmd_size_nonzero(cmd, optarg);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  if(block_size && block_kmers)
    cmd_print_usage("Cannot use --block-kmers and --block-size together");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  // Open output file
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout;

  // Start
  size_t filencols = gfile.hdr.num_of_cols;
  size_t kmer_size = gfile.hdr.kmer_size;
  const char *path = file_filter_path(&gfile.fltr);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols;

  if(block_size) {
    block_kmers = block_size / kmer_mem;
  } else if(!block_size && !block_kmers) {
    block_size = 4 * ONE_MEGABYTE;
    block_kmers = block_size / kmer_mem;
  }

  // Update block-size
  block_size = block_kmers * kmer_mem;

  status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu",
         block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size);

  if(block_kmers == 0) die("Cannot set block_kmers to zero");

  // Print header
  fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout);

  BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO;
  BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO;
  Covg *covgs = ctx_malloc(ncols * sizeof(Covg));
  Edges *edges = ctx_malloc(ncols * sizeof(Edges));
  char bkmerstr[MAX_KMER_SIZE+1];

  size_t rem_block = block_size - kmer_mem; // block after first kmer
  char *tmp_mem = ctx_malloc(rem_block);

  // Read in file, print index
  size_t nblocks = 0;
  size_t bl_bytes = 0, bl_kmers = 0;
  size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0;

  while(1)
  {
    if(!graph_file_read(&gfile, &bkmer, covgs, edges)) {
      status("Read kmer failed"); break; }
    binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
    if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer))
      die("File is not sorted: %s [%s]", bkmerstr, path);
    // We've already read one kmer entry, read rest of block
    bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block);
    bl_kmers = 1 + bl_bytes / kmer_mem;
    fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n",
            bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr,
            bl_kmer_offset, bl_kmer_offset+bl_kmers);
    bl_byte_offset += bl_bytes;
    bl_kmer_offset += bl_kmers;
    nblocks++;
    if(bl_kmers < block_kmers) {
      status("last block %zu < %zu; %zu vs %zu",
             bl_kmers, block_kmers, bl_bytes, block_size);
      break;
    }
    prev_bkmer = bkmer;
  }

  ctx_free(covgs);
  ctx_free(edges);
  ctx_free(tmp_mem);

  // done
  char num_kmers_str[50], num_blocks_str[50];
  char block_mem_str[50], block_kmers_str[50];
  ulong_to_str(bl_kmer_offset, num_kmers_str);
  ulong_to_str(nblocks, num_blocks_str);
  bytes_to_str(block_size, 1, block_mem_str);
  ulong_to_str(block_kmers, block_kmers_str);

  status("Read %s kmers in %s block%s (block size %s / %s kmers)",
         num_kmers_str, num_blocks_str, util_plural_str(nblocks),
         block_mem_str, block_kmers_str);

  if(fout != stdout) status("Saved to %s", out_path);

  graph_file_close(&gfile);
  fclose(fout);

  return EXIT_SUCCESS;
}
Esempio n. 10
0
// GenPathWorker stores four buffers of size n
#define gworker_seq_buf(n) (binary_seq_mem(n)*4)

static void _gen_paths_worker_alloc(GenPathWorker *wrkr, dBGraph *db_graph)
{
  GenPathWorker tmp = {.db_graph = db_graph};

  correct_aln_worker_alloc(&tmp.corrector, true, db_graph);

  // Junction data
  // only fw arrays are malloc'd, rv point to fw
  tmp.junc_arrsize = INIT_BUFLEN;
  tmp.pck_fw = ctx_calloc(2, gworker_seq_buf(tmp.junc_arrsize));
  tmp.pck_rv = tmp.pck_fw + gworker_seq_buf(tmp.junc_arrsize);

  tmp.pos_fw = ctx_malloc(tmp.junc_arrsize * sizeof(*tmp.pos_fw) * 2);
  tmp.pos_rv = tmp.pos_fw + tmp.junc_arrsize;
  tmp.num_fw = tmp.num_rv = 0;

  memcpy(wrkr, &tmp, sizeof(GenPathWorker));
}

static void _gen_paths_worker_dealloc(GenPathWorker *wrkr)
{
  correct_aln_worker_dealloc(&wrkr->corrector);
  ctx_free(wrkr->pck_fw);
  ctx_free(wrkr->pos_fw);
}


GenPathWorker* gen_paths_workers_alloc(size_t n, dBGraph *graph)
Esempio n. 11
0
static BreakpointCaller* brkpt_callers_new(size_t num_callers,
                                           gzFile gzout,
                                           size_t min_ref_flank,
                                           size_t max_ref_flank,
                                           const KOGraph kograph,
                                           const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  const size_t ncols = db_graph->num_of_cols;
  BreakpointCaller *callers = ctx_malloc(num_callers * sizeof(BreakpointCaller));

  pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed");

  size_t *callid = ctx_calloc(1, sizeof(size_t));

  // Each colour in each caller can have a GraphCache path at once
  PathRefRun *path_ref_runs = ctx_calloc(num_callers*MAX_REFRUNS_PER_CALLER(ncols),
                                         sizeof(PathRefRun));

  size_t i;
  for(i = 0; i < num_callers; i++)
  {
    BreakpointCaller tmp = {.threadid = i,
                            .nthreads = num_callers,
                            .kograph = kograph,
                            .db_graph = db_graph,
                            .gzout = gzout,
                            .out_lock = out_lock,
                            .callid = callid,
                            .allele_refs = path_ref_runs,
                            .flank5p_refs = path_ref_runs+MAX_REFRUNS_PER_ORIENT(ncols),
                            .min_ref_nkmers = min_ref_flank,
                            .max_ref_nkmers = max_ref_flank};

    memcpy(&callers[i], &tmp, sizeof(BreakpointCaller));

    path_ref_runs += MAX_REFRUNS_PER_CALLER(ncols);

    db_node_buf_alloc(&callers[i].allelebuf, 1024);
    db_node_buf_alloc(&callers[i].flank5pbuf, 1024);
    kmer_run_buf_alloc(&callers[i].koruns_5p, 128);
    kmer_run_buf_alloc(&callers[i].koruns_5p_ended, 128);
    kmer_run_buf_alloc(&callers[i].koruns_3p, 128);
    kmer_run_buf_alloc(&callers[i].koruns_3p_ended, 128);
    kmer_run_buf_alloc(&callers[i].allele_run_buf, 128);
    kmer_run_buf_alloc(&callers[i].flank5p_run_buf, 128);
    graph_crawler_alloc(&callers[i].crawlers[0], db_graph);
    graph_crawler_alloc(&callers[i].crawlers[1], db_graph);
  }

  return callers;
}

static void brkpt_callers_destroy(BreakpointCaller *callers, size_t num_callers)
{
  size_t i;
  for(i = 0; i < num_callers; i++) {
    db_node_buf_dealloc(&callers[i].allelebuf);
    db_node_buf_dealloc(&callers[i].flank5pbuf);
    kmer_run_buf_dealloc(&callers[i].koruns_5p);
    kmer_run_buf_dealloc(&callers[i].koruns_5p_ended);
    kmer_run_buf_dealloc(&callers[i].koruns_3p);
    kmer_run_buf_dealloc(&callers[i].koruns_3p_ended);
    kmer_run_buf_dealloc(&callers[i].allele_run_buf);
    kmer_run_buf_dealloc(&callers[i].flank5p_run_buf);
    graph_crawler_dealloc(&callers[i].crawlers[0]);
    graph_crawler_dealloc(&callers[i].crawlers[1]);
  }
  pthread_mutex_destroy(callers[0].out_lock);
  ctx_free(callers[0].out_lock);
  ctx_free(callers[0].callid);
  ctx_free(callers[0].allele_refs);
  ctx_free(callers);
}
Esempio n. 12
0
BubbleCaller* bubble_callers_new(size_t num_callers,
                                 BubbleCallingPrefs prefs,
                                 gzFile gzout,
                                 const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  // Max usage is 4 * max_allele_len * cols
  size_t i;
  size_t max_path_len = MAX2(prefs.max_flank_len, prefs.max_allele_len);

  BubbleCaller *callers = ctx_malloc(num_callers * sizeof(BubbleCaller));

  pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed");

  size_t *num_bubbles_ptr = ctx_calloc(1, sizeof(size_t));

  for(i = 0; i < num_callers; i++)
  {
    BubbleCaller tmp = {.threadid = i, .nthreads = num_callers,
                        .haploid_seen = ctx_calloc(1+prefs.num_haploid, sizeof(bool)),
                        .num_bubbles_ptr = num_bubbles_ptr,
                        .prefs = prefs,
                        .db_graph = db_graph, .gzout = gzout,
                        .out_lock = out_lock};

    memcpy(&callers[i], &tmp, sizeof(BubbleCaller));

    // First two buffers don't actually need to grow
    db_node_buf_alloc(&callers[i].flank5p, prefs.max_flank_len);
    db_node_buf_alloc(&callers[i].pathbuf, max_path_len);

    graph_walker_alloc(&callers[i].wlk, db_graph);
    rpt_walker_alloc(&callers[i].rptwlk, db_graph->ht.capacity, 22); // 4MB

    graph_cache_alloc(&callers[i].cache, db_graph);
    cache_stepptr_buf_alloc(&callers[i].spp_forward, 1024);
    cache_stepptr_buf_alloc(&callers[i].spp_reverse, 1024);
    strbuf_alloc(&callers[i].output_buf, 2048);
  }

  return callers;
}

void bubble_callers_destroy(BubbleCaller *callers, size_t num_callers)
{
  ctx_assert(num_callers > 0);

  size_t i;
  for(i = 0; i < num_callers; i++)
  {
    ctx_free(callers[i].haploid_seen);

    db_node_buf_dealloc(&callers[i].flank5p);
    db_node_buf_dealloc(&callers[i].pathbuf);

    rpt_walker_dealloc(&callers[i].rptwlk);
    graph_walker_dealloc(&callers[i].wlk);

    graph_cache_dealloc(&callers[i].cache);
    cache_stepptr_buf_dealloc(&callers[i].spp_forward);
    cache_stepptr_buf_dealloc(&callers[i].spp_reverse);
    strbuf_dealloc(&callers[i].output_buf);
  }
  pthread_mutex_destroy(callers[0].out_lock);
  ctx_free(callers[0].out_lock);
  ctx_free(callers[0].num_bubbles_ptr);
  ctx_free(callers);
}
Esempio n. 13
0
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Open input file
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
      die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
      die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);
    // status("kmer: %s", kmerbuf.b);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    if(save && num_links)
    {
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    char *json_str = cJSON_Print(newhdr);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    gzputs(link_gz, "\n\n");
    gzputs(link_gz, ctp_explanation_comment);
    gzputs(link_gz, "\n");

    fseek(link_tmp_fh, 0, SEEK_SET);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    ctx_free(tmp);

    gzclose(link_gz);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  if(hist_fh && hist_fh != stdout) fclose(hist_fh);

  if(list)
  {
    fclose(list_fh);
  }

  if(plot)
  {
    fclose(plot_fh);
  }

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}
Esempio n. 14
0
int ctx_bubbles(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t max_allele_len = 0, max_flank_len = 0;
  bool remove_serial_bubbles = true;

  // List of haploid colours
  size_t *hapcols = NULL;
  int nhapcols = 0;
  char *hapcols_arg = NULL;

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'H': cmd_check(!hapcols_arg, cmd); hapcols_arg = optarg; break;
      case 'A': cmd_check(!max_allele_len, cmd); max_allele_len = cmd_uint32_nonzero(cmd, optarg); break;
      case 'F': cmd_check(!max_flank_len, cmd); max_flank_len = cmd_uint32_nonzero(cmd, optarg); break;
      case 'S': cmd_check(remove_serial_bubbles,cmd); remove_serial_bubbles = false; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;
  if(max_allele_len == 0) max_allele_len = DEFAULT_MAX_ALLELE;
  if(max_flank_len == 0) max_flank_len = DEFAULT_MAX_FLANK;

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  // Check graph + paths are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, -1);

  //
  // Check haploid colours are valid
  //
  if(hapcols_arg != NULL) {
    if((nhapcols = range_get_num(hapcols_arg, ncols)) < 0)
      die("Invalid haploid colour list: %s", hapcols_arg);

    hapcols = ctx_calloc(nhapcols, sizeof(hapcols[0]));
    if(range_parse_array(hapcols_arg, hapcols, ncols) < 0)
      die("Invalid haploid colour list: %s", hapcols_arg);
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, thread_mem;
  char thread_mem_str[100];

  // edges(1bytes) + kmer_paths(8bytes) + in_colour(1bit/col) +
  // visitedfw/rv(2bits/thread)

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 +
                  (gpfiles.len > 0 ? sizeof(GPath*)*8 : 0) +
                  ncols + 2*nthreads;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  // Thread memory
  thread_mem = roundup_bits2bytes(kmers_in_hash) * 2;
  bytes_to_str(thread_mem * nthreads, 1, thread_mem_str);
  status("[memory] (of which threads: %zu x %zu = %s)\n",
          nthreads, thread_mem, thread_mem_str);

  // Paths memory
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem+thread_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  size_t total_mem = graph_mem + thread_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  //
  // Open output file
  //
  gzFile gzout = futil_gzopen_create(out_path, "w");

  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem, false, &db_graph);

  //
  // Load graphs
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  // Load link files
  for(i = 0; i < gpfiles.len; i++)
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);

  // Create array of cJSON** from input files
  cJSON **hdrs = ctx_malloc(gpfiles.len * sizeof(cJSON*));
  for(i = 0; i < gpfiles.len; i++) hdrs[i] = gpfiles.b[i].json;

  // Now call variants
  BubbleCallingPrefs call_prefs = {.max_allele_len = max_allele_len,
                                   .max_flank_len = max_flank_len,
                                   .haploid_cols = hapcols,
                                   .nhaploid_cols = nhapcols,
                                   .remove_serial_bubbles = remove_serial_bubbles};

  invoke_bubble_caller(nthreads, &call_prefs,
                       gzout, out_path,
                       hdrs, gpfiles.len,
                       &db_graph);

  status("  saved to: %s\n", out_path);
  gzclose(gzout);
  ctx_free(hdrs);

  // Close input link files
  for(i = 0; i < gpfiles.len; i++)
    gpath_reader_close(&gpfiles.b[i]);
  gpfile_buf_dealloc(&gpfiles);

  ctx_free(hapcols);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Esempio n. 15
0
int ctx_vcfcov(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL, *out_type = NULL;

  uint32_t max_allele_len = 0, max_gt_vars = 0;
  char *ref_path = NULL;
  bool low_mem = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;
  size_t i;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break;
      case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break;
      case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break;
      case 'M': cmd_check(!low_mem, cmd); low_mem = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)");
  if(optind+2 > argc) cmd_print_usage("Require VCF and graph files");

  if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN;
  if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS;

  status("[vcfcov] max allele length: %u; max number of variants: %u",
         max_allele_len, max_gt_vars);

  // open ref
  // index fasta with: samtools faidx ref.fa
  faidx_t *fai = fai_load(ref_path);
  if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path);

  // Open input VCF file
  const char *vcf_path = argv[optind++];
  htsFile *vcffh = hts_open(vcf_path, "r");
  if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path);
  bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh);
  if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path);

  // Test we can close and reopen files
  if(low_mem) {
    if((vcffh = hts_open(vcf_path, "r")) == NULL)
      die("Cannot re-open VCF file: %s", vcf_path);
    if((vcfhdr = bcf_hdr_read(vcffh)) == NULL)
      die("Cannot re-read VCF header: %s", vcf_path);
  }

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  // Check graph + paths are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols;
  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        low_mem ? -1 : (int64_t)ctx_max_kmers,
                                        ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Open output file
  //
  // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *outfh = hts_open(out_path, modes_htslib[mode]);
  status("[vcfcov] Output format: %s", hsmodes_htslib[mode]);


  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_COVGS);

  //
  // Set up tag names
  //

  // *R => ref, *A => alt
  sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage
  sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size);

  // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK>
  // - K29_kcov is empirical kmer coverage
  // - K29_nkmers is the number of kmers in the sample
  // - mean_read_length is the mean read length in bases
  char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20];
  sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage
  sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size);
  sprintf(sample_rlk_tag, "mean_read_length");

  //
  // Load kmers if we are using --low-mem
  //

  VcfCovStats st;
  memset(&st, 0, sizeof(st));
  VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag,
                       .kcov_alt_tag = kcov_alt_tag,
                       .max_allele_len = max_allele_len,
                       .max_gt_vars = max_gt_vars,
                       .load_kmers_only = false};

  if(low_mem)
  {
    status("[vcfcov] Loading kmers from VCF+ref");

    prefs.load_kmers_only = true;
    vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai,
                NULL, &prefs, &st, &db_graph);

    // Close files
    hts_close(vcffh);
    bcf_hdr_destroy(vcfhdr);

    // Re-open files
    if((vcffh = hts_open(vcf_path, "r")) == NULL)
      die("Cannot re-open VCF file: %s", vcf_path);
    if((vcfhdr = bcf_hdr_read(vcffh)) == NULL)
      die("Cannot re-read VCF header: %s", vcf_path);

    prefs.load_kmers_only = false;
  }

  //
  // Load graphs
  //
  GraphLoadingStats gstats;
  memset(&gstats, 0, sizeof(gstats));

  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.must_exist_in_graph = low_mem;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, &gstats);
    graph_file_close(&gfiles[i]);
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  //
  // Set up VCF header / graph matchup
  //
  size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t));

  // Add samples to vcf header
  bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr);
  bcf_hrec_t *hrec;
  int sid;
  char hdrstr[200];

  for(i = 0; i < db_graph.num_of_cols; i++) {
    char *sname = db_graph.ginfo[i].sample_name.b;
    if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) {
      bcf_hdr_add_sample(outhdr, sname);
      sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname);
    }
    samplehdrids[i] = sid;

    // Add SAMPLE field
    hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE");

    if(hrec == NULL) {
      sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname,
              sample_kcov_tag,
              gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0,
              sample_nk_tag, gstats.nkmers[i],
              sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length);
      bcf_hdr_append(outhdr, hdrstr);
    }
    else {
      // mean kcovg
      sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]);
      vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr);
      // num kmers
      sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]);
      vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr);
      // mean read length in kmers
      sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length);
      vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr);
    }

    status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]);
  }

  // Add genotype format fields
  // One field per alternative allele

  sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer,"
          "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n",
          kcov_ref_tag, db_graph.kmer_size);
  bcf_hdr_append(outhdr, hdrstr);
  sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer,"
          "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n",
          kcov_alt_tag, db_graph.kmer_size);
  bcf_hdr_append(outhdr, hdrstr);

  bcf_hdr_set_version(outhdr, "VCFv4.2");

  // Add command string to header
  vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd());

  if(bcf_hdr_write(outfh, outhdr) != 0)
    die("Cannot write header to: %s", futil_outpath_str(out_path));

  status("[vcfcov] Reading %s and adding coverage", vcf_path);

  // Reset stats and get coverage
  memset(&st, 0, sizeof(st));

  vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai,
              samplehdrids, &prefs, &st, &db_graph);

  // Print statistics
  char ns0[50], ns1[50];
  status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0));
  status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0));
  status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0));
  status("[vcfcov] ALTs used: %s / %s (%.2f%%)",
         ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len,
         ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)",
         max_gt_vars, db_graph.kmer_size,
         ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)",
         ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0);

  status("[vcfcov] Saved to: %s\n", out_path);

  ctx_free(samplehdrids);
  graph_loading_stats_destroy(&gstats);

  bcf_hdr_destroy(vcfhdr);
  bcf_hdr_destroy(outhdr);
  hts_close(vcffh);
  hts_close(outfh);
  fai_destroy(fai);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}