예제 #1
0
void _construct_graph_with_paths(dBGraph *graph,
                                 size_t kmer_size, size_t ncols,
                                 char **seqs, size_t nseqs,
                                 CorrectAlnParam path_params)
{
  size_t i;
  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024);

  // Graph data
  graph->bktlocks = ctx_calloc(roundup_bits2bytes(graph->ht.num_of_buckets), 1);
  graph->col_edges = ctx_calloc(graph->ht.capacity * ncols, sizeof(Edges));
  graph->col_covgs = ctx_calloc(graph->ht.capacity * ncols, sizeof(Covg));
  graph->node_in_cols = ctx_calloc(roundup_bits2bytes(graph->ht.capacity) * ncols, 1);

  // Path data
  path_store_alloc(&graph->pstore, 1024, true, graph->ht.capacity, ncols);
  graph->pstore.kmer_locks = ctx_calloc(roundup_bits2bytes(graph->ht.capacity), 1);

  // Build graph
  for(i = 0; i < nseqs; i++)
    build_graph_from_str_mt(graph, 0, seqs[i], strlen(seqs[i]));

  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, graph, NULL);

  for(i = 0; i < nseqs; i++)
    gen_paths_from_str_mt(gen_path_wrkr, seqs[i], path_params);

  gen_paths_workers_dealloc(gen_path_wrkr, 1);
}
예제 #2
0
void graph_crawler_alloc(GraphCrawler *crawler, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);

  size_t ncols = db_graph->num_of_cols;

  int *col_paths = ctx_calloc(ncols, sizeof(int));
  GCMultiColPath *multicol_paths = ctx_calloc(ncols, sizeof(GCMultiColPath));
  GCUniColPath *unicol_paths = ctx_calloc(ncols, sizeof(GCUniColPath));
  uint32_t *col_list = ctx_calloc(ncols, sizeof(uint32_t));

  GraphCrawler tmp = {.num_paths = 0,
                      .col_paths = col_paths,
                      .multicol_paths = multicol_paths,
                      .unicol_paths = unicol_paths,
                      .col_list = col_list};

  memcpy(crawler, &tmp, sizeof(GraphCrawler));

  graph_cache_alloc(&crawler->cache, db_graph);
  graph_walker_alloc(&crawler->wlk, db_graph);
  rpt_walker_alloc(&crawler->rptwlk, db_graph->ht.capacity, 22); // 4MB
}

void graph_crawler_dealloc(GraphCrawler *crawler)
{
  ctx_free(crawler->col_paths);
  ctx_free(crawler->multicol_paths);
  ctx_free(crawler->unicol_paths);
  ctx_free(crawler->col_list);
  graph_cache_dealloc(&crawler->cache);
  graph_walker_dealloc(&crawler->wlk);
  rpt_walker_dealloc(&crawler->rptwlk);
  memset(crawler, 0, sizeof(GraphCrawler)); // reset
}
예제 #3
0
CallDecomp* call_decomp_init(htsFile *vcffh, bcf_hdr_t *vcfhdr)
{
  CallDecomp *dc = ctx_calloc(1, sizeof(CallDecomp));
  dc->nw_aligner = needleman_wunsch_new();
  dc->aln = alignment_create(1024);
  dc->scoring = ctx_calloc(1, sizeof(dc->scoring[0]));
  scoring_system_default(dc->scoring);
  dc->vcffh = vcffh;
  dc->vcfhdr = vcfhdr;
  dc->v = bcf_init();
  strbuf_alloc(&dc->sbuf, 256);
  return dc;
}
예제 #4
0
size_t infer_edges(size_t nthreads, bool add_all_edges, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);
  ctx_assert(db_graph->col_edges != NULL);

  size_t i, num_nodes_modified = 0;
  status("[inferedges] Processing stream");

  InferEdgesWorker *wrkrs = ctx_calloc(nthreads, sizeof(InferEdgesWorker));

  for(i = 0; i < nthreads; i++) {
    InferEdgesWorker tmp = {.threadid = i, .nthreads = nthreads,
                            .add_all_edges = add_all_edges,
                            .db_graph = db_graph,
                            .num_nodes_modified = 0};
    memcpy(&wrkrs[i], &tmp, sizeof(InferEdgesWorker));
  }

  util_run_threads(wrkrs, nthreads, sizeof(InferEdgesWorker),
                   nthreads, infer_edges_worker);

  // Sum up nodes modified
  for(i = 0; i < nthreads; i++)
    num_nodes_modified += wrkrs[i].num_nodes_modified;

  ctx_free(wrkrs);

  return num_nodes_modified;
}
예제 #5
0
// If num_paths != 0, we ensure at least num_paths capacity
// @split_linked_lists whether you intend to have traverse linked list and
//                     all linked list separate
void gpath_store_alloc(GPathStore *gpstore, size_t ncols, size_t graph_capacity,
                       size_t num_paths, size_t mem,
                       bool count_nseen, bool split_linked_lists)
{
  memset(gpstore, 0, sizeof(*gpstore));

  size_t store_mem = graph_capacity*sizeof(GPath*) * (split_linked_lists ? 2 : 1);
  if(store_mem > mem)
    die("Need at least %zu bytes (only got %zu bytes)", store_mem, mem);

  size_t gpset_mem = mem - store_mem;

  // Don't allow resizing - this allows use to the thread safe
  // and control memory usage (die when we fill up allowed memory)
  if(num_paths)
    gpath_set_alloc2(&gpstore->gpset, ncols, num_paths, gpset_mem, false, count_nseen);
  else
    gpath_set_alloc(&gpstore->gpset, ncols, gpset_mem, false, count_nseen);

  gpstore->graph_capacity = graph_capacity;

  // paths_traverse is always a subset of paths_all
  gpstore->paths_all = ctx_calloc(graph_capacity, sizeof(GPath*));
  gpstore->paths_traverse = gpstore->paths_all;
}
예제 #6
0
static void run_exp_abc(const dBGraph *db_graph, bool prime_AB,
                        size_t nthreads, size_t num_repeats,
                        size_t max_AB_dist, bool print_failed_contigs)
{
  ExpABCWorker *wrkrs = ctx_calloc(nthreads, sizeof(ExpABCWorker));
  size_t i, j;

  if(max_AB_dist == 0) max_AB_dist = SIZE_MAX;

  for(i = 0; i < nthreads; i++) {
    wrkrs[i].colour = 0;
    wrkrs[i].nthreads = nthreads;
    wrkrs[i].db_graph = db_graph;
    wrkrs[i].prime_AB = prime_AB;
    wrkrs[i].num_limit = num_repeats / nthreads;
    wrkrs[i].max_AB_dist = max_AB_dist;
    wrkrs[i].print_failed_contigs = print_failed_contigs;
    db_node_buf_alloc(&wrkrs[i].nbuf, 1024);
    graph_walker_alloc(&wrkrs[i].gwlk, db_graph);
    rpt_walker_alloc(&wrkrs[i].rptwlk, db_graph->ht.capacity, 22); // 4MB
  }

  util_run_threads(wrkrs, nthreads, sizeof(ExpABCWorker),
                   nthreads, run_exp_abc_thread);

  // Merge results
  size_t num_tests = 0, results[NUM_RESULT_VALUES] = {0};
  size_t ab_fail_state[GRPHWLK_NUM_STATES] = {0};
  size_t bc_fail_state[GRPHWLK_NUM_STATES] = {0};

  for(i = 0; i < nthreads; i++) {
    num_tests += wrkrs[i].num_tests;
    for(j = 0; j < NUM_RESULT_VALUES; j++) results[j] += wrkrs[i].results[j];
    for(j = 0; j < GRPHWLK_NUM_STATES; j++) ab_fail_state[j] += wrkrs[i].ab_fail_state[j];
    for(j = 0; j < GRPHWLK_NUM_STATES; j++) bc_fail_state[j] += wrkrs[i].bc_fail_state[j];
    db_node_buf_dealloc(&wrkrs[i].nbuf);
    graph_walker_dealloc(&wrkrs[i].gwlk);
    rpt_walker_dealloc(&wrkrs[i].rptwlk);
  }

  // Print results
  char nrunstr[50];
  ulong_to_str(num_tests, nrunstr);
  status("Ran %s tests with %zu threads", nrunstr, nthreads);

  const char *titles[] = {"RES_ABC_SUCCESS", "RES_AB_WRONG",
                          "RES_AB_FAILED",   "RES_BC_WRONG",
                          "RES_BC_FAILED",   "RES_BC_OVERSHOT",
                          "RES_LOST_IN_RPT", "RES_NO_TRAVERSAL"};

  util_print_nums(titles, results, NUM_RESULT_VALUES, 30);

  status("AB_FAILED:");
  graph_step_print_state_hist(ab_fail_state);
  status("BC_FAILED:");
  graph_step_print_state_hist(bc_fail_state);

  ctx_free(wrkrs);
}
예제 #7
0
static void supernode_cleaner_alloc(SupernodeCleaner *cl, size_t nthreads,
                                    size_t covg_threshold, size_t min_keep_tip,
                                    uint8_t *keep_flags,
                                    const dBGraph *db_graph)
{
  size_t i;
  CovgBuffer *cbufs = ctx_calloc(nthreads, sizeof(CovgBuffer));
  for(i = 0; i < nthreads; i++)
    covg_buf_alloc(&cbufs[i], 1024);

  uint64_t *covg_hist_init, *covg_hist_cleaned;
  uint64_t *mean_covg_hist_init, *mean_covg_hist_cleaned;
  uint64_t *len_hist_init, *len_hist_cleaned;

  covg_hist_init          = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  covg_hist_cleaned       = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  mean_covg_hist_init     = ctx_calloc(DUMP_MEAN_COVG_ARRSIZE, sizeof(uint64_t));
  mean_covg_hist_cleaned  = ctx_calloc(DUMP_MEAN_COVG_ARRSIZE, sizeof(uint64_t));
  len_hist_init           = ctx_calloc(DUMP_LEN_ARRSIZE,  sizeof(uint64_t));
  len_hist_cleaned        = ctx_calloc(DUMP_LEN_ARRSIZE,  sizeof(uint64_t));

  SupernodeCleaner tmp = {.nthreads = nthreads,
                          .covg_threshold = covg_threshold,
                          .min_keep_tip = min_keep_tip,
                          .cbufs = cbufs,
                          .covg_hist_init    = covg_hist_init,
                          .covg_hist_cleaned = covg_hist_cleaned,
                          .covg_arrsize = DUMP_COVG_ARRSIZE,
                          .mean_covg_hist_init = mean_covg_hist_init,
                          .mean_covg_hist_cleaned = mean_covg_hist_cleaned,
                          .mean_covg_arrsize = DUMP_MEAN_COVG_ARRSIZE,
                          .len_hist_init     = len_hist_init,
                          .len_hist_cleaned  = len_hist_cleaned,
                          .len_arrsize = DUMP_LEN_ARRSIZE,
                          .keep_flags = keep_flags,
                          .num_tips = 0,
                          .num_low_covg_snodes = 0,
                          .num_tip_and_low_snodes = 0,
                          .num_tip_kmers = 0,
                          .num_low_covg_snode_kmers = 0,
                          .num_tip_and_low_snode_kmers = 0,
                          .db_graph = db_graph};

  memcpy(cl, &tmp, sizeof(SupernodeCleaner));
}

static void supernode_cleaner_dealloc(SupernodeCleaner *cl)
{
  size_t i;
  for(i = 0; i < cl->nthreads; i++)
    covg_buf_dealloc(&cl->cbufs[i]);
  ctx_free(cl->cbufs);
  ctx_free(cl->covg_hist_init);
  ctx_free(cl->covg_hist_cleaned);
  ctx_free(cl->mean_covg_hist_init);
  ctx_free(cl->mean_covg_hist_cleaned);
  ctx_free(cl->len_hist_init);
  ctx_free(cl->len_hist_cleaned);
  memset(cl, 0, sizeof(SupernodeCleaner));
}
예제 #8
0
static void unitig_cleaner_alloc(UnitigCleaner *cl, size_t nthreads,
                                 size_t covg_threshold, size_t min_keep_tip,
                                 uint8_t *keep_flags,
                                 const dBGraph *db_graph)
{
  size_t i;
  CovgBuffer *cbufs = ctx_calloc(nthreads, sizeof(CovgBuffer));
  for(i = 0; i < nthreads; i++)
    covg_buf_alloc(&cbufs[i], 1024);

  uint64_t *kmer_covgs_init, *kmer_covgs_clean;
  uint64_t *unitig_covgs_init, *unitig_covg_clean;
  uint64_t *len_hist_init, *len_hist_clean;

  kmer_covgs_init      = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  kmer_covgs_clean    = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  unitig_covgs_init    = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  unitig_covg_clean  = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  len_hist_init        = ctx_calloc(DUMP_LEN_ARRSIZE,  sizeof(uint64_t));
  len_hist_clean     = ctx_calloc(DUMP_LEN_ARRSIZE,  sizeof(uint64_t));

  UnitigCleaner tmp = {.nthreads = nthreads,
                       .covg_threshold = covg_threshold,
                       .min_keep_tip = min_keep_tip,
                       .cbufs = cbufs,
                       .kmer_covgs_init   = kmer_covgs_init,
                       .kmer_covgs_clean  = kmer_covgs_clean,
                       .unitig_covgs_init = unitig_covgs_init,
                       .unitig_covg_clean = unitig_covg_clean,
                       .covg_arrsize      = DUMP_COVG_ARRSIZE,
                       .len_hist_init   = len_hist_init,
                       .len_hist_clean  = len_hist_clean,
                       .len_arrsize     = DUMP_LEN_ARRSIZE,
                       .keep_flags = keep_flags,
                       .num_tips = 0,
                       .num_low_covg_snodes = 0,
                       .num_tip_and_low_snodes = 0,
                       .num_tip_kmers = 0,
                       .num_low_covg_snode_kmers = 0,
                       .num_tip_and_low_snode_kmers = 0,
                       .db_graph = db_graph};

  memcpy(cl, &tmp, sizeof(UnitigCleaner));
}

static void unitig_cleaner_dealloc(UnitigCleaner *cl)
{
  size_t i;
  for(i = 0; i < cl->nthreads; i++)
    covg_buf_dealloc(&cl->cbufs[i]);
  ctx_free(cl->cbufs);
  ctx_free(cl->kmer_covgs_init);
  ctx_free(cl->kmer_covgs_clean);
  ctx_free(cl->unitig_covgs_init);
  ctx_free(cl->unitig_covg_clean);
  ctx_free(cl->len_hist_init);
  ctx_free(cl->len_hist_clean);
  memset(cl, 0, sizeof(UnitigCleaner));
}
예제 #9
0
void hash_table_alloc(HashTable *ht, uint64_t req_capacity)
{
  uint64_t num_of_buckets, capacity;
  uint8_t bucket_size;

  capacity = hash_table_cap(req_capacity, &num_of_buckets, &bucket_size);
  uint_fast32_t hash_mask = (uint_fast32_t)(num_of_buckets - 1);

  size_t mem = capacity * sizeof(BinaryKmer) +
               num_of_buckets * sizeof(uint8_t[2]);

  char num_bkts_str[100], bkt_size_str[100], cap_str[100], mem_str[100];
  ulong_to_str(num_of_buckets, num_bkts_str);
  ulong_to_str(bucket_size, bkt_size_str);
  ulong_to_str(capacity, cap_str);
  bytes_to_str(mem, 1, mem_str);
  status("[hasht] Allocating table with %s entries, using %s", cap_str, mem_str);
  status("[hasht]  number of buckets: %s, bucket size: %s", num_bkts_str, bkt_size_str);

  // calloc is required for bucket_data to set the first element of each bucket
  // to the 0th pos
  BinaryKmer *table = ctx_calloc(capacity, sizeof(BinaryKmer));
  uint8_t (*const buckets)[2] = ctx_calloc(num_of_buckets, sizeof(uint8_t[2]));

  HashTable data = {
    .table = table,
    .num_of_buckets = num_of_buckets,
    .hash_mask = hash_mask,
    .bucket_size = bucket_size,
    .capacity = capacity,
    .buckets = buckets,
    .num_kmers = 0,
    .collisions = {0},
    .seed = rand()};

  memcpy(ht, &data, sizeof(data));
}

void hash_table_dealloc(HashTable *hash_table)
{
  ctx_free(hash_table->table);
  ctx_free(hash_table->buckets);
}
예제 #10
0
// Does not allocate ugraph
void unitig_printer_init(UnitigPrinter *printer, const dBGraph *db_graph,
                         size_t nthreads, UnitigSyntax syntax, FILE *fout)
{
  memset(printer, 0, sizeof(UnitigPrinter));
  printer->db_graph = db_graph;
  printer->syntax = syntax;
  printer->fout = fout;
  printer->nthreads = nthreads;
  printer->num_unitigs = 0;
  printer->visited = ctx_calloc(roundup_bits2bytes(db_graph->ht.capacity), 1);
  if(pthread_mutex_init(&printer->outlock, NULL) != 0) die("Mutex init failed");
}
예제 #11
0
/**
 * Save paths to a file.
 * @param gzout         gzFile to write to
 * @param path          path of output file
 * @param save_path_seq if true, save seq= and juncpos= for links, requires
 *                      exactly one colour in the graph
 * @param hdrs is array of JSON headers of input files
 */
void gpath_save(gzFile gzout, const char *path,
                size_t nthreads, bool save_path_seq,
                const char *cmdstr, cJSON *cmdhdr,
                cJSON **hdrs, size_t nhdrs,
                const ZeroSizeBuffer *contig_hists, size_t ncols,
                dBGraph *db_graph)
{
  ctx_assert(nthreads > 0);
  ctx_assert(gpath_set_has_nseen(&db_graph->gpstore.gpset));
  ctx_assert(ncols == db_graph->gpstore.gpset.ncols);
  ctx_assert(!save_path_seq || db_graph->num_of_cols == 1); // save_path => 1 colour

  char npaths_str[50];
  ulong_to_str(db_graph->gpstore.num_paths, npaths_str);

  status("Saving %s paths to: %s", npaths_str, path);
  status("  using %zu threads", nthreads);

  // Write header
  cJSON *json = gpath_save_mkhdr(path, cmdstr, cmdhdr, hdrs, nhdrs,
                                 contig_hists, ncols, db_graph);
  json_hdr_gzprint(json, gzout);
  cJSON_Delete(json);

  // Print comments about the format
  gzputs(gzout, ctp_explanation_comment);

  // Multithreaded
  GPathSaver *wrkrs = ctx_calloc(nthreads, sizeof(GPathSaver));
  pthread_mutex_t outlock;
  size_t i;

  if(pthread_mutex_init(&outlock, NULL) != 0) die("Mutex init failed");

  for(i = 0; i < nthreads; i++) {
    wrkrs[i] = (GPathSaver){.threadid = i,
                            .nthreads = nthreads,
                            .save_seq = save_path_seq,
                            .gzout = gzout,
                            .outlock = &outlock,
                            .db_graph = db_graph};
  }

  // Iterate over kmers writing paths
  util_run_threads(wrkrs, nthreads, sizeof(*wrkrs), nthreads, gpath_save_thread);

  pthread_mutex_destroy(&outlock);
  ctx_free(wrkrs);

  status("[GPathSave] Graph paths saved to %s", path);
}
예제 #12
0
void gpath_store_split_read_write(GPathStore *gpstore)
{
  if(gpstore->paths_traverse == gpstore->paths_all)
  {
    status("[GPathStore] Creating separate read/write GraphPath linked lists");
    if(gpstore->num_paths == 0)
    {
      gpstore->paths_traverse = NULL;
    }
    else
    {
      // Copy current paths over to path set to be updated
      status("[GPathStore]   (allocating new linked lists)");
      size_t mem = gpstore->graph_capacity * sizeof(GPath*);
      gpstore->paths_traverse = ctx_calloc(gpstore->graph_capacity, sizeof(GPath*));
      memcpy(gpstore->paths_traverse, gpstore->paths_all, mem);
    }
  }
}
예제 #13
0
static void pull_out_supernodes(const char **seq, const char **ans, size_t n,
                                const dBGraph *graph)
{
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 1024);

  // 1. Check pulling out supernodes works for iterating over the graph
  uint64_t *visited;
  visited = ctx_calloc(roundup_bits2words64(graph->ht.capacity), 8);
  HASH_ITERATE(&graph->ht, supernode_from_kmer,
               &nbuf, visited, graph, ans, n);
  ctx_free(visited);

  // 2. Check pulling out supernodes works when we iterate over inputs
  size_t i, j, len;
  dBNode node;
  char tmpstr[SNODEBUF];

  for(i = 0; i < n; i++) {
    len = strlen(seq[i]);
    for(j = 0; j+graph->kmer_size <= len; j++)
    {
      // Find node
      node = db_graph_find_str(graph, seq[i]+j);
      TASSERT(node.key != HASH_NOT_FOUND);

      // Fetch supernode
      db_node_buf_reset(&nbuf);
      supernode_find(node.key, &nbuf, graph);
      supernode_normalise(nbuf.b, nbuf.len, graph);

      // Compare
      TASSERT(nbuf.len < SNODEBUF);
      db_nodes_to_str(nbuf.b, nbuf.len, graph, tmpstr);
      if(strcmp(tmpstr, ans[i]) != 0) {
        test_status("Got: %s from ans[i]:%s\n", tmpstr, ans[i]);
      }
      TASSERT(strcmp(tmpstr, ans[i]) == 0);
    }
  }

  db_node_buf_dealloc(&nbuf);
}
예제 #14
0
int ctx_correct(int argc, char **argv)
{
  size_t i, j;
  struct ReadThreadCmdArgs args = READ_THREAD_CMD_ARGS_INIT;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, true);

  GraphFileReader *gfile = &args.gfile;
  PathFileBuffer *pfiles = &args.pfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;
  size_t ctx_total_cols = gfile->hdr.num_of_cols;
  size_t ctx_num_kmers = gfile->num_of_kmers;

  if(args.colour > ctx_total_cols)
    cmd_print_usage("-c %zu is too big [> %zu]", args.colour, ctx_total_cols);

  size_t ctp_usedcols = 0;
  for(i = 0; i < pfiles->len; i++) {
    if(!file_filter_iscolloaded(&pfiles->data[i].fltr, args.colour)) {
      cmd_print_usage("Path file doesn't load into colour %zu: %s",
                      args.colour, pfiles->data[i].fltr.orig_path.buff);
    }
    ctp_usedcols = MAX2(ctp_usedcols, path_file_usedcols(&pfiles->data[i]));
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of noreseed
  bits_per_kmer = sizeof(Edges)*8 + ctx_num_kmers + sizeof(uint64_t)*8;
  kmers_in_hash = cmd_get_kmers_in_hash2(args.memargs.mem_to_use,
                                         args.memargs.mem_to_use_set,
                                         args.memargs.num_kmers,
                                         args.memargs.num_kmers_set,
                                         bits_per_kmer,
                                         ctx_num_kmers, ctx_num_kmers,
                                         false, &graph_mem);

  // Paths memory
  path_mem = path_files_mem_required(pfiles->data, pfiles->len, false, false,
                                     ctp_usedcols, 0);
  cmd_print_mem(path_mem, "paths");

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Check we can read all output files
  //
  // Open output files
  SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput));
  bool output_files_exist = false;

  for(i = 0; i < inputs->len; i++)
  {
    CorrectAlnInput *input = &inputs->data[i];
    input->crt_params.ctxcol = input->crt_params.ctpcol = args.colour;
    SeqOutput *output = &outputs[i];
    seq_output_alloc(output);
    seq_output_set_paths(output, input->out_base,
                         async_task_pe_output(&input->files));
    input->output = output;
    // output check prints warnings and returns true if errors
    output_files_exist |= seq_output_files_exist_check(output);
  }

  // Abandon if some of the output files already exist
  if(output_files_exist) die("Output files already exist");

  // Attempt to open all files
  for(i = 0; i < inputs->len && seq_output_open(&outputs[i]); i++) {}

  // Check if something went wrong - if so remove all output files
  if(i < inputs->len) {
    for(j = 0; j < i; j++) seq_output_delete(&outputs[i]);
    die("Couldn't open output files");
  }

  //
  // Allocate memory
  //

  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ctx_total_cols, 1, kmers_in_hash);

  size_t bytes_per_col = roundup_bits2bytes(db_graph.ht.capacity);

  db_graph.col_edges = ctx_calloc(db_graph.ht.capacity, sizeof(Edges));
  db_graph.node_in_cols = ctx_calloc(bytes_per_col * ctx_total_cols, 1);

  // Paths
  path_store_alloc(&db_graph.pstore, path_mem, false,
                   db_graph.ht.capacity, ctp_usedcols);

  //
  // Load Graph and Path files
  //
  LoadingStats gstats = LOAD_STATS_INIT_MACRO;
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = true};

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, &gstats);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load path files (does nothing if num_fpiles == 0)
  paths_format_merge(pfiles->data, pfiles->len, false, false,
                     args.num_of_threads, &db_graph);

  //
  // Run alignment
  //
  correct_reads(args.num_of_threads, MAX_IO_THREADS,
                inputs->data, inputs->len,
                &db_graph);

  // Close and free output files
  for(i = 0; i < inputs->len; i++) seq_output_dealloc(&outputs[i]);
  ctx_free(outputs);

  read_thread_args_dealloc(&args);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #15
0
// Returns 0 on success, otherwise != 0
int ctx_unitigs(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  UnitigSyntax syntax = PRINT_FASTA;
  bool dot_use_points = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'F': cmd_check(!syntax, cmd); syntax = PRINT_FASTA; break;
      case 'g': cmd_check(!syntax, cmd); syntax = PRINT_GFA; break;
      case 'd': cmd_check(!syntax, cmd); syntax = PRINT_DOT; break;
      case 'P': cmd_check(!dot_use_points, cmd); dot_use_points = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" unitigs -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(dot_use_points && syntax == PRINT_FASTA)
    cmd_print_usage("--point is only for use with --dot");

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage(NULL);

  size_t i, num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  if(dot_use_points && syntax != PRINT_DOT)
    cmd_print_usage("--points only valid with --graphviz / --dot");

  ctx_assert(num_gfiles > 0);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(gfile_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + 1;
  if(syntax != PRINT_FASTA) bits_per_kmer += sizeof(UnitigEnd) * 8;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  status("Output in %s format to %s\n", syntax_strs[syntax],
         futil_outpath_str(out_path));

  //
  // Open output file
  //

  // Print to stdout unless --out <out> is specified
  FILE *fout = futil_fopen_create(out_path, "w");

  //
  // Allocate memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES);

  UnitigPrinter printer;
  unitig_printer_init(&printer, &db_graph, nthreads, syntax, fout);

  if(syntax == PRINT_DOT || syntax == PRINT_GFA)
    unitig_graph_alloc(&printer.ugraph, &db_graph);

  // Load graphs
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .empty_colours = false};

  for(i = 0; i < num_gfiles; i++) {
    file_filter_flatten(&gfiles[i].fltr, 0);
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  switch(syntax)
  {
    case PRINT_FASTA:
      status("Printing unitgs in FASTA using %zu threads", nthreads);
      supernodes_iterate(nthreads, printer.visited, &db_graph,
                         print_unitig_fasta, &printer);
      break;
    case PRINT_GFA:
      print_gfa_syntax(&printer);
      break;
    case PRINT_DOT:
      print_dot_syntax(&printer, dot_use_points);
      break;
    default:
      die("Invalid print syntax: %i", syntax);
  }

  char num_unitigs_str[50];
  ulong_to_str(printer.num_unitigs, num_unitigs_str);
  status("Dumped %s unitigs\n", num_unitigs_str);

  fclose(fout);

  unitig_printer_destroy(&printer);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #16
0
// If there is no 'into' filter (e.g. 0,1:in.ctx 0,1 is 'into filter'),
// load into `into_offset..into_offset+N-1`
void file_filter_set_cols(FileFilter *fltr, size_t filencols, size_t into_offset)
{
  size_t i;
  const char *path_start_c, *path_end_c;
  file_filter_deconstruct_path(fltr->input.b, &path_start_c, &path_end_c);

  // This is a hack to get non-const pointer
  size_t offset_start = path_start_c - fltr->input.b;
  size_t offset_end = path_end_c - fltr->input.b;
  char *path_start = fltr->input.b + offset_start;
  char *path_end = fltr->input.b + offset_end;

  char *from_fltr = (*path_end == ':' ? path_end+1 : NULL);
  char *into_fltr = (path_start > fltr->input.b ? fltr->input.b : NULL);

  size_t ncols;

  if(from_fltr) {
    int s = range_get_num(from_fltr, filencols-1);
    if(s < 0)
      die("Invalid filter path: %s (file cols: %zu)", fltr->input.b, filencols);
    ncols = s;
  } else {
    ncols = filencols;
  }

  if(into_fltr) {
    *(path_start-1) = '\0';
    int s = range_get_num(into_fltr, SIZE_MAX);
    *(path_start-1) = ':';
    if(s < 0 || (s != 1 && (size_t)s != ncols))
      die("Invalid filter path: %s (s:%i ncols:%zu)", fltr->input.b, s, ncols);
  }

  fltr->filencols = filencols;
  filter_buf_capacity(&fltr->filter, ncols);
  file_filter_num(fltr) = ncols;

  size_t *tmp = ctx_calloc(ncols, sizeof(size_t));

  if(from_fltr)
  {
    if(range_parse_array(from_fltr, tmp, filencols-1) == -1)
      die("Invalid filter path: %s", fltr->input.b);
    for(i = 0; i < ncols; i++)
      file_filter_fromcol(fltr, i) = tmp[i];
  }
  else {
    for(i = 0; i < ncols; i++)
      file_filter_fromcol(fltr, i) = i;
  }

  if(into_fltr)
  {
    *(path_start-1) = '\0';
    int s = range_parse_array_fill(into_fltr, tmp, SIZE_MAX, ncols);
    *(path_start-1) = ':';
    if(s < 0 || (size_t)s != ncols)
      die("Invalid filter path: %s (s:%i ncols:%zu)", fltr->input.b, s, ncols);
    for(i = 0; i < ncols; i++)
      file_filter_intocol(fltr, i) = tmp[i];
  }
  else {
    for(i = 0; i < ncols; i++)
      file_filter_intocol(fltr, i) = into_offset + i;
  }

  ctx_free(tmp);

  file_filter_update(fltr);

  // for(i = 0; i < file_filter_num(fltr); i++)
  //   status("%u -> %u", file_filter_fromcol(fltr,i), file_filter_intocol(fltr,i));
}
예제 #17
0
static void parse_entries(gzFile gzin, FILE *fout)
{
  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  ChromPosBuffer chrposbuf;
  chrompos_buf_alloc(&chrposbuf, 32);

  StrBuf tmpbuf, flank3pbuf;
  strbuf_alloc(&tmpbuf, 1024);
  strbuf_alloc(&flank3pbuf, 1024);

  const char *flank5p, *flank3p;
  size_t flank5p_len, flank3p_len;
  size_t cpy_flnk_5p, cpy_flnk_3p;

  const read_t *chrom = NULL;
  size_t ref_start = 0, ref_end = 0;
  bool mapped = false, fw_strand = false;

  const char **genotypes = NULL;

  if(!input_bubble_format)
    genotypes = ctx_calloc(num_samples, sizeof(char*));

  for(; call_file_read(gzin, input_path, &centry); num_entries_read++)
  {
    size_t nlines = call_file_num_lines(&centry);
    ctx_assert2(!(nlines&1) && nlines >= 6, "Too few lines: %zu", nlines);

    flank5p = call_file_get_line(&centry,1);
    flank5p_len = call_file_line_len(&centry,1);
    cpy_flnk_5p = cpy_flnk_3p = 0;

    // Read a corresponding SAM entry
    if(input_bubble_format)
    {
      // Trim down alleles, add to 3p flank
      bubble_trim_alleles(&centry, &flank3pbuf);
      flank3p = flank3pbuf.b;
      flank3p_len = flank3pbuf.end;

      mapped = sam_fetch_coords(&centry, flank5p, flank5p_len, flank3p, flank3p_len,
                                &cpy_flnk_5p, &cpy_flnk_3p,
                                &chrom, &ref_start, &ref_end, &fw_strand);
    }
    else {
      flank3p = call_file_get_line(&centry, 3);
      flank3p_len = call_file_line_len(&centry, 3);

      mapped = brkpnt_fetch_coords(&centry, &chrposbuf,
                                   &chrom, &ref_start, &ref_end, &fw_strand,
                                   &cpy_flnk_5p, &cpy_flnk_3p);
    }

    if(mapped)
    {
      // Get call id
      const char *hdrline = call_file_get_line(&centry, 0);
      char callid[100];
      int r = get_callid_str(hdrline, input_bubble_format, callid, sizeof(callid));
      if(r == -1) die("Poorly formatted: %s", hdrline);
      if(r == -2) die("Call id string is too long: %s", hdrline);

      align_entry(&centry, callid, flank5p, flank5p_len, flank3p, flank3p_len,
                  cpy_flnk_5p, cpy_flnk_3p,
                  chrom, ref_start, ref_end, fw_strand,
                  &tmpbuf, genotypes,
                  fout);
    }
  }

  ctx_free(genotypes);
  call_file_entry_dealloc(&centry);
  chrompos_buf_dealloc(&chrposbuf);
  strbuf_dealloc(&tmpbuf);
  strbuf_dealloc(&flank3pbuf);
}
예제 #18
0
int ctx_pop_bubbles(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  int32_t max_covg  = -1; // max mean coverage to remove <=0 => ignore
  int32_t max_klen  = -1; // max length (kmers) to remove <=0 => ignore
  int32_t max_kdiff = -1; // max diff between bubble branch lengths <0 => ignore

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'C': cmd_check(max_covg<0,  cmd); max_covg  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_klen<0,  cmd); max_klen  = cmd_uint32(cmd, optarg); break;
      case 'D': cmd_check(max_kdiff<0, cmd); max_kdiff = cmd_uint32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" pop -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  bool reread_graph_to_filter = (num_gfiles == 1 &&
                                 strcmp(file_filter_path(&gfiles[0].fltr),"-") != 0);

  if(reread_graph_to_filter) {
    file_filter_flatten(&gfiles[0].fltr, 0);
    ncols = 1;
  }

  // Check graphs are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(Covg)*8*ncols +
                  sizeof(Edges)*8*ncols +
                  2; // 1 bit for visited, 1 for removed

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Check out_path is writable
  futil_create_output(out_path);

  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, ncols,
                 kmers_in_hash,  DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  size_t nkwords = roundup_bits2bytes(db_graph.ht.capacity);
  uint8_t *visited = ctx_calloc(1, nkwords);
  uint8_t *rmvbits  = ctx_calloc(1, nkwords);

  //
  // Load graphs
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, NULL);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  PopBubblesPrefs prefs = {.max_rmv_covg = max_covg,
                           .max_rmv_klen = max_klen,
                           .max_rmv_kdiff = max_kdiff};
  size_t npopped = 0;
  char npopped_str[50];

  status("Popping bubbles...");
  npopped = pop_bubbles(&db_graph, nthreads, prefs, visited, rmvbits);
  ulong_to_str(npopped, npopped_str);
  status("Popped %s bubbles", npopped_str);

  size_t nkmers0 = db_graph.ht.num_kmers;
  status("Removing nodes...");
  for(i = 0; i < nkwords; i++) rmvbits[i] = ~rmvbits[i];
  prune_nodes_lacking_flag(nthreads, rmvbits, &db_graph);
  size_t nkmers1 = db_graph.ht.num_kmers;

  ctx_assert(nkmers1 <= nkmers0);
  char nkmers0str[50], nkmers1str[50], ndiffstr[50];
  ulong_to_str(nkmers0, nkmers0str);
  ulong_to_str(nkmers1, nkmers1str);
  ulong_to_str(nkmers0-nkmers1, ndiffstr);
  status("Number of kmers %s -> %s (-%s)", nkmers0str, nkmers1str, ndiffstr);

  if(reread_graph_to_filter)
  {
    status("Streaming filtered file to: %s\n", out_path);
    GraphFileReader gfile;
    memset(&gfile, 0, sizeof(GraphFileReader));
    graph_file_open(&gfile, graph_paths[0]);
    graph_writer_stream_mkhdr(out_path, &gfile, &db_graph,
                              db_graph.col_edges, NULL);
    graph_file_close(&gfile);
  }
  else
  {
    status("Saving to: %s\n", out_path);
    graph_writer_save_mkhdr(out_path, &db_graph, CTX_GRAPH_FILEFORMAT, NULL,
                          0, ncols);
  }

  ctx_free(visited);
  ctx_free(rmvbits);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #19
0
static BreakpointCaller* brkpt_callers_new(size_t num_callers,
                                           gzFile gzout,
                                           size_t min_ref_flank,
                                           size_t max_ref_flank,
                                           const KOGraph kograph,
                                           const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  const size_t ncols = db_graph->num_of_cols;
  BreakpointCaller *callers = ctx_malloc(num_callers * sizeof(BreakpointCaller));

  pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed");

  size_t *callid = ctx_calloc(1, sizeof(size_t));

  // Each colour in each caller can have a GraphCache path at once
  PathRefRun *path_ref_runs = ctx_calloc(num_callers*MAX_REFRUNS_PER_CALLER(ncols),
                                         sizeof(PathRefRun));

  size_t i;
  for(i = 0; i < num_callers; i++)
  {
    BreakpointCaller tmp = {.threadid = i,
                            .nthreads = num_callers,
                            .kograph = kograph,
                            .db_graph = db_graph,
                            .gzout = gzout,
                            .out_lock = out_lock,
                            .callid = callid,
                            .allele_refs = path_ref_runs,
                            .flank5p_refs = path_ref_runs+MAX_REFRUNS_PER_ORIENT(ncols),
                            .min_ref_nkmers = min_ref_flank,
                            .max_ref_nkmers = max_ref_flank};

    memcpy(&callers[i], &tmp, sizeof(BreakpointCaller));

    path_ref_runs += MAX_REFRUNS_PER_CALLER(ncols);

    db_node_buf_alloc(&callers[i].allelebuf, 1024);
    db_node_buf_alloc(&callers[i].flank5pbuf, 1024);
    kmer_run_buf_alloc(&callers[i].koruns_5p, 128);
    kmer_run_buf_alloc(&callers[i].koruns_5p_ended, 128);
    kmer_run_buf_alloc(&callers[i].koruns_3p, 128);
    kmer_run_buf_alloc(&callers[i].koruns_3p_ended, 128);
    kmer_run_buf_alloc(&callers[i].allele_run_buf, 128);
    kmer_run_buf_alloc(&callers[i].flank5p_run_buf, 128);
    graph_crawler_alloc(&callers[i].crawlers[0], db_graph);
    graph_crawler_alloc(&callers[i].crawlers[1], db_graph);
  }

  return callers;
}

static void brkpt_callers_destroy(BreakpointCaller *callers, size_t num_callers)
{
  size_t i;
  for(i = 0; i < num_callers; i++) {
    db_node_buf_dealloc(&callers[i].allelebuf);
    db_node_buf_dealloc(&callers[i].flank5pbuf);
    kmer_run_buf_dealloc(&callers[i].koruns_5p);
    kmer_run_buf_dealloc(&callers[i].koruns_5p_ended);
    kmer_run_buf_dealloc(&callers[i].koruns_3p);
    kmer_run_buf_dealloc(&callers[i].koruns_3p_ended);
    kmer_run_buf_dealloc(&callers[i].allele_run_buf);
    kmer_run_buf_dealloc(&callers[i].flank5p_run_buf);
    graph_crawler_dealloc(&callers[i].crawlers[0]);
    graph_crawler_dealloc(&callers[i].crawlers[1]);
  }
  pthread_mutex_destroy(callers[0].out_lock);
  ctx_free(callers[0].out_lock);
  ctx_free(callers[0].callid);
  ctx_free(callers[0].allele_refs);
  ctx_free(callers);
}
예제 #20
0
파일: ctx_join.c 프로젝트: Phelimb/mccortex
int ctx_join(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t use_ncols = 0;

  GraphFileReader tmp_gfile;
  GraphFileBuffer isec_gfiles_buf;
  gfile_buf_alloc(&isec_gfiles_buf, 8);

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 'i':
        graph_file_reset(&tmp_gfile);
        graph_file_open(&tmp_gfile, optarg);
        if(file_filter_into_ncols(&tmp_gfile.fltr) > 1)
          warn("Flattening intersection graph into colour 0: %s", optarg);
        file_filter_flatten(&tmp_gfile.fltr, 0);
        gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  GraphFileReader *igfiles = isec_gfiles_buf.b;
  size_t num_igfiles = isec_gfiles_buf.len;

  if(!out_path) cmd_print_usage("--out <out.ctx> required");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input graph file");

  // optind .. argend-1 are graphs to load
  size_t num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));

  status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles);

  // Check all binaries are valid binaries with matching kmer size
  size_t i;
  size_t ctx_max_cols = 0;
  uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  for(i = 0; i < num_gfiles; i++)
  {
    graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols);

    if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                      gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size);
    }

    ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr));
    ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i]));
    ctx_sum_kmers += graph_file_nkmers(&gfiles[i]);
  }

  // Probe intersection graph files
  for(i = 0; i < num_igfiles; i++)
  {
    if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                  gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size);
    }

    uint64_t nkmers = graph_file_nkmers(&igfiles[i]);

    if(i == 0) min_intersect_num_kmers = nkmers;
    else if(nkmers < min_intersect_num_kmers)
    {
      // Put smallest intersection binary first
      SWAP(igfiles[i], igfiles[0]);
      min_intersect_num_kmers = nkmers;
    }
  }

  bool take_intersect = (num_igfiles > 0);

  // If we are taking an intersection,
  // all kmers intersection kmers will need to be loaded
  if(take_intersect)
    ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers;

  bool use_ncols_set = (use_ncols > 0);
  bool output_to_stdout = (strcmp(out_path,"-") == 0);

  // if(use_ncols == 0) use_ncols = 1;
  if(use_ncols_set) {
    if(use_ncols < ctx_max_cols && output_to_stdout)
      die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols);
    if(use_ncols > ctx_max_cols) {
      warn("I only need %zu colour%s ('--ncols %zu' ignored)",
           ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols);
      use_ncols = ctx_max_cols;
    }
  }
  else {
    use_ncols = output_to_stdout ? ctx_max_cols : 1;
  }

  // Check out_path is writable
  futil_create_output(out_path);

  status("Output %zu cols; from %zu files; intersecting %zu graphs; ",
         ctx_max_cols, num_gfiles, num_igfiles);

  if(num_gfiles == 1 && num_igfiles == 0)
  {
    // Loading only one file with no intersection files
    // Don't need to store a graph in memory, can filter as stream
    // Don't actually store anything in the de Bruijn graph, but we need to
    // pass it, so mock one up
    dBGraph db_graph;
    db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size,
                   file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0);

    graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL);
    graph_file_close(&gfiles[0]);
    gfile_buf_dealloc(&isec_gfiles_buf);
    ctx_free(gfiles);

    db_graph_dealloc(&db_graph);

    return EXIT_SUCCESS;
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  if(!use_ncols_set)
  {
    // Maximise use_ncols
    size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer;

    use_ncols = MIN2(max_usencols, ctx_max_cols);
    bits_per_kmer = sizeof(BinaryKmer)*8 +
                    (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

    // Re-check memory used
    kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                          memargs.mem_to_use_set,
                                          memargs.num_kmers,
                                          memargs.num_kmers_set,
                                          bits_per_kmer,
                                          ctx_max_kmers, ctx_sum_kmers,
                                          true, &graph_mem);
  }

  status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols));

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Create db_graph
  dBGraph db_graph;
  Edges *intersect_edges = NULL;
  size_t edge_cols = (use_ncols + take_intersect);

  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // We allocate edges ourself since it's a special case
  db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges));

  // Load intersection binaries
  char *intsct_gname_ptr = NULL;
  StrBuf intersect_gname;
  strbuf_alloc(&intersect_gname, 1024);

  if(take_intersect)
  {
    GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
    gprefs.boolean_covgs = true; // covg++ only

    for(i = 0; i < num_igfiles; i++)
    {
      graph_load(&igfiles[i], gprefs, NULL);

      // Update intersect header
      // note: intersection graphs all load exactly one colour into colour 0
      graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname);

      gprefs.must_exist_in_graph = true;
      gprefs.must_exist_in_edges = db_graph.col_edges;
    }

    if(num_igfiles > 1)
    {
      // Remove nodes where covg != num_igfiles
      HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes,
                        db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht);
    }

    status("Loaded intersection set\n");
    intsct_gname_ptr = intersect_gname.b;

    for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]);

    // Reset graph info
    for(i = 0; i < db_graph.num_of_cols; i++)
      graph_info_init(&db_graph.ginfo[i]);

    // Zero covgs
    memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg));

    // Use union edges we loaded to intersect new edges
    intersect_edges = db_graph.col_edges;
    db_graph.col_edges += db_graph.ht.capacity;
  }

  bool kmers_loaded = take_intersect, colours_loaded = false;

  graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles,
                          kmers_loaded, colours_loaded, intersect_edges,
                          intsct_gname_ptr, &db_graph);

  if(take_intersect)
    db_graph.col_edges -= db_graph.ht.capacity;

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);

  strbuf_dealloc(&intersect_gname);
  gfile_buf_dealloc(&isec_gfiles_buf);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #21
0
  return job_mem + corrector_mem + junc_mem + packed_mem + sizeof(GenPathWorker);
}

// GenPathWorker stores four buffers of size n
#define gworker_seq_buf(n) (binary_seq_mem(n)*4)

static void _gen_paths_worker_alloc(GenPathWorker *wrkr, dBGraph *db_graph)
{
  GenPathWorker tmp = {.db_graph = db_graph};

  correct_aln_worker_alloc(&tmp.corrector, true, db_graph);

  // Junction data
  // only fw arrays are malloc'd, rv point to fw
  tmp.junc_arrsize = INIT_BUFLEN;
  tmp.pck_fw = ctx_calloc(2, gworker_seq_buf(tmp.junc_arrsize));
  tmp.pck_rv = tmp.pck_fw + gworker_seq_buf(tmp.junc_arrsize);

  tmp.pos_fw = ctx_malloc(tmp.junc_arrsize * sizeof(*tmp.pos_fw) * 2);
  tmp.pos_rv = tmp.pos_fw + tmp.junc_arrsize;
  tmp.num_fw = tmp.num_rv = 0;

  memcpy(wrkr, &tmp, sizeof(GenPathWorker));
}

static void _gen_paths_worker_dealloc(GenPathWorker *wrkr)
{
  correct_aln_worker_dealloc(&wrkr->corrector);
  ctx_free(wrkr->pck_fw);
  ctx_free(wrkr->pos_fw);
}
예제 #22
0
BubbleCaller* bubble_callers_new(size_t num_callers,
                                 BubbleCallingPrefs prefs,
                                 gzFile gzout,
                                 const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  // Max usage is 4 * max_allele_len * cols
  size_t i;
  size_t max_path_len = MAX2(prefs.max_flank_len, prefs.max_allele_len);

  BubbleCaller *callers = ctx_malloc(num_callers * sizeof(BubbleCaller));

  pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed");

  size_t *num_bubbles_ptr = ctx_calloc(1, sizeof(size_t));

  for(i = 0; i < num_callers; i++)
  {
    BubbleCaller tmp = {.threadid = i, .nthreads = num_callers,
                        .haploid_seen = ctx_calloc(1+prefs.num_haploid, sizeof(bool)),
                        .num_bubbles_ptr = num_bubbles_ptr,
                        .prefs = prefs,
                        .db_graph = db_graph, .gzout = gzout,
                        .out_lock = out_lock};

    memcpy(&callers[i], &tmp, sizeof(BubbleCaller));

    // First two buffers don't actually need to grow
    db_node_buf_alloc(&callers[i].flank5p, prefs.max_flank_len);
    db_node_buf_alloc(&callers[i].pathbuf, max_path_len);

    graph_walker_alloc(&callers[i].wlk, db_graph);
    rpt_walker_alloc(&callers[i].rptwlk, db_graph->ht.capacity, 22); // 4MB

    graph_cache_alloc(&callers[i].cache, db_graph);
    cache_stepptr_buf_alloc(&callers[i].spp_forward, 1024);
    cache_stepptr_buf_alloc(&callers[i].spp_reverse, 1024);
    strbuf_alloc(&callers[i].output_buf, 2048);
  }

  return callers;
}

void bubble_callers_destroy(BubbleCaller *callers, size_t num_callers)
{
  ctx_assert(num_callers > 0);

  size_t i;
  for(i = 0; i < num_callers; i++)
  {
    ctx_free(callers[i].haploid_seen);

    db_node_buf_dealloc(&callers[i].flank5p);
    db_node_buf_dealloc(&callers[i].pathbuf);

    rpt_walker_dealloc(&callers[i].rptwlk);
    graph_walker_dealloc(&callers[i].wlk);

    graph_cache_dealloc(&callers[i].cache);
    cache_stepptr_buf_dealloc(&callers[i].spp_forward);
    cache_stepptr_buf_dealloc(&callers[i].spp_reverse);
    strbuf_dealloc(&callers[i].output_buf);
  }
  pthread_mutex_destroy(callers[0].out_lock);
  ctx_free(callers[0].out_lock);
  ctx_free(callers[0].num_bubbles_ptr);
  ctx_free(callers);
}
예제 #23
0
int ctx_correct(int argc, char **argv)
{
  size_t i;
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, true);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;

  // Update colours in graph file - sample in 0, all others in 1
  size_t ncols = gpath_load_sample_pop(gfile, 1, gpfiles->b, gpfiles->len,
                                       args.colour);

  // Check for compatibility between graph files and link files
  graphs_gpaths_compatible(gfile, 1, gpfiles->b, gpfiles->len, 1);

  int64_t ctx_num_kmers = gfile->num_of_kmers;

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of noreseed
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 +
                  (gpfiles->len > 0 ? sizeof(GPath*)*8 : 0) +
                  ncols; // in colour

  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_num_kmers, ctx_num_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  cmd_print_mem(path_mem, "paths");

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Check we can write all output files
  //
  // Open output files
  SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput));
  bool err_occurred = false;

  for(i = 0; i < inputs->len && !err_occurred; i++)
  {
    CorrectAlnInput *input = &inputs->b[i];
    // We loaded target colour into colour zero
    input->crt_params.ctxcol = input->crt_params.ctpcol = 0;
    bool is_pe = asyncio_task_is_pe(&input->files);
    err_occurred = !seqout_open(&outputs[i], input->out_base, args.fmt, is_pe);
    input->output = &outputs[i];
  }

  // Abandon if some of the output files already exist
  if(err_occurred) {
    for(i = 0; i < inputs->len; i++)
      seqout_close(&outputs[i], true);
    die("Error creating output files");
  }

  //
  // Allocate memory
  //

  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Create a path store that does not tracks path counts
  gpath_reader_alloc_gpstore(gpfiles->b, gpfiles->len, path_mem, false, &db_graph);

  //
  // Load Graph and link files
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, NULL);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load link files
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles->b[i]);
  }

  //
  // Run alignment
  //
  correct_reads(inputs->b, inputs->len,
                args.dump_seq_sizes, args.dump_frag_sizes,
                args.fq_zero, args.append_orig_seq,
                args.nthreads, &db_graph);

  // Close and free output files
  for(i = 0; i < inputs->len; i++)
    seqout_close(&outputs[i], false);
  ctx_free(outputs);

  // Closes input files
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #24
0
int ctx_calls2vcf(int argc, char **argv)
{
  const char *in_path = NULL, *out_path = NULL, *out_type = NULL;
  // Filtering parameters
  int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1;
  // Alignment parameters
  int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1;
  // ref paths
  char const*const* ref_paths = NULL;
  size_t nref_paths = 0;
  // flank file
  const char *sam_path = NULL;

  //
  // Things we figure out by looking at the input
  //
  bool isbubble = false;
  // samples in VCF, (0 for bubble, does not include ref in breakpoint calls)
  size_t i, kmer_size, num_samples;

  //
  // Reference genome
  //
  // Hash map of chromosome name -> sequence
  ChromHash *genome;
  ReadBuffer chroms;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break;
      case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break;
      case 'A': cmd_check(max_align_len  < 0,cmd); max_align_len  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break;
      case 'm': nwmatch = cmd_int32(cmd, optarg); break;
      case 'M': nwmismatch = cmd_int32(cmd, optarg); break;
      case 'g': nwgapopen = cmd_int32(cmd, optarg); break;
      case 'G': nwgapextend = cmd_int32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(max_align_len  < 0) max_align_len  = DEFAULT_MAX_ALIGN;
  if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE;

  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  in_path = argv[optind++];
  ref_paths = (char const*const*)argv + optind;
  nref_paths = argc - optind;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(in_path, "r");

  // Read call file header
  cJSON *json = json_hdr_load(gzin, in_path);

  // Check we can handle the kmer size
  kmer_size = json_hdr_get_kmer_size(json, in_path);
  db_graph_check_kmer_size(kmer_size, in_path);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(in_path),
         isbubble ? "bubble" : "breakpoint");

  if(isbubble) {
    // bubble specific
    if(sam_path == NULL)
      cmd_print_usage("Require -F <flanks.sam> with bubble file");
    if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ;
  }
  else {
    // breakpoint specific
    if(min_mapq >= 0)
      cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls");
  }

  // Open flank file if it exists
  htsFile *samfh = NULL;
  bam_hdr_t *bam_hdr = NULL;
  bam1_t *mflank = NULL;

  if(sam_path)
  {
    if((samfh = hts_open(sam_path, "r")) == NULL)
      die("Cannot open SAM/BAM %s", sam_path);

    // Load BAM header
    bam_hdr = sam_hdr_read(samfh);
    if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path);
    mflank = bam_init1();
  }

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, in_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path);

  num_samples = 0;
  if(!isbubble) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  //
  // Open output file
  //
  if(!out_path) out_path = "-";
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *vcffh = hts_open(out_path, modes_htslib[mode]);

  status("[calls2vcf] Reading %s call file with %zu samples",
         isbubble ? "Bubble" : "Breakpoint", num_graph_samples);
  status("[calls2vcf] %zu sample output to: %s format: %s",
         num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]);

  if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq);
  status("[calls2vcf] max alignment length: %i", max_align_len);
  status("[calls2vcf] max VCF allele length: %i", max_allele_len);
  status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i",
         nwmatch, nwmismatch, nwgapopen, nwgapextend);

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = chrom_hash_init();
  chrom_hash_load(ref_paths, nref_paths, &chroms, genome);

  // convert to upper case
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s);

  if(!isbubble) brkpnt_check_refs_match(json, genome, in_path);

  bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size,
                                   ref_paths, nref_paths,
                                   chroms.b, chroms.len);

  if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header");

  AlignedCall *call = acall_init();
  CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr);

  scoring_t *scoring = call_decomp_get_scoring(aligner);
  scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
               false, false, 0, 0, 0, 0);

  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  char kmer_str[50];
  sprintf(kmer_str, ";K%zu", kmer_size);

  if(isbubble)
  {
    // Bubble calls
    DecompBubble *bubbles = decomp_bubble_init();

    // Set scoring for aligning 3' flank
    scoring = decomp_bubble_get_scoring(bubbles);
    scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
                 true, true, 0, 0, 0, 0);

    while(call_file_read(gzin, in_path, &centry)) {
      do {
        if(sam_read1(samfh, bam_hdr, mflank) < 0)
          die("We've run out of SAM entries!");
      } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

      // Align call
      strbuf_reset(&call->info);
      decomp_bubble_call(bubbles, genome, kmer_size, min_mapq,
                         &centry, mflank, bam_hdr, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats));
    decomp_bubble_cpy_stats(bub_stats, bubbles);
    print_bubble_stats(bub_stats);
    ctx_free(bub_stats);

    decomp_bubble_destroy(bubbles);
  }
  else
  {
    // Breakpoint calls
    DecompBreakpoint *breakpoints = decomp_brkpt_init();

    while(call_file_read(gzin, in_path, &centry)) {
      strbuf_reset(&call->info);
      decomp_brkpt_call(breakpoints, genome, num_samples, &centry, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats));
    decomp_brkpt_cpy_stats(brk_stats, breakpoints);
    print_breakpoint_stats(brk_stats);
    ctx_free(brk_stats);

    decomp_brkpt_destroy(breakpoints);
  }

  // Print stats
  DecomposeStats *astats = ctx_calloc(1, sizeof(*astats));
  call_decomp_cpy_stats(astats, aligner);
  print_acall_stats(astats);
  ctx_free(astats);

  call_file_entry_dealloc(&centry);
  call_decomp_destroy(aligner);
  acall_destroy(call);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);

  bcf_hdr_destroy(vcfhdr);
  hts_close(vcffh);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);

  if(sam_path) {
    hts_close(samfh);
    bam_hdr_destroy(bam_hdr);
    bam_destroy1(mflank);
  }

  return EXIT_SUCCESS;
}
예제 #25
0
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Open input file
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
      die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
      die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);
    // status("kmer: %s", kmerbuf.b);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    if(save && num_links)
    {
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    char *json_str = cJSON_Print(newhdr);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    gzputs(link_gz, "\n\n");
    gzputs(link_gz, ctp_explanation_comment);
    gzputs(link_gz, "\n");

    fseek(link_tmp_fh, 0, SEEK_SET);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    ctx_free(tmp);

    gzclose(link_gz);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  if(hist_fh && hist_fh != stdout) fclose(hist_fh);

  if(list)
  {
    fclose(list_fh);
  }

  if(plot)
  {
    fclose(plot_fh);
  }

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}
예제 #26
0
int ctx_reads(int argc, char **argv)
{
  parse_args(argc, argv);

  //
  // Open input graphs
  //
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(gfile_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  // Will exit and remove output files on error
  inputs_attempt_open();

  //
  // Calculate memory use
  //
  size_t kmers_in_hash, graph_mem, bits_per_kmer = sizeof(BinaryKmer)*8;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Set up graph
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 0, kmers_in_hash, 0);

  // Load graphs
  LoadingStats gstats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .must_exist_in_graph = false,
                              .empty_colours = true,
                              .boolean_covgs = false};

  for(i = 0; i < num_gfiles; i++) {
    file_filter_flatten(&gfiles[i].fltr, 0);
    graph_load(&gfiles[i], gprefs, &gstats);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  status("Printing reads that do %stouch the graph\n",
         inputs.b[0].invert ? "not " : "");

  //
  // Filter reads using async io
  //
  LoadingStats seq_stats = LOAD_STATS_INIT_MACRO;

  for(i = 0; i < inputs.len; i++) {
    inputs.b[i].stats = &seq_stats;
    inputs.b[i].db_graph = &db_graph;
  }

  // Deal with a set of files at once
  size_t start, end;
  for(start = 0; start < inputs.len; start += MAX_IO_THREADS)
  {
    // Can have different numbers of inputs vs threads
    end = MIN2(inputs.len, start+MAX_IO_THREADS);
    asyncio_run_pool(files.b+start, end-start, filter_reads, NULL, nthreads, 0);
  }

  size_t total_reads_printed = 0;
  size_t total_reads = seq_stats.num_se_reads + seq_stats.num_pe_reads;

  for(i = 0; i < inputs.len; i++)
    total_reads_printed += inputs.b[i].num_of_reads_printed;

  for(i = 0; i < inputs.len; i++) {
    seqout_close(&inputs.b[i].seqout, false);
    asyncio_task_close(&files.b[i]);
  }

  aln_reads_buf_dealloc(&inputs);
  asyncio_buf_dealloc(&files);

  status("Total printed %zu / %zu (%.2f%%) reads\n",
         total_reads_printed, total_reads,
         total_reads ? (100.0 * total_reads_printed) / total_reads : 0.0);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #27
0
int ctx_rmsubstr(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  size_t kmer_size = 0, nthreads = 0;
  const char *output_file = NULL;
  seq_format fmt = SEQ_FMT_FASTA;
  bool invert = false;

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!output_file, cmd); output_file = optarg; break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_uint32(cmd, optarg); break;
      case 'F': cmd_check(fmt==SEQ_FMT_FASTA, cmd); fmt = cmd_parse_format(cmd, optarg); break;
      case 'v': cmd_check(!invert,cmd); invert = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        cmd_print_usage("`"CMD" rmsubstr -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults
  if(!nthreads) nthreads = DEFAULT_NTHREADS;
  if(!kmer_size) kmer_size = DEFAULT_KMER;

  if(!(kmer_size&1)) cmd_print_usage("Kmer size must be odd");
  if(kmer_size < MIN_KMER_SIZE) cmd_print_usage("Kmer size too small (recompile)");
  if(kmer_size > MAX_KMER_SIZE) cmd_print_usage("Kmer size too large (recompile?)");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input sequence file (.fq, .fq etc.)");

  size_t i, num_seq_files = argc - optind;
  char **seq_paths = argv + optind;
  seq_file_t **seq_files = ctx_calloc(num_seq_files, sizeof(seq_file_t*));

  for(i = 0; i < num_seq_files; i++)
    if((seq_files[i] = seq_open(seq_paths[i])) == NULL)
      die("Cannot read sequence file %s", seq_paths[i]);

  // Estimate number of bases
  // set to -1 if we cannot calc
  int64_t est_num_bases = seq_est_seq_bases(seq_files, num_seq_files);
  if(est_num_bases < 0) {
    warn("Cannot get file sizes, using pipes");
    est_num_bases = memargs.num_kmers * IDEAL_OCCUPANCY;
  }

  status("[memory] Estimated number of bases: %li", (long)est_num_bases);

  // Use file sizes to decide on memory

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(KONodeList) + sizeof(KOccur) + // see kmer_occur.h
                  8; // 1 byte per kmer for each base to load sequence files

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        est_num_bases, est_num_bases,
                                        false, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Open output file
  //
  if(output_file == NULL) output_file = "-";
  FILE *fout = futil_fopen_create(output_file, "w");

  //
  // Set up memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, kmer_size, 1, 0, kmers_in_hash, DBG_ALLOC_BKTLOCKS);

  //
  // Load reference sequence into a read buffer
  //
  ReadBuffer rbuf;
  read_buf_alloc(&rbuf, 1024);
  seq_load_all_reads(seq_files, num_seq_files, &rbuf);

  // Check for reads too short
  for(i = 0; i < rbuf.len && rbuf.b[i].seq.end >= kmer_size; i++) {}
  if(i < rbuf.len)
    warn("Reads shorter than kmer size (%zu) will not be filtered", kmer_size);

  KOGraph kograph = kograph_create(rbuf.b, rbuf.len, true, 0,
                                   nthreads, &db_graph);

  size_t num_reads = rbuf.len, num_reads_printed = 0, num_bad_reads = 0;

  // Loop over reads printing those that are not substrings
  int ret;
  for(i = 0; i < rbuf.len; i++) {
    ret = _is_substr(&rbuf, i, &kograph, &db_graph);
    if(ret == -1) num_bad_reads++;
    else if((ret && invert) || (!ret && !invert)) {
      seqout_print_read(&rbuf.b[i], fmt, fout);
      num_reads_printed++;
    }
  }

  char num_reads_str[100], num_reads_printed_str[100], num_bad_reads_str[100];
  ulong_to_str(num_reads, num_reads_str);
  ulong_to_str(num_reads_printed, num_reads_printed_str);
  ulong_to_str(num_bad_reads, num_bad_reads_str);

  status("Printed %s / %s (%.1f%%) to %s",
         num_reads_printed_str, num_reads_str,
         !num_reads ? 0.0 : (100.0 * num_reads_printed) / num_reads,
         futil_outpath_str(output_file));

  if(num_bad_reads > 0) {
    status("Bad reads: %s / %s (%.1f%%) - no kmer {ACGT} of length %zu",
           num_bad_reads_str, num_reads_str,
           (100.0 * num_bad_reads) / num_reads,
           kmer_size);
  }

  fclose(fout);
  kograph_dealloc(&kograph);

  // Free sequence memory
  for(i = 0; i < rbuf.len; i++) seq_read_dealloc(&rbuf.b[i]);
  read_buf_dealloc(&rbuf);
  ctx_free(seq_files);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #28
0
int ctx_contigs(int argc, char **argv)
{
  size_t nthreads = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t i, contig_limit = 0, colour = 0;
  bool cmd_reseed = false, cmd_no_reseed = false; // -r, -R
  const char *conf_table_path = NULL; // save confidence table to here
  bool use_missing_info_check = true, seed_with_unused_paths = false;
  double min_step_confid = -1.0, min_cumul_confid = -1.0; // < 0 => no min

  // Read length and expected depth for calculating confidences
  size_t genome_size = 0;

  seq_file_t *tmp_seed_file = NULL;
  SeqFilePtrBuffer seed_buf;
  seq_file_ptr_buf_alloc(&seed_buf, 16);

  GPathReader tmp_gpfile;
  GPathFileBuffer gpfiles;
  gpfile_buf_alloc(&gpfiles, 8);

  // Arg parsing
  char cmd[100], shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!out_path,cmd); out_path = optarg; break;
      case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'p':
        memset(&tmp_gpfile, 0, sizeof(GPathReader));
        gpath_reader_open(&tmp_gpfile, optarg);
        gpfile_buf_push(&gpfiles, &tmp_gpfile, 1);
        break;
      case '1':
      case 's': // --seed <in.fa>
        if((tmp_seed_file = seq_open(optarg)) == NULL)
          die("Cannot read --seed file: %s", optarg);
        seq_file_ptr_buf_add(&seed_buf, tmp_seed_file);
        break;
      case 'r': cmd_check(!cmd_reseed,cmd); cmd_reseed = true; break;
      case 'R': cmd_check(!cmd_no_reseed,cmd); cmd_no_reseed = true; break;
      case 'N':
        cmd_check(!contig_limit,cmd);
        contig_limit = cmd_uint32_nonzero(cmd, optarg);
        break;
      case 'c': cmd_check(!colour,cmd); colour = cmd_uint32(cmd, optarg); break;
      case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_bases(cmd, optarg); break;
      case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break;
      case 'M': cmd_check(use_missing_info_check,cmd); use_missing_info_check = false; break;
      case 'P': cmd_check(!seed_with_unused_paths,cmd); seed_with_unused_paths = true; break;
      case 'C':
        cmd_check(min_cumul_confid < 0,cmd);
        min_cumul_confid = cmd_udouble(cmd,optarg);
        if(min_cumul_confid > 1) die("%s must be 0 <= x <= 1", cmd);
        break;
      case 'T':
        cmd_check(min_step_confid < 0,cmd);
        min_step_confid = cmd_udouble(cmd,optarg);
        if(min_step_confid > 1) die("%s must be 0 <= x <= 1", cmd);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" contigs -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(cmd_no_reseed && cmd_reseed)
    cmd_print_usage("Cannot specify both -r and -R");

  if(contig_limit && seed_with_unused_paths)
    cmd_print_usage("Cannot combine --ncontigs with --use-seed-paths");

  bool sample_with_replacement = cmd_reseed;

  // Defaults
  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(!seed_buf.len && !contig_limit && sample_with_replacement) {
    cmd_print_usage("Please specify one or more of: "
                    "--no-reseed | --ncontigs | --seed <in.fa>");
  }

  if(optind >= argc) cmd_print_usage("Require input graph files (.ctx)");

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(graph_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  // char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  // GraphFileReader gfile;
  // memset(&gfile, 0, sizeof(GraphFileReader));
  // graph_file_open(&gfile, ctx_path);

  // Update colours in graph file - sample in 0, all others in 1
  // never need more than two colours
  ncols = gpath_load_sample_pop(gfiles, num_gfiles,
                                gpfiles.b, gpfiles.len, colour);

  // Check for compatibility between graph files and path files
  // pop_colour is colour 1
  graphs_gpaths_compatible(gfiles, num_gfiles, gpfiles.b, gpfiles.len, 1);

  if(!genome_size)
  {
    char nk_str[50];
    if(ctx_max_kmers <= 0) die("Please pass --genome <G> if streaming");
    genome_size = ctx_max_kmers;
    ulong_to_str(genome_size, nk_str);
    status("Taking number of kmers as genome size: %s", nk_str);
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of kmer usage
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  ncols + !sample_with_replacement;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = memargs.mem_to_use - MIN2(memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles.b, gpfiles.len, ncols, rem_mem, false);

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
  cmd_print_mem(path_mem, "paths");

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(memargs.mem_to_use, total_mem);

  // Load contig hist distribution from ctp files
  ZeroSizeBuffer contig_hist;
  memset(&contig_hist, 0, sizeof(contig_hist));

  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load_contig_hist(gpfiles.b[i].json,
                                  gpfiles.b[i].fltr.path.b,
                                  file_filter_fromcol(&gpfiles.b[i].fltr, 0),
                                  &contig_hist);
  }

  // Calculate confidences, only for one colour
  ContigConfidenceTable conf_table;
  conf_table_alloc(&conf_table, 1);
  conf_table_update_hist(&conf_table, 0, genome_size,
                         contig_hist.b, contig_hist.len);

  if(conf_table_path != NULL) {
    conf_table_save(&conf_table, conf_table_path);
  }

  zsize_buf_dealloc(&contig_hist);

  //
  // Output file if printing
  //
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : NULL;

  // Allocate
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Paths
  gpath_reader_alloc_gpstore(gpfiles.b, gpfiles.len, path_mem,
                             false, &db_graph);

  uint8_t *visited = NULL;

  if(!sample_with_replacement)
    visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  // Load graph
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .empty_colours = true};

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, &stats);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  // Load path files
  for(i = 0; i < gpfiles.len; i++) {
    gpath_reader_load(&gpfiles.b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles.b[i]);
  }
  gpfile_buf_dealloc(&gpfiles);

  AssembleContigStats assem_stats;
  assemble_contigs_stats_init(&assem_stats);

  assemble_contigs(nthreads, seed_buf.b, seed_buf.len,
                   contig_limit, visited,
                   use_missing_info_check, seed_with_unused_paths,
                   min_step_confid, min_cumul_confid,
                   fout, out_path, &assem_stats, &conf_table,
                   &db_graph, 0); // Sample always loaded into colour zero

  if(fout && fout != stdout) fclose(fout);

  assemble_contigs_stats_print(&assem_stats);
  assemble_contigs_stats_destroy(&assem_stats);

  conf_table_dealloc(&conf_table);

  for(i = 0; i < seed_buf.len; i++)
    seq_close(seed_buf.b[i]);

  seq_file_ptr_buf_dealloc(&seed_buf);

  ctx_free(visited);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #29
0
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  bool tip_cleaning = false, supernode_cleaning = false;
  size_t min_keep_tip = 0;
  Covg threshold = 0, fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(!tip_cleaning, cmd);
        min_keep_tip = cmd_uint32_nonzero(cmd, optarg);
        tip_cleaning = true;
        break;
      case 'S':
        cmd_check(!supernode_cleaning, cmd);
        if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg);
        supernode_cleaning = true;
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  // Default behaviour
  if(!tip_cleaning && !supernode_cleaning) {
    if(out_ctx_path != NULL)
      supernode_cleaning = tip_cleaning = true; // do both
    else
      warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  bool doing_cleaning = (supernode_cleaning || tip_cleaning);

  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
  }

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    cmd_print_usage("You gave --len-after <out> / --covg-after <out> without "
                    "any cleaning (set -s, --supernodes or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !supernode_cleaning)
    cmd_print_usage("-B, --fallback <T> without --supernodes");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(!doing_cleaning)
  {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(tip_cleaning && min_keep_tip == 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol, intocol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && supernode_cleaning) {
        warn("%s:%zu already has supernode cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_before_path);
  if(tip_cleaning)
    status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip);
  if(supernode_cleaning && threshold > 0)
    status("%zu. Cleaning supernodes with coverage < %u", step++, threshold);
  if(supernode_cleaning && threshold <= 0)
    status("%zu. Cleaning supernodes with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8;
  size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) /
                        (per_kmer_per_col_bits * kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path);

  // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // Edges is a special case
  size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded);
  db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges));

  // Load graph into a single colour
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  outhdr.version = CTX_GRAPH_FILEFORMAT;
  outhdr.kmer_size = db_graph.kmer_size;
  outhdr.num_of_cols = ncols;
  outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64;
  graph_header_alloc(&outhdr, ncols);

  // Merge info into header
  size_t gcol = 0;
  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      intocol = file_filter_intocol(&gfiles[i].fltr, j);
      graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]);
    }
  }

  if(ncols > use_ncols) {
    graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats);
  } else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, &stats);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path)
  {
    // Get coverage distribution and estimate cleaning threshold
    int est_threshold = cleaning_get_threshold(nthreads,
                                               covg_before_path,
                                               len_before_path,
                                               visited, &db_graph);

    if(est_threshold < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_threshold);

    // Use estimated threshold if threshold not set
    if(threshold <= 0) {
      if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        threshold = fallback_thresh;
      }
      else if(est_threshold >= 0) threshold = est_threshold;
    }
  }

  // Die if we failed to find suitable cleaning threshold
  if(supernode_cleaning && threshold <= 0)
    die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)");

  if(doing_cleaning) {
    // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0)
    clean_graph(nthreads, threshold, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(doing_cleaning)
  {
    // Output graph file
    Edges *intersect_edges = NULL;
    bool kmers_loaded = true;
    size_t col, thresh;

    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
    {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= supernode_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(supernode_cleaning) {
        thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold)
                                          : (uint32_t)threshold;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    if(!all_colours_loaded)
    {
      // We haven't loaded all the colours
      // intersect_edges are edges to mask with
      // resets graph edges
      intersect_edges = db_graph.col_edges;
      db_graph.col_edges += db_graph.ht.capacity;
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    graph_files_merge(out_ctx_path, gfiles, num_gfiles,
                      kmers_loaded, all_colours_loaded,
                      intersect_edges, &outhdr, &db_graph);

    // Swap back
    if(!all_colours_loaded)
      db_graph.col_edges = intersect_edges;
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
예제 #30
0
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  int min_keep_tip = -1, unitig_min = -1; // <0 => default, 0 => noclean
  uint32_t fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(min_keep_tip<0, cmd);
        min_keep_tip = (optarg != NULL ? (int)cmd_uint32(cmd, optarg) : -1);
        break;
      case 'S':
      case 'U':
        cmd_check(unitig_min<0, cmd);
        unitig_min = (optarg != NULL ? cmd_uint32(cmd, optarg) : -1);
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  bool unitig_cleaning = (unitig_min != 0);
  bool tip_cleaning = (min_keep_tip != 0);
  bool doing_cleaning = (unitig_cleaning || tip_cleaning);

  // If you ever want to estimate cleaning threshold without outputting
  // a graph, change this to a warning
  if(doing_cleaning && out_ctx_path == NULL) {
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");
    // warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    warn("You gave --len-after <out> / --covg-after <out> without "
         "any cleaning (set -U, --unitigs or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !unitig_cleaning)
    warn("-B, --fallback <T> without --unitigs");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t col, ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(out_ctx_path == NULL)
  {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(min_keep_tip < 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && unitig_cleaning) {
        warn("%s:%zu already has unitig cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_before_path);
  if(min_keep_tip > 0)
    status("%zu. Cleaning tips shorter than %i nodes", step++, min_keep_tip);
  if(unitig_min > 0)
    status("%zu. Cleaning unitigs with coverage < %i", step++, unitig_min);
  if(unitig_min < 0)
    status("%zu. Cleaning unitigs with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving unitig length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_col_bits = (sizeof(Covg)+sizeof(Edges)) * 8;
  size_t extra_edge_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  per_col_bits * use_ncols +
                  extra_edge_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 -
                         sizeof(BinaryKmer)*8*kmers_in_hash +
                         extra_edge_bits*kmers_in_hash) /
                        (per_col_bits*kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path);

  // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edge to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_COVGS);

  // Extra edges required to hold union of kept edges
  Edges *edges_union = NULL;
  if(use_ncols < ncols)
    edges_union = ctx_calloc(db_graph.ht.capacity, sizeof(Edges));

  // Load graph into a single colour
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  for(i = 0; i < num_gfiles; i++)
    graph_file_merge_header(&outhdr, &gfiles[i]);

  if(ncols > use_ncols)
  {
    db_graph.num_of_cols = db_graph.num_edge_cols = 1;
    SWAP(edges_union, db_graph.col_edges);
    graphs_load_files_flat(gfiles, num_gfiles, gprefs, NULL);
    SWAP(edges_union, db_graph.col_edges);
    db_graph.num_of_cols = db_graph.num_edge_cols = use_ncols;
  }
  else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, NULL);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  // Always estimate cleaning threshold
  // if(unitig_min <= 0 || covg_before_path || len_before_path)
  // {
    // Get coverage distribution and estimate cleaning threshold
    int est_min_covg = cleaning_get_threshold(nthreads,
                                              covg_before_path,
                                              len_before_path,
                                              visited, &db_graph);

    if(est_min_covg < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_min_covg);

    // Use estimated threshold if threshold not set
    if(unitig_min < 0) {
      if(fallback_thresh > 0 && est_min_covg < (int)fallback_thresh) {
        status("Using fallback threshold: %i", fallback_thresh);
        unitig_min = fallback_thresh;
      }
      else if(est_min_covg >= 0) unitig_min = est_min_covg;
    }
  // }

  // Die if we failed to find suitable cleaning threshold
  if(unitig_min < 0)
    die("Need cleaning threshold (--unitigs=<D> or --fallback <D>)");

  // Cleaning parameters should now be set (>0) or turned off (==0)
  ctx_assert(unitig_min >= 0);
  ctx_assert(min_keep_tip >= 0);

  if(unitig_min || min_keep_tip)
  {
    // Clean graph of tips (if min_keep_tip > 0) and unitigs (if threshold > 0)
    clean_graph(nthreads, unitig_min, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(out_ctx_path != NULL)
  {
    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++)
    {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= unitig_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(unitig_cleaning) {
        size_t thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)unitig_min)
                                          : (uint32_t)unitig_min;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    // kmers_loaded=true
    graph_writer_merge(out_ctx_path, gfiles, num_gfiles,
                      true, all_colours_loaded,
                      edges_union, &outhdr, &db_graph);
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  // TODO: report kmer coverage for each sample

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  ctx_free(edges_union);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}