예제 #1
0
/**
 * Estimate a coverage threshold for removing unitigs.
 *
 * Statistics are gathered by running unitig_get_covg via supernodes_iterate(),
 * optionally written out as CSV histograms, and the kmer coverage histogram
 * is then handed to cleaning_pick_kmer_threshold() to choose the cutoff.
 *
 * @param num_threads    number of threads passed to supernodes_iterate()
 * @param covgs_csv_path
 * @param lens_csv_path  paths to files to write CSV histograms of unitig
 *                       coverages and lengths BEFORE ANY CLEANING.
 *                       If NULL these are ignored.
 * @param visited  should be at least db_graph.ht.capacity bits long and
 *                 initialised to zero. It is used while walking the graph
 *                 and is zeroed again before this function returns.
 * @param db_graph graph to collect statistics from (accessed via const pointer)
 * @return threshold to clean, or the negative value returned by
 *         cleaning_pick_kmer_threshold() when no threshold could be picked
 *         (a warning is printed in that case)
 */
int cleaning_get_threshold(size_t num_threads,
                           const char *covgs_csv_path,
                           const char *lens_csv_path,
                           uint8_t *visited,
                           const dBGraph *db_graph)
{
  status("[cleaning] Calculating unitig stats with %zu threads...", num_threads);
  status("[cleaning]   Using kmer gamma method");

  // Collect kmer coverages and unitig lengths.
  // No thresholds and no keep-flags are given: this pass only gathers stats.
  UnitigCleaner cleaner;
  unitig_cleaner_alloc(&cleaner, num_threads, 0, 0, NULL, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, unitig_get_covg, &cleaner);

  // A kmer-coverage-only pass (kmer_get_covg run on each thread) was sketched
  // here as a faster alternative to the unitig walk above; it is not enabled.

  // Hand `visited` back to the caller zeroed
  const size_t visited_bytes = roundup_bits2bytes(db_graph->ht.capacity);
  memset(visited, 0, visited_bytes);

  // Optional CSV dumps of the histograms collected above
  if(covgs_csv_path != NULL)
  {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.kmer_covgs_init,
                                  cleaner.unitig_covgs_init,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path != NULL)
  {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_init,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  // Pick the threshold from the kmer coverage histogram.
  // The four doubles are output parameters filled in by the picker.
  double alpha = 0;
  double beta = 0;
  double fpos = 0;
  double fneg = 0;
  const int cutoff = cleaning_pick_kmer_threshold(cleaner.kmer_covgs_init,
                                                  cleaner.covg_arrsize,
                                                  &alpha, &beta,
                                                  &fpos, &fneg);

  if(cutoff >= 0)
  {
    status("[cleaning] alpha=%f, beta=%f FP=%f FN=%f",
           alpha, beta, fpos, fneg);
    status("[cleaning] Recommended unitig cleaning threshold: < %i",
           cutoff);
  }
  else
  {
    warn("Cannot pick a cleaning threshold");
  }

  unitig_cleaner_dealloc(&cleaner);

  return cutoff;
}
예제 #2
0
/**
 * Remove unitigs with coverage < `covg_threshold` and tips shorter than
 * `min_keep_tip`.
 *
 * The graph must have exactly one colour (`num_of_cols == 1`) and at least
 * one edge colour (`num_edge_cols > 0`); both are asserted.
 *
 * @param num_threads    Number of threads to use
 * @param covg_threshold Remove unitigs with mean covg < `covg_threshold`.
 *                       Ignored if 0.
 * @param min_keep_tip   Remove tips with length < `min_keep_tip`. Ignored if 0.
 * @param covgs_csv_path Path to write CSV of kmer coverage histogram,
 *                       taken from the cleaner's `*_clean` arrays after
 *                       pruning. Skipped if NULL.
 * @param lens_csv_path  Path to write CSV of unitig length histogram,
 *                       taken from the cleaner's `len_hist_clean` array after
 *                       pruning. Skipped if NULL.
 * @param visited
 * @param keep           Each should be at least db_graph.ht.capacity bits
 *                       long and initialised to zero. They are used while
 *                       marking and pruning and are both zeroed again before
 *                       a normal return.
 * @param db_graph       Graph to clean in place
 *
 * Returns early, touching neither the graph nor `visited` / `keep`, when the
 * graph holds no kmers or when both `covg_threshold` and `min_keep_tip` are 0
 * (the latter prints a warning).
 **/
void clean_graph(size_t num_threads,
                 size_t covg_threshold, size_t min_keep_tip,
                 const char *covgs_csv_path, const char *lens_csv_path,
                 uint8_t *visited, uint8_t *keep, dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1);
  ctx_assert(db_graph->num_edge_cols > 0);

  // Remember the starting size so the amount removed can be reported later
  size_t init_nkmers = db_graph->ht.num_kmers;

  // Nothing to clean
  if(db_graph->ht.num_kmers == 0)
    return;

  // Nothing requested
  if(covg_threshold == 0 && min_keep_tip == 0)
  {
    warn("[cleaning] No cleaning specified");
    return;
  }

  // Announce what is about to happen
  if(covg_threshold > 0)
  {
    status("[cleaning] Removing unitigs with coverage < %zu...", covg_threshold);
    status("[cleaning]   Using kmer gamma method");
  }

  if(min_keep_tip > 0)
  {
    status("[cleaning] Removing tips shorter than %zu...", min_keep_tip);
  }

  status("[cleaning]   using %zu threads", num_threads);

  // Pass 1: flag the kmers to retain in `keep`
  UnitigCleaner cleaner;
  unitig_cleaner_alloc(&cleaner, num_threads, covg_threshold,
                       min_keep_tip, keep, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, unitig_mark, &cleaner);

  // Report how many unitigs / kmers fall in each removal category
  char low_unitigs_str[50], low_kmers_str[50];
  char tips_str[50], tip_kmers_str[50];
  char both_unitigs_str[50], both_kmers_str[50];

  ulong_to_str(cleaner.num_low_covg_snodes, low_unitigs_str);
  ulong_to_str(cleaner.num_low_covg_snode_kmers, low_kmers_str);
  ulong_to_str(cleaner.num_tips, tips_str);
  ulong_to_str(cleaner.num_tip_kmers, tip_kmers_str);
  ulong_to_str(cleaner.num_tip_and_low_snodes, both_unitigs_str);
  ulong_to_str(cleaner.num_tip_and_low_snode_kmers, both_kmers_str);

  status("[cleaning] Removing %s low coverage unitigs [%s kmer%s], "
         "%s unitig tips [%s kmer%s] "
         "and %s of both [%s kmer%s]",
         low_unitigs_str,
         low_kmers_str, util_plural_str(cleaner.num_low_covg_snode_kmers),
         tips_str,
         tip_kmers_str, util_plural_str(cleaner.num_tip_kmers),
         both_unitigs_str,
         both_kmers_str, util_plural_str(cleaner.num_tip_and_low_snode_kmers));

  // Pass 2: drop every node whose `keep` flag is unset
  prune_nodes_lacking_flag(num_threads, keep, db_graph);

  // Hand both bitsets back to the caller zeroed
  const size_t bitset_bytes = roundup_bits2bytes(db_graph->ht.capacity);
  memset(visited, 0, bitset_bytes);
  memset(keep, 0, bitset_bytes);

  // Summarise the outcome.
  // init_nkmers is non-zero here (empty graphs returned above), so the
  // percentage division is safe.
  size_t remain_nkmers = db_graph->ht.num_kmers;
  size_t removed_nkmers = init_nkmers - remain_nkmers;
  double removed_pct = (100.0*removed_nkmers)/init_nkmers;

  char remain_str[100], removed_str[100];
  ulong_to_str(remain_nkmers, remain_str);
  ulong_to_str(removed_nkmers, removed_str);
  status("[cleaning] Remaining kmers: %s removed: %s (%.1f%%)",
         remain_str, removed_str, removed_pct);

  // Optional CSV dumps of the cleaner's post-cleaning histograms
  if(covgs_csv_path != NULL)
  {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.kmer_covgs_clean,
                                  cleaner.unitig_covg_clean,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path != NULL)
  {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_clean,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  unitig_cleaner_dealloc(&cleaner);
}
예제 #3
0
/**
 * Estimate a coverage threshold for removing supernodes.
 *
 * Statistics are gathered by running supernode_get_covg via
 * supernodes_iterate() and optionally written out as CSV histograms. The
 * coverage histogram is then given to cleaning_pick_kmer_threshold() with an
 * FDR that starts at 0.001 and is multiplied by 10 after each failed attempt,
 * for as long as it stays below 1.
 *
 * @param num_threads    number of threads passed to supernodes_iterate()
 * @param covgs_csv_path
 * @param lens_csv_path  paths to files to write CSV histograms of supernode
 *                       coverages and lengths BEFORE ANY CLEANING.
 *                       If NULL these are ignored.
 * @param visited  should be at least db_graph.ht.capacity bits long and
 *                 initialised to zero. It is used while walking the graph
 *                 and is zeroed again before this function returns.
 * @param db_graph graph to collect statistics from (accessed via const pointer)
 * @return threshold to clean, or a negative value if no threshold could be
 *         picked at any FDR tried (a warning is printed in that case)
 */
int cleaning_get_threshold(size_t num_threads,
                           const char *covgs_csv_path,
                           const char *lens_csv_path,
                           uint8_t *visited,
                           const dBGraph *db_graph)
{
  status("[cleaning] Calculating supernode stats with %zu threads...", num_threads);
  status("[cleaning]   Using kmer gamma method");

  // Collect kmer coverages and supernode lengths.
  // No thresholds and no keep-flags are given: this pass only gathers stats.
  SupernodeCleaner cleaner;
  supernode_cleaner_alloc(&cleaner, num_threads, 0, 0, NULL, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, supernode_get_covg, &cleaner);

  // A kmer-coverage-only pass (kmer_get_covg run on each thread) was sketched
  // here as a faster alternative to the supernode walk above; it is not enabled.

  // Hand `visited` back to the caller zeroed
  const size_t visited_bytes = roundup_bits2bytes(db_graph->ht.capacity);
  memset(visited, 0, visited_bytes);

  // Optional CSV dumps of the histograms collected above
  if(covgs_csv_path != NULL)
  {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.covg_hist_init,
                                  cleaner.mean_covg_hist_init,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path != NULL)
  {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_init,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  // Try progressively looser FDRs until the picker yields a threshold.
  // On `break` the loop increment is skipped, so `fdr` still holds the value
  // that succeeded when it is reported below.
  int cutoff = -1;
  double fdr, alpha = 0, beta = 0;

  for(fdr = 0.001; fdr < 1; fdr *= 10)
  {
    cutoff = cleaning_pick_kmer_threshold(cleaner.covg_hist_init,
                                          cleaner.covg_arrsize,
                                          fdr, &alpha, &beta);
    if(cutoff >= 0) break;
  }

  if(cutoff < 0)
  {
    warn("Cannot pick a cleaning threshold");
  }
  else
  {
    status("[cleaning] FDR set to %f [alpha=%f, beta=%f]", fdr, alpha, beta);
    status("[cleaning] Recommended supernode cleaning threshold: < %i",
           cutoff);
  }

  supernode_cleaner_dealloc(&cleaner);

  return cutoff;
}