Exemplo n.º 1
0
/**
 * Print the per-distance histogram totals and cleaning cutoffs, followed by a
 * single suggested cutoff taken from gca_median_size() over those cutoffs.
 *
 * Row 0 of `hists` is treated as uninformative and is never read, so at least
 * two rows are needed. With fewer rows nothing is written to `fh` and a
 * warning is issued instead.
 *
 * A row for which cleaning_pick_kmer_threshold() returns a negative value
 * contributes a cutoff of 0; the number of such rows is reported via warn().
 *
 * @param hist_distsize number of rows in `hists`
 * @param hist_covgsize number of entries in each row of `hists`
 * @param hists         array of `hist_distsize` histograms, each of
 *                      `hist_covgsize` counts
 * @param fh            stream the report is written to
 */
static void print_suggest_cutoff(size_t hist_distsize, size_t hist_covgsize,
                                 uint64_t (*hists)[hist_covgsize],
                                 FILE *fh)
{
  // Rows 1.. are the only ones used, so we need at least rows 0 and 1.
  // Check before the VLAs below are declared: a zero-length VLA is undefined
  // behaviour, a one-element VLA would be indexed out of bounds at [1], and
  // hist_distsize-1 would wrap around when hist_distsize is 0.
  if(hist_distsize < 2) {
    warn("Cannot suggest cutoff: need at least 2 histograms [got %zu]",
         hist_distsize);
    return;
  }

  size_t i, dist, median, nthresh_failed = 0;
  size_t cutoffs[hist_distsize], sumcovgs[hist_distsize];

  memset(cutoffs, 0, sizeof(cutoffs));
  memset(sumcovgs, 0, sizeof(sumcovgs));

  // Don't use dist[0] -- not informative
  for(dist = 1; dist < hist_distsize; dist++)
  {
    int t = cleaning_pick_kmer_threshold(hists[dist], hist_covgsize,
                                         NULL, NULL, NULL, NULL);
    // Negative means no threshold could be picked: count it, default to 0
    if(t < 0) { nthresh_failed++; t = 0; }
    cutoffs[dist] = (size_t)t;
    for(i = 0; i < hist_covgsize; i++) sumcovgs[dist] += hists[dist][i];
  }

  // Print totals and cutoffs as comma-separated lists, starting at row 1
  fprintf(fh, "sumcovgs=%zu", sumcovgs[1]);
  for(i = 2; i < hist_distsize; i++) fprintf(fh, ",%zu", sumcovgs[i]);
  fprintf(fh, "\ncutoffs=%zu", cutoffs[1]);
  for(i = 2; i < hist_distsize; i++) fprintf(fh, ",%zu", cutoffs[i]);
  fprintf(fh, "\n");

  // Skip entry 0 here too; failed rows take part with their default of 0
  median = gca_median_size(cutoffs+1, hist_distsize-1);

  fprintf(fh, "suggested_cutoff=%zu\n", median);

  if(nthresh_failed)
    warn("Threshold failed in %zu cases [default to 0]", nthresh_failed);
}
Exemplo n.º 2
0
/**
 * Estimate the coverage threshold to use when removing unitigs.
 *
 * Statistics are gathered over the graph BEFORE ANY CLEANING, optionally
 * dumped as CSV histograms, and then passed to cleaning_pick_kmer_threshold().
 *
 * @param num_threads    number of threads used to gather the statistics
 * @param covgs_csv_path path to write the CSV histogram of coverages to,
 *                       or NULL to skip it
 * @param lens_csv_path  path to write the CSV histogram of unitig lengths to,
 *                       or NULL to skip it
 * @param visited        should be at least db_graph->ht.capacity bits long and
 *                       initialised to zero. It is handed to the traversal and
 *                       is wiped back to zero before this function returns.
 * @param db_graph       graph to gather statistics from
 * @return the value returned by cleaning_pick_kmer_threshold(): the suggested
 *         threshold, or a negative value if no threshold could be picked
 */
int cleaning_get_threshold(size_t num_threads,
                           const char *covgs_csv_path,
                           const char *lens_csv_path,
                           uint8_t *visited,
                           const dBGraph *db_graph)
{
  UnitigCleaner cleaner;
  double alpha = 0, beta = 0, false_pos = 0, false_neg = 0;
  int threshold;

  status("[cleaning] Calculating unitig stats with %zu threads...", num_threads);
  status("[cleaning]   Using kmer gamma method");

  // Walk the graph once, collecting kmer coverages and unitig lengths
  unitig_cleaner_alloc(&cleaner, num_threads, 0, 0, NULL, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, unitig_get_covg, &cleaner);

  // The traversal is finished: hand `visited` back to the caller zeroed
  memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity));

  // Optional CSV dumps of the pre-cleaning histograms
  if(covgs_csv_path)
  {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.kmer_covgs_init,
                                  cleaner.unitig_covgs_init,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path)
  {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_init,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  // Pick the threshold from the kmer coverage histogram
  threshold = cleaning_pick_kmer_threshold(cleaner.kmer_covgs_init,
                                           cleaner.covg_arrsize,
                                           &alpha, &beta,
                                           &false_pos, &false_neg);

  if(threshold >= 0)
  {
    status("[cleaning] alpha=%f, beta=%f FP=%f FN=%f",
           alpha, beta, false_pos, false_neg);
    status("[cleaning] Recommended unitig cleaning threshold: < %i",
           threshold);
  }
  else
  {
    warn("Cannot pick a cleaning threshold");
  }

  unitig_cleaner_dealloc(&cleaner);

  return threshold;
}
Exemplo n.º 3
0
/**
 * Estimate the coverage threshold to use when removing supernodes.
 *
 * Statistics are gathered over the graph BEFORE ANY CLEANING, optionally
 * dumped as CSV histograms, and then passed to cleaning_pick_kmer_threshold()
 * with an increasingly permissive FDR until a threshold is found.
 *
 * @param num_threads    number of threads used to gather the statistics
 * @param covgs_csv_path path to write the CSV histogram of coverages to,
 *                       or NULL to skip it
 * @param lens_csv_path  path to write the CSV histogram of supernode lengths
 *                       to, or NULL to skip it
 * @param visited        should be at least db_graph->ht.capacity bits long and
 *                       initialised to zero. It is handed to the traversal and
 *                       is wiped back to zero before this function returns.
 * @param db_graph       graph to gather statistics from
 * @return the suggested threshold, or a negative value if no threshold could
 *         be picked at any of the FDR values tried
 */
int cleaning_get_threshold(size_t num_threads,
                           const char *covgs_csv_path,
                           const char *lens_csv_path,
                           uint8_t *visited,
                           const dBGraph *db_graph)
{
  SupernodeCleaner cleaner;
  double fdr, alpha = 0, beta = 0;
  int threshold = -1;

  status("[cleaning] Calculating supernode stats with %zu threads...", num_threads);
  status("[cleaning]   Using kmer gamma method");

  // Walk the graph once, collecting kmer coverages and supernode lengths
  supernode_cleaner_alloc(&cleaner, num_threads, 0, 0, NULL, db_graph);
  supernodes_iterate(num_threads, visited, db_graph, supernode_get_covg, &cleaner);

  // The traversal is finished: hand `visited` back to the caller zeroed
  memset(visited, 0, roundup_bits2bytes(db_graph->ht.capacity));

  // Optional CSV dumps of the pre-cleaning histograms
  if(covgs_csv_path)
  {
    cleaning_write_covg_histogram(covgs_csv_path,
                                  cleaner.covg_hist_init,
                                  cleaner.mean_covg_hist_init,
                                  cleaner.covg_arrsize);
  }

  if(lens_csv_path)
  {
    cleaning_write_len_histogram(lens_csv_path,
                                 cleaner.len_hist_init,
                                 cleaner.len_arrsize,
                                 db_graph->kmer_size);
  }

  // Start at FDR 0.001 and relax it tenfold after every failed attempt,
  // giving up once it is no longer below 1
  for(fdr = 0.001; fdr < 1; fdr *= 10)
  {
    threshold = cleaning_pick_kmer_threshold(cleaner.covg_hist_init,
                                             cleaner.covg_arrsize,
                                             fdr, &alpha, &beta);
    if(threshold >= 0) break;
  }

  if(threshold >= 0)
  {
    status("[cleaning] FDR set to %f [alpha=%f, beta=%f]", fdr, alpha, beta);
    status("[cleaning] Recommended supernode cleaning threshold: < %i",
           threshold);
  }
  else
  {
    warn("Cannot pick a cleaning threshold");
  }

  supernode_cleaner_dealloc(&cleaner);

  return threshold;
}