/* Fixed-threshold strand-bias filter.
 *
 * Does nothing when sb_filter->thresh is zero. Otherwise the variant's
 * "SB" info value is read as an integer; if it exceeds the threshold and
 * either sb_filter->no_compound is set or alt_mostly_on_one_strand(var)
 * holds, sb_filter->id is added to the variant's filter field.
 *
 * A missing SB tag is reported once (tracked through
 * sb_missing_warning_printed) and the variant is left untouched.
 */
void apply_sb_threshold(var_t *var, sb_filter_t *sb_filter)
{
    char *sb_str = NULL;
    int sb_val;

    if (! sb_filter->thresh) {
        return;
    }

    if ( ! vcf_var_has_info_key(&sb_str, var, "SB")) {
        if ( ! sb_missing_warning_printed) {
            LOG_WARN("%s\n", "Requested SB filtering failed since SB tag is missing in variant");
            sb_missing_warning_printed = 1;
        }
        return;
    }

    sb_val = atoi(sb_str);
    free(sb_str);

    /* short-circuit order matters: the strand check is only evaluated
     * for variants above threshold and when compound mode is on */
    if (sb_val > sb_filter->thresh
        && (sb_filter->no_compound || alt_mostly_on_one_strand(var))) {
        vcf_var_add_to_filter(var, sb_filter->id);
    }
}
/* Allele-frequency range filter.
 *
 * Active only if af_filter->min or af_filter->max is greater than zero.
 * The variant's "AF" info value is parsed as float; af_filter->id_min is
 * added to the filter field if AF is below a positive minimum and
 * af_filter->id_max if AF is above a positive maximum.
 *
 * If the AF tag is missing or the value is out of range for a float, a
 * message is logged once and af_missing_warning_printed is set, which
 * turns every later call into a no-op.
 */
void apply_af_filter(var_t *var, af_filter_t *af_filter)
{
    char *af_char = NULL;
    float af;

    if (af_missing_warning_printed) {
        return;
    }
    if ( ! (af_filter->min > 0 || af_filter->max > 0)) {
        return;
    }

    if ( ! vcf_var_has_info_key(&af_char, var, "AF")) {
        /* never fall through to parsing with af_char == NULL */
        LOG_WARN("%s\n", "Requested AF filtering failed since AF tag is missing in variant");
        af_missing_warning_printed = 1;
        return;
    }

    /* strtof only sets errno on failure; clear it first so a stale
     * ERANGE left by an unrelated call isn't mistaken for a parse error */
    errno = 0;
    af = strtof(af_char, (char **)NULL);
    if (errno == ERANGE) {
        LOG_ERROR("Couldn't parse AF from af_char %s. Disabling AF filtering\n", af_char);
        af_missing_warning_printed = 1;
        free(af_char);
        return;
    }
    free(af_char);

    if (af_filter->min > 0.0 && af < af_filter->min) {
        vcf_var_add_to_filter(var, af_filter->id_min);
    }
    if (af_filter->max > 0.0 && af > af_filter->max) {
        vcf_var_add_to_filter(var, af_filter->id_max);
    }
}
/* Fixed-threshold indel quality filter.
 *
 * The caller must pass a variant carrying the "INDEL" info key (asserted).
 * With a non-zero indelqual_filter->thresh, a variant whose quality is
 * set (greater than -1) but below the threshold gets
 * indelqual_filter->id added to its filter field.
 */
void apply_indelqual_threshold(var_t *var, indelqual_filter_t *indelqual_filter)
{
    assert(vcf_var_has_info_key(NULL, var, "INDEL"));

    if (indelqual_filter->thresh
        && var->qual > -1
        && var->qual < indelqual_filter->thresh) {
        vcf_var_add_to_filter(var, indelqual_filter->id);
    }
}
/* Returns the integer stored under the uniq_phred_tag info key of var,
 * or 0 if the key is absent (no coverage or other reasons; such a
 * variant is not considered unique anyway).
 */
int uniq_phred_from_var(var_t *var)
{
    char *uq_str = NULL;
    int uq_val = 0;

    if (vcf_var_has_info_key(&uq_str, var, uniq_phred_tag)) {
        uq_val = (int) strtol(uq_str, (char **)NULL, 10);
        free(uq_str);
    }
    return uq_val;
}
/* Coverage (DP) range filter.
 *
 * Active only if dp_filter->min or dp_filter->max is greater than zero.
 * The variant's "DP" info value is parsed as a base-10 integer;
 * dp_filter->id_min is added to the filter field if the value is below a
 * positive minimum and dp_filter->id_max if it is above a positive
 * maximum.
 *
 * A missing DP tag is reported once and sets dp_missing_warning_printed,
 * which turns every later call into a no-op. A conversion error is
 * fatal: the process exits with status 1.
 */
void apply_dp_filter(var_t *var, dp_filter_t *dp_filter)
{
    char *dp_str = NULL;
    int depth;

    if (dp_missing_warning_printed) {
        return;
    }
    if ( ! (dp_filter->min > 0 || dp_filter->max > 0)) {
        return;
    }

    if ( ! vcf_var_has_info_key(&dp_str, var, "DP")) {
        if ( ! dp_missing_warning_printed) {
#ifdef DEBUG
            vcf_file_t f;
            f.fh = stderr;
            f.gz = 0;
            vcf_write_var(&f, var);
#endif
            LOG_WARN("%s\n", "Requested coverage filtering failed since DP tag is missing in variant");
            dp_missing_warning_printed = 1;
            return;
        }
    }

    errno = 0;
    depth = strtol(dp_str, (char **) NULL, 10);
    if (errno) {
        LOG_FATAL("%s\n", "errpr during int conversion");
        exit(1);
    }
    free(dp_str);

    if (dp_filter->min > 0 && depth < dp_filter->min) {
        vcf_var_add_to_filter(var, dp_filter->id_min);
    }
    if (dp_filter->max > 0 && depth > dp_filter->max) {
        vcf_var_add_to_filter(var, dp_filter->id_max);
    }
}
int main_uniq(int argc, char *argv[]) { int c, i; char *bam_file = NULL; char *vcf_in = NULL; /* - == stdout */ char *vcf_out = NULL; /* - == stdout */ mplp_conf_t mplp_conf; uniq_conf_t uniq_conf; void (*plp_proc_func)(const plp_col_t*, void*); int rc = 0; var_t **vars = NULL; int num_vars = 0; char *vcf_header = NULL; static int use_det_lim = 0; static int use_orphan = 0; static int output_all = 0; static int is_somatic = 0; /* default uniq options */ memset(&uniq_conf, 0, sizeof(uniq_conf_t)); uniq_conf.uni_freq = DEFAULT_UNI_FREQ; uniq_conf.use_det_lim = 0; uniq_conf.uniq_filter.mtc_type = MTC_FDR; uniq_conf.uniq_filter.alpha = 0.001; /* default pileup options */ memset(&mplp_conf, 0, sizeof(mplp_conf_t)); mplp_conf.max_mq = DEFAULT_MAX_MQ; mplp_conf.min_mq = 1; mplp_conf.min_plp_bq = DEFAULT_MIN_PLP_BQ; mplp_conf.max_depth = DEFAULT_MAX_PLP_DEPTH; mplp_conf.flag = MPLP_NO_ORPHAN; /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { static struct option long_opts[] = { /* see usage sync */ {"help", no_argument, NULL, 'h'}, {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"use-det-lim", no_argument, &use_det_lim, 1}, {"use-orphan", no_argument, &use_orphan, 1}, {"output-all", no_argument, &output_all, 1}, {"is-somatic", no_argument, &is_somatic, 1}, {"vcf-in", required_argument, NULL, 'v'}, {"vcf-out", required_argument, NULL, 'o'}, {"uni-freq", required_argument, NULL, 'f'}, {"uniq-thresh", required_argument, NULL, 't'}, {"uniq-mtc", required_argument, NULL, 'm'}, {"uniq-alpha", required_argument, NULL, 'a'}, {"uniq-ntests", required_argument, NULL, 'n'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "hv:o:f:t:m:a:n:"; /* getopt_long stores the option index here. 
*/ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& uniq_conf); return 0; case 'v': if (0 != strcmp(optarg, "-")) { if (! file_exists(optarg)) { LOG_FATAL("Input file '%s' does not exist. Exiting...\n", optarg); return 1; } } vcf_in = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); return 1; } } vcf_out = strdup(optarg); break; case 'f': uniq_conf.uni_freq = strtof(optarg, (char **)NULL); /* atof */ if (uniq_conf.uni_freq<=0) { LOG_WARN("%s\n", "Ignoring uni-freq option"); } if (uniq_conf.uni_freq>1.0) { LOG_FATAL("%s\n", "Value for uni-freq has to be <1.0"); return 1; } break; case 't': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } uniq_conf.uniq_filter.thresh = atoi(optarg); uniq_conf.uniq_filter.mtc_type = MTC_NONE; break; case 'm': uniq_conf.uniq_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == uniq_conf.uniq_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'a': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } uniq_conf.uniq_filter.alpha = strtof(optarg, NULL); break; case 'n': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } uniq_conf.uniq_filter.ntests = atol(optarg); break; case '?': LOG_FATAL("%s\n", "unrecognized arguments found. 
Exiting...\n"); return 1; default: break; } } if (use_orphan) { mplp_conf.flag &= ~MPLP_NO_ORPHAN; } if (debug) { dump_mplp_conf(& mplp_conf, stderr); } uniq_conf.output_all = output_all; uniq_conf.use_det_lim = use_det_lim; #if DEBUG LOG_DEBUG("uniq_conf.uniq_filter.thresh = %d\n", uniq_conf.uniq_filter.thresh); LOG_DEBUG("uniq_conf.uniq_filter.mtc_type = %d\n", uniq_conf.uniq_filter.mtc_type); LOG_DEBUG("uniq_conf.uniq_filter.alpha = %f\n", uniq_conf.uniq_filter.alpha); LOG_DEBUG("uniq_conf.uniq_filter.ntests = %d\n", uniq_conf.uniq_filter.ntests); #endif if (uniq_conf.uniq_filter.thresh && uniq_conf.uniq_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed Unique quality threshold *and* multiple testing correction."); return 1; } if (argc == 2) { fprintf(stderr, "\n"); usage(& uniq_conf); return 1; } if (1 != argc - optind - 1) { fprintf(stderr, "Need exactly one BAM file as last argument\n"); return 1; } bam_file = (argv + optind + 1)[0]; if (! file_exists(bam_file)) { LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file); return -1; } if (! vcf_in) { #if 0 vcf_in = malloc(2 * sizeof(char)); strcpy(vcf_in, "-"); #else LOG_FATAL("%s\n", "No input vcf specified. Exiting..."); return -1; #endif } if (! vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } if (vcf_file_open(& uniq_conf.vcf_in, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in); return 1; } if (vcf_file_open(& uniq_conf.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); return 1; } if (0 != vcf_parse_header(&vcf_header, & uniq_conf.vcf_in)) { LOG_WARN("%s\n", "vcf_parse_header() failed. trying to rewind to start..."); if (vcf_file_seek(& uniq_conf.vcf_in, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return 1; } } else { vcf_header_add(&vcf_header, "##INFO=<ID=UNIQ,Number=0,Type=Flag,Description=\"Unique, i.e. 
not detectable in paired sample\">\n"); vcf_header_add(&vcf_header, "##INFO=<ID=UQ,Number=1,Type=Integer,Description=\"Phred-scaled uniq score at this position\">\n"); if (is_somatic) { vcf_header_add(&vcf_header, "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Somatic event\">\n"); } if (! uniq_conf.use_det_lim) { char full_filter_str[FILTER_STRSIZE]; if (uniq_conf.uniq_filter.thresh > 0) { snprintf(uniq_conf.uniq_filter.id, FILTER_ID_STRSIZE, "min_uq_%d", uniq_conf.uniq_filter.thresh); snprintf(full_filter_str, FILTER_STRSIZE, "##FILTER=<ID=%s,Description=\"Minimum Uniq Phred %d\">\n", uniq_conf.uniq_filter.id, uniq_conf.uniq_filter.thresh); vcf_header_add(&vcf_header, full_filter_str); } else if (uniq_conf.uniq_filter.mtc_type != MTC_NONE) { char buf[64]; mtc_str(buf, uniq_conf.uniq_filter.mtc_type); snprintf(uniq_conf.uniq_filter.id, FILTER_ID_STRSIZE, "uq_%s", buf); snprintf(full_filter_str, FILTER_STRSIZE, "##FILTER=<ID=%s,Description=\"Uniq Multiple Testing Correction: %s corr. pvalue < %f\">\n", uniq_conf.uniq_filter.id, buf, uniq_conf.uniq_filter.alpha); vcf_header_add(& vcf_header, full_filter_str); } } vcf_write_header(& uniq_conf.vcf_out, vcf_header); free(vcf_header); } num_vars = vcf_parse_vars(&vars, & uniq_conf.vcf_in, 1); if (0 == num_vars) { LOG_WARN("%s\n", "Didn't find any variants in input"); goto clean_and_exit; } if (! 
uniq_conf.uniq_filter.ntests) { uniq_conf.uniq_filter.ntests = num_vars; } plp_proc_func = &uniq_snv; for (i=0; i<num_vars; i++) { char reg_buf[BUF_SIZE]; if (i%100==0) { LOG_VERBOSE("Processing variant %d of %d\n", i+1, num_vars); } uniq_conf.var = vars[i]; snprintf(reg_buf, BUF_SIZE, "%s:%ld-%ld", vars[i]->chrom, vars[i]->pos+1, vars[i]->pos+1); mplp_conf.reg = strdup(reg_buf); LOG_DEBUG("pileup for var no %d at %s %d\n", i+1, uniq_conf.var->chrom, uniq_conf.var->pos+1); #ifdef DISABLE_INDELS if (vcf_var_has_info_key(NULL, uniq_conf.var, "INDEL")) { LOG_WARN("Skipping indel var at %s %d\n", uniq_conf.var->chrom, uniq_conf.var->pos+1); free(mplp_conf.reg); mplp_conf.reg = NULL; continue; } #endif /* no need to check for filter because done by parse_vars */ rc = mpileup(&mplp_conf, plp_proc_func, (void*)&uniq_conf, 1, (const char **) argv + optind + 1); if (uniq_conf.uniq_filter.thresh) { apply_uniq_threshold(uniq_conf.var, & uniq_conf.uniq_filter); } free(mplp_conf.reg); mplp_conf.reg = NULL; } uniq_conf.var = NULL;/* just be sure to not use it accidentally again */ /* print whatever we've got. there's no UQ to test or we * are supposed to print all */ if (uniq_conf.use_det_lim) { for (i=0; i<num_vars; i++) { var_t *var = vars[i]; vcf_write_var(& uniq_conf.vcf_out, var); } /* all done */ goto clean_and_exit; } if (uniq_conf.uniq_filter.mtc_type != MTC_NONE) { if (apply_uniq_filter_mtc(& uniq_conf.uniq_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on uniq pvalues failed"); return -1; } } for (i=0; i<num_vars; i++) { var_t *var = vars[i]; if (VCF_VAR_PASSES(var) || uniq_conf.output_all) { vcf_write_var(& uniq_conf.vcf_out, var); } } clean_and_exit: vcf_file_close(& uniq_conf.vcf_in); vcf_file_close(& uniq_conf.vcf_out); for (i=0; i<num_vars; i++) { vcf_free_var(& vars[i]); } free(vars); free(vcf_in); free(vcf_out); if (0==rc) { LOG_VERBOSE("%s\n", "Successful exit."); } /* LOG_FIXME("%s\n", "allow user setting of -S and -J. 
Currently just using default") */ return rc; }
/* used as pileup callback function which is not ideal since this can * only work on one position (has to be ensured by caller). * * No cov means I won't be called through mpileup and no output will * be generated. Non-sig pv means I'm not sure and no ouput will be * generated. Only if pv is sig we will print the var * * needs to return void to be used as function pointer to mpileup */ void uniq_snv(const plp_col_t *p, void *confp) { uniq_conf_t *conf = (uniq_conf_t *)confp; char *af_char = NULL; float af; int is_uniq = 0; int is_indel; int coverage; is_indel = vcf_var_is_indel(conf->var); #ifdef DISABLE_INDELS if (is_indel) { LOG_WARN("uniq logic can't be applied to indels." " Skipping indel var at %s %d\n", conf->var->chrom, conf->var->pos+1); return; } #endif if (0 != strcmp(p->target, conf->var->chrom) || p->pos != conf->var->pos) { LOG_ERROR("wrong pileup for var. pileup for %s %d. var for %s %d\n", p->target, p->pos+1, conf->var->chrom, conf->var->pos+1); return; } coverage = p->coverage_plp; if (is_indel) { coverage -= p->num_tails; } if (1 > coverage) { return; } if (conf->uni_freq <= 0.0) { if (! vcf_var_has_info_key(&af_char, conf->var, "AF")) { LOG_FATAL("%s\n", "Couldn't parse AF (key not found) from variant"); /* hard to catch error later */ exit(1); } af = strtof(af_char, (char **)NULL); /* atof */ free(af_char); if (af < 0.0 || af > 1.0) { float new_af; new_af = af<0.0 ? 0.01 : 1.0; /* hard to catch error later */ LOG_FATAL("Invalid (value out of bound) AF %f in variant. Resetting to %f\n", af, new_af); af = new_af; } } else { assert(conf->uni_freq <= 1.0); af = conf->uni_freq; } if (conf->use_det_lim) { /* given the current base counts and their error probs, * would we've been able to detect at given frequency. 
*/ long double pvalues[NUM_NONCONS_BASES]; double *err_probs; /* error probs (qualities) passed down to snpcaller */ int num_err_probs; int alt_bases[NUM_NONCONS_BASES];/* actual alt bases */ int alt_counts[NUM_NONCONS_BASES]; /* counts for alt bases handed down to snpcaller */ int alt_raw_counts[NUM_NONCONS_BASES]; /* raw, unfiltered alt-counts */ varcall_conf_t varcall_conf; int bonf = 1; float alpha = 0.01; init_varcall_conf(&varcall_conf); if (debug) { dump_varcall_conf(&varcall_conf, stderr); } plp_to_errprobs(&err_probs, &num_err_probs, alt_bases, alt_counts, alt_raw_counts, p, &varcall_conf); LOG_DEBUG("at %s:%d with cov %d and num_err_probs %d\n", p->target, p->pos, coverage, num_err_probs); /* Now pretend we see AF(SNV-to-test)*coverage variant * bases. Truncate to int, i.e err on the side of caution * during rounding (assume fewer alt bases) */ alt_counts[0] = af * num_err_probs; /* don't use coverage as that is before filtering */ alt_counts[1] = alt_counts[2] = 0; if (snpcaller(pvalues, err_probs, num_err_probs, alt_counts, bonf, alpha)) { fprintf(stderr, "FATAL: snpcaller() failed at %s:%s():%d\n", __FILE__, __FUNCTION__, __LINE__); free(err_probs); return; } /* only need to test first pv */ if (pvalues[0] * (float)bonf < alpha) { /* significant value means given the counts and * qualities we would have been able to detect this * uncalled SNV had it been present at the given * frequency. But since we didn't this is a uniq * variant. 
* * No point in adding this as phred qual because it * means the opposite of UQ */ vcf_var_add_to_info(conf->var, uniq_flag); } LOG_VERBOSE("%s %d num_quals=%d assumed-var-counts=%d would-have-been-detectable=%d\n", conf->var->chrom, conf->var->pos+1, num_err_probs, alt_counts[0], is_uniq); free(err_probs); } else { int alt_count; double pvalue; char info_str[128]; if (is_indel) { int ref_len = strlen(conf->var->ref); int alt_len = strlen(conf->var->alt); if (ref_len > alt_len) { /* deletion */ char *del_key = malloc((strlen(conf->var->ref)+1)*sizeof(char)); strcpy(del_key, conf->var->ref+1); del_event *it_del = find_del_sequence(&p->del_event_counts, del_key); if (it_del) { alt_count = it_del->count; } else { alt_count = 0; } /* LOG_DEBUG("%s>%s k:%s c:%d\n", conf->var->ref, conf->var->alt, del_key, alt_count); */ free(del_key); } else { /* insertion */ char *ins_key = malloc((strlen(conf->var->alt)+1)*sizeof(char)); strcpy(ins_key, conf->var->alt+1); ins_event *it_ins = find_ins_sequence(&p->ins_event_counts, ins_key); if (it_ins) { alt_count = it_ins->count; } else { alt_count = 0; } /* LOG_DEBUG("%s>%s k:%s c:%d\n", conf->var->ref, conf->var->alt, ins_key, alt_count);*/ free(ins_key); } } else { alt_count = base_count(p, conf->var->alt[0]); } #ifdef DEBUG LOG_DEBUG("Now testing af=%f cov=%d alt_count=%d at %s %d for var:", af, coverage, alt_count, p->target, p->pos+1); #endif /* this is a one sided test */ if (0 != binom(&pvalue, NULL, coverage, alt_count, af)) { LOG_ERROR("%s\n", "binom() failed"); return; } snprintf(info_str, 128, "%s=%d", uniq_phred_tag, PROB_TO_PHREDQUAL_SAFE(pvalue)); vcf_var_add_to_info(conf->var, info_str); LOG_DEBUG("%s %d %s>%s AF=%f | %s (p-value=%g) | BAM alt_count=%d cov=%d (freq=%f)\n", conf->var->chrom, conf->var->pos+1, conf->var->ref, conf->var->alt, af, is_uniq ? "unique" : "not necessarily unique", pvalue, alt_count, coverage, alt_count/(float)coverage); } }
/* Reads all variants from the vcf file vcf_in and stores, per variant,
 * the indel status, variant quality, SB value and the result of
 * alt_mostly_on_one_strand() in (*mtc_quals).
 *
 * mtc_quals allocated here (grown in steps of mtc_qual_incr elements);
 * the caller has to free it. Missing variant qualities are recorded as
 * INT_MAX and missing SB tags as 0; each case is warned about once.
 *
 * Returns the number of variants read or -1 on error. If an allocation
 * fails after the array was created, the array is freed and
 * (*mtc_quals) set to NULL.
 */
long int mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
    long int num_vars = 0;
    long int mtc_qual_size = 0;
    const int mtc_qual_incr = 16384;
    vcf_file_t vcffh;

    if (vcf_file_open(&vcffh, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) {
        LOG_ERROR("Couldn't open %s\n", vcf_in);
        return -1;
    }
    if (0 != vcf_skip_header(&vcffh)) {
        LOG_WARN("%s\n", "vcf_skip_header() failed");
        vcf_file_close(&vcffh);
        return -1;
    }

    mtc_qual_size += mtc_qual_incr;
    (*mtc_quals) = calloc(mtc_qual_size, sizeof(mtc_qual_t));
    if (! (*mtc_quals)) {
        LOG_FATAL("%s\n", "out of memory");
        vcf_file_close(&vcffh);
        return -1;
    }

    while (1) {
        var_t *var;
        int rc;
        int is_indel = 0;
        char *sb_char = NULL;

        vcf_new_var(&var);
        rc = vcf_parse_var(&vcffh, var);
        if (rc) {
            /* how to distinguish between error and EOF? */
            vcf_free_var(&var); /* allocated above but never filled */
            break;
        }
        num_vars += 1;

        /* ingest anything: we keep adding filters */

        if (num_vars > mtc_qual_size) {
            /* go through a temporary so that the array isn't lost if
             * realloc fails */
            mtc_qual_t *grown = realloc((*mtc_quals),
                                        (mtc_qual_size + mtc_qual_incr) * sizeof(mtc_qual_t));
            if (! grown) {
                LOG_FATAL("%s\n", "out of memory");
                vcf_free_var(&var);
                free(*mtc_quals);
                (*mtc_quals) = NULL;
                vcf_file_close(&vcffh);
                return -1;
            }
            /* keep new elements zeroed, as calloc did for the first ones */
            memset(grown + mtc_qual_size, 0, mtc_qual_incr * sizeof(mtc_qual_t));
            (*mtc_quals) = grown;
            mtc_qual_size += mtc_qual_incr;
        }

        is_indel = vcf_var_is_indel(var);
        (*mtc_quals)[num_vars-1].is_indel = is_indel;

        /* variant quality */
        if (var->qual==-1) {
            /* missing qualities to fake value */
            var->qual = INT_MAX;
            if (! varq_missing_warning_printed) {
                LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                varq_missing_warning_printed = 1;
            }
            (*mtc_quals)[num_vars-1].var_qual = INT_MAX;
        } else {
            (*mtc_quals)[num_vars-1].var_qual = var->qual;
        }

        /* strand bias */
        if ( ! vcf_var_has_info_key(&sb_char, var, "SB")) {
            if ( ! sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                sb_missing_warning_printed = 1;
            }
            (*mtc_quals)[num_vars-1].sb_qual = 0;
        } else {
            (*mtc_quals)[num_vars-1].sb_qual = atoi(sb_char);
            free(sb_char);
        }

        (*mtc_quals)[num_vars-1].is_alt_mostly_on_one_strand = alt_mostly_on_one_strand(var);

        vcf_free_var(&var);
    }

    vcf_file_close(&vcffh);

    return num_vars;
}
/* Applies the strand-bias filter with multiple testing correction to
 * the num_vars variants in vars.
 *
 * The SB value of each variant is converted to a probability, the
 * correction selected by sb_filter->mtc_type (Bonferroni,
 * Holm-Bonferroni or FDR) is applied and sb_filter->id is added to the
 * filter field of every variant that is significant, provided that
 * sb_filter->no_compound is set or alt_mostly_on_one_strand() holds
 * for it. Variants without SB tag are ignored, and so are indels unless
 * sb_filter->incl_indels is set.
 *
 * If sb_filter->ntests is 0 it is set to the number of tested variants.
 *
 * returns 0 on success (including "nothing to test") and -1 on error
 *
 * filter everything that's significant
 *
 * very similar to in apply_snvqual_filter_mtc, but reverse logic and
 * looking at all vars
 */
int apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
    double *sb_probs = NULL;
    /* we might ignore some variants (missing values etc). keep track
     * of real indices of kept vars */
    long int *orig_idx = NULL;
    long int num_ign = 0;
    long int num_kept;
    long int i;
    void *shrunk;
    int rc = -1;

    if (0 == num_vars) {
        /* nothing to do; also avoids mistaking a NULL returned by
         * malloc(0) for out of memory */
        return 0;
    }

    /* collect values from vars kept in mem */
    sb_probs = malloc(num_vars * sizeof(double));
    orig_idx = malloc(num_vars * sizeof(long int));
    if ( ! sb_probs || ! orig_idx) {
        LOG_FATAL("%s\n", "out of memory");
        goto cleanup;
    }

    for (i=0; i<num_vars; i++) {
        char *sb_char = NULL;

        /* ignore indels too if sb filter is not to be applied */
        if (! sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
            num_ign += 1;
            continue;
        }

        if ( ! vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
            if ( ! sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                sb_missing_warning_printed = 1;
            }
            num_ign += 1;
            continue;
        }

        sb_probs[i-num_ign] = PHREDQUAL_TO_PROB(atoi(sb_char));
        orig_idx[i-num_ign] = i;
        free(sb_char);
    }

    num_kept = num_vars - num_ign;
    if (num_kept <= 0) {
        rc = 0;
        goto cleanup;
    }

    /* realloc to smaller size apparently not guaranteed to free up
     * space so no point really but let's make sure we don't use that
     * memory. Go through a temporary so that the original block can
     * still be freed if realloc fails */
    shrunk = realloc(sb_probs, num_kept * sizeof(double));
    if (! shrunk) {
        LOG_FATAL("%s\n", "realloc failed. Exiting...");
        goto cleanup;
    }
    sb_probs = shrunk;

    shrunk = realloc(orig_idx, num_kept * sizeof(long int));
    if (! shrunk) {
        LOG_FATAL("%s\n", "realloc failed. Exiting...");
        goto cleanup;
    }
    orig_idx = shrunk;

    if (! sb_filter->ntests) {
        sb_filter->ntests = num_kept;
    } else if (num_kept > sb_filter->ntests) {
        LOG_WARN("%s\n", "Number of variants larger than number of predefined tests for SB filter! Are you sure that makes sense?");
    }

    /* multiple testing correction */
    if (sb_filter->mtc_type == MTC_BONF) {
        bonf_corr(sb_probs, num_kept, sb_filter->ntests);

    } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
        holm_bonf_corr(sb_probs, num_kept, sb_filter->alpha, sb_filter->ntests);

    } else if (sb_filter->mtc_type == MTC_FDR) {
        long int num_rej = 0;
        long int *idx_rej; /* indices of rejected i.e. significant values */

        num_rej = fdr(sb_probs, num_kept, sb_filter->alpha, sb_filter->ntests, &idx_rej);

        /* first pretend none are significant */
        for (i=0; i<num_kept; i++) {
            sb_probs[i] = DBL_MAX;
        }
        LOG_DEBUG("%ld results significant after fdr\n", num_rej);
        /* then mark the rejected ones so that they pass the alpha
         * test below */
        for (i=0; i<num_rej; i++) {
            long int idx = idx_rej[i];
            sb_probs[idx] = -1;
        }
        free(idx_rej);

    } else {
        LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
        goto cleanup;
    }

    for (i=0; i<num_kept; i++) {
        if (sb_probs[i] < sb_filter->alpha) {
            if (sb_filter->no_compound || alt_mostly_on_one_strand(vars[orig_idx[i]])) {
                vcf_var_add_to_filter(vars[orig_idx[i]], sb_filter->id);
            }
        }
    }
    rc = 0;

cleanup:
    free(orig_idx);
    free(sb_probs);
    return rc;
}
/* Applies the indel quality filter with multiple testing correction to
 * the num_vars variants in vars.
 *
 * Only variants that have a quality (qual > -1) and carry the "INDEL"
 * info key are tested. Their qualities are converted to error
 * probabilities, the correction selected by indelqual_filter->mtc_type
 * (Bonferroni, Holm-Bonferroni or FDR) is applied and
 * indelqual_filter->id is added to the filter field of every tested
 * variant that is not significant.
 *
 * If indelqual_filter->ntests is 0 it is set to the number of tested
 * variants.
 *
 * returns 0 on success (including "nothing to test") and -1 on error
 *
 * filter everything that's not significant
 *
 * Very similar to apply_sb_filter_mtc, but reverse testing logic and
 * only looking at non consvars
 */
int apply_indelqual_filter_mtc(indelqual_filter_t *indelqual_filter, var_t **vars, const long int num_vars)
{
    /* can only apply this logic to variants that are not consensus
     * variants, i.e those that actually have a quality. therefore
     * keep track of non cons var indeces */
    long int *orig_idx = NULL; /* of size num_noncons_vars */
    double *noncons_errprobs = NULL;
    long int num_noncons_vars = 0;
    long int i;
    void *shrunk;
    int rc = -1;

    /* FIXME function almost identical to its snv counterpart
     * (presumably apply_snvqual_filter_mtc), just different filter.
     * can be easily merged by accepting both types of variants */

    if (0 == num_vars) {
        /* nothing to do; also avoids mistaking a NULL returned by
         * malloc(0) for out of memory */
        return 0;
    }

    /* collect values from noncons vars only and keep track of their indeces */
    orig_idx = malloc(num_vars * sizeof(long int));
    noncons_errprobs = malloc(num_vars * sizeof(double));
    if ( ! orig_idx || ! noncons_errprobs) {
        LOG_FATAL("%s\n", "out of memory");
        goto cleanup;
    }

    for (i=0; i<num_vars; i++) {
        if (vars[i]->qual>-1 && vcf_var_has_info_key(NULL, vars[i], "INDEL")) {
            noncons_errprobs[num_noncons_vars] = PHREDQUAL_TO_PROB(vars[i]->qual);
            orig_idx[num_noncons_vars] = i;
            num_noncons_vars += 1;
        }
    }
    if (! num_noncons_vars) {
        rc = 0;
        goto cleanup;
    }

    if (indelqual_filter->ntests && num_noncons_vars > indelqual_filter->ntests) {
        LOG_WARN("Number of (non consensus) variants larger than number of predefined tests for indelqual filter (%ld > %ld)! Are you sure that makes sense?\n",
                 num_noncons_vars, indelqual_filter->ntests);
    }

    /* shrink to what is actually used. Go through a temporary so that
     * the original block can still be freed if realloc fails */
    shrunk = realloc(orig_idx, (num_noncons_vars * sizeof(long int)));
    if ( ! shrunk) {
        LOG_FATAL("%s\n", "out of memory");
        goto cleanup;
    }
    orig_idx = shrunk;

    shrunk = realloc(noncons_errprobs, (num_noncons_vars * sizeof(double)));
    if ( ! shrunk) {
        LOG_FATAL("%s\n", "out of memory");
        goto cleanup;
    }
    noncons_errprobs = shrunk;

    /* only now we can set the number of tests (if it wasn't set by
     * caller) */
    if (! indelqual_filter->ntests) {
        indelqual_filter->ntests = num_noncons_vars;
    }

    /* multiple testing correction */
    if (indelqual_filter->mtc_type == MTC_BONF) {
        bonf_corr(noncons_errprobs, num_noncons_vars,
                  indelqual_filter->ntests);

    } else if (indelqual_filter->mtc_type == MTC_HOLMBONF) {
        holm_bonf_corr(noncons_errprobs, num_noncons_vars,
                       indelqual_filter->alpha, indelqual_filter->ntests);

    } else if (indelqual_filter->mtc_type == MTC_FDR) {
        long int num_rej = 0;
        long int *idx_rej; /* indices of rejected i.e. significant values */

        num_rej = fdr(noncons_errprobs, num_noncons_vars,
                      indelqual_filter->alpha, indelqual_filter->ntests,
                      &idx_rej);

        /* first pretend none are significant */
        for (i=0; i<num_noncons_vars; i++) {
            noncons_errprobs[i] = DBL_MAX;
        }
        LOG_DEBUG("%ld results significant after fdr\n", num_rej);
        /* then mark the rejected ones so that they stay below alpha
         * in the test further down */
        for (i=0; i<num_rej; i++) {
            long int idx = idx_rej[i];
            noncons_errprobs[idx] = -1;
        }
        free(idx_rej);

    } else {
        LOG_FATAL("Internal error: unknown MTC type %d\n", indelqual_filter->mtc_type);
        goto cleanup;
    }

    for (i=0; i<num_noncons_vars; i++) {
        if (noncons_errprobs[i] > indelqual_filter->alpha) {
            vcf_var_add_to_filter(vars[orig_idx[i]], indelqual_filter->id);
        }
    }
    rc = 0;

cleanup:
    free(orig_idx);
    free(noncons_errprobs);
    return rc;
}