int main_vcfset(int argc, char *argv[]) { vcfset_conf_t vcfset_conf; char *vcf_header = NULL; int rc = 0; char *vcf_in1, *vcf_in2, *vcf_out; long int num_vars_vcf1; long int num_vars_vcf1_ign, num_vars_out; static int only_passed = 0; static int only_pos = 0; static int only_snvs = 0; static int only_indels = 0; static int count_only = 0; tbx_t *vcf2_tbx = NULL; /* index for second vcf file */ htsFile *vcf2_hts = NULL; char *add_info_field = NULL; int vcf_concat_findex = 0; vcf_in1 = vcf_in2 = vcf_out = NULL; num_vars_vcf1 = 0; num_vars_vcf1_ign = num_vars_out = 0; /* default vcfset options */ memset(&vcfset_conf, 0, sizeof(vcfset_conf_t)); /* vcfset_conf.vcf_in1 = NULL; */ /* vcfset_conf.vcf_in2 = NULL; */ /* vcfset_conf.vcf_out = stdout;*/ /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"help", no_argument, NULL, 'h'}, {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"only-passed", no_argument, &only_passed, 1}, {"only-pos", no_argument, &only_pos, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"count-only", no_argument, &count_only, 1}, {"vcf1", required_argument, NULL, '1'}, {"vcf2", required_argument, NULL, '2'}, {"vcfout", required_argument, NULL, 'o'}, {"action", required_argument, NULL, 'a'}, {"add-info", required_argument, NULL, 'I'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "h1:2:o:a:I:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. 
call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 0; case '1': vcf_in1 = strdup(optarg); break; case '2': vcf_in2 = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); return 1; } } vcf_out = strdup(optarg); break; case 'a': if (0 == strcmp(optarg, "intersect")) { vcfset_conf.vcf_setop = SETOP_INTERSECT; } else if (0 == strcmp(optarg, "complement")) { vcfset_conf.vcf_setop = SETOP_COMPLEMENT; } else if (0 == strcmp(optarg, "concat")) { vcfset_conf.vcf_setop = SETOP_CONCAT; } else { LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } break; case 'I': add_info_field = strdup(optarg); break; case '?': LOG_FATAL("%s\n", "unrecognized arguments found. 
Exiting...\n"); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; default: break; } } vcfset_conf.only_passed = only_passed; vcfset_conf.only_pos = only_pos; vcfset_conf.only_snvs = only_snvs; vcfset_conf.only_indels = only_indels; if (vcfset_conf.only_indels && vcfset_conf.only_snvs) { LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account"); return 1; } if (0 != argc - optind - 1) { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { vcf_concat_findex = optind; } else { LOG_FATAL("%s\n", "Unrecognized arguments found\n"); return 1; } } else { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n", "No extra files for concat given\n"); return 1; } } #if 0 int i; for (i=optind+1; i<argc; i++) { LOG_FIXME("argv[%d]=%s\n", i, argv[i]); } #endif if (argc == 2) { fprintf(stderr, "\n"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) { LOG_FATAL("%s\n", "No set operation specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) { LOG_FATAL("%s\n\n", "At least one vcf input file not specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2) { vcf2_hts = hts_open(vcf_in2, "r"); if (!vcf2_hts) { LOG_FATAL("Couldn't load %s\n", vcf_in2); return 1; } vcf2_tbx = tbx_index_load(vcf_in2); if (!vcf2_tbx) { LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2); return 1; } } /* vcf_out default if 
not set: stdout==- */ if (! vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } if (! count_only) { if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } } /* use meta-data/header of vcf_in1 for output */ LOG_DEBUG("Getting header from %s\n", vcf_in1); if (0 != vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) { LOG_WARN("%s\n", "vcf_parse_header() failed"); if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } else { if (! count_only) { /* vcf_write_header would write *default* header */ vcf_write_header(& vcfset_conf.vcf_out, vcf_header); } free(vcf_header); } /* parse first vcf file */ LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1); while (1) { var_t *var1 = NULL; int rc; int is_indel; kstring_t var2_kstr = {0, 0, 0}; hts_itr_t *var2_itr = NULL; char regbuf[1024]; int var2_match = 0; vcf_new_var(&var1); rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1); if (rc) { free(var1); if (vcfset_conf.vcf_setop != SETOP_CONCAT) { break; } else { vcf_concat_findex++; if (vcf_concat_findex==argc) { break; } /* set vcf1 up anew and simply continue as if nothing happened */ vcf_file_close(& vcfset_conf.vcf_in1); free(vcf_in1); vcf_in1 = strdup(argv[vcf_concat_findex]); LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1); if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) { LOG_WARN("skip header failed for %s\n", vcf_in1); } continue; } } is_indel = vcf_var_is_indel(var1); if (vcfset_conf.only_snvs && is_indel) { free(var1); continue; } else if (vcfset_conf.only_indels && ! is_indel) { free(var1); continue; } if (! 
vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) { LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1"); return -1; } if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) { #ifdef TRACE LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos); #endif num_vars_vcf1_ign += 1; vcf_free_var(& var1); continue; } if (add_info_field) { vcf_var_add_to_info(var1, add_info_field); } num_vars_vcf1 += 1; #ifdef TRACE LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos); #endif if (vcfset_conf.vcf_setop == SETOP_CONCAT) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } vcf_free_var(& var1); /* skip comparison against vcf2 */ continue; } /* use index access to vcf2 */ snprintf(regbuf, 1024, "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1); var2_itr = tbx_itr_querys(vcf2_tbx, regbuf); if (! var2_itr) { var2_match = 0; } else { var2_match = 0; while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) { var_t *var2 = NULL; int var2_is_indel = 0; vcf_new_var(&var2); rc = vcf_parse_var_from_line(var2_kstr.s, var2); /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", var1->pos+1, var1->ref, var1->alt, var2->pos+1, var2->ref, var2->alt, regbuf); */ if (rc) { LOG_FATAL("%s\n", "Error while parsing variant returned from tabix"); return -1; } var2_is_indel = vcf_var_is_indel(var2); /* iterator returns anything overlapping with that * position, i.e. this also includes up/downstream * indels, so make sure actual position matches */ if (var1->pos != var2->pos) { var2_match = 0; } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) { var2_match = 0; } else if (vcfset_conf.only_snvs && var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_indels && ! 
var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_pos) { #ifdef TRACE LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1; } else { if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) { #ifdef TRACE LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */ } } vcf_free_var(&var2); if (var2_match) { break;/* no need to continue */ } } } if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) { /* relative complement : elements in A but not B */ if (!var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) { if (var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else { LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop); return 1; } vcf_free_var(& var1); tbx_itr_destroy(var2_itr); }/* while (1) */ vcf_file_close(& vcfset_conf.vcf_in1); if (vcf_in2) { hts_close(vcf2_hts); tbx_destroy(vcf2_tbx); } LOG_VERBOSE("Parsed %d variants from 1st vcf file (ignoring %d non-passed of those)\n", num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign); LOG_VERBOSE("Wrote %d variants to output\n", num_vars_out); if (! count_only) { vcf_file_close(& vcfset_conf.vcf_out); } if (0==rc) { if (count_only) { printf("%ld\n", num_vars_out); } LOG_VERBOSE("%s\n", "Successful exit."); } free(vcf_in1); free(vcf_in2); free(vcf_out); return rc; }
/* used as pileup callback function which is not ideal since this can * only work on one position (has to be ensured by caller). * * No cov means I won't be called through mpileup and no output will * be generated. Non-sig pv means I'm not sure and no ouput will be * generated. Only if pv is sig we will print the var * * needs to return void to be used as function pointer to mpileup */ void uniq_snv(const plp_col_t *p, void *confp) { uniq_conf_t *conf = (uniq_conf_t *)confp; char *af_char = NULL; float af; int is_uniq = 0; int is_indel; int coverage; is_indel = vcf_var_is_indel(conf->var); #ifdef DISABLE_INDELS if (is_indel) { LOG_WARN("uniq logic can't be applied to indels." " Skipping indel var at %s %d\n", conf->var->chrom, conf->var->pos+1); return; } #endif if (0 != strcmp(p->target, conf->var->chrom) || p->pos != conf->var->pos) { LOG_ERROR("wrong pileup for var. pileup for %s %d. var for %s %d\n", p->target, p->pos+1, conf->var->chrom, conf->var->pos+1); return; } coverage = p->coverage_plp; if (is_indel) { coverage -= p->num_tails; } if (1 > coverage) { return; } if (conf->uni_freq <= 0.0) { if (! vcf_var_has_info_key(&af_char, conf->var, "AF")) { LOG_FATAL("%s\n", "Couldn't parse AF (key not found) from variant"); /* hard to catch error later */ exit(1); } af = strtof(af_char, (char **)NULL); /* atof */ free(af_char); if (af < 0.0 || af > 1.0) { float new_af; new_af = af<0.0 ? 0.01 : 1.0; /* hard to catch error later */ LOG_FATAL("Invalid (value out of bound) AF %f in variant. Resetting to %f\n", af, new_af); af = new_af; } } else { assert(conf->uni_freq <= 1.0); af = conf->uni_freq; } if (conf->use_det_lim) { /* given the current base counts and their error probs, * would we've been able to detect at given frequency. 
*/ long double pvalues[NUM_NONCONS_BASES]; double *err_probs; /* error probs (qualities) passed down to snpcaller */ int num_err_probs; int alt_bases[NUM_NONCONS_BASES];/* actual alt bases */ int alt_counts[NUM_NONCONS_BASES]; /* counts for alt bases handed down to snpcaller */ int alt_raw_counts[NUM_NONCONS_BASES]; /* raw, unfiltered alt-counts */ varcall_conf_t varcall_conf; int bonf = 1; float alpha = 0.01; init_varcall_conf(&varcall_conf); if (debug) { dump_varcall_conf(&varcall_conf, stderr); } plp_to_errprobs(&err_probs, &num_err_probs, alt_bases, alt_counts, alt_raw_counts, p, &varcall_conf); LOG_DEBUG("at %s:%d with cov %d and num_err_probs %d\n", p->target, p->pos, coverage, num_err_probs); /* Now pretend we see AF(SNV-to-test)*coverage variant * bases. Truncate to int, i.e err on the side of caution * during rounding (assume fewer alt bases) */ alt_counts[0] = af * num_err_probs; /* don't use coverage as that is before filtering */ alt_counts[1] = alt_counts[2] = 0; if (snpcaller(pvalues, err_probs, num_err_probs, alt_counts, bonf, alpha)) { fprintf(stderr, "FATAL: snpcaller() failed at %s:%s():%d\n", __FILE__, __FUNCTION__, __LINE__); free(err_probs); return; } /* only need to test first pv */ if (pvalues[0] * (float)bonf < alpha) { /* significant value means given the counts and * qualities we would have been able to detect this * uncalled SNV had it been present at the given * frequency. But since we didn't this is a uniq * variant. 
* * No point in adding this as phred qual because it * means the opposite of UQ */ vcf_var_add_to_info(conf->var, uniq_flag); } LOG_VERBOSE("%s %d num_quals=%d assumed-var-counts=%d would-have-been-detectable=%d\n", conf->var->chrom, conf->var->pos+1, num_err_probs, alt_counts[0], is_uniq); free(err_probs); } else { int alt_count; double pvalue; char info_str[128]; if (is_indel) { int ref_len = strlen(conf->var->ref); int alt_len = strlen(conf->var->alt); if (ref_len > alt_len) { /* deletion */ char *del_key = malloc((strlen(conf->var->ref)+1)*sizeof(char)); strcpy(del_key, conf->var->ref+1); del_event *it_del = find_del_sequence(&p->del_event_counts, del_key); if (it_del) { alt_count = it_del->count; } else { alt_count = 0; } /* LOG_DEBUG("%s>%s k:%s c:%d\n", conf->var->ref, conf->var->alt, del_key, alt_count); */ free(del_key); } else { /* insertion */ char *ins_key = malloc((strlen(conf->var->alt)+1)*sizeof(char)); strcpy(ins_key, conf->var->alt+1); ins_event *it_ins = find_ins_sequence(&p->ins_event_counts, ins_key); if (it_ins) { alt_count = it_ins->count; } else { alt_count = 0; } /* LOG_DEBUG("%s>%s k:%s c:%d\n", conf->var->ref, conf->var->alt, ins_key, alt_count);*/ free(ins_key); } } else { alt_count = base_count(p, conf->var->alt[0]); } #ifdef DEBUG LOG_DEBUG("Now testing af=%f cov=%d alt_count=%d at %s %d for var:", af, coverage, alt_count, p->target, p->pos+1); #endif /* this is a one sided test */ if (0 != binom(&pvalue, NULL, coverage, alt_count, af)) { LOG_ERROR("%s\n", "binom() failed"); return; } snprintf(info_str, 128, "%s=%d", uniq_phred_tag, PROB_TO_PHREDQUAL_SAFE(pvalue)); vcf_var_add_to_info(conf->var, info_str); LOG_DEBUG("%s %d %s>%s AF=%f | %s (p-value=%g) | BAM alt_count=%d cov=%d (freq=%f)\n", conf->var->chrom, conf->var->pos+1, conf->var->ref, conf->var->alt, af, is_uniq ? "unique" : "not necessarily unique", pvalue, alt_count, coverage, alt_count/(float)coverage); } }
int main_filter(int argc, char *argv[]) { filter_conf_t cfg; char *vcf_in = NULL, *vcf_out = NULL; static int print_only_passed = 1; static int sb_filter_no_compound = 0; static int sb_filter_incl_indels = 0; static int only_indels = 0; static int only_snvs = 0; char *vcf_header = NULL; mtc_qual_t *mtc_quals = NULL; long int num_vars; static int no_defaults = 0; long int var_idx = -1; /* default filter options */ memset(&cfg, 0, sizeof(filter_conf_t)); cfg.dp_filter.min = cfg.dp_filter.max = -1; cfg.af_filter.min = cfg.af_filter.max = -1; cfg.sb_filter.alpha = DEFAULT_SIG; cfg.snvqual_filter.alpha = DEFAULT_SIG; cfg.indelqual_filter.alpha = DEFAULT_SIG; /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"print-all", no_argument, &print_only_passed, 0}, {"no-defaults", no_argument, &no_defaults, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"help", no_argument, NULL, 'h'}, {"in", required_argument, NULL, 'i'}, {"out", required_argument, NULL, 'o'}, {"cov-min", required_argument, NULL, 'v'}, {"cov-max", required_argument, NULL, 'V'}, {"af-min", required_argument, NULL, 'a'}, {"af-max", required_argument, NULL, 'A'}, {"sb-thresh", required_argument, NULL, 'B'}, {"sb-mtc", required_argument, NULL, 'b'}, {"sb-alpha", required_argument, NULL, 'c'}, {"sb-no-compound", no_argument, &sb_filter_no_compound, 1}, {"sb-incl-indels", no_argument, &sb_filter_incl_indels, 1}, {"snvqual-thresh", required_argument, NULL, 'Q'}, {"snvqual-mtc", required_argument, NULL, 'q'}, {"snvqual-alpha", required_argument, NULL, 'r'}, {"snvqual-ntests", required_argument, NULL, 's'}, {"indelqual-thresh", required_argument, NULL, 'K'}, {"indelqual-mtc", required_argument, NULL, 'k'}, 
{"indelqual-alpha", required_argument, NULL, 'l'}, {"indelqual-ntests", required_argument, NULL, 'm'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "hi:o:v:V:a:A:B:b:c:Q:q:r:s:K:k:l:m:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& cfg); return 0; case 'i': vcf_in = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); return 1; } } vcf_out = strdup(optarg); break; case 'v': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.min = atoi(optarg); break; case 'V': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.max = atoi(optarg); break; case 'a': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.min = strtof(optarg, NULL); break; case 'A': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.max = strtof(optarg, NULL); break; case 'B': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.thresh = atoi(optarg); break; case 'b': cfg.sb_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.sb_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for strandbias filtering\n", optarg); return -1; } break; case 'c': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.alpha = strtof(optarg, NULL); break; case 'Q': if (! 
isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.thresh = atoi(optarg); break; case 'q': cfg.snvqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.snvqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'r': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.alpha = strtof(optarg, NULL); break; case 's': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.ntests = atol(optarg); break; case 'K': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.thresh = atoi(optarg); break; case 'k': cfg.indelqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.indelqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'l': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.alpha = strtof(optarg, NULL); break; case 'm': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.ntests = atol(optarg); break; case '?': LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; default: break; } } cfg.print_only_passed = print_only_passed; cfg.only_indels = only_indels; cfg.only_snvs = only_snvs; cfg.sb_filter.no_compound = sb_filter_no_compound; cfg.sb_filter.incl_indels = sb_filter_incl_indels; if (cfg.only_indels && cfg.only_snvs) { LOG_FATAL("%s\n", "Can't keep only indels and only snvs"); return 1; } if (! no_defaults) { if (cfg.sb_filter.mtc_type==MTC_NONE && ! 
cfg.sb_filter.thresh) { LOG_VERBOSE("%s\n", "Setting default SB filtering method to FDR"); cfg.sb_filter.mtc_type = MTC_FDR; cfg.sb_filter.alpha = 0.001; } if (cfg.dp_filter.min<0) { cfg.dp_filter.min = 10; LOG_VERBOSE("Setting default minimum coverage to %d\n", cfg.dp_filter.min); } } else { LOG_VERBOSE("%s\n", "Skipping default settings"); } if (0 != argc - optind - 1) {/* FIXME needed at all? */ LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; } /* logic check of command line parameters */ if (cfg.dp_filter.max > 0 && cfg.dp_filter.max < cfg.dp_filter.min) { LOG_FATAL("%s\n", "Invalid coverage-filter settings"); return 1; } if ((cfg.af_filter.max > 0 && cfg.af_filter.max < cfg.af_filter.min) || (cfg.af_filter.max > 1.0)) { LOG_FATAL("%s\n", "Invalid AF-filter settings"); return 1; } if (cfg.sb_filter.thresh && cfg.sb_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed strand-bias threshold *and* multiple testing correction."); return 1; } if (cfg.snvqual_filter.thresh && cfg.snvqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed SNV quality threshold *and* multiple testing correction."); return 1; } if (cfg.indelqual_filter.thresh && cfg.indelqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed indel quality threshold *and* multiple testing correction."); return 1; } if (argc == 2) { fprintf(stderr, "\n"); usage(& cfg); return 1; } if (debug) { dump_filter_conf(& cfg); } /* missing file args default to stdin and stdout */ /* no streaming allowed for vcf_in: we need to determine thresholds first */ if (! vcf_in) { LOG_FATAL("%s\n", "Input VCF missing. No streaming allowed. Need to determine auto threshold in memory friendly manner first."); return 1; } if (! 
vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } LOG_DEBUG("vcf_in=%s vcf_out=%s\n", vcf_in, vcf_out); /* First pass parsing to get qualities for MTC computation (if needed) */ if (cfg.sb_filter.mtc_type != MTC_NONE || cfg.snvqual_filter.mtc_type != MTC_NONE || cfg.indelqual_filter.mtc_type != MTC_NONE) { #ifdef TRACE long int i = 0; #endif LOG_VERBOSE("%s\n", "At least one type of multiple testing correction requested. Doing first pass of vcf"); if ((num_vars = mtc_quals_from_vcf_file(& mtc_quals, vcf_in)) < 0) { LOG_ERROR("Couldn't parse %s\n", vcf_in); return 1; } if (cfg.sb_filter.mtc_type != MTC_NONE) { if (apply_sb_filter_mtc(mtc_quals, & cfg.sb_filter, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on strand-bias pvalues failed"); return -1; } } if (cfg.indelqual_filter.mtc_type != MTC_NONE) { if (apply_indelqual_filter_mtc(mtc_quals, & cfg.indelqual_filter, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on indel quality pvalues failed"); return -1; } } if (cfg.snvqual_filter.mtc_type != MTC_NONE) { if (apply_snvqual_filter_mtc(mtc_quals, & cfg.snvqual_filter, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on SNV quality pvalues failed"); return -1; } } #ifdef TRACE for (i=0; i<num_vars; i++) { LOG_WARN("mtc_quals #%ld sb_qual=%d var_qual=%d is_indel=%d\n", i, mtc_quals[i].sb_qual, mtc_quals[i].var_qual, mtc_quals[i].is_indel); } #endif LOG_VERBOSE("%s\n", "MTC application completed"); } else { LOG_VERBOSE("%s\n", "No multiple testing correction requested. 
First pass of vcf skipped"); } if (vcf_file_open(& cfg.vcf_in, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in); return 1; } if (vcf_file_open(& cfg.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); return 1; } free(vcf_in); free(vcf_out); /* print header */ if (0 != vcf_parse_header(&vcf_header, & cfg.vcf_in)) { /* LOG_WARN("%s\n", "vcf_parse_header() failed"); */ if (vcf_file_seek(& cfg.vcf_in, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } /* also sets filter names */ cfg_filter_to_vcf_header(& cfg, &vcf_header); vcf_write_header(& cfg.vcf_out, vcf_header); free(vcf_header); /* read in variants */ while (1) { var_t *var; int rc; int is_indel = 0; vcf_new_var(&var); rc = vcf_parse_var(& cfg.vcf_in, var); if (rc) { /* how to distinguish between error and EOF? */ break; } var_idx += 1; is_indel = vcf_var_is_indel(var); if (cfg.only_snvs && is_indel) { vcf_free_var(&var); continue; } else if (cfg.only_indels && ! is_indel) { vcf_free_var(&var); continue; } /* filters applying to all types of variants */ apply_af_filter(var, & cfg.af_filter); apply_dp_filter(var, & cfg.dp_filter); /* quality threshold per variant type */ if (! is_indel) { if (cfg.snvqual_filter.thresh) { assert(cfg.snvqual_filter.mtc_type == MTC_NONE); apply_snvqual_threshold(var, & cfg.snvqual_filter); } else if (cfg.snvqual_filter.mtc_type != MTC_NONE) { if (mtc_quals[var_idx].var_qual != -1) { vcf_var_add_to_filter(var, cfg.snvqual_filter.id); } } } else { if (cfg.indelqual_filter.thresh) { assert(cfg.indelqual_filter.mtc_type == MTC_NONE); apply_indelqual_threshold(var, & cfg.indelqual_filter); } else if (cfg.indelqual_filter.mtc_type != MTC_NONE) { if (mtc_quals[var_idx].var_qual != -1) { vcf_var_add_to_filter(var, cfg.indelqual_filter.id); } } } /* sb filter */ if (cfg.sb_filter.thresh) { if (! 
is_indel || cfg.sb_filter.incl_indels) { assert(cfg.sb_filter.mtc_type == MTC_NONE); apply_sb_threshold(var, & cfg.sb_filter); } } else if (cfg.sb_filter.mtc_type != MTC_NONE) { if (! is_indel || cfg.sb_filter.incl_indels) { if (mtc_quals[var_idx].sb_qual == -1) { vcf_var_add_to_filter(var, cfg.sb_filter.id); } } } /* output */ if (cfg.print_only_passed && ! (VCF_VAR_PASSES(var))) { vcf_free_var(&var); continue; } /* add pass if no filters were set */ if (! var->filter || strlen(var->filter)<=1) { char pass_str[] = "PASS"; if (var->filter) { free(var->filter); } var->filter = strdup(pass_str); } vcf_write_var(& cfg.vcf_out, var); vcf_free_var(&var); if (var_idx%1000==0) { (void) vcf_file_flush(& cfg.vcf_out); } } vcf_file_close(& cfg.vcf_in); vcf_file_close(& cfg.vcf_out); free(mtc_quals); LOG_VERBOSE("%s\n", "Successful exit."); return 0; }
/* Reads all variants of vcf_in and stores, per variant, the values needed
 * for multiple testing correction: indel status, variant quality,
 * strand-bias quality (SB INFO key) and whether the alt allele is mostly on
 * one strand.
 *
 * mtc_quals  out: array allocated here, one entry per variant in file
 *            order. Ownership passes to the caller, who must free() it.
 *            Set to NULL on error.
 * vcf_in     path of the VCF file to read
 *
 * A missing variant quality (-1) is stored as INT_MAX and a missing SB tag
 * as 0; each case triggers a warning once.
 *
 * Returns the number of variants read, or -1 on error.
 */
long int mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
    long int num_vars = 0;
    long int mtc_qual_size = 0;
    const long int mtc_qual_incr = 16384; /* array grows in chunks of this many entries */
    vcf_file_t vcffh;

    *mtc_quals = NULL;

    if (vcf_file_open(&vcffh, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) {
        LOG_ERROR("Couldn't open %s\n", vcf_in);
        return -1;
    }

    if (0 != vcf_skip_header(&vcffh)) {
        LOG_WARN("%s\n", "vcf_skip_header() failed");
        goto fail;
    }

    mtc_qual_size = mtc_qual_incr;
    *mtc_quals = calloc(mtc_qual_size, sizeof **mtc_quals);
    if (!*mtc_quals) {
        LOG_FATAL("%s\n", "Memory allocation failed");
        goto fail;
    }

    while (1) {
        var_t *var;
        int rc;
        int is_indel;
        char *sb_char = NULL;
        mtc_qual_t *cur;

        vcf_new_var(&var);
        rc = vcf_parse_var(&vcffh, var);
        if (rc) {
            /* how to distinguish between error and EOF? */
            /* nothing usable was parsed: release just the struct */
            free(var);
            break;
        }
        num_vars += 1;
        /* ingest anything: we keep adding filters */

        if (num_vars > mtc_qual_size) {
            /* grow through a temporary so the old block is not lost on
             * failure, and zero the new tail like the initial calloc() */
            mtc_qual_t *grown = realloc(*mtc_quals,
                                        (mtc_qual_size + mtc_qual_incr) * sizeof **mtc_quals);
            if (!grown) {
                LOG_FATAL("%s\n", "Memory allocation failed");
                vcf_free_var(&var);
                goto fail;
            }
            memset(grown + mtc_qual_size, 0, mtc_qual_incr * sizeof *grown);
            *mtc_quals = grown;
            mtc_qual_size += mtc_qual_incr;
        }
        cur = &(*mtc_quals)[num_vars-1];

        is_indel = vcf_var_is_indel(var);
        cur->is_indel = is_indel;

        /* variant quality */
        if (var->qual == -1) {
            /* missing qualities to fake value */
            var->qual = INT_MAX;
            if (!varq_missing_warning_printed) {
                LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                varq_missing_warning_printed = 1;
            }
            cur->var_qual = INT_MAX;
        } else {
            cur->var_qual = var->qual;
        }

        /* strand bias */
        if (!vcf_var_has_info_key(&sb_char, var, "SB")) {
            if (!sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                sb_missing_warning_printed = 1;
            }
            cur->sb_qual = 0;
        } else {
            cur->sb_qual = atoi(sb_char);
            free(sb_char);
        }

        cur->is_alt_mostly_on_one_strand = alt_mostly_on_one_strand(var);

        vcf_free_var(&var);
    }

    vcf_file_close(&vcffh);
    return num_vars;

fail:
    free(*mtc_quals);
    *mtc_quals = NULL;
    vcf_file_close(&vcffh);
    return -1;
}
int main_filter(int argc, char *argv[]) { filter_conf_t cfg; char *vcf_in = NULL, *vcf_out = NULL; static int print_only_passed = 1; static int sb_filter_no_compound = 0; static int sb_filter_incl_indels = 0; static int only_indels = 0; static int only_snvs = 0; char *vcf_header = NULL; var_t **vars = NULL; long int num_vars = 0; /* isn't long overkill here ? */ long int vars_size = 0; /* keeping track of how much memory we've got pre-allocated */ long int i; static int no_defaults = 0; /* default filter options */ memset(&cfg, 0, sizeof(filter_conf_t)); cfg.dp_filter.min = cfg.dp_filter.max = -1; cfg.af_filter.min = cfg.af_filter.max = -1; cfg.sb_filter.alpha = DEFAULT_SIG; cfg.snvqual_filter.alpha = DEFAULT_SIG; cfg.indelqual_filter.alpha = DEFAULT_SIG; /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"print-all", no_argument, &print_only_passed, 0}, {"no-defaults", no_argument, &no_defaults, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"help", no_argument, NULL, 'h'}, {"in", required_argument, NULL, 'i'}, {"out", required_argument, NULL, 'o'}, {"cov-min", required_argument, NULL, 'v'}, {"cov-max", required_argument, NULL, 'V'}, {"af-min", required_argument, NULL, 'a'}, {"af-max", required_argument, NULL, 'A'}, {"sb-thresh", required_argument, NULL, 'B'}, {"sb-mtc", required_argument, NULL, 'b'}, {"sb-alpha", required_argument, NULL, 'c'}, {"sb-no-compound", no_argument, &sb_filter_no_compound, 1}, {"sb-incl-indels", no_argument, &sb_filter_incl_indels, 1}, {"snvqual-thresh", required_argument, NULL, 'Q'}, {"snvqual-mtc", required_argument, NULL, 'q'}, {"snvqual-alpha", required_argument, NULL, 'r'}, {"snvqual-ntests", required_argument, NULL, 
's'}, {"indelqual-thresh", required_argument, NULL, 'K'}, {"indelqual-mtc", required_argument, NULL, 'k'}, {"indelqual-alpha", required_argument, NULL, 'l'}, {"indelqual-ntests", required_argument, NULL, 'm'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "hi:o:v:V:a:A:B:b:c:Q:q:r:s:K:k:l:m:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& cfg); return 0; case 'i': vcf_in = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); return 1; } } vcf_out = strdup(optarg); break; case 'v': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.min = atoi(optarg); break; case 'V': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.max = atoi(optarg); break; case 'a': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.min = strtof(optarg, NULL); break; case 'A': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.max = strtof(optarg, NULL); break; case 'B': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.thresh = atoi(optarg); break; case 'b': cfg.sb_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.sb_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for strandbias filtering\n", optarg); return -1; } break; case 'c': if (! 
isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.alpha = strtof(optarg, NULL); break; case 'Q': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.thresh = atoi(optarg); break; case 'q': cfg.snvqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.snvqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'r': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.alpha = strtof(optarg, NULL); break; case 's': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.ntests = atol(optarg); break; case 'K': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.thresh = atoi(optarg); break; case 'k': cfg.indelqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.indelqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'l': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.alpha = strtof(optarg, NULL); break; case 'm': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.ntests = atol(optarg); break; case '?': LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; default: break; } } cfg.print_only_passed = print_only_passed; cfg.only_indels = only_indels; cfg.only_snvs = only_snvs; cfg.sb_filter.no_compound = sb_filter_no_compound; cfg.sb_filter.incl_indels = sb_filter_incl_indels; if (cfg.only_indels && cfg.only_snvs) { LOG_FATAL("%s\n", "Can't keep only indels and only snvs"); return 1; } if (! 
no_defaults) { if (cfg.sb_filter.mtc_type==MTC_NONE && ! cfg.sb_filter.thresh) { LOG_VERBOSE("%s\n", "Setting default SB filtering method to FDR"); cfg.sb_filter.mtc_type = MTC_FDR; cfg.sb_filter.alpha = 0.001; } if (cfg.dp_filter.min<0) { cfg.dp_filter.min = 10; LOG_VERBOSE("Setting default minimum coverage to %d\n", cfg.dp_filter.min); } } else { LOG_VERBOSE("%s\n", "Skipping default settings"); } if (0 != argc - optind - 1) {/* FIXME needed at all? */ LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; } /* logic check of command line parameters */ if (cfg.dp_filter.max > 0 && cfg.dp_filter.max < cfg.dp_filter.min) { LOG_FATAL("%s\n", "Invalid coverage-filter settings"); return 1; } if ((cfg.af_filter.max > 0 && cfg.af_filter.max < cfg.af_filter.min) || (cfg.af_filter.max > 1.0)) { LOG_FATAL("%s\n", "Invalid AF-filter settings"); return 1; } if (cfg.sb_filter.thresh && cfg.sb_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed strand-bias threshold *and* multiple testing correction."); return 1; } if (cfg.snvqual_filter.thresh && cfg.snvqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed SNV quality threshold *and* multiple testing correction."); return 1; } if (cfg.indelqual_filter.thresh && cfg.indelqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed indel quality threshold *and* multiple testing correction."); return 1; } if (argc == 2) { fprintf(stderr, "\n"); usage(& cfg); return 1; } if (debug) { dump_filter_conf(& cfg); } /* missing file args default to stdin and stdout */ if (! vcf_in) { vcf_in = malloc(2 * sizeof(char)); strcpy(vcf_in, "-"); } if (! 
vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } LOG_DEBUG("vcf_in=%s vcf_out=%s\n", vcf_in, vcf_out); /* open vcf files */ if (vcf_file_open(& cfg.vcf_in, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in); return 1; } if (vcf_file_open(& cfg.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); return 1; } free(vcf_in); free(vcf_out); /* FIXME everything below here should go into a function with args: - cfg - ...what else? */ /* print header */ if (0 != vcf_parse_header(&vcf_header, & cfg.vcf_in)) { /* LOG_WARN("%s\n", "vcf_parse_header() failed"); */ if (vcf_file_seek(& cfg.vcf_in, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } /* also sets filter names */ cfg_filter_to_vcf_header(& cfg, &vcf_header); vcf_write_header(& cfg.vcf_out, vcf_header); free(vcf_header); /* read in variants. since many filters perform multiple testing * correction and therefore need to look at all variants we keep * it simple and load them all into memory. * * in theory we could apply all 'simple' filters directly within * the loop here and depending on the result spit the variant out * or not. only complex filters need to see all variants first to, * e.g. apply multiple testing. */ num_vars = 0; while (1) { var_t *var; int rc; int is_indel = 0; vcf_new_var(&var); rc = vcf_parse_var(& cfg.vcf_in, var); if (rc) { /* how to distinguish between error and EOF? */ free(var); break; } is_indel = vcf_var_is_indel(var); if (cfg.only_snvs && is_indel) { free(var); continue; } else if (cfg.only_indels && ! is_indel) { free(var); continue; } /* read all in, no matter if already filtered. 
we keep adding filters */ num_vars +=1; if (num_vars >= vars_size) { const long incr = 128; vars = realloc(vars, (vars_size+incr) * sizeof(var_t*)); vars_size += incr; } vars[num_vars-1] = var; #ifdef TRACE { char *key; vcf_var_key(&key, vars[num_vars-1]); fprintf(stderr, "storing var %ld+1: %s\n", num_vars, key); free(key); } #endif /* filters applying to all types of variants */ apply_af_filter(var, & cfg.af_filter); apply_dp_filter(var, & cfg.dp_filter); /* quality threshold per variant type */ if (! is_indel) { if (cfg.snvqual_filter.thresh) { assert(cfg.snvqual_filter.mtc_type == MTC_NONE); apply_snvqual_threshold(var, & cfg.snvqual_filter); } } else { if (cfg.indelqual_filter.thresh) { assert(cfg.indelqual_filter.mtc_type == MTC_NONE); apply_indelqual_threshold(var, & cfg.indelqual_filter); } } if (cfg.sb_filter.thresh) { if (! is_indel || cfg.sb_filter.incl_indels) { assert(cfg.sb_filter.mtc_type == MTC_NONE); apply_sb_threshold(var, & cfg.sb_filter); } } } if (num_vars) { vars = realloc(vars, (num_vars * sizeof(var_t*))); } vcf_file_close(& cfg.vcf_in); LOG_VERBOSE("Parsed %ld variants\n", num_vars); if (cfg.sb_filter.mtc_type != MTC_NONE) { if (apply_sb_filter_mtc(& cfg.sb_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on strand-bias pvalues failed"); return -1; } } if (cfg.snvqual_filter.mtc_type != MTC_NONE) { if (apply_snvqual_filter_mtc(& cfg.snvqual_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on SNV qualities failed"); return -1; } } if (cfg.indelqual_filter.mtc_type != MTC_NONE) { if (apply_indelqual_filter_mtc(& cfg.indelqual_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on Indel qualities failed"); return -1; } } /* output */ for (i=0; i<num_vars; i++) { var_t *v = vars[i]; if (cfg.print_only_passed && ! (VCF_VAR_PASSES(v))) { continue; } /* add pass if no filters were set */ if (! 
v->filter || strlen(v->filter)<=1) { char pass_str[] = "PASS"; if (v->filter) { free(v->filter); } v->filter = strdup(pass_str); } vcf_write_var(& cfg.vcf_out, v); } vcf_file_close(& cfg.vcf_out); for (i=0; i<num_vars; i++) { vcf_free_var(& vars[i]); } free(vars); LOG_VERBOSE("%s\n", "Successful exit."); return 0; }
/* Applies multiple testing correction to the strand-bias (SB) values
 * of vars and adds sb_filter->id to the filter field of every variant
 * whose corrected value is below sb_filter->alpha (and, unless
 * sb_filter->no_compound is set, for which alt_mostly_on_one_strand()
 * holds as well).
 *
 * Variants without SB tag are ignored, as are indels unless
 * sb_filter->incl_indels is set. If sb_filter->ntests is 0 it is set
 * to the number of variants actually tested.
 *
 * Returns 0 on success (including the case that there was nothing to
 * test) and -1 on error.
 *
 * filter everything that's significant
 *
 * very similar to in apply_snvqual_filter_mtc, but reverse logic and
 * looking at all vars
 */
int
apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
    double *sb_probs = NULL;
    /* we might ignore some variants (missing values etc). keep track
     * of real indices of kept vars */
    long int *orig_idx = NULL;
    long int num_kept = 0; /* number of entries used in sb_probs/orig_idx */
    long int i;
    int rc = -1;

    /* nothing to do. also avoids malloc(0), which may legitimately
     * return NULL and would then be mistaken for out of memory */
    if (num_vars <= 0) {
        return 0;
    }

    /* collect values from vars kept in mem */
    sb_probs = malloc(num_vars * sizeof(double));
    orig_idx = malloc(num_vars * sizeof(long int));
    if (! sb_probs || ! orig_idx) {
        LOG_FATAL("%s\n", "out of memory");
        goto cleanup;
    }

    for (i=0; i<num_vars; i++) {
        char *sb_char = NULL;

        /* ignore indels too if sb filter is not to be applied */
        if (! sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
            continue;
        }

        if (! vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
            if (! sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                sb_missing_warning_printed = 1;
            }
            continue;
        }

        sb_probs[num_kept] = PHREDQUAL_TO_PROB(atoi(sb_char));
        orig_idx[num_kept] = i;
        num_kept += 1;
        free(sb_char);
    }

    if (num_kept <= 0) {
        rc = 0;
        goto cleanup;
    }

    /* the buffers are deliberately not shrunk to num_kept elements:
     * that gains nothing and only adds a failure path. only the first
     * num_kept elements are used from here on */

    if (! sb_filter->ntests) {
        sb_filter->ntests = num_kept;
    } else if (num_kept > sb_filter->ntests) {
        /* fewer predefined tests than values to be corrected */
        LOG_WARN("%s\n", "Number of predefined tests for SB filter smaller than number of variants! Are you sure that makes sense?");
    }

    /* multiple testing correction */
    if (sb_filter->mtc_type == MTC_BONF) {
        bonf_corr(sb_probs, num_kept, sb_filter->ntests);

    } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
        holm_bonf_corr(sb_probs, num_kept, sb_filter->alpha, sb_filter->ntests);

    } else if (sb_filter->mtc_type == MTC_FDR) {
        long int num_rej = 0;
        /* indices of rejected i.e. significant values. initialised so
         * that the free() below is safe even if fdr() leaves it alone */
        long int *idx_rej = NULL;

        num_rej = fdr(sb_probs, num_kept, sb_filter->alpha, sb_filter->ntests, &idx_rej);

        /* first pretend none are significant */
        for (i=0; i<num_kept; i++) {
            sb_probs[i] = DBL_MAX;
        }
        LOG_DEBUG("%ld results significant after fdr\n", num_rej);
        /* then mark the rejected ones so that they end up below alpha */
        for (i=0; i<num_rej; i++) {
            long int idx = idx_rej[i];
            sb_probs[idx] = -1;
        }
        free(idx_rej);

    } else {
        LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
        goto cleanup;
    }

    for (i=0; i<num_kept; i++) {
        if (sb_probs[i] < sb_filter->alpha) {
            if (sb_filter->no_compound || alt_mostly_on_one_strand(vars[orig_idx[i]])) {
                vcf_var_add_to_filter(vars[orig_idx[i]], sb_filter->id);
            }
        }
    }
    rc = 0;

cleanup:
    free(orig_idx);
    free(sb_probs);

    return rc;
}