int main_vcfset(int argc, char *argv[]) { vcfset_conf_t vcfset_conf; char *vcf_header = NULL; int rc = 0; char *vcf_in1, *vcf_in2, *vcf_out; long int num_vars_vcf1; long int num_vars_vcf1_ign, num_vars_out; static int only_passed = 0; static int only_pos = 0; static int only_snvs = 0; static int only_indels = 0; static int count_only = 0; tbx_t *vcf2_tbx = NULL; /* index for second vcf file */ htsFile *vcf2_hts = NULL; char *add_info_field = NULL; int vcf_concat_findex = 0; vcf_in1 = vcf_in2 = vcf_out = NULL; num_vars_vcf1 = 0; num_vars_vcf1_ign = num_vars_out = 0; /* default vcfset options */ memset(&vcfset_conf, 0, sizeof(vcfset_conf_t)); /* vcfset_conf.vcf_in1 = NULL; */ /* vcfset_conf.vcf_in2 = NULL; */ /* vcfset_conf.vcf_out = stdout;*/ /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"help", no_argument, NULL, 'h'}, {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"only-passed", no_argument, &only_passed, 1}, {"only-pos", no_argument, &only_pos, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"count-only", no_argument, &count_only, 1}, {"vcf1", required_argument, NULL, '1'}, {"vcf2", required_argument, NULL, '2'}, {"vcfout", required_argument, NULL, 'o'}, {"action", required_argument, NULL, 'a'}, {"add-info", required_argument, NULL, 'I'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "h1:2:o:a:I:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. 
call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 0; case '1': vcf_in1 = strdup(optarg); break; case '2': vcf_in2 = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); return 1; } } vcf_out = strdup(optarg); break; case 'a': if (0 == strcmp(optarg, "intersect")) { vcfset_conf.vcf_setop = SETOP_INTERSECT; } else if (0 == strcmp(optarg, "complement")) { vcfset_conf.vcf_setop = SETOP_COMPLEMENT; } else if (0 == strcmp(optarg, "concat")) { vcfset_conf.vcf_setop = SETOP_CONCAT; } else { LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } break; case 'I': add_info_field = strdup(optarg); break; case '?': LOG_FATAL("%s\n", "unrecognized arguments found. 
Exiting...\n"); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; default: break; } } vcfset_conf.only_passed = only_passed; vcfset_conf.only_pos = only_pos; vcfset_conf.only_snvs = only_snvs; vcfset_conf.only_indels = only_indels; if (vcfset_conf.only_indels && vcfset_conf.only_snvs) { LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account"); return 1; } if (0 != argc - optind - 1) { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { vcf_concat_findex = optind; } else { LOG_FATAL("%s\n", "Unrecognized arguments found\n"); return 1; } } else { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n", "No extra files for concat given\n"); return 1; } } #if 0 int i; for (i=optind+1; i<argc; i++) { LOG_FIXME("argv[%d]=%s\n", i, argv[i]); } #endif if (argc == 2) { fprintf(stderr, "\n"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) { LOG_FATAL("%s\n", "No set operation specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) { LOG_FATAL("%s\n\n", "At least one vcf input file not specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2) { vcf2_hts = hts_open(vcf_in2, "r"); if (!vcf2_hts) { LOG_FATAL("Couldn't load %s\n", vcf_in2); return 1; } vcf2_tbx = tbx_index_load(vcf_in2); if (!vcf2_tbx) { LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2); return 1; } } /* vcf_out default if 
not set: stdout==- */ if (! vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } if (! count_only) { if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } } /* use meta-data/header of vcf_in1 for output */ LOG_DEBUG("Getting header from %s\n", vcf_in1); if (0 != vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) { LOG_WARN("%s\n", "vcf_parse_header() failed"); if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } else { if (! count_only) { /* vcf_write_header would write *default* header */ vcf_write_header(& vcfset_conf.vcf_out, vcf_header); } free(vcf_header); } /* parse first vcf file */ LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1); while (1) { var_t *var1 = NULL; int rc; int is_indel; kstring_t var2_kstr = {0, 0, 0}; hts_itr_t *var2_itr = NULL; char regbuf[1024]; int var2_match = 0; vcf_new_var(&var1); rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1); if (rc) { free(var1); if (vcfset_conf.vcf_setop != SETOP_CONCAT) { break; } else { vcf_concat_findex++; if (vcf_concat_findex==argc) { break; } /* set vcf1 up anew and simply continue as if nothing happened */ vcf_file_close(& vcfset_conf.vcf_in1); free(vcf_in1); vcf_in1 = strdup(argv[vcf_concat_findex]); LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1); if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) { LOG_WARN("skip header failed for %s\n", vcf_in1); } continue; } } is_indel = vcf_var_is_indel(var1); if (vcfset_conf.only_snvs && is_indel) { free(var1); continue; } else if (vcfset_conf.only_indels && ! is_indel) { free(var1); continue; } if (! 
vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) { LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1"); return -1; } if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) { #ifdef TRACE LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos); #endif num_vars_vcf1_ign += 1; vcf_free_var(& var1); continue; } if (add_info_field) { vcf_var_add_to_info(var1, add_info_field); } num_vars_vcf1 += 1; #ifdef TRACE LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos); #endif if (vcfset_conf.vcf_setop == SETOP_CONCAT) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } vcf_free_var(& var1); /* skip comparison against vcf2 */ continue; } /* use index access to vcf2 */ snprintf(regbuf, 1024, "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1); var2_itr = tbx_itr_querys(vcf2_tbx, regbuf); if (! var2_itr) { var2_match = 0; } else { var2_match = 0; while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) { var_t *var2 = NULL; int var2_is_indel = 0; vcf_new_var(&var2); rc = vcf_parse_var_from_line(var2_kstr.s, var2); /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", var1->pos+1, var1->ref, var1->alt, var2->pos+1, var2->ref, var2->alt, regbuf); */ if (rc) { LOG_FATAL("%s\n", "Error while parsing variant returned from tabix"); return -1; } var2_is_indel = vcf_var_is_indel(var2); /* iterator returns anything overlapping with that * position, i.e. this also includes up/downstream * indels, so make sure actual position matches */ if (var1->pos != var2->pos) { var2_match = 0; } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) { var2_match = 0; } else if (vcfset_conf.only_snvs && var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_indels && ! 
var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_pos) { #ifdef TRACE LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1; } else { if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) { #ifdef TRACE LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */ } } vcf_free_var(&var2); if (var2_match) { break;/* no need to continue */ } } } if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) { /* relative complement : elements in A but not B */ if (!var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) { if (var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else { LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop); return 1; } vcf_free_var(& var1); tbx_itr_destroy(var2_itr); }/* while (1) */ vcf_file_close(& vcfset_conf.vcf_in1); if (vcf_in2) { hts_close(vcf2_hts); tbx_destroy(vcf2_tbx); } LOG_VERBOSE("Parsed %d variants from 1st vcf file (ignoring %d non-passed of those)\n", num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign); LOG_VERBOSE("Wrote %d variants to output\n", num_vars_out); if (! count_only) { vcf_file_close(& vcfset_conf.vcf_out); } if (0==rc) { if (count_only) { printf("%ld\n", num_vars_out); } LOG_VERBOSE("%s\n", "Successful exit."); } free(vcf_in1); free(vcf_in2); free(vcf_out); return rc; }
/**
 * Read all variants of a vcf file and record, per variant, the values
 * needed for multiple testing correction: whether it is an indel, its
 * variant quality, its strand-bias value (INFO key "SB") and the result of
 * alt_mostly_on_one_strand().
 *
 * A missing variant quality (-1) is stored as INT_MAX and a missing SB tag
 * as 0; a warning is printed once for each of these cases.
 *
 * @param mtc_quals  out: array allocated here, one entry per variant in
 *                   file order. The caller has to free() it. Set to NULL
 *                   if an allocation fails; left untouched if the file
 *                   can't be opened or its header can't be skipped.
 * @param vcf_in     name of the vcf file to read
 * @return number of variants stored in *mtc_quals, or -1 on error
 */
long int
mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
    long int num_vars = 0;
    long int mtc_qual_size = 0;
    const int mtc_qual_incr = 16384; /* array grows in steps of this size */
    vcf_file_t vcffh;

    if (vcf_file_open(&vcffh, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) {
        LOG_ERROR("Couldn't open %s\n", vcf_in);
        return -1;
    }

    if (0 != vcf_skip_header(&vcffh)) {
        LOG_WARN("%s\n", "vcf_skip_header() failed");
        vcf_file_close(&vcffh);
        return -1;
    }

    mtc_qual_size += mtc_qual_incr;
    (*mtc_quals) = calloc(mtc_qual_size, sizeof(mtc_qual_t));
    if (NULL == (*mtc_quals)) {
        LOG_ERROR("%s\n", "Memory allocation failed");
        vcf_file_close(&vcffh);
        return -1;
    }

    while (1) {
        var_t *var = NULL;
        char *sb_char = NULL;
        mtc_qual_t *cur = NULL;

        vcf_new_var(&var);
        if (vcf_parse_var(&vcffh, var)) {
            /* how to distinguish between error and EOF? */
            free(var);
            break;
        }
        num_vars += 1;

        /* ingest anything: we keep adding filters */
        if (num_vars > mtc_qual_size) {
            mtc_qual_t *grown;

            mtc_qual_size += mtc_qual_incr;
            /* keep the old pointer until we know realloc succeeded */
            grown = realloc((*mtc_quals), mtc_qual_size * sizeof(mtc_qual_t));
            if (NULL == grown) {
                LOG_ERROR("%s\n", "Memory allocation failed");
                vcf_free_var(&var);
                free(*mtc_quals);
                (*mtc_quals) = NULL;
                vcf_file_close(&vcffh);
                return -1;
            }
            (*mtc_quals) = grown;
        }
        cur = &(*mtc_quals)[num_vars-1];

        cur->is_indel = vcf_var_is_indel(var);

        /* variant quality */
        if (var->qual == -1) {
            /* missing qualities to fake value */
            var->qual = INT_MAX;
            if (!varq_missing_warning_printed) {
                LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                varq_missing_warning_printed = 1;
            }
            cur->var_qual = INT_MAX;
        } else {
            cur->var_qual = var->qual;
        }

        /* strand bias */
        if (!vcf_var_has_info_key(&sb_char, var, "SB")) {
            if (!sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                sb_missing_warning_printed = 1;
            }
            cur->sb_qual = 0;
        } else {
            cur->sb_qual = atoi(sb_char);
            free(sb_char);
        }

        cur->is_alt_mostly_on_one_strand = alt_mostly_on_one_strand(var);

        vcf_free_var(&var);
    }

    vcf_file_close(&vcffh);

    return num_vars;
}
int main_filter(int argc, char *argv[]) { filter_conf_t cfg; char *vcf_in = NULL, *vcf_out = NULL; static int print_only_passed = 1; static int sb_filter_no_compound = 0; static int sb_filter_incl_indels = 0; static int only_indels = 0; static int only_snvs = 0; char *vcf_header = NULL; mtc_qual_t *mtc_quals = NULL; long int num_vars; static int no_defaults = 0; long int var_idx = -1; /* default filter options */ memset(&cfg, 0, sizeof(filter_conf_t)); cfg.dp_filter.min = cfg.dp_filter.max = -1; cfg.af_filter.min = cfg.af_filter.max = -1; cfg.sb_filter.alpha = DEFAULT_SIG; cfg.snvqual_filter.alpha = DEFAULT_SIG; cfg.indelqual_filter.alpha = DEFAULT_SIG; /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"print-all", no_argument, &print_only_passed, 0}, {"no-defaults", no_argument, &no_defaults, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"help", no_argument, NULL, 'h'}, {"in", required_argument, NULL, 'i'}, {"out", required_argument, NULL, 'o'}, {"cov-min", required_argument, NULL, 'v'}, {"cov-max", required_argument, NULL, 'V'}, {"af-min", required_argument, NULL, 'a'}, {"af-max", required_argument, NULL, 'A'}, {"sb-thresh", required_argument, NULL, 'B'}, {"sb-mtc", required_argument, NULL, 'b'}, {"sb-alpha", required_argument, NULL, 'c'}, {"sb-no-compound", no_argument, &sb_filter_no_compound, 1}, {"sb-incl-indels", no_argument, &sb_filter_incl_indels, 1}, {"snvqual-thresh", required_argument, NULL, 'Q'}, {"snvqual-mtc", required_argument, NULL, 'q'}, {"snvqual-alpha", required_argument, NULL, 'r'}, {"snvqual-ntests", required_argument, NULL, 's'}, {"indelqual-thresh", required_argument, NULL, 'K'}, {"indelqual-mtc", required_argument, NULL, 'k'}, 
{"indelqual-alpha", required_argument, NULL, 'l'}, {"indelqual-ntests", required_argument, NULL, 'm'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "hi:o:v:V:a:A:B:b:c:Q:q:r:s:K:k:l:m:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& cfg); return 0; case 'i': vcf_in = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); return 1; } } vcf_out = strdup(optarg); break; case 'v': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.min = atoi(optarg); break; case 'V': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.max = atoi(optarg); break; case 'a': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.min = strtof(optarg, NULL); break; case 'A': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.max = strtof(optarg, NULL); break; case 'B': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.thresh = atoi(optarg); break; case 'b': cfg.sb_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.sb_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for strandbias filtering\n", optarg); return -1; } break; case 'c': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.alpha = strtof(optarg, NULL); break; case 'Q': if (! 
isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.thresh = atoi(optarg); break; case 'q': cfg.snvqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.snvqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'r': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.alpha = strtof(optarg, NULL); break; case 's': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.ntests = atol(optarg); break; case 'K': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.thresh = atoi(optarg); break; case 'k': cfg.indelqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.indelqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'l': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.alpha = strtof(optarg, NULL); break; case 'm': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.ntests = atol(optarg); break; case '?': LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; default: break; } } cfg.print_only_passed = print_only_passed; cfg.only_indels = only_indels; cfg.only_snvs = only_snvs; cfg.sb_filter.no_compound = sb_filter_no_compound; cfg.sb_filter.incl_indels = sb_filter_incl_indels; if (cfg.only_indels && cfg.only_snvs) { LOG_FATAL("%s\n", "Can't keep only indels and only snvs"); return 1; } if (! no_defaults) { if (cfg.sb_filter.mtc_type==MTC_NONE && ! 
cfg.sb_filter.thresh) { LOG_VERBOSE("%s\n", "Setting default SB filtering method to FDR"); cfg.sb_filter.mtc_type = MTC_FDR; cfg.sb_filter.alpha = 0.001; } if (cfg.dp_filter.min<0) { cfg.dp_filter.min = 10; LOG_VERBOSE("Setting default minimum coverage to %d\n", cfg.dp_filter.min); } } else { LOG_VERBOSE("%s\n", "Skipping default settings"); } if (0 != argc - optind - 1) {/* FIXME needed at all? */ LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; } /* logic check of command line parameters */ if (cfg.dp_filter.max > 0 && cfg.dp_filter.max < cfg.dp_filter.min) { LOG_FATAL("%s\n", "Invalid coverage-filter settings"); return 1; } if ((cfg.af_filter.max > 0 && cfg.af_filter.max < cfg.af_filter.min) || (cfg.af_filter.max > 1.0)) { LOG_FATAL("%s\n", "Invalid AF-filter settings"); return 1; } if (cfg.sb_filter.thresh && cfg.sb_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed strand-bias threshold *and* multiple testing correction."); return 1; } if (cfg.snvqual_filter.thresh && cfg.snvqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed SNV quality threshold *and* multiple testing correction."); return 1; } if (cfg.indelqual_filter.thresh && cfg.indelqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed indel quality threshold *and* multiple testing correction."); return 1; } if (argc == 2) { fprintf(stderr, "\n"); usage(& cfg); return 1; } if (debug) { dump_filter_conf(& cfg); } /* missing file args default to stdin and stdout */ /* no streaming allowed for vcf_in: we need to determine thresholds first */ if (! vcf_in) { LOG_FATAL("%s\n", "Input VCF missing. No streaming allowed. Need to determine auto threshold in memory friendly manner first."); return 1; } if (! 
vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } LOG_DEBUG("vcf_in=%s vcf_out=%s\n", vcf_in, vcf_out); /* First pass parsing to get qualities for MTC computation (if needed) */ if (cfg.sb_filter.mtc_type != MTC_NONE || cfg.snvqual_filter.mtc_type != MTC_NONE || cfg.indelqual_filter.mtc_type != MTC_NONE) { #ifdef TRACE long int i = 0; #endif LOG_VERBOSE("%s\n", "At least one type of multiple testing correction requested. Doing first pass of vcf"); if ((num_vars = mtc_quals_from_vcf_file(& mtc_quals, vcf_in)) < 0) { LOG_ERROR("Couldn't parse %s\n", vcf_in); return 1; } if (cfg.sb_filter.mtc_type != MTC_NONE) { if (apply_sb_filter_mtc(mtc_quals, & cfg.sb_filter, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on strand-bias pvalues failed"); return -1; } } if (cfg.indelqual_filter.mtc_type != MTC_NONE) { if (apply_indelqual_filter_mtc(mtc_quals, & cfg.indelqual_filter, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on indel quality pvalues failed"); return -1; } } if (cfg.snvqual_filter.mtc_type != MTC_NONE) { if (apply_snvqual_filter_mtc(mtc_quals, & cfg.snvqual_filter, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on SNV quality pvalues failed"); return -1; } } #ifdef TRACE for (i=0; i<num_vars; i++) { LOG_WARN("mtc_quals #%ld sb_qual=%d var_qual=%d is_indel=%d\n", i, mtc_quals[i].sb_qual, mtc_quals[i].var_qual, mtc_quals[i].is_indel); } #endif LOG_VERBOSE("%s\n", "MTC application completed"); } else { LOG_VERBOSE("%s\n", "No multiple testing correction requested. 
First pass of vcf skipped"); } if (vcf_file_open(& cfg.vcf_in, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in); return 1; } if (vcf_file_open(& cfg.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); return 1; } free(vcf_in); free(vcf_out); /* print header */ if (0 != vcf_parse_header(&vcf_header, & cfg.vcf_in)) { /* LOG_WARN("%s\n", "vcf_parse_header() failed"); */ if (vcf_file_seek(& cfg.vcf_in, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } /* also sets filter names */ cfg_filter_to_vcf_header(& cfg, &vcf_header); vcf_write_header(& cfg.vcf_out, vcf_header); free(vcf_header); /* read in variants */ while (1) { var_t *var; int rc; int is_indel = 0; vcf_new_var(&var); rc = vcf_parse_var(& cfg.vcf_in, var); if (rc) { /* how to distinguish between error and EOF? */ break; } var_idx += 1; is_indel = vcf_var_is_indel(var); if (cfg.only_snvs && is_indel) { vcf_free_var(&var); continue; } else if (cfg.only_indels && ! is_indel) { vcf_free_var(&var); continue; } /* filters applying to all types of variants */ apply_af_filter(var, & cfg.af_filter); apply_dp_filter(var, & cfg.dp_filter); /* quality threshold per variant type */ if (! is_indel) { if (cfg.snvqual_filter.thresh) { assert(cfg.snvqual_filter.mtc_type == MTC_NONE); apply_snvqual_threshold(var, & cfg.snvqual_filter); } else if (cfg.snvqual_filter.mtc_type != MTC_NONE) { if (mtc_quals[var_idx].var_qual != -1) { vcf_var_add_to_filter(var, cfg.snvqual_filter.id); } } } else { if (cfg.indelqual_filter.thresh) { assert(cfg.indelqual_filter.mtc_type == MTC_NONE); apply_indelqual_threshold(var, & cfg.indelqual_filter); } else if (cfg.indelqual_filter.mtc_type != MTC_NONE) { if (mtc_quals[var_idx].var_qual != -1) { vcf_var_add_to_filter(var, cfg.indelqual_filter.id); } } } /* sb filter */ if (cfg.sb_filter.thresh) { if (! 
is_indel || cfg.sb_filter.incl_indels) { assert(cfg.sb_filter.mtc_type == MTC_NONE); apply_sb_threshold(var, & cfg.sb_filter); } } else if (cfg.sb_filter.mtc_type != MTC_NONE) { if (! is_indel || cfg.sb_filter.incl_indels) { if (mtc_quals[var_idx].sb_qual == -1) { vcf_var_add_to_filter(var, cfg.sb_filter.id); } } } /* output */ if (cfg.print_only_passed && ! (VCF_VAR_PASSES(var))) { vcf_free_var(&var); continue; } /* add pass if no filters were set */ if (! var->filter || strlen(var->filter)<=1) { char pass_str[] = "PASS"; if (var->filter) { free(var->filter); } var->filter = strdup(pass_str); } vcf_write_var(& cfg.vcf_out, var); vcf_free_var(&var); if (var_idx%1000==0) { (void) vcf_file_flush(& cfg.vcf_out); } } vcf_file_close(& cfg.vcf_in); vcf_file_close(& cfg.vcf_out); free(mtc_quals); LOG_VERBOSE("%s\n", "Successful exit."); return 0; }
int main_filter(int argc, char *argv[]) { filter_conf_t cfg; char *vcf_in = NULL, *vcf_out = NULL; static int print_only_passed = 1; static int sb_filter_no_compound = 0; static int sb_filter_incl_indels = 0; static int only_indels = 0; static int only_snvs = 0; char *vcf_header = NULL; var_t **vars = NULL; long int num_vars = 0; /* isn't long overkill here ? */ long int vars_size = 0; /* keeping track of how much memory we've got pre-allocated */ long int i; static int no_defaults = 0; /* default filter options */ memset(&cfg, 0, sizeof(filter_conf_t)); cfg.dp_filter.min = cfg.dp_filter.max = -1; cfg.af_filter.min = cfg.af_filter.max = -1; cfg.sb_filter.alpha = DEFAULT_SIG; cfg.snvqual_filter.alpha = DEFAULT_SIG; cfg.indelqual_filter.alpha = DEFAULT_SIG; /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"print-all", no_argument, &print_only_passed, 0}, {"no-defaults", no_argument, &no_defaults, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"help", no_argument, NULL, 'h'}, {"in", required_argument, NULL, 'i'}, {"out", required_argument, NULL, 'o'}, {"cov-min", required_argument, NULL, 'v'}, {"cov-max", required_argument, NULL, 'V'}, {"af-min", required_argument, NULL, 'a'}, {"af-max", required_argument, NULL, 'A'}, {"sb-thresh", required_argument, NULL, 'B'}, {"sb-mtc", required_argument, NULL, 'b'}, {"sb-alpha", required_argument, NULL, 'c'}, {"sb-no-compound", no_argument, &sb_filter_no_compound, 1}, {"sb-incl-indels", no_argument, &sb_filter_incl_indels, 1}, {"snvqual-thresh", required_argument, NULL, 'Q'}, {"snvqual-mtc", required_argument, NULL, 'q'}, {"snvqual-alpha", required_argument, NULL, 'r'}, {"snvqual-ntests", required_argument, NULL, 
's'}, {"indelqual-thresh", required_argument, NULL, 'K'}, {"indelqual-mtc", required_argument, NULL, 'k'}, {"indelqual-alpha", required_argument, NULL, 'l'}, {"indelqual-ntests", required_argument, NULL, 'm'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "hi:o:v:V:a:A:B:b:c:Q:q:r:s:K:k:l:m:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& cfg); return 0; case 'i': vcf_in = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); return 1; } } vcf_out = strdup(optarg); break; case 'v': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.min = atoi(optarg); break; case 'V': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.dp_filter.max = atoi(optarg); break; case 'a': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.min = strtof(optarg, NULL); break; case 'A': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.af_filter.max = strtof(optarg, NULL); break; case 'B': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.thresh = atoi(optarg); break; case 'b': cfg.sb_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.sb_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for strandbias filtering\n", optarg); return -1; } break; case 'c': if (! 
isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.sb_filter.alpha = strtof(optarg, NULL); break; case 'Q': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.thresh = atoi(optarg); break; case 'q': cfg.snvqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.snvqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'r': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.alpha = strtof(optarg, NULL); break; case 's': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.snvqual_filter.ntests = atol(optarg); break; case 'K': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.thresh = atoi(optarg); break; case 'k': cfg.indelqual_filter.mtc_type = mtc_str_to_type(optarg); if (-1 == cfg.indelqual_filter.mtc_type) { LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg); return -1; } break; case 'l': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.alpha = strtof(optarg, NULL); break; case 'm': if (! isdigit(optarg[0])) { LOG_FATAL("Non-numeric argument provided: %s\n", optarg); return -1; } cfg.indelqual_filter.ntests = atol(optarg); break; case '?': LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; default: break; } } cfg.print_only_passed = print_only_passed; cfg.only_indels = only_indels; cfg.only_snvs = only_snvs; cfg.sb_filter.no_compound = sb_filter_no_compound; cfg.sb_filter.incl_indels = sb_filter_incl_indels; if (cfg.only_indels && cfg.only_snvs) { LOG_FATAL("%s\n", "Can't keep only indels and only snvs"); return 1; } if (! 
no_defaults) { if (cfg.sb_filter.mtc_type==MTC_NONE && ! cfg.sb_filter.thresh) { LOG_VERBOSE("%s\n", "Setting default SB filtering method to FDR"); cfg.sb_filter.mtc_type = MTC_FDR; cfg.sb_filter.alpha = 0.001; } if (cfg.dp_filter.min<0) { cfg.dp_filter.min = 10; LOG_VERBOSE("Setting default minimum coverage to %d\n", cfg.dp_filter.min); } } else { LOG_VERBOSE("%s\n", "Skipping default settings"); } if (0 != argc - optind - 1) {/* FIXME needed at all? */ LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n"); return 1; } /* logic check of command line parameters */ if (cfg.dp_filter.max > 0 && cfg.dp_filter.max < cfg.dp_filter.min) { LOG_FATAL("%s\n", "Invalid coverage-filter settings"); return 1; } if ((cfg.af_filter.max > 0 && cfg.af_filter.max < cfg.af_filter.min) || (cfg.af_filter.max > 1.0)) { LOG_FATAL("%s\n", "Invalid AF-filter settings"); return 1; } if (cfg.sb_filter.thresh && cfg.sb_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed strand-bias threshold *and* multiple testing correction."); return 1; } if (cfg.snvqual_filter.thresh && cfg.snvqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed SNV quality threshold *and* multiple testing correction."); return 1; } if (cfg.indelqual_filter.thresh && cfg.indelqual_filter.mtc_type != MTC_NONE) { LOG_FATAL("%s\n", "Can't use fixed indel quality threshold *and* multiple testing correction."); return 1; } if (argc == 2) { fprintf(stderr, "\n"); usage(& cfg); return 1; } if (debug) { dump_filter_conf(& cfg); } /* missing file args default to stdin and stdout */ if (! vcf_in) { vcf_in = malloc(2 * sizeof(char)); strcpy(vcf_in, "-"); } if (! 
vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } LOG_DEBUG("vcf_in=%s vcf_out=%s\n", vcf_in, vcf_out); /* open vcf files */ if (vcf_file_open(& cfg.vcf_in, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in); return 1; } if (vcf_file_open(& cfg.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); return 1; } free(vcf_in); free(vcf_out); /* FIXME everything below here should go into a function with args: - cfg - ...what else? */ /* print header */ if (0 != vcf_parse_header(&vcf_header, & cfg.vcf_in)) { /* LOG_WARN("%s\n", "vcf_parse_header() failed"); */ if (vcf_file_seek(& cfg.vcf_in, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } /* also sets filter names */ cfg_filter_to_vcf_header(& cfg, &vcf_header); vcf_write_header(& cfg.vcf_out, vcf_header); free(vcf_header); /* read in variants. since many filters perform multiple testing * correction and therefore need to look at all variants we keep * it simple and load them all into memory. * * in theory we could apply all 'simple' filters directly within * the loop here and depending on the result spit the variant out * or not. only complex filters need to see all variants first to, * e.g. apply multiple testing. */ num_vars = 0; while (1) { var_t *var; int rc; int is_indel = 0; vcf_new_var(&var); rc = vcf_parse_var(& cfg.vcf_in, var); if (rc) { /* how to distinguish between error and EOF? */ free(var); break; } is_indel = vcf_var_is_indel(var); if (cfg.only_snvs && is_indel) { free(var); continue; } else if (cfg.only_indels && ! is_indel) { free(var); continue; } /* read all in, no matter if already filtered. 
we keep adding filters */ num_vars +=1; if (num_vars >= vars_size) { const long incr = 128; vars = realloc(vars, (vars_size+incr) * sizeof(var_t*)); vars_size += incr; } vars[num_vars-1] = var; #ifdef TRACE { char *key; vcf_var_key(&key, vars[num_vars-1]); fprintf(stderr, "storing var %ld+1: %s\n", num_vars, key); free(key); } #endif /* filters applying to all types of variants */ apply_af_filter(var, & cfg.af_filter); apply_dp_filter(var, & cfg.dp_filter); /* quality threshold per variant type */ if (! is_indel) { if (cfg.snvqual_filter.thresh) { assert(cfg.snvqual_filter.mtc_type == MTC_NONE); apply_snvqual_threshold(var, & cfg.snvqual_filter); } } else { if (cfg.indelqual_filter.thresh) { assert(cfg.indelqual_filter.mtc_type == MTC_NONE); apply_indelqual_threshold(var, & cfg.indelqual_filter); } } if (cfg.sb_filter.thresh) { if (! is_indel || cfg.sb_filter.incl_indels) { assert(cfg.sb_filter.mtc_type == MTC_NONE); apply_sb_threshold(var, & cfg.sb_filter); } } } if (num_vars) { vars = realloc(vars, (num_vars * sizeof(var_t*))); } vcf_file_close(& cfg.vcf_in); LOG_VERBOSE("Parsed %ld variants\n", num_vars); if (cfg.sb_filter.mtc_type != MTC_NONE) { if (apply_sb_filter_mtc(& cfg.sb_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on strand-bias pvalues failed"); return -1; } } if (cfg.snvqual_filter.mtc_type != MTC_NONE) { if (apply_snvqual_filter_mtc(& cfg.snvqual_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on SNV qualities failed"); return -1; } } if (cfg.indelqual_filter.mtc_type != MTC_NONE) { if (apply_indelqual_filter_mtc(& cfg.indelqual_filter, vars, num_vars)) { LOG_FATAL("%s\n", "Multiple testing correction on Indel qualities failed"); return -1; } } /* output */ for (i=0; i<num_vars; i++) { var_t *v = vars[i]; if (cfg.print_only_passed && ! (VCF_VAR_PASSES(v))) { continue; } /* add pass if no filters were set */ if (! 
v->filter || strlen(v->filter)<=1) { char pass_str[] = "PASS"; if (v->filter) { free(v->filter); } v->filter = strdup(pass_str); } vcf_write_var(& cfg.vcf_out, v); } vcf_file_close(& cfg.vcf_out); for (i=0; i<num_vars; i++) { vcf_free_var(& vars[i]); } free(vars); LOG_VERBOSE("%s\n", "Successful exit."); return 0; }