Example #1
0
/* main_vcfset: entry point of the vcfset sub-command.
 *
 * Streams the variants of the first vcf file (--vcf1) and, depending
 * on --action, outputs
 *  - intersect:  variants for which a match is found in --vcf2
 *  - complement: variants for which no match is found in --vcf2
 *  - concat:     all variants, followed by those of every extra vcf
 *                file given as positional argument (no --vcf2 then)
 * --vcf2 is accessed through its tabix index. A match means same
 * position and, unless --only-pos is given, identical ref and alt.
 * With --count-only just the number of output variants is printed to
 * stdout and no vcf is written.
 *
 * getopt_long() is run on argv+1 (see comment at the call), which is
 * why indices derived from optind are offset by one when used on
 * argv.
 *
 * Returns 0 on success and 1 or -1 on error (the value depends on
 * the failing step).
 */
int
main_vcfset(int argc, char *argv[])
{
     vcfset_conf_t vcfset_conf;
     char *vcf_header = NULL;
     int rc = 0;
     char *vcf_in1, *vcf_in2, *vcf_out;
     long int num_vars_vcf1;
     long int num_vars_vcf1_ign, num_vars_out;
     static int only_passed = 0;
     static int only_pos = 0;
     static int only_snvs = 0;
     static int only_indels = 0;
     static int count_only = 0;
     tbx_t *vcf2_tbx = NULL; /* index for second vcf file */
     htsFile *vcf2_hts = NULL;
     char *add_info_field = NULL;
     int vcf_concat_findex = 0;
     vcf_in1 = vcf_in2 = vcf_out = NULL;
     num_vars_vcf1 = 0;
     num_vars_vcf1_ign = num_vars_out = 0;

     /* default vcfset options */
     memset(&vcfset_conf, 0, sizeof(vcfset_conf_t));
     /* vcfset_conf.vcf_in1 = NULL; */
     /* vcfset_conf.vcf_in2 = NULL; */
     /* vcfset_conf.vcf_out = stdout;*/


    /* keep in sync with long_opts_str and usage 
     *
     * getopt makes it painful to keep long args, short args and
     * usage in sync. check out gopt, libcfu...
     */
    while (1) {
         int c;
         static struct option long_opts[] = {
              /* see usage sync */
              {"help", no_argument, NULL, 'h'},
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"only-passed", no_argument, &only_passed, 1},
              {"only-pos", no_argument, &only_pos, 1},
              {"only-indels", no_argument, &only_indels, 1},
              {"only-snvs", no_argument, &only_snvs, 1},
              {"count-only", no_argument, &count_only, 1},

              {"vcf1", required_argument, NULL, '1'},
              {"vcf2", required_argument, NULL, '2'},
              {"vcfout", required_argument, NULL, 'o'},
              {"action", required_argument, NULL, 'a'},
              {"add-info", required_argument, NULL, 'I'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "h1:2:o:a:I:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h': 
              usage(& vcfset_conf); 
              rc = 0;
              goto free_and_exit;

         case '1': 
              free(vcf_in1); /* option might have been given before */
              vcf_in1 = strdup(optarg);
              break;

         case '2': 
              free(vcf_in2);
              vcf_in2 = strdup(optarg);
              break;

         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        rc = 1;
                        goto free_and_exit;
                   }
              }
              free(vcf_out);
              vcf_out = strdup(optarg);
              break;

         case 'a': 
              if (0 == strcmp(optarg, "intersect")) {
                   vcfset_conf.vcf_setop = SETOP_INTERSECT;

              } else if (0 == strcmp(optarg, "complement")) {
                   vcfset_conf.vcf_setop = SETOP_COMPLEMENT;

              } else if (0 == strcmp(optarg, "concat")) {
                   vcfset_conf.vcf_setop = SETOP_CONCAT;

              } else {
                   LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg);
                   rc = 1;
                   goto free_and_exit;
              }
              break;

         case 'I': 
              free(add_info_field);
              add_info_field = strdup(optarg);
              break;

         case '?': 
              LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n"); 
              rc = 1;
              goto free_and_exit;

         default:
              /* flag-only long options: getopt_long already set the flag */
              break;
         }
    }

    vcfset_conf.only_passed = only_passed;
    vcfset_conf.only_pos = only_pos;
    vcfset_conf.only_snvs = only_snvs;
    vcfset_conf.only_indels = only_indels;

    if (vcfset_conf.only_indels && vcfset_conf.only_snvs) {
         LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account");
         rc = 1;
         goto free_and_exit;
    }

    /* positional arguments are only allowed (and then required) for
     * concat, where they name the extra vcf files */
    if (0 != argc - optind - 1) {
         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              /* incremented before first use, i.e. the first extra
               * file is argv[optind+1] */
              vcf_concat_findex = optind;
         } else {
              LOG_FATAL("%s\n", "Unrecognized arguments found\n");
              rc = 1;
              goto free_and_exit;
         }
    } else {
         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              LOG_FATAL("%s\n", "No extra files for concat given\n");
              rc = 1;
              goto free_and_exit;
         }
    }
#if 0
    int i; for (i=optind+1; i<argc; i++) {
         LOG_FIXME("argv[%d]=%s\n", i, argv[i]);
    }
#endif

    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& vcfset_conf);
        rc = 1;
        goto free_and_exit;
    }

    if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) {
         LOG_FATAL("%s\n", "No set operation specified");
         usage(& vcfset_conf);
         rc = 1;
         goto free_and_exit;
    }

    if  (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) {
         LOG_FATAL("%s\n\n", "At least one vcf input file not specified");
         usage(& vcfset_conf);
         rc = 1;
         goto free_and_exit;
    }
    if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) {
         LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2");
         usage(& vcfset_conf);
         rc = 1;
         goto free_and_exit;
    }

    if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, 
                      HAS_GZIP_EXT(vcf_in1), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in1);
         rc = 1;
         goto free_and_exit;
    }

    if (vcf_in2) {
         vcf2_hts = hts_open(vcf_in2, "r");
         if (!vcf2_hts) {
              LOG_FATAL("Couldn't load %s\n", vcf_in2);
              rc = 1;
              goto free_and_exit;
         }
         vcf2_tbx = tbx_index_load(vcf_in2);
         if (!vcf2_tbx) {
              LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2);
              hts_close(vcf2_hts);
              rc = 1;
              goto free_and_exit;
         }
    }

    /* vcf_out default if not set: stdout==- */
    if (! vcf_out) {
         vcf_out = strdup("-");
         if (! vcf_out) {
              LOG_FATAL("%s\n", "Couldn't allocate memory for output file name");
              rc = 1;
              goto free_and_exit;
         }
    }

    if (! count_only) {
         if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, 
                           HAS_GZIP_EXT(vcf_out), 'w')) {
              LOG_ERROR("Couldn't open %s\n", vcf_out);
              rc = 1;
              goto free_and_exit;
         }
    }

    /* use meta-data/header of vcf_in1 for output
     */
    LOG_DEBUG("Getting header from %s\n", vcf_in1);
    if (0 !=  vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) {
         LOG_WARN("%s\n", "vcf_parse_header() failed");
         if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              rc = -1;
              goto free_and_exit;
         }
    } else {
         if (! count_only) {
              /* vcf_write_header would write *default* header */
              vcf_write_header(& vcfset_conf.vcf_out, vcf_header);
         }
         free(vcf_header);
    }

    
    /* parse first vcf file
     */
    LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1);
    while (1) {
         var_t *var1 = NULL;
         int parse_rc;
         int is_indel;
         kstring_t var2_kstr = {0, 0, 0}; /* line buffer filled by tabix iterator */
         hts_itr_t *var2_itr = NULL;
         char regbuf[1024];
         int var2_match = 0;

         vcf_new_var(&var1);
         parse_rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1);
         if (parse_rc) {
              /* parsing stopped: done, unless concat has more files */
              free(var1);
              
              if (vcfset_conf.vcf_setop != SETOP_CONCAT) {
                   break;
              } else {
                   vcf_concat_findex++;
                   if (vcf_concat_findex==argc) {
                        break;
                   }
                   /* set vcf1 up anew and simply continue as if nothing happened 
                    */
                   vcf_file_close(& vcfset_conf.vcf_in1);
                   free(vcf_in1);

                   vcf_in1 = strdup(argv[vcf_concat_findex]);
                   LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1);
                   if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, 
                                     HAS_GZIP_EXT(vcf_in1), 'r')) {
                        LOG_ERROR("Couldn't open %s\n", vcf_in1);
                        rc = 1;
                        goto free_and_exit;
                   }
                   if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) {
                        LOG_WARN("skip header failed for %s\n", vcf_in1);
                   }
                   continue;
              }
         }

         /* var1 was parsed successfully from here on, so it has to
          * be released with vcf_free_var() */
         is_indel = vcf_var_is_indel(var1);
         if (vcfset_conf.only_snvs && is_indel) {
              vcf_free_var(& var1);
              continue;
         } else if (vcfset_conf.only_indels && ! is_indel) {
              vcf_free_var(& var1);
              continue;
         }

         if (! vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) {
              LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1");
              rc = -1;
              goto free_and_exit;
         }
         if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) {
#ifdef TRACE
              LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif
              num_vars_vcf1_ign += 1;
              vcf_free_var(& var1);
              continue;
         }
         if (add_info_field) {
              vcf_var_add_to_info(var1, add_info_field);
         }
         num_vars_vcf1 += 1;
#ifdef TRACE
         LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif

         if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
              num_vars_out += 1;
              if (! count_only) {
                   vcf_write_var(& vcfset_conf.vcf_out, var1);
              }
              vcf_free_var(& var1);
              /* skip comparison against vcf2 */
              continue;
         }

         /* use index access to vcf2 */
         snprintf(regbuf, sizeof(regbuf), "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1);
         var2_itr = tbx_itr_querys(vcf2_tbx, regbuf);
         if (! var2_itr) {
              var2_match = 0;
         } else {
              var2_match = 0;
              while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) {
                   var_t *var2 = NULL;
                   int var2_is_indel = 0;

                   vcf_new_var(&var2);
                   parse_rc = vcf_parse_var_from_line(var2_kstr.s, var2);
                   /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", 
                             var1->pos+1, var1->ref, var1->alt,
                             var2->pos+1, var2->ref, var2->alt, regbuf); */
                   if (parse_rc) {
                        LOG_FATAL("%s\n", "Error while parsing variant returned from tabix");
                        rc = -1;
                        goto free_and_exit;
                   }

                   var2_is_indel = vcf_var_is_indel(var2);

                   /* iterator returns anything overlapping with that 
                    * position, i.e. this also includes up/downstream
                    * indels, so make sure actual position matches */
                   if (var1->pos != var2->pos) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_snvs && var2_is_indel) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_indels && ! var2_is_indel) {
                        var2_match = 0;

                   } else if (vcfset_conf.only_pos) {
#ifdef TRACE
                        LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                        var2_match = 1;

                   } else {
                        if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) {
#ifdef TRACE
                             LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                             var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */                             
                        }
                   }
                   vcf_free_var(&var2);
                   if (var2_match) {
                        break;/* no need to continue */
                   }
              }
         }

         if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) {
              /* relative complement : elements in A but not B */
              if (!var2_match) {
                   num_vars_out += 1;
                   if (! count_only) {
                        vcf_write_var(& vcfset_conf.vcf_out, var1);
                   }
              }
         } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) {
              if (var2_match) {
                   num_vars_out += 1;
                   if (! count_only) {
                        vcf_write_var(& vcfset_conf.vcf_out, var1);
                   }
              }

         } else {
              LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop);
              rc = 1;
              goto free_and_exit;
         }

         vcf_free_var(& var1);
         tbx_itr_destroy(var2_itr);
         /* buffer was (re)allocated by tbx_itr_next and is ours to free */
         free(var2_kstr.s);
    }/* while (1) */

    vcf_file_close(& vcfset_conf.vcf_in1);
    if (vcf_in2) {
         hts_close(vcf2_hts);
         tbx_destroy(vcf2_tbx);
    }
    /* counters are long int, hence %ld */
    LOG_VERBOSE("Parsed %ld variants from 1st vcf file (ignoring %ld non-passed of those)\n", 
                num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign);
    LOG_VERBOSE("Wrote %ld variants to output\n", 
                num_vars_out);
    if (! count_only) {
         vcf_file_close(& vcfset_conf.vcf_out);
    }

    if (0==rc) {
         if (count_only) {
              printf("%ld\n", num_vars_out);
         }

         LOG_VERBOSE("%s\n", "Successful exit.");
    }

free_and_exit:
    /* shared exit: releases the option strings only; error exits
     * jumping here leave any open file handles to process teardown */
    free(vcf_in1);
    free(vcf_in2);
    free(vcf_out);
    free(add_info_field);

    return rc;
}
Example #2
0
/* main_uniq: entry point of the uniq sub-command.
 *
 * Reads all variants from the input vcf (--vcf-in), runs a pileup on
 * the given BAM file at the position of each variant with uniq_snv
 * as per-column callback and finally writes variants to --vcf-out
 * (default "-"). Unless --use-det-lim is given, variants are
 * filtered either by a fixed threshold (--uniq-thresh) or by
 * multiple testing correction (--uniq-mtc, --uniq-alpha,
 * --uniq-ntests) and only passing ones are written, unless
 * --output-all is set.
 *
 * getopt_long() is run on argv+1 (see comment at the call), which is
 * why the BAM file is found at argv[optind+1].
 *
 * Returns 0 on success and 1 or -1 on error (the value depends on
 * the failing step). When all steps succeed the return value is the
 * one of the last mpileup() call.
 */
int
main_uniq(int argc, char *argv[])
{
     int c, i;
     char *bam_file = NULL;
     char *vcf_in = NULL; /* - == stdout */
     char *vcf_out = NULL; /* - == stdout */
     mplp_conf_t mplp_conf;
     uniq_conf_t uniq_conf;
     void (*plp_proc_func)(const plp_col_t*, void*);
     int rc = 0;
     var_t **vars = NULL;
     int num_vars = 0;
     char *vcf_header = NULL;
     static int use_det_lim = 0;
     static int use_orphan = 0;
     static int output_all = 0;
     static int is_somatic = 0;

     /* default uniq options */
     memset(&uniq_conf, 0, sizeof(uniq_conf_t));
     uniq_conf.uni_freq = DEFAULT_UNI_FREQ;
     uniq_conf.use_det_lim = 0;

     uniq_conf.uniq_filter.mtc_type = MTC_FDR;
     uniq_conf.uniq_filter.alpha = 0.001;

     /* default pileup options */
     memset(&mplp_conf, 0, sizeof(mplp_conf_t));
     mplp_conf.max_mq = DEFAULT_MAX_MQ;
     mplp_conf.min_mq = 1;
     mplp_conf.min_plp_bq = DEFAULT_MIN_PLP_BQ;
     mplp_conf.max_depth = DEFAULT_MAX_PLP_DEPTH;
     mplp_conf.flag = MPLP_NO_ORPHAN;


    /* keep in sync with long_opts_str and usage
     *
     * getopt makes it painful to keep long args, short args and
     * usage in sync. check out gopt, libcfu...
     */
    while (1) {
         static struct option long_opts[] = {
              /* see usage sync */
              {"help", no_argument, NULL, 'h'},
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"use-det-lim", no_argument, &use_det_lim, 1},
              {"use-orphan", no_argument, &use_orphan, 1},
              {"output-all", no_argument, &output_all, 1},
              {"is-somatic", no_argument, &is_somatic, 1},

              {"vcf-in", required_argument, NULL, 'v'},
              {"vcf-out", required_argument, NULL, 'o'},

              {"uni-freq", required_argument, NULL, 'f'},

              {"uniq-thresh", required_argument, NULL, 't'},
              {"uniq-mtc", required_argument, NULL, 'm'},
              {"uniq-alpha", required_argument, NULL, 'a'},
              {"uniq-ntests", required_argument, NULL, 'n'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "hv:o:f:t:m:a:n:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h':
              usage(& uniq_conf);
              rc = 0;
              goto free_and_exit;

         case 'v':
              if (0 != strcmp(optarg, "-")) {
                   if (! file_exists(optarg)) {
                        LOG_FATAL("Input file '%s' does not exist. Exiting...\n", optarg);
                        rc = 1;
                        goto free_and_exit;
                   }
              }
              free(vcf_in); /* option might have been given before */
              vcf_in = strdup(optarg);
              break;

         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        rc = 1;
                        goto free_and_exit;
                   }
              }
              free(vcf_out);
              vcf_out = strdup(optarg);
              break;

         case 'f':
              uniq_conf.uni_freq = strtof(optarg, (char **)NULL); /* atof */
              if (uniq_conf.uni_freq<=0) {
                   LOG_WARN("%s\n", "Ignoring uni-freq option");
              }
              if (uniq_conf.uni_freq>1.0) {
                   LOG_FATAL("%s\n", "Value for uni-freq has to be <1.0");
                   rc = 1;
                   goto free_and_exit;
              }
              break;

         case 't':
              /* cast: isdigit() is undefined for negative char values */
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   rc = -1;
                   goto free_and_exit;
              }
              uniq_conf.uniq_filter.thresh = atoi(optarg);
              /* a fixed threshold replaces the default MTC filter */
              uniq_conf.uniq_filter.mtc_type = MTC_NONE;
              break;
         case 'm':
              uniq_conf.uniq_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == uniq_conf.uniq_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg);
                   rc = -1;
                   goto free_and_exit;
              }
              break;
         case 'a':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   rc = -1;
                   goto free_and_exit;
              }
              uniq_conf.uniq_filter.alpha = strtof(optarg, NULL);
              break;
         case 'n':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   rc = -1;
                   goto free_and_exit;
              }
              uniq_conf.uniq_filter.ntests = atol(optarg);
              break;

         case '?':
              LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n");
              rc = 1;
              goto free_and_exit;
         default:
              /* flag-only long options: getopt_long already set the flag */
              break;
         }
    }
    if (use_orphan) {
         mplp_conf.flag &= ~MPLP_NO_ORPHAN;
    }
    if (debug) {
         dump_mplp_conf(& mplp_conf, stderr);
    }
    uniq_conf.output_all = output_all;
    uniq_conf.use_det_lim = use_det_lim;


#if DEBUG
    LOG_DEBUG("uniq_conf.uniq_filter.thresh = %d\n", uniq_conf.uniq_filter.thresh);
    LOG_DEBUG("uniq_conf.uniq_filter.mtc_type = %d\n", uniq_conf.uniq_filter.mtc_type);
    LOG_DEBUG("uniq_conf.uniq_filter.alpha = %f\n", uniq_conf.uniq_filter.alpha);
    LOG_DEBUG("uniq_conf.uniq_filter.ntests = %d\n", uniq_conf.uniq_filter.ntests);
#endif
    
    if (uniq_conf.uniq_filter.thresh && uniq_conf.uniq_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed Unique quality threshold *and* multiple testing correction.");
         rc = 1;
         goto free_and_exit;
    }

    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& uniq_conf);
        rc = 1;
        goto free_and_exit;
    }

    if (1 != argc - optind - 1) {
        fprintf(stderr, "Need exactly one BAM file as last argument\n");
        rc = 1;
        goto free_and_exit;
    }
    bam_file = (argv + optind + 1)[0];
    if (! file_exists(bam_file)) {
         LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file);
         rc = -1;
         goto free_and_exit;
    }


    if (! vcf_in) {
#if 0
         vcf_in = malloc(2 * sizeof(char));
         strcpy(vcf_in, "-");
#else
         LOG_FATAL("%s\n", "No input vcf specified. Exiting...");
         rc = -1;
         goto free_and_exit;
#endif
    }
    if (! vcf_out) {
         vcf_out = strdup("-");
         if (! vcf_out) {
              LOG_FATAL("%s\n", "Couldn't allocate memory for output file name");
              rc = 1;
              goto free_and_exit;
         }
    }

    if (vcf_file_open(& uniq_conf.vcf_in, vcf_in,
                      HAS_GZIP_EXT(vcf_in), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in);
         rc = 1;
         goto free_and_exit;
    }

    if (vcf_file_open(& uniq_conf.vcf_out, vcf_out,
                      HAS_GZIP_EXT(vcf_out), 'w')) {
         LOG_ERROR("Couldn't open %s\n", vcf_out);
         /* input is already open at this point */
         vcf_file_close(& uniq_conf.vcf_in);
         rc = 1;
         goto free_and_exit;
    }

    if (0 != vcf_parse_header(&vcf_header, & uniq_conf.vcf_in)) {
         LOG_WARN("%s\n", "vcf_parse_header() failed. trying to rewind to start...");
         if (vcf_file_seek(& uniq_conf.vcf_in, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              rc = 1;
              goto clean_and_exit;
         }
    } else {
         vcf_header_add(&vcf_header, "##INFO=<ID=UNIQ,Number=0,Type=Flag,Description=\"Unique, i.e. not detectable in paired sample\">\n");
         vcf_header_add(&vcf_header, "##INFO=<ID=UQ,Number=1,Type=Integer,Description=\"Phred-scaled uniq score at this position\">\n");
         if (is_somatic) {
              vcf_header_add(&vcf_header, "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Somatic event\">\n");
         }
         if (! uniq_conf.use_det_lim) {
              /* describe the filter that will be applied below */
              char full_filter_str[FILTER_STRSIZE];
              if (uniq_conf.uniq_filter.thresh > 0) {
                   snprintf(uniq_conf.uniq_filter.id, FILTER_ID_STRSIZE, "min_uq_%d", uniq_conf.uniq_filter.thresh);
                   snprintf(full_filter_str, FILTER_STRSIZE,
                            "##FILTER=<ID=%s,Description=\"Minimum Uniq Phred %d\">\n",
                            uniq_conf.uniq_filter.id, uniq_conf.uniq_filter.thresh);
                   vcf_header_add(&vcf_header, full_filter_str);
                   
              } else if (uniq_conf.uniq_filter.mtc_type != MTC_NONE) {
                   char buf[64];
                   mtc_str(buf, uniq_conf.uniq_filter.mtc_type);
                   snprintf(uniq_conf.uniq_filter.id, FILTER_ID_STRSIZE, "uq_%s", buf);
                   snprintf(full_filter_str, FILTER_STRSIZE,
                            "##FILTER=<ID=%s,Description=\"Uniq Multiple Testing Correction: %s corr. pvalue < %f\">\n",
                            uniq_conf.uniq_filter.id, buf, uniq_conf.uniq_filter.alpha);
                   vcf_header_add(& vcf_header, full_filter_str);
              }
         }

         vcf_write_header(& uniq_conf.vcf_out, vcf_header);
         free(vcf_header);
    }

    num_vars = vcf_parse_vars(&vars, & uniq_conf.vcf_in, 1);
    if (0 == num_vars) {
         LOG_WARN("%s\n", "Didn't find any variants in input");
         goto clean_and_exit;
    }
    /* number of tests defaults to number of variants */
    if (! uniq_conf.uniq_filter.ntests) {
         uniq_conf.uniq_filter.ntests = num_vars;
    }

    plp_proc_func = &uniq_snv;

    for (i=0; i<num_vars; i++) {
         char reg_buf[BUF_SIZE];
         if (i%100==0) {
              LOG_VERBOSE("Processing variant %d of %d\n", i+1, num_vars);
         }
         uniq_conf.var = vars[i];

         /* restrict pileup to the variant's position */
         snprintf(reg_buf, BUF_SIZE, "%s:%ld-%ld",
                  vars[i]->chrom, vars[i]->pos+1, vars[i]->pos+1);
         mplp_conf.reg = strdup(reg_buf);

         /* position is formatted as long, as in the region string above */
         LOG_DEBUG("pileup for var no %d at %s %ld\n",
                   i+1, uniq_conf.var->chrom, (long int)(uniq_conf.var->pos+1));
#ifdef DISABLE_INDELS
         if (vcf_var_has_info_key(NULL, uniq_conf.var, "INDEL")) {
              LOG_WARN("Skipping indel var at %s %ld\n",
                       uniq_conf.var->chrom, (long int)(uniq_conf.var->pos+1));
              free(mplp_conf.reg);
              mplp_conf.reg = NULL;
              continue;
         }
#endif
         /* no need to check for filter because done by parse_vars */

         /* NOTE(review): rc is overwritten in every iteration, i.e.
          * only the result of the last pileup is reported. Confirm
          * whether a failure should stop processing */
         rc = mpileup(&mplp_conf, plp_proc_func, (void*)&uniq_conf,
                      1, (const char **) argv + optind + 1);

         if (uniq_conf.uniq_filter.thresh) {
              apply_uniq_threshold(uniq_conf.var, & uniq_conf.uniq_filter);
         }

         free(mplp_conf.reg);
         mplp_conf.reg = NULL;
    }
    uniq_conf.var = NULL;/* just be sure to not use it accidentally again */


    /* print whatever we've got. there's no UQ to test or we
     * are supposed to print all 
     */
    if (uniq_conf.use_det_lim) {
         for (i=0; i<num_vars; i++) {
              var_t *var = vars[i];
              vcf_write_var(& uniq_conf.vcf_out, var);
         }
         /* all done */
         goto clean_and_exit;
    }



    if (uniq_conf.uniq_filter.mtc_type != MTC_NONE) {
         if (apply_uniq_filter_mtc(& uniq_conf.uniq_filter, vars, num_vars)) {
              LOG_FATAL("%s\n", "Multiple testing correction on uniq pvalues failed");
              rc = -1;
              goto clean_and_exit;
         }
    }
    
    for (i=0; i<num_vars; i++) {
         var_t *var = vars[i];
         if (VCF_VAR_PASSES(var) || uniq_conf.output_all) {
              vcf_write_var(& uniq_conf.vcf_out, var);
         }
    }

clean_and_exit:
    /* only to be used once both vcf files are open */

    vcf_file_close(& uniq_conf.vcf_in);
    vcf_file_close(& uniq_conf.vcf_out);

    for (i=0; i<num_vars; i++) {
         vcf_free_var(& vars[i]);
    }
    free(vars);

    if (0==rc) {
         LOG_VERBOSE("%s\n", "Successful exit.");
    }
    /* LOG_FIXME("%s\n", "allow user setting of -S and -J. Currently just using default") */

free_and_exit:
    /* exit for errors before the vcf files were opened */
    free(vcf_in);
    free(vcf_out);

    return rc;
}
Example #3
0
/* Reads all variants from vcf_in and records for each its indel
 * status, variant quality, strand bias (SB info tag) and the result
 * of alt_mostly_on_one_strand() in a newly allocated array.
 *
 * mtc_quals: set to the allocated array, which the caller has to
 *            free. Capacity grows in fixed increments, so it can be
 *            larger than the number of variants. On errors occurring
 *            after allocation the array is freed and *mtc_quals set
 *            to NULL.
 * vcf_in:    name of the vcf file to read
 *
 * Variants without quality get INT_MAX and variants without SB tag
 * get 0; a warning is printed once for each of these cases.
 *
 * Returns the number of variants read or -1 on error.
 */
long int
mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
     long int num_vars = 0;
     long int mtc_qual_size = 0;
     int mtc_qual_incr = 16384;
     vcf_file_t vcffh;

     if (vcf_file_open(&vcffh, vcf_in,
                       HAS_GZIP_EXT(vcf_in), 'r')) {
          LOG_ERROR("Couldn't open %s\n", vcf_in);
          return -1;
     }

    if (0 !=  vcf_skip_header(&vcffh)) {
         LOG_WARN("%s\n", "vcf_skip_header() failed");
         vcf_file_close(&vcffh);
         return -1;
    }

    mtc_qual_size += mtc_qual_incr;
    (*mtc_quals) = calloc(mtc_qual_size, sizeof(mtc_qual_t));
    if (NULL == (*mtc_quals)) {
         LOG_ERROR("%s\n", "Couldn't allocate memory for qualities");
         vcf_file_close(&vcffh);
         return -1;
    }
     
    while (1) {
         var_t *var;
         int rc;
         int is_indel = 0;
         char *sb_char = NULL;
         

         vcf_new_var(&var);
         rc = vcf_parse_var(&vcffh, var);
         if (rc) {
              /* how to distinguish between error and EOF? */
              free(var);
              break;
         }
         num_vars += 1;
         /* ingest anything: we keep adding filters */


         if (num_vars > mtc_qual_size) {
              /* grow via temporary so that the array is not lost if
               * realloc fails */
              mtc_qual_t *grown;

              mtc_qual_size += mtc_qual_incr;
              grown = realloc((*mtc_quals), mtc_qual_size * sizeof(mtc_qual_t));
              if (NULL == grown) {
                   LOG_ERROR("%s\n", "Couldn't allocate memory for qualities");
                   vcf_free_var(&var);
                   free(*mtc_quals);
                   (*mtc_quals) = NULL;
                   vcf_file_close(&vcffh);
                   return -1;
              }
              /* zero the new tail to match the calloc'ed first chunk */
              memset(grown + (mtc_qual_size - mtc_qual_incr), 0,
                     mtc_qual_incr * sizeof(mtc_qual_t));
              (*mtc_quals) = grown;
         }

        
         is_indel = vcf_var_is_indel(var);
         (*mtc_quals)[num_vars-1].is_indel = is_indel;

         /* variant quality */
         if (var->qual==-1) {
              /* missing qualities to fake value */
              var->qual = INT_MAX;
              if (! varq_missing_warning_printed) {
                   LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                   varq_missing_warning_printed = 1;
              }
              (*mtc_quals)[num_vars-1].var_qual = INT_MAX;
         } else {
              (*mtc_quals)[num_vars-1].var_qual = var->qual;
         }

         /* strand bias */
         if ( ! vcf_var_has_info_key(&sb_char, var, "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                    sb_missing_warning_printed = 1;
               }
               (*mtc_quals)[num_vars-1].sb_qual = 0;
         } else {
              (*mtc_quals)[num_vars-1].sb_qual = atoi(sb_char);
              free(sb_char);
         }

         (*mtc_quals)[num_vars-1].is_alt_mostly_on_one_strand =  alt_mostly_on_one_strand(var);

         vcf_free_var(&var);
    }
    vcf_file_close(&vcffh);

    return num_vars;
}
Example #4
0
int
main_filter(int argc, char *argv[])
{
     filter_conf_t cfg;
     char *vcf_in = NULL, *vcf_out = NULL;
     static int print_only_passed = 1;
     static int sb_filter_no_compound = 0;
     static int sb_filter_incl_indels = 0;
     static int only_indels = 0;
     static int only_snvs = 0;
     char *vcf_header = NULL;
     mtc_qual_t *mtc_quals = NULL;
     long int num_vars;
     static int no_defaults = 0;
     long int var_idx = -1;

     /* default filter options */
     memset(&cfg, 0, sizeof(filter_conf_t));
     cfg.dp_filter.min = cfg.dp_filter.max = -1;
     cfg.af_filter.min = cfg.af_filter.max = -1;
     cfg.sb_filter.alpha = DEFAULT_SIG;
     cfg.snvqual_filter.alpha = DEFAULT_SIG;
     cfg.indelqual_filter.alpha = DEFAULT_SIG;


    /* keep in sync with long_opts_str and usage
     *
     * getopt is a pain in the whole when it comes to syncing of long
     * and short args and usage. check out gopt, libcfu...
     */
    while (1) {
         int c;
         static struct option long_opts[] = {
              /* see usage sync */
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"print-all", no_argument, &print_only_passed, 0},
              {"no-defaults", no_argument, &no_defaults, 1},
              {"only-indels", no_argument, &only_indels, 1},
              {"only-snvs", no_argument, &only_snvs, 1},

              {"help", no_argument, NULL, 'h'},
              {"in", required_argument, NULL, 'i'},
              {"out", required_argument, NULL, 'o'},

              {"cov-min", required_argument, NULL, 'v'},
              {"cov-max", required_argument, NULL, 'V'},

              {"af-min", required_argument, NULL, 'a'},
              {"af-max", required_argument, NULL, 'A'},

              {"sb-thresh", required_argument, NULL, 'B'},
              {"sb-mtc", required_argument, NULL, 'b'},
              {"sb-alpha", required_argument, NULL, 'c'},
              {"sb-no-compound", no_argument, &sb_filter_no_compound, 1},
              {"sb-incl-indels", no_argument, &sb_filter_incl_indels, 1},

              {"snvqual-thresh", required_argument, NULL, 'Q'},
              {"snvqual-mtc", required_argument, NULL, 'q'},
              {"snvqual-alpha", required_argument, NULL, 'r'},
              {"snvqual-ntests", required_argument, NULL, 's'},

              {"indelqual-thresh", required_argument, NULL, 'K'},
              {"indelqual-mtc", required_argument, NULL, 'k'},
              {"indelqual-alpha", required_argument, NULL, 'l'},
              {"indelqual-ntests", required_argument, NULL, 'm'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "hi:o:v:V:a:A:B:b:c:Q:q:r:s:K:k:l:m:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h':
              usage(& cfg);
              return 0;

         case 'i':
              vcf_in = strdup(optarg);
              break;
         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        return 1;
                   }
              }
              vcf_out = strdup(optarg);
              break;

         case 'v':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.dp_filter.min = atoi(optarg);
              break;
         case 'V':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.dp_filter.max = atoi(optarg);
              break;

         case 'a':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.af_filter.min = strtof(optarg, NULL);
              break;
         case 'A':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.af_filter.max = strtof(optarg, NULL);
              break;

         case 'B':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.sb_filter.thresh = atoi(optarg);
              break;
         case 'b':
              cfg.sb_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == cfg.sb_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for strandbias filtering\n", optarg);
                   return -1;
              }
              break;
         case 'c':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.sb_filter.alpha = strtof(optarg, NULL);
              break;

         case 'Q':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.snvqual_filter.thresh = atoi(optarg);
              break;
         case 'q':
              cfg.snvqual_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == cfg.snvqual_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg);
                   return -1;
              }
              break;
         case 'r':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.snvqual_filter.alpha = strtof(optarg, NULL);
              break;
         case 's':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.snvqual_filter.ntests = atol(optarg);
              break;

         case 'K':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.indelqual_filter.thresh = atoi(optarg);
              break;
         case 'k':
              cfg.indelqual_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == cfg.indelqual_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg);
                   return -1;
              }
              break;
         case 'l':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.indelqual_filter.alpha = strtof(optarg, NULL);
              break;
         case 'm':
              if (! isdigit(optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.indelqual_filter.ntests = atol(optarg);
              break;

         case '?':
              LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n");
              return 1;

         default:
              break;
         }
    }
    cfg.print_only_passed = print_only_passed;
    cfg.only_indels = only_indels;
    cfg.only_snvs = only_snvs;
    cfg.sb_filter.no_compound = sb_filter_no_compound;
    cfg.sb_filter.incl_indels = sb_filter_incl_indels;

    if (cfg.only_indels && cfg.only_snvs) {
         LOG_FATAL("%s\n", "Can't keep only indels and only snvs");
         return 1;
    }
    
    if (! no_defaults) {
         if (cfg.sb_filter.mtc_type==MTC_NONE && ! cfg.sb_filter.thresh) {
              LOG_VERBOSE("%s\n", "Setting default SB filtering method to FDR");
              cfg.sb_filter.mtc_type = MTC_FDR;
              cfg.sb_filter.alpha = 0.001;
         }
         if (cfg.dp_filter.min<0) {
              cfg.dp_filter.min = 10;
              LOG_VERBOSE("Setting default minimum coverage to %d\n", cfg.dp_filter.min);
         }
    } else {
         LOG_VERBOSE("%s\n", "Skipping default settings");
    }

    if (0 != argc - optind - 1) {/* FIXME needed at all? */
         LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n");
         return 1;
    }

    /* logic check of command line parameters
     */
    if (cfg.dp_filter.max > 0 &&  cfg.dp_filter.max < cfg.dp_filter.min) {
         LOG_FATAL("%s\n", "Invalid coverage-filter settings");
         return 1;
    }
    if ((cfg.af_filter.max > 0 && cfg.af_filter.max < cfg.af_filter.min) ||
        (cfg.af_filter.max > 1.0)) {
         LOG_FATAL("%s\n", "Invalid AF-filter settings");
         return 1;
    }

    if (cfg.sb_filter.thresh && cfg.sb_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed strand-bias threshold *and* multiple testing correction.");
         return 1;
    }
    if (cfg.snvqual_filter.thresh && cfg.snvqual_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed SNV quality threshold *and* multiple testing correction.");
         return 1;
    }
    if (cfg.indelqual_filter.thresh && cfg.indelqual_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed indel quality threshold *and* multiple testing correction.");
         return 1;
    }

    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& cfg);
        return 1;
    }

    if (debug) {
         dump_filter_conf(& cfg);
     }

    /* missing file args default to stdin and stdout
     */
    /* no streaming allowed for vcf_in: we need to determine thresholds first */
    if  (! vcf_in) {
         LOG_FATAL("%s\n", "Input VCF missing. No streaming allowed. Need to determine auto threshold in memory friendly manner first.");
         return 1;
    }
    if  (! vcf_out) {
         vcf_out = malloc(2 * sizeof(char));
         strcpy(vcf_out, "-");
    }
    LOG_DEBUG("vcf_in=%s vcf_out=%s\n", vcf_in, vcf_out);



    /* First pass parsing to get qualities for MTC computation (if needed)
     */
    if (cfg.sb_filter.mtc_type != MTC_NONE || cfg.snvqual_filter.mtc_type != MTC_NONE || cfg.indelqual_filter.mtc_type != MTC_NONE) {
#ifdef TRACE
         long int i = 0;
#endif
         LOG_VERBOSE("%s\n", "At least one type of multiple testing correction requested. Doing first pass of vcf");

         if ((num_vars = mtc_quals_from_vcf_file(& mtc_quals, vcf_in)) < 0) {
              LOG_ERROR("Couldn't parse %s\n", vcf_in);
              return 1;
         }

         if (cfg.sb_filter.mtc_type != MTC_NONE) {
              if (apply_sb_filter_mtc(mtc_quals, & cfg.sb_filter, num_vars)) {
                   LOG_FATAL("%s\n", "Multiple testing correction on strand-bias pvalues failed");
                   return -1;
              }
         }
         if (cfg.indelqual_filter.mtc_type != MTC_NONE) {
              if (apply_indelqual_filter_mtc(mtc_quals, & cfg.indelqual_filter, num_vars)) {
                   LOG_FATAL("%s\n", "Multiple testing correction on indel quality pvalues failed");
                   return -1;
              }
         }
         if (cfg.snvqual_filter.mtc_type != MTC_NONE) {
              if (apply_snvqual_filter_mtc(mtc_quals, & cfg.snvqual_filter, num_vars)) {
                   LOG_FATAL("%s\n", "Multiple testing correction on SNV quality pvalues failed");
                   return -1;
              }
         }
#ifdef TRACE
         for (i=0; i<num_vars; i++) {
              LOG_WARN("mtc_quals #%ld sb_qual=%d var_qual=%d is_indel=%d\n", 
                       i, mtc_quals[i].sb_qual, mtc_quals[i].var_qual, mtc_quals[i].is_indel);
         }
#endif
         LOG_VERBOSE("%s\n", "MTC application completed");
    } else {
         LOG_VERBOSE("%s\n", "No multiple testing correction requested. First pass of vcf skipped");

    }

    
    if (vcf_file_open(& cfg.vcf_in, vcf_in,
                      HAS_GZIP_EXT(vcf_in), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in);
         return 1;
    }
    if (vcf_file_open(& cfg.vcf_out, vcf_out,
                      HAS_GZIP_EXT(vcf_out), 'w')) {
         LOG_ERROR("Couldn't open %s\n", vcf_out);
         return 1;
    }
    free(vcf_in);
    free(vcf_out);

    /* print header
     */
    if (0 !=  vcf_parse_header(&vcf_header, & cfg.vcf_in)) {
         /* LOG_WARN("%s\n", "vcf_parse_header() failed"); */
         if (vcf_file_seek(& cfg.vcf_in, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              return -1;
         }
    }
    /* also sets filter names */
    cfg_filter_to_vcf_header(& cfg, &vcf_header);
    vcf_write_header(& cfg.vcf_out, vcf_header);
    free(vcf_header);


    /* read in variants
     */
    while (1) {
         var_t *var;
         int rc;
         int is_indel = 0;

         vcf_new_var(&var);
         rc = vcf_parse_var(& cfg.vcf_in, var);
         if (rc) {
              /* how to distinguish between error and EOF? */
              break;
         }
         var_idx += 1;

         is_indel = vcf_var_is_indel(var);

         if (cfg.only_snvs && is_indel) {
              vcf_free_var(&var);
              continue;
         } else if (cfg.only_indels && ! is_indel) {
              vcf_free_var(&var);
              continue;
         }


         /* filters applying to all types of variants
          */
         apply_af_filter(var, & cfg.af_filter);
         apply_dp_filter(var, & cfg.dp_filter);

         /* quality threshold per variant type
          */
         if (! is_indel) {
              if (cfg.snvqual_filter.thresh) {
                   assert(cfg.snvqual_filter.mtc_type == MTC_NONE);
                   apply_snvqual_threshold(var, & cfg.snvqual_filter);
              } else if (cfg.snvqual_filter.mtc_type != MTC_NONE) {
                   if (mtc_quals[var_idx].var_qual != -1) {
                        vcf_var_add_to_filter(var, cfg.snvqual_filter.id);
                   }
              }

         } else {
              if (cfg.indelqual_filter.thresh) {
                   assert(cfg.indelqual_filter.mtc_type == MTC_NONE);
                   apply_indelqual_threshold(var, & cfg.indelqual_filter);
              } else if (cfg.indelqual_filter.mtc_type != MTC_NONE) {
                   if (mtc_quals[var_idx].var_qual != -1) {
                        vcf_var_add_to_filter(var, cfg.indelqual_filter.id);
                   }
              }
         }
         
         /* sb filter 
          */
         if (cfg.sb_filter.thresh) {
              if (! is_indel || cfg.sb_filter.incl_indels) {
                   assert(cfg.sb_filter.mtc_type == MTC_NONE);
                   apply_sb_threshold(var, & cfg.sb_filter);
              }
         } else if (cfg.sb_filter.mtc_type != MTC_NONE) {
              if (! is_indel || cfg.sb_filter.incl_indels) {
                   if (mtc_quals[var_idx].sb_qual == -1) {
                        vcf_var_add_to_filter(var, cfg.sb_filter.id);
                   }
              }              
         }
         

         /* output
          */
         if (cfg.print_only_passed && ! (VCF_VAR_PASSES(var))) {
              vcf_free_var(&var);
              continue;
         }

         /* add pass if no filters were set */
         if (! var->filter || strlen(var->filter)<=1) {
              char pass_str[] = "PASS";
              if (var->filter) {
                   free(var->filter);
              }
              var->filter = strdup(pass_str);
         }

         vcf_write_var(& cfg.vcf_out, var);
         vcf_free_var(&var);

         if (var_idx%1000==0) {
              (void) vcf_file_flush(& cfg.vcf_out);
         }
    }

    vcf_file_close(& cfg.vcf_in);
    vcf_file_close(& cfg.vcf_out);

    free(mtc_quals);

    LOG_VERBOSE("%s\n", "Successful exit.");

    return 0;
}
Example #5
0
/* Entry point of the variant filter command.
 *
 * Parses the command line (option parsing starts at argv[1], i.e. the
 * command name, argv[0] is skipped), loads all variants of the input
 * vcf into memory, applies fixed-threshold filters while reading and
 * multiple-testing-correction (MTC) based filters once all variants
 * are known, then writes the header and the (optionally only passed)
 * variants to the output vcf.
 *
 * Missing --in / --out default to "-".
 *
 * Returns 0 on success or after printing help, 1 or -1 on invalid
 * arguments, I/O problems, memory exhaustion or failed MTC.
 */
int
main_filter(int argc, char *argv[])
{
     filter_conf_t cfg;
     char *vcf_in = NULL, *vcf_out = NULL;
     /* flags are static because their addresses are used in the static
      * long_opts initializer below */
     static int print_only_passed = 1;
     static int sb_filter_no_compound = 0;
     static int sb_filter_incl_indels = 0;
     static int only_indels = 0;
     static int only_snvs = 0;
     char *vcf_header = NULL;
     var_t **vars = NULL;
     long int num_vars = 0; /* isn't long overkill here ? */
     long int vars_size = 0; /* keeping track of how much memory we've got pre-allocated */
     long int i;
     static int no_defaults = 0;
     int exit_status = 0; /* value returned through the common cleanup path */

     /* default filter options */
     memset(&cfg, 0, sizeof(filter_conf_t));
     cfg.dp_filter.min = cfg.dp_filter.max = -1;
     cfg.af_filter.min = cfg.af_filter.max = -1;
     cfg.sb_filter.alpha = DEFAULT_SIG;
     cfg.snvqual_filter.alpha = DEFAULT_SIG;
     cfg.indelqual_filter.alpha = DEFAULT_SIG;


    /* keep in sync with long_opts_str and usage
     *
     * getopt is a pain when it comes to syncing of long
     * and short args and usage. check out gopt, libcfu...
     */
    while (1) {
         int c;
         static struct option long_opts[] = {
              /* see usage sync */
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"print-all", no_argument, &print_only_passed, 0},
              {"no-defaults", no_argument, &no_defaults, 1},
              {"only-indels", no_argument, &only_indels, 1},
              {"only-snvs", no_argument, &only_snvs, 1},

              {"help", no_argument, NULL, 'h'},
              {"in", required_argument, NULL, 'i'},
              {"out", required_argument, NULL, 'o'},

              {"cov-min", required_argument, NULL, 'v'},
              {"cov-max", required_argument, NULL, 'V'},

              {"af-min", required_argument, NULL, 'a'},
              {"af-max", required_argument, NULL, 'A'},

              {"sb-thresh", required_argument, NULL, 'B'},
              {"sb-mtc", required_argument, NULL, 'b'},
              {"sb-alpha", required_argument, NULL, 'c'},
              {"sb-no-compound", no_argument, &sb_filter_no_compound, 1},
              {"sb-incl-indels", no_argument, &sb_filter_incl_indels, 1},

              {"snvqual-thresh", required_argument, NULL, 'Q'},
              {"snvqual-mtc", required_argument, NULL, 'q'},
              {"snvqual-alpha", required_argument, NULL, 'r'},
              {"snvqual-ntests", required_argument, NULL, 's'},

              {"indelqual-thresh", required_argument, NULL, 'K'},
              {"indelqual-mtc", required_argument, NULL, 'k'},
              {"indelqual-alpha", required_argument, NULL, 'l'},
              {"indelqual-ntests", required_argument, NULL, 'm'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "hi:o:v:V:a:A:B:b:c:Q:q:r:s:K:k:l:m:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         /* note: isdigit() requires a value representable as unsigned
          * char, hence the casts on optarg[0] below */
         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h':
              usage(& cfg);
              return 0;

         case 'i':
              free(vcf_in); /* option may be given more than once */
              vcf_in = strdup(optarg);
              break;
         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        return 1;
                   }
              }
              free(vcf_out); /* option may be given more than once */
              vcf_out = strdup(optarg);
              break;

         case 'v':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.dp_filter.min = atoi(optarg);
              break;
         case 'V':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.dp_filter.max = atoi(optarg);
              break;

         case 'a':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.af_filter.min = strtof(optarg, NULL);
              break;
         case 'A':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.af_filter.max = strtof(optarg, NULL);
              break;

         case 'B':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.sb_filter.thresh = atoi(optarg);
              break;
         case 'b':
              cfg.sb_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == cfg.sb_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for strandbias filtering\n", optarg);
                   return -1;
              }
              break;
         case 'c':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.sb_filter.alpha = strtof(optarg, NULL);
              break;

         case 'Q':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.snvqual_filter.thresh = atoi(optarg);
              break;
         case 'q':
              cfg.snvqual_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == cfg.snvqual_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg);
                   return -1;
              }
              break;
         case 'r':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.snvqual_filter.alpha = strtof(optarg, NULL);
              break;
         case 's':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.snvqual_filter.ntests = atol(optarg);
              break;

         case 'K':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.indelqual_filter.thresh = atoi(optarg);
              break;
         case 'k':
              cfg.indelqual_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == cfg.indelqual_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for indel quality filtering\n", optarg);
                   return -1;
              }
              break;
         case 'l':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.indelqual_filter.alpha = strtof(optarg, NULL);
              break;
         case 'm':
              if (! isdigit((unsigned char)optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              cfg.indelqual_filter.ntests = atol(optarg);
              break;

         case '?':
              LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n");
              return 1;

         default:
              /* flag-only long options: getopt_long already stored the value */
              break;
         }
    }
    /* copy flag-style options collected by getopt_long into the config */
    cfg.print_only_passed = print_only_passed;
    cfg.only_indels = only_indels;
    cfg.only_snvs = only_snvs;
    cfg.sb_filter.no_compound = sb_filter_no_compound;
    cfg.sb_filter.incl_indels = sb_filter_incl_indels;

    if (cfg.only_indels && cfg.only_snvs) {
         LOG_FATAL("%s\n", "Can't keep only indels and only snvs");
         return 1;
    }
    
    if (! no_defaults) {
         /* defaults only kick in where the user did not choose anything */
         if (cfg.sb_filter.mtc_type==MTC_NONE && ! cfg.sb_filter.thresh) {
              LOG_VERBOSE("%s\n", "Setting default SB filtering method to FDR");
              cfg.sb_filter.mtc_type = MTC_FDR;
              cfg.sb_filter.alpha = 0.001;
         }
         if (cfg.dp_filter.min<0) {
              cfg.dp_filter.min = 10;
              LOG_VERBOSE("Setting default minimum coverage to %d\n", cfg.dp_filter.min);
         }
    } else {
         LOG_VERBOSE("%s\n", "Skipping default settings");
    }

    /* getopt_long was run on argv+1, so optind is relative to that */
    if (0 != argc - optind - 1) {/* FIXME needed at all? */
         LOG_FATAL("%s\n", "Unrecognized argument found. Exiting...\n");
         return 1;
    }

    /* logic check of command line parameters
     */
    if (cfg.dp_filter.max > 0 &&  cfg.dp_filter.max < cfg.dp_filter.min) {
         LOG_FATAL("%s\n", "Invalid coverage-filter settings");
         return 1;
    }
    if ((cfg.af_filter.max > 0 && cfg.af_filter.max < cfg.af_filter.min) ||
        (cfg.af_filter.max > 1.0)) {
         LOG_FATAL("%s\n", "Invalid AF-filter settings");
         return 1;
    }

    /* a fixed threshold and MTC are mutually exclusive per filter */
    if (cfg.sb_filter.thresh && cfg.sb_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed strand-bias threshold *and* multiple testing correction.");
         return 1;
    }
    if (cfg.snvqual_filter.thresh && cfg.snvqual_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed SNV quality threshold *and* multiple testing correction.");
         return 1;
    }
    if (cfg.indelqual_filter.thresh && cfg.indelqual_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed indel quality threshold *and* multiple testing correction.");
         return 1;
    }

    /* called without any options at all: show usage */
    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& cfg);
        return 1;
    }

    if (debug) {
          dump_filter_conf(& cfg);
     }

    /* missing file args default to stdin and stdout
     */
    if  (! vcf_in) {
         vcf_in = strdup("-");
    }
    if  (! vcf_out) {
         vcf_out = strdup("-");
    }
    if (! vcf_in || ! vcf_out) {
         LOG_FATAL("%s\n", "Out of memory while storing file names");
         free(vcf_in);
         free(vcf_out);
         return -1;
    }
    LOG_DEBUG("vcf_in=%s vcf_out=%s\n", vcf_in, vcf_out);


    /* open vcf files
     */
    if (vcf_file_open(& cfg.vcf_in, vcf_in,
                      HAS_GZIP_EXT(vcf_in), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in);
         free(vcf_in);
         free(vcf_out);
         return 1;
    }
    if (vcf_file_open(& cfg.vcf_out, vcf_out,
                      HAS_GZIP_EXT(vcf_out), 'w')) {
         LOG_ERROR("Couldn't open %s\n", vcf_out);
         vcf_file_close(& cfg.vcf_in);
         free(vcf_in);
         free(vcf_out);
         return 1;
    }
    free(vcf_in);
    free(vcf_out);

    /* FIXME everything below here should go into a function with args:
       - cfg
       - ...what else?
    */

    /* print header
     */
    if (0 !=  vcf_parse_header(&vcf_header, & cfg.vcf_in)) {
         /* LOG_WARN("%s\n", "vcf_parse_header() failed"); */
         if (vcf_file_seek(& cfg.vcf_in, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              vcf_file_close(& cfg.vcf_in);
              vcf_file_close(& cfg.vcf_out);
              return -1;
         }
    }
    /* also sets filter names */
    cfg_filter_to_vcf_header(& cfg, &vcf_header);
    vcf_write_header(& cfg.vcf_out, vcf_header);
    free(vcf_header);


    /* read in variants. since many filters perform multiple testing
     * correction and therefore need to look at all variants we keep
     * it simple and load them all into memory. 
     * 
     * in theory we could apply all 'simple' filters directly within
     * the loop here and depending on the result spit the variant out
     * or not. only complex filters need to see all variants first to,
     * e.g. apply multiple testing.
     */
    num_vars = 0;
    while (1) {
         var_t *var;
         int rc;
         int is_indel = 0;

         vcf_new_var(&var);
         rc = vcf_parse_var(& cfg.vcf_in, var);
         if (rc) {
              /* how to distinguish between error and EOF? */
              /* use the var destructor (not plain free) so that any
               * members already filled in are released as well */
              vcf_free_var(&var);
              break;
         }

         is_indel = vcf_var_is_indel(var);

         /* skipped variants are fully parsed, so they need the var
          * destructor too */
         if (cfg.only_snvs && is_indel) {
              vcf_free_var(&var);
              continue;
         } else if (cfg.only_indels && ! is_indel) {
              vcf_free_var(&var);
              continue;
         }

         /* read all in, no matter if already filtered. we keep adding filters */
         if (num_vars + 1 >= vars_size) {
              const long incr = 128;
              /* grow via a temporary so the array is not lost on failure */
              var_t **grown = realloc(vars, (vars_size+incr) * sizeof(var_t*));
              if (! grown) {
                   LOG_FATAL("%s\n", "Out of memory while storing variants");
                   vcf_free_var(&var);
                   exit_status = -1;
                   break;
              }
              vars = grown;
              vars_size += incr;
         }
         vars[num_vars] = var;
         num_vars += 1;
#ifdef TRACE
         {
              char *key;
              vcf_var_key(&key,  vars[num_vars-1]);
              fprintf(stderr, "storing var %ld+1: %s\n", num_vars, key);
              free(key);
         }
#endif

         /* filters applying to all types of variants
          */
         apply_af_filter(var, & cfg.af_filter);
         apply_dp_filter(var, & cfg.dp_filter);

         /* quality threshold per variant type
          */
         if (! is_indel) {
              if (cfg.snvqual_filter.thresh) {
                   assert(cfg.snvqual_filter.mtc_type == MTC_NONE);
                   apply_snvqual_threshold(var, & cfg.snvqual_filter);
              }

         } else {
              if (cfg.indelqual_filter.thresh) {
                   assert(cfg.indelqual_filter.mtc_type == MTC_NONE);
                   apply_indelqual_threshold(var, & cfg.indelqual_filter);
              }
         }
         
         if (cfg.sb_filter.thresh) {
              if (! is_indel || cfg.sb_filter.incl_indels) {
                   assert(cfg.sb_filter.mtc_type == MTC_NONE);
                   apply_sb_threshold(var, & cfg.sb_filter);
              }
         }
    }

    vcf_file_close(& cfg.vcf_in);
    if (exit_status) {
         goto cleanup;
    }

    if (num_vars) {
         /* give back unused slots; if shrinking fails the larger
          * array is still valid and simply kept */
         var_t **shrunk = realloc(vars, (num_vars * sizeof(var_t*)));
         if (shrunk) {
              vars = shrunk;
         }
    }
    LOG_VERBOSE("Parsed %ld variants\n", num_vars);


    /* filters based on multiple testing correction need all variants */
    if (cfg.sb_filter.mtc_type != MTC_NONE) {
         if (apply_sb_filter_mtc(& cfg.sb_filter, vars, num_vars)) {
              LOG_FATAL("%s\n", "Multiple testing correction on strand-bias pvalues failed");
              exit_status = -1;
              goto cleanup;
         }
    }

    if (cfg.snvqual_filter.mtc_type != MTC_NONE) {
         if (apply_snvqual_filter_mtc(& cfg.snvqual_filter, vars, num_vars)) {
              LOG_FATAL("%s\n", "Multiple testing correction on SNV qualities failed");
              exit_status = -1;
              goto cleanup;
         }
    }

    if (cfg.indelqual_filter.mtc_type != MTC_NONE) {
         if (apply_indelqual_filter_mtc(& cfg.indelqual_filter, vars, num_vars)) {
              LOG_FATAL("%s\n", "Multiple testing correction on Indel qualities failed");
              exit_status = -1;
              goto cleanup;
         }
    }

    /* output
     */
    for (i=0; i<num_vars; i++) {
         var_t *v = vars[i];

         if (cfg.print_only_passed && ! (VCF_VAR_PASSES(v))) {
              continue;
         }

         /* add pass if no filters were set */
         if (! v->filter || strlen(v->filter)<=1) {
              char pass_str[] = "PASS";
              if (v->filter) {
                   free(v->filter);
              }
              v->filter = strdup(pass_str);
         }

         vcf_write_var(& cfg.vcf_out, v);
    }

cleanup:
    /* shared by the success path and by failures after both files
     * were opened */
    vcf_file_close(& cfg.vcf_out);

    for (i=0; i<num_vars; i++) {
         vcf_free_var(& vars[i]);
    }
    free(vars);

    if (0 == exit_status) {
         LOG_VERBOSE("%s\n", "Successful exit.");
    }

    return exit_status;
}