示例#1
0
/* Fixed-threshold strand-bias filter for a single variant.
 *
 * A zero sb_filter->thresh disables the filter. Otherwise the variant's "SB"
 * INFO value is read as an integer; when it is greater than the threshold and
 * either sb_filter->no_compound is set or alt_mostly_on_one_strand() is true
 * for the variant, sb_filter->id is added to the variant's filter field.
 *
 * A variant without an SB tag is left untouched; a warning is logged the
 * first time that happens (tracked in sb_missing_warning_printed).
 */
void apply_sb_threshold(var_t *var, sb_filter_t *sb_filter)
{
     char *sb_value = NULL;
     int strand_bias;

     if (! sb_filter->thresh) {
          return;
     }

     if (vcf_var_has_info_key(&sb_value, var, "SB")) {
          strand_bias = atoi(sb_value);
          /* the value string handed back by the lookup is ours to release */
          free(sb_value);

          if (strand_bias > sb_filter->thresh
              && (sb_filter->no_compound || alt_mostly_on_one_strand(var))) {
               vcf_var_add_to_filter(var, sb_filter->id);
          }
          return;
     }

     /* no SB tag: complain once, then stay quiet */
     if ( ! sb_missing_warning_printed) {
          LOG_WARN("%s\n", "Requested SB filtering failed since SB tag is missing in variant");
          sb_missing_warning_printed = 1;
     }
}
示例#2
0
/* Applies the minimum/maximum allele-frequency filters to one variant.
 *
 * Nothing happens when neither af_filter->min nor af_filter->max is positive,
 * or once AF filtering has been switched off via af_missing_warning_printed
 * (set below the first time the AF tag is missing or cannot be parsed).
 *
 * Otherwise the variant's "AF" INFO value is parsed as a float and
 * af_filter->id_min / af_filter->id_max are added to the variant's filter
 * field when the value is below the (positive) minimum or above the
 * (positive) maximum.
 */
void apply_af_filter(var_t *var, af_filter_t *af_filter)
{
     char *af_char = NULL;
     float af;

     if (af_missing_warning_printed) {
          return;
     }
     if (! (af_filter->min > 0 || af_filter->max > 0)) {
          return;
     }

     /* always bail out on a missing tag so af_char can never reach strtof()
      * as NULL */
     if ( ! vcf_var_has_info_key(&af_char, var, "AF")) {
          LOG_WARN("%s\n", "Requested AF filtering failed since AF tag is missing in variant");
          af_missing_warning_printed = 1;
          return;
     }

     /* strtof() only sets errno on failure; clear it first so a stale ERANGE
      * from an unrelated earlier call is not mistaken for a parse error */
     errno = 0;
     af = strtof(af_char, (char **)NULL); /* atof */
     if (errno==ERANGE) {
          LOG_ERROR("Couldn't parse AF from af_char %s. Disabling AF filtering\n", af_char);
          free(af_char);
          af_missing_warning_printed = 1;
          return;
     }
     free(af_char);

     if (af_filter->min > 0.0 && af < af_filter->min) {
          vcf_var_add_to_filter(var, af_filter->id_min);
     }
     if (af_filter->max > 0.0 && af > af_filter->max) {
          vcf_var_add_to_filter(var, af_filter->id_max);
     }
}
示例#3
0
/* Fixed-threshold indel-quality filter for a single variant.
 *
 * The caller must only pass variants carrying the "INDEL" INFO key (checked
 * by assert). With a non-zero indelqual_filter->thresh, a variant whose
 * quality is set (greater than -1) but below the threshold gets
 * indelqual_filter->id added to its filter field.
 */
void apply_indelqual_threshold(var_t *var, indelqual_filter_t *indelqual_filter)
{
     assert (vcf_var_has_info_key(NULL, var, "INDEL"));

     /* zero threshold == filter disabled; qual of -1 == no quality recorded */
     if (indelqual_filter->thresh
         && var->qual > -1
         && var->qual < indelqual_filter->thresh) {
          vcf_var_add_to_filter(var, indelqual_filter->id);
     }
}
示例#4
0
文件: lofreq_uniq.c 项目: CSB5/lofreq
/* Returns the integer stored under uniq_phred_tag in the variant's INFO
 * field, or 0 when the tag is absent (missing because of no coverage or
 * other reasons; such a variant is not considered unique anyway).
 */
int
uniq_phred_from_var(var_t *var) {
     char *tag_value = NULL;
     int phred = 0;

     if (vcf_var_has_info_key(&tag_value, var, uniq_phred_tag)) {
          /* strtol used as atoi replacement */
          phred = (int) strtol(tag_value, (char **)NULL, 10);
          free(tag_value);
     }
     return phred;
}
示例#5
0
/* Applies the minimum/maximum coverage filters to one variant.
 *
 * Nothing happens when neither dp_filter->min nor dp_filter->max is positive,
 * or once coverage filtering has been switched off via
 * dp_missing_warning_printed (set below the first time the DP tag is
 * missing).
 *
 * Otherwise the variant's "DP" INFO value is parsed as an integer and
 * dp_filter->id_min / dp_filter->id_max are added to the variant's filter
 * field when the value is below the (positive) minimum or above the
 * (positive) maximum. A conversion error reported through errno terminates
 * the program.
 */
void apply_dp_filter(var_t *var, dp_filter_t *dp_filter)
{
     char *dp_char = NULL;
     /* kept as long: storing strtol()'s result in an int would silently
      * truncate values that do not fit before they are compared */
     long cov;

     if (dp_missing_warning_printed) {
          return;
     }
     if (! (dp_filter->min > 0 || dp_filter->max > 0)) {
          return;
     }

     /* always bail out on a missing tag so dp_char can never reach strtol()
      * as NULL */
     if ( ! vcf_var_has_info_key(&dp_char, var, "DP")) {
#ifdef DEBUG
          vcf_file_t f; f.fh = stderr; f.gz = 0; vcf_write_var(&f, var);
#endif
          LOG_WARN("%s\n", "Requested coverage filtering failed since DP tag is missing in variant");
          dp_missing_warning_printed = 1;
          return;
     }

     errno = 0;
     cov = strtol(dp_char, (char **) NULL, 10);
     if (errno) {
          LOG_FATAL("%s\n", "error during int conversion");
          exit(1);
     }
     free(dp_char);

     if (dp_filter->min > 0 && cov < dp_filter->min) {
          vcf_var_add_to_filter(var, dp_filter->id_min);
     }
     if (dp_filter->max > 0 && cov > dp_filter->max) {
          vcf_var_add_to_filter(var, dp_filter->id_max);
     }
}
示例#6
0
文件: lofreq_uniq.c 项目: CSB5/lofreq
/* Entry point of the uniq sub-command.
 *
 * argv[0] is expected to be the program name and argv[1] the sub-command
 * name, which is why option parsing runs on argc-1/argv+1. Exactly one BAM
 * file has to follow the options.
 *
 * Steps:
 *  1. set defaults and parse the command line
 *  2. open input and output vcf, copy the header and add the UNIQ/UQ
 *     (and optionally SOMATIC and filter) header lines
 *  3. read all variants, run mpileup() on the BAM file once per variant with
 *     uniq_snv() as callback and, if a fixed threshold was given, apply it
 *  4. apply multiple testing correction if requested and write the variants
 *     (all of them with --use-det-lim or --output-all, otherwise only those
 *     passing the filters)
 *
 * Returns 0 after printing help, 1 or -1 on errors, otherwise the return
 * value of the last mpileup() call (0 if no variant was processed).
 */
int
main_uniq(int argc, char *argv[])
{
     int c, i;
     char *bam_file = NULL;
     char *vcf_in = NULL; /* "-" skips the file-exists check; presumably stdin */
     char *vcf_out = NULL; /* - == stdout */
     mplp_conf_t mplp_conf;
     uniq_conf_t uniq_conf;
     void (*plp_proc_func)(const plp_col_t*, void*);
     int rc = 0;
     var_t **vars = NULL;
     int num_vars = 0;
     char *vcf_header = NULL;
     /* static because getopt_long() stores flag values through their address */
     static int use_det_lim = 0;
     static int use_orphan = 0;
     static int output_all = 0;
     static int is_somatic = 0;

     /* default uniq options */
     memset(&uniq_conf, 0, sizeof(uniq_conf_t));
     uniq_conf.uni_freq = DEFAULT_UNI_FREQ;
     uniq_conf.use_det_lim = 0;

     uniq_conf.uniq_filter.mtc_type = MTC_FDR;
     uniq_conf.uniq_filter.alpha = 0.001;

     /* default pileup options */
     memset(&mplp_conf, 0, sizeof(mplp_conf_t));
     mplp_conf.max_mq = DEFAULT_MAX_MQ;
     mplp_conf.min_mq = 1;
     mplp_conf.min_plp_bq = DEFAULT_MIN_PLP_BQ;
     mplp_conf.max_depth = DEFAULT_MAX_PLP_DEPTH;
     mplp_conf.flag = MPLP_NO_ORPHAN;


    /* keep in sync with long_opts_str and usage
     *
     * getopt is a pain when it comes to syncing of long
     * and short args and usage. check out gopt, libcfu...
     */
    while (1) {
         static struct option long_opts[] = {
              /* see usage sync */
              {"help", no_argument, NULL, 'h'},
              {"verbose", no_argument, &verbose, 1},
              {"debug", no_argument, &debug, 1},
              {"use-det-lim", no_argument, &use_det_lim, 1},
              {"use-orphan", no_argument, &use_orphan, 1},
              {"output-all", no_argument, &output_all, 1},
              {"is-somatic", no_argument, &is_somatic, 1},

              {"vcf-in", required_argument, NULL, 'v'},
              {"vcf-out", required_argument, NULL, 'o'},

              {"uni-freq", required_argument, NULL, 'f'},

              {"uniq-thresh", required_argument, NULL, 't'},
              {"uniq-mtc", required_argument, NULL, 'm'},
              {"uniq-alpha", required_argument, NULL, 'a'},
              {"uniq-ntests", required_argument, NULL, 'n'},

              {0, 0, 0, 0} /* sentinel */
         };

         /* keep in sync with long_opts and usage */
         static const char *long_opts_str = "hv:o:f:t:m:a:n:";

         /* getopt_long stores the option index here. */
         int long_opts_index = 0;
         c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                         long_opts_str, long_opts, & long_opts_index);
         if (c == -1) {
              break;
         }

         switch (c) {
         /* keep in sync with long_opts etc */
         case 'h':
              usage(& uniq_conf);
              return 0;

         case 'v':
              if (0 != strcmp(optarg, "-")) {
                   if (! file_exists(optarg)) {
                        LOG_FATAL("Input file '%s' does not exist. Exiting...\n", optarg);
                        return 1;
                   }
              }
              vcf_in = strdup(optarg);
              break;

         case 'o':
              if (0 != strcmp(optarg, "-")) {
                   if (file_exists(optarg)) {
                        LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                        return 1;
                   }
              }
              vcf_out = strdup(optarg);
              break;

         case 'f':
              uniq_conf.uni_freq = strtof(optarg, (char **)NULL); /* atof */
              if (uniq_conf.uni_freq<=0) {
                   LOG_WARN("%s\n", "Ignoring uni-freq option");
              }
              if (uniq_conf.uni_freq>1.0) {
                   LOG_FATAL("%s\n", "Value for uni-freq has to be <1.0");
                   return 1;
              }
              break;

         case 't':
              /* isdigit() is only defined for unsigned char values and EOF,
               * hence the cast (plain char may be negative) */
              if (! isdigit((unsigned char) optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              uniq_conf.uniq_filter.thresh = atoi(optarg);
              /* a fixed threshold replaces the default multiple testing correction */
              uniq_conf.uniq_filter.mtc_type = MTC_NONE;
              break;
         case 'm':
              uniq_conf.uniq_filter.mtc_type = mtc_str_to_type(optarg);
              if (-1 == uniq_conf.uniq_filter.mtc_type) {
                   LOG_FATAL("Unknown multiple testing correction type '%s' for snv quality filtering\n", optarg);
                   return -1;
              }
              break;
         case 'a':
              if (! isdigit((unsigned char) optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              uniq_conf.uniq_filter.alpha = strtof(optarg, NULL);
              break;
         case 'n':
              if (! isdigit((unsigned char) optarg[0])) {
                   LOG_FATAL("Non-numeric argument provided: %s\n", optarg);
                   return -1;
              }
              uniq_conf.uniq_filter.ntests = atol(optarg);
              break;

         case '?':
              LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n");
              return 1;
         default:
              break;
         }
    }
    if (use_orphan) {
         mplp_conf.flag &= ~MPLP_NO_ORPHAN;
    }
    if (debug) {
         dump_mplp_conf(& mplp_conf, stderr);
    }
    uniq_conf.output_all = output_all;
    uniq_conf.use_det_lim = use_det_lim;


#if DEBUG
    LOG_DEBUG("uniq_conf.uniq_filter.thresh = %d\n", uniq_conf.uniq_filter.thresh);
    LOG_DEBUG("uniq_conf.uniq_filter.mtc_type = %d\n", uniq_conf.uniq_filter.mtc_type);
    LOG_DEBUG("uniq_conf.uniq_filter.alpha = %f\n", uniq_conf.uniq_filter.alpha);
    /* ntests is filled from atol(); cast so that the specifier is guaranteed
     * to match the argument */
    LOG_DEBUG("uniq_conf.uniq_filter.ntests = %ld\n", (long) uniq_conf.uniq_filter.ntests);
#endif
    
    if (uniq_conf.uniq_filter.thresh && uniq_conf.uniq_filter.mtc_type != MTC_NONE) {
         LOG_FATAL("%s\n", "Can't use fixed Unique quality threshold *and* multiple testing correction.");
         return 1;
    }

    /* only program and command name given */
    if (argc == 2) {
        fprintf(stderr, "\n");
        usage(& uniq_conf);
        return 1;
    }

    /* optind is relative to argv+1 (see getopt_long call above) */
    if (1 != argc - optind - 1) {
        fprintf(stderr, "Need exactly one BAM file as last argument\n");
        return 1;
    }
    bam_file = (argv + optind + 1)[0];
    if (! file_exists(bam_file)) {
         LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file);
         return -1;
    }


    if (! vcf_in) {
#if 0
         vcf_in = malloc(2 * sizeof(char));
         strcpy(vcf_in, "-");
#else
         LOG_FATAL("%s\n", "No input vcf specified. Exiting...");
         return -1;
#endif
    }
    if (! vcf_out) {
         vcf_out = malloc(2 * sizeof(char));
         if (! vcf_out) {
              LOG_FATAL("%s\n", "out of memory");
              return -1;
         }
         strcpy(vcf_out, "-");
    }

    if (vcf_file_open(& uniq_conf.vcf_in, vcf_in,
                      HAS_GZIP_EXT(vcf_in), 'r')) {
         LOG_ERROR("Couldn't open %s\n", vcf_in);
         return 1;
    }

    if (vcf_file_open(& uniq_conf.vcf_out, vcf_out,
                      HAS_GZIP_EXT(vcf_out), 'w')) {
         LOG_ERROR("Couldn't open %s\n", vcf_out);
         return 1;
    }

    if (0 != vcf_parse_header(&vcf_header, & uniq_conf.vcf_in)) {
         LOG_WARN("%s\n", "vcf_parse_header() failed. trying to rewind to start...");
         if (vcf_file_seek(& uniq_conf.vcf_in, 0, SEEK_SET)) {
              LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                        " after header parsing failed");
              /* both files are open at this point: leave through the common
               * cleanup path instead of returning directly */
              rc = 1;
              goto clean_and_exit;
         }
    } else {
         vcf_header_add(&vcf_header, "##INFO=<ID=UNIQ,Number=0,Type=Flag,Description=\"Unique, i.e. not detectable in paired sample\">\n");
         vcf_header_add(&vcf_header, "##INFO=<ID=UQ,Number=1,Type=Integer,Description=\"Phred-scaled uniq score at this position\">\n");
         if (is_somatic) {
              vcf_header_add(&vcf_header, "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Somatic event\">\n");
         }
         if (! uniq_conf.use_det_lim) {
              char full_filter_str[FILTER_STRSIZE];
              if (uniq_conf.uniq_filter.thresh > 0) {
                   snprintf(uniq_conf.uniq_filter.id, FILTER_ID_STRSIZE, "min_uq_%d", uniq_conf.uniq_filter.thresh);
                   snprintf(full_filter_str, FILTER_STRSIZE,
                            "##FILTER=<ID=%s,Description=\"Minimum Uniq Phred %d\">\n",
                            uniq_conf.uniq_filter.id, uniq_conf.uniq_filter.thresh);
                   vcf_header_add(&vcf_header, full_filter_str);
                   
              } else if (uniq_conf.uniq_filter.mtc_type != MTC_NONE) {
                   char buf[64];
                   mtc_str(buf, uniq_conf.uniq_filter.mtc_type);
                   snprintf(uniq_conf.uniq_filter.id, FILTER_ID_STRSIZE, "uq_%s", buf);
                   snprintf(full_filter_str, FILTER_STRSIZE,
                            "##FILTER=<ID=%s,Description=\"Uniq Multiple Testing Correction: %s corr. pvalue < %f\">\n",
                            uniq_conf.uniq_filter.id, buf, uniq_conf.uniq_filter.alpha);
                   vcf_header_add(& vcf_header, full_filter_str);
              }
         }

         vcf_write_header(& uniq_conf.vcf_out, vcf_header);
         free(vcf_header);
    }

    num_vars = vcf_parse_vars(&vars, & uniq_conf.vcf_in, 1);
    if (0 == num_vars) {
         LOG_WARN("%s\n", "Didn't find any variants in input");
         goto clean_and_exit;
    }
    /* number of tests defaults to the number of variants read */
    if (! uniq_conf.uniq_filter.ntests) {
         uniq_conf.uniq_filter.ntests = num_vars;
    }

    plp_proc_func = &uniq_snv;

    /* one single-position pileup per variant; uniq_snv() annotates
     * uniq_conf.var from within mpileup() */
    for (i=0; i<num_vars; i++) {
         char reg_buf[BUF_SIZE];
         if (i%100==0) {
              LOG_VERBOSE("Processing variant %d of %d\n", i+1, num_vars);
         }
         uniq_conf.var = vars[i];

         snprintf(reg_buf, BUF_SIZE, "%s:%ld-%ld",
                  vars[i]->chrom, vars[i]->pos+1, vars[i]->pos+1);
         mplp_conf.reg = strdup(reg_buf);

         /* the position is printed with %ld above; cast explicitly here so
          * that specifier and argument agree whatever integer type pos has */
         LOG_DEBUG("pileup for var no %d at %s %ld\n",
                   i+1, uniq_conf.var->chrom, (long) uniq_conf.var->pos+1);
#ifdef DISABLE_INDELS
         if (vcf_var_has_info_key(NULL, uniq_conf.var, "INDEL")) {
              LOG_WARN("Skipping indel var at %s %ld\n",
                       uniq_conf.var->chrom, (long) uniq_conf.var->pos+1);
              free(mplp_conf.reg);
              mplp_conf.reg = NULL;
              continue;
         }
#endif
         /* no need to check for filter because done by parse_vars */

         rc = mpileup(&mplp_conf, plp_proc_func, (void*)&uniq_conf,
                      1, (const char **) argv + optind + 1);

         if (uniq_conf.uniq_filter.thresh) {
              apply_uniq_threshold(uniq_conf.var, & uniq_conf.uniq_filter);
         }

         free(mplp_conf.reg);
         mplp_conf.reg = NULL;
    }
    uniq_conf.var = NULL;/* just be sure to not use it accidentally again */


    /* print whatever we've got. there's no UQ to test or we
     * are supposed to print all 
     */
    if (uniq_conf.use_det_lim) {
         for (i=0; i<num_vars; i++) {
              var_t *var = vars[i];
              vcf_write_var(& uniq_conf.vcf_out, var);
         }
         /* all done */
         goto clean_and_exit;
    }



    if (uniq_conf.uniq_filter.mtc_type != MTC_NONE) {
         if (apply_uniq_filter_mtc(& uniq_conf.uniq_filter, vars, num_vars)) {
              LOG_FATAL("%s\n", "Multiple testing correction on uniq pvalues failed");
              /* release files, variants and file names before leaving */
              rc = -1;
              goto clean_and_exit;
         }
    }
    
    for (i=0; i<num_vars; i++) {
         var_t *var = vars[i];
         if (VCF_VAR_PASSES(var) || uniq_conf.output_all) {
              vcf_write_var(& uniq_conf.vcf_out, var);
         }
    }

clean_and_exit:

    vcf_file_close(& uniq_conf.vcf_in);
    vcf_file_close(& uniq_conf.vcf_out);

    for (i=0; i<num_vars; i++) {
         vcf_free_var(& vars[i]);
    }
    free(vars);

    free(vcf_in);
    free(vcf_out);

    if (0==rc) {
         LOG_VERBOSE("%s\n", "Successful exit.");
    }
    /* LOG_FIXME("%s\n", "allow user setting of -S and -J. Currently just using default") */

    return rc;
}
示例#7
0
文件: lofreq_uniq.c 项目: CSB5/lofreq
/* used as pileup callback function which is not ideal since this can
 * only work on one position (has to be ensured by caller).
 *
 * p is the pileup column, confp a uniq_conf_t* whose var member is the
 * variant to annotate. Nothing is done if the column does not belong to the
 * variant's position or if there is no coverage.
 *
 * The frequency to test is conf->uni_freq if positive, otherwise the
 * variant's AF INFO value (out-of-range values are reset with a message).
 *
 * With conf->use_det_lim set, snpcaller() is run with the alt count we would
 * expect at that frequency; if the resulting p-value is significant
 * uniq_flag is added to the variant's INFO field. Otherwise a binomial test
 * on the observed alt count is run and its p-value is added to the INFO
 * field as phred-scaled uniq_phred_tag value.
 *
 * var->pos is printed through an explicit (long) cast with %ld, so the
 * specifier is correct whatever integer type the field has.
 *
 * needs to return void to be used as function pointer to mpileup
 */
void
uniq_snv(const plp_col_t *p, void *confp)
{
     uniq_conf_t *conf = (uniq_conf_t *)confp;
     char *af_char = NULL;
     float af;
     int is_uniq = 0; /* only ever set in the detection limit branch */
     int is_indel;
     int coverage;

     is_indel =  vcf_var_is_indel(conf->var);

#ifdef DISABLE_INDELS
     if (is_indel) {
          LOG_WARN("uniq logic can't be applied to indels."
                   " Skipping indel var at %s %ld\n",
                   conf->var->chrom, (long) conf->var->pos+1);
          return;
     }
#endif

     if (0 != strcmp(p->target, conf->var->chrom) || p->pos != conf->var->pos) {
          LOG_ERROR("wrong pileup for var. pileup for %s %d. var for %s %ld\n",
                    p->target, p->pos+1, conf->var->chrom, (long) conf->var->pos+1);
          return;
     }

     coverage = p->coverage_plp;
     if (is_indel) {
          coverage -= p->num_tails;
     }
     if (1 > coverage) {
          return;
     }

     if (conf->uni_freq <= 0.0) {
          if (! vcf_var_has_info_key(&af_char, conf->var, "AF")) {
               LOG_FATAL("%s\n", "Couldn't parse AF (key not found) from variant");
               /* hard to catch error later */
               exit(1);
          }
          af = strtof(af_char, (char **)NULL); /* atof */
          free(af_char);
          if (af < 0.0 || af > 1.0) {
               float new_af;
               new_af = af<0.0 ? 0.01 : 1.0;
               /* hard to catch error later */
               LOG_FATAL("Invalid (value out of bound) AF %f in variant. Resetting to %f\n", af, new_af);
               af = new_af;
          }

     } else {
          assert(conf->uni_freq <= 1.0);
          af = conf->uni_freq;
     }


     if (conf->use_det_lim) {
          /* given the current base counts and their error probs,
           * would we've been able to detect at given frequency.
           */
          long double pvalues[NUM_NONCONS_BASES];
          double *err_probs; /* error probs (qualities) passed down to snpcaller */
          int num_err_probs;

          int alt_bases[NUM_NONCONS_BASES];/* actual alt bases */
          int alt_counts[NUM_NONCONS_BASES]; /* counts for alt bases handed down to snpcaller */
          int alt_raw_counts[NUM_NONCONS_BASES]; /* raw, unfiltered alt-counts */
          varcall_conf_t varcall_conf;

          int bonf = 1;
          float alpha = 0.01;

          init_varcall_conf(&varcall_conf);
          if (debug) {
               dump_varcall_conf(&varcall_conf, stderr);
          }

          plp_to_errprobs(&err_probs, &num_err_probs,
                          alt_bases, alt_counts, alt_raw_counts,
                          p, &varcall_conf);
          LOG_DEBUG("at %s:%d with cov %d and num_err_probs %d\n", 
              p->target, p->pos, coverage, num_err_probs);

          /* Now pretend we see AF(SNV-to-test)*coverage variant
           * bases. Truncate to int, i.e err on the side of caution
           * during rounding (assume fewer alt bases) */
          alt_counts[0] = af * num_err_probs; /* don't use coverage as that is before filtering */
          alt_counts[1] = alt_counts[2] = 0;

          if (snpcaller(pvalues, err_probs, num_err_probs,
                        alt_counts, bonf, alpha)) {
               fprintf(stderr, "FATAL: snpcaller() failed at %s:%s():%d\n",
                       __FILE__, __FUNCTION__, __LINE__);
               free(err_probs);
               return;
          }

          /* only need to test first pv */
          if (pvalues[0] * (float)bonf < alpha) {
              /* significant value means given the counts and
               * qualities we would have been able to detect this
               * uncalled SNV had it been present at the given
               * frequency. But since we didn't this is a uniq
               * variant.
               * 
               * No point in adding this as phred qual because it
               * means the opposite of UQ
               */

               vcf_var_add_to_info(conf->var, uniq_flag);
               /* record the outcome so that the message below reports it
                * instead of always printing 0 */
               is_uniq = 1;
          }

          LOG_VERBOSE("%s %ld num_quals=%d assumed-var-counts=%d would-have-been-detectable=%d\n",
               conf->var->chrom, (long) conf->var->pos+1, num_err_probs, alt_counts[0], is_uniq);
          free(err_probs);
          
     } else {
          int alt_count;
          double pvalue;
          char info_str[128];

          if (is_indel) {
               int ref_len = strlen(conf->var->ref);
               int alt_len = strlen(conf->var->alt);
               if (ref_len > alt_len) { /* deletion */
                    del_event *it_del;
                    /* lookup key is the reference allele without its first base */
                    char *del_key = malloc((strlen(conf->var->ref)+1)*sizeof(char));
                    if (! del_key) {
                         LOG_FATAL("%s\n", "out of memory");
                         /* hard to catch error later */
                         exit(1);
                    }
                    strcpy(del_key, conf->var->ref+1);
                    it_del = find_del_sequence(&p->del_event_counts, del_key);
                    if (it_del) {
                         alt_count = it_del->count;
                    } else {
                         alt_count = 0;
                    }
                    /* LOG_DEBUG("%s>%s k:%s c:%d\n", conf->var->ref, conf->var->alt, del_key, alt_count); */
                    free(del_key);
               } else { /* insertion */
                    ins_event *it_ins;
                    /* lookup key is the alternate allele without its first base */
                    char *ins_key = malloc((strlen(conf->var->alt)+1)*sizeof(char));
                    if (! ins_key) {
                         LOG_FATAL("%s\n", "out of memory");
                         /* hard to catch error later */
                         exit(1);
                    }
                    strcpy(ins_key, conf->var->alt+1);
                    it_ins = find_ins_sequence(&p->ins_event_counts, ins_key);
                    if (it_ins) {
                         alt_count = it_ins->count;
                    } else {
                         alt_count = 0;
                    }
                    /* LOG_DEBUG("%s>%s k:%s c:%d\n", conf->var->ref, conf->var->alt, ins_key, alt_count);*/
                    free(ins_key);
               }

          } else {
               alt_count = base_count(p, conf->var->alt[0]);
          }


#ifdef DEBUG
          LOG_DEBUG("Now testing af=%f cov=%d alt_count=%d at %s %d for var:",
                    af, coverage, alt_count, p->target, p->pos+1);
#endif
          
          /* this is a one sided test */
          if (0 != binom(&pvalue, NULL, coverage, alt_count, af)) {
               LOG_ERROR("%s\n", "binom() failed");
               return;
          }

          snprintf(info_str, 128, "%s=%d", uniq_phred_tag, PROB_TO_PHREDQUAL_SAFE(pvalue));
          vcf_var_add_to_info(conf->var, info_str);

          /* is_uniq is never set in this branch, so this always reports
           * "not necessarily unique"; the decision is left to whoever
           * evaluates the tag added above */
          LOG_DEBUG("%s %ld %s>%s AF=%f | %s (p-value=%g) | BAM alt_count=%d cov=%d (freq=%f)\n",
                      conf->var->chrom, (long) conf->var->pos+1, conf->var->ref, conf->var->alt, af,
                      is_uniq ? "unique" : "not necessarily unique", pvalue,
                      alt_count, coverage, alt_count/(float)coverage);
     }
}
示例#8
0
/* Reads all variants from vcf_in and stores per variant the values needed
 * for multiple testing correction (indel flag, variant quality, SB value and
 * whether the alt allele is mostly on one strand) in *mtc_quals.
 *
 * mtc_quals allocated here (grown in steps of mtc_qual_incr entries) and
 * owned by the caller afterwards. size returned on exit or -1 on error. On
 * an allocation error nothing is left allocated and *mtc_quals is NULL.
 *
 * Missing variant qualities are stored as INT_MAX, missing SB tags as 0; a
 * warning is logged for the first occurrence of either.
 */
long int
mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
     long int num_vars = 0;
     long int mtc_qual_size = 0;
     int mtc_qual_incr = 16384;
     vcf_file_t vcffh;

     if (vcf_file_open(&vcffh, vcf_in,
                       HAS_GZIP_EXT(vcf_in), 'r')) {
          LOG_ERROR("Couldn't open %s\n", vcf_in);
          return -1;
     }

     if (0 !=  vcf_skip_header(&vcffh)) {
          LOG_WARN("%s\n", "vcf_skip_header() failed");
          /* don't leak the handle opened above */
          vcf_file_close(&vcffh);
          return -1;
     }

     mtc_qual_size += mtc_qual_incr;
     (*mtc_quals) = calloc(mtc_qual_size, sizeof(mtc_qual_t));
     if (! (*mtc_quals)) {
          LOG_FATAL("%s\n", "out of memory");
          vcf_file_close(&vcffh);
          return -1;
     }

     while (1) {
          var_t *var;
          int rc;
          int is_indel = 0;
          char *sb_char = NULL;

          vcf_new_var(&var);
          rc = vcf_parse_var(&vcffh, var);
          if (rc) {
               /* how to distinguish between error and EOF? */
               /* NOTE(review): assumes vcf_free_var() copes with a variant
                * that was not (fully) parsed - confirm */
               vcf_free_var(&var);
               break;
          }
          num_vars += 1;
          /* ingest anything: we keep adding filters */

          if (num_vars > mtc_qual_size) {
               mtc_qual_t *grown;

               mtc_qual_size += mtc_qual_incr;
               /* go through a temporary: on failure realloc() leaves the old
                * buffer allocated and we still need the pointer to free it */
               grown = realloc((*mtc_quals), mtc_qual_size * sizeof(mtc_qual_t));
               if (! grown) {
                    LOG_FATAL("%s\n", "out of memory");
                    free(*mtc_quals);
                    (*mtc_quals) = NULL;
                    vcf_free_var(&var);
                    vcf_file_close(&vcffh);
                    return -1;
               }
               (*mtc_quals) = grown;
          }

          is_indel = vcf_var_is_indel(var);
          (*mtc_quals)[num_vars-1].is_indel = is_indel;

          /* variant quality */
          if (var->qual==-1) {
               /* missing qualities to fake value */
               var->qual = INT_MAX;
               if (! varq_missing_warning_printed) {
                    LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                    varq_missing_warning_printed = 1;
               }
               (*mtc_quals)[num_vars-1].var_qual = INT_MAX;
          } else {
               (*mtc_quals)[num_vars-1].var_qual = var->qual;
          }

          /* strand bias */
          if ( ! vcf_var_has_info_key(&sb_char, var, "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                    sb_missing_warning_printed = 1;
               }
               (*mtc_quals)[num_vars-1].sb_qual = 0;
          } else {
               (*mtc_quals)[num_vars-1].sb_qual = atoi(sb_char);
               free(sb_char);
          }

          (*mtc_quals)[num_vars-1].is_alt_mostly_on_one_strand =  alt_mostly_on_one_strand(var);

          vcf_free_var(&var);
     }
     vcf_file_close(&vcffh);

     return num_vars;
}
示例#9
0
/* Strand-bias filtering with multiple testing correction.
 *
 * returns -1 on error, 0 otherwise (including the case that there was
 * nothing to test)
 *
 * filter everything that's significant
 *
 * The SB INFO values of all variants (indels only if sb_filter->incl_indels
 * is set) are converted to probabilities and corrected according to
 * sb_filter->mtc_type. Variants whose corrected value is below
 * sb_filter->alpha get sb_filter->id added to their filter field, provided
 * sb_filter->no_compound is set or alt_mostly_on_one_strand() holds for
 * them. Variants without SB tag are skipped (warning logged once).
 * sb_filter->ntests is set to the number of tested variants if it was 0.
 *
 * very similar to in apply_snvqual_filter_mtc, but reverse logic and looking at all vars
 */
int apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
     double *sb_probs = NULL;
     long int i;
     long int num_ign = 0;
     long int num_kept;
     long int *orig_idx = NULL;/* we might ignore some variants (missing values etc). keep track of real indices of kept vars */
     void *shrunk;
     int rc = -1; /* every exit goes through cleanup; success sets this to 0 */

     /* nothing to test; also avoids mistaking a NULL from malloc(0) for an
      * out of memory condition */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from vars kept in mem
      */
     sb_probs = malloc(num_vars * sizeof(double));
     if ( ! sb_probs) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     orig_idx = malloc(num_vars * sizeof(long int));
     if ( ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          char *sb_char = NULL;

          /* ignore indels too if sb filter is not to be applied */
          if (! sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
               num_ign += 1;
               continue;
          }

          if ( ! vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                    sb_missing_warning_printed = 1;
               }
               num_ign += 1;
               continue;
          }

          /* kept values are stored densely; orig_idx maps back to vars[] */
          sb_probs[i-num_ign] = PHREDQUAL_TO_PROB(atoi(sb_char));
          orig_idx[i-num_ign] = i;
          free(sb_char);
     }

     num_kept = num_vars - num_ign;
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     /* realloc to smaller size apparently not guaranteed to free up space so
      * no point really but let's make sure we don't use that memory. Go
      * through a temporary so the old buffer can still be freed on failure */
     shrunk = realloc(sb_probs, num_kept * sizeof(double));
     if (! shrunk) {
          LOG_FATAL("realloc failed. Exiting...");
          goto cleanup;
     }
     sb_probs = shrunk;
     shrunk = realloc(orig_idx, num_kept * sizeof(long int));
     if (! shrunk) {
          LOG_FATAL("realloc failed. Exiting...");
          goto cleanup;
     }
     orig_idx = shrunk;

     if (! sb_filter->ntests) {
          sb_filter->ntests = num_kept;
     } else {
          if (num_kept > sb_filter->ntests) {
               LOG_WARN("%s\n", "Number of variants larger than number of predefined tests for SB filter! Are you sure that makes sense?");
          }
     }


     /* multiple testing correction
      */
     if (sb_filter->mtc_type == MTC_BONF) {
          bonf_corr(sb_probs, num_kept,
                    sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(sb_probs, num_kept,
                         sb_filter->alpha, sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(sb_probs, num_kept,
                        sb_filter->alpha, sb_filter->ntests,
                        &idx_rej);

          /* fdr() reports indices rather than corrected values: encode the
           * outcome so that the comparison against alpha below works.
           * first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               sb_probs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               sb_probs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_kept; i++) {
          if (sb_probs[i] < sb_filter->alpha) {
               if (sb_filter->no_compound || alt_mostly_on_one_strand(vars[orig_idx[i]])) {
                    vcf_var_add_to_filter(vars[orig_idx[i]], sb_filter->id);
               }
          }
     }
     rc = 0;

cleanup:
     free(orig_idx);
     free(sb_probs);

     return rc;
}
示例#10
0
/* Indel quality filtering with multiple testing correction.
 *
 * returns -1 on error, 0 otherwise (including the case that there was
 * nothing to test)
 *
 * filter everything that's not significant
 *
 * The qualities of all variants that have a quality (qual > -1) and carry
 * the INDEL INFO key are converted to error probabilities and corrected
 * according to indelqual_filter->mtc_type. Variants whose corrected value is
 * above indelqual_filter->alpha get indelqual_filter->id added to their
 * filter field. indelqual_filter->ntests is set to the number of tested
 * variants if it was 0.
 *
 * Very similar to apply_sb_filter_mtc, but reverse testing logic and only looking at non consvars
 *
 */
int apply_indelqual_filter_mtc(indelqual_filter_t *indelqual_filter, var_t **vars, const long int num_vars)
{
     /* can only apply this logic to variants that are not consensus
      * variants, i.e those that actually have a quality. therefore
      * keep track of non cons var indices */
     long int *orig_idx = NULL; /* of size num_noncons_vars */
     double *noncons_errprobs = NULL;
     long int num_noncons_vars = 0;
     long int i;
     void *shrunk;
     int rc = -1; /* every exit goes through cleanup; success sets this to 0 */

     /* FIXME function almost identical to its snv quality counterpart
      * (presumably apply_snvqual_filter_mtc), just a different filter; can
      * be easily merged by accepting both types of variants */

     /* nothing to test; also avoids mistaking a NULL from malloc(0) for an
      * out of memory condition */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from noncons vars only and keep track of their indices
      */
     orig_idx = malloc(num_vars * sizeof(long int));
     if ( ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     noncons_errprobs = malloc(num_vars * sizeof(double));
     if ( ! noncons_errprobs) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (vars[i]->qual>-1 && vcf_var_has_info_key(NULL, vars[i], "INDEL")) {
               noncons_errprobs[num_noncons_vars] = PHREDQUAL_TO_PROB(vars[i]->qual);
               orig_idx[num_noncons_vars] = i;
               num_noncons_vars += 1;
          }
     }
     if (! num_noncons_vars) {
          rc = 0;
          goto cleanup;
     }

     if (indelqual_filter->ntests && num_noncons_vars > indelqual_filter->ntests) {
          LOG_WARN("Number of (non consensus) variants larger than number of predefined tests for indelqual filter (%ld > %ld)! Are you sure that makes sense?\n", 
                   num_noncons_vars, indelqual_filter->ntests);
     }

     /* shrink to what is actually used. Go through a temporary so the old
      * buffer can still be freed if realloc() fails */
     shrunk = realloc(orig_idx, (num_noncons_vars * sizeof(long int)));
     if ( ! shrunk) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     orig_idx = shrunk;
     shrunk = realloc(noncons_errprobs, (num_noncons_vars * sizeof(double)));
     if ( ! shrunk) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     noncons_errprobs = shrunk;

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! indelqual_filter->ntests) {
          indelqual_filter->ntests = num_noncons_vars;
     }

     /* multiple testing correction
      */
     if (indelqual_filter->mtc_type == MTC_BONF) {
          bonf_corr(noncons_errprobs, num_noncons_vars, 
                    indelqual_filter->ntests);

     } else if (indelqual_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(noncons_errprobs, num_noncons_vars, 
                         indelqual_filter->alpha, indelqual_filter->ntests);

     } else if (indelqual_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(noncons_errprobs, num_noncons_vars, 
                        indelqual_filter->alpha, indelqual_filter->ntests, 
                        &idx_rej);

          /* fdr() reports indices rather than corrected values: encode the
           * outcome so that the comparison against alpha below works.
           * first pretend none are significant */
          for (i=0; i<num_noncons_vars; i++) {
               noncons_errprobs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               noncons_errprobs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", indelqual_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_noncons_vars; i++) {
          if (noncons_errprobs[i] > indelqual_filter->alpha) {
               vcf_var_add_to_filter(vars[orig_idx[i]], indelqual_filter->id);
          }
     }
     rc = 0;

cleanup:
     free(orig_idx);
     free(noncons_errprobs);

     return rc;
}