/* Threshold-based strand-bias filtering of a single variant.
 *
 * Does nothing when sb_filter->thresh is zero. Otherwise the variant's
 * "SB" info value is parsed with atoi(); if it is larger than the
 * threshold, and either sb_filter->no_compound is set or
 * alt_mostly_on_one_strand(var) is true, vcf_var_add_to_filter() is
 * called with sb_filter->id.
 *
 * A variant without an SB tag is left untouched. A warning is logged the
 * first time that happens (tracked in sb_missing_warning_printed).
 */
void
apply_sb_threshold(var_t *var, sb_filter_t *sb_filter)
{
    char *sb_value = NULL;
    int strand_bias;

    /* a zero threshold means threshold filtering is switched off */
    if (sb_filter->thresh == 0) {
        return;
    }

    if (!vcf_var_has_info_key(&sb_value, var, "SB")) {
        if (!sb_missing_warning_printed) {
            LOG_WARN("%s\n", "Requested SB filtering failed since SB tag is missing in variant");
            sb_missing_warning_printed = 1;
        }
        return;
    }

    strand_bias = atoi(sb_value);
    free(sb_value);

    /* short-circuit order matters: alt_mostly_on_one_strand() is only
     * consulted when the threshold is exceeded and compound filtering
     * is enabled */
    if (strand_bias > sb_filter->thresh
        && (sb_filter->no_compound || alt_mostly_on_one_strand(var))) {
        vcf_var_add_to_filter(var, sb_filter->id);
    }
}
/* Reads every variant from vcf_in and records, per variant, the values
 * needed later for multiple testing correction: indel flag, variant
 * quality, SB value and the result of alt_mostly_on_one_strand().
 *
 * mtc_quals: output. The array is allocated here and grown in steps of
 *            16384 elements; the caller is responsible for freeing it.
 * vcf_in:    path of the vcf file to read.
 *
 * Returns the number of variants stored, or -1 on error. If opening the
 * file or skipping the header fails, *mtc_quals is not touched. If an
 * allocation fails, any partially filled array is freed and *mtc_quals
 * is set to NULL.
 *
 * A missing variant quality (qual == -1) is stored as INT_MAX and a
 * missing SB tag as 0; each case is warned about once.
 */
long int
mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
    const long int mtc_qual_incr = 16384; /* growth step, in elements */
    long int mtc_qual_size = 0;
    long int num_vars = 0;
    long int result = -1;
    vcf_file_t vcffh;

    if (vcf_file_open(&vcffh, vcf_in, HAS_GZIP_EXT(vcf_in), 'r')) {
        LOG_ERROR("Couldn't open %s\n", vcf_in);
        return -1;
    }

    if (0 != vcf_skip_header(&vcffh)) {
        LOG_WARN("%s\n", "vcf_skip_header() failed");
        goto close_file; /* the file is open by now and must be closed */
    }

    mtc_qual_size = mtc_qual_incr;
    *mtc_quals = calloc((size_t)mtc_qual_size, sizeof **mtc_quals);
    if (!*mtc_quals) {
        LOG_ERROR("%s\n", "out of memory");
        goto close_file;
    }

    while (1) {
        var_t *var = NULL;
        char *sb_char = NULL;
        mtc_qual_t *cur;

        vcf_new_var(&var);
        if (vcf_parse_var(&vcffh, var)) {
            /* error and EOF cannot be told apart here; either way we
             * stop, and the record allocated above must be released */
            vcf_free_var(&var);
            break;
        }

        num_vars += 1;

        /* ingest anything: we keep adding filters */
        if (num_vars > mtc_qual_size) {
            long int new_size = mtc_qual_size + mtc_qual_incr;
            /* go through a temporary so the old array is not lost if
             * realloc fails */
            mtc_qual_t *grown = realloc(*mtc_quals,
                                        (size_t)new_size * sizeof **mtc_quals);
            if (!grown) {
                LOG_ERROR("%s\n", "out of memory");
                vcf_free_var(&var);
                free(*mtc_quals);
                *mtc_quals = NULL;
                goto close_file;
            }
            *mtc_quals = grown;
            mtc_qual_size = new_size;
        }
        cur = &(*mtc_quals)[num_vars - 1];

        cur->is_indel = vcf_var_is_indel(var);

        /* variant quality */
        if (var->qual == -1) {
            /* missing qualities to fake value */
            var->qual = INT_MAX;
            if (!varq_missing_warning_printed) {
                LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                varq_missing_warning_printed = 1;
            }
            cur->var_qual = INT_MAX;
        } else {
            cur->var_qual = var->qual;
        }

        /* strand bias */
        if (!vcf_var_has_info_key(&sb_char, var, "SB")) {
            if (!sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                sb_missing_warning_printed = 1;
            }
            cur->sb_qual = 0;
        } else {
            cur->sb_qual = atoi(sb_char);
            free(sb_char);
        }

        cur->is_alt_mostly_on_one_strand = alt_mostly_on_one_strand(var);

        vcf_free_var(&var);
    }

    result = num_vars;

close_file:
    vcf_file_close(&vcffh);
    return result;
}
/* Strand-bias filtering with multiple testing correction over all
 * variants held in memory: everything that is significant gets filtered.
 *
 * sb_filter: filter settings. mtc_type selects the correction (MTC_BONF,
 *            MTC_HOLMBONF or MTC_FDR); alpha is the significance cutoff.
 *            If ntests is 0 on entry it is set to the number of variants
 *            actually tested.
 * vars:      array of num_vars variants; matching ones get sb_filter->id
 *            added via vcf_var_add_to_filter().
 * num_vars:  number of entries in vars.
 *
 * Variants are skipped (not tested) if they are indels while
 * sb_filter->incl_indels is unset, or if they carry no "SB" info tag
 * (warned about once). For the rest, the SB value is converted with
 * PHREDQUAL_TO_PROB and corrected. A variant is filtered when its
 * corrected value is below alpha and either sb_filter->no_compound is
 * set or alt_mostly_on_one_strand() is true for it.
 *
 * Returns 0 on success (including "nothing to test") and -1 on error
 * (out of memory, unknown MTC type). All temporary buffers are released
 * on every path.
 *
 * NOTE(review): the original comment describes this as very similar to
 * apply_snvqual_filter_mtc (not visible here), but with reverse logic
 * and looking at all vars.
 */
int
apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
    double *sb_probs = NULL;
    /* we might ignore some variants (missing values etc). keep track of
     * the real indices of the kept vars */
    long int *orig_idx = NULL;
    long int num_kept = 0;
    long int i;
    int rc = -1;
    void *shrunk;

    /* nothing to do; also avoids malloc(0), which may legitimately
     * return NULL and would then be misreported as out of memory */
    if (num_vars <= 0) {
        return 0;
    }

    /* collect values from vars kept in mem */
    sb_probs = malloc((size_t)num_vars * sizeof *sb_probs);
    orig_idx = malloc((size_t)num_vars * sizeof *orig_idx);
    if (!sb_probs || !orig_idx) {
        LOG_FATAL("%s\n", "out of memory");
        goto cleanup;
    }

    for (i = 0; i < num_vars; i++) {
        char *sb_char = NULL;

        /* ignore indels too if sb filter is not to be applied */
        if (!sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
            continue;
        }
        if (!vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
            if (!sb_missing_warning_printed) {
                LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                sb_missing_warning_printed = 1;
            }
            continue;
        }

        sb_probs[num_kept] = PHREDQUAL_TO_PROB(atoi(sb_char));
        orig_idx[num_kept] = i;
        num_kept += 1;
        free(sb_char);
    }

    if (num_kept == 0) {
        rc = 0;
        goto cleanup;
    }

    /* realloc to smaller size apparently not guaranteed to free up space
     * so no point really but let's make sure we don't use that memory.
     * Go through a temporary so the buffers can still be freed on
     * failure. */
    shrunk = realloc(sb_probs, (size_t)num_kept * sizeof *sb_probs);
    if (!shrunk) {
        LOG_FATAL("realloc failed. Exiting...");
        goto cleanup;
    }
    sb_probs = shrunk;

    shrunk = realloc(orig_idx, (size_t)num_kept * sizeof *orig_idx);
    if (!shrunk) {
        LOG_FATAL("realloc failed. Exiting...");
        goto cleanup;
    }
    orig_idx = shrunk;

    if (!sb_filter->ntests) {
        sb_filter->ntests = num_kept;
    } else if (num_kept > sb_filter->ntests) {
        /* correcting for fewer tests than are actually performed is
         * suspicious; the message now matches the condition */
        LOG_WARN("%s\n", "Number of variants larger than number of predefined tests for SB filter! Are you sure that makes sense?");
    }

    /* multiple testing correction */
    if (sb_filter->mtc_type == MTC_BONF) {
        bonf_corr(sb_probs, num_kept, sb_filter->ntests);

    } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
        holm_bonf_corr(sb_probs, num_kept, sb_filter->alpha, sb_filter->ntests);

    } else if (sb_filter->mtc_type == MTC_FDR) {
        long int num_rej = 0;
        /* indices of rejected i.e. significant values; start at NULL so
         * the free() below is safe even if fdr() leaves it unset */
        long int *idx_rej = NULL;

        num_rej = fdr(sb_probs, num_kept, sb_filter->alpha,
                      sb_filter->ntests, &idx_rej);

        /* first pretend none are significant */
        for (i = 0; i < num_kept; i++) {
            sb_probs[i] = DBL_MAX;
        }
        LOG_DEBUG("%ld results significant after fdr\n", num_rej);
        /* then mark the rejected ones with a value below any alpha */
        for (i = 0; i < num_rej; i++) {
            sb_probs[idx_rej[i]] = -1;
        }
        free(idx_rej);

    } else {
        LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
        goto cleanup;
    }

    for (i = 0; i < num_kept; i++) {
        var_t *var = vars[orig_idx[i]];

        if (sb_probs[i] < sb_filter->alpha
            && (sb_filter->no_compound || alt_mostly_on_one_strand(var))) {
            vcf_var_add_to_filter(var, sb_filter->id);
        }
    }

    rc = 0;

cleanup:
    free(orig_idx);
    free(sb_probs);
    return rc;
}