/* Counts probability of non-match count along the read after * subtracting error prob at that position (using the original * orientation). used_pos is an array of ints indicating whether * position was used or not (trimmed, clipped etc). alnerrprof and * used_pos must be of at least length b->core.l_qseq. Note: will add * to alnerrprof and used_pos, i.e. arrays should be initialized to 0 if * you don't want aggregate values. * * WARNING code duplication with count_cigar_ops but merging the two * functions is messy. */ void calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos, const bam1_t *b, const char *ref) { /* modelled after bam.c:bam_calend(), bam_format1_core() and * pysam's aligned_pairs (./pysam/csamtools.pyx) */ uint32_t *cigar = bam1_cigar(b); uint32_t k, i; const bam1_core_t *c = &b->core; #if 0 int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */ #else int qlen = b->core.l_qseq; /* read length */ #endif uint32_t pos = c->pos; /* pos on genome */ uint32_t qpos = 0; /* pos on read/query */ uint32_t qpos_org = bam1_strand(b) ? 
qlen-qpos-1 : qpos;/* original qpos before mapping as possible reverse */ /* loop over cigar to get aligned bases * * read: bam_format1_core(NULL, b, BAM_OFDEC); */ for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */ int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */ uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* following conditionals could be collapsed to much shorter * code, but we keep them as they were in pysam's * aligned_pairs to make later handling of indels easier */ if (op == BAM_CMATCH || op == BAM_CDIFF) { for (i=pos; i<pos+l; i++) { assert(qpos < qlen); /* case agnostic */ char ref_nt = ref[i]; char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)]; int bq = bam1_qual(b)[qpos]; #if 0 printf("[M]MATCH qpos,i,ref,read = %d,%d,%c,%c\n", qpos, i, ref_nt, read_nt); #endif if (ref_nt != 'N') { if (ref_nt != read_nt || op == BAM_CDIFF) { alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq)); } /* otherwise leave at 0.0 but count anyway */ used_pos[qpos_org] += 1; } qpos += 1; qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos; } pos += l; } else if (op == BAM_CINS) { for (i=pos; i<pos+l; i++) { assert(qpos < qlen); alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT)); used_pos[qpos] += 1; #if 0 printf("INS qpos,i = %d,None\n", qpos); #endif qpos += 1; qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos; } } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { for (i=pos; i<pos+l; i++) { #if 0 printf("DEL qpos,i = None,%d\n", i); #endif if (op == BAM_CDEL) { alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT)); used_pos[qpos] += 1; } } pos += l; /* deletion: don't increase qpos */ } else if (op == BAM_CSOFT_CLIP) { #if 0 printf("SOFT CLIP qpos = %d\n", qpos); #endif qpos += l; qpos_org = bam1_strand(b) ? 
qlen-qpos-1 : qpos; } else if (op != BAM_CHARD_CLIP) { LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b)); } } /* for k */ assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */ if (qpos != qlen) { LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq); } assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */ #if 0 fprintf(stderr, "%s:", __FUNCTION__); for (i=0; i< b->core.l_qseq; i++) { fprintf(stderr, " %g/%d", alnerrprof[i], used_pos[i]); } fprintf(stderr, "\n"); #endif }
/* Applies the uniq filter to vars after multiple testing correction.
 *
 * For every variant the uniq phred score (uniq_phred_from_var()) is
 * converted to a probability, the configured correction
 * (uniq_filter->mtc_type: MTC_BONF, MTC_HOLMBONF or MTC_FDR) is applied
 * and every variant that is not significant, i.e. whose corrected value
 * is larger than uniq_filter->alpha, gets uniq_filter->id added to its
 * filter field.
 *
 * If uniq_filter->ntests is 0 it is set to num_vars.
 *
 * Returns 0 on success (also if there is nothing to test) and -1 on
 * an unknown MTC type. Calls exit(1) if memory allocation fails.
 *
 * FIXME should be part of lofreq filter.
 */
int
apply_uniq_filter_mtc(uniq_filter_t *uniq_filter, var_t **vars, const int num_vars)
{
     double *uniq_probs = NULL;
     int rc = -1;
     int i;

     if (uniq_filter->ntests && num_vars > uniq_filter->ntests) {
          LOG_WARN("%s\n", "Number of variants larger than number of predefined tests for uniq filter! Are you sure that makes sense?");
     }
     if (! uniq_filter->ntests) {
          uniq_filter->ntests = num_vars;
     }

     /* nothing to do. also avoids malloc(0), which may legitimately
      * return NULL and would then be misreported as out of memory */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect uniq error probs */
     uniq_probs = malloc(num_vars * sizeof *uniq_probs);
     if ( ! uniq_probs) {
          LOG_FATAL("%s\n", "out of memory");
          exit(1);
     }
     for (i=0; i<num_vars; i++) {
          uniq_probs[i] = PHREDQUAL_TO_PROB(uniq_phred_from_var(vars[i]));
     }

     /* multiple testing correction */
     if (uniq_filter->mtc_type == MTC_BONF) {
          bonf_corr(uniq_probs, num_vars,
                    uniq_filter->ntests);

     } else if (uniq_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(uniq_probs, num_vars,
                         uniq_filter->alpha, uniq_filter->ntests);

     } else if (uniq_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */
          long int j;

          num_rej = fdr(uniq_probs, num_vars,
                        uniq_filter->alpha, uniq_filter->ntests,
                        &idx_rej);

          /* first pretend none are significant. without this, values
           * not rejected by fdr would keep their raw probability and
           * escape the filter below whenever that is <= alpha */
          for (i=0; i<num_vars; i++) {
               uniq_probs[i] = DBL_MAX;
          }
          for (j=0; j<num_rej; j++) {
               uniq_probs[idx_rej[j]] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", uniq_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (uniq_probs[i] > uniq_filter->alpha) {
               vcf_var_add_to_filter(vars[i], uniq_filter->id);
          }
     }
     rc = 0;

cleanup:
     free(uniq_probs);
     return rc;
}
/* Applies multiple testing correction to the strand bias qualities in
 * mtc_quals and flags significant entries.
 *
 * Entries for which is_indel is set are ignored unless
 * sb_filter->incl_indels is set. For all others sb_qual is converted
 * to a probability and corrected according to sb_filter->mtc_type
 * (MTC_BONF, MTC_HOLMBONF or MTC_FDR). If the corrected value is
 * smaller than sb_filter->alpha and either sb_filter->no_compound is
 * set or the entry has is_alt_mostly_on_one_strand set, its sb_qual is
 * set to -1 (i.e. filter!).
 *
 * If sb_filter->ntests is 0 it is set to the number of tested entries.
 *
 * Returns 0 on success (also if there is nothing to test) and -1 on
 * error (out of memory or unknown MTC type).
 */
int
apply_sb_filter_mtc(mtc_qual_t *mtc_quals, sb_filter_t *sb_filter, const long int num_vars)
{
     double *sb_probs = NULL;
     /* we might ignore some variants (missing values etc). keep
        track of real indices of kept vars */
     long int *orig_idx = NULL;
     long int num_kept = 0;
     long int i;
     int rc = -1;

     /* nothing to test. also avoids malloc(0), whose result is
      * implementation-defined */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from vars kept in mem */
     sb_probs = malloc(num_vars * sizeof *sb_probs);
     orig_idx = malloc(num_vars * sizeof *orig_idx);
     if ( ! sb_probs || ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          /* ignore indels too if sb filter is not to be applied */
          if (! sb_filter->incl_indels && mtc_quals[i].is_indel) {
               continue;
          }
          sb_probs[num_kept] = PHREDQUAL_TO_PROB(mtc_quals[i].sb_qual);
          orig_idx[num_kept] = i;
          num_kept += 1;
     }
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     if (! sb_filter->ntests) {
          sb_filter->ntests = num_kept;
     } else {
          if (num_kept > sb_filter->ntests) {
               LOG_WARN("Number of variants (%ld) in SB filter larger than the number of predefined tests (%ld). Are you sure that makes sense?\n",
                        num_kept, sb_filter->ntests);
          }
     }

     /* multiple testing correction */
     if (sb_filter->mtc_type == MTC_BONF) {
          bonf_corr(sb_probs, num_kept,
                    sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(sb_probs, num_kept,
                         sb_filter->alpha, sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(sb_probs, num_kept,
                        sb_filter->alpha, sb_filter->ntests,
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               sb_probs[i] = DBL_MAX;
          }
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               sb_probs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_kept; i++) {
          /* note: reverse of qual filters, i.e. apply filter if
             significant, and not the other way around! */
          if (sb_probs[i] < sb_filter->alpha) {
               if (sb_filter->no_compound || mtc_quals[orig_idx[i]].is_alt_mostly_on_one_strand) {
                    mtc_quals[orig_idx[i]].sb_qual = -1;
               }
          }
     }
     rc = 0;

cleanup:
     free(orig_idx);
     free(sb_probs);
     return rc;
}
/* Applies multiple testing correction to the qualities of the indel
 * entries in mtc_quals and flags the significant ones.
 *
 * Only entries with is_indel set are looked at. Their var_qual is
 * converted to a probability and corrected according to
 * indelqual_filter->mtc_type (MTC_BONF, MTC_HOLMBONF or MTC_FDR).
 * Every entry whose corrected value is smaller than
 * indelqual_filter->alpha gets its var_qual set to -1 (significant,
 * i.e. don't filter).
 *
 * If indelqual_filter->ntests is 0 it is set to the number of tested
 * entries.
 *
 * Very similar to apply_snvqual_filter_mtc.
 *
 * Returns 0 on success (also if there is nothing to test) and -1 on
 * error (out of memory or unknown MTC type).
 */
int
apply_indelqual_filter_mtc(mtc_qual_t *mtc_quals, indelqual_filter_t *indelqual_filter, const long int num_vars)
{
     long int *orig_idx = NULL; /* indices (into mtc_quals) of tested entries */
     double *errprobs = NULL;
     long int num_kept = 0;
     long int i;
     int rc = -1;

     /* nothing to test. also avoids malloc(0), whose result is
      * implementation-defined */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from indels only and keep track of their indices */
     errprobs = malloc(num_vars * sizeof *errprobs);
     orig_idx = malloc(num_vars * sizeof *orig_idx);
     if ( ! errprobs || ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (! mtc_quals[i].is_indel) {
               continue;
          }
          errprobs[num_kept] = PHREDQUAL_TO_PROB(mtc_quals[i].var_qual);
          orig_idx[num_kept] = i;
          num_kept += 1;
     }
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! indelqual_filter->ntests) {
          indelqual_filter->ntests = num_kept;
     } else {
          if (num_kept > indelqual_filter->ntests) {
               LOG_WARN("Number of variants (%ld) larger than the number of predefined tests (%ld). Are you sure that makes sense?\n",
                        num_kept, indelqual_filter->ntests);
          }
     }

     /* multiple testing correction */
     if (indelqual_filter->mtc_type == MTC_BONF) {
          bonf_corr(errprobs, num_kept,
                    indelqual_filter->ntests);

     } else if (indelqual_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(errprobs, num_kept,
                         indelqual_filter->alpha, indelqual_filter->ntests);

     } else if (indelqual_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(errprobs, num_kept,
                        indelqual_filter->alpha, indelqual_filter->ntests,
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               errprobs[i] = DBL_MAX;
          }
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               errprobs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", indelqual_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_kept; i++) {
          if (errprobs[i] < indelqual_filter->alpha) {
               mtc_quals[orig_idx[i]].var_qual = -1;
          }
     }
     rc = 0;

cleanup:
     free(orig_idx);
     free(errprobs);
     return rc;
}
/* Applies the strand bias filter to vars after multiple testing
 * correction, i.e. filters everything that's significant.
 *
 * Variants are ignored if they are indels (vcf_var_is_indel()) and
 * sb_filter->incl_indels is not set, or if they carry no "SB" info
 * key (a warning is printed once in that case). For the remaining
 * variants the SB value is converted to a probability and corrected
 * according to sb_filter->mtc_type (MTC_BONF, MTC_HOLMBONF or
 * MTC_FDR). A variant whose corrected value is smaller than
 * sb_filter->alpha gets sb_filter->id added to its filter field if
 * either sb_filter->no_compound is set or alt_mostly_on_one_strand()
 * holds for it.
 *
 * If sb_filter->ntests is 0 it is set to the number of tested
 * variants.
 *
 * very similar to apply_snvqual_filter_mtc, but reverse logic and
 * looking at all vars
 *
 * Returns 0 on success (also if there is nothing to test) and -1 on
 * error (out of memory or unknown MTC type).
 */
int
apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
     double *sb_probs = NULL;
     /* we might ignore some variants (missing values etc). keep
        track of real indices of kept vars */
     long int *orig_idx = NULL;
     double *shrunk_probs = NULL;
     long int *shrunk_idx = NULL;
     long int num_kept = 0;
     long int i;
     int rc = -1;

     /* nothing to test. also avoids malloc(0), whose result is
      * implementation-defined */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from vars kept in mem */
     sb_probs = malloc(num_vars * sizeof *sb_probs);
     orig_idx = malloc(num_vars * sizeof *orig_idx);
     if ( ! sb_probs || ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          char *sb_char = NULL;

          /* ignore indels too if sb filter is not to be applied */
          if (! sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
               continue;
          }

          if ( ! vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                    sb_missing_warning_printed = 1;
               }
               continue;
          }
          sb_probs[num_kept] = PHREDQUAL_TO_PROB(atoi(sb_char));
          orig_idx[num_kept] = i;
          num_kept += 1;
          free(sb_char);
     }
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     /* realloc to smaller size apparently not guaranteed to free up
        space so no point really but let's make sure we don't use that
        memory. go through temporaries so that the original buffers
        can still be freed if realloc fails */
     shrunk_probs = realloc(sb_probs, num_kept * sizeof *sb_probs);
     if (! shrunk_probs) {
          LOG_FATAL("realloc failed. Exiting...");
          goto cleanup;
     }
     sb_probs = shrunk_probs;

     shrunk_idx = realloc(orig_idx, num_kept * sizeof *orig_idx);
     if (! shrunk_idx) {
          LOG_FATAL("realloc failed. Exiting...");
          goto cleanup;
     }
     orig_idx = shrunk_idx;

     if (! sb_filter->ntests) {
          sb_filter->ntests = num_kept;
     } else {
          if (num_kept > sb_filter->ntests) {
               LOG_WARN("%s\n", "Number of variants in SB filter larger than number of predefined tests! Are you sure that makes sense?");
          }
     }

     /* multiple testing correction */
     if (sb_filter->mtc_type == MTC_BONF) {
          bonf_corr(sb_probs, num_kept,
                    sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(sb_probs, num_kept,
                         sb_filter->alpha, sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(sb_probs, num_kept,
                        sb_filter->alpha, sb_filter->ntests,
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               sb_probs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               sb_probs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_kept; i++) {
          if (sb_probs[i] < sb_filter->alpha) {
               if (sb_filter->no_compound || alt_mostly_on_one_strand(vars[orig_idx[i]])) {
                    vcf_var_add_to_filter(vars[orig_idx[i]], sb_filter->id);
               }
          }
     }
     rc = 0;

cleanup:
     free(orig_idx);
     free(sb_probs);
     return rc;
}
/* Applies the indel quality filter to vars after multiple testing
 * correction, i.e. filters everything that's not significant.
 *
 * Only variants with qual > -1 that carry the "INDEL" info key are
 * tested (consensus variants have no quality). Their qual is
 * converted to a probability and corrected according to
 * indelqual_filter->mtc_type (MTC_BONF, MTC_HOLMBONF or MTC_FDR).
 * Every tested variant whose corrected value is larger than
 * indelqual_filter->alpha gets indelqual_filter->id added to its
 * filter field.
 *
 * If indelqual_filter->ntests is 0 it is set to the number of tested
 * variants.
 *
 * Very similar to apply_sb_filter_mtc, but reverse testing logic and
 * only looking at non consvars
 *
 * Returns 0 on success (also if there is nothing to test) and -1 on
 * error (out of memory or unknown MTC type).
 */
int
apply_indelqual_filter_mtc(indelqual_filter_t *indelqual_filter, var_t **vars, const long int num_vars)
{
     /* can only apply this logic to variants that are not consensus
      * variants, i.e those that actually have a quality. therefore
      * keep track of non cons var indices */
     long int *orig_idx = NULL; /* of size num_noncons_vars */
     double *noncons_errprobs = NULL;
     long int *shrunk_idx = NULL;
     double *shrunk_probs = NULL;
     long int num_noncons_vars = 0;
     long int i;
     int rc = -1;

     /* FIXME function almost identical to its SNV quality counterpart,
        just with a different filter. can be easily merged by
        accepting both types of variants */

     /* nothing to test. also avoids malloc(0), whose result is
      * implementation-defined */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from noncons vars only and keep track of their indices */
     orig_idx = malloc(num_vars * sizeof *orig_idx);
     noncons_errprobs = malloc(num_vars * sizeof *noncons_errprobs);
     if ( ! orig_idx || ! noncons_errprobs) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (vars[i]->qual>-1 && vcf_var_has_info_key(NULL, vars[i], "INDEL")) {
               noncons_errprobs[num_noncons_vars] = PHREDQUAL_TO_PROB(vars[i]->qual);
               orig_idx[num_noncons_vars] = i;
               num_noncons_vars += 1;
          }
     }
     if (! num_noncons_vars) {
          rc = 0;
          goto cleanup;
     }

     if (indelqual_filter->ntests && num_noncons_vars > indelqual_filter->ntests) {
          LOG_WARN("Number of (non consensus) variants larger than number of predefined tests for indelqual filter (%ld > %ld)! Are you sure that makes sense?\n",
                   num_noncons_vars, indelqual_filter->ntests);
     }

     /* shrink to the used size. go through temporaries so that the
        original buffers can still be freed if realloc fails */
     shrunk_idx = realloc(orig_idx, num_noncons_vars * sizeof *orig_idx);
     if ( ! shrunk_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     orig_idx = shrunk_idx;

     shrunk_probs = realloc(noncons_errprobs, num_noncons_vars * sizeof *noncons_errprobs);
     if ( ! shrunk_probs) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     noncons_errprobs = shrunk_probs;

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! indelqual_filter->ntests) {
          indelqual_filter->ntests = num_noncons_vars;
     }

     /* multiple testing correction */
     if (indelqual_filter->mtc_type == MTC_BONF) {
          bonf_corr(noncons_errprobs, num_noncons_vars,
                    indelqual_filter->ntests);

     } else if (indelqual_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(noncons_errprobs, num_noncons_vars,
                         indelqual_filter->alpha, indelqual_filter->ntests);

     } else if (indelqual_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(noncons_errprobs, num_noncons_vars,
                        indelqual_filter->alpha, indelqual_filter->ntests,
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_noncons_vars; i++) {
               noncons_errprobs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               noncons_errprobs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", indelqual_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_noncons_vars; i++) {
          if (noncons_errprobs[i] > indelqual_filter->alpha) {
               vcf_var_add_to_filter(vars[orig_idx[i]], indelqual_filter->id);
          }
     }
     rc = 0;

cleanup:
     free(orig_idx);
     free(noncons_errprobs);
     return rc;
}