示例#1
0
文件: samutils.c 项目: CSB5/lofreq
/* Accumulates a per-position alignment error profile for one read.
 *
 * Walks the CIGAR of b. For every read position that takes part in
 * the alignment, used_pos[] is incremented and - if the position
 * disagrees with the reference - (1 - error probability) is added to
 * alnerrprof[]. Mismatches use the base quality at that position,
 * inserted and deleted bases use INDEL_QUAL_DEFAULT. Aligned positions
 * whose reference base is 'N' are skipped entirely. Soft-clipped bases
 * advance the read position but are not counted.
 *
 * All indices into alnerrprof and used_pos are in the original read
 * orientation, i.e. mirrored for reads flagged as reverse strand. This
 * holds for matches/mismatches as well as insertions and deletions.
 *
 * alnerrprof and used_pos must be of at least length b->core.l_qseq.
 * Note: values are added to both arrays, i.e. they should be
 * initialized to 0 if you don't want aggregate values.
 *
 * ref is indexed directly by reference position (starting at
 * b->core.pos), so it has to cover the aligned region of the read.
 *
 * WARNING code duplication with count_cigar_ops but merging the two
 * functions is messy.
 */
void
calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos, 
                   const bam1_t *b, const char *ref)
{
     /* modelled after bam.c:bam_calend(), bam_format1_core() and
      * pysam's aligned_pairs (./pysam/csamtools.pyx)
      */
     uint32_t *cigar = bam1_cigar(b);
     uint32_t k, i;
     const bam1_core_t *c = &b->core;
     int qlen = b->core.l_qseq; /* read length */
     uint32_t pos = c->pos; /* pos on genome */
     uint32_t qpos = 0; /* pos on read/query (alignment orientation) */
     /* qpos in the original read orientation (mirrored for reverse
      * strand reads). Must be recomputed whenever qpos changes */
     uint32_t qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;

     /* loop over cigar to get aligned bases
      *
      * read: bam_format1_core(NULL, b, BAM_OFDEC);
      */
     for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */
          int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */
          uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* its length */

          /* following conditionals could be collapsed to much shorter
           * code, but we keep them as they were in pysam's
           * aligned_pairs to make later handling of indels easier
           *
           * NOTE(review): the sequence-match op ('=') is not handled
           * here and would end up in the "Unknown op" branch without
           * advancing qpos - confirm whether such input can occur.
           */
          if (op == BAM_CMATCH || op == BAM_CDIFF) {
               for (i=pos; i<pos+l; i++) {
                    /* NOTE(review): the comparison below is an exact
                     * char comparison, so ref is presumably expected
                     * to be upper case - confirm against caller */
                    char ref_nt = ref[i];
                    char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)];
                    int bq = bam1_qual(b)[qpos];

                    assert(qpos < qlen);

                    if (ref_nt != 'N') {
                         if (ref_nt != read_nt || op == BAM_CDIFF) {
                              alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq));
                         } /* otherwise leave at 0.0 but count anyway */
                         used_pos[qpos_org] += 1;
                    }
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }
               pos += l;

          } else if (op == BAM_CINS) {
               /* every inserted base counts as an error at its own
                * read position; reference position does not move */
               for (i=0; i<l; i++) {
                    assert(qpos < qlen);

                    alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                    used_pos[qpos_org] += 1;

                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }

          } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
               /* deleted bases are all attributed to the read position
                * following the deletion. A deletion with no read base
                * after it (qpos == qlen) has no valid position and is
                * not recorded, which also prevents writing past the
                * end of the arrays. Reference skips are never counted.
                */
               if (op == BAM_CDEL && qpos < (uint32_t)qlen) {
                    for (i=0; i<l; i++) {
                         alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                         used_pos[qpos_org] += 1;
                    }
               }
               pos += l;
               /* deletion: don't increase qpos */

          } else if (op == BAM_CSOFT_CLIP) {
               /* clipped bases are part of the read but not used */
               qpos += l;
               qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;

          } else if (op != BAM_CHARD_CLIP) {
               LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b));

          }
     } /* for k */
     assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */
     if (qpos != qlen) {
          LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq);
     }
     assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */
}
示例#2
0
文件: lofreq_uniq.c 项目: CSB5/lofreq
/* Applies multiple testing correction to the uniq error probabilities
 * of all variants and adds uniq_filter->id to the filter field of
 * every variant that is not significant afterwards, i.e. whose
 * corrected probability is larger than uniq_filter->alpha.
 *
 * If uniq_filter->ntests is 0 it is set to num_vars.
 *
 * Returns 0 on success (including num_vars <= 0, in which case there
 * is nothing to do) and -1 on an unknown MTC type. Exits the program
 * if memory allocation fails.
 *
 * FIXME should be part of lofreq filter.
 *
 */
int 
apply_uniq_filter_mtc(uniq_filter_t *uniq_filter, var_t **vars, const int num_vars)
{
     double *uniq_probs = NULL;
     int rc = -1;
     int i;

     if (uniq_filter->ntests && num_vars > uniq_filter->ntests) {
          LOG_WARN("%s\n", "Number of variants larger than number of predefined tests for uniq filter! Are you sure that makes sense?");
     }

     if (! uniq_filter->ntests) {
          uniq_filter->ntests = num_vars;
     }

     /* nothing to test. returning here also avoids malloc(0), which
      * may legitimately return NULL and would be mistaken for an out
      * of memory condition */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect uniq error probs
      */
     uniq_probs = malloc((size_t)num_vars * sizeof *uniq_probs);
     if ( ! uniq_probs) {
          LOG_FATAL("%s\n", "out of memory");
          exit(1);
     }
     for (i=0; i<num_vars; i++) {
          uniq_probs[i] = PHREDQUAL_TO_PROB(uniq_phred_from_var(vars[i]));
     }

     /* multiple testing correction
      */
     if (uniq_filter->mtc_type == MTC_BONF) {
          bonf_corr(uniq_probs, num_vars, 
                    uniq_filter->ntests);
          
     } else if (uniq_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(uniq_probs, num_vars, 
                         uniq_filter->alpha, uniq_filter->ntests);
          
     } else if (uniq_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej; /* indices of rejected i.e. significant values */
          long int j;
          /* any value strictly larger than alpha marks "not significant" */
          const double not_significant = uniq_filter->alpha + 1.0;
          
          num_rej = fdr(uniq_probs, num_vars, 
                        uniq_filter->alpha, uniq_filter->ntests, 
                        &idx_rej);

          /* fdr only tells us which tests were rejected. first pretend
           * none are significant, otherwise an uncorrected probability
           * that happens to be <= alpha would escape the filter below */
          for (j=0; j<num_vars; j++) {
               uniq_probs[j] = not_significant;
          }
          for (j=0; j<num_rej; j++) {
               long int idx = idx_rej[j];
               uniq_probs[idx] = -1;
          }
          free(idx_rej);
          
     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", uniq_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (uniq_probs[i] > uniq_filter->alpha) {
               vcf_var_add_to_filter(vars[i], uniq_filter->id);
          }
     }

     rc = 0;

cleanup:
     free(uniq_probs);

     return rc;
}
示例#3
0
/* Applies multiple testing correction to the strand bias qualities in
 * mtc_quals (num_vars entries).
 *
 * Indels are skipped unless sb_filter->incl_indels is set. For every
 * remaining entry whose corrected probability is smaller than
 * sb_filter->alpha, sb_qual is set to -1 (i.e. filter!), provided that
 * sb_filter->no_compound is set or the entry is flagged as
 * is_alt_mostly_on_one_strand.
 *
 * If sb_filter->ntests is 0 it is set to the number of tested entries.
 *
 * Returns 0 on success (also if there was nothing to test) and -1 on
 * error (out of memory, unknown MTC type).
 */
int apply_sb_filter_mtc(mtc_qual_t *mtc_quals, sb_filter_t *sb_filter, const long int num_vars)
{
     double *sb_probs = NULL;
     long int *orig_idx = NULL;/* we might ignore some variants. keep track of real indices of kept vars */
     long int num_kept = 0; /* number of entries actually tested */
     long int i;
     int rc = -1;

     /* nothing to test. returning here also avoids malloc(0), which
      * may legitimately return NULL */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from vars kept in mem
      */
     sb_probs = malloc((size_t)num_vars * sizeof *sb_probs);
     orig_idx = malloc((size_t)num_vars * sizeof *orig_idx);
     if ( ! sb_probs || ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          /* ignore indels too if sb filter is not to be applied */
          if (! sb_filter->incl_indels && mtc_quals[i].is_indel) {
               continue;
          }

          sb_probs[num_kept] = PHREDQUAL_TO_PROB(mtc_quals[i].sb_qual);
          orig_idx[num_kept] = i;
          num_kept += 1;
     }
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! sb_filter->ntests) {
          sb_filter->ntests = num_kept;
     } else {
          if (num_kept > sb_filter->ntests) {
               LOG_WARN("Number of variants (%ld) in SB filter larger than the number of predefined tests (%ld). Are you sure that makes sense?\n",
                        num_kept, sb_filter->ntests);
          }
     }

     /* multiple testing correction
      */
     if (sb_filter->mtc_type == MTC_BONF) {
          bonf_corr(sb_probs, num_kept, 
                    sb_filter->ntests);
          
     } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(sb_probs, num_kept, 
                         sb_filter->alpha, sb_filter->ntests);
          
     } else if (sb_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej; /* indices of rejected i.e. significant values */
          
          num_rej = fdr(sb_probs, num_kept, 
                        sb_filter->alpha, sb_filter->ntests, 
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               sb_probs[i] = DBL_MAX;
          }
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               sb_probs[idx] = -1;
          }
          free(idx_rej);
          
     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
          goto cleanup;
     }
     
     for (i=0; i<num_kept; i++) {
          /* note: reverse of qual filters, i.e. apply filter if significant, and not the other way around! */
          if (sb_probs[i] < sb_filter->alpha) {
               if (sb_filter->no_compound || mtc_quals[orig_idx[i]].is_alt_mostly_on_one_strand) {
                    mtc_quals[orig_idx[i]].sb_qual = -1;
               }
          }
     }

     rc = 0;

cleanup:
     /* free(NULL) is a no-op, so this is safe on every path */
     free(orig_idx);
     free(sb_probs);

     return rc;
}
示例#4
0
/* Applies multiple testing correction to the variant qualities of all
 * indels in mtc_quals (num_vars entries); non-indels are ignored.
 *
 * Very similar to apply_snvqual_filter_mtc, but only looking at indels.
 *
 * Will set mtc_quals[].var_qual to -1 for every indel that is
 * significant after correction, i.e. whose corrected probability is
 * smaller than indelqual_filter->alpha (i.e. don't filter).
 *
 * If indelqual_filter->ntests is 0 it is set to the number of indels.
 *
 * Returns 0 on success (also if there was nothing to test) and -1 on
 * error (out of memory, unknown MTC type).
 */
int apply_indelqual_filter_mtc(mtc_qual_t *mtc_quals, indelqual_filter_t *indelqual_filter,  const long int num_vars)
{
     long int *orig_idx = NULL; /* allocated for num_vars, first num_kept entries used */
     double *errprobs = NULL; /* same layout as orig_idx */
     long int num_kept = 0; /* number of indels, i.e. entries actually tested */
     long int i;
     int rc = -1;

     /* nothing to test. returning here also avoids malloc(0), which
      * may legitimately return NULL */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from indels only and keep track of their indices
      */
     errprobs = malloc((size_t)num_vars * sizeof *errprobs);
     orig_idx = malloc((size_t)num_vars * sizeof *orig_idx);
     if ( ! errprobs || ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (! mtc_quals[i].is_indel) {
               continue;
          }
          errprobs[num_kept] = PHREDQUAL_TO_PROB(mtc_quals[i].var_qual);
          orig_idx[num_kept] = i;
          num_kept += 1;
     }
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! indelqual_filter->ntests) {
          indelqual_filter->ntests = num_kept;
     } else {
          if (num_kept > indelqual_filter->ntests) {
               LOG_WARN("Number of variants (%ld) larger than the number of predefined tests (%ld). Are you sure that makes sense?\n",
                        num_kept, indelqual_filter->ntests);
          }
     }

     /* multiple testing correction
      */
     if (indelqual_filter->mtc_type == MTC_BONF) {
          bonf_corr(errprobs, num_kept, 
                    indelqual_filter->ntests);
          
     } else if (indelqual_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(errprobs, num_kept, 
                         indelqual_filter->alpha, indelqual_filter->ntests);
          
     } else if (indelqual_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej; /* indices of rejected i.e. significant values */

          num_rej = fdr(errprobs, num_kept, 
                        indelqual_filter->alpha, indelqual_filter->ntests, 
                        &idx_rej);
          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               errprobs[i] = DBL_MAX;
          }
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               errprobs[idx] = -1;
          }
          free(idx_rej);
          
     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", indelqual_filter->mtc_type);
          goto cleanup;
     }
     
     for (i=0; i<num_kept; i++) {
          if (errprobs[i] < indelqual_filter->alpha) {
               mtc_quals[orig_idx[i]].var_qual = -1;
          }
     }

     rc = 0;

cleanup:
     /* free(NULL) is a no-op, so this is safe on every path */
     free(orig_idx);
     free(errprobs);

     return rc;
}
示例#5
0
/* Applies multiple testing correction to the strand bias values (INFO
 * key "SB") of the given variants and filters everything that's
 * significant.
 *
 * Indels are skipped unless sb_filter->incl_indels is set. Variants
 * without an SB tag are skipped as well (a warning is printed once).
 * For every remaining variant whose corrected probability is smaller
 * than sb_filter->alpha, sb_filter->id is added to its filter field,
 * provided that sb_filter->no_compound is set or
 * alt_mostly_on_one_strand() holds for it.
 *
 * If sb_filter->ntests is 0 it is set to the number of tested variants.
 *
 * very similar to in apply_snvqual_filter_mtc, but reverse logic and looking at all vars
 *
 * Returns 0 on success (also if there was nothing to test) and -1 on
 * error (out of memory, unknown MTC type).
 */
int apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
     double *sb_probs = NULL;
     long int *orig_idx = NULL;/* we might ignore some variants (missing values etc). keep track of real indices of kept vars */
     double *shrunk_probs = NULL;
     long int *shrunk_idx = NULL;
     long int num_kept = 0; /* number of variants actually tested */
     long int i;
     int rc = -1;

     /* nothing to test. returning here also avoids malloc(0), which
      * may legitimately return NULL */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from vars kept in mem
      */
     sb_probs = malloc((size_t)num_vars * sizeof *sb_probs);
     orig_idx = malloc((size_t)num_vars * sizeof *orig_idx);
     if ( ! sb_probs || ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          char *sb_char = NULL;
          
          /* ignore indels too if sb filter is not to be applied */
          if (! sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
               continue;
          }

          if ( ! vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                    sb_missing_warning_printed = 1;
               }
               continue;
          }

          sb_probs[num_kept] = PHREDQUAL_TO_PROB(atoi(sb_char));
          orig_idx[num_kept] = i;
          num_kept += 1;
          free(sb_char);
     }
     if (num_kept <= 0) {
          rc = 0;
          goto cleanup;
     }

     /* give back unused memory. this is best effort only: if realloc
      * fails the original (larger) buffers are still valid, so we
      * simply keep using them instead of losing the pointer */
     shrunk_probs = realloc(sb_probs, (size_t)num_kept * sizeof *sb_probs);
     if (shrunk_probs) {
          sb_probs = shrunk_probs;
     }
     shrunk_idx = realloc(orig_idx, (size_t)num_kept * sizeof *orig_idx);
     if (shrunk_idx) {
          orig_idx = shrunk_idx;
     }

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! sb_filter->ntests) {
          sb_filter->ntests = num_kept;
     } else {
          if (num_kept > sb_filter->ntests) {
               LOG_WARN("%s\n", "Number of variants larger than number of predefined tests for SB filter! Are you sure that makes sense?");
          }
     }

     /* multiple testing correction
      */
     if (sb_filter->mtc_type == MTC_BONF) {
          bonf_corr(sb_probs, num_kept, 
                    sb_filter->ntests);
          
     } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(sb_probs, num_kept, 
                         sb_filter->alpha, sb_filter->ntests);
          
     } else if (sb_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej; /* indices of rejected i.e. significant values */
          
          num_rej = fdr(sb_probs, num_kept, 
                        sb_filter->alpha, sb_filter->ntests, 
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               sb_probs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               sb_probs[idx] = -1;
          }
          free(idx_rej);
          
     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
          goto cleanup;
     }
     
     for (i=0; i<num_kept; i++) {
          if (sb_probs[i] < sb_filter->alpha) {
               if (sb_filter->no_compound || alt_mostly_on_one_strand(vars[orig_idx[i]])) {
                    vcf_var_add_to_filter(vars[orig_idx[i]], sb_filter->id);
               }
          }
     }

     rc = 0;

cleanup:
     /* free(NULL) is a no-op, so this is safe on every path */
     free(orig_idx);
     free(sb_probs);

     return rc;
}
示例#6
0
/* Applies multiple testing correction to the qualities of indel
 * variants and filters everything that's not significant.
 *
 * Only variants with qual > -1 that carry the INFO key "INDEL" are
 * tested. For every tested variant whose corrected probability is
 * larger than indelqual_filter->alpha, indelqual_filter->id is added
 * to its filter field.
 *
 * If indelqual_filter->ntests is 0 it is set to the number of tested
 * variants.
 *
 * Very similar to apply_sb_filter_mtc, but reverse testing logic and only looking at non consvars
 *
 * Returns 0 on success (also if there was nothing to test) and -1 on
 * error (out of memory, unknown MTC type).
 */
int apply_indelqual_filter_mtc(indelqual_filter_t *indelqual_filter, var_t **vars, const long int num_vars)
{
     /* can only apply this logic to variants that are not consensus
      * variants, i.e those that actually have a quality. therefore
      * keep track of non cons var indices */
     long int *orig_idx = NULL; /* first num_noncons_vars entries used */
     double *noncons_errprobs = NULL; /* same layout as orig_idx */
     long int *shrunk_idx = NULL;
     double *shrunk_probs = NULL;
     long int num_noncons_vars = 0;
     long int i;
     int rc = -1;

     /* FIXME function almost identical to its SNV quality counterpart
      * (presumably apply_snvqual_filter_mtc), just a different filter.
      * can be easily merged by accepting both types of variants */

     /* nothing to test. returning here also avoids malloc(0), which
      * may legitimately return NULL */
     if (num_vars <= 0) {
          return 0;
     }

     /* collect values from noncons vars only and keep track of their indices
      */
     orig_idx = malloc((size_t)num_vars * sizeof *orig_idx);
     noncons_errprobs = malloc((size_t)num_vars * sizeof *noncons_errprobs);
     if ( ! orig_idx || ! noncons_errprobs) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          if (vars[i]->qual>-1 && vcf_var_has_info_key(NULL, vars[i], "INDEL")) {
               noncons_errprobs[num_noncons_vars] = PHREDQUAL_TO_PROB(vars[i]->qual);
               orig_idx[num_noncons_vars] = i;
               num_noncons_vars += 1;
          }
     }
     if (! num_noncons_vars) {
          rc = 0;
          goto cleanup;
     }

     if (indelqual_filter->ntests && num_noncons_vars > indelqual_filter->ntests) {
          LOG_WARN("Number of (non consensus) variants larger than number of predefined tests for indelqual filter (%ld > %ld)! Are you sure that makes sense?\n", 
                   num_noncons_vars, indelqual_filter->ntests);
     }

     /* give back unused memory. this is best effort only: if realloc
      * fails the original (larger) buffers are still valid, so we
      * simply keep using them instead of losing the pointer */
     shrunk_idx = realloc(orig_idx, (size_t)num_noncons_vars * sizeof *orig_idx);
     if (shrunk_idx) {
          orig_idx = shrunk_idx;
     }
     shrunk_probs = realloc(noncons_errprobs, (size_t)num_noncons_vars * sizeof *noncons_errprobs);
     if (shrunk_probs) {
          noncons_errprobs = shrunk_probs;
     }

     /* only now we can set the number of tests (if it wasn't set by
      * caller) */
     if (! indelqual_filter->ntests) {
          indelqual_filter->ntests = num_noncons_vars;
     }

     /* multiple testing correction
      */
     if (indelqual_filter->mtc_type == MTC_BONF) {
          bonf_corr(noncons_errprobs, num_noncons_vars, 
                    indelqual_filter->ntests);
          
     } else if (indelqual_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(noncons_errprobs, num_noncons_vars, 
                         indelqual_filter->alpha, indelqual_filter->ntests);
          
     } else if (indelqual_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej; /* indices of rejected i.e. significant values */

          num_rej = fdr(noncons_errprobs, num_noncons_vars, 
                        indelqual_filter->alpha, indelqual_filter->ntests, 
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_noncons_vars; i++) {
               noncons_errprobs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               noncons_errprobs[idx] = -1;
          }
          free(idx_rej);
          
     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", indelqual_filter->mtc_type);
          goto cleanup;
     }
     
     for (i=0; i<num_noncons_vars; i++) {
          if (noncons_errprobs[i] > indelqual_filter->alpha) {
               vcf_var_add_to_filter(vars[orig_idx[i]], indelqual_filter->id);
          }
     }

     rc = 0;

cleanup:
     /* free(NULL) is a no-op, so this is safe on every path */
     free(orig_idx);
     free(noncons_errprobs);

     return rc;
}