Beispiel #1
0
/* Marks var with the SB filter id if its "SB" INFO value is above the
 * fixed threshold in sb_filter.
 *
 * - A threshold of zero means the filter is switched off: nothing happens.
 * - If the variant carries no "SB" INFO key, a warning is logged (only
 *   once, guarded by sb_missing_warning_printed) and the variant is left
 *   untouched.
 * - Otherwise the variant is filtered when SB > thresh and either
 *   no_compound is set or alt_mostly_on_one_strand() is true for it.
 */
void apply_sb_threshold(var_t *var, sb_filter_t *sb_filter)
{
     char *sb_str = NULL;
     int sb_val;

     /* filter disabled */
     if (! sb_filter->thresh) {
          return;
     }

     if (vcf_var_has_info_key(&sb_str, var, "SB")) {
          /* value string is ours to release once converted */
          sb_val = atoi(sb_str);
          free(sb_str);

          /* the strand check is only evaluated when the threshold is
           * exceeded and compound filtering is active */
          if (sb_val > sb_filter->thresh
              && (sb_filter->no_compound || alt_mostly_on_one_strand(var))) {
               vcf_var_add_to_filter(var, sb_filter->id);
          }
     } else if ( ! sb_missing_warning_printed) {
          /* complain about the missing tag only the first time */
          LOG_WARN("%s\n", "Requested SB filtering failed since SB tag is missing in variant");
          sb_missing_warning_printed = 1;
     }
}
Beispiel #2
0
/* Reads every variant from the VCF file vcf_in and stores, per variant,
 * the values this function extracts: indel flag, variant quality, the
 * integer value of the "SB" INFO key and the result of
 * alt_mostly_on_one_strand().
 *
 * *mtc_quals is allocated here (grown in steps of mtc_qual_incr entries);
 * the caller owns it and must free() it.
 *
 * Missing values are replaced and warned about once each: a quality of -1
 * becomes INT_MAX, a missing SB tag becomes 0.
 *
 * Returns the number of variants read, or -1 on error (file could not be
 * opened, header could not be skipped, or out of memory). On an
 * out-of-memory error the array is released and *mtc_quals is set to NULL.
 * The input file is closed on every path after a successful open.
 */
long int
mtc_quals_from_vcf_file(mtc_qual_t **mtc_quals, const char *vcf_in)
{
     long int num_vars = 0;
     long int mtc_qual_size = 0;
     int mtc_qual_incr = 16384;
     vcf_file_t vcffh;

     if (vcf_file_open(&vcffh, vcf_in,
                       HAS_GZIP_EXT(vcf_in), 'r')) {
          LOG_ERROR("Couldn't open %s\n", vcf_in);
          return -1;
     }

     if (0 != vcf_skip_header(&vcffh)) {
          LOG_WARN("%s\n", "vcf_skip_header() failed");
          vcf_file_close(&vcffh);
          return -1;
     }

     mtc_qual_size += mtc_qual_incr;
     (*mtc_quals) = calloc(mtc_qual_size, sizeof(mtc_qual_t));
     if ( ! (*mtc_quals)) {
          LOG_ERROR("%s\n", "out of memory");
          vcf_file_close(&vcffh);
          return -1;
     }

     while (1) {
          var_t *var;
          int rc;
          char *sb_char = NULL;
          mtc_qual_t *cur;

          vcf_new_var(&var);
          rc = vcf_parse_var(&vcffh, var);
          if (rc) {
               /* error and EOF cannot be told apart here; in both cases
                * the variant created for this iteration must be released */
               vcf_free_var(&var);
               break;
          }
          num_vars += 1;
          /* ingest anything: we keep adding filters */

          if (num_vars > mtc_qual_size) {
               mtc_qual_t *grown;

               mtc_qual_size += mtc_qual_incr;
               /* realloc into a temporary so the old array is not lost
                * if growing fails */
               grown = realloc((*mtc_quals), mtc_qual_size * sizeof(mtc_qual_t));
               if ( ! grown) {
                    LOG_ERROR("%s\n", "out of memory");
                    vcf_free_var(&var);
                    free(*mtc_quals);
                    (*mtc_quals) = NULL;
                    vcf_file_close(&vcffh);
                    return -1;
               }
               (*mtc_quals) = grown;
          }
          cur = &(*mtc_quals)[num_vars-1];

          cur->is_indel = vcf_var_is_indel(var);

          /* variant quality */
          if (var->qual==-1) {
               /* missing qualities to fake value */
               var->qual = INT_MAX;
               if (! varq_missing_warning_printed) {
                    LOG_WARN("%s\n", "Missing variant quality in at least once case. Assuming INT_MAX");
                    varq_missing_warning_printed = 1;
               }
               cur->var_qual = INT_MAX;
          } else {
               cur->var_qual = var->qual;
          }

          /* strand bias */
          if ( ! vcf_var_has_info_key(&sb_char, var, "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! Assuming 0");
                    sb_missing_warning_printed = 1;
               }
               cur->sb_qual = 0;
          } else {
               cur->sb_qual = atoi(sb_char);
               free(sb_char);
          }

          cur->is_alt_mostly_on_one_strand = alt_mostly_on_one_strand(var);

          vcf_free_var(&var);
     }
     vcf_file_close(&vcffh);

     return num_vars;
}
Beispiel #3
0
/* Applies the strand-bias (SB) filter with multiple testing correction
 * to the num_vars variants in vars; filters everything that is
 * significant.
 *
 * Very similar to apply_snvqual_filter_mtc, but with reverse logic and
 * looking at all vars.
 *
 * Variants are ignored (never filtered here) if they are indels while
 * sb_filter->incl_indels is unset, or if they carry no "SB" INFO key
 * (warned about once). For the rest, the SB value is converted with
 * PHREDQUAL_TO_PROB, corrected according to sb_filter->mtc_type, and any
 * variant whose corrected value is below sb_filter->alpha gets
 * sb_filter->id added to its filters, provided no_compound is set or
 * alt_mostly_on_one_strand() holds for it.
 *
 * If sb_filter->ntests is zero it is set to the number of variants
 * actually tested.
 *
 * Returns 0 on success (including "nothing to test") and -1 on error
 * (out of memory or unknown MTC type). All temporary buffers are released
 * on every path.
 */
int apply_sb_filter_mtc(sb_filter_t *sb_filter, var_t **vars, const long int num_vars)
{
     double *sb_probs = NULL;
     double *sb_probs_shrunk = NULL;
     long int *orig_idx = NULL;/* we might ignore some variants (missing values etc). keep track of real indices of kept vars */
     long int *orig_idx_shrunk = NULL;
     long int i;
     long int num_ign = 0;
     long int num_kept = 0;
     int rc = -1;

     /* malloc(0) may legitimately return NULL, which must not be
      * mistaken for an out-of-memory condition */
     if (num_vars == 0) {
          return 0;
     }

     /* collect values from vars kept in mem
      */
     sb_probs = malloc(num_vars * sizeof(double));
     if ( ! sb_probs) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }
     orig_idx = malloc(num_vars * sizeof(long int));
     if ( ! orig_idx) {
          LOG_FATAL("%s\n", "out of memory");
          goto cleanup;
     }

     for (i=0; i<num_vars; i++) {
          char *sb_char = NULL;

          /* ignore indels too if sb filter is not to be applied */
          if (! sb_filter->incl_indels && vcf_var_is_indel(vars[i])) {
               num_ign += 1;
               continue;
          }

          if ( ! vcf_var_has_info_key(&sb_char, vars[i], "SB")) {
               if ( ! sb_missing_warning_printed) {
                    LOG_WARN("%s\n", "At least one variant has no SB tag! SB filtering will be incomplete");
                    sb_missing_warning_printed = 1;
               }
               num_ign += 1;
               continue;
          }

          /* kept values are stored densely; orig_idx maps the dense
           * position back to the index in vars */
          sb_probs[i-num_ign] = PHREDQUAL_TO_PROB(atoi(sb_char));
          orig_idx[i-num_ign] = i;
          free(sb_char);
     }

     num_kept = num_vars - num_ign;
     if (num_kept <= 0) {
          /* nothing left to test */
          rc = 0;
          goto cleanup;
     }

     /* realloc to smaller size apparently not guaranteed to free up space so no point really but let's make sure we don't use that memory.
      * Go through temporaries so the original buffers can still be freed on failure */
     sb_probs_shrunk = realloc(sb_probs, num_kept * sizeof(double));
     if (! sb_probs_shrunk) {
          LOG_FATAL("realloc failed. Exiting...");
          goto cleanup;
     }
     sb_probs = sb_probs_shrunk;
     orig_idx_shrunk = realloc(orig_idx, num_kept * sizeof(long int));
     if (! orig_idx_shrunk) {
          LOG_FATAL("realloc failed. Exiting...");
          goto cleanup;
     }
     orig_idx = orig_idx_shrunk;

     if (! sb_filter->ntests) {
          sb_filter->ntests = num_kept;
     } else {
          /* more variants are tested than the correction accounts for */
          if (num_kept > sb_filter->ntests) {
               LOG_WARN("%s\n", "Number of predefined tests for SB filter smaller than number of variants! Are you sure that makes sense?");
          }
     }

     /* multiple testing correction
      */
     if (sb_filter->mtc_type == MTC_BONF) {
          bonf_corr(sb_probs, num_kept,
                    sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_HOLMBONF) {
          holm_bonf_corr(sb_probs, num_kept,
                         sb_filter->alpha, sb_filter->ntests);

     } else if (sb_filter->mtc_type == MTC_FDR) {
          long int num_rej = 0;
          long int *idx_rej = NULL; /* indices of rejected i.e. significant values */

          num_rej = fdr(sb_probs, num_kept,
                        sb_filter->alpha, sb_filter->ntests,
                        &idx_rej);

          /* first pretend none are significant */
          for (i=0; i<num_kept; i++) {
               sb_probs[i] = DBL_MAX;
          }
          LOG_DEBUG("%ld results significant after fdr\n", num_rej);
          /* then mark the rejected ones with a value that is below the
           * alpha comparison used further down */
          for (i=0; i<num_rej; i++) {
               long int idx = idx_rej[i];
               sb_probs[idx] = -1;
          }
          free(idx_rej);

     } else {
          LOG_FATAL("Internal error: unknown MTC type %d\n", sb_filter->mtc_type);
          goto cleanup;
     }

     for (i=0; i<num_kept; i++) {
          if (sb_probs[i] < sb_filter->alpha) {
               if (sb_filter->no_compound || alt_mostly_on_one_strand(vars[orig_idx[i]])) {
                    vcf_var_add_to_filter(vars[orig_idx[i]], sb_filter->id);
               }
          }
     }

     rc = 0;

cleanup:
     free(orig_idx);
     free(sb_probs);

     return rc;
}