void
indel_digt_caller::
starling_indel_call_pprob_digt(const starling_options& opt,
                               const starling_deriv_options& dopt,
                               const starling_sample_options& sample_opt,
                               const double indel_error_prob,
                               const double ref_error_prob,
                               const indel_key& ik,
                               const indel_data& id,
                               const bool is_use_alt_indel,
                               starling_diploid_indel& dindel) const {

    // no immediate plans to include this for regular indel-calling:
    static const bool is_tier2_pass(false);

    if (opt.is_noise_indel_filter && is_diploid_indel_noise(dopt,id,is_tier2_pass)) {
        dindel.is_indel=false;
        return;
    }

    // get likelihood of each genotype:
    double lhood[STAR_DIINDEL::SIZE];
    get_indel_digt_lhood(opt,dopt,sample_opt,indel_error_prob,ref_error_prob,ik,id,
                         opt.is_bindel_diploid_het_bias,opt.bindel_diploid_het_bias,
                         is_tier2_pass,is_use_alt_indel,lhood);

    // mult by prior distro to get unnormalized pprob:
    const double* indel_lnprior(lnprior_genomic());
    for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) {
        dindel.pprob[gt] = lhood[gt] + indel_lnprior[gt];
    }

    normalize_ln_distro(dindel.pprob,dindel.pprob+STAR_DIINDEL::SIZE,dindel.max_gt);


#ifdef DEBUG_INDEL_CALL
    log_os << "INDEL_CALL pprob(noindel),pprob(hom),pprob(het): " << dindel.pprob[STAR_DIINDEL::NOINDEL] << " " << dindel.pprob[STAR_DIINDEL::HOM] << " " << dindel.pprob[STAR_DIINDEL::HET] << "\n";
#endif

    dindel.indel_qphred=error_prob_to_qphred(dindel.pprob[STAR_DIINDEL::NOINDEL]);
    dindel.max_gt_qphred=error_prob_to_qphred(prob_comp(dindel.pprob,dindel.pprob+STAR_DIINDEL::SIZE,dindel.max_gt));

    // add new poly calls:
    normalize_ln_distro(lhood,lhood+STAR_DIINDEL::SIZE,dindel.max_gt_poly);
    dindel.max_gt_poly_qphred=error_prob_to_qphred(prob_comp(lhood,lhood+STAR_DIINDEL::SIZE,dindel.max_gt_poly));

    // old report criteria:
    //dindel.is_indel=(dindel.max_gt != STAR_DIINDEL::NOINDEL);
    dindel.is_indel=(dindel.indel_qphred != 0);
}
// Example #2
// 0
// Fill every lookup table held by the cache:
//   q2p[q]        - phred_to_error_prob(q)
//   q2lncompe[q]  - log1p_switch(-q2p[q])
//   q2lne[q]      - q * (-ln(10)/10)
//   mappedq[m][q] - error_prob_to_qphred(phred_to_mapped_error_prob(q,m))
// for q in [0,MAX_QSCORE] and m in [0,MAX_MAP].
//
qphred_cache::
qphred_cache()
{
    // scale factor applied to a qscore to obtain q2lne: -ln(10)/10
    static const double lne_per_q(-std::log(10.)/10.);

    // the three one-dimensional tables only depend on the current qscore,
    // so they are filled together:
    for (int qscore(0); qscore<=MAX_QSCORE; ++qscore)
    {
        q2p[qscore] = phred_to_error_prob(static_cast<double>(qscore));
        q2lncompe[qscore] = log1p_switch(-q2p[qscore]);
        q2lne[qscore] = static_cast<double>(qscore)*lne_per_q;
    }

    // two-dimensional table, indexed [mapping score][qscore]:
    for (int qscore(0); qscore<=MAX_QSCORE; ++qscore)
    {
        for (int mapq(0); mapq<=MAX_MAP; ++mapq)
        {
            const double mapped_error_prob(phred_to_mapped_error_prob(qscore,mapq));
            mappedq[mapq][qscore] = error_prob_to_qphred(mapped_error_prob);
        }
    }
}
void
write_nonref_2allele_test(const blt_options& opt,
                          const snp_pos_info& pi,
                          const nonref_test_call& nrc,
                          std::ostream& os) {

    os << nrc.snp_qphred
       << '\t' << NR2TEST::label(static_cast<NR2TEST::index_t>(nrc.max_gt)) << "_" << id_to_base(nrc.nonref_id)
       << '\t' << nrc.max_gt_qphred;

    //if(opt.is_print_used_allele_counts) {
    pi.print_known_counts(os,opt.used_allele_count_min_qscore);
    pi.print_known_qscore(os,opt.used_allele_count_min_qscore);
    //}

#if 0
    if (opt.is_print_all_poly_gt) {
        for (unsigned gt(0); gt<DIGT::SIZE; ++gt) {
#if 1
            // print GT as prob:
            os << '\t' << po.pprob[gt];
#else
            // print GT as qval:
            os << '\t' << error_prob_to_qphred(prob_comp(po.pprob,po.pprob+DIGT::SIZE,gt));
#endif
        }
    }
    const result_set& ge(dgt.genome);
    const result_set& po(dgt.poly);
#endif

#if 0
    if (nrc.is_freq) {
        os << std::setprecision(8) << std::fixed;
        for (unsigned i(0); i<N_BASE; ++i) {
            os << '\t' << nrc.allele_freq[i];
        }
        os.unsetf(std::ios::fixed);
    }
#endif
}
void
position_nonref_2allele_test(const snp_pos_info& pi,
                             const blt_options& opt,
                             const bool /*is_always_test*/,
                             nonref_test_call& nrc) {

    static const bool is_mle_freq(false);

    if (pi.ref_base=='N') return;

    // add early escape test here?

    // 1. Determine the two 'primary' alleles -- Simple test just adds
    // up qscores to determine which alleles are primary.
    //
    nrc.nonref_id=(BASE_ID::ANY);
    //unsigned nonref2_id(BASE_ID::ANY); // just ignore this value for now....
    {
        double qtot[N_BASE];
        for (unsigned i(0); i<N_BASE; ++i) qtot[i] = 0;

        const unsigned n_calls(pi.calls.size());
        for (unsigned i(0); i<n_calls; ++i) {
            if (pi.calls[i].base_id==BASE_ID::ANY) continue;
            qtot[pi.calls[i].base_id] += pi.calls[i].get_qscore();
        }

        // get max and max2:
        unsigned max_id=0;
        unsigned max2_id=1;
        for (unsigned b(1); b<N_BASE; ++b) {
            if (qtot[b] > qtot[max_id]) {
                max2_id = max_id;
                max_id = b;
            } else if (qtot[b] > qtot[max2_id]) {
                max2_id = b;
            }
        }

        const unsigned ref_id=base_to_id(pi.ref_base);
        if       (ref_id==max_id) {
            nrc.nonref_id=max2_id;

#if 0
        } else if (ref_id==max2_id) {
            nrc.nonref_id=max_id;
#endif
        } else {
            nrc.nonref_id=max_id;
            //nonref2_id=max2_id;
        }
    }

    blt_float_t lhood[NR2TEST::SIZE];

    lhood[NR2TEST::REF] = calc_pos_nonref_freq_loghood(pi,0.);

    sparse_function sf;
    nonref_allele_freq_loghood_sparse_func nlf(pi,nrc.nonref_id,sf);
    sample_uniform_range(0.,1.,nlf);
    //sample_uniform_range(min_nonref_freq,1.,nlf);

    lhood[NR2TEST::NONREF_MF] = integrate_ln_sparsefunc(sf, opt.min_nonref_freq, 1,1,1);
    lhood[NR2TEST::NONREF_MF_NOISE] = integrate_ln_sparsefunc(sf, 0, opt.nonref_site_error_decay_freq,1,0);

    static const blt_float_t neginf(-std::numeric_limits<blt_float_t>::infinity());
    lhood[NR2TEST::NONREF_OTHER] = neginf;

    //std::cerr << "WAGART: logh ref/nonef: " << lhood[0] << " " << lhood[1] << "\n";

    // TODO: ctor compute this:

    // this goes in here just in case someone cranks both parameters up near 1:
    //
    const double nonref_variant_rate_used = opt.nonref_variant_rate*(1-opt.nonref_site_error_rate);

    blt_float_t prior[NR2TEST::SIZE];
    prior[NR2TEST::REF] = log1p_switch(-(nonref_variant_rate_used+opt.nonref_site_error_rate));
    prior[NR2TEST::NONREF_MF] = std::log(nonref_variant_rate_used/3);
    prior[NR2TEST::NONREF_MF_NOISE] = std::log(opt.nonref_site_error_rate);
    prior[NR2TEST::NONREF_OTHER] = std::log(2*nonref_variant_rate_used/3);

    double pprob[NR2TEST::SIZE];
    for (unsigned i(0); i<NR2TEST::SIZE; ++i) {
        pprob[i] = lhood[i] + prior[i];
    }
    normalize_ln_distro(pprob,pprob+NR2TEST::SIZE,nrc.max_gt);

    nrc.snp_qphred=error_prob_to_qphred(pprob[NR2TEST::REF]+pprob[NR2TEST::NONREF_MF_NOISE]);
    nrc.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob,pprob+NR2TEST::SIZE,nrc.max_gt));

    nrc.is_snp=(nrc.snp_qphred != 0);

    if (! (is_mle_freq && nrc.is_snp)) return;

#if 0
    const double null_loghood(calc_pos_nonref_freq_loghood(pi,0.));

    // heuristic to escape early:
    static const double p_delta(0.001);
    const double delta_loghood(calc_pos_nonref_freq_loghood(pi,p_delta));
    if (null_loghood > delta_loghood) return;

    double x_nonref_freq;
    double x_loghood;

    position_nonref_freq_loghood_minfunc mf(epi);

    static const double x1(0.5);
    static const double x2(0.4);
    codemin::minimize_1d(x1,x2,mf.val(x1),mf,x_nonref_freq,x_loghood);

    x_nonref_freq = mf.arg_to_prob(x_nonref_freq);

    const double log_lrt(-2.*(x_loghood+null_loghood));

    // becuase null has the parameter fixed to a boundary value, the
    // asymmtotic distribution is a 50:50 mixture of csq(0) and chq(1)
    // -- the same effect as multiplying alpha of csq(1) by 2, dividing
    // the null prob by 2. (as we do below):
    boost::math::chi_squared dist(1);
    const double null_prob((1.-boost::math::cdf(dist,log_lrt))/2.);

    sc.is_snp=(null_prob<alpha);
    sc.null_loghood=null_loghood;
    sc.min_test_loghood=-x_loghood;
    sc.snp_prob=1.-null_prob;

    // if it's a snp then get additional information on non-reference
    // allele frequencies.
    //
    if (not sc.is_snp) return;

    static const double line_tol(1e-7);
    static const double start_ratio(0.05);
    static const double min_start_dist(1e-6);
    static const double end_tol(1e-7);
    static const unsigned max_iter(200);

    const unsigned ref_base_id(base_to_id(pi.ref_base));

    const double ref_freq(1.-x_nonref_freq);
    const double nonref_freq((x_nonref_freq)/3.);
    for (unsigned i(0); i<N_BASE; ++i) {
        if (i==ref_base_id) sc.allele_freq[i] = ref_freq;
        else               sc.allele_freq[i] = nonref_freq;
    }

    static const unsigned N_BASE2(N_BASE*N_BASE);
    double conj_dir[N_BASE2];
    std::fill(conj_dir,conj_dir+N_BASE2,0.);
    for (unsigned i(0); i<N_BASE; ++i) {
        const double start_dist( std::max(std::fabs(sc.allele_freq[i]*start_ratio),min_start_dist) );
        conj_dir[i*(N_BASE+1)] = start_dist;
    }

    double start_tol(end_tol);
    unsigned iter;
    double x_all_loghood;
    double final_dlh;
    position_allele_distro_loghood_minfunc alm(epi);
    codemin::minimize_conj_direction(sc.allele_freq,conj_dir,alm,start_tol,end_tol,line_tol,
                                     x_all_loghood,iter,final_dlh,max_iter);
    alm.arg_to_prob(sc.allele_freq,sc.allele_freq);

    sc.min_loghood=-x_all_loghood;
#endif
}
// Combine the normal and tumor indel genotype likelihoods with the prior
// into a joint (normal,tumor) posterior, then fill rs with the most likely
// joint state and the somatic quality values derived from that posterior.
//
// normal_lnprior, normal_lhood and tumor_lhood are indexed by
// STAR_DIINDEL genotype; lnmatch / lnmismatch are added to the prior when
// the tumor genotype equals / differs from the normal genotype.
// The opt argument is not read.
//
static
void
calculate_result_set(const strelka_options& opt,
                     const double* normal_lnprior,
                     const double lnmatch,
                     const double lnmismatch,
                     const double* normal_lhood,
                     const double* tumor_lhood,
                     result_set& rs) {

    const unsigned n_gt(STAR_DIINDEL::SIZE);
    const unsigned het_gt(STAR_DIINDEL::HET);

#ifdef SOMATIC_DEBUG
    // sanity-check the full joint prior on its own:
    std::vector<double> check_prior(DDIINDEL::SIZE);

    for (unsigned ngt(0); ngt<n_gt; ++ngt) {
        for (unsigned tgt(0); tgt<n_gt; ++tgt) {
            const double ln_transition((tgt==ngt) ? lnmatch : lnmismatch);
            check_prior[DDIINDEL::get_state(ngt,tgt)] = normal_lnprior[ngt]+ln_transition;
        }
    }

    check_ln_distro(check_prior.begin(),
                    check_prior.end(),
                    "somatic indel full prior");
#endif

    // unnormalized joint posterior over every (normal,tumor) genotype pair:
    std::vector<double> pprob(DDIINDEL::SIZE);

    for (unsigned ngt(0); ngt<n_gt; ++ngt) {
        for (unsigned tgt(0); tgt<n_gt; ++tgt) {
            const double ln_transition((tgt==ngt) ? lnmatch : lnmismatch);
            pprob[DDIINDEL::get_state(ngt,tgt)] =
                normal_lhood[ngt]+
                tumor_lhood[tgt]+
                normal_lnprior[ngt]+
                ln_transition;
        }
    }

    normalize_ln_distro(pprob.begin(),pprob.end(),rs.max_gt);

#ifdef DEBUG_INDEL_CALL
    log_os << "INDEL_CALL pprob(noindel),pprob(hom),pprob(het): " << pprob[STAR_DIINDEL::NOINDEL] << " " << pprob[STAR_DIINDEL::HOM] << " " << pprob[STAR_DIINDEL::HET] << "\n";
#endif

    // total posterior of the states where tumor and normal agree:
    double nonsomatic_sum(0);
    for (unsigned gt(0); gt<n_gt; ++gt) {
        nonsomatic_sum += pprob[DDIINDEL::get_state(gt,gt)];
    }
    rs.sindel_qphred=error_prob_to_qphred(nonsomatic_sum);

    // for each normal genotype 'from_gt': posterior of everything that is
    // NOT "somatic change with the normal at from_gt", i.e. the non-somatic
    // total plus all somatic states whose normal genotype differs:
    double not_somfrom_sum[STAR_DIINDEL::SIZE];
    for (unsigned from_gt(0); from_gt<n_gt; ++from_gt) {
        double total(nonsomatic_sum);
        for (unsigned ngt(0); ngt<n_gt; ++ngt) {
            if (ngt==from_gt) continue;
            for (unsigned tgt(0); tgt<n_gt; ++tgt) {
                if (tgt!=ngt) {
                    total += pprob[DDIINDEL::get_state(ngt,tgt)];
                }
            }
        }
        not_somfrom_sum[from_gt]=total;
    }
    rs.sindel_from_ref_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::NOINDEL]);
    rs.sindel_from_het_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::HET]);
    rs.sindel_from_hom_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::HOM]);

    // same idea for "normal is any non-het genotype": the only somatic
    // states added to the non-somatic total are those with a het normal:
    double not_somfromanyhom_sum(nonsomatic_sum);
    for (unsigned tgt(0); tgt<n_gt; ++tgt) {
        if (tgt==het_gt) continue;
        not_somfromanyhom_sum += pprob[DDIINDEL::get_state(het_gt,tgt)];
    }
    rs.sindel_from_anyhom_qphred=error_prob_to_qphred(not_somfromanyhom_sum);

    // quality of the single most likely joint state:
    rs.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob.begin(),pprob.end(),rs.max_gt));
}
// Starting from the normal and tumor genotype likelihoods, build the joint
// (normal,tumor) posterior on the strand-grid model and fill rs with the
// values derived from it: max_gt, snv_qphred and -- only when snv_qphred is
// non-zero -- snv_from_ntype_qphred and ntype.
//
// lnmatch / lnmismatch are added to the prior for joint states where the
// tumor genotype equals / differs from the normal genotype. The unnamed
// ref_gt parameter is ignored.
//
static
void
calculate_result_set_grid(const blt_float_t* normal_lhood,
                          const blt_float_t* tumor_lhood,
                          const somatic_snv_caller_strand_grid::prior_set& pset,
                          const blt_float_t lnmatch,
                          const blt_float_t lnmismatch,
                          const unsigned /*ref_gt*/,
                          result_set& rs) {

    // a piece transplanted from 1150 to make a formal correction to
    // the priors which should have a low-impact on the results.
    // the prior below is incomplete
#ifdef DEBUG_ALTERNATE_PRIOR
    static const double neginf(-std::numeric_limits<double>::infinity());

    std::vector<double> prior(DDIGT_SGRID::SIZE);
    std::fill(prior.begin(),prior.end(),neginf);

    // this zero'd code is incomplete and abandoned for now...:
#if 0
    for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
        double base_prior(neginf);
        const bool is_noise(ngt>=STAR_DIINDEL::SIZE);
        if(is_noise) {
            base_prior=pset.normal[ngt];
        } else {
            base_prior=pset.nonoise[ngt];
        }
        for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) {
            const blt_float_t tgt_prior_mod( (tgt==ngt) ? lnmatch : lnmismatch );
            const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt));
            prior[dgt] = normal_genomic_lnprior[ngt]+tgt_prior_mod;
        }
    }

    for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) {
        const unsigned dgt(DDIGT_SGRID::get_state(gt,gt));
        prior[dgt] = normal_genomic_lnprior[gt]+lnmatch;
    }
#endif

    check_ln_distro(prior.begin(),
                    prior.end(),
                    "somatic snv full prior");
#endif

    // the joint posterior is deliberately held in double even though the
    // inputs are blt_float_t:
    std::vector<double> joint_pprob(DDIGT_SGRID::SIZE);

    // states of the regular grid model: likelihoods plus prior give the
    // unnormalized posterior.
    //
    for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
        for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) {
            const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt));

#if 0
            // the trusty old way...:
            const blt_float_t tgt_prior_mod( (tgt==ngt) ? lnmatch : lnmismatch );
            joint_pprob[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]+pset.normal[ngt]+tgt_prior_mod;
#else

            // matching states use the normal prior; for somatic (mismatched)
            // states the somatic marginal takes the role of the normal prior:
            //            static const blt_float_t unorm(std::log(static_cast<blt_float_t>(DIGT_SGRID::PRESTRAND_SIZE)));
            const blt_float_t state_lnprior( (tgt==ngt)
                                             ? (pset.normal[ngt]+lnmatch)
                                             : (pset.somatic_marginal[ngt]+lnmismatch) );
            joint_pprob[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]+state_lnprior;

#endif
        }
    }

    // Single-strand noise states. These only appear as matched
    // (normal==tumor) states: mixtures of somatic variation with these
    // noise states are not considered, b/c single-strand observations can
    // almost exclusively be ruled out as noise:
    //
    for(unsigned sgt(DIGT_SGRID::PRESTRAND_SIZE); sgt<DIGT_SGRID::SIZE; ++sgt) {
        joint_pprob[DDIGT_SGRID::get_state(sgt,sgt)] =
            normal_lhood[sgt]+tumor_lhood[sgt]+pset.normal[sgt]+lnmatch;
    }

    opt_normalize_ln_distro(joint_pprob.begin(),joint_pprob.end(),DDIGT_SGRID::is_nonsom.val.begin(),rs.max_gt);
    //normalize_ln_distro(joint_pprob.begin(),joint_pprob.end(),rs.max_gt);

    // total posterior of the states where normal and tumor agree:
    double nonsomatic_sum(0);
    for(unsigned gt(0); gt<DIGT_SGRID::SIZE; ++gt) {
        nonsomatic_sum += joint_pprob[DDIGT_SGRID::get_state(gt,gt)];
    }
    rs.snv_qphred=error_prob_to_qphred(nonsomatic_sum);

    // nothing more to classify when there is no somatic signal:
    if(rs.snv_qphred==0) return;

#if 0
    // alternate way to calculate the joint:
    //
    double min_not_somfrom_sum(0);
    for(unsigned dgt(0); dgt<DIGT::SIZE; ++dgt) {
        double not_somfrom_sum(nonsomatic_sum);

        for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
            // we're looking for the joint prob when state dgt is true
            // in the normal, so skip this as a normal state here:
            //
            if(dgt==ngt) continue;

            for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) {
                // we've already started from the nonsomatic som, so we can skip the equal states:
                //
                if(ngt==tgt) continue;

                not_somfrom_sum += joint_pprob[DDIGT_SGRID::get_state(ngt,tgt)];
            }
        }

        if((dgt==0) || (not_somfrom_sum<min_not_somfrom_sum)) {
            min_not_somfrom_sum=not_somfrom_sum;
            rs.snv_from_ntype_qphred=error_prob_to_qphred(not_somfrom_sum);
            rs.ntype=dgt;
        }
    }
#endif

#if 0
    // reset max_gt to the most likely state excluding normal noise states:
    //
    rs.max_gt=0;
    for(unsigned dgt(0); dgt<DIGT::SIZE; ++dgt) {
        for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) {
            const unsigned xgt(DDIGT_SGRID::get_state(dgt,tgt));
            if(joint_pprob[xgt] > joint_pprob[rs.max_gt]) rs.max_gt=xgt;
        }
    }
#endif

    // Classify the call by the normal sample's genotype, computed from the
    // normal sample alone.
    //
    // The polymorphic prior is used because in this situation we want to
    // be conservative about the reference classification --
    // ie. conditioned on only looking at putative somatic sites, we
    // require evidence to show that the normal is in fact reference
    // and not simply an unsampled copy of the somatic variation.
    //
    std::vector<double> normal_pprob(DIGT_SGRID::PRESTRAND_SIZE);
    for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
        normal_pprob[ngt] = normal_lhood[ngt]+pset.normal_poly[ngt];
    }

    unsigned max_norm_gt(0);
    normalize_ln_distro(normal_pprob.begin(),normal_pprob.end(),max_norm_gt);

    // value prob_comp reports for max_norm_gt.
    // NOTE(review): elsewhere in this file prob_comp's result is passed to
    // error_prob_to_qphred, so this is presumably the probability that the
    // normal genotype is NOT max_norm_gt -- confirm against prob_comp.
    const double ngt_prob(prob_comp(normal_pprob.begin(),normal_pprob.end(),max_norm_gt));

    // combine the two terms as (1-(1-a)(1-b)) -> a+b-(ab)
    const double not_somfrom_sum(nonsomatic_sum+ngt_prob-(nonsomatic_sum*ngt_prob));

    rs.snv_from_ntype_qphred=error_prob_to_qphred(not_somfrom_sum);
    rs.ntype=max_norm_gt;
}