Esempio n. 1
0
// Populate the quality-score lookup tables.
//
// For each basecall qscore q in [0,MAX_QSCORE]:
//   q2p[q]       = phred_to_error_prob(q)
//   q2lncompe[q] = log1p_switch(-q2p[q])
//   q2lne[q]     = q * (-ln(10)/10)
// For each mapping score m in [0,MAX_MAP]:
//   mappedq[m][q] = error_prob_to_qphred(phred_to_mapped_error_prob(q,m))
//
qphred_cache::
qphred_cache()
{
    // multiplier applied to a qscore to fill q2lne
    static const double ln_per_qscore(-std::log(10.)/10.);

    for (int qscore(0); qscore<=MAX_QSCORE; ++qscore)
    {
        q2p[qscore] = phred_to_error_prob(static_cast<double>(qscore));
        // read the value back from the table so that any narrowing done by
        // the table's element type is applied before negation
        q2lncompe[qscore] = log1p_switch(-q2p[qscore]);
        q2lne[qscore] = static_cast<double>(qscore)*ln_per_qscore;
    }

    // combined basecall/mapping quality table, indexed [mapping][basecall]
    for (int qscore(0); qscore<=MAX_QSCORE; ++qscore)
    {
        for (int mapq(0); mapq<=MAX_MAP; ++mapq)
        {
            const double joint_error_prob(phred_to_mapped_error_prob(qscore,mapq));
            mappedq[mapq][qscore] = error_prob_to_qphred(joint_error_prob);
        }
    }
}
// Construct the caller from the run options and a diploid genotype caller.
//
// Sets _ln_som_match / _ln_som_mismatch from opt.somatic_snv_rate, then
// fills one prior_set per index in [0,N_BASE] by calling get_prior twice:
// once with pd_caller's genomic log prior and once with its polymorphic
// log prior.
//
somatic_snv_caller_strand_grid::
somatic_snv_caller_strand_grid(const strelka_options& opt,
                               const pprob_digt_caller& pd_caller)
    : _opt(opt)
{
    // somatic_snv_rate is divided evenly over all pre-strand grid states but one
    const blt_float_t n_other_states(static_cast<blt_float_t>((DIGT_SGRID::PRESTRAND_SIZE)-1));

    _ln_som_match = log1p_switch(-opt.somatic_snv_rate);
    _ln_som_mismatch = std::log(opt.somatic_snv_rate/n_other_states);

    // phase 1: zero the normal-sample priors of every set before any
    // get_prior call is made
    for (unsigned ref_idx(0); ref_idx<(N_BASE+1); ++ref_idx) {
        prior_set& priors(_lnprior[ref_idx]);
        std::fill(priors.normal.begin(),priors.normal.end(),0);
        std::fill(priors.normal_poly.begin(),priors.normal_poly.end(),0);
    }

    // phase 2: compute the genomic and polymorphic variants of each set
    for (unsigned ref_idx(0); ref_idx<(N_BASE+1); ++ref_idx) {
        prior_set& priors(_lnprior[ref_idx]);

        get_prior(pd_caller.lnprior_genomic(ref_idx),ref_idx,
                  opt.shared_site_error_rate,
                  opt.shared_site_error_strand_bias_fraction,
                  opt.site_somatic_normal_noise_rate,
                  opt.is_site_somatic_normal_noise_rate,
                  priors.normal,
                  priors.somatic_marginal);

        get_prior(pd_caller.lnprior_polymorphic(ref_idx),ref_idx,
                  opt.shared_site_error_rate,
                  opt.shared_site_error_strand_bias_fraction,
                  opt.site_somatic_normal_noise_rate,
                  opt.is_site_somatic_normal_noise_rate,
                  priors.normal_poly,
                  priors.somatic_marginal_poly);
    }
}
// Construct the somatic indel caller.
//
// Keeps a reference to the diploid indel caller and derives the two
// log-scale constants _ln_som_match and _ln_som_mismatch from
// opt.somatic_indel_rate.
//
somatic_indel_caller::
somatic_indel_caller(const strelka_options& opt,
                     const indel_digt_caller& in_caller)
    : _in_caller(in_caller)
{
    // somatic_indel_rate is divided evenly over all indel states but one
    const double n_other_states(static_cast<double>(STAR_DIINDEL::SIZE-1));

    _ln_som_match = log1p_switch(-opt.somatic_indel_rate);
    _ln_som_mismatch = std::log(opt.somatic_indel_rate/n_other_states);
}
// Initialize the diploid indel genotype priors. Both tables hold
// natural-log probabilities.
//
// theta - rate used to build the genomic prior:
//           P(HET)     = theta
//           P(HOM)     = theta/2
//           P(NOINDEL) = 1 - 3*theta/2
//
// The polymorphic prior is the fixed distribution
// P(NOINDEL)=0.25, P(HOM)=0.25, P(HET)=0.5.
//
indel_digt_caller::
indel_digt_caller(const double theta) {

    _lnprior_genomic[STAR_DIINDEL::NOINDEL]=log1p_switch(-(3.*theta)/2.);
    _lnprior_genomic[STAR_DIINDEL::HOM]=std::log(theta/2.);
    _lnprior_genomic[STAR_DIINDEL::HET]=std::log(theta);

    // These must be stored in log space like the genomic prior above;
    // storing the raw probabilities would yield positive "log priors".
    _lnprior_polymorphic[STAR_DIINDEL::NOINDEL]=std::log(0.25);
    _lnprior_polymorphic[STAR_DIINDEL::HOM]=std::log(0.25);
    _lnprior_polymorphic[STAR_DIINDEL::HET]=std::log(0.5);
}
// compute just the non-strand-bias portion of the normal marginal
// prior given p(signal), p(no-strand noise), p(strand-bias noise)
//
static
void
get_nostrand_marginal_prior(const blt_float_t* normal_lnprior,
                            const unsigned ref_gt,
                            const blt_float_t sse_rate,
                            const blt_float_t sseb_fraction,
                            std::vector<blt_float_t>& grid_normal_lnprior) {

    const blt_float_t strand_sse_rate(sse_rate*sseb_fraction);
    const blt_float_t nostrand_sse_rate(sse_rate-strand_sse_rate);

    const blt_float_t ln_csse_rate( log1p_switch(-sse_rate) );
    //    const blt_float_t ln_strand_sse_rate( std::log(strand_sse_rate) );
    const blt_float_t ln_nostrand_sse_rate( std::log(nostrand_sse_rate) );

    // fill in normal sample prior for canonical diploid allele frequencies:
    for(unsigned ngt(0); ngt<DIGT::SIZE; ++ngt) {
        grid_normal_lnprior[ngt] = (normal_lnprior[ngt]+ln_csse_rate);
    }

    // nostrand noise prior distributions for each allele combination axis:
    //
    // weight the prior by the potential originating genotypes:
    // if on AB axis, we want P(AA+noiseB)+P(AB+noise)+P(BB+noiseA)
    // so we have P(AA)*error_prob /3 + P(AB)*error_prob + P(BB)*error_prob/3
    //
    static const unsigned n_het_axes(6);
    blt_float_t nostrand_axis_prior[n_het_axes];
    for(unsigned ngt(N_BASE); ngt<DIGT::SIZE; ++ngt) {
        const unsigned axis_id(ngt-N_BASE);
        nostrand_axis_prior[axis_id] = normal_lnprior[ngt];
        // get the two associated homs:
        for(unsigned b(0); b<N_BASE; ++b) {
            if(DIGT::expect2(b,ngt)<=0) continue;
            nostrand_axis_prior[axis_id] = log_sum(nostrand_axis_prior[axis_id],
                                                   normal_lnprior[b]+ln_one_third);
        }
    }

    static const blt_float_t error_mod( -std::log(static_cast<blt_float_t>(DIGT_SGRID::HET_RES*2)) );

    // fill in normal sample prior for 'noise' frequencies:
    for(unsigned ngt(DIGT::SIZE); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
        // 'ngt2' is the root diploid state corresponding to noise
        // state 'ngt'
        const unsigned ngt2(DIGT_SGRID::get_digt_state(ngt,ref_gt));
        assert(ngt2>=N_BASE);
        const unsigned axis_id(ngt2-N_BASE);
        grid_normal_lnprior[ngt] = (nostrand_axis_prior[axis_id]+ln_nostrand_sse_rate+error_mod);
        //        grid_normal_lnprior[ngt] = (normal_lnprior[ngt2]+ln_sse_rate+error_mod);
    }
}
// Test one site for a non-reference allele using a model with the
// reference base plus a single candidate non-reference base, and write the
// outcome into nrc.
//
// Steps performed:
//   1. choose the candidate non-reference base from per-base qscore totals
//   2. compute a log-likelihood for each NR2TEST state
//   3. add log priors derived from opt, normalize, and convert the results
//      to qphred values
//
// pi  - site data; pi.ref_base and pi.calls are read. If pi.ref_base is
//       'N' the function returns immediately without touching nrc.
// opt - supplies min_nonref_freq, nonref_site_error_decay_freq,
//       nonref_variant_rate and nonref_site_error_rate
// (unnamed bool) - unused
// nrc - output: nonref_id, snp_qphred, max_gt_qphred and is_snp are
//       assigned here; max_gt is passed to normalize_ln_distro and
//       prob_comp (presumably filled by the former -- confirm against its
//       definition)
//
void
position_nonref_2allele_test(const snp_pos_info& pi,
                             const blt_options& opt,
                             const bool /*is_always_test*/,
                             nonref_test_call& nrc) {

    // compile-time switch for the frequency-estimation tail of this
    // function; it is false, so the function always returns at the check
    // placed just before the disabled '#if 0' region below
    static const bool is_mle_freq(false);

    if (pi.ref_base=='N') return;

    // add early escape test here?

    // 1. Determine the two 'primary' alleles -- Simple test just adds
    // up qscores to determine which alleles are primary.
    //
    nrc.nonref_id=(BASE_ID::ANY);
    //unsigned nonref2_id(BASE_ID::ANY); // just ignore this value for now....
    {
        // per-base sum of qscores over all calls with a known base
        double qtot[N_BASE];
        for (unsigned i(0); i<N_BASE; ++i) qtot[i] = 0;

        const unsigned n_calls(pi.calls.size());
        for (unsigned i(0); i<n_calls; ++i) {
            if (pi.calls[i].base_id==BASE_ID::ANY) continue;
            qtot[pi.calls[i].base_id] += pi.calls[i].get_qscore();
        }

        // get max and max2:
        // (strict '>' comparisons, so ties keep the lower base index)
        unsigned max_id=0;
        unsigned max2_id=1;
        for (unsigned b(1); b<N_BASE; ++b) {
            if (qtot[b] > qtot[max_id]) {
                max2_id = max_id;
                max_id = b;
            } else if (qtot[b] > qtot[max2_id]) {
                max2_id = b;
            }
        }

        // if the reference base has the highest total, the candidate is the
        // runner-up; otherwise the candidate is the highest-scoring base
        const unsigned ref_id=base_to_id(pi.ref_base);
        if       (ref_id==max_id) {
            nrc.nonref_id=max2_id;

#if 0
        } else if (ref_id==max2_id) {
            nrc.nonref_id=max_id;
#endif
        } else {
            nrc.nonref_id=max_id;
            //nonref2_id=max2_id;
        }
    }

    // 2. log-likelihood of each NR2TEST state
    blt_float_t lhood[NR2TEST::SIZE];

    // reference state: likelihood evaluated with a frequency argument of 0
    lhood[NR2TEST::REF] = calc_pos_nonref_freq_loghood(pi,0.);

    // sample the candidate allele's likelihood function over [0,1] into sf
    sparse_function sf;
    nonref_allele_freq_loghood_sparse_func nlf(pi,nrc.nonref_id,sf);
    sample_uniform_range(0.,1.,nlf);
    //sample_uniform_range(min_nonref_freq,1.,nlf);

    // both non-reference states integrate the same sampled function with
    // different range arguments (min_nonref_freq..1 vs
    // 0..nonref_site_error_decay_freq); the meaning of the two trailing
    // arguments is defined by integrate_ln_sparsefunc -- TODO confirm
    lhood[NR2TEST::NONREF_MF] = integrate_ln_sparsefunc(sf, opt.min_nonref_freq, 1,1,1);
    lhood[NR2TEST::NONREF_MF_NOISE] = integrate_ln_sparsefunc(sf, 0, opt.nonref_site_error_decay_freq,1,0);

    // the NONREF_OTHER state is given a log-likelihood of -infinity
    static const blt_float_t neginf(-std::numeric_limits<blt_float_t>::infinity());
    lhood[NR2TEST::NONREF_OTHER] = neginf;

    //std::cerr << "WAGART: logh ref/nonef: " << lhood[0] << " " << lhood[1] << "\n";

    // TODO: ctor compute this:

    // this goes in here just in case someone cranks both parameters up near 1:
    //
    const double nonref_variant_rate_used = opt.nonref_variant_rate*(1-opt.nonref_site_error_rate);

    // 3. log priors: the variant rate is split 1/3 to the candidate base
    // (NONREF_MF) and 2/3 to NONREF_OTHER
    blt_float_t prior[NR2TEST::SIZE];
    prior[NR2TEST::REF] = log1p_switch(-(nonref_variant_rate_used+opt.nonref_site_error_rate));
    prior[NR2TEST::NONREF_MF] = std::log(nonref_variant_rate_used/3);
    prior[NR2TEST::NONREF_MF_NOISE] = std::log(opt.nonref_site_error_rate);
    prior[NR2TEST::NONREF_OTHER] = std::log(2*nonref_variant_rate_used/3);

    // unnormalized log posterior = log-likelihood + log prior
    double pprob[NR2TEST::SIZE];
    for (unsigned i(0); i<NR2TEST::SIZE; ++i) {
        pprob[i] = lhood[i] + prior[i];
    }
    normalize_ln_distro(pprob,pprob+NR2TEST::SIZE,nrc.max_gt);

    // NOTE(review): pprob entries are summed directly below, so
    // normalize_ln_distro presumably leaves linear-scale probabilities in
    // pprob -- confirm against its definition
    nrc.snp_qphred=error_prob_to_qphred(pprob[NR2TEST::REF]+pprob[NR2TEST::NONREF_MF_NOISE]);
    nrc.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob,pprob+NR2TEST::SIZE,nrc.max_gt));

    nrc.is_snp=(nrc.snp_qphred != 0);

    // is_mle_freq is false, so this condition is always true
    if (! (is_mle_freq && nrc.is_snp)) return;

    // Everything below is compiled out. It uses names such as 'epi', 'sc'
    // and 'alpha' that are not declared in this function.
#if 0
    const double null_loghood(calc_pos_nonref_freq_loghood(pi,0.));

    // heuristic to escape early:
    static const double p_delta(0.001);
    const double delta_loghood(calc_pos_nonref_freq_loghood(pi,p_delta));
    if (null_loghood > delta_loghood) return;

    double x_nonref_freq;
    double x_loghood;

    position_nonref_freq_loghood_minfunc mf(epi);

    static const double x1(0.5);
    static const double x2(0.4);
    codemin::minimize_1d(x1,x2,mf.val(x1),mf,x_nonref_freq,x_loghood);

    x_nonref_freq = mf.arg_to_prob(x_nonref_freq);

    const double log_lrt(-2.*(x_loghood+null_loghood));

    // because null has the parameter fixed to a boundary value, the
    // asymptotic distribution is a 50:50 mixture of csq(0) and chq(1)
    // -- the same effect as multiplying alpha of csq(1) by 2, dividing
    // the null prob by 2. (as we do below):
    boost::math::chi_squared dist(1);
    const double null_prob((1.-boost::math::cdf(dist,log_lrt))/2.);

    sc.is_snp=(null_prob<alpha);
    sc.null_loghood=null_loghood;
    sc.min_test_loghood=-x_loghood;
    sc.snp_prob=1.-null_prob;

    // if it's a snp then get additional information on non-reference
    // allele frequencies.
    //
    if (not sc.is_snp) return;

    static const double line_tol(1e-7);
    static const double start_ratio(0.05);
    static const double min_start_dist(1e-6);
    static const double end_tol(1e-7);
    static const unsigned max_iter(200);

    const unsigned ref_base_id(base_to_id(pi.ref_base));

    const double ref_freq(1.-x_nonref_freq);
    const double nonref_freq((x_nonref_freq)/3.);
    for (unsigned i(0); i<N_BASE; ++i) {
        if (i==ref_base_id) sc.allele_freq[i] = ref_freq;
        else               sc.allele_freq[i] = nonref_freq;
    }

    static const unsigned N_BASE2(N_BASE*N_BASE);
    double conj_dir[N_BASE2];
    std::fill(conj_dir,conj_dir+N_BASE2,0.);
    for (unsigned i(0); i<N_BASE; ++i) {
        const double start_dist( std::max(std::fabs(sc.allele_freq[i]*start_ratio),min_start_dist) );
        conj_dir[i*(N_BASE+1)] = start_dist;
    }

    double start_tol(end_tol);
    unsigned iter;
    double x_all_loghood;
    double final_dlh;
    position_allele_distro_loghood_minfunc alm(epi);
    codemin::minimize_conj_direction(sc.allele_freq,conj_dir,alm,start_tol,end_tol,line_tol,
                                     x_all_loghood,iter,final_dlh,max_iter);
    alm.arg_to_prob(sc.allele_freq,sc.allele_freq);

    sc.min_loghood=-x_all_loghood;
#endif
}