// // The method only pre-computes the marginal normal allele-frequency // component of the prior, the prior is expanded to include the tumor // allele frequency during the posterior computation // // // For: // somatic state S // snv noise rate n // snv strand-biased noise fraction x // // ln_csse_rate = log( 1-n ) // ln_strand_sse_rate = log( nx ) // ln_nostrand_sse_rate = log( n(1-x) ) // // grid_normal_lnprior is the mixture of normal diploid probabilities with uniform noise // static void get_prior(const blt_float_t* normal_lnprior, const unsigned ref_gt, const blt_float_t sse_rate, const blt_float_t sseb_fraction, const blt_float_t somatic_normal_noise_rate, const bool is_somatic_normal_noise_rate, std::vector<blt_float_t>& grid_normal_lnprior, std::vector<blt_float_t>& somatic_marginal_lnprior) { get_nostrand_marginal_prior(normal_lnprior,ref_gt,sse_rate,sseb_fraction,grid_normal_lnprior); if(is_somatic_normal_noise_rate) { get_nostrand_marginal_prior(normal_lnprior,ref_gt,somatic_normal_noise_rate,0,somatic_marginal_lnprior); } else { get_nostrand_marginal_prior(normal_lnprior,ref_gt,sse_rate,sseb_fraction,somatic_marginal_lnprior); } const blt_float_t strand_sse_rate(sse_rate*sseb_fraction); // const blt_float_t nostrand_sse_rate(sse_rate-strand_sse_rate); // const blt_float_t ln_csse_rate( log1p_switch(-sse_rate) ); const blt_float_t ln_strand_sse_rate( std::log(strand_sse_rate) ); // const blt_float_t ln_nostrand_sse_rate( std::log(nostrand_sse_rate) ); static const blt_float_t error_mod( -std::log(static_cast<blt_float_t>(DIGT_SGRID::HET_RES*2)) ); // strand noise prior distributions for each allele combination axis: // // weight the prior by the potential originating genotypes: if on // AB axis, with A==ref, we want P(AA+noiseB) as the prior in the // model for the strand error states: the remaining term: // P(AB+noise)+P(BB+noiseA) is enumerated in to caluclate the // prior "throwaway state" -- regions of the stranded error // distribution for which we will approximate the lhood as 0 // static const unsigned n_strand_het_axes(3); blt_float_t strand_axis_prior[n_strand_het_axes]; double throwaway_sum(0); for(unsigned sgt(0); sgt<n_strand_het_axes; ++sgt) { strand_axis_prior[sgt] = normal_lnprior[ref_gt]+ln_one_third; throwaway_sum += std::exp(strand_axis_prior[sgt]); } throwaway_sum /= 2.; // we're only using half-axes for the states represented throwaway_sum = 1. - throwaway_sum; // flaw in using error mod -- because strand state could exist and // be detectable at the canonical gt frequencies, we leave it the // same with the goal of stable performance as the user changes // the fraction term on the command-line: // // TODO: unclear comment? // for(unsigned ngt(DIGT_SGRID::PRESTRAND_SIZE); ngt<(DIGT_SGRID::SIZE); ++ngt) { const unsigned sgt(DIGT_SGRID::get_strand_state(ngt)); grid_normal_lnprior[ngt] = (strand_axis_prior[sgt]+ln_strand_sse_rate+error_mod); } #ifdef SOMATIC_DEBUG throwaway_sum *= std::exp(ln_strand_sse_rate); check_ln_distro(grid_normal_lnprior.begin(), grid_normal_lnprior.end(), "somatic prior", 0.0001, 1.-throwaway_sum); #endif }
static void calculate_result_set(const strelka_options& opt, const double* normal_lnprior, const double lnmatch, const double lnmismatch, const double* normal_lhood, const double* tumor_lhood, result_set& rs) { #ifdef SOMATIC_DEBUG std::vector<double> check_prior(DDIINDEL::SIZE); for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { const double base_prior(normal_lnprior[ngt]); for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { const unsigned dgt(DDIINDEL::get_state(ngt,tgt)); check_prior[dgt] = base_prior+ ((tgt==ngt) ? lnmatch : lnmismatch); } } check_ln_distro(check_prior.begin(), check_prior.end(), "somatic indel full prior"); #endif // get unnormalized posterior: std::vector<double> pprob(DDIINDEL::SIZE); for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { const double base_prior(normal_lnprior[ngt]); for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { const unsigned dgt(DDIINDEL::get_state(ngt,tgt)); pprob[dgt] = normal_lhood[ngt]+ tumor_lhood[tgt]+ base_prior+ ((tgt==ngt) ? lnmatch : lnmismatch); } } normalize_ln_distro(pprob.begin(),pprob.end(),rs.max_gt); #ifdef DEBUG_INDEL_CALL log_os << "INDEL_CALL pprob(noindel),pprob(hom),pprob(het): " << pprob[STAR_DIINDEL::NOINDEL] << " " << pprob[STAR_DIINDEL::HOM] << " " << pprob[STAR_DIINDEL::HET] << "\n"; #endif double nonsomatic_sum(0); for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) { nonsomatic_sum += pprob[DDIINDEL::get_state(gt,gt)]; } rs.sindel_qphred=error_prob_to_qphred(nonsomatic_sum); double not_somfrom_sum[STAR_DIINDEL::SIZE]; for (unsigned sgt(0); sgt<STAR_DIINDEL::SIZE; ++sgt) { not_somfrom_sum[sgt]=nonsomatic_sum; for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { if (sgt==ngt) continue; for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { if (tgt==ngt) continue; not_somfrom_sum[sgt] += pprob[DDIINDEL::get_state(ngt,tgt)]; } } } rs.sindel_from_ref_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::NOINDEL]); rs.sindel_from_het_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::HET]); rs.sindel_from_hom_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::HOM]); double not_somfromanyhom_sum(nonsomatic_sum); for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { if (STAR_DIINDEL::HET != ngt) continue; for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { if (tgt==ngt) continue; not_somfromanyhom_sum += pprob[DDIINDEL::get_state(ngt,tgt)]; } } rs.sindel_from_anyhom_qphred=error_prob_to_qphred(not_somfromanyhom_sum); rs.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob.begin(),pprob.end(),rs.max_gt)); }
// Given the likelihood, go through the final computations to get the // posterior and derived values. // static void calculate_result_set_grid(const blt_float_t* normal_lhood, const blt_float_t* tumor_lhood, const somatic_snv_caller_strand_grid::prior_set& pset, const blt_float_t lnmatch, const blt_float_t lnmismatch, const unsigned /*ref_gt*/, result_set& rs) { // a piece transplanted from 1150 to make a formal correction to // the priors which should have a low-impact on the results. // the prior below is incomplete #ifdef DEBUG_ALTERNATE_PRIOR static const double neginf(-std::numeric_limits<double>::infinity()); std::vector<double> prior(DDIGT_SGRID::SIZE); std::fill(prior.begin(),prior.end(),neginf); // this zero'd code is incomplete and abandoned for now...: #if 0 for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { double base_prior(neginf); const bool is_noise(ngt>=STAR_DIINDEL::SIZE); if(is_noise) { base_prior=pset.normal[ngt]; } else { base_prior=pset.nonoise[ngt]; } for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const blt_float_t tgt_prior_mod( (tgt==ngt) ? lnmatch : lnmismatch ); const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt)); prior[dgt] = normal_genomic_lnprior[ngt]+tgt_prior_mod; } } for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) { const unsigned dgt(DDIGT_SGRID::get_state(gt,gt)); prior[dgt] = normal_genomic_lnprior[gt]+lnmatch; } #endif check_ln_distro(prior.begin(), prior.end(), "somatic snv full prior"); #endif // intentionally use higher float res for this structure: std::vector<double> pprob(DDIGT_SGRID::SIZE); // mult by prior distro to get unnormalized pprob for states in // the regular grid model: // for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt)); #if 0 // the trusty old way...: const blt_float_t tgt_prior_mod( (tgt==ngt) ? lnmatch : lnmismatch ); pprob[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]+pset.normal[ngt]+tgt_prior_mod; #else // unorm takes the role of the normal prior for the somatic case: // static const blt_float_t unorm(std::log(static_cast<blt_float_t>(DIGT_SGRID::PRESTRAND_SIZE))); blt_float_t prior; if(tgt==ngt) { prior=pset.normal[ngt]+lnmatch; } else { prior=pset.somatic_marginal[ngt]+lnmismatch; } pprob[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]+prior; #endif } } // Now add the single-strand noise states. note that these states // are unique in that we don't look for mixtures of somatic // variation with these noise states, b/c single-strand // observations can almost exclusively be ruled out as noise: // for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) { const unsigned dgt(DDIGT_SGRID::get_state(gt,gt)); pprob[dgt] = normal_lhood[gt]+tumor_lhood[gt]+pset.normal[gt]+lnmatch; } opt_normalize_ln_distro(pprob.begin(),pprob.end(),DDIGT_SGRID::is_nonsom.val.begin(),rs.max_gt); //normalize_ln_distro(pprob.begin(),pprob.end(),rs.max_gt); double nonsomatic_sum(0); for(unsigned gt(0); gt<DIGT_SGRID::SIZE; ++gt) { nonsomatic_sum += pprob[DDIGT_SGRID::get_state(gt,gt)]; } rs.snv_qphred=error_prob_to_qphred(nonsomatic_sum); if(0==rs.snv_qphred) return; #if 0 // alternate way to calculate the joint: // double min_not_somfrom_sum(0); for(unsigned dgt(0); dgt<DIGT::SIZE; ++dgt) { double not_somfrom_sum(nonsomatic_sum); for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { // we're looking for the joint prob when state dgt is true // in the normal, so skip this as a normal state here: // if(dgt==ngt) continue; for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { // we've already started from the nonsomatic som, so we can skip the equal states: // if(ngt==tgt) continue; not_somfrom_sum += pprob[DDIGT_SGRID::get_state(ngt,tgt)]; } } if((dgt==0) || (!_somfrom_sum<min_not_somfrom_sum)) { min_not_somfrom_sum=not_somfrom_sum; rs.snv_from_ntype_qphred=error_prob_to_qphred(not_somfrom_sum); rs.ntype=dgt; } } #endif #if 0 // reset max_gt to the most likely state excluding normal noise states: // rs.max_gt=0; for(unsigned dgt(0); dgt<DIGT::SIZE; ++dgt) { for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const unsigned xgt(DDIGT_SGRID::get_state(dgt,tgt)); if(pprob[xgt] > pprob[rs.max_gt]) rs.max_gt=xgt; } } #endif // Calculate normal distribution alone so that we can classify this call: // // Polymorphic prior is used because in this situation we want to // be conservative about the reference classification -- // ie. conditioned on only looking at putative somatic sites, we // require evidence to show that the normal is in fact reference // and not simply an unsampled copy of the somatic variation. // std::vector<double> normal_pprob(DIGT_SGRID::PRESTRAND_SIZE); for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { normal_pprob[ngt] = normal_lhood[ngt]+pset.normal_poly[ngt]; } unsigned max_norm_gt(0); normalize_ln_distro(normal_pprob.begin(),normal_pprob.end(),max_norm_gt); // find the probability of max_norm_gt: const double ngt_prob(prob_comp(normal_pprob.begin(),normal_pprob.end(),max_norm_gt)); // (1-(1-a)(1-b)) -> a+b-(ab) double not_somfrom_sum(nonsomatic_sum+ngt_prob-(nonsomatic_sum*ngt_prob)); rs.snv_from_ntype_qphred=error_prob_to_qphred(not_somfrom_sum); rs.ntype=max_norm_gt; }