void indel_digt_caller:: starling_indel_call_pprob_digt(const starling_options& opt, const starling_deriv_options& dopt, const starling_sample_options& sample_opt, const double indel_error_prob, const double ref_error_prob, const indel_key& ik, const indel_data& id, const bool is_use_alt_indel, starling_diploid_indel& dindel) const { // no immediate plans to include this for regular indel-calling: static const bool is_tier2_pass(false); if (opt.is_noise_indel_filter && is_diploid_indel_noise(dopt,id,is_tier2_pass)) { dindel.is_indel=false; return; } // get likelihood of each genotype: double lhood[STAR_DIINDEL::SIZE]; get_indel_digt_lhood(opt,dopt,sample_opt,indel_error_prob,ref_error_prob,ik,id, opt.is_bindel_diploid_het_bias,opt.bindel_diploid_het_bias, is_tier2_pass,is_use_alt_indel,lhood); // mult by prior distro to get unnormalized pprob: const double* indel_lnprior(lnprior_genomic()); for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) { dindel.pprob[gt] = lhood[gt] + indel_lnprior[gt]; } normalize_ln_distro(dindel.pprob,dindel.pprob+STAR_DIINDEL::SIZE,dindel.max_gt); #ifdef DEBUG_INDEL_CALL log_os << "INDEL_CALL pprob(noindel),pprob(hom),pprob(het): " << dindel.pprob[STAR_DIINDEL::NOINDEL] << " " << dindel.pprob[STAR_DIINDEL::HOM] << " " << dindel.pprob[STAR_DIINDEL::HET] << "\n"; #endif dindel.indel_qphred=error_prob_to_qphred(dindel.pprob[STAR_DIINDEL::NOINDEL]); dindel.max_gt_qphred=error_prob_to_qphred(prob_comp(dindel.pprob,dindel.pprob+STAR_DIINDEL::SIZE,dindel.max_gt)); // add new poly calls: normalize_ln_distro(lhood,lhood+STAR_DIINDEL::SIZE,dindel.max_gt_poly); dindel.max_gt_poly_qphred=error_prob_to_qphred(prob_comp(lhood,lhood+STAR_DIINDEL::SIZE,dindel.max_gt_poly)); // old report criteria: //dindel.is_indel=(dindel.max_gt != STAR_DIINDEL::NOINDEL); dindel.is_indel=(dindel.indel_qphred != 0); }
// Fill the quality-score lookup tables for every qscore in [0,MAX_QSCORE]:
//   q2p[q]        -- phred_to_error_prob(q)
//   q2lncompe[q]  -- log1p_switch(-q2p[q])
//   q2lne[q]      -- q * (-ln(10)/10)
//   mappedq[m][q] -- qphred of phred_to_mapped_error_prob(q,m), m in [0,MAX_MAP]
//
qphred_cache::
qphred_cache() {

    static const double q2lnp(-std::log(10.)/10.);

    // the three per-qscore tables depend only on the qscore itself
    // (q2lncompe reads the q2p entry written just before it):
    for (int qscore(0); qscore<=MAX_QSCORE; ++qscore) {
        const double qval(static_cast<double>(qscore));
        q2p[qscore] = phred_to_error_prob(qval);
        q2lncompe[qscore] = log1p_switch(-q2p[qscore]);
        q2lne[qscore] = qval*q2lnp;
    }

    // basecall/mapping quality combination table -- kept as a separate
    // pass after the tables above are complete:
    for (int qscore(0); qscore<=MAX_QSCORE; ++qscore) {
        for (int mapq(0); mapq<=MAX_MAP; ++mapq) {
            mappedq[mapq][qscore] =
                error_prob_to_qphred(phred_to_mapped_error_prob(qscore,mapq));
        }
    }
}
void write_nonref_2allele_test(const blt_options& opt, const snp_pos_info& pi, const nonref_test_call& nrc, std::ostream& os) { os << nrc.snp_qphred << '\t' << NR2TEST::label(static_cast<NR2TEST::index_t>(nrc.max_gt)) << "_" << id_to_base(nrc.nonref_id) << '\t' << nrc.max_gt_qphred; //if(opt.is_print_used_allele_counts) { pi.print_known_counts(os,opt.used_allele_count_min_qscore); pi.print_known_qscore(os,opt.used_allele_count_min_qscore); //} #if 0 if (opt.is_print_all_poly_gt) { for (unsigned gt(0); gt<DIGT::SIZE; ++gt) { #if 1 // print GT as prob: os << '\t' << po.pprob[gt]; #else // print GT as qval: os << '\t' << error_prob_to_qphred(prob_comp(po.pprob,po.pprob+DIGT::SIZE,gt)); #endif } } const result_set& ge(dgt.genome); const result_set& po(dgt.poly); #endif #if 0 if (nrc.is_freq) { os << std::setprecision(8) << std::fixed; for (unsigned i(0); i<N_BASE; ++i) { os << '\t' << nrc.allele_freq[i]; } os.unsetf(std::ios::fixed); } #endif }
void position_nonref_2allele_test(const snp_pos_info& pi, const blt_options& opt, const bool /*is_always_test*/, nonref_test_call& nrc) { static const bool is_mle_freq(false); if (pi.ref_base=='N') return; // add early escape test here? // 1. Determine the two 'primary' alleles -- Simple test just adds // up qscores to determine which alleles are primary. // nrc.nonref_id=(BASE_ID::ANY); //unsigned nonref2_id(BASE_ID::ANY); // just ignore this value for now.... { double qtot[N_BASE]; for (unsigned i(0); i<N_BASE; ++i) qtot[i] = 0; const unsigned n_calls(pi.calls.size()); for (unsigned i(0); i<n_calls; ++i) { if (pi.calls[i].base_id==BASE_ID::ANY) continue; qtot[pi.calls[i].base_id] += pi.calls[i].get_qscore(); } // get max and max2: unsigned max_id=0; unsigned max2_id=1; for (unsigned b(1); b<N_BASE; ++b) { if (qtot[b] > qtot[max_id]) { max2_id = max_id; max_id = b; } else if (qtot[b] > qtot[max2_id]) { max2_id = b; } } const unsigned ref_id=base_to_id(pi.ref_base); if (ref_id==max_id) { nrc.nonref_id=max2_id; #if 0 } else if (ref_id==max2_id) { nrc.nonref_id=max_id; #endif } else { nrc.nonref_id=max_id; //nonref2_id=max2_id; } } blt_float_t lhood[NR2TEST::SIZE]; lhood[NR2TEST::REF] = calc_pos_nonref_freq_loghood(pi,0.); sparse_function sf; nonref_allele_freq_loghood_sparse_func nlf(pi,nrc.nonref_id,sf); sample_uniform_range(0.,1.,nlf); //sample_uniform_range(min_nonref_freq,1.,nlf); lhood[NR2TEST::NONREF_MF] = integrate_ln_sparsefunc(sf, opt.min_nonref_freq, 1,1,1); lhood[NR2TEST::NONREF_MF_NOISE] = integrate_ln_sparsefunc(sf, 0, opt.nonref_site_error_decay_freq,1,0); static const blt_float_t neginf(-std::numeric_limits<blt_float_t>::infinity()); lhood[NR2TEST::NONREF_OTHER] = neginf; //std::cerr << "WAGART: logh ref/nonef: " << lhood[0] << " " << lhood[1] << "\n"; // TODO: ctor compute this: // this goes in here just in case someone cranks both parameters up near 1: // const double nonref_variant_rate_used = opt.nonref_variant_rate*(1-opt.nonref_site_error_rate); 
blt_float_t prior[NR2TEST::SIZE]; prior[NR2TEST::REF] = log1p_switch(-(nonref_variant_rate_used+opt.nonref_site_error_rate)); prior[NR2TEST::NONREF_MF] = std::log(nonref_variant_rate_used/3); prior[NR2TEST::NONREF_MF_NOISE] = std::log(opt.nonref_site_error_rate); prior[NR2TEST::NONREF_OTHER] = std::log(2*nonref_variant_rate_used/3); double pprob[NR2TEST::SIZE]; for (unsigned i(0); i<NR2TEST::SIZE; ++i) { pprob[i] = lhood[i] + prior[i]; } normalize_ln_distro(pprob,pprob+NR2TEST::SIZE,nrc.max_gt); nrc.snp_qphred=error_prob_to_qphred(pprob[NR2TEST::REF]+pprob[NR2TEST::NONREF_MF_NOISE]); nrc.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob,pprob+NR2TEST::SIZE,nrc.max_gt)); nrc.is_snp=(nrc.snp_qphred != 0); if (! (is_mle_freq && nrc.is_snp)) return; #if 0 const double null_loghood(calc_pos_nonref_freq_loghood(pi,0.)); // heuristic to escape early: static const double p_delta(0.001); const double delta_loghood(calc_pos_nonref_freq_loghood(pi,p_delta)); if (null_loghood > delta_loghood) return; double x_nonref_freq; double x_loghood; position_nonref_freq_loghood_minfunc mf(epi); static const double x1(0.5); static const double x2(0.4); codemin::minimize_1d(x1,x2,mf.val(x1),mf,x_nonref_freq,x_loghood); x_nonref_freq = mf.arg_to_prob(x_nonref_freq); const double log_lrt(-2.*(x_loghood+null_loghood)); // becuase null has the parameter fixed to a boundary value, the // asymmtotic distribution is a 50:50 mixture of csq(0) and chq(1) // -- the same effect as multiplying alpha of csq(1) by 2, dividing // the null prob by 2. (as we do below): boost::math::chi_squared dist(1); const double null_prob((1.-boost::math::cdf(dist,log_lrt))/2.); sc.is_snp=(null_prob<alpha); sc.null_loghood=null_loghood; sc.min_test_loghood=-x_loghood; sc.snp_prob=1.-null_prob; // if it's a snp then get additional information on non-reference // allele frequencies. 
// if (not sc.is_snp) return; static const double line_tol(1e-7); static const double start_ratio(0.05); static const double min_start_dist(1e-6); static const double end_tol(1e-7); static const unsigned max_iter(200); const unsigned ref_base_id(base_to_id(pi.ref_base)); const double ref_freq(1.-x_nonref_freq); const double nonref_freq((x_nonref_freq)/3.); for (unsigned i(0); i<N_BASE; ++i) { if (i==ref_base_id) sc.allele_freq[i] = ref_freq; else sc.allele_freq[i] = nonref_freq; } static const unsigned N_BASE2(N_BASE*N_BASE); double conj_dir[N_BASE2]; std::fill(conj_dir,conj_dir+N_BASE2,0.); for (unsigned i(0); i<N_BASE; ++i) { const double start_dist( std::max(std::fabs(sc.allele_freq[i]*start_ratio),min_start_dist) ); conj_dir[i*(N_BASE+1)] = start_dist; } double start_tol(end_tol); unsigned iter; double x_all_loghood; double final_dlh; position_allele_distro_loghood_minfunc alm(epi); codemin::minimize_conj_direction(sc.allele_freq,conj_dir,alm,start_tol,end_tol,line_tol, x_all_loghood,iter,final_dlh,max_iter); alm.arg_to_prob(sc.allele_freq,sc.allele_freq); sc.min_loghood=-x_all_loghood; #endif }
static void calculate_result_set(const strelka_options& opt, const double* normal_lnprior, const double lnmatch, const double lnmismatch, const double* normal_lhood, const double* tumor_lhood, result_set& rs) { #ifdef SOMATIC_DEBUG std::vector<double> check_prior(DDIINDEL::SIZE); for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { const double base_prior(normal_lnprior[ngt]); for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { const unsigned dgt(DDIINDEL::get_state(ngt,tgt)); check_prior[dgt] = base_prior+ ((tgt==ngt) ? lnmatch : lnmismatch); } } check_ln_distro(check_prior.begin(), check_prior.end(), "somatic indel full prior"); #endif // get unnormalized posterior: std::vector<double> pprob(DDIINDEL::SIZE); for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { const double base_prior(normal_lnprior[ngt]); for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { const unsigned dgt(DDIINDEL::get_state(ngt,tgt)); pprob[dgt] = normal_lhood[ngt]+ tumor_lhood[tgt]+ base_prior+ ((tgt==ngt) ? 
lnmatch : lnmismatch); } } normalize_ln_distro(pprob.begin(),pprob.end(),rs.max_gt); #ifdef DEBUG_INDEL_CALL log_os << "INDEL_CALL pprob(noindel),pprob(hom),pprob(het): " << pprob[STAR_DIINDEL::NOINDEL] << " " << pprob[STAR_DIINDEL::HOM] << " " << pprob[STAR_DIINDEL::HET] << "\n"; #endif double nonsomatic_sum(0); for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) { nonsomatic_sum += pprob[DDIINDEL::get_state(gt,gt)]; } rs.sindel_qphred=error_prob_to_qphred(nonsomatic_sum); double not_somfrom_sum[STAR_DIINDEL::SIZE]; for (unsigned sgt(0); sgt<STAR_DIINDEL::SIZE; ++sgt) { not_somfrom_sum[sgt]=nonsomatic_sum; for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { if (sgt==ngt) continue; for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { if (tgt==ngt) continue; not_somfrom_sum[sgt] += pprob[DDIINDEL::get_state(ngt,tgt)]; } } } rs.sindel_from_ref_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::NOINDEL]); rs.sindel_from_het_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::HET]); rs.sindel_from_hom_qphred=error_prob_to_qphred(not_somfrom_sum[STAR_DIINDEL::HOM]); double not_somfromanyhom_sum(nonsomatic_sum); for (unsigned ngt(0); ngt<STAR_DIINDEL::SIZE; ++ngt) { if (STAR_DIINDEL::HET != ngt) continue; for (unsigned tgt(0); tgt<STAR_DIINDEL::SIZE; ++tgt) { if (tgt==ngt) continue; not_somfromanyhom_sum += pprob[DDIINDEL::get_state(ngt,tgt)]; } } rs.sindel_from_anyhom_qphred=error_prob_to_qphred(not_somfromanyhom_sum); rs.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob.begin(),pprob.end(),rs.max_gt)); }
// Given the likelihood, go through the final computations to get the // posterior and derived values. // static void calculate_result_set_grid(const blt_float_t* normal_lhood, const blt_float_t* tumor_lhood, const somatic_snv_caller_strand_grid::prior_set& pset, const blt_float_t lnmatch, const blt_float_t lnmismatch, const unsigned /*ref_gt*/, result_set& rs) { // a piece transplanted from 1150 to make a formal correction to // the priors which should have a low-impact on the results. // the prior below is incomplete #ifdef DEBUG_ALTERNATE_PRIOR static const double neginf(-std::numeric_limits<double>::infinity()); std::vector<double> prior(DDIGT_SGRID::SIZE); std::fill(prior.begin(),prior.end(),neginf); // this zero'd code is incomplete and abandoned for now...: #if 0 for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { double base_prior(neginf); const bool is_noise(ngt>=STAR_DIINDEL::SIZE); if(is_noise) { base_prior=pset.normal[ngt]; } else { base_prior=pset.nonoise[ngt]; } for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const blt_float_t tgt_prior_mod( (tgt==ngt) ? lnmatch : lnmismatch ); const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt)); prior[dgt] = normal_genomic_lnprior[ngt]+tgt_prior_mod; } } for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) { const unsigned dgt(DDIGT_SGRID::get_state(gt,gt)); prior[dgt] = normal_genomic_lnprior[gt]+lnmatch; } #endif check_ln_distro(prior.begin(), prior.end(), "somatic snv full prior"); #endif // intentionally use higher float res for this structure: std::vector<double> pprob(DDIGT_SGRID::SIZE); // mult by prior distro to get unnormalized pprob for states in // the regular grid model: // for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt)); #if 0 // the trusty old way...: const blt_float_t tgt_prior_mod( (tgt==ngt) ? 
lnmatch : lnmismatch ); pprob[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]+pset.normal[ngt]+tgt_prior_mod; #else // unorm takes the role of the normal prior for the somatic case: // static const blt_float_t unorm(std::log(static_cast<blt_float_t>(DIGT_SGRID::PRESTRAND_SIZE))); blt_float_t prior; if(tgt==ngt) { prior=pset.normal[ngt]+lnmatch; } else { prior=pset.somatic_marginal[ngt]+lnmismatch; } pprob[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]+prior; #endif } } // Now add the single-strand noise states. note that these states // are unique in that we don't look for mixtures of somatic // variation with these noise states, b/c single-strand // observations can almost exclusively be ruled out as noise: // for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) { const unsigned dgt(DDIGT_SGRID::get_state(gt,gt)); pprob[dgt] = normal_lhood[gt]+tumor_lhood[gt]+pset.normal[gt]+lnmatch; } opt_normalize_ln_distro(pprob.begin(),pprob.end(),DDIGT_SGRID::is_nonsom.val.begin(),rs.max_gt); //normalize_ln_distro(pprob.begin(),pprob.end(),rs.max_gt); double nonsomatic_sum(0); for(unsigned gt(0); gt<DIGT_SGRID::SIZE; ++gt) { nonsomatic_sum += pprob[DDIGT_SGRID::get_state(gt,gt)]; } rs.snv_qphred=error_prob_to_qphred(nonsomatic_sum); if(0==rs.snv_qphred) return; #if 0 // alternate way to calculate the joint: // double min_not_somfrom_sum(0); for(unsigned dgt(0); dgt<DIGT::SIZE; ++dgt) { double not_somfrom_sum(nonsomatic_sum); for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { // we're looking for the joint prob when state dgt is true // in the normal, so skip this as a normal state here: // if(dgt==ngt) continue; for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { // we've already started from the nonsomatic som, so we can skip the equal states: // if(ngt==tgt) continue; not_somfrom_sum += pprob[DDIGT_SGRID::get_state(ngt,tgt)]; } } if((dgt==0) || (!_somfrom_sum<min_not_somfrom_sum)) { min_not_somfrom_sum=not_somfrom_sum; 
rs.snv_from_ntype_qphred=error_prob_to_qphred(not_somfrom_sum); rs.ntype=dgt; } } #endif #if 0 // reset max_gt to the most likely state excluding normal noise states: // rs.max_gt=0; for(unsigned dgt(0); dgt<DIGT::SIZE; ++dgt) { for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const unsigned xgt(DDIGT_SGRID::get_state(dgt,tgt)); if(pprob[xgt] > pprob[rs.max_gt]) rs.max_gt=xgt; } } #endif // Calculate normal distribution alone so that we can classify this call: // // Polymorphic prior is used because in this situation we want to // be conservative about the reference classification -- // ie. conditioned on only looking at putative somatic sites, we // require evidence to show that the normal is in fact reference // and not simply an unsampled copy of the somatic variation. // std::vector<double> normal_pprob(DIGT_SGRID::PRESTRAND_SIZE); for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { normal_pprob[ngt] = normal_lhood[ngt]+pset.normal_poly[ngt]; } unsigned max_norm_gt(0); normalize_ln_distro(normal_pprob.begin(),normal_pprob.end(),max_norm_gt); // find the probability of max_norm_gt: const double ngt_prob(prob_comp(normal_pprob.begin(),normal_pprob.end(),max_norm_gt)); // (1-(1-a)(1-b)) -> a+b-(ab) double not_somfrom_sum(nonsomatic_sum+ngt_prob-(nonsomatic_sum*ngt_prob)); rs.snv_from_ntype_qphred=error_prob_to_qphred(not_somfrom_sum); rs.ntype=max_norm_gt; }