/// Build a haplotype candidate from the read bases surrounding a pileup base.
///
/// A window of FLANK_SIZE positions on each side of \p offset is encoded into
/// _bq. Window slots that fall before the start or past the end of the read
/// are set to 0, as are 'N' bases. Every other base is stored as its quality
/// shifted left by QUAL_SHIFT, OR'd with its base id. _total_qual accumulates
/// the qualities of all window positions that lie inside the read.
///
/// \param read_seq   read sequence
/// \param init_qual  base qualities, indexed by read position
/// \param offset     the offset into read of the pileup base
hap_cand::
hap_cand(const bam_seq_base& read_seq,
         const uint8_t* init_qual,
         const int offset)  // the offset into read of the pileup base
    : _total_qual(0)
{
    const int read_len(read_seq.size());
    assert((offset>=0) && (offset<read_len));

    // unclipped window, [window_begin,window_end):
    const int window_begin(offset-FLANK_SIZE);
    const int window_end(offset+FLANK_SIZE+1);

    // number of window slots hanging off either end of the read:
    const int left_pad( (window_begin<0) ? -window_begin : 0 );
    const int right_pad( (window_end>read_len) ? (window_end-read_len) : 0 );

    // window clipped to the read, [read_begin,read_end):
    const int read_begin(std::max(window_begin,0));
    const int read_end(std::min(window_end,read_len));

    // leading slots with no read base behind them:
    int slot(0);
    while (slot<left_pad) {
        _bq[slot] = 0;
        ++slot;
    }

    // slots backed by read bases:
    for (int pos(read_begin); pos<read_end; ++pos, ++slot) {
        _total_qual += init_qual[pos];
        const char base(read_seq.get_char(pos));
        if (base=='N') {
            _bq[slot] = 0;
        } else {
            _bq[slot] = (init_qual[pos]<<QUAL_SHIFT | base_to_id(base));
        }
    }

    // trailing slots with no read base behind them:
    const int tail_begin(left_pad+(read_end-read_begin));
    const int tail_end(tail_begin+right_pad);
    for (int tail(tail_begin); tail<tail_end; ++tail) {
        _bq[tail] = 0;
    }
}
void position_nonref_2allele_test(const snp_pos_info& pi, const blt_options& opt, const bool /*is_always_test*/, nonref_test_call& nrc) { static const bool is_mle_freq(false); if (pi.ref_base=='N') return; // add early escape test here? // 1. Determine the two 'primary' alleles -- Simple test just adds // up qscores to determine which alleles are primary. // nrc.nonref_id=(BASE_ID::ANY); //unsigned nonref2_id(BASE_ID::ANY); // just ignore this value for now.... { double qtot[N_BASE]; for (unsigned i(0); i<N_BASE; ++i) qtot[i] = 0; const unsigned n_calls(pi.calls.size()); for (unsigned i(0); i<n_calls; ++i) { if (pi.calls[i].base_id==BASE_ID::ANY) continue; qtot[pi.calls[i].base_id] += pi.calls[i].get_qscore(); } // get max and max2: unsigned max_id=0; unsigned max2_id=1; for (unsigned b(1); b<N_BASE; ++b) { if (qtot[b] > qtot[max_id]) { max2_id = max_id; max_id = b; } else if (qtot[b] > qtot[max2_id]) { max2_id = b; } } const unsigned ref_id=base_to_id(pi.ref_base); if (ref_id==max_id) { nrc.nonref_id=max2_id; #if 0 } else if (ref_id==max2_id) { nrc.nonref_id=max_id; #endif } else { nrc.nonref_id=max_id; //nonref2_id=max2_id; } } blt_float_t lhood[NR2TEST::SIZE]; lhood[NR2TEST::REF] = calc_pos_nonref_freq_loghood(pi,0.); sparse_function sf; nonref_allele_freq_loghood_sparse_func nlf(pi,nrc.nonref_id,sf); sample_uniform_range(0.,1.,nlf); //sample_uniform_range(min_nonref_freq,1.,nlf); lhood[NR2TEST::NONREF_MF] = integrate_ln_sparsefunc(sf, opt.min_nonref_freq, 1,1,1); lhood[NR2TEST::NONREF_MF_NOISE] = integrate_ln_sparsefunc(sf, 0, opt.nonref_site_error_decay_freq,1,0); static const blt_float_t neginf(-std::numeric_limits<blt_float_t>::infinity()); lhood[NR2TEST::NONREF_OTHER] = neginf; //std::cerr << "WAGART: logh ref/nonef: " << lhood[0] << " " << lhood[1] << "\n"; // TODO: ctor compute this: // this goes in here just in case someone cranks both parameters up near 1: // const double nonref_variant_rate_used = opt.nonref_variant_rate*(1-opt.nonref_site_error_rate); 
blt_float_t prior[NR2TEST::SIZE]; prior[NR2TEST::REF] = log1p_switch(-(nonref_variant_rate_used+opt.nonref_site_error_rate)); prior[NR2TEST::NONREF_MF] = std::log(nonref_variant_rate_used/3); prior[NR2TEST::NONREF_MF_NOISE] = std::log(opt.nonref_site_error_rate); prior[NR2TEST::NONREF_OTHER] = std::log(2*nonref_variant_rate_used/3); double pprob[NR2TEST::SIZE]; for (unsigned i(0); i<NR2TEST::SIZE; ++i) { pprob[i] = lhood[i] + prior[i]; } normalize_ln_distro(pprob,pprob+NR2TEST::SIZE,nrc.max_gt); nrc.snp_qphred=error_prob_to_qphred(pprob[NR2TEST::REF]+pprob[NR2TEST::NONREF_MF_NOISE]); nrc.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob,pprob+NR2TEST::SIZE,nrc.max_gt)); nrc.is_snp=(nrc.snp_qphred != 0); if (! (is_mle_freq && nrc.is_snp)) return; #if 0 const double null_loghood(calc_pos_nonref_freq_loghood(pi,0.)); // heuristic to escape early: static const double p_delta(0.001); const double delta_loghood(calc_pos_nonref_freq_loghood(pi,p_delta)); if (null_loghood > delta_loghood) return; double x_nonref_freq; double x_loghood; position_nonref_freq_loghood_minfunc mf(epi); static const double x1(0.5); static const double x2(0.4); codemin::minimize_1d(x1,x2,mf.val(x1),mf,x_nonref_freq,x_loghood); x_nonref_freq = mf.arg_to_prob(x_nonref_freq); const double log_lrt(-2.*(x_loghood+null_loghood)); // becuase null has the parameter fixed to a boundary value, the // asymmtotic distribution is a 50:50 mixture of csq(0) and chq(1) // -- the same effect as multiplying alpha of csq(1) by 2, dividing // the null prob by 2. (as we do below): boost::math::chi_squared dist(1); const double null_prob((1.-boost::math::cdf(dist,log_lrt))/2.); sc.is_snp=(null_prob<alpha); sc.null_loghood=null_loghood; sc.min_test_loghood=-x_loghood; sc.snp_prob=1.-null_prob; // if it's a snp then get additional information on non-reference // allele frequencies. 
// if (not sc.is_snp) return; static const double line_tol(1e-7); static const double start_ratio(0.05); static const double min_start_dist(1e-6); static const double end_tol(1e-7); static const unsigned max_iter(200); const unsigned ref_base_id(base_to_id(pi.ref_base)); const double ref_freq(1.-x_nonref_freq); const double nonref_freq((x_nonref_freq)/3.); for (unsigned i(0); i<N_BASE; ++i) { if (i==ref_base_id) sc.allele_freq[i] = ref_freq; else sc.allele_freq[i] = nonref_freq; } static const unsigned N_BASE2(N_BASE*N_BASE); double conj_dir[N_BASE2]; std::fill(conj_dir,conj_dir+N_BASE2,0.); for (unsigned i(0); i<N_BASE; ++i) { const double start_dist( std::max(std::fabs(sc.allele_freq[i]*start_ratio),min_start_dist) ); conj_dir[i*(N_BASE+1)] = start_dist; } double start_tol(end_tol); unsigned iter; double x_all_loghood; double final_dlh; position_allele_distro_loghood_minfunc alm(epi); codemin::minimize_conj_direction(sc.allele_freq,conj_dir,alm,start_tol,end_tol,line_tol, x_all_loghood,iter,final_dlh,max_iter); alm.arg_to_prob(sc.allele_freq,sc.allele_freq); sc.min_loghood=-x_all_loghood; #endif }
void somatic_snv_caller_strand_grid:: position_somatic_snv_call(const extended_pos_info& normal_epi, const extended_pos_info& tumor_epi, const extended_pos_info* normal_epi_t2_ptr, const extended_pos_info* tumor_epi_t2_ptr, somatic_snv_genotype_grid& sgt) const { static const bool is_always_test(false); { const snp_pos_info& normal_pi(normal_epi.pi); const snp_pos_info& tumor_pi(tumor_epi.pi); if(normal_pi.ref_base=='N') return; sgt.ref_gt=base_to_id(normal_pi.ref_base); // check that a non-reference call meeting quality criteria even // exists: if(not is_always_test) { if(is_spi_allref(normal_pi,sgt.ref_gt) and is_spi_allref(tumor_pi,sgt.ref_gt)) return; } } // strawman model treats normal and tumor as independent, so // calculate separate lhoods: blt_float_t normal_lhood[DIGT_SGRID::SIZE]; blt_float_t tumor_lhood[DIGT_SGRID::SIZE]; const bool is_tier2(NULL != normal_epi_t2_ptr); static const unsigned n_tier(2); result_set tier_rs[n_tier]; for(unsigned i(0); i<n_tier; ++i) { const bool is_include_tier2(i==1); if(is_include_tier2) { if(! is_tier2) continue; if(tier_rs[0].snv_qphred==0) { tier_rs[1].snv_qphred=0; continue; } } // get likelihood of each genotype // static const bool is_normal_het_bias(false); static const blt_float_t normal_het_bias(0.0); static const bool is_tumor_het_bias(false); static const blt_float_t tumor_het_bias(0.0); const extended_pos_info& nepi(is_include_tier2 ? *normal_epi_t2_ptr : normal_epi ); const extended_pos_info& tepi(is_include_tier2 ? 
*tumor_epi_t2_ptr : tumor_epi ); get_diploid_gt_lhood_spi(_opt,nepi.pi,is_normal_het_bias,normal_het_bias,normal_lhood); get_diploid_gt_lhood_spi(_opt,tepi.pi,is_tumor_het_bias,tumor_het_bias,tumor_lhood); get_diploid_het_grid_lhood_spi(nepi.pi,normal_lhood+DIGT::SIZE); get_diploid_het_grid_lhood_spi(tepi.pi,tumor_lhood+DIGT::SIZE); get_diploid_strand_grid_lhood_spi(nepi.pi,sgt.ref_gt,normal_lhood+DIGT_SGRID::PRESTRAND_SIZE); get_diploid_strand_grid_lhood_spi(tepi.pi,sgt.ref_gt,tumor_lhood+DIGT_SGRID::PRESTRAND_SIZE); // genomic site results: calculate_result_set_grid(normal_lhood, tumor_lhood, get_prior_set(sgt.ref_gt), _ln_som_match,_ln_som_mismatch, sgt.ref_gt, tier_rs[i]); #if 0 #ifdef ENABLE_POLY // polymorphic site results: assert(0); // still needs to be adapted for 2-tier system: calculate_result_set(normal_lhood,tumor_lhood, lnprior_polymorphic(sgt.ref_gt),sgt.ref_gt,sgt.poly); #else sgt.poly.snv_qphred = 0; #endif #endif #ifdef SOMATIC_DEBUG if((i==0) && (tier_rs[i].snv_qphred > 0)) { const somatic_snv_caller_strand_grid::prior_set& pset(get_prior_set(sgt.ref_gt)); const blt_float_t lnmatch(_ln_som_match); const blt_float_t lnmismatch(_ln_som_mismatch); log_os << "DUMP ON\n"; log_os << "tier1_qphred: " << tier_rs[0].snv_qphred << "\n"; // instead of dumping the entire distribution, we sort the lhood,prior,and prob to print out the N top values of each: std::vector<double> lhood(DDIGT_SGRID::SIZE); std::vector<double> prior(DDIGT_SGRID::SIZE); std::vector<double> post(DDIGT_SGRID::SIZE); // first get raw lhood: // for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) { for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) { const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt)); // unorm takes the role of the normal prior for the somatic case: // static const blt_float_t unorm(std::log(static_cast<blt_float_t>(DIGT_SGRID::PRESTRAND_SIZE))); //blt_float_t prior; //if(tgt==ngt) { prior=pset.normal[ngt]+lnmatch; } //else { 
prior=pset.somatic_marginal[ngt]+lnmismatch; } blt_float_t pr; if(tgt==ngt) { pr=pset.normal[ngt]+lnmatch; } else { pr=pset.somatic_marginal[ngt]+lnmismatch; } prior[dgt] = pr; lhood[dgt] = normal_lhood[ngt]+tumor_lhood[tgt]; post[dgt] = lhood[dgt] + prior[dgt]; } } for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) { const unsigned dgt(DDIGT_SGRID::get_state(gt,gt)); lhood[dgt] = normal_lhood[gt]+tumor_lhood[gt]; prior[dgt] = pset.normal[gt]+lnmatch; post[dgt] = lhood[dgt] + prior[dgt]; } std::vector<double> lhood2(lhood); sort_n_dump("lhood_prior",lhood,prior,sgt.ref_gt); sort_n_dump("post_lhood",post,lhood2,sgt.ref_gt); log_os << "DUMP OFF\n"; } #endif } if((tier_rs[0].snv_qphred==0) || (is_tier2 && (tier_rs[1].snv_qphred==0))) return; sgt.snv_tier=0; sgt.snv_from_ntype_tier=0; if(is_tier2) { if(tier_rs[0].snv_qphred > tier_rs[1].snv_qphred) { sgt.snv_tier=1; } if(tier_rs[0].snv_from_ntype_qphred > tier_rs[1].snv_from_ntype_qphred) { sgt.snv_from_ntype_tier=1; } } sgt.rs=tier_rs[sgt.snv_from_ntype_tier]; if(is_tier2 && (tier_rs[0].ntype != tier_rs[1].ntype)) { // catch NTYPE conflict states: sgt.rs.ntype = NTYPE::CONFLICT; sgt.rs.snv_from_ntype_qphred = 0; } else { // classify NTYPE: // // convert diploid genotype into more limited ntype set: // if (sgt.rs.ntype==sgt.ref_gt) { sgt.rs.ntype=NTYPE::REF; } else if(DIGT::is_het(sgt.rs.ntype)) { sgt.rs.ntype=NTYPE::HET; } else { sgt.rs.ntype=NTYPE::HOM; } } sgt.rs.snv_qphred = tier_rs[sgt.snv_tier].snv_qphred; sgt.is_snv=((sgt.rs.snv_qphred != 0)); }