Ejemplo n.º 1
0
// Build a haplotype candidate from the window of read bases centred on the
// pileup base: FLANK_SIZE bases on either side of read position `offset`.
//
// Each window slot of _bq receives the base quality shifted left by
// QUAL_SHIFT OR'ed with the base id; slots that fall outside the read, and
// slots holding an 'N' base, are set to 0. _total_qual accumulates the raw
// quality of every in-read window position ('N' bases included).
hap_cand::
hap_cand(const bam_seq_base& read_seq,
         const uint8_t* init_qual,
         const int offset)  // index of the pileup base within the read
    : _total_qual(0)
{
    const int n_read_bases(read_seq.size());
    assert((offset>=0) && (offset<n_read_bases));

    // unclipped window bounds, [window_begin,window_end) in read coordinates:
    const int window_begin(offset-FLANK_SIZE);
    const int window_end(offset+FLANK_SIZE+1);

    // how many window slots hang off the front / back of the read:
    int n_lead_pad(0);
    if (window_begin<0) n_lead_pad = -window_begin;
    int n_trail_pad(0);
    if (window_end>n_read_bases) n_trail_pad = window_end-n_read_bases;

    // window bounds clipped to the read:
    const int read_begin(std::max(window_begin,0));
    const int read_end(std::min(window_end,n_read_bases));

    // leading pad:
    for (int slot(0); slot<n_lead_pad; ++slot) {
        _bq[slot] = 0;
    }

    // in-read portion of the window:
    for (int pos(read_begin); pos<read_end; ++pos) {
        const int slot(pos-read_begin+n_lead_pad);
        _total_qual += init_qual[pos];
        const char base(read_seq.get_char(pos));
        if (base=='N') {
            _bq[slot] = 0;
        } else {
            _bq[slot] = (init_qual[pos]<<QUAL_SHIFT | base_to_id(base));
        }
    }

    // trailing pad starts right after the last in-read slot:
    const int trail_begin(read_end-read_begin+n_lead_pad);
    for (int k(0); k<n_trail_pad; ++k) {
        _bq[trail_begin+k] = 0;
    }
}
void
position_nonref_2allele_test(const snp_pos_info& pi,
                             const blt_options& opt,
                             const bool /*is_always_test*/,
                             nonref_test_call& nrc) {

    static const bool is_mle_freq(false);

    if (pi.ref_base=='N') return;

    // add early escape test here?

    // 1. Determine the two 'primary' alleles -- Simple test just adds
    // up qscores to determine which alleles are primary.
    //
    nrc.nonref_id=(BASE_ID::ANY);
    //unsigned nonref2_id(BASE_ID::ANY); // just ignore this value for now....
    {
        double qtot[N_BASE];
        for (unsigned i(0); i<N_BASE; ++i) qtot[i] = 0;

        const unsigned n_calls(pi.calls.size());
        for (unsigned i(0); i<n_calls; ++i) {
            if (pi.calls[i].base_id==BASE_ID::ANY) continue;
            qtot[pi.calls[i].base_id] += pi.calls[i].get_qscore();
        }

        // get max and max2:
        unsigned max_id=0;
        unsigned max2_id=1;
        for (unsigned b(1); b<N_BASE; ++b) {
            if (qtot[b] > qtot[max_id]) {
                max2_id = max_id;
                max_id = b;
            } else if (qtot[b] > qtot[max2_id]) {
                max2_id = b;
            }
        }

        const unsigned ref_id=base_to_id(pi.ref_base);
        if       (ref_id==max_id) {
            nrc.nonref_id=max2_id;

#if 0
        } else if (ref_id==max2_id) {
            nrc.nonref_id=max_id;
#endif
        } else {
            nrc.nonref_id=max_id;
            //nonref2_id=max2_id;
        }
    }

    blt_float_t lhood[NR2TEST::SIZE];

    lhood[NR2TEST::REF] = calc_pos_nonref_freq_loghood(pi,0.);

    sparse_function sf;
    nonref_allele_freq_loghood_sparse_func nlf(pi,nrc.nonref_id,sf);
    sample_uniform_range(0.,1.,nlf);
    //sample_uniform_range(min_nonref_freq,1.,nlf);

    lhood[NR2TEST::NONREF_MF] = integrate_ln_sparsefunc(sf, opt.min_nonref_freq, 1,1,1);
    lhood[NR2TEST::NONREF_MF_NOISE] = integrate_ln_sparsefunc(sf, 0, opt.nonref_site_error_decay_freq,1,0);

    static const blt_float_t neginf(-std::numeric_limits<blt_float_t>::infinity());
    lhood[NR2TEST::NONREF_OTHER] = neginf;

    //std::cerr << "WAGART: logh ref/nonef: " << lhood[0] << " " << lhood[1] << "\n";

    // TODO: ctor compute this:

    // this goes in here just in case someone cranks both parameters up near 1:
    //
    const double nonref_variant_rate_used = opt.nonref_variant_rate*(1-opt.nonref_site_error_rate);

    blt_float_t prior[NR2TEST::SIZE];
    prior[NR2TEST::REF] = log1p_switch(-(nonref_variant_rate_used+opt.nonref_site_error_rate));
    prior[NR2TEST::NONREF_MF] = std::log(nonref_variant_rate_used/3);
    prior[NR2TEST::NONREF_MF_NOISE] = std::log(opt.nonref_site_error_rate);
    prior[NR2TEST::NONREF_OTHER] = std::log(2*nonref_variant_rate_used/3);

    double pprob[NR2TEST::SIZE];
    for (unsigned i(0); i<NR2TEST::SIZE; ++i) {
        pprob[i] = lhood[i] + prior[i];
    }
    normalize_ln_distro(pprob,pprob+NR2TEST::SIZE,nrc.max_gt);

    nrc.snp_qphred=error_prob_to_qphred(pprob[NR2TEST::REF]+pprob[NR2TEST::NONREF_MF_NOISE]);
    nrc.max_gt_qphred=error_prob_to_qphred(prob_comp(pprob,pprob+NR2TEST::SIZE,nrc.max_gt));

    nrc.is_snp=(nrc.snp_qphred != 0);

    if (! (is_mle_freq && nrc.is_snp)) return;

#if 0
    const double null_loghood(calc_pos_nonref_freq_loghood(pi,0.));

    // heuristic to escape early:
    static const double p_delta(0.001);
    const double delta_loghood(calc_pos_nonref_freq_loghood(pi,p_delta));
    if (null_loghood > delta_loghood) return;

    double x_nonref_freq;
    double x_loghood;

    position_nonref_freq_loghood_minfunc mf(epi);

    static const double x1(0.5);
    static const double x2(0.4);
    codemin::minimize_1d(x1,x2,mf.val(x1),mf,x_nonref_freq,x_loghood);

    x_nonref_freq = mf.arg_to_prob(x_nonref_freq);

    const double log_lrt(-2.*(x_loghood+null_loghood));

    // becuase null has the parameter fixed to a boundary value, the
    // asymmtotic distribution is a 50:50 mixture of csq(0) and chq(1)
    // -- the same effect as multiplying alpha of csq(1) by 2, dividing
    // the null prob by 2. (as we do below):
    boost::math::chi_squared dist(1);
    const double null_prob((1.-boost::math::cdf(dist,log_lrt))/2.);

    sc.is_snp=(null_prob<alpha);
    sc.null_loghood=null_loghood;
    sc.min_test_loghood=-x_loghood;
    sc.snp_prob=1.-null_prob;

    // if it's a snp then get additional information on non-reference
    // allele frequencies.
    //
    if (not sc.is_snp) return;

    static const double line_tol(1e-7);
    static const double start_ratio(0.05);
    static const double min_start_dist(1e-6);
    static const double end_tol(1e-7);
    static const unsigned max_iter(200);

    const unsigned ref_base_id(base_to_id(pi.ref_base));

    const double ref_freq(1.-x_nonref_freq);
    const double nonref_freq((x_nonref_freq)/3.);
    for (unsigned i(0); i<N_BASE; ++i) {
        if (i==ref_base_id) sc.allele_freq[i] = ref_freq;
        else               sc.allele_freq[i] = nonref_freq;
    }

    static const unsigned N_BASE2(N_BASE*N_BASE);
    double conj_dir[N_BASE2];
    std::fill(conj_dir,conj_dir+N_BASE2,0.);
    for (unsigned i(0); i<N_BASE; ++i) {
        const double start_dist( std::max(std::fabs(sc.allele_freq[i]*start_ratio),min_start_dist) );
        conj_dir[i*(N_BASE+1)] = start_dist;
    }

    double start_tol(end_tol);
    unsigned iter;
    double x_all_loghood;
    double final_dlh;
    position_allele_distro_loghood_minfunc alm(epi);
    codemin::minimize_conj_direction(sc.allele_freq,conj_dir,alm,start_tol,end_tol,line_tol,
                                     x_all_loghood,iter,final_dlh,max_iter);
    alm.arg_to_prob(sc.allele_freq,sc.allele_freq);

    sc.min_loghood=-x_all_loghood;
#endif
}
/// Score a single site for a somatic SNV from paired normal/tumor pileups.
///
/// \param normal_epi         tier1 normal-sample site data
/// \param tumor_epi          tier1 tumor-sample site data
/// \param normal_epi_t2_ptr  tier2 normal-sample site data, or NULL; a
///                           non-NULL value is what enables tier2 evaluation
/// \param tumor_epi_t2_ptr   tier2 tumor-sample site data; dereferenced
///                           whenever tier2 is enabled
/// \param sgt                output; ref_gt, snv_tier, snv_from_ntype_tier,
///                           rs and is_snv are written here
///
/// Returns early -- before the final fields of sgt are assigned -- when the
/// normal reference base is 'N', when both samples are all-reference
/// according to is_spi_allref, or when any evaluated tier yields a zero
/// snv_qphred.
void
somatic_snv_caller_strand_grid::
position_somatic_snv_call(const extended_pos_info& normal_epi,
                          const extended_pos_info& tumor_epi,
                          const extended_pos_info* normal_epi_t2_ptr,
                          const extended_pos_info* tumor_epi_t2_ptr,
                          somatic_snv_genotype_grid& sgt) const {

    // compile-time switch: when false, sites where both samples are
    // all-reference are skipped below
    static const bool is_always_test(false);

    {
        const snp_pos_info& normal_pi(normal_epi.pi);
        const snp_pos_info& tumor_pi(tumor_epi.pi);

        // no call is attempted without a known reference base:
        if(normal_pi.ref_base=='N') return;
        sgt.ref_gt=base_to_id(normal_pi.ref_base);

        // check that a non-reference call meeting quality criteria even
        // exists:
        if(not is_always_test) {
            if(is_spi_allref(normal_pi,sgt.ref_gt) and is_spi_allref(tumor_pi,sgt.ref_gt)) return;
        }
    }

    // strawman model treats normal and tumor as independent, so
    // calculate separate lhoods:
    blt_float_t normal_lhood[DIGT_SGRID::SIZE];
    blt_float_t tumor_lhood[DIGT_SGRID::SIZE];

    // tier2 is enabled solely by the normal tier2 pointer.
    // NOTE(review): tumor_epi_t2_ptr is dereferenced below without its own
    // NULL check -- presumably callers pass both pointers or neither; confirm.
    const bool is_tier2(NULL != normal_epi_t2_ptr);

    // one result set per tier: index 0 is tier1, index 1 is tier2
    static const unsigned n_tier(2);
    result_set tier_rs[n_tier];
    for(unsigned i(0); i<n_tier; ++i) {
        const bool is_include_tier2(i==1);
        if(is_include_tier2) {
            // skip the tier2 pass entirely when no tier2 data was supplied:
            if(! is_tier2) continue;
            // a zero tier1 score already rules out a call, so skip the
            // tier2 computation and record a zero score for it as well:
            if(tier_rs[0].snv_qphred==0) {
                tier_rs[1].snv_qphred=0;
                continue;
            }
        }

        // get likelihood of each genotype
        //
        // het-bias is switched off for both samples:
        static const bool is_normal_het_bias(false);
        static const blt_float_t normal_het_bias(0.0);
        static const bool is_tumor_het_bias(false);
        static const blt_float_t tumor_het_bias(0.0);

        // pick the site data for the tier being evaluated:
        const extended_pos_info& nepi(is_include_tier2 ? *normal_epi_t2_ptr : normal_epi );
        const extended_pos_info& tepi(is_include_tier2 ? *tumor_epi_t2_ptr : tumor_epi );

        // The lhood arrays are filled in three passes, each helper writing
        // from a different start offset: 0 (diploid genotypes), DIGT::SIZE
        // (het grid) and DIGT_SGRID::PRESTRAND_SIZE (strand grid).
        get_diploid_gt_lhood_spi(_opt,nepi.pi,is_normal_het_bias,normal_het_bias,normal_lhood);
        get_diploid_gt_lhood_spi(_opt,tepi.pi,is_tumor_het_bias,tumor_het_bias,tumor_lhood);

        get_diploid_het_grid_lhood_spi(nepi.pi,normal_lhood+DIGT::SIZE);
        get_diploid_het_grid_lhood_spi(tepi.pi,tumor_lhood+DIGT::SIZE);

        get_diploid_strand_grid_lhood_spi(nepi.pi,sgt.ref_gt,normal_lhood+DIGT_SGRID::PRESTRAND_SIZE);
        get_diploid_strand_grid_lhood_spi(tepi.pi,sgt.ref_gt,tumor_lhood+DIGT_SGRID::PRESTRAND_SIZE);

        // genomic site results:
        calculate_result_set_grid(normal_lhood,
                                  tumor_lhood,
                                  get_prior_set(sgt.ref_gt),
                                  _ln_som_match,_ln_som_mismatch,
                                  sgt.ref_gt,
                                  tier_rs[i]);

        // disabled: polymorphic-site scoring, never adapted to the 2-tier scheme
#if 0
#ifdef ENABLE_POLY
        // polymorphic site results:
        assert(0); // still needs to be adapted for 2-tier system:
        calculate_result_set(normal_lhood,tumor_lhood,
                             lnprior_polymorphic(sgt.ref_gt),sgt.ref_gt,sgt.poly);
#else
        sgt.poly.snv_qphred = 0;
#endif
#endif

        // debug-only dump of the likelihood, prior and posterior terms for
        // tier1 sites with a non-zero score:
#ifdef SOMATIC_DEBUG
        if((i==0) && (tier_rs[i].snv_qphred > 0)) {
            const somatic_snv_caller_strand_grid::prior_set& pset(get_prior_set(sgt.ref_gt));
            const blt_float_t lnmatch(_ln_som_match);
            const blt_float_t lnmismatch(_ln_som_mismatch);

            log_os << "DUMP ON\n";
            log_os << "tier1_qphred: " << tier_rs[0].snv_qphred << "\n";

            // instead of dumping the entire distribution, we sort the lhood,prior,and prob to print out the N top values of each:
            std::vector<double> lhood(DDIGT_SGRID::SIZE);
            std::vector<double> prior(DDIGT_SGRID::SIZE);
            std::vector<double> post(DDIGT_SGRID::SIZE);

            // first get raw lhood:
            //
            // every (normal,tumor) pair of pre-strand genotypes:
            for(unsigned ngt(0); ngt<DIGT_SGRID::PRESTRAND_SIZE; ++ngt) {
                for(unsigned tgt(0); tgt<DIGT_SGRID::PRESTRAND_SIZE; ++tgt) {
                    const unsigned dgt(DDIGT_SGRID::get_state(ngt,tgt));
                    // unorm takes the role of the normal prior for the somatic case:
                    //            static const blt_float_t unorm(std::log(static_cast<blt_float_t>(DIGT_SGRID::PRESTRAND_SIZE)));

                    //blt_float_t prior;
                    //if(tgt==ngt) { prior=pset.normal[ngt]+lnmatch; }
                    //else         { prior=pset.somatic_marginal[ngt]+lnmismatch; }
                    blt_float_t pr;
                    if(tgt==ngt) { pr=pset.normal[ngt]+lnmatch; }
                    else         { pr=pset.somatic_marginal[ngt]+lnmismatch; }
                    prior[dgt] = pr;

                    lhood[dgt] = normal_lhood[ngt]+tumor_lhood[tgt];
                    post[dgt] = lhood[dgt] + prior[dgt];
                }
            }

            // strand-grid states are only paired with themselves (normal==tumor):
            for(unsigned gt(DIGT_SGRID::PRESTRAND_SIZE); gt<DIGT_SGRID::SIZE; ++gt) {
                const unsigned dgt(DDIGT_SGRID::get_state(gt,gt));
                lhood[dgt] = normal_lhood[gt]+tumor_lhood[gt];
                prior[dgt] = pset.normal[gt]+lnmatch;
                post[dgt] = lhood[dgt] + prior[dgt];
            }

            // lhood is copied first so the second dump still receives the
            // values as they were before the first sort_n_dump call:
            std::vector<double> lhood2(lhood);
            sort_n_dump("lhood_prior",lhood,prior,sgt.ref_gt);
            sort_n_dump("post_lhood",post,lhood2,sgt.ref_gt);

            log_os << "DUMP OFF\n";
        }
#endif

    }

    // no call unless every evaluated tier produced a non-zero score:
    if((tier_rs[0].snv_qphred==0) ||
       (is_tier2 && (tier_rs[1].snv_qphred==0))) return;

    // For each of the two scores, report the tier with the lower value
    // (tier index 1 is chosen only when tier index 0 is strictly greater):
    sgt.snv_tier=0;
    sgt.snv_from_ntype_tier=0;
    if(is_tier2) {
        if(tier_rs[0].snv_qphred > tier_rs[1].snv_qphred) {
            sgt.snv_tier=1;
        }

        if(tier_rs[0].snv_from_ntype_qphred > tier_rs[1].snv_from_ntype_qphred) {
            sgt.snv_from_ntype_tier=1;
        }
    }

    // start from the full result set of the tier chosen for the
    // from-ntype score; ntype and snv_qphred are overwritten below:
    sgt.rs=tier_rs[sgt.snv_from_ntype_tier];

    if(is_tier2 && (tier_rs[0].ntype != tier_rs[1].ntype)) {
        // catch NTYPE conflict states:
        sgt.rs.ntype = NTYPE::CONFLICT;
        sgt.rs.snv_from_ntype_qphred = 0;
    } else {
        // classify NTYPE:
        //

        // convert diploid genotype into more limited ntype set:
        //
        if       (sgt.rs.ntype==sgt.ref_gt) {
            sgt.rs.ntype=NTYPE::REF;
        } else if(DIGT::is_het(sgt.rs.ntype)) {
            sgt.rs.ntype=NTYPE::HET;
        } else {
            sgt.rs.ntype=NTYPE::HOM;
        }
    }

    // the reported SNV score comes from the tier selected for snv_qphred,
    // which may differ from the tier that supplied the rest of sgt.rs:
    sgt.rs.snv_qphred = tier_rs[sgt.snv_tier].snv_qphred;
    sgt.is_snv=((sgt.rs.snv_qphred != 0));
}