// Calculate the expected ratio of reads observed to support each // allele. Note that for sites and single breakpoints this is expected // to match the sample allele ratio, however for indels this can // change as a funciton of indel and read length. // // Note this routine does not accoung for overlapping indels // static void get_het_observed_allele_ratio(const unsigned read_length, const unsigned min_overlap, const indel_key& ik, const double het_allele_ratio, double& log_ref_prob, double& log_indel_prob) { assert((ik.type==INDEL::INSERT) || (ik.type==INDEL::DELETE) || (ik.type == INDEL::SWAP)); // the expected relative read depth for two breakpoints separated by a distance of 0: const unsigned base_expect( (read_length+1)<(2*min_overlap) ? 0 : (read_length+1)-(2*min_overlap) ); // Get expected relative read depth for the shorter and longer // paths of a general sequence replacement. Note this includes // basic insertions and deletions, in these cases // spath_break_distance is 0 and spath_expect equals base_expect: // const double ref_path_expect(base_expect+std::min(ik.delete_length(),base_expect)); const double indel_path_expect(base_expect+std::min(ik.insert_length(),base_expect)); const double ref_path_term((1-het_allele_ratio)*ref_path_expect); const double indel_path_term(het_allele_ratio*indel_path_expect); const double total_path_term(ref_path_term+indel_path_term); if (total_path_term>0) { const double indel_prob(indel_path_term/total_path_term); log_ref_prob=std::log(1.-indel_prob); log_indel_prob=std::log(indel_prob); } }
bool is_indel_conflict(const indel_key& ik1, const indel_key& ik2) { // add one to the end_pos of all indels to prevent immediately // adjacent indels in the final alignments: pos_range pr1(ik1.open_pos_range()); pr1.end_pos++; pos_range pr2(ik2.open_pos_range()); pr2.end_pos++; return pr1.is_range_intersect(pr2); }
static void set_delete_seq(const indel_key& ik, const reference_contig_segment& ref, std::string& seq) { copy_ref_subseq(ref,ik.pos,ik.right_pos(),seq); }
bool is_range_adjacent_indel_breakpoints(const known_pos_range read_pr, const indel_key& ik) { if(read_pr.is_range_intersect(pos_range(ik.pos-1,ik.pos+1))) return true; const pos_t rpos(ik.right_pos()); if(ik.pos==rpos) return false; return (read_pr.is_range_intersect(pos_range(rpos-1,rpos+1))); }
/// get the indel cigar and ref and indel strings used in the indel /// summary line output /// static void get_vcf_summary_strings(const indel_key& ik, const indel_data& id, const reference_contig_segment& ref, std::string& vcf_indel_seq, std::string& vcf_ref_seq) { if (ik.is_breakpoint()) { if (ik.type == INDEL::BP_LEFT) { copy_ref_subseq(ref,ik.pos-1,ik.pos,vcf_ref_seq); vcf_indel_seq = vcf_ref_seq + id.get_insert_seq() + '.'; } else if(ik.type == INDEL::BP_RIGHT) { copy_ref_subseq(ref,ik.pos,ik.pos+1,vcf_ref_seq); vcf_indel_seq = '.' + id.get_insert_seq() + vcf_ref_seq; } else { assert(0); } } else { copy_ref_subseq(ref,ik.pos-1,ik.pos+ik.delete_length(),vcf_ref_seq); copy_ref_subseq(ref,ik.pos-1,ik.pos,vcf_indel_seq); vcf_indel_seq += id.get_insert_seq(); } }
void get_starling_indel_report_info(const indel_key& ik, const indel_data& id, const reference_contig_segment& ref, starling_indel_report_info& iri) { // indel summary info get_indel_summary_strings(ik,id,ref,iri.desc,iri.indel_seq,iri.ref_seq); get_vcf_summary_strings(ik,id,ref,iri.vcf_indel_seq,iri.vcf_ref_seq); iri.it=ik.type; const pos_t indel_begin_pos(ik.pos); const pos_t indel_end_pos(ik.right_pos()); // reference context: { static const unsigned INDEL_CONTEXT_SIZE(10); if(ik.type != INDEL::BP_RIGHT) { iri.ref_upstream.clear(); for(pos_t i(indel_begin_pos-static_cast<pos_t>(INDEL_CONTEXT_SIZE)); i<indel_begin_pos; ++i) { iri.ref_upstream += ref.get_base(i); } } else { iri.ref_upstream = "N/A"; } if(ik.type != INDEL::BP_LEFT) { iri.ref_downstream.clear(); for(pos_t i(indel_end_pos); i<(indel_end_pos+static_cast<pos_t>(INDEL_CONTEXT_SIZE)); ++i) { iri.ref_downstream += ref.get_base(i); } } else { iri.ref_downstream = "N/A"; } } // repeat analysis: set_repeat_info(ik,ref,iri); // interupted hpol compuation: iri.ihpol=get_interupted_hpol_size(indel_begin_pos-1,ref); iri.ihpol=std::max(iri.ihpol,get_interupted_hpol_size(indel_begin_pos,ref)); if(indel_begin_pos != indel_end_pos) { iri.ihpol=std::max(iri.ihpol,get_interupted_hpol_size(indel_end_pos-1,ref)); iri.ihpol=std::max(iri.ihpol,get_interupted_hpol_size(indel_end_pos,ref)); } }
void indel_digt_caller:: get_indel_digt_lhood(const starling_options& opt, const starling_deriv_options& dopt, const starling_sample_options& sample_opt, const double indel_error_prob, const double ref_error_prob, const indel_key& ik, const indel_data& id, const bool is_het_bias, const double het_bias, const bool is_tier2_pass, const bool is_use_alt_indel, double* const lhood) { static const double loghalf(-std::log(2.)); for (unsigned gt(0); gt<STAR_DIINDEL::SIZE; ++gt) lhood[gt] = 0.; const bool is_breakpoint(ik.is_breakpoint()); const double indel_error_lnp(std::log(indel_error_prob)); const double indel_real_lnp(std::log(1.-indel_error_prob)); const double ref_error_lnp(std::log(ref_error_prob)); const double ref_real_lnp(std::log(1.-ref_error_prob)); // typedef read_path_scores::alt_indel_t::const_iterator aiter; typedef indel_data::score_t::const_iterator siter; siter it(id.read_path_lnp.begin()), it_end(id.read_path_lnp.end()); for (; it!=it_end; ++it) { const read_path_scores& path_lnp(it->second); // optionally skip tier2 data: if ((! is_tier2_pass) && (! path_lnp.is_tier1_read)) continue; // get alt path lnp: double alt_path_lnp(path_lnp.ref); #if 0 if (is_use_alt_indel && path_lnp.is_alt && (path_lnp.alt > alt_path_lnp)) { alt_path_lnp=path_lnp.alt; } #else if (is_use_alt_indel and (not path_lnp.alt_indel.empty()) ) { typedef read_path_scores::alt_indel_t::const_iterator aiter; aiter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end()); for (; j!=j_end; ++j) { if (j->second>alt_path_lnp) alt_path_lnp=j->second; } } #endif const double noindel_lnp(log_sum(alt_path_lnp+ref_real_lnp,path_lnp.indel+indel_error_lnp)); const double hom_lnp(log_sum(alt_path_lnp+ref_error_lnp,path_lnp.indel+indel_real_lnp)); // allele ratio convention is that the indel occurs at the // het_allele ratio and the alternate allele occurs at // (1-het_allele_ratio): double log_ref_prob(loghalf); double log_indel_prob(loghalf); if (not is_breakpoint) { static const double het_allele_ratio(0.5); get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank, ik,het_allele_ratio,log_ref_prob,log_indel_prob); } const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob)); lhood[STAR_DIINDEL::NOINDEL] += integrate_out_sites(dopt,path_lnp.nsite,noindel_lnp,is_tier2_pass); lhood[STAR_DIINDEL::HOM] += integrate_out_sites(dopt,path_lnp.nsite,hom_lnp,is_tier2_pass); lhood[STAR_DIINDEL::HET] += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass); #ifdef DEBUG_INDEL_CALL //log_os << std::setprecision(8); //log_os << "INDEL_CALL i,ref_lnp,indel_lnp,lhood(noindel),lhood(hom),lhood(het): " << i << " " << path_lnp.ref << " " << path_lnp.indel << " " << lhood[STAR_DIINDEL::NOINDEL] << " " << lhood[STAR_DIINDEL::HOM] << " " << lhood[STAR_DIINDEL::HET] << "\n"; #endif } if (is_het_bias) { // loop is currently setup to assume a uniform het ratio subgenotype prior const unsigned n_bias_steps(1+static_cast<unsigned>(het_bias/opt.het_bias_max_ratio_inc)); const double ratio_increment(het_bias/static_cast<double>(n_bias_steps)); for (unsigned step(0); step<n_bias_steps; ++step) { const double het_ratio(0.5+(step+1)*ratio_increment); increment_het_ratio_lhood(opt,dopt,sample_opt, indel_error_lnp,indel_real_lnp, ref_error_lnp,ref_real_lnp, ik,id,het_ratio,is_tier2_pass,is_use_alt_indel,lhood); } const unsigned n_het_subgt(1+2*n_bias_steps); const double subgt_log_prior(std::log(static_cast<double>(n_het_subgt))); lhood[STAR_DIINDEL::HET] -= subgt_log_prior; } }
void indel_digt_caller:: get_high_low_het_ratio_lhood(const starling_options& /*opt*/, const starling_deriv_options& dopt, const starling_sample_options& sample_opt, const double indel_error_lnp, const double indel_real_lnp, const double ref_error_lnp, const double ref_real_lnp, const indel_key& ik, const indel_data& id, const double het_ratio, const bool is_tier2_pass, const bool is_use_alt_indel, double& het_lhood_high, double& het_lhood_low) { // handle het ratio and its complement in one step: const double chet_ratio(1.-het_ratio); const double log_het_ratio(std::log(het_ratio)); const double log_chet_ratio(std::log(chet_ratio)); const bool is_breakpoint(ik.is_breakpoint()); het_lhood_high=0; het_lhood_low=0; // typedef read_path_scores::alt_indel_t::const_iterator aiter; typedef indel_data::score_t::const_iterator siter; siter i(id.read_path_lnp.begin()), i_end(id.read_path_lnp.end()); for (; i!=i_end; ++i) { const read_path_scores& path_lnp(i->second); // optionally skip tier2 data: if ((! is_tier2_pass) && (! path_lnp.is_tier1_read)) continue; // get alt path lnp: double alt_path_lnp(path_lnp.ref); #if 0 if (is_use_alt_indel && path_lnp.is_alt && (path_lnp.alt > alt_path_lnp)) { alt_path_lnp=path_lnp.alt; } #else if (is_use_alt_indel && (! path_lnp.alt_indel.empty()) ) { typedef read_path_scores::alt_indel_t::const_iterator aiter; aiter j(path_lnp.alt_indel.begin()), j_end(path_lnp.alt_indel.end()); for (; j!=j_end; ++j) { if (j->second>alt_path_lnp) alt_path_lnp=j->second; } } #endif const double noindel_lnp(log_sum(alt_path_lnp+ref_real_lnp,path_lnp.indel+indel_error_lnp)); const double hom_lnp(log_sum(alt_path_lnp+ref_error_lnp,path_lnp.indel+indel_real_lnp)); // allele ratio convention is that the indel occurs at the // het_allele ratio and the alternate allele occurs at // (1-het_allele_ratio): { double log_ref_prob(log_chet_ratio); double log_indel_prob(log_het_ratio); if (! is_breakpoint) { get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank, ik,het_ratio,log_ref_prob,log_indel_prob); } const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob)); het_lhood_low += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass); } { double log_ref_prob(log_het_ratio); double log_indel_prob(log_chet_ratio); if (! is_breakpoint) { get_het_observed_allele_ratio(path_lnp.read_length,sample_opt.min_read_bp_flank, ik,chet_ratio,log_ref_prob,log_indel_prob); } const double het_lnp(log_sum(noindel_lnp+log_ref_prob,hom_lnp+log_indel_prob)); het_lhood_high += integrate_out_sites(dopt,path_lnp.nsite,het_lnp,is_tier2_pass); } } }
static void set_repeat_info(const indel_key& ik, const reference_contig_segment& ref, starling_indel_report_info& iri) { iri.is_repeat_unit = false; iri.repeat_unit = "N/A"; iri.ref_repeat_count = 0; iri.indel_repeat_count = 0; if(! ((iri.it == INDEL::INSERT) || (iri.it == INDEL::DELETE) || (iri.it == INDEL::SWAP))) return; unsigned insert_repeat_count(0); unsigned delete_repeat_count(0); if (iri.it == INDEL::INSERT) { get_seq_repeat_unit(iri.indel_seq,iri.repeat_unit,insert_repeat_count); } else if(iri.it == INDEL::DELETE) { get_seq_repeat_unit(iri.ref_seq,iri.repeat_unit,delete_repeat_count); } else if(iri.it == INDEL::SWAP) { std::string insert_ru; std::string delete_ru; get_seq_repeat_unit(iri.indel_seq,insert_ru,insert_repeat_count); get_seq_repeat_unit(iri.ref_seq,delete_ru,delete_repeat_count); if((insert_ru != delete_ru) || insert_ru.empty()) return; iri.repeat_unit=insert_ru; } else { assert(0); } // count repeats in contextual sequence: unsigned indel_context_repeat_count(0); { const pos_t indel_begin_pos(ik.pos); const pos_t indel_end_pos(ik.right_pos()); const int repeat_unit_size(static_cast<int>(iri.repeat_unit.size())); // count upstream repeats: for(pos_t i(indel_begin_pos-repeat_unit_size); i>=0; i-=repeat_unit_size) { bool is_repeat(true); for(int j(0); j<repeat_unit_size; ++j) { if(ref.get_base(i+j) != iri.repeat_unit[j]) { is_repeat = false; break; } } if(! is_repeat) break; indel_context_repeat_count += 1; } // count downstream repeats: const pos_t rs(ref.end()); for(pos_t i(indel_end_pos); (i+static_cast<pos_t>(repeat_unit_size)-1)<rs; i+=repeat_unit_size) { bool is_repeat(true); for(int j(0); j<repeat_unit_size; ++j) { if(ref.get_base(i+j) != iri.repeat_unit[j]) { is_repeat = false; break; } } if(! is_repeat) break; indel_context_repeat_count += 1; } } iri.is_repeat_unit = true; iri.ref_repeat_count = indel_context_repeat_count+delete_repeat_count; iri.indel_repeat_count = indel_context_repeat_count+insert_repeat_count; }
// 99% of this task is taking care of indel normalization static bool convert_indel_to_htype(const indel_key& ik, const indel_data& /*id*/, const read_segment& rseg, const reference_contig_segment& ref, htype_element& he) { he.clear(); // get best alignment: const alignment* alptr(rseg.get_best_alignment()); assert(alptr); const alignment& al(*alptr); // Check that alignment is compatible with indel. Many // cases where this fails will be for 'private' // indels. The posterior above is over all candidate // indels, so one candidate may be the best for this read, // *but* the best alignment contains a private indel // instead. // pos_range read_indel_pr; if (! is_indel_in_alignment(al,ik,read_indel_pr)) return false; const bam_seq read_seq(rseg.get_bam_read()); const rc_segment_bam_seq ref_bseq(ref); pos_range ref_indel_pr(ik.open_pos_range()); assert(! read_indel_pr.is_empty()); assert(! ref_indel_pr.is_empty()); // normalization function adjusts ranges: // normalize_indel(read_indel_pr,ref_indel_pr,read_seq,ref_bseq,read_indel_pr,ref_indel_pr); assert(! read_indel_pr.is_empty()); assert(! ref_indel_pr.is_empty()); // build he: if (ref_indel_pr.is_complete()) { he.delete_length=ref_indel_pr.end_pos-ref_indel_pr.begin_pos; } if (! read_indel_pr.is_begin_pos) { he.pos=read_indel_pr.end_pos; } else { he.pos=read_indel_pr.begin_pos; } if (! read_indel_pr.is_end_pos) { he.open_end=OPEN::RIGHT; } else if (! read_indel_pr.is_begin_pos) { he.open_end=OPEN::LEFT; } { // copy into htype element seq (don't worry about efficiency for now) pos_range pr(read_indel_pr); if (! pr.is_complete()) { const pos_range nonclip_pr(get_nonclip_range(al.path)); assert(nonclip_pr.is_complete()); if (! pr.is_begin_pos) { pr.set_begin_pos(nonclip_pr.begin_pos); } else { pr.set_end_pos(nonclip_pr.end_pos); } } assert(pr.begin_pos<=pr.end_pos && pr.begin_pos>=0); for (pos_t i(pr.begin_pos); i<pr.end_pos; ++i) { he.seq.push_back(read_seq.get_char(i)); } } if ((he.delete_length==0) && (he.insert_length()==0)) { he.clear(); return false; } return true; }