// Build a haplotype candidate from the window of read positions
// [offset-FLANK_SIZE, offset+FLANK_SIZE+1) centered on the pileup base.
//
// read_seq  -- read bases
// init_qual -- per-base quality values, indexed by read position
// offset    -- the offset into read of the pileup base; asserted to be
//              within [0, read_seq.size())
//
// One _bq entry is written per window position. Positions which fall
// off either end of the read, and 'N' basecalls, are stored as 0. All
// other positions store the quality shifted left by QUAL_SHIFT and
// OR'd with base_to_id() of the base. _total_qual accumulates the
// quality of every in-read window position (including 'N' positions).
//
hap_cand::
hap_cand(const bam_seq_base& read_seq,
         const uint8_t* init_qual,
         const int offset)
    : _total_qual(0)
{
    const int read_size(read_seq.size());
    assert((offset>=0) && (offset<read_size));

    // unclamped window bounds, these may extend past either read end:
    const int raw_begin(offset-FLANK_SIZE);
    const int raw_end(offset+FLANK_SIZE+1);

    // count of window positions hanging off the left/right of the read:
    const int left_pad( (raw_begin<0) ? -raw_begin : 0 );
    const int right_pad( (raw_end>read_size) ? (raw_end-read_size) : 0 );

    // window bounds clamped to the read:
    const int begin(std::max(raw_begin,0));
    const int end(std::min(raw_end,read_size));

    for (int pad(0); pad<left_pad; ++pad)
    {
        _bq[pad] = 0;
    }

    for (int read_pos(begin); read_pos<end; ++read_pos)
    {
        _total_qual += init_qual[read_pos];

        const char base(read_seq.get_char(read_pos));
        _bq[read_pos-begin+left_pad] = ( (base=='N') ? 0 : (init_qual[read_pos]<<QUAL_SHIFT | base_to_id(base)));
    }

    for (int pad(0); pad<right_pad; ++pad)
    {
        _bq[pad+end-begin+left_pad] = 0;
    }
}
// score a contiguous matching alignment segment // // note that running the lnp value through as a reference creates more // floating point stability for ambiguous alignments which have the // same score by definition. // static void score_segment(const starling_options& /*opt*/, const unsigned seg_length, const bam_seq_base& seq, const uint8_t* qual, const unsigned read_offset, const bam_seq_base& ref, const pos_t ref_head_pos, double& lnp) { static const double lnthird(-std::log(3.)); for (unsigned i(0); i<seg_length; ++i) { const pos_t readi(static_cast<pos_t>(read_offset+i)); const uint8_t sbase(seq.get_code(readi)); if (sbase == BAM_BASE::ANY) continue; const uint8_t qscore(qual[readi]); bool is_ref(sbase == BAM_BASE::REF); if (! is_ref) { const pos_t refi(ref_head_pos+static_cast<pos_t>(i)); is_ref=(sbase == ref.get_code(refi)); } lnp += ( is_ref ? qphred_to_ln_comp_error_prob(qscore) : qphred_to_ln_error_prob(qscore)+lnthird ); } }
// Handle the regular ol' insertions and deletions. Reports these // types as breakpoints when they're too long: // static void process_simple_indel(const unsigned max_indel_size, const ALIGNPATH::path_t& path, const bam_seq_base& bseq, starling_pos_processor_base& sppr, indel_observation& obs, const unsigned sample_no, const unsigned path_index, const unsigned read_offset, const pos_t ref_head_pos) { using namespace ALIGNPATH; const path_segment& ps(path[path_index]); // large insertion breakpoints are not filtered as noise: if(obs.data.is_noise) { if((ps.type == INSERT) && (ps.length > max_cand_filter_insert_size)) { obs.data.is_noise=false; } } if(ps.length <= max_indel_size) { obs.key.pos=ref_head_pos; obs.key.length = ps.length; if(ps.type == INSERT) { obs.key.type=INDEL::INSERT; bam_seq_to_str(bseq,read_offset,read_offset+ps.length,obs.data.insert_seq); } else { obs.key.type=INDEL::DELETE; } finish_indel_sppr(obs,sppr,sample_no); } else { // left side BP: { obs.key.pos=ref_head_pos; obs.key.length=ps.length; obs.key.type=INDEL::BP_LEFT; const unsigned start(read_offset); const unsigned size(bseq.size()-read_offset); const unsigned end(start+std::min(size,max_indel_size)); bam_seq_to_str(bseq,start,end,obs.data.insert_seq); finish_indel_sppr(obs,sppr,sample_no); } // right side BP: { obs.key.pos=ref_head_pos; if(ps.type == DELETE) obs.key.pos+=ps.length; obs.key.length=ps.length; obs.key.type=INDEL::BP_RIGHT; const unsigned next_read_offset(read_offset+((ps.type==INSERT) ? ps.length : 0)); const unsigned start_offset(next_read_offset-std::min(next_read_offset,max_indel_size)); bam_seq_to_str(bseq,start_offset,next_read_offset,obs.data.insert_seq); finish_indel_sppr(obs,sppr,sample_no); } } }
// Note that unlike other indel processors, the swap processor returns // the number of segments consumed. // // Like regular indels, a swap will be reported as a breakpoint if // it's too long (meaning longeset of insert,delete size). // static unsigned process_swap(const unsigned max_indel_size, const ALIGNPATH::path_t& path, const bam_seq_base& bseq, starling_pos_processor_base& sppr, indel_observation& obs, const unsigned sample_no, const unsigned path_index, const unsigned read_offset, const pos_t ref_head_pos) { using namespace ALIGNPATH; const swap_info sinfo(path,path_index); const unsigned swap_size(std::max(sinfo.insert_length,sinfo.delete_length)); // large insertions are not filtered as noise: if(obs.data.is_noise) { if(sinfo.insert_length > max_cand_filter_insert_size) { obs.data.is_noise=false; } } if(swap_size <= max_indel_size) { obs.key.pos=ref_head_pos; obs.key.length=sinfo.insert_length; obs.key.swap_dlength=sinfo.delete_length; obs.key.type = INDEL::SWAP; bam_seq_to_str(bseq,read_offset,read_offset+sinfo.insert_length,obs.data.insert_seq); finish_indel_sppr(obs,sppr,sample_no); } else { // left side BP: { obs.key.pos=ref_head_pos; obs.key.length=swap_size; obs.key.type=INDEL::BP_LEFT; const unsigned start(read_offset); const unsigned size(bseq.size()-read_offset); const unsigned end(start+std::min(size,max_indel_size)); bam_seq_to_str(bseq,start,end,obs.data.insert_seq); finish_indel_sppr(obs,sppr,sample_no); } // right side BP: { obs.key.pos=ref_head_pos+sinfo.delete_length; obs.key.length=swap_size; obs.key.type=INDEL::BP_RIGHT; const unsigned next_read_offset(read_offset+sinfo.insert_length); const unsigned start_offset(next_read_offset-std::min(next_read_offset,max_indel_size)); bam_seq_to_str(bseq,start_offset,next_read_offset,obs.data.insert_seq); finish_indel_sppr(obs,sppr,sample_no); } } return sinfo.n_seg; }
// Replace the contents of s with the characters of bs at positions
// [start,end). s is left empty when start is not less than end.
//
static
void
bam_seq_to_str(const bam_seq_base& bs,
               const unsigned start,
               const unsigned end,
               std::string& s)
{
    s.clear();

    unsigned pos(start);
    while(pos<end)
    {
        s.push_back(bs.get_char(pos));
        ++pos;
    }
}
// Extract indel information from an alignment and store this // in the starling_pos_processor indel buffer. // // assumes that path is already validated for seq!!! // void add_alignment_indels_to_sppr(const unsigned max_indel_size, const reference_contig_segment& ref, const alignment& al, const bam_seq_base& read_seq, starling_pos_processor_base& sppr, const INDEL_ALIGN_TYPE::index_t iat, const align_id_t id, const unsigned sample_no, const std::pair<bool,bool>& edge_pin, const indel_set_t* edge_indel_ptr) { using namespace ALIGNPATH; const unsigned seq_len(read_seq.size()); if(is_apath_invalid(al.path,seq_len)) { std::ostringstream oss; oss << "ERROR: Can't handle alignment path '" << apath_to_cigar(al.path) << "' -- " << get_apath_invalid_reason(al.path,seq_len) << "\n"; throw blt_exception(oss.str().c_str()); } if(is_apath_starling_invalid(al.path)) { std::ostringstream oss; oss << "ERROR: can't handle alignment path '" << apath_to_cigar(al.path) << "'\n"; throw blt_exception(oss.str().c_str()); } const rc_segment_bam_seq ref_bseq(ref); const std::pair<unsigned,unsigned> ends(get_match_edge_segments(al.path)); pos_range valid_pr; get_valid_alignment_range(al,ref_bseq,read_seq,valid_pr); unsigned path_index(0); unsigned read_offset(0); pos_t ref_head_pos(al.pos); const unsigned aps(al.path.size()); while(path_index<aps) { const path_segment& ps(al.path[path_index]); const bool is_begin_edge(path_index<ends.first); const bool is_end_edge(path_index>ends.second); const bool is_edge_segment(is_begin_edge || is_end_edge); const bool is_swap_start(is_segment_swap_start(al.path,path_index)); assert(ps.type != SKIP); assert(! (is_edge_segment && is_swap_start)); indel_observation obs; obs.data.iat = iat; obs.data.id = id; if(MATCH != ps.type) { pos_range indel_read_pr; indel_read_pr.set_begin_pos((read_offset==0) ? 
0 : (read_offset-1)); unsigned rlen(0); if (is_swap_start) { const swap_info sinfo(al.path,path_index); rlen=sinfo.insert_length; } else if(is_segment_type_read_length(ps.type)) { rlen=ps.length; } indel_read_pr.set_end_pos(std::min(seq_len,read_offset+1+rlen)); if(! valid_pr.is_superset_of(indel_read_pr)) obs.data.is_noise=true; } unsigned n_seg(1); // number of path segments consumed if(is_edge_segment) { // is this indel occurring on a pinned edge (ie against an exon?) const bool is_pinned_indel((is_begin_edge && edge_pin.first) || (is_end_edge && edge_pin.second)); // edge inserts are allowed for intron adjacent and grouper reads, edge deletions for intron adjacent only: if(ps.type == INSERT) { process_edge_insert(max_indel_size,al.path,read_seq, sppr,obs,sample_no,edge_indel_ptr, seq_len,ends,path_index,read_offset,ref_head_pos, is_pinned_indel); } else if(ps.type == DELETE) { if(is_pinned_indel) { process_edge_delete(max_indel_size,al.path,read_seq, sppr,obs,sample_no, path_index,read_offset,ref_head_pos, is_pinned_indel); } } } else if(is_swap_start) { n_seg = process_swap(max_indel_size,al.path,read_seq, sppr,obs,sample_no, path_index,read_offset,ref_head_pos); } else if(is_segment_type_indel(al.path[path_index].type)) { process_simple_indel(max_indel_size,al.path,read_seq, sppr,obs,sample_no, path_index,read_offset,ref_head_pos); } for(unsigned i(0); i<n_seg; ++i) { increment_path(al.path,path_index,read_offset,ref_head_pos); } } }