void export_md_to_apath(const char* md, const bool is_fwd_strand, path_t& apath, const bool is_edge_deletion_error) { // to make best use of previous code, we parse the MD in the // alignment direction and then orient apath to the forward strand // as a second step if required // assert(NULL != md); apath.clear(); export_md_to_apath_impl(md,apath); unsigned as(apath.size()); if ( ((as>0) and (apath.front().type == DELETE)) or ((as>1) and (apath.back().type == DELETE)) ) { std::ostringstream oss; if (is_edge_deletion_error) { oss << "ERROR: "; } else { oss << "WARNING: "; } oss << "alignment path: " << apath_to_cigar(apath) << " contains meaningless edge deletion.\n"; if (is_edge_deletion_error) { throw blt_exception(oss.str().c_str()); } else { log_os << oss.str(); path_t apath2; for (unsigned i(0); i<as; ++i) { if (((i==0) or ((i+1)==as)) and apath[i].type == DELETE) continue; apath2.push_back(apath[i]); } apath=apath2; as=apath.size(); } } if ( (not is_fwd_strand) and (as>1) ) { std::reverse(apath.begin(),apath.end()); } }
/// Test whether the run of consecutive INSERT/DELETE segments beginning
/// at index \p i contains at least one INSERT and at least one DELETE.
///
/// The scan stops at the first segment which is neither INSERT nor
/// DELETE, or at the end of the path.
bool
is_segment_swap_start(
    const path_t& apath,
    unsigned i)
{
    using namespace ALIGNPATH;

    bool sawInsert(false);
    bool sawDelete(false);

    const unsigned segCount(apath.size());
    while (i<segCount)
    {
        const bool isInsertSeg(apath[i].type == INSERT);
        const bool isDeleteSeg(apath[i].type == DELETE);

        // any other segment type terminates the indel run
        if (! (isInsertSeg || isDeleteSeg)) break;

        sawInsert = (sawInsert || isInsertSeg);
        sawDelete = (sawDelete || isDeleteSeg);
        ++i;
    }

    return (sawInsert && sawDelete);
}
void apath_limit_ref_length( const unsigned target_ref_length, path_t& apath) { unsigned ref_length(0); const unsigned as(apath.size()); for (unsigned i(0); i<as; ++i) { path_segment& ps(apath[i]); if (! is_segment_type_ref_length(ps.type)) continue; ref_length += ps.length; if (ref_length < target_ref_length) continue; if (ref_length > target_ref_length) { const unsigned extra(ref_length - target_ref_length); assert(ps.length > extra); ps.length -= extra; } apath.resize(i+1); break; } }
bool is_clipped_front(const path_t& apath) { const unsigned as(apath.size()); if (as==0) return false; if ((apath[0].type == SOFT_CLIP) || (apath[0].type == HARD_CLIP)) return true; return false; }
void apath_to_bam_cigar(const path_t& apath, uint32_t* bam_cigar) { const unsigned as(apath.size()); for (unsigned i(0); i<as; ++i) { const path_segment& ps(apath[i]); assert(ps.type != NONE); bam_cigar[i] = (ps.length<<BAM_CIGAR_SHIFT | (static_cast<uint32_t>(ps.type)-1)); } }
/// Follow \p path starting from this accessor and return the accessor
/// reached.
///
/// Each path element is tried as a std::string key first and as an int
/// index second, and is applied through operator[]. Elements holding
/// neither type are skipped. Traversal stops early as soon as the
/// current accessor reports !is_valid.
accessor get_path(path_t const& path) const
{
    accessor cursor = *this;

    for (size_t step = 0; step < path.size(); ++step)
    {
        if (!cursor.is_valid) break;

        if (const std::string* name = boost::get<std::string>(&path[step]))
        {
            cursor = cursor[*name];
        }
        else if (const int* index = boost::get<int>(&path[step]))
        {
            cursor = cursor[*index];
        }
    }

    return cursor;
}
/// Return true if \p apath contains two adjacent segments which are both
/// indel types (as judged by is_segment_type_indel).
bool
is_seq_swap(const path_t& apath)
{
    const unsigned segCount(apath.size());

    // 'next' is the second member of each adjacent pair
    for (unsigned next(1); next<segCount; ++next)
    {
        if (! is_segment_type_indel(apath[next-1].type)) continue;
        if (is_segment_type_indel(apath[next].type)) return true;
    }

    return false;
}
/// Append \p length units of \p seg_type to the end of \p apath.
///
/// When the last segment already has type \p seg_type it is lengthened
/// in place; otherwise a new segment is added.
void
apath_append(
    path_t& apath,
    const align_t seg_type,
    const unsigned length)
{
    const bool extendsLast(apath.size() && (apath.back().type == seg_type));

    if (! extendsLast)
    {
        apath.emplace_back(seg_type,length);
        return;
    }

    apath.back().length += length;
}
unsigned get_clip_len(const path_t& apath) { const unsigned as(apath.size()); if (as==0) return 0; if ((apath[0].type == SOFT_CLIP) || (apath[0].type == HARD_CLIP)) { return apath[0].length; } if (as>1) { if ((apath[as-1].type == SOFT_CLIP) || (apath[as-1].type == HARD_CLIP)) { return apath[as-1].length; } } return 0; }
std::pair<unsigned,unsigned> get_match_edge_segments(const path_t& apath) { const unsigned as(apath.size()); std::pair<unsigned,unsigned> res(as,as); bool is_first_match(false); for (unsigned i(0); i<as; ++i) { const path_segment& ps(apath[i]); if (is_segment_align_match(ps.type)) { if (! is_first_match) res.first=i; is_first_match=true; res.second=i; } } return res; }
void edit_bam_cigar(const path_t& apath, bam1_t& br) { bam1_core_t& bc(br.core); const int old_n_cigar(bc.n_cigar); const int new_n_cigar(apath.size()); const int delta(4*(new_n_cigar-old_n_cigar)); if (0 != delta) { const int end(bc.l_qname+(4*old_n_cigar)); change_bam_data_segment_len(end,delta,br); bc.n_cigar=new_n_cigar; } //update content of cigar array: apath_to_bam_cigar(apath,bam1_cigar(&br)); }
void apath_limit_read_length( const unsigned target_read_start, const unsigned target_read_end, path_t& apath) { bool isStartSet(false); unsigned read_length(0); const unsigned as(apath.size()); unsigned startSegment(0); unsigned endSegment(as); for (unsigned i(0); i<as; ++i) { path_segment& ps(apath[i]); if (! is_segment_type_read_length(ps.type)) continue; read_length += ps.length; if ((! isStartSet) && (read_length > target_read_start)) { { const unsigned extra(ps.length - (read_length - target_read_start)); assert(ps.length > extra); ps.length -= extra; } startSegment=i; isStartSet=true; } if (read_length >= target_read_end) { if (read_length > target_read_end) { const unsigned extra(read_length - target_read_end); assert(ps.length > extra); ps.length -= extra; } endSegment=i+1; break; } } apath = path_t(apath.begin()+startSegment,apath.begin()+endSegment); }
bool is_edge_readref_len_segment(const path_t& apath) { const unsigned as(apath.size()); if (as==0) return false; const std::pair<unsigned,unsigned> ends(get_match_edge_segments(apath)); // at this point we assume the alignment has been sanity checked for legal clipping, // where hard-clip is only on the outside, next soft-clipping, then anything else... // for (unsigned i(0); i<as; ++i) { const path_segment& ps(apath[i]); const bool is_edge_segment((i<ends.first) || (i>ends.second)); const bool is_clip_type(ps.type==INSERT || ps.type==DELETE || ps.type==SKIP || ps.type==SOFT_CLIP); if (is_edge_segment && is_clip_type) return true; } return false; }
std::pair<unsigned,unsigned> get_nonclip_end_segments(const path_t& apath) { const unsigned as(apath.size()); std::pair<unsigned,unsigned> res(as,as); bool is_first_nonclip(false); for (unsigned i(0); i<as; ++i) { const path_segment& ps(apath[i]); if (! (ps.type == SOFT_CLIP || ps.type == HARD_CLIP)) { if (! is_first_nonclip) { res.first=i; is_first_nonclip=true; } res.second=i; } } return res; }
/// Validate \p apath and report the first problem found.
///
/// Checks are applied in the order below; the first failing check
/// determines the return value:
///  - UNKNOWN_SEGMENT  : a segment has type NONE
///  - REPEATED_SEGMENT : a segment has the same type as its predecessor
///  - EDGE_SKIP        : a SKIP segment occurs before the first match
///                       segment, or after the last match segment
///  - CLIPPING         : a HARD_CLIP is not the first or last segment, or
///                       a SOFT_CLIP is neither at the edge of the path
///                       nor separated from the edge by a single HARD_CLIP
///  - FLOATING         : the path contains no match segment
///  - LENGTH           : apath_read_length(apath) differs from
///                       \p seq_length
///
/// \param apath       alignment path to validate
/// \param seq_length  expected read length of the path
/// \return ALIGN_ISSUE::NONE if no problem is found, otherwise the issue
///         code as listed above
ALIGN_ISSUE::issue_t
get_apath_invalid_type(
    const path_t& apath,
    const unsigned seq_length)
{
    // set once the first match segment has been seen in the forward scan
    bool is_match(false);
    align_t last_type(NONE);
    const unsigned as(apath.size());
    for (unsigned i(0); i<as; ++i)
    {
        const path_segment& ps(apath[i]);
        if (ps.type==NONE) return ALIGN_ISSUE::UNKNOWN_SEGMENT;
        if ((i!=0) && ps.type==last_type) return ALIGN_ISSUE::REPEATED_SEGMENT;

        // a skip ahead of the first match segment is an edge skip
        if (! is_match)
        {
            if (ps.type==SKIP) return ALIGN_ISSUE::EDGE_SKIP;
        }

        // hard-clip is only legal as the outermost segment
        if (ps.type==HARD_CLIP)
        {
            if (! ((i==0) || ((i+1)==as))) return ALIGN_ISSUE::CLIPPING;
        }

        // a soft-clip which is not the outermost segment is only legal
        // when the segment between it and the path edge is a hard-clip
        if (ps.type==SOFT_CLIP)
        {
            if (! ((i==0) || ((i+1)==as)))
            {
                if (i==1)
                {
                    if (as==3)
                    {
                        // in a 3-segment path the middle soft-clip is one
                        // step from both edges, so a hard-clip on either
                        // side is accepted
                        if ((apath[0].type != HARD_CLIP) && (apath[i+1].type != HARD_CLIP)) return ALIGN_ISSUE::CLIPPING;
                    }
                    else
                    {
                        if (apath[0].type != HARD_CLIP) return ALIGN_ISSUE::CLIPPING;
                    }
                }
                else if ((i+2)==as)
                {
                    // second-to-last segment: the last one must be a hard-clip
                    if (apath[i+1].type != HARD_CLIP) return ALIGN_ISSUE::CLIPPING;
                }
                else
                {
                    // soft-clip two or more segments away from both edges
                    return ALIGN_ISSUE::CLIPPING;
                }
            }
        }

        if ((! is_match) && (is_segment_align_match(ps.type))) is_match=true;

        last_type=ps.type;
    }

    if (! is_match) return ALIGN_ISSUE::FLOATING;

    // run in reverse to finish checking condition (2a):
    // NOTE(review): "condition (2a)" refers to documentation which is not
    // visible here; the loop below rejects a SKIP found after the last
    // match segment, mirroring the leading-edge check in the forward scan.
    for (unsigned i(0); i<as; ++i)
    {
        const path_segment& ps(apath[as-(i+1)]);
        if (is_segment_align_match(ps.type)) break;
        // trailing edge deletions are deliberately not reported (check disabled):
        //if(ps.type==DELETE) return ALIGN_ISSUE::EDGE_DELETE;
        if (ps.type==SKIP) return ALIGN_ISSUE::EDGE_SKIP;
    }

    if (seq_length != apath_read_length(apath)) return ALIGN_ISSUE::LENGTH;

    return ALIGN_ISSUE::NONE;
}
// 1. remove zero-length segments // 2. remove pads // 3. condense repeated segment types // 4. reduce adjacent insertion/deletion tags to a single pair // 5. replace NDN pattern with single SKIP segment // // return true if path has been altered // bool apath_cleaner(path_t& apath) { bool is_cleaned(false); const unsigned as(apath.size()); unsigned insertIndex(as); unsigned deleteIndex(as); unsigned otherIndex(as); for (unsigned i(0); i<as; ++i) { path_segment& ps(apath[i]); if (ps.length == 0) { is_cleaned = true; } else if (ps.type == PAD) { ps.length = 0; is_cleaned = true; } else if (ps.type == INSERT) { if (insertIndex < as) { apath[insertIndex].length += ps.length; ps.length = 0; is_cleaned = true; } else { insertIndex = i; } } else if (ps.type == DELETE) { if (deleteIndex < as) { apath[deleteIndex].length += ps.length; ps.length = 0; is_cleaned = true; } else { deleteIndex = i; } } else { if ((insertIndex<as) || (deleteIndex<as)) { insertIndex = as; deleteIndex = as; otherIndex = as; } if ((otherIndex < as) && (apath[otherIndex].type == ps.type)) { apath[otherIndex].length += ps.length; ps.length = 0; is_cleaned = true; } else { otherIndex = i; } } } // convert NDN to single N: for (unsigned i(0); i<as; ++i) { path_segment& ps(apath[i]); if (ps.type == SKIP) { if ( (i+2)<as) { if ((apath[i+1].type == DELETE) && (apath[i+2].type == SKIP)) { for (unsigned j(1); j<3; ++j) { ps.length += apath[i+j].length; apath[i+j].length = 0; } is_cleaned = true; } } } } if (is_cleaned) { path_t apath2; for (const path_segment& ps : apath) { if (ps.length == 0) continue; apath2.push_back(ps); } apath = apath2; } return is_cleaned; }