// Trims the right end of the segment so that its transcript ends with a match ('M').
// If the transcript contains no 'M' at all, the segment is turned into a gap instead.
void CSplignTrim::CutToMatchRight(TSeg& s)
{
    const size_t last_match = s.m_details.rfind('M');
    if( last_match == string::npos ) {
        // nothing matches: the whole segment becomes a gap
        s.SetToGap();
        return;
    }

    // number of transcript characters that follow the last match
    const size_t tail = s.m_details.size() - last_match - 1;
    if( tail == 0 ) {
        return; // already ends with a match
    }
    CutFromRight(tail, s);
}
// Scans the transcript left to right keeping a running score (+1 for 'M', -1 for
// anything else) and cuts everything to the right of the last position where the
// score reaches its maximum. Ties are resolved in favour of the rightmost position.
void CSplignTrim::Cut50FromRight(TSeg& s)
{
    const string& transcript = s.m_details;
    const size_t size = transcript.size();

    int running = 0;
    int best = -2;        // low enough that the first character always sets the maximum
    size_t best_pos = 0;  // index of the last position with the maximal score

    for(size_t k = 0; k < size; ++k) {
        running += (transcript[k] == 'M') ? 1 : -1;
        if( running >= best ) {
            best = running;
            best_pos = k;
        }
    }

    // characters located after the best position; -1 for an empty transcript
    const int tail = static_cast<int>(size - best_pos) - 1;
    if( tail > 0 ) {
        CutFromRight(tail, s);
    }
}
//trims exons around internal alignment gaps to complete codons //if CDS can be retrieved from bioseq void CSplignTrim::TrimHolesToCodons(TSegs& segments, CBioseq_Handle& mrna_bio_handle, bool mrna_strand, size_t mrna_len) { if( mrna_bio_handle ) { //collect CDS intervals (could be more than one in a case of ribosomal slippage) vector<TSeqRange> tr; for(CFeat_CI ci(mrna_bio_handle, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) { for(CSeq_loc_CI slit(ci->GetLocation()); slit; ++slit) { TSeqRange r, ori; ori = slit.GetRange(); if( mrna_strand ) { r = ori; } else {//reverse r.SetFrom(mrna_len - ori.GetTo() - 1); r.SetTo(mrna_len - ori.GetFrom() - 1); } tr.push_back(r); } } if(tr.empty()) return;// CDS not found //trim AdjustGaps(segments);//make sure there is no adjacent gaps size_t pos1 = 0, pos2 = 2; for(; pos2 < segments.size(); ++pos1, ++pos2) { if( segments[pos1].m_exon && !segments[pos1+1].m_exon && segments[pos2].m_exon ) {//candidate for trimming //trim left exon TSeqPos p1 = segments[pos1].m_box[1]; ITERATE(vector<TSeqRange>, it, tr) { if( p1 >= it->GetFrom() && p1 <= it->GetTo() ) { TSeqPos cut_mrna_len = (p1 + 1 - it->GetFrom()) % 3, cnt = 0; string transcript = segments[pos1].m_details; int i = (int)transcript.size() - 1; for(; i>=0; --i) { if( cnt%3 == cut_mrna_len && transcript[i] == 'M' ) { //cut point CutFromRight(transcript.size() - i - 1, segments[pos1]); break; } if( transcript[i] != 'I' ) ++cnt; } if( i < 0 ) {// exon should not be so bad NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange); } break; } } //trim right exon TSeqPos p2 = segments[pos2].m_box[0]; ITERATE(vector<TSeqRange>, it, tr) { if( p2 >= it->GetFrom() && p2 <= it->GetTo() ) { TSeqPos cut_mrna_len = ( 3 - ( p2 - it->GetFrom()) % 3 ) %3, cnt = 0; string transcript = segments[pos2].m_details; int i = 0; for( ; i < (int)transcript.size(); ++i) { if( cnt%3 == cut_mrna_len && transcript[i] == 'M' ) { //cut point CutFromLeft(i, segments[pos2]); break; } if( transcript[i] 
!= 'I' ) ++cnt; } if( i == (int)transcript.size() ) {// exon should not be so bad NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange); } break; } } } } AdjustGaps(segments); }
// try improving the segment by cutting it from the left, 20/20 rule void CSplignTrim::ImproveFromRight(TSeg& s) { CutToMatchRight(s); Cut50FromRight(s); if(ThrowAwayShortExon(s)) return; int len_total = (int)s.m_details.size(); if(len_total <= 20) return;//two short //compute number of matches int match_total = 0; string::reverse_iterator irs0 = s.m_details.rbegin(), irs1 = s.m_details.rend(), irs; for(irs = irs0; irs != irs1; ++irs) { if(*irs == 'M') { ++match_total; } } //find the left boundary, 20/20 rule {{ size_t pos = max(20, len_total/5) - 1; pos = s.m_details.find('M', pos); if( pos == string::npos ) return;//no M found. pos = s.m_details.find_first_not_of('M', pos); if( pos == string::npos ) return;// 100% id on the right, nothing to trim irs1 = irs1 - pos; }} //after 20/20 *ir1s is M, irs1+1 is not M and eventually irs1 is a left boundary for trimming string::reverse_iterator irs_tr = s.m_details.rend(); //trimming point int match = 0, len = 0; for(irs = irs0; irs != irs1; ++irs) { if(*irs == 'M') { ++match; } ++len; double rid = match / (double)len; double lid = (match_total - match) / (double)(len_total - len); //dropoff check double epsilon = 1e-10; if( lid - rid - m_MaxPartExonIdentDrop > epsilon ) { irs_tr = irs; //do not count trimmed part, adjust values match_total -= match; len_total -= len; match = 0; len = 0; } } if( irs_tr == s.m_details.rend() ) return;//no trimming point found //actual trimming CutFromRight( irs_tr - irs0 + 1 , s ); ThrowAwayShortExon(s); }