// Makes the segment start on a match.
//
// Looks up the first 'M' in the transcript (s.m_details):
//   - no 'M' at all  -> the segment is turned into a gap via SetToGap();
//   - 'M' at index 0 -> nothing to do;
//   - otherwise      -> everything before that 'M' is removed with CutFromLeft().
void CSplignTrim::CutToMatchLeft(TSeg& s)
{
    const size_t first_match = s.m_details.find('M');

    if (first_match == string::npos) {
        s.SetToGap();
    }
    else if (first_match != 0) {
        CutFromLeft(first_match, s);
    }
}
// Cuts the part of the segment that precedes its best-scoring suffix.
//
// The transcript (s.m_details) is walked from its right end towards the left
// while a running score is kept: +1 for every 'M', -1 for any other symbol.
// The position where the running score is maximal becomes the new left end;
// on ties the position further to the left wins (hence the ">=" below).
// Everything before that position is handed to CutFromLeft(). An empty
// transcript, or one whose best suffix is the whole transcript, is left as is.
void CSplignTrim::Cut50FromLeft(TSeg& s)
{
    const string& transcript = s.m_details;

    int running_score = 0;
    int best_score = -2;      // lower than any score reachable after one symbol
    size_t best_start = 0;    // index of the left-most symbol of the best suffix

    for (size_t idx = transcript.size(); idx-- > 0; ) {
        running_score += (transcript[idx] == 'M') ? 1 : -1;
        if (running_score >= best_score) {
            best_score = running_score;
            best_start = idx;
        }
    }

    // Number of symbols located to the left of the best suffix.
    const int len = static_cast<int>(best_start);
    if (len > 0) {
        CutFromLeft(len, s);
    }
}
//trims exons around internal alignment gaps to complete codons //if CDS can be retrieved from bioseq void CSplignTrim::TrimHolesToCodons(TSegs& segments, CBioseq_Handle& mrna_bio_handle, bool mrna_strand, size_t mrna_len) { if( mrna_bio_handle ) { //collect CDS intervals (could be more than one in a case of ribosomal slippage) vector<TSeqRange> tr; for(CFeat_CI ci(mrna_bio_handle, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) { for(CSeq_loc_CI slit(ci->GetLocation()); slit; ++slit) { TSeqRange r, ori; ori = slit.GetRange(); if( mrna_strand ) { r = ori; } else {//reverse r.SetFrom(mrna_len - ori.GetTo() - 1); r.SetTo(mrna_len - ori.GetFrom() - 1); } tr.push_back(r); } } if(tr.empty()) return;// CDS not found //trim AdjustGaps(segments);//make sure there is no adjacent gaps size_t pos1 = 0, pos2 = 2; for(; pos2 < segments.size(); ++pos1, ++pos2) { if( segments[pos1].m_exon && !segments[pos1+1].m_exon && segments[pos2].m_exon ) {//candidate for trimming //trim left exon TSeqPos p1 = segments[pos1].m_box[1]; ITERATE(vector<TSeqRange>, it, tr) { if( p1 >= it->GetFrom() && p1 <= it->GetTo() ) { TSeqPos cut_mrna_len = (p1 + 1 - it->GetFrom()) % 3, cnt = 0; string transcript = segments[pos1].m_details; int i = (int)transcript.size() - 1; for(; i>=0; --i) { if( cnt%3 == cut_mrna_len && transcript[i] == 'M' ) { //cut point CutFromRight(transcript.size() - i - 1, segments[pos1]); break; } if( transcript[i] != 'I' ) ++cnt; } if( i < 0 ) {// exon should not be so bad NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange); } break; } } //trim right exon TSeqPos p2 = segments[pos2].m_box[0]; ITERATE(vector<TSeqRange>, it, tr) { if( p2 >= it->GetFrom() && p2 <= it->GetTo() ) { TSeqPos cut_mrna_len = ( 3 - ( p2 - it->GetFrom()) % 3 ) %3, cnt = 0; string transcript = segments[pos2].m_details; int i = 0; for( ; i < (int)transcript.size(); ++i) { if( cnt%3 == cut_mrna_len && transcript[i] == 'M' ) { //cut point CutFromLeft(i, segments[pos2]); break; } if( transcript[i] 
!= 'I' ) ++cnt; } if( i == (int)transcript.size() ) {// exon should not be so bad NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange); } break; } } } } AdjustGaps(segments); }
// Trims a low-identity left end off the segment.
//
// Steps:
//  1. CutToMatchLeft() and Cut50FromLeft() pre-trim the segment; processing
//     stops if ThrowAwayShortExon() reports true or if at most 20 transcript
//     symbols remain (too short).
//  2. "20/20 rule": the scan area is limited so that at least
//     max(20, length/5) symbols on the right are never considered. The limit
//     is then moved left to the symbol right after the last non-'M' found at
//     or before it, so the symbol at the limit is 'M' and the one before it
//     is not. If the area holds matches only, nothing is trimmed.
//  3. The scan area is walked left to right. At each position the identity of
//     the part scanned so far (left) is compared with the identity of the
//     rest (right); if right - left exceeds m_MaxPartExonIdentDrop (with a
//     1e-10 tolerance) the position is recorded as the trimming point and the
//     counters restart, so the already rejected part is not counted again.
//  4. Everything up to and including the last recorded trimming point is
//     removed with CutFromLeft(), followed by ThrowAwayShortExon().
void CSplignTrim::ImproveFromLeft(TSeg& s)
{
    CutToMatchLeft(s);
    Cut50FromLeft(s);
    if (ThrowAwayShortExon(s)) {
        return;
    }

    const string& transcript = s.m_details;

    int remaining_len = (int)transcript.size();
    if (remaining_len <= 20) {
        return; // too short
    }

    // Total number of matches in the transcript.
    int remaining_matches = 0;
    for (size_t i = 0; i < transcript.size(); ++i) {
        if (transcript[i] == 'M') {
            ++remaining_matches;
        }
    }

    // Right boundary of the scan area (exclusive), 20/20 rule.
    const int protected_tail = max(20, remaining_len / 5);
    size_t scan_end = transcript.size() - protected_tail;
    scan_end = transcript.rfind('M', scan_end);
    if (scan_end == string::npos) {
        return; // no M found; should not happen after CutToMatchLeft()
    }
    scan_end = transcript.find_last_not_of('M', scan_end);
    if (scan_end == string::npos) {
        return; // 100% identity on the left, nothing to trim
    }
    ++scan_end; // now points at an 'M' preceded by a non-'M'

    // Look for the trimming point.
    const double epsilon = 1e-10;
    bool have_cut = false;
    size_t cut_idx = 0;   // index of the last symbol to be removed
    int match = 0;
    int len = 0;
    for (size_t i = 0; i < scan_end; ++i) {
        if (transcript[i] == 'M') {
            ++match;
        }
        ++len;

        const double lid = match / (double)len;
        const double rid = (remaining_matches - match) / (double)(remaining_len - len);

        // drop-off check
        if (rid - lid - m_MaxPartExonIdentDrop > epsilon) {
            have_cut = true;
            cut_idx = i;
            // do not count the trimmed part any more
            remaining_matches -= match;
            remaining_len -= len;
            match = 0;
            len = 0;
        }
    }

    if (!have_cut) {
        return; // nothing to trim
    }

    // actual trimming
    CutFromLeft(cut_idx + 1, s);
    ThrowAwayShortExon(s);
}