/// Accumulate the pairwise score of two alignment rows.
///
/// Walks every segment; for each segment in which both rows have a
/// non-negative start, the residues of both rows are fetched and scored with
/// CalculateScore(string, string, bool, bool).  Segments where either start
/// is negative contribute nothing.
///
/// @param row1  first row to compare
/// @param row2  second row to compare
/// @return      sum of the per-segment scores
int CAlnVec::CalculateScore(TNumrow row1, TNumrow row2) const
{
    const bool row1_is_aa =
        GetBioseqHandle(row1).GetBioseqCore()->GetInst().GetMol()
        == CSeq_inst::eMol_aa;
    const bool row2_is_aa =
        GetBioseqHandle(row2).GetBioseqCore()->GetInst().GetMol()
        == CSeq_inst::eMol_aa;

    CSeqVector&   vec1      = x_GetSeqVector(row1);
    const TSeqPos vec1_size = vec1.size();
    CSeqVector&   vec2      = x_GetSeqVector(row2);
    const TSeqPos vec2_size = vec2.size();

    // m_Starts is laid out segment by segment, one entry per row, so the
    // entry for a given row advances by the row count on every segment.
    const TNumrow stride = m_NumRows;
    TNumrow       pos1   = row1;
    TNumrow       pos2   = row2;

    string residues1;
    string residues2;
    int    total = 0;

    for (TNumseg seg = 0;  seg < m_NumSegs;
         ++seg, pos1 += stride, pos2 += stride) {

        const TSignedSeqPos from1 = m_Starts[pos1];
        const TSignedSeqPos from2 = m_Starts[pos2];
        if (from1 < 0  ||  from2 < 0) {
            continue;   // at least one of the rows has no sequence here
        }

        const TSeqPos seg_len = m_Lens[seg];

        // For a non-positive strand the requested range is mirrored
        // against the vector size.
        if (IsPositiveStrand(row1)) {
            vec1.GetSeqData(from1, from1 + seg_len, residues1);
        } else {
            vec1.GetSeqData(vec1_size - (from1 + seg_len),
                            vec1_size - from1,
                            residues1);
        }

        if (IsPositiveStrand(row2)) {
            vec2.GetSeqData(from2, from2 + seg_len, residues2);
        } else {
            vec2.GetSeqData(vec2_size - (from2 + seg_len),
                            vec2_size - from2,
                            residues2);
        }

        total += CalculateScore(residues1, residues2, row1_is_aa, row2_is_aa);
    }

    return total;
}
/// Return the sequence vector for an alignment row, creating and caching it
/// in m_SeqVectorCache on first request.
///
/// Before returning, the vector's coding is (re)applied on every call:
/// nucleotide vectors get m_NaCoding and protein vectors get m_AaCoding,
/// falling back to IUPAC coding when the respective member is e_not_set.
///
/// @param row  alignment row whose sequence vector is wanted
/// @return     reference to the cached CSeqVector for that row
CSeqVector& CAlnVec::x_GetSeqVector(TNumrow row) const
{
    CRef<CSeqVector> vec_ref;

    TSeqVectorCache::iterator cached = m_SeqVectorCache.find(row);
    if (cached == m_SeqVectorCache.end()) {
        // Not cached yet: build a vector on the row's strand and store it.
        CBioseq_Handle handle = GetBioseqHandle(row);
        CSeqVector fresh = handle.GetSeqVector(
            CBioseq_Handle::eCoding_Iupac,
            IsPositiveStrand(row) ? CBioseq_Handle::eStrand_Plus
                                  : CBioseq_Handle::eStrand_Minus);
        vec_ref.Reset(new CSeqVector(fresh));
        m_SeqVectorCache[row] = vec_ref;
    } else {
        vec_ref = cached->second;
    }

    if (vec_ref->IsNucleotide()) {
        if (m_NaCoding == CSeq_data::e_not_set) {
            vec_ref->SetIupacCoding();
        } else {
            vec_ref->SetCoding(m_NaCoding);
        }
    } else if (vec_ref->IsProtein()) {
        if (m_AaCoding == CSeq_data::e_not_set) {
            vec_ref->SetIupacCoding();
        } else {
            vec_ref->SetCoding(m_AaCoding);
        }
    }

    return *vec_ref;
}
/// Get a Bioseq handle for a GI.
///
/// Wraps the GI in a CSeq_id and forwards to the GetBioseqHandle overload
/// that accepts it.
///
/// @param gi  GI of the requested sequence
CBioseq_Handle CSimpleOM::GetBioseqHandle(TGi gi)
{
    CSeq_id gi_seq_id;
    gi_seq_id.SetGi(gi);

    return GetBioseqHandle(gi_seq_id);
}
/// Get a Bioseq handle for an id given as a string.
///
/// Constructs a CSeq_id from the string and forwards to the GetBioseqHandle
/// overload that accepts it.
///
/// @param id_string  textual sequence id
CBioseq_Handle CSimpleOM::GetBioseqHandle(const string& id_string)
{
    CSeq_id parsed_id(id_string);

    return GetBioseqHandle(parsed_id);
}
/// Get a sequence vector for the sequence identified by a Seq-id handle.
///
/// @param id      handle of the sequence id to look up
/// @param strand  strand passed through to CBioseq_Handle::GetSeqVector
CSeqVector CSimpleOM::GetSeqVector(const CSeq_id_Handle& id,
                                   ENa_strand strand)
{
    CBioseq_Handle bioseq = GetBioseqHandle(id);
    return bioseq.GetSeqVector(strand);
}
/// Get a sequence vector for the sequence identified by a GI.
///
/// @param gi      GI of the requested sequence
/// @param strand  strand passed through to CBioseq_Handle::GetSeqVector
CSeqVector CSimpleOM::GetSeqVector(TGi gi, ENa_strand strand)
{
    CBioseq_Handle bioseq = GetBioseqHandle(gi);
    return bioseq.GetSeqVector(strand);
}
/// Get a sequence vector for the sequence identified by a textual id.
///
/// @param id_string  textual sequence id
/// @param strand     strand passed through to CBioseq_Handle::GetSeqVector
CSeqVector CSimpleOM::GetSeqVector(const string& id_string,
                                   ENa_strand strand)
{
    CBioseq_Handle bioseq = GetBioseqHandle(id_string);
    return bioseq.GetSeqVector(strand);
}
/// Compute the per-segment consensus strings for this alignment.
///
/// For each segment j:
///  - rows whose start is -1 are counted as gaps; if that count exceeds
///    m_NumRows - m_NumRows / 2 the segment is a consensus gap and
///    consens[j] is left untouched;
///  - otherwise consens[j] is resized to the segment length and every column
///    receives one consensus character.  A single most-frequent residue that
///    reaches the 70% threshold (of the non-gapped rows) is used directly;
///    failing that, the top candidates are merged into an ambiguity code.
///
/// Whether the alignment is treated as nucleotide or protein is decided from
/// row 0 only.
///
/// @param consens  output, indexed by segment number; the caller must have
///                 sized it to hold at least m_NumSegs entries.
void CAlnVec::CreateConsensus(vector<string>& consens) const
{
    bool isNucleotide = GetBioseqHandle(0).IsNucleotide();

    const int numBases = isNucleotide ? 4 : 26;

    int base_count[26]; // must be a compile-time constant for some compilers

    // determine what the number of segments required for a gapped consensus
    // segment is.  this must be rounded to be at least 50%.
    int gap_seg_thresh = m_NumRows - m_NumRows / 2;

    for (size_t j = 0;  j < (size_t)m_NumSegs;  ++j) {
        // evaluate for gap / no gap
        int gap_count = 0;
        for (size_t i = 0;  i < (size_t)m_NumRows;  ++i) {
            if (m_Starts[ j*m_NumRows + i ] == -1) {
                ++gap_count;
            }
        }

        // check to make sure that this seg is not a consensus
        // gap seg
        if ( gap_count > gap_seg_thresh )
            continue;

        // the base threshold for being considered unique is at least
        // 70% of the available sequences
        int base_thresh = ((m_NumRows - gap_count) * 7 + 5) / 10;

        {
            // we will build a segment with enough bases to match
            consens[j].resize(m_Lens[j]);

            // retrieve all sequences for this segment
            vector<string> segs(m_NumRows);
            RetrieveSegmentSequences(j, segs);
            TransposeSequences(segs);

            typedef multimap<int, unsigned char, greater<int> > TRevMap;

            //
            // evaluate for a consensus
            //
            for (size_t i = 0;  i < m_Lens[j];  ++i) {
                if (isNucleotide) {
                    CollectNucleotideFrequences(segs[i], base_count, numBases);
                } else {
                    CollectProteinFrequences(segs[i], base_count, numBases);
                }

                // we create a sorted list (in descending order) of
                // frequencies of appearance to base
                // the frequency is "global" for this position: that is,
                // if 40% of the sequences are gapped, the highest frequency
                // any base can have is 0.6
                TRevMap rev_map;

                for (int k = 0;  k < numBases;  ++k) {
                    // this gets around a potentially tricky idiosyncrasy
                    // in some implementations of multimap.  depending on
                    // the library, the key may be const (or not)
                    //
                    // nucleotides are stored as a one-hot bit (1<<k) so that
                    // several of them can be OR-ed together below; proteins
                    // are stored as the 0-based offset from 'A'
                    TRevMap::value_type p(base_count[k],
                                          isNucleotide ? (1<<k) : k);
                    rev_map.insert(p);
                }

                // now, the first element here contains the best frequency
                // we scan for the appropriate bases
                if (rev_map.count(rev_map.begin()->first) == 1  &&
                    rev_map.begin()->first >= base_thresh) {
                    consens[j][i] = isNucleotide ?
                        ToIupac(rev_map.begin()->second) :
                        (rev_map.begin()->second+'A');
                } else {
                    // now we need to make some guesses based on IUPACna
                    // notation
                    int               count;
                    unsigned char     c    = 0x00;
                    int               freq = 0;
                    TRevMap::iterator curr = rev_map.begin();
                    TRevMap::iterator prev = rev_map.begin();

                    // take candidates in descending frequency order until the
                    // accumulated frequency reaches the threshold, and keep
                    // taking any candidate tied with the last one taken
                    for (count = 0;
                         curr != rev_map.end()  &&
                         (freq < base_thresh  ||  prev->first == curr->first);
                         ++curr, ++count) {
                        prev = curr;
                        freq += curr->first;
                        if (isNucleotide) {
                            c |= curr->second;
                        } else {
                            unsigned char cur_char = curr->second+'A';
                            switch (c) {
                            case 0x00:
                                c = cur_char;
                                break;
                            case 'N': case 'D':
                                // Asx: the pair N/D collapses to 'B' in
                                // either order.  (The second test used to be
                                // a duplicated 'N', so N followed by D
                                // wrongly produced 'X'.)
                                c = (cur_char == 'N'  ||  cur_char == 'D')
                                    ? 'B' : 'X';
                                break;
                            case 'Q': case 'E':
                                // Glx: Q/E collapses to 'Z'
                                c = (cur_char == 'Q'  ||  cur_char == 'E')
                                    ? 'Z' : 'X';
                                break;
                            case 'I': case 'L':
                                // Xle: I/L collapses to 'J'
                                c = (cur_char == 'I'  ||  cur_char == 'L')
                                    ? 'J' : 'X';
                                break;
                            default:
                                c = 'X';
                            }
                        }
                    }

                    //
                    // catchall
                    //
                    if (count > 2) {
                        consens[j][i] = isNucleotide ? 'N' : 'X';
                    } else {
                        consens[j][i] = isNucleotide ? ToIupac(c) : c;
                    }
                }
            }
        }
    }
}
// // CreateConsensus() // // compute a consensus sequence given a particular alignment // the rules for a consensus are: // - a segment is consensus gap if > 50% of the sequences are gap at this // segment. 50% exactly is counted as sequence // - for a segment counted as sequence, for each position, the most // frequently occurring base is counted as consensus. in the case of // a tie, the consensus is considered muddied, and the consensus is // so marked // CRef<CDense_seg> CAlnVec::CreateConsensus(int& consensus_row, CBioseq& consensus_seq, const CSeq_id& consensus_id) const { consensus_seq.Reset(); if ( !m_DS || m_NumRows < 1) { return CRef<CDense_seg>(); } bool isNucleotide = GetBioseqHandle(0).IsNucleotide(); size_t i; size_t j; // temporary storage for our consensus vector<string> consens(m_NumSegs); CreateConsensus(consens); // // now, create a new CDense_seg // we create a new CBioseq for our data and // copy the contents of the CDense_seg // string data; TSignedSeqPos total_bases = 0; CRef<CDense_seg> new_ds(new CDense_seg()); new_ds->SetDim(m_NumRows + 1); new_ds->SetNumseg(m_NumSegs); new_ds->SetLens() = m_Lens; new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs); if ( !m_Strands.empty() ) { new_ds->SetStrands().reserve(m_Strands.size() + m_NumSegs); } for (i = 0; i < consens.size(); ++i) { // copy the old entries for (j = 0; j < (size_t)m_NumRows; ++j) { int idx = i * m_NumRows + j; new_ds->SetStarts().push_back(m_Starts[idx]); if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(m_Strands[idx]); } } // add our new entry // this places the consensus as the last sequence // it should preferably be the first, but this would mean adjusting // the bioseq handle and seqvector caches, and all row numbers would // shift if (consens[i].length() != 0) { new_ds->SetStarts().push_back(total_bases); } else { new_ds->SetStarts().push_back(-1); } if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(eNa_strand_unknown); } total_bases += 
consens[i].length(); data += consens[i]; } // copy our IDs for (i = 0; i < m_Ids.size(); ++i) { new_ds->SetIds().push_back(m_Ids[i]); } // now, we construct a new Bioseq {{ // sequence ID CRef<CSeq_id> id(new CSeq_id()); id->Assign(consensus_id); consensus_seq.SetId().push_back(id); new_ds->SetIds().push_back(id); // add a description for this sequence CSeq_descr& desc = consensus_seq.SetDescr(); CRef<CSeqdesc> d(new CSeqdesc); desc.Set().push_back(d); d->SetComment("This is a generated consensus sequence"); // the main one: Seq-inst CSeq_inst& inst = consensus_seq.SetInst(); inst.SetRepr(CSeq_inst::eRepr_raw); inst.SetMol(isNucleotide ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa); inst.SetLength(data.length()); CSeq_data& seq_data = inst.SetSeq_data(); if (isNucleotide) { CIUPACna& na = seq_data.SetIupacna(); na = CIUPACna(data); } else { CIUPACaa& aa = seq_data.SetIupacaa(); aa = CIUPACaa(data); } }} consensus_row = new_ds->GetIds().size() - 1; return new_ds; }
/// Get a Bioseq handle for a Seq-id.
///
/// Converts the id to a CSeq_id_Handle and forwards to the overload taking
/// that handle.
///
/// @param id  sequence id to look up
CBioseq_Handle CTSE_Handle::GetBioseqHandle(const CSeq_id& id) const
{
    const CSeq_id_Handle id_handle = CSeq_id_Handle::GetHandle(id);
    return GetBioseqHandle(id_handle);
}