// Render one alignment row over an alignment range into a string.
//
// 'buffer' is erased and then filled chunk by chunk, using the chunks
// returned by GetAlnChunks() with inserts and unaligned gaps skipped:
//   * a chunk carrying sequence contributes its residues (translated to
//     amino acids when the row width is 3);
//   * any other chunk contributes one fill character per alignment
//     position: the end character when the chunk is flagged as having no
//     sequence on its left or right, the row's gap character otherwise.
//
// Parameters:
//   buffer   output string, overwritten
//   row      row to render
//   aln_rng  alignment range to render
//
// Returns a reference to 'buffer'.
string& CAlnVec::GetAlnSeqString(string& buffer,
                                 TNumrow row,
                                 const TSignedRange& aln_rng) const
{
    string buff;
    buffer.erase();

    CSeqVector& seq_vec      = x_GetSeqVector(row);
    TSeqPos     seq_vec_size = seq_vec.size();

    // Both depend on the row only, so they are looked up once.
    const bool plus      = IsPositiveStrand(row);
    const bool translate = GetWidth(row) == 3;

    // get the chunks which are aligned to seq on anchor
    CRef<CAlnMap::CAlnChunkVec> chunk_vec =
        GetAlnChunks(row, aln_rng, fSkipInserts | fSkipUnalignedGaps);

    // for each chunk
    for (int i = 0; i < chunk_vec->size(); i++) {
        CConstRef<CAlnMap::CAlnChunk> chunk = (*chunk_vec)[i];

        if (chunk->GetType() & fSeq) {
            // add the sequence string
            if (plus) {
                seq_vec.GetSeqData(chunk->GetRange().GetFrom(),
                                   chunk->GetRange().GetTo() + 1,
                                   buff);
            } else {
                // minus strand: the range is mirrored against the
                // length of the sequence vector
                seq_vec.GetSeqData(seq_vec_size - chunk->GetRange().GetTo() - 1,
                                   seq_vec_size - chunk->GetRange().GetFrom(),
                                   buff);
            }
            if (translate) {
                TranslateNAToAA(buff, buff, GetGenCode(row));
            }
            buffer += buff;
        } else {
            // add appropriate number of gap/end chars
            const int  n = chunk->GetAlnRange().GetLength();
            const bool no_seq_on_side =
                (chunk->GetType() & fNoSeqOnLeft)  ||
                (chunk->GetType() & fNoSeqOnRight);
            const char fill_ch =
                no_seq_on_side ? GetEndChar() : GetGapChar(row);

            // Append the run directly: no temporary heap buffer that
            // could leak if the append throws, and a non-positive
            // length simply adds nothing.
            if (n > 0) {
                buffer.append(n, fill_ch);
            }
        }
    }
    return buffer;
}
// Fill the cache with 'count' residues of the current segment, beginning
// at sequence position 'start', and remember 'start' as the cache position.
//
// Data segments are copied (and, where needed, converted, reversed or
// case-converted through a lookup table) from the referenced Seq-data.
// Gap segments are filled with the gap character. When the requested
// coding is Ncbi2na and a randomizer is set, the data is produced as
// Ncbi4na first and then passed through the randomizer.
//
// Throws CSeqVectorException on incompatible/unsupported codings or on an
// unexpected segment type.
void CSeqVector_CI::x_FillCache(TSeqPos start, TSeqPos count)
{
    // The current segment must be a real one and must contain 'start'.
    _ASSERT(m_Seg.GetType() != CSeqMap::eSeqEnd);
    _ASSERT(start >= m_Seg.GetPosition());
    _ASSERT(start < m_Seg.GetEndPosition());

    x_ResizeCache(count);

    switch ( m_Seg.GetType() ) {
    case CSeqMap::eSeqData:
    {
        const CSeq_data& seqData = m_Seg.GetRefData();
        if ( seqData.IsGap() && m_Seg.GetType() == CSeqMap::eSeqGap ) {
            // workaround for erroneously split gap Seq-data:
            // the segment reports itself as a gap now, so start over
            x_FillCache(start, count);
            return;
        }

        const TCoding srcCoding = seqData.Which();
        TCoding dstCoding = x_GetCoding(m_Coding, srcCoding);
        const bool minusStrand = m_Seg.GetRefMinusStrand();

        // 2na output with a randomizer: build 4na and randomize afterwards.
        const bool needRandomize =
            dstCoding != srcCoding &&
            dstCoding == CSeq_data::e_Ncbi2na &&
            m_Randomizer;
        if ( needRandomize ) {
            dstCoding = CSeq_data::e_Ncbi4na;
        }

        // A conversion table is only looked up when something has to change.
        const char* convTable = 0;
        const bool needTable =
            dstCoding != srcCoding ||
            minusStrand ||
            m_CaseConversion != eCaseConversion_none;
        if ( needTable ) {
            convTable = sx_GetConvertTable(srcCoding, dstCoding,
                                           minusStrand, m_CaseConversion);
            if ( convTable == 0 && dstCoding != srcCoding ) {
                NCBI_THROW_FMT(CSeqVectorException, eCodingError,
                               "Incompatible sequence codings: "<<
                               srcCoding<<" -> "<<dstCoding);
            }
        }

        // Offset of 'start' inside the segment, mapped onto the data;
        // on the minus strand it is counted back from the end.
        const TSeqPos segOffset = start - m_Seg.GetPosition();
        const TSeqPos srcPos = minusStrand
            ? m_Seg.GetRefEndPosition() - segOffset - count
            : m_Seg.GetRefPosition() + segOffset;

        switch ( srcCoding ) {
        // nucleotide codings
        case CSeq_data::e_Iupacna:
            copy_8bit_any(m_Cache, count, seqData.GetIupacna().Get(),
                          srcPos, convTable, minusStrand);
            break;
        case CSeq_data::e_Ncbi2na:
            copy_2bit_any(m_Cache, count, seqData.GetNcbi2na().Get(),
                          srcPos, convTable, minusStrand);
            break;
        case CSeq_data::e_Ncbi4na:
            copy_4bit_any(m_Cache, count, seqData.GetNcbi4na().Get(),
                          srcPos, convTable, minusStrand);
            break;
        case CSeq_data::e_Ncbi8na:
            copy_8bit_any(m_Cache, count, seqData.GetNcbi8na().Get(),
                          srcPos, convTable, minusStrand);
            break;
        // protein codings
        case CSeq_data::e_Iupacaa:
            copy_8bit_any(m_Cache, count, seqData.GetIupacaa().Get(),
                          srcPos, convTable, minusStrand);
            break;
        case CSeq_data::e_Ncbi8aa:
            copy_8bit_any(m_Cache, count, seqData.GetNcbi8aa().Get(),
                          srcPos, convTable, minusStrand);
            break;
        case CSeq_data::e_Ncbieaa:
            copy_8bit_any(m_Cache, count, seqData.GetNcbieaa().Get(),
                          srcPos, convTable, minusStrand);
            break;
        case CSeq_data::e_Ncbistdaa:
            copy_8bit_any(m_Cache, count, seqData.GetNcbistdaa().Get(),
                          srcPos, convTable, minusStrand);
            break;
        // codings with no conversion available
        case CSeq_data::e_Ncbipna:
            NCBI_THROW(CSeqVectorException, eCodingError,
                       "Ncbipna conversion not implemented");
        case CSeq_data::e_Ncbipaa:
            NCBI_THROW(CSeqVectorException, eCodingError,
                       "Ncbipaa conversion not implemented");
        default:
            NCBI_THROW_FMT(CSeqVectorException, eCodingError,
                           "Invalid data coding: "<<srcCoding);
        }

        if ( needRandomize ) {
            m_Randomizer->RandomizeData(m_Cache, count, start);
        }
        break;
    }
    case CSeqMap::eSeqGap:
    {
        const bool randomGap =
            m_Coding == CSeq_data::e_Ncbi2na && m_Randomizer;
        if ( !randomGap ) {
            fill_n(m_Cache, count, GetGapChar());
        }
        else {
            // fill with the 4na gap value, then let the randomizer
            // process the range
            fill_n(m_Cache, count,
                   sx_GetGapChar(CSeq_data::e_Ncbi4na,
                                 eCaseConversion_none));
            m_Randomizer->RandomizeData(m_Cache, count, start);
        }
        break;
    }
    default:
        NCBI_THROW_FMT(CSeqVectorException, eDataError,
                       "Invalid segment type: "<<m_Seg.GetType());
    }

    m_CachePos = start;
}
// Collect one alignment column: buffer[row] receives the character that
// each row shows at alignment position 'aln_pos'.
//
// Parameters:
//   buffer         output, resized to the number of rows
//   aln_pos        alignment position; values past the alignment stop are
//                  clamped to the stop
//   residue_count  optional tally, indexed by FromIupac() of each character
//   gaps_in_count  when true, gap/end characters are tallied as well
//
// Returns a reference to 'buffer'.
string& CAlnVec::GetColumnVector(string& buffer,
                                 TSeqPos aln_pos,
                                 TResidueCount * residue_count,
                                 bool gaps_in_count) const
{
    buffer.resize(GetNumRows(), GetEndChar());

    // out-of-range adjustment
    if (aln_pos > GetAlnStop()) {
        aln_pos = GetAlnStop();
    }

    const TNumseg seg        = GetSeg(aln_pos);
    const TSeqPos seg_offset = aln_pos - GetAlnStart(seg);
    const TSeqPos seg_len    = GetLen(seg);

    for (TNumrow row = 0; row < m_NumRows; row++) {
        TSignedSeqPos seq_pos = GetStart(row, seg);

        if (seq_pos < 0) {
            // Gap or end character. Start from the gap character and
            // look at the segment type only when the end character is
            // actually different.
            buffer[row] = GetGapChar(row);
            if (GetEndChar() != buffer[row]) {
                const TSegTypeFlags seg_type = GetSegType(row, seg);
                if ((seg_type & fNoSeqOnLeft) || (seg_type & fNoSeqOnRight)) {
                    buffer[row] = GetEndChar();
                }
            }
            if (residue_count && gaps_in_count) {
                (*residue_count)[FromIupac(buffer[row])]++;
            }
            continue;
        }

        // Sequence residue: move from the segment start to the column,
        // counting from the far end of the segment on the minus strand.
        const bool plus = IsPositiveStrand(row);
        seq_pos += plus ? seg_offset : seg_len - 1 - seg_offset;

        CSeqVector& seq_vec = x_GetSeqVector(row);
        if (GetWidth(row) != 3) {
            const TSeqPos vec_pos =
                plus ? seq_pos : seq_vec.size() - seq_pos - 1;
            buffer[row] = seq_vec[vec_pos];
        } else {
            // Width 3: fetch three bases and translate them.
            string na_buff, aa_buff;
            if (plus) {
                seq_vec.GetSeqData(seq_pos, seq_pos + 3, na_buff);
            } else {
                const TSeqPos vec_size = seq_vec.size();
                seq_vec.GetSeqData(vec_size - seq_pos - 3,
                                   vec_size - seq_pos,
                                   na_buff);
            }
            TranslateNAToAA(na_buff, aa_buff, GetGenCode(row));
            buffer[row] = aa_buff[0];
        }

        if (residue_count) {
            (*residue_count)[FromIupac(buffer[row])]++;
        }
    } // for row

    return buffer;
}
// Render a whole row of the alignment into 'buffer' and optionally report
// inserts and per-screen sequence coordinates.
//
// Segments are walked in order. A segment in which the anchor row has no
// sequence (only possible when an anchor is set) adds nothing to the
// string; if the row has sequence there it is an insert. Every other
// segment contributes m_Lens[seg] characters: the row's sequence, padded
// with fill characters if the sequence string came back short, or fill
// characters only when the row has no sequence in the segment. The fill
// character is the end character outside the row's left/right sequence
// segments and the row's gap character between them.
//
// Parameters:
//   row                row to render
//   buffer             output string, replaced on success
//   insert_aln_starts  optional; receives aln_pos / width of each insert
//   insert_starts      optional; receives the sequence start of each insert
//   insert_lens        optional; receives the length of each insert
//                      (inserts are recorded only when both insert_starts
//                      and insert_lens are given; adjacent inserts are
//                      consolidated into one entry)
//   scrn_width         screen width; multiplied by the row width before use
//   scrn_lefts         optional; receives one left coordinate per screen
//   scrn_rights        optional; receives one right coordinate per screen
//                      (coordinates are recorded only when scrn_width is
//                      non-zero and both lists are given)
//
// Returns a reference to 'buffer'.
//
// The row is assembled in a local string and swapped into 'buffer' at the
// end, so nothing is leaked and 'buffer' is left untouched if
// GetSeqString() or a list insertion throws.
string& CAlnVec::GetWholeAlnSeqString(TNumrow       row,
                                      string&       buffer,
                                      TSeqPosList * insert_aln_starts,
                                      TSeqPosList * insert_starts,
                                      TSeqPosList * insert_lens,
                                      unsigned int  scrn_width,
                                      TSeqPosList * scrn_lefts,
                                      TSeqPosList * scrn_rights) const
{
    TSeqPos       aln_pos = 0,
                  len = 0,
                  curr_pos = 0,
                  anchor_pos = 0,
                  scrn_pos = 0,
                  prev_len = 0,
                  ttl_len = 0;
    TSignedSeqPos start = -1,
                  stop = -1,
                  scrn_lft_seq_pos = -1,
                  scrn_rgt_seq_pos = -1,
                  prev_aln_pos = -1,
                  prev_start = -1;
    TNumseg       seg;
    int           pos, nscrns, delta;

    TSeqPos       aln_len = GetAlnStop() + 1;

    bool anchored = IsSetAnchor();
    bool plus     = IsPositiveStrand(row);
    int  width    = GetWidth(row);

    scrn_width *= width;

    const bool record_inserts = insert_starts && insert_lens;
    const bool record_coords  = scrn_width && scrn_lefts && scrn_rights;

    // space for the row; owned by a string so it cannot leak
    string row_buff;
    row_buff.reserve(aln_len);
    string buff;

    const TNumseg& left_seg = x_GetSeqLeftSeg(row);
    const TNumseg& right_seg = x_GetSeqRightSeg(row);

    // loop through all segments
    for (seg = 0, pos = row, aln_pos = 0, anchor_pos = m_Anchor;
         seg < m_NumSegs;
         ++seg, pos += m_NumRows, anchor_pos += m_NumRows) {

        const TSeqPos& seg_len = m_Lens[seg];
        start = m_Starts[pos];
        len = seg_len * width;

        if (anchored  &&  m_Starts[anchor_pos] < 0) {
            if (start >= 0) {
                // record the insert if requested
                if (record_inserts) {
                    if (prev_aln_pos == (TSignedSeqPos)(aln_pos / width)  &&
                        start == (TSignedSeqPos)(plus ?
                                                 prev_start + prev_len :
                                                 prev_start - len)) {
                        // consolidate the adjacent inserts
                        ttl_len += len;
                        insert_lens->pop_back();
                        insert_lens->push_back(ttl_len);
                        if (!plus) {
                            insert_starts->pop_back();
                            insert_starts->push_back(start);
                        }
                    } else {
                        prev_aln_pos = aln_pos / width;
                        ttl_len = len;
                        insert_starts->push_back(start);
                        // insert_aln_starts is not part of the
                        // record_inserts condition, so it may be null
                        if (insert_aln_starts) {
                            insert_aln_starts->push_back(prev_aln_pos);
                        }
                        insert_lens->push_back(len);
                    }
                    prev_start = start;
                    prev_len = len;
                }
            }
        } else {
            // fill character for this segment, used whenever the row
            // cannot supply sequence for (part of) it
            const char fill_ch = (seg < left_seg  ||  seg > right_seg) ?
                GetEndChar() : GetGapChar(row);

            if (start >= 0) {
                stop = start + len - 1;

                // add regular sequence to buffer
                GetSeqString(buff, row, start, stop);
                TSeqPos buf_len = min<TSeqPos>(buff.size(), seg_len);
                row_buff.append(buff, 0, buf_len);
                if (buf_len < seg_len) {
                    // Not enough chars in the sequence, add gap
                    row_buff.append(seg_len - buf_len, fill_ch);
                }

                // take care of coords if necessary
                if (record_coords) {
                    if (scrn_lft_seq_pos < 0) {
                        scrn_lft_seq_pos = plus ? start : stop;
                        if (scrn_rgt_seq_pos < 0) {
                            scrn_rgt_seq_pos = scrn_lft_seq_pos;
                        }
                    }
                    // previous scrns
                    nscrns = (aln_pos - scrn_pos) / scrn_width;
                    for (int i = 0; i < nscrns; i++) {
                        scrn_lefts->push_back(scrn_lft_seq_pos);
                        scrn_rights->push_back(scrn_rgt_seq_pos);
                        if (i == 0) {
                            scrn_lft_seq_pos = plus ? start : stop;
                        }
                        scrn_pos += scrn_width;
                    }
                    if (nscrns > 0) {
                        scrn_lft_seq_pos = plus ? start : stop;
                    }
                    // current scrns
                    nscrns = (aln_pos + len - scrn_pos) / scrn_width;
                    curr_pos = aln_pos;
                    for (int i = 0; i < nscrns; i++) {
                        delta = (plus ?
                                 scrn_width - (curr_pos - scrn_pos) :
                                 curr_pos - scrn_pos - scrn_width);

                        scrn_lefts->push_back(scrn_lft_seq_pos);
                        if (plus ?
                            scrn_lft_seq_pos < start :
                            scrn_lft_seq_pos > stop) {
                            scrn_lft_seq_pos = (plus ? start : stop) +
                                delta;
                            scrn_rgt_seq_pos = scrn_lft_seq_pos +
                                (plus ? -1 : 1);
                        } else {
                            scrn_rgt_seq_pos = scrn_lft_seq_pos +
                                (plus ? -1 : 1) + delta;
                            scrn_lft_seq_pos += delta;
                        }
                        if (seg == left_seg  &&
                            scrn_lft_seq_pos == scrn_rgt_seq_pos) {
                            if (plus) {
                                scrn_rgt_seq_pos--;
                            } else {
                                scrn_rgt_seq_pos++;
                            }
                        }
                        scrn_rights->push_back(scrn_rgt_seq_pos);
                        curr_pos = scrn_pos += scrn_width;
                    }
                    if (aln_pos + len <= scrn_pos) {
                        scrn_lft_seq_pos = -1; // reset
                    }
                    scrn_rgt_seq_pos = plus ? stop : start;
                }
            } else {
                // add appropriate number of gap/end chars
                row_buff.append(seg_len, fill_ch);
            }
            aln_pos += len;
        }
    }

    // take care of the remaining coords if necessary
    if (record_coords) {
        // previous scrns
        TSeqPos pos_diff = aln_pos - scrn_pos;
        if (pos_diff > 0) {
            nscrns = pos_diff / scrn_width;
            if (pos_diff % scrn_width) {
                nscrns++;
            }
            for (int i = 0; i < nscrns; i++) {
                scrn_lefts->push_back(scrn_lft_seq_pos);
                scrn_rights->push_back(scrn_rgt_seq_pos);
                if (i == 0) {
                    scrn_lft_seq_pos = scrn_rgt_seq_pos;
                }
                scrn_pos += scrn_width;
            }
        }
    }

    // hand the finished row over without copying it
    buffer.swap(row_buff);
    return buffer;
}