CContextTranslator::CContextTranslator(const CSplitQueryBlk& sqb,
                                       vector< CRef<IQueryFactory> >* query_chunk_factories /* = NULL */,
                                       const CBlastOptions* options /* = NULL */)
{
    const size_t kNumChunks = sqb.GetNumChunks();

    // Cache the query contexts assigned to each chunk
    m_ContextsPerChunk.reserve(kNumChunks);
    for (size_t chunk = 0; chunk < kNumChunks; chunk++) {
        m_ContextsPerChunk.push_back(sqb.GetQueryContexts(chunk));
    }

    // The tables below are only built when both optional arguments are
    // provided; otherwise construction stops here
    const bool build_printout_data =
        (query_chunk_factories != NULL && options != NULL);
    if ( !build_printout_data ) {
        return;
    }

    /// Populate the data to print out
    m_StartingChunks.resize(kNumChunks);
    m_AbsoluteContexts.resize(kNumChunks);

    for (size_t chunk = 0; chunk < kNumChunks; chunk++) {
        CRef<IQueryFactory> factory((*query_chunk_factories)[chunk]);
        CRef<ILocalQueryData> local_data(factory->MakeLocalQueryData(options));
        BlastQueryInfo* qinfo = local_data->GetQueryInfo();

        // Record, per context of this chunk, where the context starts and
        // what its absolute (unsplit-query) context number is
        for (Int4 ctx = qinfo->first_context;
             ctx <= qinfo->last_context; ctx++) {
            m_StartingChunks[chunk].push_back(GetStartingChunk(chunk, ctx));
            m_AbsoluteContexts[chunk].push_back(GetAbsoluteContext(chunk, ctx));
        }
    }
}
// For translated queries, record (via CSplitQueryBlk::AddContextOffsetToChunk)
// the per-context correction needed to convert query offsets local to each
// chunk into offsets into the complete, unsplit query.
// NOTE(review): relies heavily on GetContextOffsets(...).back(), i.e. on the
// correction recorded for the PREVIOUS context of the same chunk, so the
// order in which contexts are processed must not be changed.
void CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT(Blast_QueryIsTranslated(kProgram));

    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    // Chunk overlap in translated (protein) coordinates, hence the division
    // by CODON_LENGTH
    const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram) /
        CODON_LENGTH;

    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);

    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
             ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);
            const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);

            if (absolute_context == kInvalidContext ||
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            // The special cases below short-circuit to error_check once a
            // correction is known; only the final if/else computes one from
            // scratch.
            // NOTE(review): ctx % NUM_FRAMES selects the reading frame;
            // frames 0-2 appear to be the plus strand and 3-5 the minus
            // strand -- confirm against NUM_FRAMES definition.

            // The corrections for the contexts corresponding to the negative
            // strand in the last chunk of a query sequence are all 0
            if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (chunk_num == (size_t)last_query_chunk) &&
                (ctx % NUM_FRAMES >= 3)) {
                correction = 0;
                goto error_check;
            }

            // The corrections for the contexts corresponding to the plus
            // strand are always the same, so only calculate the first one
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length is divisible by CODON_LENGTH, the
            // corrections for all contexts corresponding to a given strand are
            // the same, so only calculate the first one
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
                (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length % CODON_LENGTH == 1, the corrections for the
            // first two contexts of the negative strand are the same, and the
            // correction for the last context is one more than that.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                if (ctx % NUM_FRAMES == 4) {
                    correction =
                        m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction =
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                }
            }

            // If the query length % CODON_LENGTH == 2, the corrections for the
            // last two contexts of the negative strand are the same, which is
            // one more that the first context on the negative strand.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                if (ctx % NUM_FRAMES == 4) {
                    correction =
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction =
                        m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                }
            }

            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                // Plus strand: accumulate the non-overlapping lengths of all
                // chunks preceding this one, back to the starting chunk
                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c - 1,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }
            } else {
                // Minus strand: subtract the accumulated non-overlapping
                // lengths from the full (global) context length
                size_t subtrahend = 0;
                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c - 1,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction = global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;
            }

error_check:
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);

#ifdef DEBUG_COMPARE_SEQUENCES
            // Sanity check: the corrected offset must address the same
            // residues in the global sequence as the chunk-local offset does
            // in the chunk sequence
            {
                int global_offset =
                    global_qinfo->contexts[absolute_context].query_offset +
                    correction;
                int chunk_offset =
                    chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
                int num_bases2compare =
                    min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
                if (!cmp_sequence(&global_seq->sequence[global_offset],
                                  &chunk_seq->sequence[chunk_offset],
                                  num_bases2compare,
                                  Blast_QueryIsProtein(kProgram))) {
                    cerr << "Failed to compare sequence data for chunk "
                         << chunk_num << ", context " << ctx << endl;
                }
            }
#endif
        }
    }

    _TRACE("CContextTranslator contents: " << ctx_translator);
}
// Record the correction needed to synchronize with the complete query all the // query offsets within HSPs that fall within this context void CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries() { _ASSERT( !m_QueryChunkFactories.empty() ); const EBlastProgramType kProgram = m_Options->GetProgramType(); _ASSERT( !Blast_QueryIsTranslated(kProgram) ); const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo(); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk(); #endif const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram); CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories, m_Options); vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0); for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]); CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options)); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk(); #endif // BlastQueryInfo structure corresponding to chunk number chunk_num chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo(); _ASSERT(chunk_qinfo[chunk_num]); // In case the first context differs from 0, for consistency with the // other data returned by this class... 
for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) { m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); } for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) { size_t correction = 0; const int starting_chunk = ctx_translator.GetStartingChunk(chunk_num, ctx); const int absolute_context = ctx_translator.GetAbsoluteContext(chunk_num, ctx); if (absolute_context == kInvalidContext || starting_chunk == kInvalidContext) { _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid ); // INT4_MAX is the sentinel value for invalid contexts m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); continue; } if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) { for (int c = chunk_num; c != starting_chunk; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); correction += prev_len - min(overlap, prev_len); } } else { size_t subtrahend = 0; for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); subtrahend += (curr_len - min(overlap, prev_len)); } correction = global_qinfo->contexts[absolute_context].query_length - subtrahend; } _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid)); m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction); #ifdef DEBUG_COMPARE_SEQUENCES { int global_offset = global_qinfo->contexts[absolute_context].query_offset + correction; int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset; if (!cmp_sequence(&global_seq->sequence[global_offset], &chunk_seq->sequence[chunk_offset], 10, Blast_QueryIsProtein(kProgram))) { cerr << 
"Failed to compare sequence data!" << endl; } } #endif } } _TRACE("CContextTranslator contents: " << ctx_translator); }