/// Determine how many chunks the concatenated query should be split into,
/// adjusting the requested chunk size in place.
/// @param program BLAST program type, used to pick the overlap size and to
///        detect translated searches [in]
/// @param chunk_size requested chunk size on input; on output the possibly
///        adjusted chunk size actually to be used [in|out]
/// @param concatenated_query_length total length of all queries [in]
/// @param num_queries number of queries being concatenated [in]
/// @return number of chunks (1 means "do not split")
Uint4
SplitQuery_CalculateNumChunks(EBlastProgramType program,
                              size_t *chunk_size,
                              size_t concatenated_query_length,
                              size_t num_queries)
{
    // Nothing to do if this input isn't worth splitting
    if ( !SplitQuery_ShouldSplit(program, *chunk_size,
                                 concatenated_query_length, num_queries)) {
        _TRACE("Not splitting queries");
        return 1;
    }

    const size_t overlap = SplitQuery_GetOverlapChunkSize(program);
    Uint4 retval = 0;
    // Remember the caller's requested size for tracing (debug builds only)
    _DEBUG_ARG(size_t target_chunk_size = *chunk_size);

    // Translated queries need a chunk size divisible by CODON_LENGTH so that
    // codon boundaries line up across chunks
    if (Blast_QueryIsTranslated(program)) {
        *chunk_size -= ((*chunk_size) % CODON_LENGTH);
        _ASSERT((*chunk_size % CODON_LENGTH) == 0);
    }

    // Guard against chunk sizes that do not exceed the overlap (small queries)
    if ((*chunk_size) > overlap) {
        retval = concatenated_query_length / ((*chunk_size) - overlap);
    }

    // A single chunk: it spans the whole concatenated query
    if (retval <= 1) {
        *chunk_size = concatenated_query_length;
        return 1;
    }

    // Re-balance the chunk size so the load is even across chunks; skipped for
    // translated queries to preserve the CODON_LENGTH divisibility above
    if (!Blast_QueryIsTranslated(program)) {
        *chunk_size =
            (concatenated_query_length + (retval - 1) * overlap) / retval;
        // Round up only if this will not decrease the number of chunks
        if (retval < (*chunk_size) - overlap) {
            (*chunk_size)++;
        }
    }

    _TRACE("Number of chunks: " << retval << "; "
           "Target chunk size: " << target_chunk_size << "; "
           "Returned chunk size: " << *chunk_size);
    return retval;
}
void CQuerySplitter::x_ComputeChunkRanges() { _ASSERT(m_SplitBlk.NotEmpty()); // Note that this information might not need to be stored in the // SSplitQueryBlk structure, as these ranges can be calculated as follows: // chunk_start = (chunk_num*chunk_size) - (chunk_num*overlap_size); // chunk_end = chunk_start + chunk_size > query_size ? query_size : // chunk_start + chunk_size; size_t chunk_start = 0; const size_t kOverlapSize = SplitQuery_GetOverlapChunkSize(m_Options->GetProgramType()); for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { size_t chunk_end = chunk_start + m_ChunkSize; // if the chunk end is larger than the sequence ... if (chunk_end >= m_TotalQueryLength || // ... or this is the last chunk and it didn't make it to the end // of the sequence (chunk_end < m_TotalQueryLength && (chunk_num + 1) == m_NumChunks)) { // ... assign this chunk's end to the end of the sequence chunk_end = m_TotalQueryLength; } m_SplitBlk->SetChunkBounds(chunk_num, TChunkRange(chunk_start, chunk_end)); _TRACE("Chunk " << chunk_num << ": ranges from " << chunk_start << " to " << chunk_end); chunk_start += (m_ChunkSize - kOverlapSize); if (chunk_start > m_TotalQueryLength || chunk_end == m_TotalQueryLength) { break; } } // For purposes of having an accurate overlap size when stitching back // HSPs, save the overlap size const size_t kOverlap = Blast_QueryIsTranslated(m_Options->GetProgramType()) ? kOverlapSize / CODON_LENGTH : kOverlapSize; m_SplitBlk->SetChunkOverlapSize(kOverlap); }
/// For translated searches, compute for every chunk and every context the
/// offset correction needed to map a chunk-local query offset back onto the
/// full (unsplit) query, and record it in m_SplitBlk.  INT4_MAX is recorded
/// as a sentinel for invalid contexts.
/// NOTE(review): the special cases below reuse the correction recorded for
/// the immediately preceding context of the same chunk (via
/// GetContextOffsets(chunk_num).back()), so the recording order matters.
void CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT(Blast_QueryIsTranslated(kProgram));

    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    // Overlap is expressed in residues (nucleotide overlap / CODON_LENGTH)
    const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH;
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
             ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);
            const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);

            if (absolute_context == kInvalidContext ||
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            // NOTE(review): the (ctx % NUM_FRAMES) tests below assume the
            // negative-strand frames are the last three of each group of
            // NUM_FRAMES contexts -- confirm against the toolkit's context
            // layout for translated searches.

            // The corrections for the contexts corresponding to the negative
            // strand in the last chunk of a query sequence are all 0
            if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (chunk_num == (size_t)last_query_chunk) &&
                (ctx % NUM_FRAMES >= 3)) {
                correction = 0;
                goto error_check;
            }

            // The corrections for the contexts corresponding to the plus
            // strand are always the same, so only calculate the first one
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
                // Reuse the correction just recorded for the previous context
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length is divisible by CODON_LENGTH, the
            // corrections for all contexts corresponding to a given strand are
            // the same, so only calculate the first one
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
                (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length % CODON_LENGTH == 1, the corrections for the
            // first two contexts of the negative strand are the same, and the
            // correction for the last context is one more than that.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                if (ctx % NUM_FRAMES == 4) {
                    correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction =
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                }
            }

            // If the query length % CODON_LENGTH == 2, the corrections for the
            // last two contexts of the negative strand are the same, which is
            // one more than the first context on the negative strand.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                if (ctx % NUM_FRAMES == 4) {
                    correction =
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                }
            }

            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                // Plus strand: accumulate the non-overlapped lengths of all
                // chunks that precede this one, back to the starting chunk
                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c - 1,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }
            } else {
                // Minus strand: subtract the non-overlapped tail from the
                // global context length to get the correction
                size_t subtrahend = 0;
                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c - 1,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;
            }

// Common exit: validate the context and record the computed correction
error_check:
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);

#ifdef DEBUG_COMPARE_SEQUENCES
            {
                // Sanity check: the chunk's sequence data must match the
                // corresponding slice of the global sequence
                int global_offset =
                    global_qinfo->contexts[absolute_context].query_offset +
                    correction;
                int chunk_offset =
                    chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
                int num_bases2compare =
                    min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
                if (!cmp_sequence(&global_seq->sequence[global_offset],
                                  &chunk_seq->sequence[chunk_offset],
                                  num_bases2compare,
                                  Blast_QueryIsProtein(kProgram))) {
                    cerr << "Failed to compare sequence data for chunk "
                         << chunk_num << ", context " << ctx << endl;
                }
            }
#endif
        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}
// Record the correction needed to synchronize with the complete query all the
// query offsets within HSPs that fall within this context.
// For each chunk and each context, the correction maps a chunk-local query
// offset back onto the full (unsplit) query; it is stored in m_SplitBlk via
// AddContextOffsetToChunk, with INT4_MAX as the sentinel for invalid contexts.
void CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT( !Blast_QueryIsTranslated(kProgram) );

    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram);
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
             ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);

            if (absolute_context == kInvalidContext ||
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                // Plus strand: accumulate the non-overlapped lengths of all
                // chunks preceding this one, back to the starting chunk
                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c - 1,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }
            } else {
                // Minus strand: subtract the non-overlapped tail from the
                // global context length to get the correction
                size_t subtrahend = 0;
                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c - 1,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                                 c,
                                                                 ctx_translator,
                                                                 absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;
            }

            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);

#ifdef DEBUG_COMPARE_SEQUENCES
            {
                // Sanity check: the chunk's sequence data must match the
                // corresponding slice of the global sequence
                int global_offset =
                    global_qinfo->contexts[absolute_context].query_offset +
                    correction;
                int chunk_offset =
                    chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
                if (!cmp_sequence(&global_seq->sequence[global_offset],
                                  &chunk_seq->sequence[chunk_offset],
                                  10, Blast_QueryIsProtein(kProgram))) {
                    cerr << "Failed to compare sequence data!" << endl;
                }
            }
#endif
        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}