/// Decide how many chunks the concatenated query is split into and adjust the
/// chunk size accordingly.
///
/// @param program                   BLAST program type
/// @param chunk_size                [in|out] requested chunk size; on return,
///                                  the chunk size that should actually be used
/// @param concatenated_query_length length of all queries concatenated
/// @param num_queries               number of queries (only forwarded to
///                                  SplitQuery_ShouldSplit)
/// @return number of chunks; 1 when the query is not split
Uint4 SplitQuery_CalculateNumChunks(EBlastProgramType program,
                                    size_t *chunk_size,
                                    size_t concatenated_query_length,
                                    size_t num_queries)
{
    size_t& chunk_len = *chunk_size;

    if ( !SplitQuery_ShouldSplit(program, chunk_len,
                                 concatenated_query_length, num_queries) ) {
        _TRACE("Not splitting queries");
        return 1;
    }

    const size_t kOverlap = SplitQuery_GetOverlapChunkSize(program);
    Uint4 chunk_count = 0;

    // Remember what the caller asked for; only consumed by the trace below
    _DEBUG_ARG(size_t requested_chunk_len = chunk_len);

    // Translated queries need a chunk size that is a multiple of CODON_LENGTH,
    // so drop the remainder
    if (Blast_QueryIsTranslated(program)) {
        chunk_len -= (chunk_len % CODON_LENGTH);
        _ASSERT((chunk_len % CODON_LENGTH) == 0);
    }

    // Guard for small chunk sizes: the division below is only meaningful when
    // the chunk is larger than the overlap
    if (chunk_len > kOverlap) {
        chunk_count = concatenated_query_length / (chunk_len - kOverlap);
    }

    // A single chunk (or none): the chunk is the whole query
    if (chunk_count <= 1) {
        chunk_len = concatenated_query_length;
        return 1;
    }

    // Spread the load evenly across chunks (non-translated queries only)
    if ( !Blast_QueryIsTranslated(program) ) {
        chunk_len = (concatenated_query_length +
                     (chunk_count - 1) * kOverlap) / chunk_count;
        // Round up only if this will not decrease the number of chunks
        if (chunk_count < chunk_len - kOverlap) {
            ++chunk_len;
        }
    }

    _TRACE("Number of chunks: " << chunk_count << "; "
           "Target chunk size: " << requested_chunk_len << "; "
           "Returned chunk size: " << chunk_len);

    return chunk_count;
}
/** Allocate a BlastHitSavingOptions structure and fill in its defaults.
 *
 * @param program_number     BLAST program type, stored in the new structure
 * @param options            [out] receives the newly allocated structure;
 *                           must not be NULL
 * @param gapped_calculation whether the search is gapped; used to choose the
 *                           default for do_sum_stats
 * @return 0 on success; BLASTERR_INVALIDPARAM if options is NULL or the
 *         allocation fails
 */
Int2 BlastHitSavingOptionsNew(EBlastProgramType program_number,
                              BlastHitSavingOptions** options,
                              Boolean gapped_calculation)
{
   /* Reject a NULL output pointer instead of dereferencing it below. */
   if (options == NULL)
      return BLASTERR_INVALIDPARAM;

   *options = (BlastHitSavingOptions*) calloc(1, sizeof(BlastHitSavingOptions));

   /* Allocation failure keeps reporting the code callers already receive. */
   if (*options == NULL)
      return BLASTERR_INVALIDPARAM;

   (*options)->hitlist_size = BLAST_HITLIST_SIZE;
   (*options)->expect_value = BLAST_EXPECT_VALUE;
   (*options)->program_number = program_number;

   /* By default, sum statistics is used for all translated searches
    * (except RPS BLAST), and for all ungapped searches.
    */
   if (program_number == eBlastTypeRpsTblastn) {
      (*options)->do_sum_stats = FALSE;
   } else if (!gapped_calculation ||
              Blast_QueryIsTranslated(program_number) ||
              Blast_SubjectIsTranslated(program_number)) {
      (*options)->do_sum_stats = TRUE;
   } else {
      (*options)->do_sum_stats = FALSE;
   }

   return 0;
}
/** Suggest a word threshold for a program / scoring matrix combination.
 *
 * The matrix name is compared case-insensitively. Unrecognized matrices get
 * the same value as BLOSUM62. The value is then raised by 2 when the subject
 * is translated, or by 1 when only the query is translated.
 *
 * @param program_number BLAST program type; for eBlastTypeBlastn the function
 *                       returns 0 immediately and leaves *threshold untouched
 * @param matrixName     name of the scoring matrix; must not be NULL
 * @param threshold      [out] receives the suggested threshold; must not be
 *                       NULL
 * @return 0 on success; BLASTERR_INVALIDPARAM if matrixName or threshold is
 *         NULL (only checked for programs other than blastn)
 */
Int2 BLAST_GetSuggestedThreshold(EBlastProgramType program_number,
                                 const char* matrixName,
                                 double* threshold)
{
    const double kB62_threshold = 11;

    if (program_number == eBlastTypeBlastn)
        return 0;

    /* The output pointer is validated together with the matrix name, after
     * the blastn shortcut, so blastn callers see no change in behavior. */
    if (matrixName == NULL || threshold == NULL)
        return BLASTERR_INVALIDPARAM;

    if (strcasecmp(matrixName, "BLOSUM62") == 0)
        *threshold = kB62_threshold;
    else if (strcasecmp(matrixName, "BLOSUM45") == 0)
        *threshold = 14;
    else if (strcasecmp(matrixName, "BLOSUM62_20") == 0)
        *threshold = 100;
    else if (strcasecmp(matrixName, "BLOSUM80") == 0)
        *threshold = 12;
    else if (strcasecmp(matrixName, "PAM30") == 0)
        *threshold = 16;
    else if (strcasecmp(matrixName, "PAM70") == 0)
        *threshold = 14;
    else
        *threshold = kB62_threshold;   /* unknown matrix: BLOSUM62 value */

    if (Blast_SubjectIsTranslated(program_number) == TRUE)
        *threshold += 2;  /* Covers tblastn, tblastx, psi-tblastn rpstblastn. */
    else if (Blast_QueryIsTranslated(program_number) == TRUE)
        *threshold += 1;

    return 0;
}
/// Genetic code of the query as configured in the options.
/// @return the options' query genetic code when the program translates the
///         query, 0 otherwise
int CCmdLineBlastXML2ReportData::GetQueryGeneticCode() const
{
    // Only meaningful when the query is translated
    if ( !Blast_QueryIsTranslated(m_Options->GetProgramType()) ) {
        return 0;
    }
    return m_Options->GetQueryGeneticCode();
}
/// Dispatch to the context-offset computation that matches the program:
/// one routine handles translated queries, the other everything else.
void CQuerySplitter::x_ComputeContextOffsetsForChunks()
{
    if ( !Blast_QueryIsTranslated(m_Options->GetProgramType()) ) {
        x_ComputeContextOffsets_NonTranslatedQueries();
        return;
    }
    x_ComputeContextOffsets_TranslatedQueries();
}
/** Map a context number to the index of the query it belongs to.
 *
 * @param context context number
 * @param program BLAST program type
 * @return context itself for PSI-TBLASTN and protein-query programs,
 *         context / NUM_FRAMES for translated queries,
 *         context / NUM_STRANDS otherwise
 */
Int4 Blast_GetQueryIndexFromContext(Int4 context, EBlastProgramType program)
{
    /* One context per query: the context number is the query index */
    if (program == eBlastTypePsiTblastn || Blast_QueryIsProtein(program))
        return context;

    /* Translated queries use NUM_FRAMES contexts per query */
    if (Blast_QueryIsTranslated(program))
        return context / NUM_FRAMES;

    /* Remaining programs use NUM_STRANDS contexts per query */
    return context / NUM_STRANDS;
}
/// Choose the query chunk size for a program.
///
/// A non-blank CHUNK_SIZE environment variable overrides the built-in
/// per-program defaults (used for experimentation purposes).
///
/// @param program BLAST program
/// @return the chunk size
/// @throws CBlastException (eInvalidArgument) when the query is translated,
///         the subject is not a PSSM, and the chunk size is not divisible by
///         CODON_LENGTH
size_t SplitQuery_GetChunkSize(EProgram program)
{
    size_t chunk_size = 0;

    const char* env_value = getenv("CHUNK_SIZE");
    const bool kUseBuiltinDefault = !env_value || NStr::IsBlank(env_value);

    if (kUseBuiltinDefault) {
        switch (program) {
        // If the query will be translated, the chunk size is rounded up to
        // the next multiple of 3; that way, when the nucleotide sequence(s)
        // get(s) split, context N%6 in one chunk will have the same frame as
        // context N%6 in the next chunk.
        // N.B.: the splitting is done on the nucleotide query sequences,
        // then each of these chunks is translated
        case eBlastx:
        case eTblastx:
            chunk_size = 10002;
            break;
        case eTblastn:
            chunk_size = 20000;
            break;
        case eBlastn:
            chunk_size = 1000000;
            break;
        case eMegablast:
        case eDiscMegablast:
            chunk_size = 5000000;
            break;
        case eBlastp:
        default:
            chunk_size = 10000;
            break;
        }
        _TRACE("Using query chunk size " << chunk_size);
    } else {
        // Experimental override taken from the environment
        chunk_size = NStr::StringToInt(env_value);
        _TRACE("DEBUG: Using query chunk size " << chunk_size);
    }

    const EBlastProgramType kProgType(EProgramToEBlastProgramType(program));
    const bool kMustBeCodonMultiple =
        Blast_QueryIsTranslated(kProgType) && !Blast_SubjectIsPssm(kProgType);

    if (kMustBeCodonMultiple && (chunk_size % CODON_LENGTH) != 0) {
        NCBI_THROW(CBlastException, eInvalidArgument,
                   "Split query chunk size must be divisible by 3");
    }

    return chunk_size;
}
void CQuerySplitter::x_ComputeChunkRanges() { _ASSERT(m_SplitBlk.NotEmpty()); // Note that this information might not need to be stored in the // SSplitQueryBlk structure, as these ranges can be calculated as follows: // chunk_start = (chunk_num*chunk_size) - (chunk_num*overlap_size); // chunk_end = chunk_start + chunk_size > query_size ? query_size : // chunk_start + chunk_size; size_t chunk_start = 0; const size_t kOverlapSize = SplitQuery_GetOverlapChunkSize(m_Options->GetProgramType()); for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { size_t chunk_end = chunk_start + m_ChunkSize; // if the chunk end is larger than the sequence ... if (chunk_end >= m_TotalQueryLength || // ... or this is the last chunk and it didn't make it to the end // of the sequence (chunk_end < m_TotalQueryLength && (chunk_num + 1) == m_NumChunks)) { // ... assign this chunk's end to the end of the sequence chunk_end = m_TotalQueryLength; } m_SplitBlk->SetChunkBounds(chunk_num, TChunkRange(chunk_start, chunk_end)); _TRACE("Chunk " << chunk_num << ": ranges from " << chunk_start << " to " << chunk_end); chunk_start += (m_ChunkSize - kOverlapSize); if (chunk_start > m_TotalQueryLength || chunk_end == m_TotalQueryLength) { break; } } // For purposes of having an accurate overlap size when stitching back // HSPs, save the overlap size const size_t kOverlap = Blast_QueryIsTranslated(m_Options->GetProgramType()) ? kOverlapSize / CODON_LENGTH : kOverlapSize; m_SplitBlk->SetChunkOverlapSize(kOverlap); }
/** Return the length of one query from the query information structure.
 *
 * @param qinfo       query information structure
 * @param program     BLAST program type
 * @param query_index index of the query; asserted to be below
 *                    qinfo->num_queries
 * @return for translated queries, the value computed by
 *         s_GetTranslatedQueryDNALength; otherwise the query_length of the
 *         query's first context, except that for blastn the second context
 *         is used when the first one's length is not positive
 */
Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo* qinfo,
                                  EBlastProgramType program,
                                  Int4 query_index)
{
    const Uint4 kNumContexts = BLAST_GetNumberOfContexts(program);
    ASSERT(query_index < qinfo->num_queries);

    if (Blast_QueryIsTranslated(program)) {
        return s_GetTranslatedQueryDNALength(qinfo, query_index);
    }

    {
        /* Index of the first context that belongs to this query */
        const Uint4 kFirstCtx = query_index * kNumContexts;
        Int4 length = qinfo->contexts[kFirstCtx].query_length;

        /* blastn: fall back to the second context if the first has no
         * positive length */
        if (program == eBlastTypeBlastn && length <= 0) {
            length = qinfo->contexts[kFirstCtx + 1].query_length;
        }
        return length;
    }
}
/// Size of the region shared by two consecutive query chunks.
///
/// A non-blank OVERLAP_CHUNK_SIZE environment variable overrides the built-in
/// values (used for experimentation purposes).
///
/// @param program BLAST program type
/// @return the environment override if present; otherwise 297 for translated
///         queries and 100 for everything else
size_t SplitQuery_GetOverlapChunkSize(EBlastProgramType program)
{
    const char* env_value = getenv("OVERLAP_CHUNK_SIZE");
    if (env_value && !NStr::IsBlank(env_value)) {
        const size_t kFromEnv = NStr::StringToInt(env_value);
        _TRACE("DEBUG: Using overlap chunk size " << kFromEnv);
        return kFromEnv;
    }

    // N.B.: the translated value must be divisible by 3 to work with
    // translated queries, as we split them in nucleotide coordinates and then
    // do the translation
    size_t overlap = 100;
    if (Blast_QueryIsTranslated(program)) {
        overlap = 297;
    }

    _TRACE("Using overlap chunk size " << overlap);
    return overlap;
}
/** Set the window size in the word options.
 *
 * Does nothing (and succeeds) for nucleotide queries that are not translated.
 * A window_size of 0 requests the value suggested for the program and the
 * scoring matrix by BLAST_GetSuggestedWindowSize.
 *
 * @param options     options wrapper; its score_options and word_options
 *                    must be set
 * @param window_size window size to store, or 0 for the suggested value
 * @return 0 on success, -1 if options or a required sub-structure is missing,
 *         -2 if window_size is negative, otherwise the non-zero status
 *         returned by BLAST_GetSuggestedWindowSize
 */
Int2 SBlastOptionsSetWindowSize(SBlastOptions* options, Int4 window_size)
{
    Int2 status = 0;

    if (!options || !options->score_options || !options->word_options) {
        return -1;
    }

    if (window_size < 0) {
        return -2;
    }

    /* Not applicable to non-translated nucleotide queries */
    if (Blast_QueryIsNucleotide(options->program) == TRUE &&
        Blast_QueryIsTranslated(options->program) == FALSE) {
        return 0;
    }

    /* Zero means "use the suggested value" */
    if (window_size == 0) {
        status = BLAST_GetSuggestedWindowSize(options->program,
                                              options->score_options->matrix,
                                              &window_size);
        if (status != 0) {
            return status;
        }
    }

    options->word_options->window_size = window_size;
    return 0;
}
/** Set the word threshold in the lookup options.
 *
 * Does nothing (and succeeds) for nucleotide queries that are not translated.
 * A threshold of 0 requests the value suggested for the program and the
 * scoring matrix by BLAST_GetSuggestedThreshold.
 *
 * @param options   options wrapper; its lookup_options and score_options
 *                  must be set
 * @param threshold threshold to store, or 0 for the suggested value
 * @return 0 on success, -1 if options or a required sub-structure is missing,
 *         -2 if threshold is negative, otherwise the non-zero status
 *         returned by BLAST_GetSuggestedThreshold
 */
Int2 SBlastOptionsSetThreshold(SBlastOptions* options, double threshold)
{
    Int2 status = 0;

    if (!options || !options->lookup_options || !options->score_options) {
        return -1;
    }

    if (threshold < 0) {
        return -2;
    }

    /* Not applicable to non-translated nucleotide queries */
    if (Blast_QueryIsNucleotide(options->program) == TRUE &&
        Blast_QueryIsTranslated(options->program) == FALSE) {
        return 0;
    }

    /* Zero means "use the suggested value" */
    if (threshold == 0) {
        status = BLAST_GetSuggestedThreshold(options->program,
                                             options->score_options->matrix,
                                             &threshold);
        if (status != 0) {
            return status;
        }
    }

    options->lookup_options->threshold = threshold;
    return 0;
}
/// Compute, for every context of every chunk of a translated query, the
/// correction stored via AddContextOffsetToChunk, and record it in the split
/// block. Contexts that are not valid in a chunk get the INT4_MAX sentinel.
///
/// Several branches reuse the offset most recently stored for the current
/// chunk (GetContextOffsets(chunk_num).back()), so the order in which
/// contexts are processed and offsets appended matters here.
void CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT(Blast_QueryIsTranslated(kProgram));
    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    // The overlap is divided by CODON_LENGTH before being used in the
    // length arithmetic below
    const size_t kOverlap =
        SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH;
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        // NOTE(review): this pointer is kept in chunk_qinfo and read again in
        // later iterations, after chunk_qd has gone out of scope - presumably
        // the query factory keeps the data alive; confirm.
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class, pad the leading contexts with
        // the invalid-context sentinel
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
             ctx <= chunk_qinfo[chunk_num]->last_context;
             ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);
            const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);

            if (absolute_context == kInvalidContext ||
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            // The corrections for the contexts corresponding to the negative
            // strand in the last chunk of a query sequence are all 0
            if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (chunk_num == (size_t)last_query_chunk) &&
                (ctx % NUM_FRAMES >= 3)) {
                correction = 0;
                goto error_check;
            }

            // The corrections for the contexts corresponding to the plus
            // strand are always the same, so only calculate the first one
            // and reuse the offset stored last for this chunk for the others
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length is divisible by CODON_LENGTH, the
            // corrections for all contexts corresponding to a given strand are
            // the same, so only calculate the first one
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
                (ctx % NUM_FRAMES != 0) &&
                (ctx % NUM_FRAMES != 3)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length % CODON_LENGTH == 1, the corrections for the
            // first two contexts of the negative strand are the same, and the
            // correction for the last context is one more than that.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                if (ctx % NUM_FRAMES == 4) {
                    correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                }
            }

            // If the query length % CODON_LENGTH == 2, the corrections for the
            // last two contexts of the negative strand are the same, which is
            // one more than the first context on the negative strand.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
                if (ctx % NUM_FRAMES == 4) {
                    correction = m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                }
            }

            // No shortcut applied: compute the correction from the lengths of
            // this context in the chunks back to its starting chunk
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                // Plus strand: add up the non-overlapping part of every
                // preceding chunk
                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }

            } else {

                // Negative strand: subtract the accumulated non-overlapping
                // lengths from the length of the context in the full query
                size_t subtrahend = 0;

                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;
            }

            // Common exit for every valid context: check validity and store
            // the correction computed above
error_check:
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
#ifdef DEBUG_COMPARE_SEQUENCES
            // Debug aid: compare the first bases of the chunk's context with
            // the full query at the corrected offset
{
    int global_offset = global_qinfo->contexts[absolute_context].query_offset +
        correction;
    int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
    int num_bases2compare =
        min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
    if (!cmp_sequence(&global_seq->sequence[global_offset],
                      &chunk_seq->sequence[chunk_offset],
                      num_bases2compare, Blast_QueryIsProtein(kProgram))) {
        cerr << "Failed to compare sequence data for chunk " << chunk_num
             << ", context " << ctx << endl;
    }
}
#endif

        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}
// Record the correction needed to synchronize with the complete query all the // query offsets within HSPs that fall within this context void CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries() { _ASSERT( !m_QueryChunkFactories.empty() ); const EBlastProgramType kProgram = m_Options->GetProgramType(); _ASSERT( !Blast_QueryIsTranslated(kProgram) ); const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo(); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk(); #endif const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram); CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories, m_Options); vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0); for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]); CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options)); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk(); #endif // BlastQueryInfo structure corresponding to chunk number chunk_num chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo(); _ASSERT(chunk_qinfo[chunk_num]); // In case the first context differs from 0, for consistency with the // other data returned by this class... 
for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) { m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); } for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) { size_t correction = 0; const int starting_chunk = ctx_translator.GetStartingChunk(chunk_num, ctx); const int absolute_context = ctx_translator.GetAbsoluteContext(chunk_num, ctx); if (absolute_context == kInvalidContext || starting_chunk == kInvalidContext) { _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid ); // INT4_MAX is the sentinel value for invalid contexts m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); continue; } if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) { for (int c = chunk_num; c != starting_chunk; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); correction += prev_len - min(overlap, prev_len); } } else { size_t subtrahend = 0; for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); subtrahend += (curr_len - min(overlap, prev_len)); } correction = global_qinfo->contexts[absolute_context].query_length - subtrahend; } _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid)); m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction); #ifdef DEBUG_COMPARE_SEQUENCES { int global_offset = global_qinfo->contexts[absolute_context].query_offset + correction; int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset; if (!cmp_sequence(&global_seq->sequence[global_offset], &chunk_seq->sequence[chunk_offset], 10, Blast_QueryIsProtein(kProgram))) { cerr << 
"Failed to compare sequence data!" << endl; } } #endif } } _TRACE("CContextTranslator contents: " << ctx_translator); }
void CQuerySplitter::x_ComputeQueryContextsForChunks() { const EBlastProgramType kProgram = m_Options->GetProgramType(); const unsigned int kNumContexts = GetNumberOfContexts(kProgram); const ENa_strand kStrandOption = m_Options->GetStrandOption(); auto_ptr<CQueryDataPerChunk> qdpc; if (Blast_QueryIsTranslated(kProgram)) { qdpc.reset(new CQueryDataPerChunk(*m_SplitBlk, kProgram, m_LocalQueryData)); } for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { vector<size_t> queries = m_SplitBlk->GetQueryIndices(chunk_num); for (size_t i = 0; i < queries.size(); i++) { CConstRef<CSeq_loc> sl = m_LocalQueryData->GetSeq_loc(queries[i]); const ENa_strand kStrand = BlastSetup_GetStrand(*sl, kProgram, kStrandOption); if (Blast_QueryIsTranslated(kProgram)) { size_t qlength = qdpc->GetQueryLength(queries[i]); int last_query_chunk = qdpc->GetLastChunk(queries[i]); _ASSERT(last_query_chunk != -1); int shift = s_GetShiftForTranslatedNegStrand(qlength); for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) { // handle the plus strand... if (ctx % NUM_FRAMES < CODON_LENGTH) { if (kStrand == eNa_strand_minus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } } else { // handle the negative strand if (kStrand == eNa_strand_plus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { if (chunk_num == (size_t)last_query_chunk) { // last chunk doesn't have shift m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ s_AddShift(ctx, shift)); } } } } } else if (Blast_QueryIsNucleotide(kProgram)) { for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) { // handle the plus strand... 
if (ctx % NUM_STRANDS == 0) { if (kStrand == eNa_strand_minus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } } else { // handle the negative strand if (kStrand == eNa_strand_plus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } } } } else if (Blast_QueryIsProtein(kProgram)) { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]); } else { abort(); } } } }