/** Converts a context number into the index of the query it belongs to.
 *
 * @param context Context number to convert [in]
 * @param program BLAST program type; decides how many contexts are counted
 *                per query [in]
 * @return context unchanged when program is eBlastTypePsiTblastn or
 *         Blast_QueryIsProtein(program) holds; context / NUM_FRAMES when
 *         Blast_QueryIsTranslated(program) holds; context / NUM_STRANDS
 *         otherwise.
 */
Int4 Blast_GetQueryIndexFromContext(Int4 context, EBlastProgramType program)
{
    /* Context number and query index coincide for these programs */
    if (program == eBlastTypePsiTblastn || Blast_QueryIsProtein(program)) {
        return context;
    }

    /* Translated queries: NUM_FRAMES consecutive contexts per query */
    if (Blast_QueryIsTranslated(program)) {
        return context / NUM_FRAMES;
    }

    /* Everything else: NUM_STRANDS consecutive contexts per query */
    return context / NUM_STRANDS;
}
/** Allocates and initializes a new HSP stream.
 *
 * @param program      BLAST program type; stored in the stream and used to
 *                     decide whether score-sorting bookkeeping is needed [in]
 * @param extn_opts    Extension options. Its compositionBasedStats field is
 *                     read when the query is protein or PSSM, so it must not
 *                     be NULL in that case [in]
 * @param sort_on_read Copied into the score-sorting bookkeeping structure
 *                     when that structure is created [in]
 * @param num_queries  Passed to Blast_HSPResultsNew to size the results [in]
 * @param writer       Writer stored in the stream; it is not initialized
 *                     here (writer_initialized starts as FALSE) [in]
 * @return The new stream, or NULL if one of the allocations performed
 *         directly by this function fails.
 */
BlastHSPStream* BlastHSPStreamNew(EBlastProgramType program,
                                  const BlastExtensionOptions* extn_opts,
                                  Boolean sort_on_read,
                                  Int4 num_queries,
                                  BlastHSPWriter *writer)
{
    BlastHSPStream* hsp_stream =
        (BlastHSPStream*) malloc(sizeof(BlastHSPStream));
    if (hsp_stream == NULL) {
        return NULL;
    }

    /* Plain fields first, so that every pointer member has a defined value
     * before any further allocation is attempted. */
    hsp_stream->program = program;
    hsp_stream->num_hsplists = 0;
    hsp_stream->num_hsplists_alloc = 100;
    hsp_stream->results = NULL;
    hsp_stream->results_sorted = FALSE;
    hsp_stream->sort_by_score = NULL;
    hsp_stream->x_lock = NULL;
    hsp_stream->writer = writer;
    hsp_stream->writer_initialized = FALSE;
    hsp_stream->writer_finalized = FALSE;
    hsp_stream->pre_pipe = NULL;
    hsp_stream->tback_pipe = NULL;

    hsp_stream->sorted_hsplists = (BlastHSPList **)malloc(
        hsp_stream->num_hsplists_alloc * sizeof(BlastHSPList *));
    if (hsp_stream->sorted_hsplists == NULL) {
        free(hsp_stream);
        return NULL;
    }

    /* This is needed to meet a pre-condition of the composition-based
     * statistics code */
    if ((Blast_QueryIsProtein(program) || Blast_QueryIsPssm(program)) &&
        extn_opts->compositionBasedStats != 0) {
        hsp_stream->sort_by_score =
            (SSortByScoreStruct*)calloc(1, sizeof(SSortByScoreStruct));
        if (hsp_stream->sort_by_score == NULL) {
            free(hsp_stream->sorted_hsplists);
            free(hsp_stream);
            return NULL;
        }
        hsp_stream->sort_by_score->sort_on_read = sort_on_read;
        hsp_stream->sort_by_score->first_query_index = 0;
    }

    /* Allocated last so that the failure paths above only need free().
     * NOTE(review): the return value is stored unchecked, exactly as before;
     * whether Blast_HSPResultsNew can return NULL is not visible here --
     * confirm and handle if so. */
    hsp_stream->results = Blast_HSPResultsNew(num_queries);

    return hsp_stream;
}
void CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries() { _ASSERT( !m_QueryChunkFactories.empty() ); const EBlastProgramType kProgram = m_Options->GetProgramType(); _ASSERT(Blast_QueryIsTranslated(kProgram)); const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo(); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk(); #endif const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH; CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories, m_Options); CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData); vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0); for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]); CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options)); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk(); #endif // BlastQueryInfo structure corresponding to chunk number chunk_num chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo(); _ASSERT(chunk_qinfo[chunk_num]); // In case the first context differs from 0, for consistency with the // other data returned by this class... 
for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) { m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); } for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) { size_t correction = 0; const int starting_chunk = ctx_translator.GetStartingChunk(chunk_num, ctx); const int absolute_context = ctx_translator.GetAbsoluteContext(chunk_num, ctx); const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx); if (absolute_context == kInvalidContext || starting_chunk == kInvalidContext) { _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid ); // INT4_MAX is the sentinel value for invalid contexts m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); continue; } // The corrections for the contexts corresponding to the negative // strand in the last chunk of a query sequence are all 0 if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) && (chunk_num == (size_t)last_query_chunk) && (ctx % NUM_FRAMES >= 3)) { correction = 0; goto error_check; } // The corrections for the contexts corresponding to the plus // strand are always the same, so only calculate the first one if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) && (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) { correction = m_SplitBlk->GetContextOffsets(chunk_num).back(); goto error_check; } // If the query length is divisible by CODON_LENGTH, the // corrections for all contexts corresponding to a given strand are // the same, so only calculate the first one if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) && (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) { correction = m_SplitBlk->GetContextOffsets(chunk_num).back(); goto error_check; } // If the query length % CODON_LENGTH == 1, the corrections for the // first two contexts of the negative strand are the same, and the // correction for the last context is one more than that. 
if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) && !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) { if (ctx % NUM_FRAMES == 4) { correction = m_SplitBlk->GetContextOffsets(chunk_num).back(); goto error_check; } else if (ctx % NUM_FRAMES == 5) { correction = m_SplitBlk->GetContextOffsets(chunk_num).back() + 1; goto error_check; } } // If the query length % CODON_LENGTH == 2, the corrections for the // last two contexts of the negative strand are the same, which is // one more that the first context on the negative strand. if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) && !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) { if (ctx % NUM_FRAMES == 4) { correction = m_SplitBlk->GetContextOffsets(chunk_num).back() + 1; goto error_check; } else if (ctx % NUM_FRAMES == 5) { correction = m_SplitBlk->GetContextOffsets(chunk_num).back(); goto error_check; } } if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) { for (int c = chunk_num; c != starting_chunk; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); correction += prev_len - min(overlap, prev_len); } } else { size_t subtrahend = 0; for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); subtrahend += (curr_len - min(overlap, prev_len)); } correction = global_qinfo->contexts[absolute_context].query_length - subtrahend; } error_check: _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid)); m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction); #ifdef DEBUG_COMPARE_SEQUENCES { int global_offset = global_qinfo->contexts[absolute_context].query_offset + correction; 
int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset; int num_bases2compare = min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length); if (!cmp_sequence(&global_seq->sequence[global_offset], &chunk_seq->sequence[chunk_offset], num_bases2compare, Blast_QueryIsProtein(kProgram))) { cerr << "Failed to compare sequence data for chunk " << chunk_num << ", context " << ctx << endl; } } #endif } } _TRACE("CContextTranslator contents: " << ctx_translator); }
// Record the correction needed to synchronize with the complete query all the // query offsets within HSPs that fall within this context void CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries() { _ASSERT( !m_QueryChunkFactories.empty() ); const EBlastProgramType kProgram = m_Options->GetProgramType(); _ASSERT( !Blast_QueryIsTranslated(kProgram) ); const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo(); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk(); #endif const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram); CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories, m_Options); vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0); for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]); CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options)); #ifdef DEBUG_COMPARE_SEQUENCES const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk(); #endif // BlastQueryInfo structure corresponding to chunk number chunk_num chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo(); _ASSERT(chunk_qinfo[chunk_num]); // In case the first context differs from 0, for consistency with the // other data returned by this class... 
for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) { m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); } for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; ctx <= chunk_qinfo[chunk_num]->last_context; ctx++) { size_t correction = 0; const int starting_chunk = ctx_translator.GetStartingChunk(chunk_num, ctx); const int absolute_context = ctx_translator.GetAbsoluteContext(chunk_num, ctx); if (absolute_context == kInvalidContext || starting_chunk == kInvalidContext) { _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid ); // INT4_MAX is the sentinel value for invalid contexts m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX); continue; } if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) { for (int c = chunk_num; c != starting_chunk; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); correction += prev_len - min(overlap, prev_len); } } else { size_t subtrahend = 0; for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) { size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, c - 1, ctx_translator, absolute_context); size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c, ctx_translator, absolute_context); size_t overlap = min(kOverlap, curr_len); subtrahend += (curr_len - min(overlap, prev_len)); } correction = global_qinfo->contexts[absolute_context].query_length - subtrahend; } _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid)); m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction); #ifdef DEBUG_COMPARE_SEQUENCES { int global_offset = global_qinfo->contexts[absolute_context].query_offset + correction; int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset; if (!cmp_sequence(&global_seq->sequence[global_offset], &chunk_seq->sequence[chunk_offset], 10, Blast_QueryIsProtein(kProgram))) { cerr << 
"Failed to compare sequence data!" << endl; } } #endif } } _TRACE("CContextTranslator contents: " << ctx_translator); }
void CQuerySplitter::x_ComputeQueryContextsForChunks() { const EBlastProgramType kProgram = m_Options->GetProgramType(); const unsigned int kNumContexts = GetNumberOfContexts(kProgram); const ENa_strand kStrandOption = m_Options->GetStrandOption(); auto_ptr<CQueryDataPerChunk> qdpc; if (Blast_QueryIsTranslated(kProgram)) { qdpc.reset(new CQueryDataPerChunk(*m_SplitBlk, kProgram, m_LocalQueryData)); } for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) { vector<size_t> queries = m_SplitBlk->GetQueryIndices(chunk_num); for (size_t i = 0; i < queries.size(); i++) { CConstRef<CSeq_loc> sl = m_LocalQueryData->GetSeq_loc(queries[i]); const ENa_strand kStrand = BlastSetup_GetStrand(*sl, kProgram, kStrandOption); if (Blast_QueryIsTranslated(kProgram)) { size_t qlength = qdpc->GetQueryLength(queries[i]); int last_query_chunk = qdpc->GetLastChunk(queries[i]); _ASSERT(last_query_chunk != -1); int shift = s_GetShiftForTranslatedNegStrand(qlength); for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) { // handle the plus strand... if (ctx % NUM_FRAMES < CODON_LENGTH) { if (kStrand == eNa_strand_minus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } } else { // handle the negative strand if (kStrand == eNa_strand_plus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { if (chunk_num == (size_t)last_query_chunk) { // last chunk doesn't have shift m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ s_AddShift(ctx, shift)); } } } } } else if (Blast_QueryIsNucleotide(kProgram)) { for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) { // handle the plus strand... 
if (ctx % NUM_STRANDS == 0) { if (kStrand == eNa_strand_minus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } } else { // handle the negative strand if (kStrand == eNa_strand_plus) { m_SplitBlk->AddContextToChunk(chunk_num, kInvalidContext); } else { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]+ctx); } } } } else if (Blast_QueryIsProtein(kProgram)) { m_SplitBlk->AddContextToChunk(chunk_num, kNumContexts*queries[i]); } else { abort(); } } } }
int CBlastDemoApplication::Run(void) { // Get arguments const CArgs& args = GetArgs(); EProgram program = ProgramNameToEnum(args["program"].AsString()); bool db_is_aa = (program == eBlastp || program == eBlastx || program == eRPSBlast || program == eRPSTblastn); CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(program, CBlastOptions::eRemote)); ProcessCommandLineArgs(opts); opts->Validate(); // Can throw CBlastException::eInvalidOptions for invalid option. // This will dump the options to stderr. // opts->GetOptions().DebugDumpText(cerr, "opts", 1); CRef<CObjectManager> objmgr = CObjectManager::GetInstance(); if (!objmgr) { throw std::runtime_error("Could not initialize object manager"); } const bool is_protein = !!Blast_QueryIsProtein(opts->GetOptions().GetProgramType()); SDataLoaderConfig dlconfig(is_protein); CBlastInputSourceConfig iconfig(dlconfig, objects::eNa_strand_other, false, args["parse"].AsBoolean()); CBlastFastaInputSource fasta_input(args["in"].AsInputFile(), iconfig); CScope scope(*objmgr); CBlastInput blast_input(&fasta_input); TSeqLocVector query_loc = blast_input.GetAllSeqLocs(scope); CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query_loc)); const CSearchDatabase target_db(args["db"].AsString(), db_is_aa ? CSearchDatabase::eBlastDbIsProtein : CSearchDatabase::eBlastDbIsNucleotide); CRemoteBlast blaster(query_factory, opts, target_db); // This will dump a lot of stuff to stderr. // blaster.SetVerbose(); bool status = blaster.SubmitSync(); if (status == false) throw std::runtime_error("No results returned by SubmitSync"); cerr << "RID: " << blaster.GetRID() << '\n'; CSearchResultSet results = *blaster.GetResultSet(); CNcbiOstream& out = args["out"].AsOutputFile(); for (unsigned int i = 0; i < results.GetNumResults(); i++) { CConstRef<CSeq_align_set> sas = results[i].GetSeqAlign(); out << MSerial_AsnText << *sas; } return 0; }