Esempio n. 1
0
/** Convert a context number into the index of the query it belongs to.
 * @param context Context number to convert [in]
 * @param program BLAST program type; selects the divisor applied [in]
 * @return context unchanged for eBlastTypePsiTblastn and for programs with a
 *         protein query; context / NUM_FRAMES for programs with a translated
 *         query; context / NUM_STRANDS for every other program.
 */
Int4
Blast_GetQueryIndexFromContext(Int4 context, EBlastProgramType program)
{
   /* The context number is already the query index in these cases */
   if (program == eBlastTypePsiTblastn || Blast_QueryIsProtein(program)) {
      return context;
   }

   /* Translated queries: NUM_FRAMES consecutive contexts share one query */
   if (Blast_QueryIsTranslated(program)) {
      return context / NUM_FRAMES;
   }

   /* Everything else: NUM_STRANDS consecutive contexts share one query */
   return context / NUM_STRANDS;
}
Esempio n. 2
0
/** Allocate and initialize a new HSP stream.
 *
 * @param program BLAST program type; stored in the stream and used to decide
 *                whether the sort-by-score structure is needed [in]
 * @param extn_opts Extension options; only compositionBasedStats is read.
 *                  A NULL pointer is treated as "no composition-based
 *                  statistics" [in]
 * @param sort_on_read Copied into the sort-by-score structure when that
 *                     structure is allocated; ignored otherwise [in]
 * @param num_queries Passed to Blast_HSPResultsNew to create the results
 *                    structure [in]
 * @param writer HSP writer; the pointer is stored in the stream as is [in]
 * @return Newly allocated stream, or NULL if one of the allocations made
 *         directly by this function fails (nothing is leaked in that case).
 */
BlastHSPStream* 
BlastHSPStreamNew(EBlastProgramType program, 
                  const BlastExtensionOptions* extn_opts,
                  Boolean sort_on_read,
                  Int4 num_queries,
                  BlastHSPWriter *writer)
{
    BlastHSPStream* hsp_stream = 
       (BlastHSPStream*) malloc(sizeof(BlastHSPStream));

    if (hsp_stream == NULL) {
        return NULL;
    }

    hsp_stream->program = program;

    hsp_stream->num_hsplists = 0;
    hsp_stream->num_hsplists_alloc = 100;
    hsp_stream->sorted_hsplists = (BlastHSPList **)malloc(
                                           hsp_stream->num_hsplists_alloc *
                                           sizeof(BlastHSPList *));
    if (hsp_stream->sorted_hsplists == NULL) {
        free(hsp_stream);
        return NULL;
    }

    hsp_stream->results_sorted = FALSE;

    /* This is needed to meet a pre-condition of the composition-based
     * statistics code */
    if ((Blast_QueryIsProtein(program) || Blast_QueryIsPssm(program)) &&
        extn_opts != NULL &&
        extn_opts->compositionBasedStats != 0) {
        hsp_stream->sort_by_score = 
            (SSortByScoreStruct*)calloc(1, sizeof(SSortByScoreStruct));
        if (hsp_stream->sort_by_score == NULL) {
            free(hsp_stream->sorted_hsplists);
            free(hsp_stream);
            return NULL;
        }
        hsp_stream->sort_by_score->sort_on_read = sort_on_read;
        hsp_stream->sort_by_score->first_query_index = 0;
    } else {
        hsp_stream->sort_by_score = NULL;
    }

    /* Created after the allocations above so that their failure paths only
     * have to release memory obtained with malloc().
     * NOTE(review): the returned pointer is stored unchecked, as before;
     * confirm whether Blast_HSPResultsNew can return NULL and how callers
     * should react. */
    hsp_stream->results = Blast_HSPResultsNew(num_queries);

    hsp_stream->x_lock = NULL;
    hsp_stream->writer = writer;
    hsp_stream->writer_initialized = FALSE;
    hsp_stream->writer_finalized = FALSE;
    hsp_stream->pre_pipe = NULL;
    hsp_stream->tback_pipe = NULL;

    return hsp_stream;
}
Esempio n. 3
0
// Computes, for every chunk of a translated query, the offset ("correction")
// of each of its contexts and records it through
// m_SplitBlk->AddContextOffsetToChunk(). Exactly one value is recorded per
// context index in [0, last_context] of each chunk: INT4_MAX for contexts
// below first_context and for invalid contexts, the computed correction
// otherwise.
void
CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT(Blast_QueryIsTranslated(kProgram));
    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    // Chunk overlap divided by CODON_LENGTH (presumably to express it in
    // units of the translated sequence -- TODO confirm)
    const size_t kOverlap = 
        SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH;
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);
    // Filled in as the chunks are visited; the walk-back loops below read the
    // entries of the current and of earlier chunks
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; 
             ctx <= chunk_qinfo[chunk_num]->last_context; 
             ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);
            const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);

            if (absolute_context == kInvalidContext || 
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            // Each special case below sets the correction and jumps straight
            // to error_check, skipping the general computation. Several of
            // them reuse GetContextOffsets(chunk_num).back(), i.e. the offset
            // most recently recorded for this chunk.

            // The corrections for the contexts corresponding to the negative
            // strand in the last chunk of a query sequence are all 0
            if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (chunk_num == (size_t)last_query_chunk) && 
                (ctx % NUM_FRAMES >= 3)) {
                correction = 0;
                goto error_check;
            }

            // The corrections for the contexts corresponding to the plus
            // strand are always the same, so only calculate the first one
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) && 
                (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length is divisible by CODON_LENGTH, the
            // corrections for all contexts corresponding to a given strand are
            // the same, so only calculate the first one
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
                (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length % CODON_LENGTH == 1, the corrections for the
            // first two contexts of the negative strand are the same, and the
            // correction for the last context is one more than that.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                if (ctx % NUM_FRAMES == 4) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                }
            }
                
            // If the query length % CODON_LENGTH == 2, the corrections for the
            // last two contexts of the negative strand are the same, which is
            // one more than the first context on the negative strand.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                if (ctx % NUM_FRAMES == 4) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                }
            }

            // General case, reached only when none of the shortcuts applied
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                // Walk back from this chunk to the chunk in which the context
                // starts, adding the length of each preceding chunk minus the
                // part that overlaps with the chunk following it
                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }

            } else {

                // Negative strand: accumulate the non-overlapping length of
                // this chunk and of every earlier chunk down to the starting
                // one, then subtract the total from the length of the context
                // in the complete query
                size_t subtrahend = 0;

                // NOTE(review): when c == 0 the call below receives a chunk
                // number of -1; presumably s_GetAbsoluteContextLength handles
                // that -- confirm
                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                // NOTE(review): unsigned arithmetic; this wraps around if
                // subtrahend ever exceeds query_length
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;
            }

            // Despite its name, this label is the common exit point of every
            // valid context: the correction is recorded here
error_check:
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
#ifdef DEBUG_COMPARE_SEQUENCES
// Debug-only sanity check: compare up to 10 sequence positions of the chunk
// with the complete query at the computed offset
{
    int global_offset = global_qinfo->contexts[absolute_context].query_offset +
        correction;
    int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
    int num_bases2compare = 
        min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
    if (!cmp_sequence(&global_seq->sequence[global_offset], 
                      &chunk_seq->sequence[chunk_offset], 
                      num_bases2compare, Blast_QueryIsProtein(kProgram))) {
        cerr << "Failed to compare sequence data for chunk " << chunk_num
             << ", context " << ctx << endl;
    }
}
#endif
        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}
Esempio n. 4
0
// Record the correction needed to synchronize with the complete query all the
// query offsets within HSPs that fall within this context
void
CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT( !Blast_QueryIsTranslated(kProgram) );
    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram);
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; 
             ctx <= chunk_qinfo[chunk_num]->last_context; 
             ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);

            if (absolute_context == kInvalidContext || 
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }

            } else {

                size_t subtrahend = 0;

                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;

            }
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
#ifdef DEBUG_COMPARE_SEQUENCES
{
    int global_offset = global_qinfo->contexts[absolute_context].query_offset +
        correction;
    int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
    if (!cmp_sequence(&global_seq->sequence[global_offset], 
                      &chunk_seq->sequence[chunk_offset], 10,
                      Blast_QueryIsProtein(kProgram))) {
        cerr << "Failed to compare sequence data!" << endl;
    }
}
#endif

        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}
Esempio n. 5
0
void
CQuerySplitter::x_ComputeQueryContextsForChunks()
{
    const EBlastProgramType kProgram = m_Options->GetProgramType();
    const unsigned int kNumContexts = GetNumberOfContexts(kProgram);
    const ENa_strand kStrandOption = m_Options->GetStrandOption();
    auto_ptr<CQueryDataPerChunk> qdpc;
    
    if (Blast_QueryIsTranslated(kProgram)) {
        qdpc.reset(new CQueryDataPerChunk(*m_SplitBlk, kProgram, 
                                          m_LocalQueryData));
    }

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        vector<size_t> queries = m_SplitBlk->GetQueryIndices(chunk_num);

        for (size_t i = 0; i < queries.size(); i++) {
            CConstRef<CSeq_loc> sl = m_LocalQueryData->GetSeq_loc(queries[i]);
            const ENa_strand kStrand = 
                BlastSetup_GetStrand(*sl, kProgram, kStrandOption);

            if (Blast_QueryIsTranslated(kProgram)) {
                size_t qlength = qdpc->GetQueryLength(queries[i]);
                int last_query_chunk = qdpc->GetLastChunk(queries[i]);
                _ASSERT(last_query_chunk != -1);
                int shift = s_GetShiftForTranslatedNegStrand(qlength);

                for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
                    // handle the plus strand...
                    if (ctx % NUM_FRAMES < CODON_LENGTH) {
                        if (kStrand == eNa_strand_minus) {
                            m_SplitBlk->AddContextToChunk(chunk_num,
                                                          kInvalidContext);
                        } else {
                            m_SplitBlk->AddContextToChunk(chunk_num, 
                                              kNumContexts*queries[i]+ctx);
                        }
                    } else { // handle the negative strand
                        if (kStrand == eNa_strand_plus) {
                            m_SplitBlk->AddContextToChunk(chunk_num,
                                                          kInvalidContext);
                        } else {
                            if (chunk_num == (size_t)last_query_chunk) {
                                // last chunk doesn't have shift
                                m_SplitBlk->AddContextToChunk(chunk_num,
                                          kNumContexts*queries[i]+ctx);
                            } else {
                                m_SplitBlk->AddContextToChunk(chunk_num,
                                          kNumContexts*queries[i]+
                                          s_AddShift(ctx, shift));
                            }
                        }
                    }
                }
            } else if (Blast_QueryIsNucleotide(kProgram)) {

                for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
                    // handle the plus strand...
                    if (ctx % NUM_STRANDS == 0) {
                        if (kStrand == eNa_strand_minus) {
                            m_SplitBlk->AddContextToChunk(chunk_num,
                                                          kInvalidContext);
                        } else {
                            m_SplitBlk->AddContextToChunk(chunk_num, 
                                              kNumContexts*queries[i]+ctx);
                        }
                    } else { // handle the negative strand
                        if (kStrand == eNa_strand_plus) {
                            m_SplitBlk->AddContextToChunk(chunk_num,
                                                          kInvalidContext);
                        } else {
                            m_SplitBlk->AddContextToChunk(chunk_num, 
                                              kNumContexts*queries[i]+ctx);
                        }
                    }
                }

            } else if (Blast_QueryIsProtein(kProgram)) {
                m_SplitBlk->AddContextToChunk(chunk_num, 
                                              kNumContexts*queries[i]);
            } else {
                abort();
            }
        }
    }
}
Esempio n. 6
0
int CBlastDemoApplication::Run(void)
{
    // Get arguments
    const CArgs& args = GetArgs();

    EProgram program = ProgramNameToEnum(args["program"].AsString());

    bool db_is_aa = (program == eBlastp || program == eBlastx ||
                     program == eRPSBlast || program == eRPSTblastn);

    CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(program, CBlastOptions::eRemote));

    ProcessCommandLineArgs(opts);

    opts->Validate();  // Can throw CBlastException::eInvalidOptions for invalid option.

    // This will dump the options to stderr.
    // opts->GetOptions().DebugDumpText(cerr, "opts", 1);

    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
    if (!objmgr) {
         throw std::runtime_error("Could not initialize object manager");
    }

    const bool is_protein = 
        !!Blast_QueryIsProtein(opts->GetOptions().GetProgramType());
    SDataLoaderConfig dlconfig(is_protein);
    CBlastInputSourceConfig iconfig(dlconfig, objects::eNa_strand_other, false, 
                              args["parse"].AsBoolean());
    CBlastFastaInputSource fasta_input(args["in"].AsInputFile(), iconfig);
    CScope scope(*objmgr);

    CBlastInput blast_input(&fasta_input);

    TSeqLocVector query_loc = blast_input.GetAllSeqLocs(scope);

    CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query_loc));

    const CSearchDatabase target_db(args["db"].AsString(),
        db_is_aa ? CSearchDatabase::eBlastDbIsProtein : CSearchDatabase::eBlastDbIsNucleotide);

    CRemoteBlast blaster(query_factory, opts, target_db);

// This will dump a lot of stuff to stderr.
//    blaster.SetVerbose();

    bool status = blaster.SubmitSync();

    if (status == false)
         throw std::runtime_error("No results returned by SubmitSync");

    cerr << "RID: " << blaster.GetRID() << '\n';

    CSearchResultSet results = *blaster.GetResultSet();

    CNcbiOstream& out = args["out"].AsOutputFile();

    for (unsigned int i = 0; i < results.GetNumResults(); i++) {
         CConstRef<CSeq_align_set> sas = results[i].GetSeqAlign();
         out << MSerial_AsnText << *sas;
    }

    return 0;
}