Ejemplo n.º 1
0
CContextTranslator::CContextTranslator(const CSplitQueryBlk& sqb,
           vector< CRef<IQueryFactory> >* query_chunk_factories /* = NULL */,
           const CBlastOptions* options /* = NULL */)
{
    const size_t kNumChunks(sqb.GetNumChunks());
    m_ContextsPerChunk.reserve(kNumChunks);
    for (size_t i = 0; i < kNumChunks; i++) {
        m_ContextsPerChunk.push_back(sqb.GetQueryContexts(i));
    }

    if (query_chunk_factories == NULL || options == NULL) {
        return;
    }

    /// Populate the data to print out
    m_StartingChunks.resize(kNumChunks);
    m_AbsoluteContexts.resize(kNumChunks);
    for (size_t i = 0; i < kNumChunks; i++) {
        CRef<IQueryFactory> chunk_qf((*query_chunk_factories)[i]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(options));
        BlastQueryInfo* chunk_qinfo = chunk_qd->GetQueryInfo();
        for (Int4 ctx = chunk_qinfo->first_context;
             ctx <= chunk_qinfo->last_context; ctx++) {
            m_StartingChunks[i].push_back(GetStartingChunk(i, ctx));
            m_AbsoluteContexts[i].push_back(GetAbsoluteContext(i, ctx));
        }
    }
}
Ejemplo n.º 2
0
void
CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT(Blast_QueryIsTranslated(kProgram));
    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    const size_t kOverlap = 
        SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH;
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; 
             ctx <= chunk_qinfo[chunk_num]->last_context; 
             ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);
            const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);

            if (absolute_context == kInvalidContext || 
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            // The corrections for the contexts corresponding to the negative
            // strand in the last chunk of a query sequence are all 0
            if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
                (chunk_num == (size_t)last_query_chunk) && 
                (ctx % NUM_FRAMES >= 3)) {
                correction = 0;
                goto error_check;
            }

            // The corrections for the contexts corresponding to the plus
            // strand are always the same, so only calculate the first one
            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) && 
                (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length is divisible by CODON_LENGTH, the
            // corrections for all contexts corresponding to a given strand are
            // the same, so only calculate the first one
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
                (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
                correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
                goto error_check;
            }

            // If the query length % CODON_LENGTH == 1, the corrections for the
            // first two contexts of the negative strand are the same, and the
            // correction for the last context is one more than that.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                if (ctx % NUM_FRAMES == 4) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                }
            }
                
            // If the query length % CODON_LENGTH == 2, the corrections for the
            // last two contexts of the negative strand are the same, which is
            // one more that the first context on the negative strand.
            if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
                !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                if (ctx % NUM_FRAMES == 4) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
                    goto error_check;
                } else if (ctx % NUM_FRAMES == 5) {
                    correction = 
                        m_SplitBlk->GetContextOffsets(chunk_num).back();
                    goto error_check;
                }
            }

            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }

            } else {

                size_t subtrahend = 0;

                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;
            }

error_check:
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
#ifdef DEBUG_COMPARE_SEQUENCES
{
    int global_offset = global_qinfo->contexts[absolute_context].query_offset +
        correction;
    int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
    int num_bases2compare = 
        min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
    if (!cmp_sequence(&global_seq->sequence[global_offset], 
                      &chunk_seq->sequence[chunk_offset], 
                      num_bases2compare, Blast_QueryIsProtein(kProgram))) {
        cerr << "Failed to compare sequence data for chunk " << chunk_num
             << ", context " << ctx << endl;
    }
}
#endif
        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}
Ejemplo n.º 3
0
// Record the correction needed to synchronize with the complete query all the
// query offsets within HSPs that fall within this context
void
CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries()
{
    _ASSERT( !m_QueryChunkFactories.empty() );

    const EBlastProgramType kProgram = m_Options->GetProgramType();
    _ASSERT( !Blast_QueryIsTranslated(kProgram) );
    const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
#ifdef DEBUG_COMPARE_SEQUENCES
    const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
#endif
    const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram);
    CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
                                      m_Options);
    vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);

    for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
        CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
        CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
#ifdef DEBUG_COMPARE_SEQUENCES
        const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
#endif

        // BlastQueryInfo structure corresponding to chunk number chunk_num
        chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
        _ASSERT(chunk_qinfo[chunk_num]);

        // In case the first context differs from 0, for consistency with the
        // other data returned by this class...
        for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
        }

        for (Int4 ctx = chunk_qinfo[chunk_num]->first_context; 
             ctx <= chunk_qinfo[chunk_num]->last_context; 
             ctx++) {

            size_t correction = 0;
            const int starting_chunk =
                ctx_translator.GetStartingChunk(chunk_num, ctx);
            const int absolute_context =
                ctx_translator.GetAbsoluteContext(chunk_num, ctx);

            if (absolute_context == kInvalidContext || 
                starting_chunk == kInvalidContext) {
                _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
                // INT4_MAX is the sentinel value for invalid contexts
                m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
                continue;
            }

            if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {

                for (int c = chunk_num; c != starting_chunk; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    correction += prev_len - min(overlap, prev_len);
                }

            } else {

                size_t subtrahend = 0;

                for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
                    size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo, 
                                                         c - 1,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
                                                         c,
                                                         ctx_translator,
                                                         absolute_context);
                    size_t overlap = min(kOverlap, curr_len);
                    subtrahend += (curr_len - min(overlap, prev_len));
                }
                correction =
                    global_qinfo->contexts[absolute_context].query_length -
                    subtrahend;

            }
            _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
            m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
#ifdef DEBUG_COMPARE_SEQUENCES
{
    int global_offset = global_qinfo->contexts[absolute_context].query_offset +
        correction;
    int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
    if (!cmp_sequence(&global_seq->sequence[global_offset], 
                      &chunk_seq->sequence[chunk_offset], 10,
                      Blast_QueryIsProtein(kProgram))) {
        cerr << "Failed to compare sequence data!" << endl;
    }
}
#endif

        }
    }
    _TRACE("CContextTranslator contents: " << ctx_translator);
}