Exemple #1
0
//------------------------------------------------------------------------------
/// Runs the SEG filter over a protein sequence and returns the masked ranges.
///
/// @param data
///   Sequence to filter. It must be a protein sequence and must be encoded
///   as CSeq_data::e_Ncbistdaa.
/// @return
///   A newly allocated TMaskList holding one (left, right) pair per location
///   reported by SeqBufferSeg. Ownership is handed to the caller.
/// @throws logic_error
///   If the sequence is not protein or is not in ncbistdaa coding.
/// @throws runtime_error
///   If SeqBufferSeg returns a non-zero status.
CSegMasker::TMaskList*
CSegMasker::operator()(const objects::CSeqVector & data)
{
    if ( !data.IsProtein() ) {
        throw logic_error("SEG can only filter protein sequences");
    }

    if (data.GetCoding() != CSeq_data::e_Ncbistdaa ) {
        throw logic_error("SEG expects protein sequences in ncbistdaa format");
    }

    string sequence;
    BlastSeqLoc* seq_locs = NULL;
    data.GetSeqData(data.begin(), data.end(), sequence);

    Int2 status = SeqBufferSeg((Uint1*)(sequence.data()),
                               static_cast<Int4>(sequence.size()), 0,
                               m_SegParameters, &seq_locs);
    // The raw residues are not needed once the filter has run.
    sequence.erase();
    if (status != 0) {
        seq_locs = BlastSeqLocFree(seq_locs);
        throw runtime_error("SEG internal error (check that input is protein) " + NStr::IntToString(status));
    }

    // seq_locs is a plain C list with no owner object, so any exception
    // raised while the result is being built (allocation failure in new or
    // push_back) must release it explicitly before propagating.
    auto_ptr<TMaskList> retval;
    try {
        retval.reset(new TMaskList);
        for (BlastSeqLoc* itr = seq_locs; itr; itr = itr->next) {
            retval->push_back
                (TMaskList::value_type(itr->ssr->left, itr->ssr->right));
        }
    }
    catch (...) {
        seq_locs = BlastSeqLocFree(seq_locs);
        throw;
    }

    seq_locs = BlastSeqLocFree(seq_locs);

    return retval.release();
}
Exemple #2
0
/* Releases a BlastMaskLoc together with every per-context location list
 * stored in its seqloc_array.
 *
 * mask_loc: structure to free; NULL is accepted and is a no-op.
 * Returns NULL in all cases.
 */
BlastMaskLoc* BlastMaskLocFree(BlastMaskLoc* mask_loc)
{
   if (mask_loc != NULL)
   {
      /* The per-context lists can only be walked when the array exists. */
      if (mask_loc->seqloc_array != NULL)
      {
         Int4 ctx;

         for (ctx = 0; ctx < mask_loc->total_size; ctx++)
            BlastSeqLocFree(mask_loc->seqloc_array[ctx]);
      }
      sfree(mask_loc->seqloc_array);
      sfree(mask_loc);
   }
   return NULL;
}
Exemple #3
0
/* Replaces, in place, the nucleotide-coordinate mask lists held in mask_loc
 * with lists expressed in translated (protein) coordinates, one list per
 * translation frame of every query.
 *
 * mask_loc:   masks to convert; when NULL nothing is done.
 * query_info: supplies the number of queries, the nucleotide length of each
 *             query and the per-context translated lengths used for clamping.
 *
 * The original nucleotide lists are freed. Always returns 0.
 */
Int2 BlastMaskLocDNAToProtein(BlastMaskLoc* mask_loc, 
                              const BlastQueryInfo* query_info)
{
    BlastSeqLoc* saved_dna_locs[NUM_FRAMES];
    Uint4 query_idx;

    if (mask_loc == NULL) {
        return 0;
    }

    /* The mask array must have exactly one slot per context. */
    ASSERT(mask_loc->total_size == query_info->last_context + 1);

    /* One pass per nucleotide query */
    for (query_idx = 0; query_idx < (Uint4)query_info->num_queries; 
         ++query_idx) { 
        const Uint4 first_ctx = NUM_FRAMES * query_idx;
        const Int4 nucl_len = BlastQueryInfoGetQueryLength(query_info,
                                                           eBlastTypeBlastx,
                                                           query_idx);
        Int4 frame_idx;

        /* Move this query's nucleotide lists aside and empty their slots;
         * the slots are refilled below with the translated lists. */
        for (frame_idx = 0; frame_idx < NUM_FRAMES; ++frame_idx) {
            saved_dna_locs[frame_idx] =
                mask_loc->seqloc_array[first_ctx + frame_idx];
            mask_loc->seqloc_array[first_ctx + frame_idx] = NULL;
        }

        /* Build the translated list for each frame */
        for (frame_idx = 0; frame_idx < NUM_FRAMES; ++frame_idx) {
            const Int2 frame = BLAST_ContextToFrame(eBlastTypeBlastx,
                                                    frame_idx);
            const Uint4 prot_ctx = first_ctx + frame_idx;
            BlastSeqLoc* source = saved_dna_locs[frame_idx];
            BlastSeqLoc* tail = NULL;
            BlastSeqLoc* node = NULL;

            /* A frame without its own masks borrows those of the first
             * frame (which may itself be empty). */
            if (source == NULL) {
                source = saved_dna_locs[0];
            }

            for (node = source; node != NULL; node = node->next) {
                const SSeqRange* range = node->ssr;
                BlastSeqLoc** append_at = NULL;
                Int4 from, to;

                /* Input masks are expected to be 0-offset and inside the
                 * nucleotide sequence. */
                ASSERT(range->right < nucl_len);
                ASSERT(range->left  >= 0);

                if (frame >= 0) {
                    from = (range->left - frame + 1)/CODON_LENGTH;
                    to = (range->right - frame + 1)/CODON_LENGTH;
                } else {
                    /* Reverse strand: the right end maps to the start. */
                    from = (nucl_len + frame - range->right)/CODON_LENGTH;
                    to = (nucl_len + frame - range->left)/CODON_LENGTH;
                }

                /* Clamp both ends: first to zero, then to the last residue
                 * of the translated context. */
                if (from < 0) {
                    from = 0;
                }
                if (to < 0) {
                    to = 0;
                }
                if (from >= query_info->contexts[prot_ctx].query_length) {
                    from = query_info->contexts[prot_ctx].query_length - 1;
                }
                if (to >= query_info->contexts[prot_ctx].query_length) {
                    to = query_info->contexts[prot_ctx].query_length - 1;
                }

                ASSERT(from >= 0);
                ASSERT(to   >= 0);
                ASSERT(from < query_info->contexts[prot_ctx].query_length);
                ASSERT(to   < query_info->contexts[prot_ctx].query_length);

                /* Append after the most recently added node when there is
                 * one, so the list is not re-walked from its head; the very
                 * first node goes into the context's slot. */
                append_at = tail ? &tail : &mask_loc->seqloc_array[prot_ctx];
                tail = BlastSeqLocNew(append_at, from, to);
            }
        }

        /* The nucleotide lists have been translated; release them. */
        for (frame_idx = 0; frame_idx < NUM_FRAMES; ++frame_idx) {
            BlastSeqLocFree(saved_dna_locs[frame_idx]);
        }
    }

    return 0;
}
Exemple #4
0
/* Runs a BLAST search of the query against the sequence source.
 *
 * Steps, in order: option validation; dust masking of the query (blastn
 * only); RPS-specific set-up (RPS programs only); query set-up; main set-up
 * of score block, masks and lookup segments; optional PSSM set-up from a
 * PSI-BLAST checkpoint; export of the filter locations; lookup table
 * construction; PHI pattern bookkeeping (PHI programs only); HSP stream
 * set-up; the search itself; release of the search structures.
 *
 * query_seqloc:   query location(s); required.
 * psi_checkpoint: optional checkpoint used to set up the PSSM.
 * seq_src:        sequence source to search; required.
 * masking_locs:   optional caller-supplied mask locations. Dust mask nodes
 *                 are linked onto this list temporarily and unlinked again
 *                 after set-up.
 * options:        search options; required.
 * tf_data:        passed through to HSP stream set-up and the search.
 * results:        receives the search results.
 * filter_out:     if non-NULL, receives the mask locations as a SeqLoc.
 * extra_returns:  receives error messages; required.
 *
 * Returns -1 if a required argument is NULL, otherwise the status of the
 * first step that failed, or 0 on success.
 *
 * NOTE(review): the early error returns after BLAST_SetUpQuery do not release
 * the structures allocated so far, and the ones before the masking_locs
 * clean-up leave any dust nodes linked into the caller's list. Fixing that
 * needs the NULL-safety of the individual free routines to be confirmed.
 */
Int2
Blast_RunSearch(SeqLoc* query_seqloc,
                Blast_PsiCheckpointLoc * psi_checkpoint,
                const BlastSeqSrc* seq_src,
                SeqLoc* masking_locs,
                const SBlastOptions* options,
                BlastTabularFormatData* tf_data,
                BlastHSPResults **results,
                SeqLoc** filter_out,
                Blast_SummaryReturn* extra_returns)
{
    Int2 status = 0;
    BLAST_SequenceBlk *query = NULL;
    BlastQueryInfo* query_info = NULL;
    double scale_factor = 1.0;
    BlastSeqLoc* lookup_segments = NULL;
    BlastScoreBlk* sbp = NULL;
    LookupTableWrap* lookup_wrap = NULL;
    BlastMaskLoc* mask_loc = NULL;
    BlastHSPStream* hsp_stream = NULL;
    EBlastProgramType program;
    Boolean is_rps_blast;
    Boolean is_phi_blast;
    BlastRPSInfo* rps_info = NULL;
    Nlm_MemMapPtr rps_mmap = NULL;
    Nlm_MemMapPtr rps_pssm_mmap = NULL;
    const QuerySetUpOptions* query_options = NULL;
    const LookupTableOptions* lookup_options = NULL;
    const BlastScoringOptions* score_options = NULL;
    const BlastHitSavingOptions* hit_options = NULL;
    SBlastOptions* rps_options = NULL;
    /* Marker stored in the choice field of dust mask nodes so they can be
       recognised and removed from masking_locs later. */
    const Uint1 kDeallocateMe = 253;
    Blast_Message *core_msg = NULL;

    if (!query_seqloc || !seq_src || !options || !extra_returns) 
        return -1;

    /* Everything derived from options is read only after the NULL check
       above; reading it in the declarations would dereference a NULL
       pointer before the check could reject it. */
    program = options->program;
    is_rps_blast = (program == eBlastTypeRpsBlast ||
                    program == eBlastTypeRpsTblastn);
    is_phi_blast = Blast_ProgramIsPhiBlast(program);
    query_options = options->query_options;
    lookup_options = options->lookup_options;
    score_options = options->score_options;
    hit_options = options->hit_options;

    if ((status = 
         BLAST_ValidateOptions(program, options->ext_options, score_options, 
                               lookup_options, options->word_options, hit_options, 
                               &core_msg)) != 0) {
        extra_returns->error = Blast_MessageToSBlastMessage(core_msg, NULL, NULL, options->believe_query);
        core_msg = Blast_MessageFree(core_msg);

        return status;
    }

    if (program == eBlastTypeBlastn)
    {
         SeqLoc* dust_mask = NULL; /* Dust mask locations */
         Blast_FindDustSeqLoc(query_seqloc, options, &dust_mask);
         /* Combine dust mask with lower case mask 
            The dust mask will be deallocated by the end of this function
            though as it's copied in BLAST_MainSetUp 
            Not deallocating it will result in a memory leak if masking_locs
            was NULL at the start of this function */
         if (dust_mask)
         {
            SeqLoc* dust_mask_var = dust_mask;
            while (dust_mask_var)
            {
               dust_mask_var->choice = kDeallocateMe;
               dust_mask_var = dust_mask_var->next;
            }
            ValNodeLink(&masking_locs, dust_mask);
         }
    }

    if (is_rps_blast) {
        if ((status = 
             s_RPSExtraStructsSetUp(seq_src, options, &rps_options, &rps_info, 
                                    &rps_mmap, &rps_pssm_mmap, &scale_factor, 
                                    extra_returns)))
            return status;
        score_options = rps_options->score_options;
        hit_options = rps_options->hit_options;
        options = rps_options; /* This will not change the caller's pointer. */
    }

    if ((status = BLAST_SetUpQuery(program, query_seqloc, query_options, 
                                   masking_locs, &query_info, &query))) {
        SBlastMessageWrite(&extra_returns->error, SEV_ERROR,  
                "BLAST_SetUpQuery returned non-zero status\n", NULL, FALSE);
        return status;
    }

    status = 
        BLAST_MainSetUp(program, query_options, score_options, query, 
                        query_info, scale_factor, &lookup_segments, &mask_loc,
                        &sbp, &core_msg, s_BlastFindMatrixPath);
    /* A message may be produced whether or not the set-up failed. */
    if (core_msg)
    {
       extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
       core_msg = Blast_MessageFree(core_msg);
    }

    if (status)
        return status;

    if (psi_checkpoint) {
        core_msg = NULL;
        status = s_SetupScoreBlkPssmFromChkpt(sbp, query, psi_checkpoint,
                                              &core_msg);
        if (core_msg) {
            extra_returns->error =
                Blast_MessageToSBlastMessage(core_msg, query_seqloc,
                                             query_info,
                                             options->believe_query);
            core_msg = Blast_MessageFree(core_msg);
        }
        if (status)
            return status;
    }
    if (filter_out) {
        *filter_out = 
            BlastMaskLocToSeqLoc(program, mask_loc, query_seqloc);
    }

    /* Mask locations in BlastMaskLoc form are no longer needed. */
    BlastMaskLocFree(mask_loc);

    /* Unlink and free every dust node that was tagged with kDeallocateMe,
       leaving the caller's own mask nodes in place. */
    if (masking_locs)
    {
          SeqLocPtr slp_var = masking_locs;
          SeqLocPtr last = NULL;
          while (slp_var)
          {
              if (slp_var->choice == kDeallocateMe)
              {
                  if (last == NULL)
                  {
                     /* Tagged node is the current head of the list. */
                     masking_locs = slp_var->next;
                     slp_var->next = NULL;
                     Blast_ValNodeMaskListFree(slp_var);
                     slp_var = masking_locs;
                  }
                  else
                  {
                     last->next = slp_var->next;
                     slp_var->next = NULL;
                     Blast_ValNodeMaskListFree(slp_var);
                     slp_var = last->next;
                  }
              } 
              else
              {
                  last = slp_var;
                  slp_var = slp_var->next;
              }
          }
    }

    status = LookupTableWrapInit(query, lookup_options, query_options,
                        lookup_segments, sbp, &lookup_wrap, rps_info, &core_msg);
    if (core_msg)
    {
          extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
          core_msg = Blast_MessageFree(core_msg);
    }
    if (status)
        return status;

    /* For PHI BLAST, save information about pattern occurrences in
       query in the BlastQueryInfo structure. */
    if (is_phi_blast) {
        SPHIPatternSearchBlk* pattern_blk = 
            (SPHIPatternSearchBlk*) lookup_wrap->lut;
        Blast_SetPHIPatternInfo(program, pattern_blk, query, lookup_segments, 
                                query_info, &core_msg);
        if (core_msg)
        {
             extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
             core_msg = Blast_MessageFree(core_msg);
        }

    }
    /* Only need for the setup of lookup table. */
    lookup_segments = BlastSeqLocFree(lookup_segments);

    if ((status = s_BlastHSPStreamSetUp(query, query_info, seq_src, options, sbp,
                                        tf_data, &hsp_stream, extra_returns)))
        return status;

    if ((status = s_BlastThreadManager(query, query_info, seq_src, options,
                                       lookup_wrap, sbp, hsp_stream, rps_info, 
                                       tf_data, results, extra_returns)))
        return status;
    
    lookup_wrap = LookupTableWrapFree(lookup_wrap);
    
    query = BlastSequenceBlkFree(query);
    query_info = BlastQueryInfoFree(query_info);
    BlastScoreBlkFree(sbp);
    
    if (is_rps_blast)
        s_RPSExtraStructsFree(rps_info, rps_mmap, rps_pssm_mmap, rps_options);
    
    return status;
}
Exemple #5
0
/* -- SSH --
   Builds a single lookup table over the concatenation of all PSSM rows in
   combinedMatrix and writes it to a file. Used by RPS Blast.

   combinedMatrix: score rows of all profiles, one row per position.
   numProfiles:    number of profiles; also sizes the file header.
   seqlens:        profile boundaries; seqlens[numProfiles] - seqlens[0] is
                   taken as the total number of rows.
   filename:       output file, opened in "wb" mode.
   scalingFactor:  multiplied into the value read from myargs[3] before it
                   is passed to BLAST_FillLookupTableOptions.

   Returns FALSE if the file cannot be opened or the lookup table cannot be
   built, TRUE otherwise.

   NOTE(review): lookup_wrap_ptr is not released here; confirm whether it
   should be freed once the file has been written.
*/
Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles,
                            Int4Ptr seqlens, CharPtr filename, 
                            Nlm_FloatHi scalingFactor)
{
    BlastScoreBlk *sbp;
    FILE *fd;
    Int4  **posMatrix;
    Int4 i, header_size, all_length, magicNumber;
    Int4Ptr offsets;
    Int4 num_lookups;
    BlastSeqLoc *lookup_segment=NULL;
    BlastAaLookupTable *lookup;
    LookupTableWrap* lookup_wrap_ptr=NULL;
    LookupTableOptions* lookup_options;
   

    if((fd = FileOpen(filename, "wb")) == NULL)
        return FALSE;
    
    num_lookups = 1; /* Single lookup table for all set */

    all_length = seqlens[numProfiles] - seqlens[0];
    
    /* Row pointers into combinedMatrix, plus one extra row owned here */
    posMatrix = MemNew((all_length + 1) * sizeof(Int4 *));
    for (i = 0; i < all_length; i++) {
        posMatrix[i] = (Int4 *) &(combinedMatrix[i][0]);
    }
    
    /* Last row is necessary */
    posMatrix[all_length] = MemNew(sizeof(Int4) * PRO_ALPHABET_SIZE);

    for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
        posMatrix[all_length][i] = -INT2_MAX;
    }

    sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1);
    RPSPsiMatrixAttach(sbp, posMatrix);
    LookupTableOptionsNew(eBlastTypeBlastp, &lookup_options);
    BLAST_FillLookupTableOptions(lookup_options, eBlastTypePsiBlast, FALSE, 
	(Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue);


    BlastSeqLocNew(&lookup_segment, 0, all_length);

    /* Need query for psi-blast??  where to put the PSSM? */
    LookupTableWrapInit(NULL, lookup_options, NULL, lookup_segment, sbp, &lookup_wrap_ptr, NULL, NULL);
   
    RPSPsiMatrixDetach(sbp);
    sbp = BlastScoreBlkFree(sbp);
    lookup_options = LookupTableOptionsFree(lookup_options);
    lookup_segment = BlastSeqLocFree(lookup_segment);

    /* Without a lookup table there is nothing to dump; stop here instead of
       dereferencing a NULL pointer below. */
    if (lookup_wrap_ptr == NULL || lookup_wrap_ptr->lut == NULL) {
        FileClose(fd);
        MemFree(posMatrix[all_length]);
        MemFree(posMatrix);
        return FALSE;
    }

    lookup = (BlastAaLookupTable*) lookup_wrap_ptr->lut;

    /* Only Uint4 maximum length for lookup file allowed in current
       implementation */
    header_size = (numProfiles+1)*sizeof(Int4) + 8*sizeof(Int4);
    
    /* Beginning of file will be allocated for lookup offsets */
    fseek(fd, header_size, SEEK_SET);
    
    offsets = MemNew(sizeof(Int4) * (num_lookups + 1));
    
    /* File position where the table starts */
    offsets[0] = ftell(fd);
    
    RPSDumpLookupTable(lookup, fd);
    
    /* Last offset also recorded: file position just past the table */
    offsets[num_lookups] = ftell(fd);
    
    fseek(fd, 0, SEEK_SET);
    magicNumber = RPS_MAGIC_NUMBER;
    FileWrite(&magicNumber, sizeof(Int4), 1, fd); /* header[0] */
    FileWrite(&num_lookups, sizeof(Int4), 1, fd); /* header[1] */
    /* NOTE(review): header[2] and header[3] both receive neighbor_matches;
       confirm that the duplication is intended. */
    FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[2] */
    FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[3] */
    FileWrite(&lookup->overflow_size, sizeof(Int4), 1, fd); /* header[4] */
    
    /* Now writing recorded offsets in the beginning of the file */
    
    fseek(fd, 8*sizeof(Int4), SEEK_SET);
    FileWrite(offsets, sizeof(Int4), num_lookups + 1, fd);
    FileClose(fd);
    
    /* Final memory cleanup */
    
    MemFree(offsets);
    MemFree(posMatrix[all_length]);
    MemFree(posMatrix);

    return TRUE;
}