Exemple #1
0
// Score the alignment of two rows against each other.
//
// Walks every segment; for each segment in which both rows have a
// non-negative start, the residues of both rows are fetched and scored
// with the string-based CalculateScore() overload.  The per-segment
// scores are summed and returned.
//
// row1, row2 - the alignment rows to compare
int CAlnVec::CalculateScore(TNumrow row1, TNumrow row2) const
{
    const bool row1_is_aa = GetBioseqHandle(row1).GetBioseqCore()
        ->GetInst().GetMol() == CSeq_inst::eMol_aa;

    const bool row2_is_aa = GetBioseqHandle(row2).GetBioseqCore()
        ->GetInst().GetMol() == CSeq_inst::eMol_aa;

    CSeqVector&   vec1      = x_GetSeqVector(row1);
    const TSeqPos vec1_size = vec1.size();
    CSeqVector&   vec2      = x_GetSeqVector(row2);
    const TSeqPos vec2_size = vec2.size();

    // m_Starts is laid out segment by segment, one entry per row, so the
    // entry for a given row advances by the row count on each segment.
    const TNumrow stride = m_NumRows;
    TNumrow       pos1   = row1;
    TNumrow       pos2   = row2;

    string residues1, residues2;
    int    total = 0;

    for (TNumseg seg = 0;  seg < m_NumSegs;
         ++seg, pos1 += stride, pos2 += stride) {
        const TSignedSeqPos from1 = m_Starts[pos1];
        const TSignedSeqPos from2 = m_Starts[pos2];

        // a negative start means the row is absent from this segment
        if (from1 < 0  ||  from2 < 0) {
            continue;
        }

        const TSeqPos seg_len = m_Lens[seg];

        // for a row that is not on the positive strand the requested
        // range is mirrored against the sequence vector's size
        if (IsPositiveStrand(row1)) {
            vec1.GetSeqData(from1,
                            from1 + seg_len,
                            residues1);
        } else {
            vec1.GetSeqData(vec1_size - (from1 + seg_len),
                            vec1_size - from1,
                            residues1);
        }

        if (IsPositiveStrand(row2)) {
            vec2.GetSeqData(from2,
                            from2 + seg_len,
                            residues2);
        } else {
            vec2.GetSeqData(vec2_size - (from2 + seg_len),
                            vec2_size - from2,
                            residues2);
        }

        total += CalculateScore(residues1, residues2,
                                row1_is_aa, row2_is_aa);
    }

    return total;
}
Exemple #2
0
// Return the sequence vector for an alignment row, creating and caching
// it in m_SeqVectorCache on first use.
//
// Before returning, the vector's coding is (re)applied: the configured
// m_NaCoding / m_AaCoding if one is set, otherwise IUPAC coding.
CSeqVector& CAlnVec::x_GetSeqVector(TNumrow row) const
{
    CRef<CSeqVector> row_vec;

    TSeqVectorCache::iterator hit = m_SeqVectorCache.find(row);
    if (hit == m_SeqVectorCache.end()) {
        // cache miss: build a vector on the row's strand and remember it
        CBioseq_Handle bsh = GetBioseqHandle(row);
        CSeqVector fresh = bsh.GetSeqVector
            (CBioseq_Handle::eCoding_Iupac,
             IsPositiveStrand(row) ?
             CBioseq_Handle::eStrand_Plus :
             CBioseq_Handle::eStrand_Minus);
        row_vec.Reset(new CSeqVector(fresh));
        m_SeqVectorCache[row] = row_vec;
    }
    else {
        row_vec = hit->second;
    }

    // apply the coding on every call, cached or not
    if ( row_vec->IsNucleotide() ) {
        if (m_NaCoding == CSeq_data::e_not_set) {
            row_vec->SetIupacCoding();
        }
        else {
            row_vec->SetCoding(m_NaCoding);
        }
    }
    else if ( row_vec->IsProtein() ) {
        if (m_AaCoding == CSeq_data::e_not_set) {
            row_vec->SetIupacCoding();
        }
        else {
            row_vec->SetCoding(m_AaCoding);
        }
    }

    return *row_vec;
}
Exemple #3
0
// Look up a Bioseq handle by GI: the GI is wrapped in a CSeq_id and the
// lookup is delegated to the CSeq_id-based overload.
CBioseq_Handle CSimpleOM::GetBioseqHandle(TGi gi)
{
    CSeq_id gi_id;
    gi_id.SetGi(gi);

    return GetBioseqHandle(gi_id);
}
Exemple #4
0
// Look up a Bioseq handle from the string form of a Seq-id: a CSeq_id is
// constructed from the string and the lookup is delegated to the
// CSeq_id-based overload.
CBioseq_Handle CSimpleOM::GetBioseqHandle(const string& id_string)
{
    CSeq_id parsed_id(id_string);

    return GetBioseqHandle(parsed_id);
}
Exemple #5
0
// Fetch the sequence vector, on the requested strand, for the Bioseq
// identified by a Seq-id handle.
CSeqVector CSimpleOM::GetSeqVector(const CSeq_id_Handle& id, ENa_strand strand)
{
    CBioseq_Handle bsh = GetBioseqHandle(id);
    return bsh.GetSeqVector(strand);
}
Exemple #6
0
// Fetch the sequence vector, on the requested strand, for the Bioseq
// identified by a GI.
CSeqVector CSimpleOM::GetSeqVector(TGi gi, ENa_strand strand)
{
    CBioseq_Handle bsh = GetBioseqHandle(gi);
    return bsh.GetSeqVector(strand);
}
Exemple #7
0
// Fetch the sequence vector, on the requested strand, for the Bioseq
// identified by the string form of a Seq-id.
CSeqVector CSimpleOM::GetSeqVector(const string& id_string, ENa_strand strand)
{
    CBioseq_Handle bsh = GetBioseqHandle(id_string);
    return bsh.GetSeqVector(strand);
}
Exemple #8
0
// Build a per-segment consensus for this alignment.
//
// consens - output; consens[j] receives one consensus character per
//           position of segment j.  Segments in which more than
//           gap_seg_thresh rows are gapped are treated as consensus gaps
//           and their entry is left untouched.  The vector is grown to
//           m_NumSegs entries if the caller passed a shorter one.
//
// Row 0 decides whether the alignment is treated as nucleotide
// (4 tracked bases) or protein (26 tracked letters).
//
// For each position, if exactly one residue has the top count and that
// count reaches base_thresh, that residue is the consensus.  Otherwise the
// most frequent residues are accumulated until the threshold is reached
// (ties are always taken together) and merged into an ambiguity code; when
// more than two residues had to be merged the position becomes 'N'
// (nucleotide) or 'X' (protein).
void CAlnVec::CreateConsensus(vector<string>& consens) const
{
    // every non-gap segment is written through consens[j]; make sure the
    // entry exists instead of indexing past the end of a short vector
    if (consens.size() < (size_t)m_NumSegs) {
        consens.resize(m_NumSegs);
    }

    bool isNucleotide = GetBioseqHandle(0).IsNucleotide();

    const int numBases = isNucleotide ? 4 : 26;

    int base_count[26]; // must be a compile-time constant for some compilers

    // determine what the number of segments required for a gapped consensus
    // segment is.  this must be rounded to be at least 50%.
    int gap_seg_thresh = m_NumRows - m_NumRows / 2;

    for (size_t j = 0;  j < (size_t)m_NumSegs;  ++j) {
        // evaluate for gap / no gap
        int gap_count = 0;
        for (size_t i = 0;  i < (size_t)m_NumRows;  ++i) {
            if (m_Starts[ j*m_NumRows + i ] == -1) {
                ++gap_count;
            }
        }

        // check to make sure that this seg is not a consensus
        // gap seg
        if ( gap_count > gap_seg_thresh )
            continue;

        // the base threshold for being considered unique is at least
        // 70% of the available sequences
        int base_thresh =
            ((m_NumRows - gap_count) * 7 + 5) / 10;

        {
            // we will build a segment with enough bases to match
            consens[j].resize(m_Lens[j]);

            // retrieve all sequences for this segment
            // NOTE(review): segs is indexed by position below, so
            // TransposeSequences() presumably turns the per-row strings
            // into per-position strings -- confirm against its definition
            vector<string> segs(m_NumRows);
            RetrieveSegmentSequences(j, segs);
            TransposeSequences(segs);

            typedef multimap<int, unsigned char, greater<int> > TRevMap;

            // 
            // evaluate for a consensus
            //
            for (size_t i = 0;  i < m_Lens[j];  ++i) {
                if (isNucleotide) {
                    CollectNucleotideFrequences(segs[i], base_count, numBases);
                } else {
                    CollectProteinFrequences(segs[i], base_count, numBases);
                }


                // we create a sorted list (in descending order) of
                // frequencies of appearance to base
                // the frequency is "global" for this position: that is,
                // if 40% of the sequences are gapped, the highest frequency
                // any base can have is 0.6
                TRevMap rev_map;

                for (int k = 0;  k < numBases;  ++k) {
                    // this gets around a potentially tricky idiosyncrasy
                    // in some implementations of multimap.  depending on
                    // the library, the key may be const (or not)
                    // nucleotides are stored as a one-hot bit so several
                    // can be OR-ed together; proteins as a 0-based letter
                    TRevMap::value_type p(base_count[k], isNucleotide ? (1<<k) : k);
                    rev_map.insert(p);
                }

                // now, the first element here contains the best frequency
                // we scan for the appropriate bases
                if (rev_map.count(rev_map.begin()->first) == 1 &&
                    rev_map.begin()->first >= base_thresh) {
                        consens[j][i] = isNucleotide ?
                            ToIupac(rev_map.begin()->second) :
                            (rev_map.begin()->second+'A');
                } else {
                    // now we need to make some guesses based on IUPACna
                    // notation
                    int               count;
                    unsigned char     c    = 0x00;
                    int               freq = 0;
                    TRevMap::iterator curr = rev_map.begin();
                    TRevMap::iterator prev = rev_map.begin();
                    for (count = 0;
                         curr != rev_map.end() &&
                         (freq < base_thresh || prev->first == curr->first);
                         ++curr, ++count) {
                        prev = curr;
                        freq += curr->first;
                        if (isNucleotide) {
                            c |= curr->second;
                        } else {
                            // merge amino acids into the IUPAC ambiguity
                            // codes: B = D or N, Z = E or Q, J = I or L;
                            // any other combination degrades to X
                            unsigned char cur_char = curr->second+'A';
                            switch (c) {
                                case 0x00:
                                    c = cur_char;
                                    break;
                                case 'N': case 'D':
                                    // was "'N' || 'N'", which turned an N
                                    // followed by D into X instead of B
                                    c = (cur_char == 'N' || cur_char == 'D') ? 'B' : 'X';
                                    break;
                                case 'Q': case 'E':
                                    c = (cur_char == 'Q' || cur_char == 'E') ? 'Z' : 'X';
                                    break;
                                case 'I': case 'L':
                                    c = (cur_char == 'I' || cur_char == 'L') ? 'J' : 'X';
                                    break;
                                default:
                                    c = 'X';
                            }
                        }
                    }

                    //
                    // catchall
                    //
                    if (count > 2) {
                        consens[j][i] = isNucleotide ? 'N' : 'X';
                    } else {
                        consens[j][i] = isNucleotide ? ToIupac(c) : c;
                    }
                }
            }
        }
    }
}
Exemple #9
0
//
// CreateConsensus()
//
// Compute a consensus sequence for this alignment and return a new
// dense-seg that contains all original rows plus the consensus as an
// extra, last row.
//
//   consensus_row - receives the row index of the consensus in the
//                   returned dense-seg (not written when a null
//                   reference is returned)
//   consensus_seq - reset, then filled with a raw Bioseq that holds the
//                   consensus residues
//   consensus_id  - Seq-id copied onto the consensus Bioseq and the new row
//
// A null CRef is returned when there is no underlying dense-seg or the
// alignment has no rows.
//
// The rules for a consensus are:
//   - a segment is a consensus gap if > 50% of the sequences are gap at
//     this segment.  50% exactly is counted as sequence
//   - for a segment counted as sequence, for each position, the most
//     frequently occurring base is counted as consensus.  in the case of
//     a tie, the consensus is considered muddied, and the consensus is
//     so marked
//
CRef<CDense_seg>
CAlnVec::CreateConsensus(int& consensus_row, CBioseq& consensus_seq,
                         const CSeq_id& consensus_id) const
{
    consensus_seq.Reset();
    if ( !m_DS || m_NumRows < 1) {
        return CRef<CDense_seg>();
    }

    const bool is_na = GetBioseqHandle(0).IsNucleotide();

    // per-segment consensus residues; gap segments stay empty
    vector<string> seg_consensus(m_NumSegs);
    CreateConsensus(seg_consensus);

    //
    // set up the new dense-seg: same segments and lengths as the
    // original, one more row
    //
    CRef<CDense_seg> result(new CDense_seg());
    result->SetDim(m_NumRows + 1);
    result->SetNumseg(m_NumSegs);
    result->SetLens() = m_Lens;
    result->SetStarts().reserve(m_Starts.size() + m_NumSegs);

    const bool have_strands = !m_Strands.empty();
    if (have_strands) {
        result->SetStrands().reserve(m_Strands.size() +
                                     m_NumSegs);
    }

    string        residues;         // concatenated consensus sequence
    TSignedSeqPos next_start = 0;   // consensus coordinate of next segment

    for (size_t seg = 0;  seg < seg_consensus.size();  ++seg) {
        // carry over the existing rows of this segment
        for (size_t row = 0;  row < (size_t)m_NumRows;  ++row) {
            int idx = seg * m_NumRows + row;
            result->SetStarts().push_back(m_Starts[idx]);
            if (have_strands) {
                result->SetStrands().push_back(m_Strands[idx]);
            }
        }

        // append the consensus entry.
        // the consensus goes in as the last row; first would be nicer,
        // but that would require adjusting the bioseq handle and
        // seqvector caches and would shift every row number
        const string& piece = seg_consensus[seg];
        if (piece.length() == 0) {
            result->SetStarts().push_back(-1);
        } else {
            result->SetStarts().push_back(next_start);
        }

        if (have_strands) {
            result->SetStrands().push_back(eNa_strand_unknown);
        }

        next_start += piece.length();
        residues += piece;
    }

    // carry over the existing IDs
    for (size_t n = 0;  n < m_Ids.size();  ++n) {
        result->SetIds().push_back(m_Ids[n]);
    }

    // build the consensus Bioseq
    {{
         // its Seq-id, shared between the Bioseq and the new row
         CRef<CSeq_id> new_id(new CSeq_id());
         new_id->Assign(consensus_id);
         consensus_seq.SetId().push_back(new_id);

         result->SetIds().push_back(new_id);

         // a descriptor marking the sequence as generated
         CRef<CSeqdesc> comment(new CSeqdesc);
         consensus_seq.SetDescr().Set().push_back(comment);
         comment->SetComment("This is a generated consensus sequence");

         // the sequence itself
         CSeq_inst& seq_inst = consensus_seq.SetInst();
         seq_inst.SetRepr(CSeq_inst::eRepr_raw);
         seq_inst.SetMol(is_na ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa);
         seq_inst.SetLength(residues.length());

         CSeq_data& payload = seq_inst.SetSeq_data();
         if (is_na) {
             payload.SetIupacna() = CIUPACna(residues);
         } else {
             payload.SetIupacaa() = CIUPACaa(residues);
         }
    }}

    // the consensus ID was pushed last, so its index is the consensus row
    consensus_row = result->GetIds().size() - 1;
    return result;
}
Exemple #10
0
// Look up a Bioseq handle by Seq-id: the id is converted to a
// CSeq_id_Handle and the lookup is delegated to the handle-based overload.
CBioseq_Handle CTSE_Handle::GetBioseqHandle(const CSeq_id& id) const
{
    CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);
    return GetBioseqHandle(idh);
}