bool CSeqAnnotFromFasta::PurgeNonAlphaFromSequence(CBioseq& bioseq) {

    bool result = false;
    CSeq_inst::TLength newLength;

    string originalSequence;
//    CRef< CBioseq > bioseq;
//    if (cd.GetBioseqForIndex(index, bioseq)) {

    if (bioseq.GetInst().IsSetSeq_data()) {

        CSeq_data& seqData = bioseq.SetInst().SetSeq_data();

        if (seqData.IsNcbieaa()) {
            originalSequence = seqData.SetNcbieaa().Set();
        } else if (seqData.IsIupacaa()) {
            originalSequence = seqData.SetIupacaa().Set();
        } else if (seqData.IsNcbistdaa()) {
            std::vector < char >& vec = seqData.SetNcbistdaa().Set();
            NcbistdaaToNcbieaaString(vec, &originalSequence);
        }

        if (PurgeNonAlpha(originalSequence)) {
//            if (originalSequence.length() > 0 && find_if(originalSequence.begin(), originalSequence.end(), isNotAlpha) != originalSequence.end()) {

//                originalSequence.erase(remove_if(originalSequence.begin(), originalSequence.end(), isNotAlpha), originalSequence.end());
            
//            _TRACE("after remove non-alpha:  \n" << originalSequence);
                
            seqData.Select(CSeq_data::e_Ncbieaa);
            seqData.SetNcbieaa().Set(originalSequence);
            result = true;
        }
        newLength = originalSequence.length();
        bioseq.SetInst().SetLength(newLength);

    }
//    }
    return result;
}
Exemplo n.º 2
0
//
// CreateConsensus()
//
// compute a consensus sequence given a particular alignment
// the rules for a consensus are:
//   - a segment is consensus gap if > 50% of the sequences are gap at this
//     segment.  50% exactly is counted as sequence
//   - for a segment counted as sequence, for each position, the most
//     frequently occurring base is counted as consensus.  in the case of
//     a tie, the consensus is considered muddied, and the consensus is
//     so marked
//
CRef<CDense_seg>
CAlnVec::CreateConsensus(int& consensus_row, CBioseq& consensus_seq,
                         const CSeq_id& consensus_id) const
{
    consensus_seq.Reset();
    if ( !m_DS || m_NumRows < 1) {
        return CRef<CDense_seg>();
    }

    bool isNucleotide = GetBioseqHandle(0).IsNucleotide();

    size_t i;
    size_t j;

    // temporary storage for our consensus
    vector<string> consens(m_NumSegs);

    CreateConsensus(consens);

    //
    // now, create a new CDense_seg
    // we create a new CBioseq for our data and
    // copy the contents of the CDense_seg
    //
    string data;
    TSignedSeqPos total_bases = 0;

    CRef<CDense_seg> new_ds(new CDense_seg());
    new_ds->SetDim(m_NumRows + 1);
    new_ds->SetNumseg(m_NumSegs);
    new_ds->SetLens() = m_Lens;
    new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs);
    if ( !m_Strands.empty() ) {
        new_ds->SetStrands().reserve(m_Strands.size() +
                                     m_NumSegs);
    }

    for (i = 0;  i < consens.size();  ++i) {
        // copy the old entries
        for (j = 0;  j < (size_t)m_NumRows;  ++j) {
            int idx = i * m_NumRows + j;
            new_ds->SetStarts().push_back(m_Starts[idx]);
            if ( !m_Strands.empty() ) {
                new_ds->SetStrands().push_back(m_Strands[idx]);
            }
        }

        // add our new entry
        // this places the consensus as the last sequence
        // it should preferably be the first, but this would mean adjusting
        // the bioseq handle and seqvector caches, and all row numbers would
        // shift
        if (consens[i].length() != 0) {
            new_ds->SetStarts().push_back(total_bases);
        } else {
            new_ds->SetStarts().push_back(-1);
        }
        
        if ( !m_Strands.empty() ) {
            new_ds->SetStrands().push_back(eNa_strand_unknown);
        }

        total_bases += consens[i].length();
        data += consens[i];
    }

    // copy our IDs
    for (i = 0;  i < m_Ids.size();  ++i) {
        new_ds->SetIds().push_back(m_Ids[i]);
    }

    // now, we construct a new Bioseq
    {{

         // sequence ID
         CRef<CSeq_id> id(new CSeq_id());
         id->Assign(consensus_id);
         consensus_seq.SetId().push_back(id);

         new_ds->SetIds().push_back(id);

         // add a description for this sequence
         CSeq_descr& desc = consensus_seq.SetDescr();
         CRef<CSeqdesc> d(new CSeqdesc);
         desc.Set().push_back(d);
         d->SetComment("This is a generated consensus sequence");

         // the main one: Seq-inst
         CSeq_inst& inst = consensus_seq.SetInst();
         inst.SetRepr(CSeq_inst::eRepr_raw);
         inst.SetMol(isNucleotide ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa);
         inst.SetLength(data.length());

         CSeq_data& seq_data = inst.SetSeq_data();
         if (isNucleotide) {
             CIUPACna& na = seq_data.SetIupacna();
             na = CIUPACna(data);
         } else {
             CIUPACaa& aa = seq_data.SetIupacaa();
             aa = CIUPACaa(data);
         }
    }}

    consensus_row = new_ds->GetIds().size() - 1;
    return new_ds;
}