// Sanity-checks a Bioseq with _ASSERT:
//   - it carries at least one Seq-id;
//   - the length of its IUPACna sequence string equals the Seq-inst length;
//   - every residue character lies in the range 'A'..'Z'.
// The sequence data is read through GetIupacna(), so the Bioseq is expected
// to hold IUPACna-encoded data.
void s_Check(const CBioseq& seq)
{
    _ASSERT(!seq.GetId().empty());

    const CSeq_inst& inst = seq.GetInst();
    const string& seqdata = inst.GetSeq_data().GetIupacna().Get();
    _ASSERT(seqdata.size() == inst.GetLength());

    ITERATE ( string, i, seqdata ) {
        _ASSERT(*i >= 'A' && *i <= 'Z');
    }
}
// Tells whether a Bioseq is a raw sequence "shell": its Seq-inst has a raw
// representation, its molecule type and length can be read, but it carries
// no sequence data.  A Bioseq whose Seq-inst cannot be read is never
// reported as empty.
bool CBlastBioseqMaker::IsEmptyBioseq(const CBioseq& bioseq)
{
    if (!bioseq.CanGetInst()) {
        return false;
    }

    const CSeq_inst& inst = bioseq.GetInst();
    if (inst.GetRepr() != CSeq_inst::eRepr_raw) {
        return false;
    }
    if (!inst.CanGetMol() || !inst.CanGetLength()) {
        return false;
    }

    // Everything except the actual residues is present.
    return !inst.CanGetSeq_data();
}
// Constructs a sequence vector directly from a Bioseq.
//
// A seq-map is created for the Bioseq with CSeqMap::CreateSeqMapForBioseq();
// the vector size is the length reported by that map for the given scope,
// and the molecule type is taken from the Bioseq's Seq-inst.  The coding
// starts out as "not set" and is then switched to the requested one via
// SetCoding().
CSeqVector::CSeqVector(const CBioseq& bioseq,
                       CScope* scope,
                       EVectorCoding coding,
                       ENa_strand strand)
    : m_Scope(scope),
      m_SeqMap(CSeqMap::CreateSeqMapForBioseq(bioseq)),
      m_Strand(strand),
      m_Coding(CSeq_data::e_not_set)
{
    m_Size = m_SeqMap->GetLength(scope);

    const CSeq_inst& inst = bioseq.GetInst();
    m_Mol = inst.GetMol();

    // Runs after m_Mol has been assigned, as in the original statement order.
    SetCoding(coding);
}
// Runs PurgeNonAlpha() over the residues of a protein Bioseq, in place.
//
// The residue string is read from Ncbieaa or Iupacaa data, or converted from
// Ncbistdaa with NcbistdaaToNcbieaaString().  When PurgeNonAlpha() returns
// true, the (modified) string is stored back as Ncbieaa data.  In either
// case the Seq-inst length is then set to the length of the residue string.
//
// A Bioseq without sequence data, or with data in any other encoding, is
// left completely untouched and false is returned.  Previously such a
// Bioseq had its length overwritten with the length of an empty string.
//
// Returns true only if PurgeNonAlpha() returned true and the sequence data
// was rewritten.
bool CSeqAnnotFromFasta::PurgeNonAlphaFromSequence(CBioseq& bioseq)
{
    if (!bioseq.GetInst().IsSetSeq_data()) {
        return false;
    }

    CSeq_data& seqData = bioseq.SetInst().SetSeq_data();
    string originalSequence;
    if (seqData.IsNcbieaa()) {
        originalSequence = seqData.SetNcbieaa().Set();
    } else if (seqData.IsIupacaa()) {
        originalSequence = seqData.SetIupacaa().Set();
    } else if (seqData.IsNcbistdaa()) {
        std::vector < char >& vec = seqData.SetNcbistdaa().Set();
        NcbistdaaToNcbieaaString(vec, &originalSequence);
    } else {
        // Unsupported encoding: nothing was read, so there is nothing to
        // purge and no meaningful length to write back.
        return false;
    }

    bool result = false;
    if (PurgeNonAlpha(originalSequence)) {
        seqData.Select(CSeq_data::e_Ncbieaa);
        seqData.SetNcbieaa().Set(originalSequence);
        result = true;
    }

    // Keep the declared length in step with the residue string.
    bioseq.SetInst().SetLength(
        static_cast<CSeq_inst::TLength>(originalSequence.length()));
    return result;
}
// Pairwise analysis of two Bioseqs.
//
// When `left` is a protein entry and `right` is not (as judged by
// is_prot_entry()), overlaps_prot_na() is run on `left` against the
// annotations of `right`.  The result of that call is not used: this
// function always returns -1.
int CReadBlastApp::AnalyzeSeqsViaBioseqs(CBioseq& left, CBioseq& right)
{
    // Disabled trace output:
    // if(PrintDetails()) NcbiCerr << "AnalyzeSeqsViaBioseqs(left, right): "
    //                             << GetStringDescr(left) << ", "
    //                             << GetStringDescr(right) << NcbiEndl;

    const bool protein_vs_non_protein =
        is_prot_entry(left) && !is_prot_entry(right);

    if (protein_vs_non_protein) {
        // Disabled trace output:
        // if(PrintDetails()) NcbiCerr << "AnalyzeSeqsViaBioseqs(left, right): going for overlaps\n";
        overlaps_prot_na(left, right.GetAnnot());
    }

    return -1;
}
// Builds a Sequence from a Bioseq.
//
// Steps, in order:
//   1. description: the first descriptor that is either a title or a PDB
//      descriptor with a non-empty compound list;
//   2. mmdbLink: the first "mmdb" general id with an integer tag found among
//      the ids-type annotations;
//   3. sequenceString: taken or converted from the raw sequence data.
//
// `status` stays CAV_ERROR_SEQUENCES unless step 3 succeeds, in which case
// it becomes CAV_SUCCESS.  Failures are reported through ERR_POST_X and end
// construction early.
Sequence::Sequence(const CBioseq& bioseq) :
    status(CAV_ERROR_SEQUENCES),
    bioseqASN(&bioseq),
    seqIDs(bioseq.GetId()),
    mmdbLink(NOT_SET)
{
    // try to get description from title or compound
    if (bioseq.IsSetDescr()) {
        const CSeq_descr::Tdata& descriptors = bioseq.GetDescr().Get();
        const CSeq_descr::Tdata::const_iterator descEnd = descriptors.end();
        CSeq_descr::Tdata::const_iterator descIt = descriptors.begin();
        for (; descIt != descEnd; ++descIt) {
            const CSeqdesc& desc = descIt->GetObject();
            if (desc.IsTitle()) {
                description = desc.GetTitle();
                break;
            }
            if (desc.IsPdb() && desc.GetPdb().GetCompound().size() > 0) {
                description = desc.GetPdb().GetCompound().front();
                break;
            }
        }
    }

    // get link to MMDB id - mainly for CDD's where Biostrucs have to be loaded separately
    if (bioseq.IsSetAnnot()) {
        const CBioseq::TAnnot& annots = bioseq.GetAnnot();
        const CBioseq::TAnnot::const_iterator annotEnd = annots.end();
        bool found = false;
        for (CBioseq::TAnnot::const_iterator annotIt = annots.begin();
             !found && annotIt != annotEnd; ++annotIt) {

            const CSeq_annot::C_Data& annotData = annotIt->GetObject().GetData();
            if (!annotData.IsIds()) {
                continue;
            }

            const CSeq_annot::C_Data::TIds& ids = annotData.GetIds();
            const CSeq_annot::C_Data::TIds::const_iterator idEnd = ids.end();
            for (CSeq_annot::C_Data::TIds::const_iterator idIt = ids.begin();
                 idIt != idEnd; ++idIt) {
                const CSeq_id& seqId = idIt->GetObject();
                if (seqId.IsGeneral() &&
                    seqId.GetGeneral().GetDb() == "mmdb" &&
                    seqId.GetGeneral().GetTag().IsId()) {
                    mmdbLink = seqId.GetGeneral().GetTag().GetId();
                    found = true;   // stops the annotation loop as well
                    break;
                }
            }
        }
    }
    if (mmdbLink != NOT_SET)
        ERR_POST_X(3, Info << "sequence " << GetTitle() << " is from MMDB id " << mmdbLink);

    // get sequence string
    const CSeq_inst& inst = bioseq.GetInst();
    if (inst.GetRepr() != CSeq_inst::eRepr_raw || !inst.IsSetSeq_data()) {
        ERR_POST_X(6, Critical << "Sequence::Sequence() - sequence " << GetTitle() << ": confused by sequence representation");
        return;
    }

    const CSeq_data& seqData = inst.GetSeq_data();
    switch (seqData.Which()) {

    // protein formats
    case CSeq_data::e_Ncbieaa:
        sequenceString = seqData.GetNcbieaa().Get();
        break;
    case CSeq_data::e_Iupacaa:
        sequenceString = seqData.GetIupacaa().Get();
        break;
    case CSeq_data::e_Ncbistdaa:
        StringFromStdaa(seqData.GetNcbistdaa().Get(), &sequenceString);
        break;

    // nucleotide formats
    case CSeq_data::e_Iupacna:
        sequenceString = seqData.GetIupacna().Get();
        break;
    case CSeq_data::e_Ncbi4na:
        StringFrom4na(seqData.GetNcbi4na().Get(), &sequenceString,
            (inst.GetMol() == CSeq_inst::eMol_dna));
        break;
    case CSeq_data::e_Ncbi8na:
        // same repr. for non-X as 4na
        StringFrom4na(seqData.GetNcbi8na().Get(), &sequenceString,
            (inst.GetMol() == CSeq_inst::eMol_dna));
        break;
    case CSeq_data::e_Ncbi2na:
        StringFrom2na(seqData.GetNcbi2na().Get(), &sequenceString,
            (inst.GetMol() == CSeq_inst::eMol_dna));
        // trim the decoded string when it is longer than the declared length
        if (inst.IsSetLength() && inst.GetLength() < sequenceString.length())
            sequenceString.resize(inst.GetLength());
        break;

    default:
        ERR_POST_X(4, Critical << "Sequence::Sequence() - sequence " << GetTitle() << ": confused by sequence string format");
        return;
    }

    // the declared length, when present, must agree with what was decoded
    if (inst.IsSetLength() && inst.GetLength() != sequenceString.length()) {
        ERR_POST_X(5, Critical << "Sequence::Sequence() - sequence string length mismatch");
        return;
    }

    status = CAV_SUCCESS;
}
// // CreateConsensus() // // compute a consensus sequence given a particular alignment // the rules for a consensus are: // - a segment is consensus gap if > 50% of the sequences are gap at this // segment. 50% exactly is counted as sequence // - for a segment counted as sequence, for each position, the most // frequently occurring base is counted as consensus. in the case of // a tie, the consensus is considered muddied, and the consensus is // so marked // CRef<CDense_seg> CAlnVec::CreateConsensus(int& consensus_row, CBioseq& consensus_seq, const CSeq_id& consensus_id) const { consensus_seq.Reset(); if ( !m_DS || m_NumRows < 1) { return CRef<CDense_seg>(); } bool isNucleotide = GetBioseqHandle(0).IsNucleotide(); size_t i; size_t j; // temporary storage for our consensus vector<string> consens(m_NumSegs); CreateConsensus(consens); // // now, create a new CDense_seg // we create a new CBioseq for our data and // copy the contents of the CDense_seg // string data; TSignedSeqPos total_bases = 0; CRef<CDense_seg> new_ds(new CDense_seg()); new_ds->SetDim(m_NumRows + 1); new_ds->SetNumseg(m_NumSegs); new_ds->SetLens() = m_Lens; new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs); if ( !m_Strands.empty() ) { new_ds->SetStrands().reserve(m_Strands.size() + m_NumSegs); } for (i = 0; i < consens.size(); ++i) { // copy the old entries for (j = 0; j < (size_t)m_NumRows; ++j) { int idx = i * m_NumRows + j; new_ds->SetStarts().push_back(m_Starts[idx]); if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(m_Strands[idx]); } } // add our new entry // this places the consensus as the last sequence // it should preferably be the first, but this would mean adjusting // the bioseq handle and seqvector caches, and all row numbers would // shift if (consens[i].length() != 0) { new_ds->SetStarts().push_back(total_bases); } else { new_ds->SetStarts().push_back(-1); } if ( !m_Strands.empty() ) { new_ds->SetStrands().push_back(eNa_strand_unknown); } total_bases += 
consens[i].length(); data += consens[i]; } // copy our IDs for (i = 0; i < m_Ids.size(); ++i) { new_ds->SetIds().push_back(m_Ids[i]); } // now, we construct a new Bioseq {{ // sequence ID CRef<CSeq_id> id(new CSeq_id()); id->Assign(consensus_id); consensus_seq.SetId().push_back(id); new_ds->SetIds().push_back(id); // add a description for this sequence CSeq_descr& desc = consensus_seq.SetDescr(); CRef<CSeqdesc> d(new CSeqdesc); desc.Set().push_back(d); d->SetComment("This is a generated consensus sequence"); // the main one: Seq-inst CSeq_inst& inst = consensus_seq.SetInst(); inst.SetRepr(CSeq_inst::eRepr_raw); inst.SetMol(isNucleotide ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa); inst.SetLength(data.length()); CSeq_data& seq_data = inst.SetSeq_data(); if (isNucleotide) { CIUPACna& na = seq_data.SetIupacna(); na = CIUPACna(data); } else { CIUPACaa& aa = seq_data.SetIupacaa(); aa = CIUPACaa(data); } }} consensus_row = new_ds->GetIds().size() - 1; return new_ds; }