// Construct a Sequence from a Bioseq: picks up a description, an optional
// MMDB link, and the residue string. 'status' stays CAV_ERROR_SEQUENCES unless
// the sequence data is decoded and passes the length check, in which case it
// is set to CAV_SUCCESS at the very end.
Sequence::Sequence(const CBioseq& bioseq) :
    status(CAV_ERROR_SEQUENCES), bioseqASN(&bioseq), seqIDs(bioseq.GetId()), mmdbLink(NOT_SET)
{
    // description: taken from the first descriptor that is either a title or
    // a PDB descriptor carrying at least one compound name
    if (bioseq.IsSetDescr()) {
        const CSeq_descr::Tdata& descrList = bioseq.GetDescr().Get();
        CSeq_descr::Tdata::const_iterator desc;
        for (desc = descrList.begin(); desc != descrList.end(); ++desc) {
            if ((*desc)->IsTitle()) {
                description = (*desc)->GetTitle();
                break;
            }
            if ((*desc)->IsPdb() && (*desc)->GetPdb().GetCompound().size() > 0) {
                description = (*desc)->GetPdb().GetCompound().front();
                break;
            }
        }
    }

    // MMDB link (mainly for CDD's, where Biostrucs have to be loaded
    // separately): first "mmdb" general id with an integer tag found in any
    // id-type annotation
    if (bioseq.IsSetAnnot()) {
        const CBioseq::TAnnot& annotList = bioseq.GetAnnot();
        bool linkFound = false;
        CBioseq::TAnnot::const_iterator annot;
        for (annot = annotList.begin(); annot != annotList.end(); ++annot) {
            if (!(*annot)->GetData().IsIds())
                continue;
            const CSeq_annot::C_Data::TIds& idList = (*annot)->GetData().GetIds();
            CSeq_annot::C_Data::TIds::const_iterator id;
            for (id = idList.begin(); id != idList.end(); ++id) {
                if ((*id)->IsGeneral() &&
                    (*id)->GetGeneral().GetDb() == "mmdb" &&
                    (*id)->GetGeneral().GetTag().IsId())
                {
                    mmdbLink = (*id)->GetGeneral().GetTag().GetId();
                    linkFound = true;
                    break;
                }
            }
            if (linkFound)
                break;
        }
    }
    if (mmdbLink != NOT_SET)
        ERR_POST_X(3, Info << "sequence " << GetTitle() << " is from MMDB id " << mmdbLink);

    // only raw representations that actually carry sequence data are handled
    const CBioseq::TInst& inst = bioseq.GetInst();
    if (!(inst.GetRepr() == CSeq_inst::eRepr_raw && inst.IsSetSeq_data())) {
        ERR_POST_X(6, Critical << "Sequence::Sequence() - sequence " << GetTitle()
            << ": confused by sequence representation");
        return;
    }

    // decode the residue string according to the encoding in use
    const CSeq_inst::TSeq_data& seqData = inst.GetSeq_data();

    // protein formats
    if (seqData.IsNcbieaa()) {
        sequenceString = seqData.GetNcbieaa().Get();
    }
    else if (seqData.IsIupacaa()) {
        sequenceString = seqData.GetIupacaa().Get();
    }
    else if (seqData.IsNcbistdaa()) {
        StringFromStdaa(seqData.GetNcbistdaa().Get(), &sequenceString);
    }

    // nucleotide formats
    else if (seqData.IsIupacna()) {
        sequenceString = seqData.GetIupacna().Get();
    }
    else if (seqData.IsNcbi4na()) {
        StringFrom4na(seqData.GetNcbi4na().Get(), &sequenceString,
            (inst.GetMol() == CSeq_inst::eMol_dna));
    }
    else if (seqData.IsNcbi8na()) {
        // same repr. for non-X as 4na
        StringFrom4na(seqData.GetNcbi8na().Get(), &sequenceString,
            (inst.GetMol() == CSeq_inst::eMol_dna));
    }
    else if (seqData.IsNcbi2na()) {
        StringFrom2na(seqData.GetNcbi2na().Get(), &sequenceString,
            (inst.GetMol() == CSeq_inst::eMol_dna));
        // cut the decoded string back when it is longer than the declared length
        if (inst.IsSetLength() && inst.GetLength() < sequenceString.length())
            sequenceString.resize(inst.GetLength());
    }

    else {
        ERR_POST_X(4, Critical << "Sequence::Sequence() - sequence " << GetTitle()
            << ": confused by sequence string format");
        return;
    }

    // a declared length must agree with what was decoded
    if (inst.IsSetLength() && inst.GetLength() != sequenceString.length()) {
        ERR_POST_X(5, Critical << "Sequence::Sequence() - sequence string length mismatch");
        return;
    }

    status = CAV_SUCCESS;
}
// Construct a Sequence owned by 'parent' from a Bioseq.
//
// Fills in, in order:
//   - title / taxonomy, from the Bioseq's descriptors (if any);
//   - an MMDB id, from a "mmdb" general id in the Bioseq's id annotations;
//   - sequenceString / isProtein, from raw sequence data (forced to uppercase);
//   - identifier, via MoleculeIdentifier::GetIdentifier().
//
// On an unsupported representation or encoding, or a length mismatch, an
// error is posted and the constructor returns early; 'identifier' then keeps
// the NULL it was initialized with.
Sequence::Sequence(SequenceSet *parent, ncbi::objects::CBioseq& bioseq) :
    StructureBase(parent), bioseqASN(&bioseq), identifier(NULL), molecule(NULL), isProtein(false)
{
    if (bioseq.IsSetDescr()) {
        string defline, taxid;
        CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();
        for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {

            // get "defline" from title or compound
            if ((*d)->IsTitle()) {              // prefer title over compound
                defline = (*d)->GetTitle();
            } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) {
                defline = (*d)->GetPdb().GetCompound().front();
            }

            // get taxonomy: taxname if present, otherwise the common name
            if ((*d)->IsSource()) {
                if ((*d)->GetSource().GetOrg().IsSetTaxname())
                    taxid = (*d)->GetSource().GetOrg().GetTaxname();
                else if ((*d)->GetSource().GetOrg().IsSetCommon())
                    taxid = (*d)->GetSource().GetOrg().GetCommon();
            }
        }

        if (taxid.size() > 0)
            taxonomy = string("[") + taxid + ']';

        if (defline.size() > 0) {
            title = defline;

            // remove taxonomy repeated at end of title
            if (taxonomy.size() > 0 && NStr::EndsWith(title, taxonomy, NStr::eNocase))
                title = title.substr(0, title.size() - taxonomy.size());

            // drop a single trailing space; the title can be empty here when
            // the defline consisted of nothing but the taxonomy, so it must
            // not be indexed at size() - 1 without checking first
            if (!title.empty() && title[title.size() - 1] == ' ')
                title = title.substr(0, title.size() - 1);
        }
    }

    // get link to MMDB id - mainly for CDD's where Biostrucs have to be loaded separately
    int mmdbID = MoleculeIdentifier::VALUE_NOT_SET;
    if (bioseq.IsSetAnnot()) {
        CBioseq::TAnnot::const_iterator a, ae = bioseq.GetAnnot().end();
        for (a=bioseq.GetAnnot().begin(); a!=ae; ++a) {
            if (a->GetObject().GetData().IsIds()) {
                CSeq_annot::C_Data::TIds::const_iterator i, ie = a->GetObject().GetData().GetIds().end();
                for (i=a->GetObject().GetData().GetIds().begin(); i!=ie; ++i) {
                    if (i->GetObject().IsGeneral() &&
                        i->GetObject().GetGeneral().GetDb() == "mmdb" &&
                        i->GetObject().GetGeneral().GetTag().IsId())
                    {
                        mmdbID = i->GetObject().GetGeneral().GetTag().GetId();
                        break;
                    }
                }
                // inner loop stopped early, i.e. an id was found
                if (i != ie)
                    break;
            }
        }
    }

    // get sequence string
    if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {

        // protein formats
        if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {
            sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();
            isProtein = true;
        } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {
            sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
            isProtein = true;
        } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {
            StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &sequenceString);
            isProtein = true;
        }

        // nucleotide formats
        else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {
            sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();
            // convert 'T' to 'U' for RNA
            if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
                for (unsigned int i=0; i<sequenceString.size(); ++i) {
                    if (sequenceString[i] == 'T')
                        sequenceString[i] = 'U';
                }
            }
        } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {
            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &sequenceString,
                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
        } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) {  // same repr. for non-X as 4na
            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &sequenceString,
                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
        } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {
            StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &sequenceString,
                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
            // cut the decoded string back when it is longer than the declared length
            if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < sequenceString.length())
                sequenceString.resize(bioseq.GetInst().GetLength());
        }

        else {
            ERRORMSG("Sequence::Sequence() - sequence " << bioseq.GetId().front()->GetSeqIdString()
                << ": confused by sequence string format");
            return;
        }

        // check length
        if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != sequenceString.length()) {
            ERRORMSG("Sequence::Sequence() - sequence string length mismatch");
            return;
        }

        // force uppercase
        for (unsigned int i=0; i<sequenceString.length(); ++i)
            sequenceString[i] = toupper((unsigned char) sequenceString[i]);

    } else {
        ERRORMSG("Sequence::Sequence() - sequence " << bioseq.GetId().front()->GetSeqIdString()
            << ": confused by sequence representation");
        return;
    }

    // get identifier (may be NULL if there's a problem!)
    identifier = MoleculeIdentifier::GetIdentifier(this, mmdbID, bioseq.GetId());
}