// ---------------------------------------------------------------------------- bool CGtfReader::x_FeatureSetDataCDS( const CGff2Record& record, CRef< CSeq_feat > pFeature ) // ---------------------------------------------------------------------------- { if ( ! CGff2Reader::x_FeatureSetDataCDS( record, pFeature ) ) { return false; } CCdregion& cdr = pFeature->SetData().SetCdregion(); string strValue; if ( record.GetAttribute( "protein_id", strValue ) ) { CRef<CSeq_id> pId = CReadUtil::AsSeqId(strValue,m_iFlags); if (pId->IsGenbank()) { pFeature->SetProduct().SetWhole(*pId); } } if ( record.GetAttribute( "ribosomal_slippage", strValue ) ) { pFeature->SetExcept( true ); pFeature->SetExcept_text( "ribosomal slippage" ); } if ( record.GetAttribute( "transl_table", strValue ) ) { CRef< CGenetic_code::C_E > pGc( new CGenetic_code::C_E ); pGc->SetId( NStr::StringToUInt( strValue ) ); cdr.SetCode().Set().push_back( pGc ); } return true; }
// ---------------------------------------------------------------------------- bool CGtfReader::xFeatureSetQualifiersGene( const CGff2Record& record, CRef< CSeq_feat > pFeature ) // ---------------------------------------------------------------------------- { // // Create GB qualifiers for the record attributes: // CRef< CGb_qual > pQual(0); const CGff2Record::TAttributes& attrs = record.Attributes(); CGff2Record::TAttrCit it = attrs.begin(); for (/*NOOP*/; it != attrs.end(); ++it) { // gtf genes don't get transcript_id if (it->first == "transcript_id") { continue; } // special case some well-known attributes if (x_ProcessQualifierSpecialCase(it, pFeature)) { continue; } // turn everything else into a qualifier pQual.Reset(new CGb_qual); pQual->SetQual(it->first); pQual->SetVal(it->second); pFeature->SetQual().push_back(pQual); } return true; }
// ---------------------------------------------------------------------------- bool CGtfReader::x_MergeFeatureLocationSingleInterval( const CGff2Record& record, CRef< CSeq_feat > pFeature ) // ---------------------------------------------------------------------------- { const CSeq_interval& gene_int = pFeature->GetLocation().GetInt(); if ( gene_int.GetFrom() > record.SeqStart() -1 ) { pFeature->SetLocation().SetInt().SetFrom( record.SeqStart() ); } if ( gene_int.GetTo() < record.SeqStop() - 1 ) { pFeature->SetLocation().SetInt().SetTo( record.SeqStop() ); } if (record.Type() == "CDS" && pFeature->GetData().IsCdregion()) { return x_FeatureTrimQualifiers(record, pFeature); } return true; }
// ---------------------------------------------------------------------------- string s_FeatureKey( const CGff2Record& gff ) // ---------------------------------------------------------------------------- { string strGeneId = s_GeneKey( gff ); if ( gff.Type() == "gene" ) { return strGeneId; } string strTranscriptId; if ( ! gff.GetAttribute( "transcript_id", strTranscriptId ) ) { cerr << "Unexpected: GTF feature without a transcript_id." << endl; strTranscriptId = "transcript_id"; } return strGeneId + "|" + strTranscriptId; }
// ---------------------------------------------------------------------------- bool CGtfReader::x_CreateFeatureLocation( const CGff2Record& record, CRef< CSeq_feat > pFeature ) // ---------------------------------------------------------------------------- { CRef<CSeq_id> pId = CReadUtil::AsSeqId( record.Id(), m_iFlags & fAllIdsAsLocal); CSeq_interval& location = pFeature->SetLocation().SetInt(); location.SetId( *pId ); location.SetFrom( record.SeqStart() ); if (record.Type() != "mRNA") { location.SetTo(record.SeqStop()); } else { // place holder // actual location will be computed from the exons and CDSs living on // this feature. location.SetTo(record.SeqStart()); } if ( record.IsSetStrand() ) { location.SetStrand( record.Strand() ); } return true; }
// ---------------------------------------------------------------------------- bool CGtfReader::x_MergeFeatureLocationMultiInterval( const CGff2Record& record, CRef< CSeq_feat > pFeature ) // ---------------------------------------------------------------------------- { CRef<CSeq_id> pId = CReadUtil::AsSeqId( record.Id(), m_iFlags & fAllIdsAsLocal); CRef< CSeq_loc > pLocation( new CSeq_loc ); pLocation->SetInt().SetId( *pId ); pLocation->SetInt().SetFrom( record.SeqStart() ); pLocation->SetInt().SetTo( record.SeqStop() ); if ( record.IsSetStrand() ) { pLocation->SetInt().SetStrand( record.Strand() ); } pLocation = pLocation->Add( pFeature->SetLocation(), CSeq_loc::fSortAndMerge_All, 0 ); pFeature->SetLocation( *pLocation ); return true; }
// ---------------------------------------------------------------------------- string s_GeneKey( const CGff2Record& gff ) // ---------------------------------------------------------------------------- { string strGeneId; if ( ! gff.GetAttribute( "gene_id", strGeneId ) ) { cerr << "Unexpected: GTF feature without a gene_id." << endl; return "gene_id"; } return strGeneId; }
// ---------------------------------------------------------------------------- bool CGtfReader::x_CdsIsPartial( const CGff2Record& record ) // ---------------------------------------------------------------------------- { string strPartial; // if ( record.Type() != "CDS" ) { // return false; // } if ( record.GetAttribute( "partial", strPartial ) ) { return true; } CRef< CSeq_feat > mRna; if ( ! x_FindParentMrna( record, mRna ) ) { return false; } return ( mRna->IsSetPartial() && mRna->GetPartial() ); }
// ----------------------------------------------------------------------------- bool CGtfReader::x_CreateParentCds( const CGff2Record& gff, CRef< CSeq_annot > pAnnot ) // ----------------------------------------------------------------------------- { // // Create a single cds feature. // This creation may either be triggered by an actual CDS feature found in the // gtf, or by a feature that would imply a CDS feature (such as a start codon // or a stop codon). The latter is necessary because nothing the the gtf // standard stipulates that gtf features have to be arranged in any particular // order. // CRef< CSeq_feat > pFeature( new CSeq_feat ); string strType = gff.Type(); if ( strType != "CDS" && strType != "start_codon" && strType != "stop_codon" ) { return false; } if ( ! x_FeatureSetDataCDS( gff, pFeature ) ) { return false; } if ( ! x_CreateFeatureLocation( gff, pFeature ) ) { return false; } if ( ! x_CreateFeatureId( gff, "cds", pFeature ) ) { return false; } if ( ! x_CreateGeneXrefs( gff, pFeature ) ) { return false; } if ( ! x_CreateMrnaXrefs( gff, pFeature ) ) { return false; } if ( ! x_FeatureSetQualifiers( gff, pFeature ) ) { return false; } m_CdsMap[ s_FeatureKey( gff ) ] = pFeature; return xAddFeatureToAnnot( pFeature, pAnnot ); }
// ---------------------------------------------------------------------------- bool CGtfReader::x_FeatureSetDataMRNA( const CGff2Record& record, CRef<CSeq_feat> pFeature) // ---------------------------------------------------------------------------- { if ( ! CGff2Reader::x_FeatureSetDataRna( record, pFeature, CSeqFeatData::eSubtype_mRNA)) { return false; } CRNA_ref& rna = pFeature->SetData().SetRna(); string strValue; if (record.GetAttribute("product", strValue)) { rna.SetExt().SetName(strValue); } return true; }
// ---------------------------------------------------------------------------- bool CGtfReader::x_FeatureSetDataGene( const CGff2Record& record, CRef< CSeq_feat > pFeature ) // ---------------------------------------------------------------------------- { if ( ! CGff2Reader::x_FeatureSetDataGene( record, pFeature ) ) { return false; } CGene_ref& gene = pFeature->SetData().SetGene(); string strValue; if ( record.GetAttribute( "gene_synonym", strValue ) ) { gene.SetSyn().push_back( strValue ); } // mss-399: do -not- use gene_id for /gene_syn or /gene: //if ( record.GetAttribute( "gene_id", strValue ) ) { // gene.SetSyn().push_front( strValue ); //} return true; }
// ---------------------------------------------------------------------------- bool CGff3Reader::x_UpdateAnnot( const CGff2Record& record, CRef< CSeq_annot > pAnnot ) // ---------------------------------------------------------------------------- { string gbkey; record.GetAttribute("gbkey", gbkey); CRef< CSeq_feat > pFeature(new CSeq_feat); // Round trip info: CRef< CGb_qual > pQual( new CGb_qual ); pQual->SetQual( "gff_source" ); pQual->SetVal( record.Source() ); pFeature->SetQual().push_back( pQual ); pQual.Reset( new CGb_qual ); pQual->SetQual( "gff_type" ); pQual->SetVal( record.Type() ); pFeature->SetQual().push_back( pQual ); if ( record.IsSetScore() ) { pQual.Reset( new CGb_qual ); pQual->SetQual( "gff_score" ); pQual->SetVal( NStr::DoubleToString( record.Score() ) ); pFeature->SetQual().push_back( pQual ); } // Special case: exon feature belonging to an RNA we have already seen if (record.Type() == "exon") { string parent; if (record.GetAttribute("Parent", parent)) { IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent); if (it != m_MapIdToFeature.end()) { return record.UpdateFeature(m_iFlags, it->second); } } } // Special case: Piece of another feature we have already seen string id; if (record.GetAttribute("ID", id)) { IdToFeatureMap::iterator it = m_MapIdToFeature.find(id); if (it != m_MapIdToFeature.end()) { return record.UpdateFeature(m_iFlags, it->second); } } // General case: brand new regular feature if (!record.InitializeFeature(m_iFlags, pFeature)) { return false; } string strId; if ( record.GetAttribute( "ID", strId ) ) { m_MapIdToFeature[ strId ] = pFeature; } return x_AddFeatureToAnnot( pFeature, pAnnot ); }
// ---------------------------------------------------------------------------- bool CGtfReader::x_UpdateAnnotCds( const CGff2Record& gff, CRef< CSeq_annot > pAnnot ) // ---------------------------------------------------------------------------- { // // If there is no gene feature to go with this CDS then make one. Otherwise, // make sure the existing gene feature includes the location of the CDS. // CRef< CSeq_feat > pGene; if ( ! x_FindParentGene( gff, pGene ) ) { if ( ! x_CreateParentGene( gff, pAnnot ) ) { return false; } } else { if ( ! x_MergeParentGene( gff, pGene ) ) { return false; } } // // If there is no CDS feature with this gene_id|transcript_id then make one. // Otherwise, fix up the location of the existing one. // CRef< CSeq_feat > pCds; if ( ! x_FindParentCds( gff, pCds ) ) { // // Create a brand new CDS feature: // if ( ! x_CreateParentCds( gff, pAnnot ) ) { return false; } x_FindParentCds( gff, pCds ); } else { // // Update an already existing CDS features: // if ( ! x_MergeFeatureLocationMultiInterval( gff, pCds ) ) { return false; } if (!x_FeatureTrimQualifiers(gff, pCds)) { return false; } } if ( x_CdsIsPartial( gff ) ) { CRef<CSeq_feat> pParent; if ( x_FindParentMrna( gff, pParent ) ) { CSeq_loc& loc = pCds->SetLocation(); size_t uCdsStart = gff.SeqStart(); size_t uMrnaStart = pParent->GetLocation().GetStart( eExtreme_Positional ); if ( uCdsStart == uMrnaStart ) { loc.SetPartialStart( true, eExtreme_Positional ); // cerr << "fuzzed down: " << gff.SeqStart() << " " << gff.SeqStop() << " vs. " << uMrnaStart << endl; } size_t uCdsStop = gff.SeqStop(); size_t uMrnaStop = pParent->GetLocation().GetStop( eExtreme_Positional ); if ( uCdsStop == uMrnaStop && gff.Type() != "stop_codon" ) { loc.SetPartialStop( true, eExtreme_Positional ); // cerr << "fuzzed up : " << gff.SeqStart() << " " << gff.SeqStop() << " vs. " << uMrnaStop << endl; } } } return true; }
// ---------------------------------------------------------------------------- bool CGtfReader::x_UpdateAnnotFeature( const CGff2Record& gff, CRef< CSeq_annot > pAnnot, ILineErrorListener* pEC) // ---------------------------------------------------------------------------- { CRef< CSeq_feat > pFeature( new CSeq_feat ); // // Handle officially recognized GTF types: // string strType = gff.Type(); if ( strType == "CDS" ) { // // Observations: // Location does not include the stop codon hence must be fixed up once // the stop codon is seen. // return x_UpdateAnnotCds( gff, pAnnot ); } if ( strType == "start_codon" ) { // // Observation: // Comes in up to three pieces (depending on splicing). // Location _is_ included in CDS. // return x_UpdateAnnotStartCodon( gff, pAnnot ); } if ( strType == "stop_codon" ) { // // Observation: // Comes in up to three pieces (depending on splicing). // Location not included in CDS hence must be used to fix up location of // the coding region. // return x_UpdateAnnotStopCodon( gff, pAnnot ); } if ( strType == "5UTR" ) { return x_UpdateAnnot5utr( gff, pAnnot ); } if ( strType == "3UTR" ) { return x_UpdateAnnot3utr( gff, pAnnot ); } if ( strType == "inter" ) { return x_UpdateAnnotInter( gff, pAnnot ); } if ( strType == "inter_CNS" ) { return x_UpdateAnnotInterCns( gff, pAnnot ); } if ( strType == "intron_CNS" ) { return x_UpdateAnnotIntronCns( gff, pAnnot ); } if ( strType == "exon" || strType == "initial" || strType == "internal" || strType == "terminal" || strType == "single") { return x_UpdateAnnotExon( gff, pAnnot ); } // // Every other type is not officially sanctioned GTF, and per spec we are // supposed to ignore it. In the spirit of being lenient on input we may // try to salvage some of it anyway. // if ( strType == "gene" ) { // // Not an official GTF feature type but seen frequently. Hence we give // it some recognition. // if ( ! x_CreateParentGene( gff, pAnnot ) ) { return false; } } if (strType == "mRNA") { if ( ! x_CreateParentMrna(gff, pAnnot) ) { return false; } } return x_UpdateAnnotMiscFeature( gff, pAnnot ); }