Exemple #1
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_FeatureSetDataCDS(
    const CGff2Record& record,
    CRef< CSeq_feat > pFeature )
//  ----------------------------------------------------------------------------
{
    if ( ! CGff2Reader::x_FeatureSetDataCDS( record, pFeature ) ) {
        return false;
    }

    CCdregion& cdr = pFeature->SetData().SetCdregion();
    string strValue;
    if ( record.GetAttribute( "protein_id", strValue ) ) {
        CRef<CSeq_id> pId = CReadUtil::AsSeqId(strValue,m_iFlags);
        if (pId->IsGenbank()) {
            pFeature->SetProduct().SetWhole(*pId);
        }
    }
    if ( record.GetAttribute( "ribosomal_slippage", strValue ) ) {
        pFeature->SetExcept( true );
        pFeature->SetExcept_text( "ribosomal slippage" );
    }
    if ( record.GetAttribute( "transl_table", strValue ) ) {
        CRef< CGenetic_code::C_E > pGc( new CGenetic_code::C_E );
        pGc->SetId( NStr::StringToUInt( strValue ) );
        cdr.SetCode().Set().push_back( pGc );
    }
    return true;
}
Exemple #2
0
//  ----------------------------------------------------------------------------
bool CGtfReader::xFeatureSetQualifiersGene(
    const CGff2Record& record,
    CRef< CSeq_feat > pFeature )
//  ----------------------------------------------------------------------------
{
    //
    //  Create GB qualifiers for the record attributes:
    //
    CRef< CGb_qual > pQual(0);
    const CGff2Record::TAttributes& attrs = record.Attributes();
    CGff2Record::TAttrCit it = attrs.begin();
    for (/*NOOP*/; it != attrs.end(); ++it) {
        // gtf genes don't get transcript_id
        if (it->first == "transcript_id") {
            continue;
        }
        // special case some well-known attributes
        if (x_ProcessQualifierSpecialCase(it, pFeature)) {
            continue;
        }

        // turn everything else into a qualifier
        pQual.Reset(new CGb_qual);
        pQual->SetQual(it->first);
        pQual->SetVal(it->second);
        pFeature->SetQual().push_back(pQual);
    } 
    return true;
}
Exemple #3
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_MergeFeatureLocationSingleInterval(
    const CGff2Record& record,
    CRef< CSeq_feat > pFeature )
//  ----------------------------------------------------------------------------
{
    const CSeq_interval& gene_int = pFeature->GetLocation().GetInt();
    if ( gene_int.GetFrom() > record.SeqStart() -1 ) {
        pFeature->SetLocation().SetInt().SetFrom( record.SeqStart() );
    }
    if ( gene_int.GetTo() < record.SeqStop() - 1 ) {
        pFeature->SetLocation().SetInt().SetTo( record.SeqStop() );
    }
    if (record.Type() == "CDS"  &&  pFeature->GetData().IsCdregion()) {
        return x_FeatureTrimQualifiers(record, pFeature);
    }
    return true;
}
Exemple #4
0
//  ----------------------------------------------------------------------------
string s_FeatureKey(
    const CGff2Record& gff )
//  ----------------------------------------------------------------------------
{
    //
    //  Build the lookup key for a record: records of type "gene" are keyed by
    //  their gene key alone, everything else by "<gene key>|<transcript_id>".
    //  A missing transcript_id is reported on cerr and replaced by the
    //  literal text "transcript_id".
    //
    const string geneKey = s_GeneKey( gff );
    if ( gff.Type() == "gene" ) {
        return geneKey;
    }

    string transcriptKey;
    const bool hasTranscriptId =
        gff.GetAttribute( "transcript_id", transcriptKey );
    if ( ! hasTranscriptId ) {
        cerr << "Unexpected: GTF feature without a transcript_id." << endl;
        transcriptKey = "transcript_id";
    }

    string featureKey( geneKey );
    featureKey += "|";
    featureKey += transcriptKey;
    return featureKey;
}
Exemple #5
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_CreateFeatureLocation(
    const CGff2Record& record,
    CRef< CSeq_feat > pFeature )
//  ----------------------------------------------------------------------------
{
    CRef<CSeq_id> pId = CReadUtil::AsSeqId(
        record.Id(), m_iFlags & fAllIdsAsLocal);

    CSeq_interval& location = pFeature->SetLocation().SetInt();
    location.SetId( *pId );
    location.SetFrom( record.SeqStart() );
    if (record.Type() != "mRNA") {
        location.SetTo(record.SeqStop());
    }
    else {
        // place holder
        //  actual location will be computed from the exons and CDSs living on 
        //  this feature.
        location.SetTo(record.SeqStart());
    }
    if ( record.IsSetStrand() ) {
        location.SetStrand( record.Strand() );
    }

    return true;
}
Exemple #6
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_MergeFeatureLocationMultiInterval(
    const CGff2Record& record,
    CRef< CSeq_feat > pFeature )
//  ----------------------------------------------------------------------------
{
    CRef<CSeq_id> pId = CReadUtil::AsSeqId(
        record.Id(), m_iFlags & fAllIdsAsLocal);

    CRef< CSeq_loc > pLocation( new CSeq_loc );
    pLocation->SetInt().SetId( *pId );
    pLocation->SetInt().SetFrom( record.SeqStart() );
    pLocation->SetInt().SetTo( record.SeqStop() );
    if ( record.IsSetStrand() ) {
        pLocation->SetInt().SetStrand( record.Strand() );
    }
    pLocation = pLocation->Add( 
        pFeature->SetLocation(), CSeq_loc::fSortAndMerge_All, 0 );
    pFeature->SetLocation( *pLocation );
    return true;
}
Exemple #7
0
//  ----------------------------------------------------------------------------
string s_GeneKey(
    const CGff2Record& gff )
//  ----------------------------------------------------------------------------
{
    //
    //  The gene key of a record is the value of its "gene_id" attribute. If
    //  the attribute is missing, that is reported on cerr and the literal
    //  text "gene_id" is used as the key instead.
    //
    string geneId;
    const bool hasGeneId = gff.GetAttribute( "gene_id", geneId );
    if ( hasGeneId ) {
        return geneId;
    }
    cerr << "Unexpected: GTF feature without a gene_id." << endl;
    return "gene_id";
}
Exemple #8
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_CdsIsPartial(
    const CGff2Record& record )
//  ----------------------------------------------------------------------------
{
    string strPartial;
//    if ( record.Type() != "CDS" ) {
//        return false;
//    }
    if ( record.GetAttribute( "partial", strPartial ) ) {
        return true;
    }
    CRef< CSeq_feat > mRna;
    if ( ! x_FindParentMrna( record, mRna ) ) {
        return false;
    }
    return ( mRna->IsSetPartial() && mRna->GetPartial() );
}
Exemple #9
0
//  -----------------------------------------------------------------------------
bool CGtfReader::x_CreateParentCds(
    const CGff2Record& gff,
    CRef< CSeq_annot > pAnnot )
//  -----------------------------------------------------------------------------
{
    //
    // Create a single cds feature.
	// This creation may either be triggered by an actual CDS feature found in the
	//	gtf, or by a feature that would imply a CDS feature (such as a start codon 
	//	or a stop codon). The latter is necessary because nothing the the gtf 
	//	standard stipulates that gtf features have to be arranged in any particular
	//	order.
    //
    CRef< CSeq_feat > pFeature( new CSeq_feat );

    string strType = gff.Type();
    if ( strType != "CDS"  &&  strType != "start_codon"  &&  strType != "stop_codon" ) {
        return false;
    }

    if ( ! x_FeatureSetDataCDS( gff, pFeature ) ) {
        return false;
    }
    if ( ! x_CreateFeatureLocation( gff, pFeature ) ) {
        return false;
    }
    if ( ! x_CreateFeatureId( gff, "cds", pFeature ) ) {
        return false;
    }
    if ( ! x_CreateGeneXrefs( gff, pFeature ) ) {
        return false;
    }
    if ( ! x_CreateMrnaXrefs( gff, pFeature ) ) {
        return false;
    }
    if ( ! x_FeatureSetQualifiers( gff, pFeature ) ) {
        return false;
    }

    m_CdsMap[ s_FeatureKey( gff ) ] = pFeature;

    return xAddFeatureToAnnot( pFeature, pAnnot );
}
Exemple #10
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_FeatureSetDataMRNA(
    const CGff2Record& record,
    CRef<CSeq_feat> pFeature)
//  ----------------------------------------------------------------------------
{
    if ( ! CGff2Reader::x_FeatureSetDataRna( 
            record, pFeature, CSeqFeatData::eSubtype_mRNA)) {
        return false;
    }
    
    CRNA_ref& rna = pFeature->SetData().SetRna();

    string strValue;
    if (record.GetAttribute("product", strValue)) {
        rna.SetExt().SetName(strValue);
    }

    return true;
}
Exemple #11
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_FeatureSetDataGene(
    const CGff2Record& record,
    CRef< CSeq_feat > pFeature )
//  ----------------------------------------------------------------------------
{
    if ( ! CGff2Reader::x_FeatureSetDataGene( record, pFeature ) ) {
        return false;
    }

    CGene_ref& gene = pFeature->SetData().SetGene();

    string strValue;
    if ( record.GetAttribute( "gene_synonym", strValue ) ) {
        gene.SetSyn().push_back( strValue );
    }
    //  mss-399: do -not- use gene_id for /gene_syn or /gene:
    //if ( record.GetAttribute( "gene_id", strValue ) ) {
    //    gene.SetSyn().push_front( strValue );
    //}
    return true;
}
Exemple #12
0
//  ----------------------------------------------------------------------------
bool CGff3Reader::x_UpdateAnnot(
    const CGff2Record& record,
    CRef< CSeq_annot > pAnnot )
//  ----------------------------------------------------------------------------
{
    string gbkey;
    record.GetAttribute("gbkey", gbkey);
    CRef< CSeq_feat > pFeature(new CSeq_feat);

    //  Round trip info:
    CRef< CGb_qual > pQual( new CGb_qual );
    pQual->SetQual( "gff_source" );
    pQual->SetVal( record.Source() );
    pFeature->SetQual().push_back( pQual );

    pQual.Reset( new CGb_qual );
    pQual->SetQual( "gff_type" );
    pQual->SetVal( record.Type() );
    pFeature->SetQual().push_back( pQual );

    if ( record.IsSetScore() ) {
        pQual.Reset( new CGb_qual );
        pQual->SetQual( "gff_score" );
        pQual->SetVal( NStr::DoubleToString( record.Score() ) );
        pFeature->SetQual().push_back( pQual );
    }

    //  Special case: exon feature belonging to an RNA we have already seen
    if (record.Type() == "exon") {
        string parent;
        if (record.GetAttribute("Parent", parent)) {
            IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
            if (it != m_MapIdToFeature.end()) {
                return record.UpdateFeature(m_iFlags, it->second);
            }
        }
    }

    //  Special case: Piece of another feature we have already seen
    string id;
    if (record.GetAttribute("ID", id)) {
        IdToFeatureMap::iterator it = m_MapIdToFeature.find(id);
        if (it != m_MapIdToFeature.end()) {
            return record.UpdateFeature(m_iFlags, it->second);
        }
    }

    //  General case: brand new regular feature
    if (!record.InitializeFeature(m_iFlags, pFeature)) {
        return false;
    }

    string strId;
    if ( record.GetAttribute( "ID", strId ) ) {
        m_MapIdToFeature[ strId ] = pFeature;
    }
    return x_AddFeatureToAnnot( pFeature, pAnnot );
}
Exemple #13
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_UpdateAnnotCds(
    const CGff2Record& gff,
    CRef< CSeq_annot > pAnnot )
//  ----------------------------------------------------------------------------
{
    //
    //  Fold one record into the gene and CDS features it belongs to, creating
    //  either of them if it does not exist yet, then mark the CDS location as
    //  partial where a partial CDS runs up to the edge of its parent mRNA.
    //  Returns false as soon as any of the find/create/merge steps fails.
    //

    //
    // If there is no gene feature to go with this CDS then make one. Otherwise,
    //  make sure the existing gene feature includes the location of the CDS.
    //
    CRef< CSeq_feat > pGene;
    if ( x_FindParentGene( gff, pGene ) ) {
        if ( ! x_MergeParentGene( gff, pGene ) ) {
            return false;
        }
    }
    else if ( ! x_CreateParentGene( gff, pAnnot ) ) {
        return false;
    }

    //
    // If there is no CDS feature with this gene_id|transcript_id then make one.
    //  Otherwise, fix up the location of the existing one.
    //
    CRef< CSeq_feat > pCds;
    if ( x_FindParentCds( gff, pCds ) ) {
        //
        // Update an already existing CDS feature:
        //
        if ( ! x_MergeFeatureLocationMultiInterval( gff, pCds ) ) {
            return false;
        }
        if ( ! x_FeatureTrimQualifiers( gff, pCds ) ) {
            return false;
        }
    }
    else {
        //
        // Create a brand new CDS feature, then look it up again to get hold
        //  of it. The lookup result must not be ignored: pCds is dereferenced
        //  further down and would still be empty if the lookup failed.
        //
        if ( ! x_CreateParentCds( gff, pAnnot ) ) {
            return false;
        }
        if ( ! x_FindParentCds( gff, pCds ) ) {
            return false;
        }
    }

    //
    // Partial handling only applies to a partial CDS with a known parent mRNA.
    //
    if ( ! x_CdsIsPartial( gff ) ) {
        return true;
    }
    CRef<CSeq_feat> pParent;
    if ( ! x_FindParentMrna( gff, pParent ) ) {
        return true;
    }

    CSeq_loc& loc = pCds->SetLocation();

    // Record starts exactly where the mRNA starts: CDS start is partial.
    size_t uCdsStart = gff.SeqStart();
    size_t uMrnaStart = pParent->GetLocation().GetStart( eExtreme_Positional );
    if ( uCdsStart == uMrnaStart ) {
        loc.SetPartialStart( true, eExtreme_Positional );
    }

    // Record stops exactly where the mRNA stops: CDS stop is partial, unless
    //  the record is a stop codon.
    size_t uCdsStop = gff.SeqStop();
    size_t uMrnaStop = pParent->GetLocation().GetStop( eExtreme_Positional );
    if ( uCdsStop == uMrnaStop  &&  gff.Type() != "stop_codon" ) {
        loc.SetPartialStop( true, eExtreme_Positional );
    }
    return true;
}
Exemple #14
0
//  ----------------------------------------------------------------------------
bool CGtfReader::x_UpdateAnnotFeature(
    const CGff2Record& gff,
    CRef< CSeq_annot > pAnnot,
    ILineErrorListener* pEC)
//  ----------------------------------------------------------------------------
{
    CRef< CSeq_feat > pFeature( new CSeq_feat );

    //
    // Handle officially recognized GTF types:
    //
    string strType = gff.Type();
    if ( strType == "CDS" ) {
        //
        // Observations:
        // Location does not include the stop codon hence must be fixed up once
        //  the stop codon is seen.
        //
        return x_UpdateAnnotCds( gff, pAnnot );
    }
    if ( strType == "start_codon" ) {
        //
        // Observation:
        // Comes in up to three pieces (depending on splicing).
        // Location _is_ included in CDS.
        //
        return x_UpdateAnnotStartCodon( gff, pAnnot );
    }
    if ( strType == "stop_codon" ) {
        //
        // Observation:
        // Comes in up to three pieces (depending on splicing).
        // Location not included in CDS hence must be used to fix up location of
        //  the coding region.
        //
        return x_UpdateAnnotStopCodon( gff, pAnnot );
    }
    if ( strType == "5UTR" ) {
        return x_UpdateAnnot5utr( gff, pAnnot );
    }
    if ( strType == "3UTR" ) {
        return x_UpdateAnnot3utr( gff, pAnnot );
    }
    if ( strType == "inter" ) {
        return x_UpdateAnnotInter( gff, pAnnot );
    }
    if ( strType == "inter_CNS" ) {
        return x_UpdateAnnotInterCns( gff, pAnnot );
    }
    if ( strType == "intron_CNS" ) {
        return x_UpdateAnnotIntronCns( gff, pAnnot );
    }
    if ( strType == "exon"  ||
         strType == "initial"  ||
         strType == "internal"  ||
         strType == "terminal"  ||
         strType == "single") {
        return x_UpdateAnnotExon( gff, pAnnot );
    }

    //
    //  Every other type is not officially sanctioned GTF, and per spec we are
    //  supposed to ignore it. In the spirit of being lenient on input we may
    //  try to salvage some of it anyway.
    //
    if ( strType == "gene" ) {
        //
        // Not an official GTF feature type but seen frequently. Hence we give
        //  it some recognition.
        //
        if ( ! x_CreateParentGene( gff, pAnnot ) ) {
            return false;
        }
    }
    if (strType == "mRNA") {
        if ( ! x_CreateParentMrna(gff, pAnnot) ) {
            return false;
        }
    }
    return x_UpdateAnnotMiscFeature( gff, pAnnot );
}