Ejemplo n.º 1
0
// Builds a trivial transcript-to-gene map from a transcript sequence file.
//
// Each record's name is cut at its first space; the part before the space is
// used as the transcript name. The names are sorted, and every transcript is
// assigned to a single placeholder gene called "gene" (gene index 0).
//
// transcriptsFile: path of the file handed to StreamingReadParser.
// Returns a TranscriptGeneMap built from the sorted names, the one-element
// gene list and an all-zero transcript-to-gene index vector.
TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile ) {

    using std::vector;

    // Set up the streaming parser over the single input path.
    vector<bfs::path> inputPaths{transcriptsFile};
    StreamingReadParser fastaParser(inputPaths);
    fastaParser.start();

    ReadProducer<StreamingReadParser> records(fastaParser);

    NameVector transcriptNames;
    ReadSeq* record;
    // Consume records for as long as nextRead() reports success
    // (presumably until the input is exhausted -- confirm against ReadProducer).
    while (records.nextRead(record)) {
      // Keep only the part of the header that precedes the first space; when
      // there is no space, find() yields npos and substr() keeps everything.
      const std::string rawHeader(record->name, record->nlen);
      const std::string::size_type firstSpace = rawHeader.find(' ');
      transcriptNames.emplace_back(rawHeader.substr(0, firstSpace));
      // Hand the record back to the producer once we are done with it.
      records.finishedWithRead(record);
    }

    // Names are stored in ascending order.
    std::sort(transcriptNames.begin(), transcriptNames.end());

    // No real gene grouping is available here: one placeholder gene, and
    // every transcript points at index 0.
    NameVector geneNames {"gene"};
    IndexVector t2g(transcriptNames.size(), 0);

    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}
Ejemplo n.º 2
0
//
//  Collects into `symbols` the symbols matching `name`, searching from
//  this symbol.
//
//  The name is first split into components by the context's
//  separateName() and looked up as a qualified name. If that finds
//  nothing, the lookup is repeated with the whole, unsplit name as a
//  single component.
//
//  NOTE: the fallback exists because the splitting is known to be
//  fishy. A tuple symbol such as (int,foo.bar) is separated into
//  "(int,foo" and "bar)", which is clearly wrong -- the parens should
//  prevent the dot separation. These names separate incorrectly for
//  the same reason:
//
//      int[foo.bar]
//      [foo.bar]
//      vector foo.bar[3]
//
//  The assumption made here is that such symbols live in the global
//  namespace and will therefore be found when the name is not
//  tokenized at all.
//
void
Symbol::findSymbols(QualifiedName name, SymbolVector& symbols)
{
    NameVector components;
    context()->separateName(name, components);
    qualifiedNameLookup(components, this, symbols);

    //  Anything found with the split name is the answer.
    if (!symbols.empty())
    {
        return;
    }

    //  Fall back to the untokenized name.
    components.clear();
    components.push_back(name);
    qualifiedNameLookup(components, this, symbols);
}
Ejemplo n.º 3
0
//
//  Const overload: collects into `symbols` the symbols matching `name`,
//  searching from this symbol.
//
//  The name is split into components by the context's separateName()
//  and looked up as a qualified name. If nothing is found, the lookup
//  is retried with the whole name as a single, untokenized component,
//  so that a name the splitter separates incorrectly still gets one
//  more lookup as written.
//
void
Symbol::findSymbols(QualifiedName name, ConstSymbolVector& symbols) const
{
    NameVector components;
    context()->separateName(name, components);
    qualifiedNameLookup(components, this, symbols);

    //  Anything found with the split name is the answer.
    if (!symbols.empty())
    {
        return;
    }

    //  Fall back to the untokenized name.
    components.clear();
    components.push_back(name);
    qualifiedNameLookup(components, this, symbols);
}
Ejemplo n.º 4
0
    // Fills _genesToTranscripts (gene index -> list of transcript indices)
    // from _transcriptsToGenes (transcript index -> gene index), and reports
    // on stderr which gene has the most transcripts.
    //
    // _genesToTranscripts is resized to one bucket per gene name; buckets
    // that already exist are not cleared, so indices are appended to them.
    // Gene indices stored in _transcriptsToGenes are used unchecked and are
    // expected to be valid positions in _geneNames.
    void _computeReverseMap() {

        _genesToTranscripts.resize( _geneNames.size(), {});

        size_t maxNumTrans = 0;
        // Initialised so it is never read uninitialised; it only becomes
        // meaningful once maxNumTrans > 0.
        Index maxGene = 0;
        for ( size_t transcriptID = 0; transcriptID < _transcriptsToGenes.size(); ++transcriptID ) {
            const auto gene = _transcriptsToGenes[transcriptID];
            auto& members = _genesToTranscripts[gene];
            members.push_back( transcriptID );
            // Strict comparison: on ties the gene that reached the size first wins.
            if ( maxNumTrans < members.size() ) {
                maxNumTrans = members.size();
                maxGene = gene;
            }
        }

        // With no transcripts there is no "largest" gene to name, and
        // _geneNames may even be empty, so skip the report in that case.
        if ( maxNumTrans > 0 ) {
            std::cerr << "max # of transcripts in a gene was " << maxNumTrans << " in gene " << _geneNames[maxGene] << "\n";
        }
    }
Ejemplo n.º 5
0
 // Returns the position of `tname` within _transcriptNames, or INVALID when
 // the name is not present.
 //
 // The lookup is a binary search, so _transcriptNames must be sorted in
 // ascending order (NOTE(review): confirm every construction path sorts it).
 Index findTranscriptID( const std::string &tname ) {
     using std::distance;
     using std::lower_bound;
     auto it = lower_bound( _transcriptNames.begin(), _transcriptNames.end(), tname );
     // lower_bound yields the first entry that is not less than tname, which
     // is only a hit when that entry is actually equal to tname; otherwise we
     // would hand back the index of an unrelated transcript.
     if ( it == _transcriptNames.end() || *it != tname ) {
         return INVALID;
     }
     return static_cast<Index>( distance(_transcriptNames.begin(), it) );
 }
Ejemplo n.º 6
0
 // Number of entries currently stored in _geneNames.
 Size numGenes() {
     const Size geneCount = _geneNames.size();
     return geneCount;
 }
Ejemplo n.º 7
0
 // Number of entries currently stored in _transcriptNames.
 Size numTranscripts() {
     const Size transcriptCount = _transcriptNames.size();
     return transcriptCount;
 }
Ejemplo n.º 8
0
// Reads a transcript-to-gene table from `ifile` and builds a
// TranscriptGeneMap from it.
//
// The stream is consumed as whitespace-separated pairs
// "<transcript> <gene>" until extraction of a pair fails. Genes receive
// indices in order of first appearance. Transcripts are then sorted by name
// and the gene assignments are permuted to follow that order. Repeated
// transcript names are kept as separate entries; their relative order after
// sorting is unspecified because std::sort is not stable.
//
// ifile: open input stream positioned at the first pair.
// Returns the map built from the sorted transcript names, the gene names in
// first-seen order, and the transcript-to-gene index vector.
TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile ) {

    using std::unordered_map;
    using std::vector;
    using std::tuple;
    using std::string;
    using std::get;

    // (transcript name, position in input order)
    using NameID = tuple<string, size_t>;

    // transcripts in input order, each tagged with its input position
    vector<NameID> transcripts;
    // gene index of each transcript, in input order
    IndexVector t2gUnordered;
    // gene name -> gene index
    unordered_map<string, size_t> geneNameToID;
    // gene names, indexed by gene index
    NameVector geneNames;

    string transcript;
    string gene;

    while ( ifile >> transcript >> gene ) {
        // The transcript's provisional ID is its position in the input.
        transcripts.emplace_back( transcript, transcripts.size() );

        // One hash lookup: insert the gene with the next free index, or get
        // back the existing entry when the gene has been seen before.
        auto inserted = geneNameToID.emplace( gene, geneNames.size() );
        if ( inserted.second ) {
            geneNames.push_back(gene);
        }

        // Map the transcript to the gene in terms of their IDs
        t2gUnordered.push_back( inserted.first->second );
    }

    // Order the transcripts by name only; the input position rides along.
    std::sort( transcripts.begin(), transcripts.end(),
               []( const NameID & a, const NameID & b) -> bool { return get<0>(a) < get<0>(b); } );

    // Size the outputs up front so they can be filled by index.
    NameVector transcriptNames;
    IndexVector t2g;
    transcriptNames.resize(transcripts.size());
    t2g.resize(transcripts.size());

    for ( size_t newID = 0; newID < transcripts.size(); ++newID ) {
        // Carry each transcript's gene assignment over to its sorted position.
        const NameID& entry = transcripts[newID];
        transcriptNames[newID] = get<0>(entry);
        t2g[newID] = t2gUnordered[get<1>(entry)];
    }

    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}
Ejemplo n.º 9
0
// Builds a TranscriptGeneMap from a list of genomic features.
//
// NOTE: `feats` is sorted in place by sattr.transcript_id, so the caller's
// vector is reordered. Each run of features sharing a transcript id
// contributes one transcript, mapped to the gene id of the first feature in
// that run. Genes receive indices in order of first appearance.
//
// Because the "previous transcript" sentinel starts out as the empty string
// and the features are sorted ascending, features whose transcript id is
// empty sort first and are never recorded.
//
// feats: features to scan; reordered by this call.
// Returns the map built from the transcript names (in sorted order), the
// gene names and the transcript-to-gene index vector.
TranscriptGeneMap transcriptToGeneMapFromFeatures( std::vector<GenomicFeature<T>> &feats ) {
    using std::unordered_map;
    using std::string;

    IndexVector t2g;
    NameVector transcriptNames;
    NameVector geneNames;

    // gene name -> gene index
    unordered_map<string, size_t> geneNameToID;

    // Group features of the same transcript next to each other.
    std::sort( feats.begin(), feats.end(),
    []( const GenomicFeature<T> & a, const GenomicFeature<T> & b) -> bool {
        return a.sattr.transcript_id < b.sattr.transcript_id;
    } );

    std::string currentTranscript = "";
    for ( const auto & feat : feats ) {

        const auto &gene = feat.sattr.gene_id;
        const auto &transcript = feat.sattr.transcript_id;

        // Only the first feature of each run introduces a new transcript.
        if ( transcript != currentTranscript ) {
            // One hash lookup: insert the gene with the next free index, or
            // get back the existing entry when the gene has been seen before.
            auto inserted = geneNameToID.emplace( gene, geneNames.size() );
            if ( inserted.second ) {
                geneNames.push_back(gene);
            }

            transcriptNames.push_back(transcript);
            t2g.push_back( inserted.first->second );

            currentTranscript = transcript;
        }

    }

    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}