// Builds a TranscriptGeneMap from the record names found in a sequence file.
//
// Every record name is cut at its first space character, the resulting
// transcript names are sorted, and all transcripts are assigned to a single
// placeholder gene called "gene" (gene index 0).
//
// transcriptsFile: path that is handed to StreamingReadParser.
TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile ) {
    // The input carries no gene groupings, so one stand-in gene is used.
    NameVector geneNames {"gene"};
    NameVector transcriptNames;

    std::vector<bfs::path> inputPaths{transcriptsFile};
    StreamingReadParser parser(inputPaths);
    parser.start();
    ReadProducer<StreamingReadParser> producer(parser);

    // Consume records until the producer reports that none are left.
    ReadSeq* record;
    while (producer.nextRead(record)) {
        const std::string fullHeader(record->name, record->nlen);
        // Keep only the part of the header that precedes the first space;
        // when there is no space, find() yields npos and substr keeps it all.
        const auto firstSpace = fullHeader.find(' ');
        transcriptNames.emplace_back(fullHeader.substr(0, firstSpace));
        producer.finishedWithRead(record);
    }

    std::sort(transcriptNames.begin(), transcriptNames.end());

    // Trivial transcript-to-gene vector: every entry points at gene 0.
    IndexVector t2g(transcriptNames.size(), 0);
    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}
// Collects into `symbols` the symbols that `name` resolves to, searching
// from this symbol.
//
// The name is first split into components by the context and looked up
// component-wise. If that finds nothing, the lookup is repeated with the
// whole, unsplit name as a single component.
void Symbol::findSymbols(QualifiedName name, SymbolVector& symbols)
{
    NameVector components;
    context()->separateName(name, components);
    qualifiedNameLookup(components, this, symbols);

    if (!symbols.empty())
    {
        return;
    }

    //
    //  NOTE: the splitting above is fishy. A tuple symbol such as
    //  (int,foo.bar) is split into "(int,foo" and "bar)", which is
    //  clearly wrong: the parens should prevent the dot separation.
    //  These names are split incorrectly in the same way:
    //
    //      int[foo.bar]
    //      [foo.bar]
    //      vector foo.bar[3]
    //
    //  The fallback below assumes such symbols live in the global
    //  namespace and will therefore be found when the name is not
    //  tokenized at all.
    //
    NameVector unsplit;
    unsplit.push_back(name);
    qualifiedNameLookup(unsplit, this, symbols);
}
// Const overload: collects into `symbols` the symbols that `name` resolves
// to, searching from this symbol.
//
// The name is first split into components by the context and looked up
// component-wise. If that finds nothing, the lookup is repeated with the
// whole, unsplit name as a single component.
void Symbol::findSymbols(QualifiedName name, ConstSymbolVector& symbols) const
{
    NameVector components;
    context()->separateName(name, components);
    qualifiedNameLookup(components, this, symbols);

    if (!symbols.empty())
    {
        return;
    }

    //
    //  Fallback for names that should not have been split; the
    //  non-const overload of findSymbols carries the detailed note.
    //
    NameVector unsplit;
    unsplit.push_back(name);
    qualifiedNameLookup(unsplit, this, symbols);
}
// Fills _genesToTranscripts (gene ID -> list of transcript IDs) from
// _transcriptsToGenes (transcript ID -> gene ID), and reports on std::cerr
// the largest number of transcripts attached to a single gene.
//
// The gene name is only printed when at least one transcript was processed;
// with no transcripts there is no such gene, and indexing _geneNames with a
// never-assigned ID would be undefined behaviour.
void _computeReverseMap() {
    _genesToTranscripts.resize( _geneNames.size(), {} );

    size_t maxNumTrans = 0;
    Index maxGene = 0;
    for ( size_t transcriptID = 0; transcriptID < _transcriptsToGenes.size(); ++transcriptID ) {
        const auto geneID = _transcriptsToGenes[transcriptID];
        auto& transcriptsOfGene = _genesToTranscripts[geneID];
        transcriptsOfGene.push_back( transcriptID );

        // Track the gene with the most transcripts seen so far.
        if ( maxNumTrans < transcriptsOfGene.size() ) {
            maxNumTrans = transcriptsOfGene.size();
            maxGene = geneID;
        }
    }

    std::cerr << "max # of transcripts in a gene was " << maxNumTrans;
    if ( maxNumTrans > 0 ) {
        // maxGene is only meaningful once some transcript has been recorded.
        std::cerr << " in gene " << _geneNames[maxGene];
    }
    std::cerr << "\n";
}
// Returns the position of `tname` within _transcriptNames, or INVALID when
// the name is not present.
//
// Uses a binary search, so _transcriptNames must be sorted in ascending
// order (a precondition of std::lower_bound).
Index findTranscriptID( const std::string &tname ) {
    using std::distance;
    using std::lower_bound;

    auto it = lower_bound( _transcriptNames.begin(), _transcriptNames.end(), tname );

    // lower_bound yields the first element that is NOT LESS than tname, which
    // is not necessarily equal to it; without the equality check a missing
    // name would be reported as the ID of a different transcript.
    if ( it == _transcriptNames.end() || *it != tname ) {
        return INVALID;
    }
    return distance( _transcriptNames.begin(), it );
}
// Returns the number of entries in _geneNames.
Size numGenes() {
    const Size geneCount = _geneNames.size();
    return geneCount;
}
// Returns the number of entries in _transcriptNames.
Size numTranscripts() {
    const Size transcriptCount = _transcriptNames.size();
    return transcriptCount;
}
// Builds a TranscriptGeneMap from a stream of whitespace-separated
// "transcript gene" token pairs.
//
// Genes receive IDs in order of first appearance. Transcripts are re-indexed
// by the sorted order of their names, and the returned transcript-to-gene
// vector is expressed in those sorted indices.
//
// ifile: input stream; it is read until extracting a pair fails.
TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile ) {
    using std::get;
    using std::string;
    using std::tuple;
    using std::unordered_map;
    using std::vector;
    using NameID = tuple<string, size_t>;

    // (transcript name, position in the input) for every pair read.
    vector<NameID> records;
    // Gene ID of each transcript, indexed by its position in the input.
    IndexVector geneOfRecord;
    // Gene name -> gene ID, plus the names in ID order.
    unordered_map<string, size_t> geneIDs;
    NameVector geneNames;

    string transcriptName;
    string geneName;
    while ( ifile >> transcriptName >> geneName ) {
        const size_t inputPos = records.size();
        records.push_back( make_tuple(transcriptName, inputPos) );

        auto known = geneIDs.find(geneName);
        if ( known != geneIDs.end() ) {
            // Gene already has an ID; reuse it.
            geneOfRecord.push_back(known->second);
        } else {
            // First occurrence of this gene: the next free ID equals the
            // number of gene names recorded so far.
            const size_t freshID = geneNames.size();
            geneIDs[geneName] = freshID;
            geneNames.push_back(geneName);
            geneOfRecord.push_back(freshID);
        }
    }

    // Order the records by transcript name only.
    std::sort( records.begin(), records.end(),
               []( const NameID & a, const NameID & b ) -> bool {
                   return get<0>(a) < get<0>(b);
               } );

    // Translate from input order to sorted order.
    NameVector transcriptNames;
    IndexVector t2g;
    transcriptNames.resize(geneOfRecord.size());
    t2g.resize(geneOfRecord.size());
    for ( size_t sortedID = 0; sortedID < records.size(); ++sortedID ) {
        const NameID &record = records[sortedID];
        t2g[sortedID] = geneOfRecord[get<1>(record)];
        transcriptNames[sortedID] = get<0>(record);
    }

    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}
// Builds a TranscriptGeneMap from a collection of genomic features.
//
// The features are sorted IN PLACE by sattr.transcript_id (the caller's
// vector is reordered). Each run of equal transcript IDs then contributes one
// transcript, mapped to the gene_id of the first feature of that run. Genes
// receive IDs in order of first appearance.
//
// NOTE(review): the run tracker starts out as the empty string, so a leading
// run of features whose transcript_id compares equal to "" is not recorded.
// This matches the previous behaviour; confirm that it is intended.
//
// feats: features to scan; reordered by this call.
TranscriptGeneMap transcriptToGeneMapFromFeatures( std::vector<GenomicFeature<T>> &feats ) {
    using std::string;
    using std::unordered_map;

    IndexVector t2g;
    NameVector transcriptNames;
    NameVector geneNames;
    // Gene name -> gene ID.
    unordered_map<string, size_t> geneNameToID;

    // Bring all features of one transcript next to each other.
    std::sort( feats.begin(), feats.end(),
               []( const GenomicFeature<T> & a, const GenomicFeature<T> & b ) -> bool {
                   return a.sattr.transcript_id < b.sattr.transcript_id;
               } );

    std::string currentTranscript = "";
    for ( auto & feat : feats ) {
        auto &gene = feat.sattr.gene_id;
        auto &transcript = feat.sattr.transcript_id;

        // Only the first feature of each transcript run creates an entry.
        if ( transcript != currentTranscript ) {
            // One lookup: inserts the next free ID (the number of genes seen
            // so far) when the gene is new, otherwise leaves the map as is.
            auto inserted = geneNameToID.emplace(gene, geneNames.size());
            if ( inserted.second ) {
                geneNames.push_back(gene);
            }
            const size_t geneID = inserted.first->second;

            transcriptNames.push_back(transcript);
            t2g.push_back(geneID);
            currentTranscript = transcript;
        }
    }

    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}