/**
 * Builds a TranscriptGeneMap from the sequence names found in a single file.
 *
 * Each record's name is truncated at the first space character (the whole
 * name is kept when it contains no space). The resulting names are sorted,
 * and every transcript is assigned to one placeholder gene named "gene"
 * (index 0), because the input carries no gene grouping information.
 *
 * @param transcriptsFile path of the file handed to StreamingReadParser
 *                        (presumably FASTA, judging by the function name).
 * @return a TranscriptGeneMap over the sorted names, the one-element gene
 *         list and an all-zero transcript-to-gene index vector.
 */
TranscriptGeneMap transcriptToGeneMapFromFasta(const std::string& transcriptsFile) {
  // Single placeholder gene that every transcript will be mapped to.
  NameVector geneNames{"gene"};
  NameVector transcriptNames;

  // The parser takes a list of paths; we only ever give it the one file.
  std::vector<bfs::path> inputPaths{transcriptsFile};
  StreamingReadParser parser(inputPaths);
  parser.start();

  ReadProducer<StreamingReadParser> producer(parser);
  ReadSeq* record = nullptr;

  // Pull records until the producer reports there are none left.
  while (producer.nextRead(record)) {
    const std::string rawName(record->name, record->nlen);
    // find() yields npos when there is no space, and substr(0, npos)
    // then returns the entire name unchanged.
    const std::string::size_type cutAt = rawName.find(' ');
    transcriptNames.emplace_back(rawName.substr(0, cutAt));
    // Hand the record back to the producer once its name has been copied.
    producer.finishedWithRead(record);
  }

  std::sort(transcriptNames.begin(), transcriptNames.end());

  // With no real gene groupings the mapping is trivial: all zeros.
  IndexVector t2g(transcriptNames.size(), 0);

  return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}
void _computeReverseMap() { _genesToTranscripts.resize( _geneNames.size(), {}); Index geneID; Index transcriptID = 0; size_t maxNumTrans = 0; Index maxGene; for ( size_t transcriptID = 0; transcriptID < _transcriptsToGenes.size(); ++transcriptID ) { _genesToTranscripts[ _transcriptsToGenes[transcriptID] ].push_back( transcriptID ); if ( maxNumTrans < _genesToTranscripts[ _transcriptsToGenes[transcriptID] ].size() ) { maxNumTrans = _genesToTranscripts[ _transcriptsToGenes[transcriptID] ].size(); maxGene = _transcriptsToGenes[transcriptID]; } } std::cerr << "max # of transcripts in a gene was " << maxNumTrans << " in gene " << _geneNames[maxGene] << "\n"; }
/// Returns the number of entries in _geneNames.
Size numGenes() {
  const Size geneCount = _geneNames.size();
  return geneCount;
}
/// Returns the number of entries in _transcriptNames.
Size numTranscripts() {
  const Size transcriptCount = _transcriptNames.size();
  return transcriptCount;
}