/**
 * Build a trivial transcript-to-gene map from a file of transcript sequences.
 *
 * Each record's name is truncated at the first space character (the whole
 * name is kept when it contains no space). The resulting transcript names are
 * sorted, and every transcript is assigned to a single placeholder gene named
 * "gene" (index 0).
 *
 * @param transcriptsFile  Path of the transcript sequence file to read.
 * @return A TranscriptGeneMap built from the sorted transcript names, the
 *         one-element gene name list, and an all-zero transcript-to-gene
 *         index vector.
 */
TranscriptGeneMap transcriptToGeneMapFromFasta(const std::string& transcriptsFile) {
    // The parser takes a list of paths; here it holds just the one input.
    std::vector<bfs::path> inputFiles{transcriptsFile};
    StreamingReadParser fastaParser(inputFiles);
    fastaParser.start();
    ReadProducer<StreamingReadParser> reader(fastaParser);

    NameVector transcriptNames;

    // Handed to nextRead() on every iteration; presumably filled in by it.
    ReadSeq* record;

    // Consume records until the producer reports that none are left.
    while (reader.nextRead(record)) {
        const std::string rawName(record->name, record->nlen);

        // Keep only the text before the first space. When no space exists,
        // find() yields npos and substr() returns the entire name.
        const auto firstSpace = rawName.find(' ');
        transcriptNames.emplace_back(rawName.substr(0, firstSpace));

        // Hand the record back to the producer once we are done with it.
        reader.finishedWithRead(record);
    }

    // Keep the names in sorted order.
    std::sort(transcriptNames.begin(), transcriptNames.end());

    // There are no real gene groupings available, so the mapping is trivial:
    // a single gene, and every transcript points at index 0.
    NameVector geneNames{"gene"};
    IndexVector t2g(transcriptNames.size(), 0);

    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
}
/**
 * Look up the index of a transcript by name.
 *
 * Performs a binary search over _transcriptNames, so it assumes that
 * container is sorted in ascending order.
 *
 * @param tname  Name of the transcript to look for.
 * @return The position of tname within _transcriptNames, or INVALID when no
 *         entry is exactly equal to tname.
 */
Index findTranscriptID( const std::string &tname ) {
    using std::distance;
    using std::lower_bound;

    auto it = lower_bound( _transcriptNames.begin(), _transcriptNames.end(), tname );

    // lower_bound yields the first element that is NOT LESS than tname, which
    // is not necessarily equal to it. Without the equality check, a missing
    // name that sorts before some stored name would be reported as that
    // unrelated transcript's index.
    if ( it == _transcriptNames.end() || *it != tname ) {
        return INVALID;
    }

    return static_cast<Index>( distance(_transcriptNames.begin(), it) );
}