// Merge two reads files together
void mergeReadFiles(const std::string& readsFile1, const std::string& readsFile2, const std::string& outPrefix)
{
    // If the output prefix is empty, append the reads in readsFile2 onto readsFile1,
    // otherwise cat both files together into a new output file
    std::ostream* pWriter;
    if(outPrefix.empty())
    {
        pWriter = createWriter(readsFile1, std::ios_base::out | std::ios_base::app);
    }
    else
    {
        pWriter = createWriter(makeFilename(outPrefix, ".fa"));

        // Copy reads1 to the outfile
        SeqReader reader(readsFile1);
        SeqRecord record;
        while(reader.get(record))
            record.write(*pWriter);
    }

    // Copy reads2 to the writer
    SeqReader reader(readsFile2);
    SeqRecord record;
    while(reader.get(record))
        record.write(*pWriter);
    delete pWriter;
}
void QCPostProcess::process(const SequenceWorkItem& item, const QCResult& result)
{
    SeqRecord record = item.read;
    if(result.kmerPassed && result.dupPassed && result.hpPassed && result.degenPassed)
    {
        record.write(*m_pCorrectedWriter);
        ++m_readsKept;
    }
    else
    {
        // To be able to rebuild the index after discarding the read, we need to write
        // the rank of the string (its position in the original read file) into the read name
        std::stringstream newID;
        newID << item.read.id << ",seqrank=" << item.idx;
        record.id = newID.str();

        record.write(*m_pDiscardWriter);
        ++m_readsDiscarded;

        if(!result.kmerPassed)
            m_readsFailedKmer += 1;
        else if(!result.dupPassed)
            m_readsFailedDup += 1;
        else if(!result.hpPassed)
            m_readsFailedHP += 1;
        else if(!result.degenPassed)
            m_readsFailedDegen += 1;
    }
}
// Compute the initial BWTs for the input file split into blocks of records using the BCR algorithm
MergeVector computeInitialBCR(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    DNAEncodedStringVector readSequences;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            readSequences.push_back(item.seq.toString());
            ++numReadTotal;
        }

        if(readSequences.size() >= parameters.numReadsPerBatch || (done && readSequences.size() > 0))
        {
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);

            BWTCA::runBauerCoxRosone(&readSequences, bwt_temp_filename, sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            readSequences.clear();
        }
    }
    delete pReader;
    return mergeVector;
}
// Generate a report of the quality of each base
void generate_quality_stats(JSONWriter* pJSONWriter, const std::string& filename)
{
    size_t max_reads = 10000000;
    double sample_rate = 0.05;

    SeqReader reader(filename, SRF_KEEP_CASE | SRF_NO_VALIDATION);
    SeqRecord record;

    size_t n_reads = 0;
    std::vector<size_t> bases_checked;
    std::vector<size_t> sum_quality;
    std::vector<size_t> num_q30;

    while(reader.get(record) && n_reads++ < max_reads)
    {
        if((double)rand() / RAND_MAX < sample_rate && record.qual.length() == record.seq.length())
        {
            size_t l = record.seq.length();
            if(l > bases_checked.size())
            {
                bases_checked.resize(l);
                sum_quality.resize(l);
                num_q30.resize(l);
            }

            for(size_t i = 0; i < l; ++i)
            {
                bases_checked[i]++;
                size_t q = record.getPhredScore(i);
                sum_quality[i] += q;
                num_q30[i] += (q >= 30);
            }
        }
    }

    pJSONWriter->String("QualityScores");
    pJSONWriter->StartObject();

    pJSONWriter->String("mean_quality");
    pJSONWriter->StartArray();
    for(size_t i = 0; i < bases_checked.size(); ++i)
        pJSONWriter->Double((float)sum_quality[i] / bases_checked[i]);
    pJSONWriter->EndArray();

    pJSONWriter->String("fraction_q30");
    pJSONWriter->StartArray();
    for(size_t i = 0; i < bases_checked.size(); ++i)
        pJSONWriter->Double((float)num_q30[i] / bases_checked[i]);
    pJSONWriter->EndArray();
    pJSONWriter->EndObject();
}
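// For orientation, the fragment written by generate_quality_stats above has roughly
// this shape inside the caller's enclosing JSON object (a sketch inferred from the
// writer calls; the numbers are placeholders, not real output):
//
//   "QualityScores": {
//       "mean_quality": [ 33.1, 33.0, ... ],   // mean Phred score per read position
//       "fraction_q30": [ 0.92, 0.91, ... ]    // fraction of sampled bases with Q >= 30
//   }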
void LRCorrectionPostProcess::process(const SequenceWorkItem& item, const LRCorrectionResult& result)
{
    SeqRecord record = item.read;
    record.seq = result.correctedSequence;
    record.qual = "";

    if(!record.seq.empty())
    {
        record.write(*m_pCorrectedWriter);
        m_readsKept += 1;
    }
    else
    {
        m_readsDiscarded += 1;
    }
}
void ErrorCorrectPostProcess::process(const SequenceWorkItem& item, const ErrorCorrectResult& result)
{
    // Determine if the read should be discarded
    bool readQCPass = true;
    if(result.kmerQC)
    {
        m_kmerQCPassed += 1;
    }
    else if(result.overlapQC)
    {
        m_overlapQCPassed += 1;
    }
    else
    {
        readQCPass = false;
        m_qcFail += 1;
    }

    // Collect metrics for the reads that were actually corrected
    if(m_bCollectMetrics && readQCPass)
    {
        collectMetrics(item.read.seq.toString(), result.correctSequence.toString(), item.read.qual);
    }

    SeqRecord record = item.read;
    record.seq = result.correctSequence;

    if(readQCPass || m_pDiscardWriter == NULL)
    {
        record.write(*m_pCorrectedWriter);
        ++m_readsKept;
    }
    else
    {
        record.write(*m_pDiscardWriter);
        ++m_readsDiscarded;
    }
}
bool SGPairedPathResolveVisitor::visit(StringGraph* /*pGraph*/, Vertex* /*pVertex*/)
{
    assert(false);
#if 0
    if(pVertex->getColor() == GC_BLACK)
        return false; // has been resolved already

    // Get the vertex of the pair
    std::string pairID = getPairID(pVertex->getID());
    Vertex* pPair = pGraph->getVertex(pairID);
    if(pPair != NULL)
    {
        PathVector paths;

        // get the expected direction between the vertices based on the PE info
        EdgeDir dir = SGPairedAlgorithms::getDirectionToPair(pVertex->getID());
        SGPairedAlgorithms::searchPaths(pVertex, pPair, dir, 300, paths);

        pVertex->setColor(GC_BLACK);
        pPair->setColor(GC_BLACK);

        std::cout << "Found " << paths.size() << " paths from " << pVertex->getID()
                  << " to " << pPair->getID() << "\n";

        if(paths.size() == 1)
        {
            // A unique path was found; write out the resolved fragment
            std::string fragment = SGPairedAlgorithms::pathToString(pVertex, paths[0]);
            SeqRecord record;
            record.id = pVertex->getID();
            record.seq = fragment;
            record.write(*m_pWriter);
        }
        else
        {
            // No unique path; write the two vertices of the pair out separately
            SeqRecord recordX;
            recordX.id = pVertex->getID();
            recordX.seq = pVertex->getSeq().toString();
            recordX.write(*m_pWriter);

            SeqRecord recordY;
            recordY.id = pPair->getID();
            recordY.seq = pPair->getSeq().toString();
            recordY.write(*m_pWriter);
        }
    }
#endif
    return false;
}
void parseFasta(SeqStream input_stream, string filename, SeqSet &data)
{
    data.filename = filename;
    char ch;
    string temp = "";
    string nm;
    unsigned size_guess = 10000; // Seems like it might speed things up

    // Enclose all of this in a while loop that goes to EOF:
    input_stream.get(ch);
    if(ch != '>')
    {
        throw("Not in FASTA format");
    }

    bool inseq = false;
    bool linebreak = false;
    while(!input_stream.eof())
    {
        SeqRecord rec;
        rec.reserve(size_guess);
        nm = "";

        // Read the header line into the record name
        while(!inseq)
        {
            if(!input_stream.good())
            {
                throw("Problem reading file");
            }
            input_stream.get(ch);
            if(ch == '\n' || ch == '\r')
                inseq = true;   // end of the header line
            else
                nm += ch;       // do not store the line break in the name
        }
        rec.setName(nm);

        temp = "";
        while(inseq)
        {
            input_stream.get(ch);
            if(input_stream.eof())
                break;

            // ">" after a linebreak means a new name
            if(ch == '>' && linebreak)
            {
                inseq = false;
                linebreak = false;
                continue;
            }

            // Ignore, but note linebreaks
            linebreak = false;
            if(ch == '\n' || ch == '\r')
            {
                linebreak = true;
                continue;
            }

            // Ignore whitespace
            if(ch == ' ' || ch == '\t')
            {
                continue;
            }

            temp += ch;
        }
        rec.append(temp);
        data.append(rec);
        size_guess = rec.getSeq().size();
    }
}
void parseFastq(SeqStream input_stream, string filename, SeqSet &data)
{
    data.filename = filename;
    char ch;
    string temp = "";
    string nm = "";
    unsigned size_guess = 150; // Seems like it might speed things up
    unsigned line_num = 0;
    bool linebreak = false;
    bool name = false;

    while(!input_stream.eof())
    {
        // Check if the stream is okay and read a character
        if(!input_stream.good())
        {
            throw("Problem reading file");
        }
        input_stream.get(ch);

        // Check for linebreaks. Treat multiple linebreak characters
        // as one linebreak. Also, count the number of lines, and when
        // four lines have been reached, construct a SeqRecord and reset.
        if(ch == '\n' || ch == '\r')
        {
            if(!linebreak)
            {
                line_num += 1;
                if(line_num == 4)
                {
                    line_num = 0;
                    SeqRecord rec;
                    rec.setName(nm);
                    rec.append(temp);
                    data.append(rec);
                    size_guess = rec.getSeq().size();
                    nm = "";
                    temp = "";
                    temp.reserve(size_guess);
                    name = false;
                }
            }
            linebreak = true;
            continue;
        }

        // If we get this far, the character is not a linebreak
        linebreak = false;

        // Handle the current line of the fastq record
        if(line_num == 0)
        {
            // Name line; the leading '@' is skipped
            if(!name && ch != '@')
            {
                throw("Not in fastq format");
            }
            if(name)
            {
                nm += ch;
            }
            name = true;
        }
        else if(line_num == 1)
        {
            // Sequence
            temp += ch;
        }
        else if(line_num == 2)
        {
            // Plus line - ignore
            continue;
        }
        else if(line_num == 3)
        {
            // Quality scores - ignore
            continue;
        }
    }
}
//
// Main
//
int preprocessMain(int argc, char** argv)
{
    Timer* pTimer = new Timer("sga preprocess");
    parsePreprocessOptions(argc, argv);

    std::cerr << "Parameters:\n";
    std::cerr << "QualTrim: " << opt::qualityTrim << "\n";

    if(opt::qualityFilter >= 0)
        std::cerr << "QualFilter: at most " << opt::qualityFilter << " low quality bases\n";
    else
        std::cerr << "QualFilter: no filtering\n";

    std::cerr << "HardClip: " << opt::hardClip << "\n";
    std::cerr << "Min length: " << opt::minLength << "\n";
    std::cerr << "Sample freq: " << opt::sampleFreq << "\n";
    std::cerr << "PE Mode: " << opt::peMode << "\n";
    std::cerr << "Quality scaling: " << opt::qualityScale << "\n";
    std::cerr << "MinGC: " << opt::minGC << "\n";
    std::cerr << "MaxGC: " << opt::maxGC << "\n";
    std::cerr << "Outfile: " << (opt::outFile.empty() ? "stdout" : opt::outFile) << "\n";
    std::cerr << "Orphan file: " << (opt::orphanFile.empty() ? "none" : opt::orphanFile) << "\n";

    if(opt::bDiscardAmbiguous)
        std::cerr << "Discarding sequences with ambiguous bases\n";
    if(opt::bDustFilter)
        std::cerr << "Dust threshold: " << opt::dustThreshold << "\n";
    if(!opt::suffix.empty())
        std::cerr << "Suffix: " << opt::suffix << "\n";

    if(opt::adapterF.length() && opt::adapterR.length())
    {
        std::cerr << "Adapter sequence fwd: " << opt::adapterF << "\n";
        std::cerr << "Adapter sequence rev: " << opt::adapterR << "\n";
    }

    // Seed the RNG
    srand(time(NULL));

    std::ostream* pWriter;
    if(opt::outFile.empty())
    {
        pWriter = &std::cout;
    }
    else
    {
        std::ostream* pFile = createWriter(opt::outFile);
        pWriter = pFile;
    }

    // Create a filehandle to write orphaned reads to, if necessary
    std::ostream* pOrphanWriter = NULL;
    if(!opt::orphanFile.empty())
        pOrphanWriter = createWriter(opt::orphanFile);

    if(opt::peMode == 0)
    {
        // Treat files as SE data
        while(optind < argc)
        {
            std::string filename = argv[optind++];
            std::cerr << "Processing " << filename << "\n\n";
            SeqReader reader(filename, SRF_NO_VALIDATION);
            SeqRecord record;

            while(reader.get(record))
            {
                bool passed = processRead(record);
                if(passed && samplePass())
                {
                    if(!opt::suffix.empty())
                        record.id.append(opt::suffix);

                    record.write(*pWriter);
                    ++s_numReadsKept;
                    s_numBasesKept += record.seq.length();
                }
            }
        }
    }
    else
    {
        assert(opt::peMode == 1 || opt::peMode == 2);

        int numFiles = argc - optind;
        if(opt::peMode == 1 && numFiles % 2 == 1)
        {
            std::cerr << "Error: An even number of files must be given for pe-mode 1\n";
            exit(EXIT_FAILURE);
        }

        while(optind < argc)
        {
            SeqReader* pReader1;
            SeqReader* pReader2;

            if(opt::peMode == 1)
            {
                // Read from separate files
                std::string filename1 = argv[optind++];
                std::string filename2 = argv[optind++];

                pReader1 = new SeqReader(filename1, SRF_NO_VALIDATION);
                pReader2 = new SeqReader(filename2, SRF_NO_VALIDATION);

                std::cerr << "Processing pe files " << filename1 << ", " << filename2 << "\n";
            }
            else
            {
                // Read from a single interleaved file
                std::string filename = argv[optind++];
                pReader1 = new SeqReader(filename, SRF_NO_VALIDATION);
                pReader2 = pReader1;

                std::cerr << "Processing interleaved pe file " << filename << "\n";
            }

            SeqRecord record1;
            SeqRecord record2;
            while(pReader1->get(record1) && pReader2->get(record2))
            {
                // If the names of the records are the same, append a /1 and /2 to them
                if(record1.id == record2.id)
                {
                    if(!opt::suffix.empty())
                    {
                        record1.id.append(opt::suffix);
                        record2.id.append(opt::suffix);
                    }

                    record1.id.append("/1");
                    record2.id.append("/2");
                }

                // Ensure the read names are sensible
                std::string expectedID2 = getPairID(record1.id);
                std::string expectedID1 = getPairID(record2.id);

                if(expectedID1 != record1.id || expectedID2 != record2.id)
                {
                    std::cerr << "Warning: Pair IDs do not match (expected format /1,/2 or /A,/B)\n";
                    std::cerr << "Read1 ID: " << record1.id << "\n";
                    std::cerr << "Read2 ID: " << record2.id << "\n";
                    s_numInvalidPE += 2;
                }

                bool passed1 = processRead(record1);
                bool passed2 = processRead(record2);

                if(!samplePass())
                    continue;

                if(passed1 && passed2)
                {
                    record1.write(*pWriter);
                    record2.write(*pWriter);

                    s_numReadsKept += 2;
                    s_numBasesKept += record1.seq.length();
                    s_numBasesKept += record2.seq.length();
                }
                else if(passed1 && pOrphanWriter != NULL)
                {
                    record1.write(*pOrphanWriter);
                }
                else if(passed2 && pOrphanWriter != NULL)
                {
                    record2.write(*pOrphanWriter);
                }
            }

            if(pReader2 != pReader1)
            {
                // only delete reader2 if it is a distinct pointer
                delete pReader2;
                pReader2 = NULL;
            }
            delete pReader1;
            pReader1 = NULL;
        }
    }

    if(pWriter != &std::cout)
        delete pWriter;
    if(pOrphanWriter != NULL)
        delete pOrphanWriter;

    std::cerr << "\nPreprocess stats:\n";
    std::cerr << "Reads parsed:\t" << s_numReadsRead << "\n";
    std::cerr << "Reads kept:\t" << s_numReadsKept << " ("
              << (double)s_numReadsKept / (double)s_numReadsRead << ")\n";
    std::cerr << "Reads failed primer screen:\t" << s_numReadsPrimer << " ("
              << (double)s_numReadsPrimer / (double)s_numReadsRead << ")\n";
    std::cerr << "Bases parsed:\t" << s_numBasesRead << "\n";
    std::cerr << "Bases kept:\t" << s_numBasesKept << " ("
              << (double)s_numBasesKept / (double)s_numBasesRead << ")\n";
    std::cerr << "Number of incorrectly paired reads that were discarded: " << s_numInvalidPE << "\n";
    if(opt::bDustFilter)
        std::cerr << "Number of reads failed dust filter: " << s_numFailedDust << "\n";

    delete pTimer;
    return 0;
}
void parseFasta(string filename, SeqSet &data)
{
    //try {
    ifstream input(filename.c_str(), ifstream::in);
    data.filename = filename;
    char ch;
    string temp = "";
    string nm;

    // Enclose all of this in a while loop that goes to EOF:
    input.get(ch);
    if(ch != '>')
    {
        throw("Not in FASTA format");
    }

    bool inseq = false;
    bool linebreak = false;
    while(!input.eof())
    {
        SeqRecord rec;
        nm = "";

        // Read the header line into the record name
        while(!inseq)
        {
            input.get(ch);
            if(ch == '\n' || ch == '\r')
                inseq = true;   // end of the header line
            else
                nm += ch;       // do not store the line break in the name
        }
        rec.setName(nm);

        temp = "";
        while(inseq)
        {
            input.get(ch);
            if(input.eof())
                break;

            // ">" after a linebreak means a new name
            if(ch == '>' && linebreak)
            {
                inseq = false;
                linebreak = false;
                continue;
            }

            // Ignore, but note linebreaks
            linebreak = false;
            if(ch == '\n' || ch == '\r')
            {
                linebreak = true;
                continue;
            }

            // Ignore whitespace
            if(ch == ' ' || ch == '\t')
            {
                continue;
            }

            temp += ch;
        }
        rec.append(temp);
        data.append(rec);
    }
    //} catch (...) {
    //    throw("Problem parsing file");
    //}
}
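// A minimal usage sketch of the file-based parser above (the file name is an
// assumption chosen for illustration; parse errors are thrown as plain C-string
// messages, so a caller would typically wrap the call):
//
//     SeqSet data;
//     try {
//         parseFasta("reads.fa", data);
//     } catch(const char* msg) {
//         cerr << "parse error: " << msg << endl;
//     }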
//
// Main
//
int overlapLongMain(int argc, char** argv)
{
    parseOverlapLongOptions(argc, argv);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setTransitiveTag(true);
    headerRecord.write(*pASQGWriter);

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
    else
        indexPrefix = stripFilename(opt::readsFile);

    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);

    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Read the sequence file and write vertex records for each
    // Also store the read names in a vector of strings
    ReadTable reads;

    SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
    SeqRecord record;
    while(pReader->get(record))
    {
        reads.addRead(record.toSeqItem());
        ASQG::VertexRecord vr(record.id, record.seq.toString());
        vr.write(*pASQGWriter);

        if(reads.getCount() % 100000 == 0)
            printf("Read %zu sequences\n", reads.getCount());
    }

    delete pReader;
    pReader = NULL;

    BWTIndexSet index;
    index.pBWT = pBWT;
    index.pSSA = pSSA;
    index.pReadTable = &reads;

    // Make a prefix for the temporary hits files
    size_t n_reads = reads.getCount();

    omp_set_num_threads(opt::numThreads);

#pragma omp parallel for
    for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
    {
        const SeqItem& curr_read = reads.getRead(read_idx);
        printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());

        SequenceOverlapPairVector sopv =
            KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
                                          opt::seedLength,
                                          opt::minOverlap,
                                          1 - opt::errorRate,
                                          100,
                                          index);

        printf("Found %zu matches\n", sopv.size());
        for(size_t i = 0; i < sopv.size(); ++i)
        {
            std::string match_id = reads.getRead(sopv[i].match_idx).id;

            // We only want to output each edge once so skip this overlap
            // if the matched read has a lexicographically lower ID
            if(curr_read.id > match_id)
                continue;

            std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50);
            printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(),
                   sopv[i].overlap.match[0].start,
                   sopv[i].overlap.match[0].end,
                   match_id.c_str(),
                   sopv[i].overlap.getOverlapLength(),
                   sopv[i].overlap.getPercentIdentity(),
                   sopv[i].overlap.cigar.c_str());

            // Convert to ASQG
            SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]);
            SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]);

            // KmerOverlaps returns the coordinates of the overlap after flipping the reads
            // to ensure the strand matches. The ASQG file wants the coordinates on the original
            // sequencing strand. Flip here if necessary
            if(sopv[i].is_reversed)
                sc2.flip();

            // Convert the SequenceOverlap into ASQG's overlap format
            Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1);

            ASQG::EdgeRecord er(ovr);
            er.setCigarTag(sopv[i].overlap.cigar);
            er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());

#pragma omp critical
            {
                er.write(*pASQGWriter);
            }
        }
    }

    // Cleanup
    delete pReader;
    delete pBWT;
    delete pSSA;
    delete pASQGWriter;
    delete pTimer;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
// Compute the initial BWTs for the input file split into blocks of records using the SAIS algorithm
MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= parameters.numReadsPerBatch || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, 1);

            // Write the BWT to disk
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;
    return mergeVector;
}
// The algorithm is as follows. We create M BWTs for subsets of
// the input reads. These are created independently and written
// to disk. They are then merged either sequentially or pairwise
// to create the final BWT
void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix,
                  const std::string& bwt_extension, const std::string& sai_extension,
                  bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel)
{
    size_t MAX_READS_PER_GROUP = numReadsPerBatch;

    SeqReader* pReader = new SeqReader(in_filename);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(doReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads);

            // Write the BWT to disk
            std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = in_filename;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;

    // Phase 2: Pairwise merge the BWTs
    int round = 1;
    MergeVector nextMergeRound;
    while(mergeVector.size() > 1)
    {
        std::cout << "Starting round " << round << "\n";
        pReader = new SeqReader(in_filename);
        for(size_t i = 0; i < mergeVector.size(); i += 2)
        {
            if(i + 1 != mergeVector.size())
            {
                std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension);
                std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i+1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2,
                                         bwt_merged_name, sai_merged_name,
                                         doReverse, numThreads, storageLevel);

                // pReader now points to the end of item1's block of
                // reads. Skip item2's reads
                assert(curr_idx == item2.start_index);
                while(curr_idx <= item2.end_index)
                {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            }
            else
            {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}