// Compute the initial BWTs for the input file split into blocks of records using the BCR algorithm MergeVector computeInitialBCR(const BWTDiskParameters& parameters) { SeqReader* pReader = new SeqReader(parameters.inFile); SeqRecord record; int groupID = 0; size_t numReadTotal = 0; MergeVector mergeVector; MergeItem mergeItem; mergeItem.start_index = 0; // Phase 1: Compute the initial BWTs DNAEncodedStringVector readSequences; bool done = false; while(!done) { done = !pReader->get(record); if(!done) { // the read is valid SeqItem item = record.toSeqItem(); if(parameters.bBuildReverse) item.seq.reverse(); readSequences.push_back(item.seq.toString()); ++numReadTotal; } if(readSequences.size() >= parameters.numReadsPerBatch || (done && readSequences.size() > 0)) { std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension); std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension); BWTCA::runBauerCoxRosone(&readSequences, bwt_temp_filename, sai_temp_filename); // Push the merge info mergeItem.end_index = numReadTotal - 1; // inclusive mergeItem.reads_filename = parameters.inFile; mergeItem.bwt_filename = bwt_temp_filename; mergeItem.sai_filename = sai_temp_filename; mergeVector.push_back(mergeItem); // Start the new group mergeItem.start_index = numReadTotal; ++groupID; readSequences.clear(); } } delete pReader; return mergeVector; }
// // Main // int overlapLongMain(int argc, char** argv) { parseOverlapLongOptions(argc, argv); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setTransitiveTag(true); headerRecord.write(*pASQGWriter); // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Read the sequence file and write vertex records for each // Also store the read names in a vector of strings ReadTable reads; SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION); SeqRecord record; while(pReader->get(record)) { reads.addRead(record.toSeqItem()); ASQG::VertexRecord vr(record.id, record.seq.toString()); vr.write(*pASQGWriter); if(reads.getCount() % 100000 == 0) printf("Read %zu sequences\n", reads.getCount()); } delete pReader; pReader = NULL; BWTIndexSet index; index.pBWT = pBWT; index.pSSA = pSSA; index.pReadTable = &reads; // Make a prefix for the temporary hits files size_t n_reads = reads.getCount(); omp_set_num_threads(opt::numThreads); #pragma omp parallel for for(size_t read_idx = 0; read_idx < n_reads; ++read_idx) { const SeqItem& curr_read = reads.getRead(read_idx); printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length()); SequenceOverlapPairVector sopv = KmerOverlaps::retrieveMatches(curr_read.seq.toString(), opt::seedLength, opt::minOverlap, 1 - opt::errorRate, 100, index); printf("Found %zu matches\n", sopv.size()); for(size_t i = 0; i < sopv.size(); ++i) { std::string match_id = reads.getRead(sopv[i].match_idx).id; // We only want to output each edge once so skip this overlap // if the matched read has a lexicographically lower ID if(curr_read.id > match_id) continue; std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50); printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(), sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, match_id.c_str(), sopv[i].overlap.getOverlapLength(), sopv[i].overlap.getPercentIdentity(), sopv[i].overlap.cigar.c_str()); // Convert to ASQG SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]); SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]); // KmerOverlaps returns the coordinates of the overlap after flipping the reads // to ensure the strand matches. The ASQG file wants the coordinate of the original // sequencing strand. Flip here if necessary if(sopv[i].is_reversed) sc2.flip(); // Convert the SequenceOverlap the ASQG's overlap format Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1); ASQG::EdgeRecord er(ovr); er.setCigarTag(sopv[i].overlap.cigar); er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity()); #pragma omp critical { er.write(*pASQGWriter); } } } // Cleanup delete pReader; delete pBWT; delete pSSA; delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// The algorithm is as follows. We create M BWTs for subsets of // the input reads. These are created independently and written // to disk. They are then merged either sequentially or pairwise // to create the final BWT void buildBWTDisk(const BWTDiskParameters& parameters) { // Build the initial bwts for subsets of the data MergeVector mergeVector; if(parameters.bUseBCR) mergeVector = computeInitialBCR(parameters); else mergeVector = computeInitialSAIS(parameters); // Phase 2: Pairwise merge the BWTs int groupID = mergeVector.size(); // Initial the name of the next intermediate bwt int round = 1; MergeVector nextMergeRound; while(mergeVector.size() > 1) { std::cout << "Starting round " << round << "\n"; SeqReader* pReader = new SeqReader(parameters.inFile); SeqRecord record; for(size_t i = 0; i < mergeVector.size(); i+=2) { if(i + 1 != mergeVector.size()) { std::string bwt_merged_name = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension); std::string sai_merged_name = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension); MergeItem item1 = mergeVector[i]; MergeItem item2 = mergeVector[i+1]; // Perform the actual merge int64_t curr_idx = merge(pReader, item1, item2, bwt_merged_name, sai_merged_name, parameters.bBuildReverse, parameters.numThreads, parameters.storageLevel); // pReader now points to the end of item1's block of // reads. Skip item2's reads assert(curr_idx == item2.start_index); while(curr_idx <= item2.end_index) { bool eof = !pReader->get(record); assert(!eof); (void)eof; ++curr_idx; } // Create the merged mergeItem to use in the next round MergeItem merged; merged.start_index = item1.start_index; merged.end_index = item2.end_index; merged.bwt_filename = bwt_merged_name; merged.sai_filename = sai_merged_name; nextMergeRound.push_back(merged); // Done with the temp files, remove them unlink(item1.bwt_filename.c_str()); unlink(item2.bwt_filename.c_str()); unlink(item1.sai_filename.c_str()); unlink(item2.sai_filename.c_str()); ++groupID; } else { // Singleton, pass through to the next round nextMergeRound.push_back(mergeVector[i]); } } delete pReader; mergeVector.clear(); mergeVector.swap(nextMergeRound); ++round; } assert(mergeVector.size() == 1); // Done, rename the files to their final name std::stringstream bwt_ss; bwt_ss << parameters.outPrefix << parameters.bwtExtension << (USE_GZ ? ".gz" : ""); std::string bwt_final_filename = bwt_ss.str(); rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str()); std::stringstream sai_ss; sai_ss << parameters.outPrefix << parameters.saiExtension << (USE_GZ ? ".gz" : ""); std::string sai_final_filename = sai_ss.str(); rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str()); }
// Compute the initial BWTs for the input file split into blocks of records using the SAIS algorithm MergeVector computeInitialSAIS(const BWTDiskParameters& parameters) { SeqReader* pReader = new SeqReader(parameters.inFile); SeqRecord record; int groupID = 0; size_t numReadTotal = 0; MergeVector mergeVector; MergeItem mergeItem; mergeItem.start_index = 0; // Phase 1: Compute the initial BWTs ReadTable* pCurrRT = new ReadTable; bool done = false; while(!done) { done = !pReader->get(record); if(!done) { // the read is valid SeqItem item = record.toSeqItem(); if(parameters.bBuildReverse) item.seq.reverse(); pCurrRT->addRead(item); ++numReadTotal; } if(pCurrRT->getCount() >= parameters.numReadsPerBatch || (done && pCurrRT->getCount() > 0)) { // Compute the SA and BWT for this group SuffixArray* pSA = new SuffixArray(pCurrRT, 1); // Write the BWT to disk std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension); pSA->writeBWT(bwt_temp_filename, pCurrRT); std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension); pSA->writeIndex(sai_temp_filename); // Push the merge info mergeItem.end_index = numReadTotal - 1; // inclusive mergeItem.reads_filename = parameters.inFile; mergeItem.bwt_filename = bwt_temp_filename; mergeItem.sai_filename = sai_temp_filename; mergeVector.push_back(mergeItem); // Cleanup delete pSA; // Start the new group mergeItem.start_index = numReadTotal; ++groupID; pCurrRT->clear(); } } delete pCurrRT; delete pReader; return mergeVector; }
// The algorithm is as follows. We create M BWTs for subsets of // the input reads. These are created independently and written // to disk. They are then merged either sequentially or pairwise // to create the final BWT void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix, const std::string& bwt_extension, const std::string& sai_extension, bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel) { size_t MAX_READS_PER_GROUP = numReadsPerBatch; SeqReader* pReader = new SeqReader(in_filename); SeqRecord record; int groupID = 0; size_t numReadTotal = 0; MergeVector mergeVector; MergeItem mergeItem; mergeItem.start_index = 0; // Phase 1: Compute the initial BWTs ReadTable* pCurrRT = new ReadTable; bool done = false; while(!done) { done = !pReader->get(record); if(!done) { // the read is valid SeqItem item = record.toSeqItem(); if(doReverse) item.seq.reverse(); pCurrRT->addRead(item); ++numReadTotal; } if(pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0)) { // Compute the SA and BWT for this group SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads); // Write the BWT to disk std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension); pSA->writeBWT(bwt_temp_filename, pCurrRT); std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension); pSA->writeIndex(sai_temp_filename); // Push the merge info mergeItem.end_index = numReadTotal - 1; // inclusive mergeItem.reads_filename = in_filename; mergeItem.bwt_filename = bwt_temp_filename; mergeItem.sai_filename = sai_temp_filename; mergeVector.push_back(mergeItem); // Cleanup delete pSA; // Start the new group mergeItem.start_index = numReadTotal; ++groupID; pCurrRT->clear(); } } delete pCurrRT; delete pReader; // Phase 2: Pairwise merge the BWTs int round = 1; MergeVector nextMergeRound; while(mergeVector.size() > 1) { std::cout << "Starting round " << round << "\n"; pReader = new SeqReader(in_filename); for(size_t i = 0; i < mergeVector.size(); i+=2) { if(i + 1 != mergeVector.size()) { std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension); std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension); MergeItem item1 = mergeVector[i]; MergeItem item2 = mergeVector[i+1]; // Perform the actual merge int64_t curr_idx = merge(pReader, item1, item2, bwt_merged_name, sai_merged_name, doReverse, numThreads, storageLevel); // pReader now points to the end of item1's block of // reads. Skip item2's reads assert(curr_idx == item2.start_index); while(curr_idx <= item2.end_index) { bool eof = !pReader->get(record); assert(!eof); (void)eof; ++curr_idx; } // Create the merged mergeItem to use in the next round MergeItem merged; merged.start_index = item1.start_index; merged.end_index = item2.end_index; merged.bwt_filename = bwt_merged_name; merged.sai_filename = sai_merged_name; nextMergeRound.push_back(merged); // Done with the temp files, remove them unlink(item1.bwt_filename.c_str()); unlink(item2.bwt_filename.c_str()); unlink(item1.sai_filename.c_str()); unlink(item2.sai_filename.c_str()); ++groupID; } else { // Singleton, pass through to the next round nextMergeRound.push_back(mergeVector[i]); } } delete pReader; mergeVector.clear(); mergeVector.swap(nextMergeRound); ++round; } assert(mergeVector.size() == 1); // Done, rename the files to their final name std::stringstream bwt_ss; bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : ""); std::string bwt_final_filename = bwt_ss.str(); rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str()); std::stringstream sai_ss; sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : ""); std::string sai_final_filename = sai_ss.str(); rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str()); }