bool DsrcCompressorMT::Process(const InputParameters &args_) { IFastqStreamReader* fileReader = NULL; DsrcFileWriter* fileWriter = NULL; // make reusable // FastqDataPool* fastqPool = NULL; FastqDataQueue* fastqQueue = NULL; DsrcDataPool* dsrcPool = NULL; DsrcDataQueue* dsrcQueue = NULL; ErrorHandler* errorHandler = NULL; // // FastqReader* dataReader = NULL; DsrcWriter* dataWriter = NULL; FastqDatasetType datasetType; CompressionSettings compSettings = GetCompressionSettings(args_); try { if (args_.useFastqStdIo) fileReader = new FastqStdIoReader(); else fileReader = new FastqFileReader(args_.inputFilename); fileWriter = new DsrcFileWriter(); fileWriter->StartCompress(args_.outputFilename); const uint32 partNum = (args_.fastqBufferSizeMB < 128) ? args_.threadNum * 4 : args_.threadNum * 2; fastqPool = new FastqDataPool(partNum, args_.fastqBufferSizeMB << 20); // maxPart, bufferPartSize fastqQueue = new FastqDataQueue(partNum, 1); // maxPart, threadCount dsrcPool = new DsrcDataPool(partNum, args_.fastqBufferSizeMB << 20); dsrcQueue = new DsrcDataQueue(partNum, args_.threadNum); if (args_.calculateCrc32) errorHandler = new MultithreadedErrorHandler(); else errorHandler = new ErrorHandler(); dataReader = new FastqReader(*fileReader, *fastqQueue, *fastqPool, *errorHandler); dataWriter = new DsrcWriter(*fileWriter, *dsrcQueue, *dsrcPool, *errorHandler); // analyze file // const bool findQOffset = args_.qualityOffset == fq::FastqDatasetType::AutoQualityOffset; if (!findQOffset) datasetType.qualityOffset = args_.qualityOffset; if (!dataReader->AnalyzeFirstChunk(datasetType, findQOffset)) { throw DsrcException("Error analyzing FASTQ dataset"); } fileWriter->SetDatasetType(datasetType); fileWriter->SetCompressionSettings(compSettings); } catch (const std::runtime_error& e_) { AddError(e_.what()); } if (!IsError()) { const uint32 threadsNum = args_.threadNum; // launch threads // th::thread readerThread(th::ref(*dataReader)); std::vector<DsrcCompressor*> operators; operators.resize(threadsNum); #ifdef USE_BOOST_THREAD boost::thread_group opThreadGroup; // why C++11 does not have thread_group? :// for (uint32 i = 0; i < threadsNum; ++i) { operators[i] = new DsrcCompressor(*fastqQueue, *fastqPool, *dsrcQueue, *dsrcPool, *errorHandler, datasetType, compSettings); opThreadGroup.create_thread(th::ref(*operators[i])); } (*dataWriter)(); // main thread works as writer readerThread.join(); opThreadGroup.join_all(); #else std::vector<th::thread> opThreadGroup; for (uint32 i = 0; i < threadsNum; ++i) { operators[i] = new DsrcCompressor(*fastqQueue, *fastqPool, *dsrcQueue, *dsrcPool, *errorHandler, datasetType, compSettings); opThreadGroup.push_back(th::thread(th::ref(*operators[i]))); } (*dataWriter)(); readerThread.join(); // find difference: 'for (std::vector<th::thread>::iterator i = opThreadGroup.begin(); i != opThreadGroup.end(); ++i)' // --> 'for (auto& t : opThreadGroup)' for (th::thread& t : opThreadGroup) { t.join(); } #endif // check for errors // if (errorHandler->IsError()) AddError(errorHandler->GetError()); // free resources, cleanup // fastqQueue->Reset(); dsrcQueue->Reset(); for (uint32 i = 0; i < threadsNum; ++i) { delete operators[i]; } fileReader->Close(); fileWriter->FinishCompress(); // set log // fq::StreamsInfo rawSize = fileWriter->GetFastqStreamInfo(); fq::StreamsInfo compSize = fileWriter->GetDsrcStreamInfo(); std::ostringstream ss; ss << "Compressed streams sizes (in bytes)\n"; ss << "TAG: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::MetaStream] + compSize.sizes[fq::StreamsInfo::TagStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::TagStream] << '\n'; ss << "DNA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::DnaStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::DnaStream] << '\n'; ss << "QUA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::QualityStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::QualityStream] << '\n'; AddLog(ss.str()); } TFree(dataWriter); TFree(dataReader); TFree(errorHandler); // make reusable // TFree(dsrcQueue); TFree(dsrcPool); TFree(fastqQueue); TFree(fastqPool); // // TFree(fileWriter); TFree(fileReader); return !IsError(); }
bool DsrcCompressorST::Process(const InputParameters &args_) { ASSERT(!IsError()); IFastqStreamReader* reader = NULL; DsrcFileWriter* writer = NULL; // make reusable // DsrcDataChunk* dsrcChunk = NULL; FastqDataChunk* fastqChunk = NULL; // // FastqDatasetType datasetType; CompressionSettings settings = GetCompressionSettings(args_); try { if (args_.useFastqStdIo) reader = new FastqStdIoReader(); else reader = new FastqFileReader(args_.inputFilename); // join into constructor for RAII style writer = new DsrcFileWriter(); writer->StartCompress(args_.outputFilename); dsrcChunk = new DsrcDataChunk(DsrcDataChunk::DefaultBufferSize); fastqChunk = new FastqDataChunk(args_.fastqBufferSizeMB << 20); // analyze the header // const bool findQOffset = args_.qualityOffset == fq::FastqDatasetType::AutoQualityOffset; if (!findQOffset) { datasetType.qualityOffset = args_.qualityOffset; } FastqParser parser; if (!reader->ReadNextChunk(fastqChunk) || parser.Analyze(*fastqChunk, datasetType, findQOffset) == 0) { throw DsrcException("Error analyzing FASTQ dataset"); } writer->SetDatasetType(datasetType); writer->SetCompressionSettings(settings); } catch (const std::runtime_error& e_) { AddError(e_.what()); } if (!IsError()) { BitMemoryWriter bitMemory(dsrcChunk->data); BlockCompressor superblock(datasetType, settings); do { superblock.Store(bitMemory, dsrcChunk->rawStreamsInfo, dsrcChunk->compStreamsInfo, *fastqChunk); bitMemory.Flush(); dsrcChunk->size = bitMemory.Position(); writer->WriteNextChunk(dsrcChunk); if (args_.calculateCrc32) { BitMemoryReader reader(dsrcChunk->data.Pointer(), dsrcChunk->data.Size()); std::fill(fastqChunk->data.Pointer(), fastqChunk->data.Pointer() + fastqChunk->data.Size(), 0xCC); if (!superblock.VerifyChecksum(reader, *fastqChunk)) { AddError("CRC32 checksums mismatch."); break; } } fastqChunk->Reset(); dsrcChunk->Reset(); bitMemory.Reset(); } while (reader->ReadNextChunk(fastqChunk)); reader->Close(); writer->FinishCompress(); // set log // fq::StreamsInfo rawSize = writer->GetFastqStreamInfo(); fq::StreamsInfo compSize = writer->GetDsrcStreamInfo(); std::ostringstream ss; ss << "Compressed streams sizes (in bytes)\n"; ss << "TAG: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::MetaStream] + compSize.sizes[fq::StreamsInfo::TagStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::TagStream] << '\n'; ss << "DNA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::DnaStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::DnaStream] << '\n'; ss << "QUA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::QualityStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::QualityStream] << '\n'; AddLog(ss.str()); } // make reusable // TFree(fastqChunk); TFree(dsrcChunk); // // TFree(writer); TFree(reader); return !IsError(); }
void BinModule::Fastq2Bin(const std::vector<std::string> &inFastqFiles_, const std::string &outBinFile_, uint32 threadNum_, bool compressedInput_, bool verboseMode_) { // TODO: try/catch to free resources // IFastqStreamReader* fastqFile = NULL; if (compressedInput_) fastqFile = new MultiFastqFileReaderGz(inFastqFiles_); else fastqFile = new MultiFastqFileReader(inFastqFiles_); BinFileWriter binFile; binFile.StartCompress(outBinFile_, config); const uint32 minimizersCount = config.minimizer.TotalMinimizersCount(); if (threadNum_ > 1) { FastqChunkPool* fastqPool = NULL; FastqChunkQueue* fastqQueue = NULL; BinaryPartsPool* binPool = NULL; BinaryPartsQueue* binQueue = NULL; FastqChunkReader* fastqReader = NULL; BinChunkWriter* binWriter = NULL; const uint32 partNum = threadNum_ * 4; fastqPool = new FastqChunkPool(partNum, config.fastqBlockSize); fastqQueue = new FastqChunkQueue(partNum, 1); binPool = new BinaryPartsPool(partNum, minimizersCount); binQueue = new BinaryPartsQueue(partNum, threadNum_); fastqReader = new FastqChunkReader(fastqFile, fastqQueue, fastqPool); binWriter = new BinChunkWriter(&binFile, binQueue, binPool); // launch stuff // mt::thread readerThread(mt::ref(*fastqReader)); std::vector<IOperator*> operators; operators.resize(threadNum_); #ifdef USE_BOOST_THREAD boost::thread_group opThreadGroup; for (uint32 i = 0; i < threadNum_; ++i) { operators[i] = new BinEncoder(config.minimizer, config.catParams, fastqQueue, fastqPool, binQueue, binPool); opThreadGroup.create_thread(mt::ref(*operators[i])); } (*binWriter)(); readerThread.join(); opThreadGroup.join_all(); #else std::vector<mt::thread> opThreadGroup; for (uint32 i = 0; i < threadNum_; ++i) { operators[i] = new BinEncoder(config.minimizer, config.catParams, fastqQueue, fastqPool, binQueue, binPool); opThreadGroup.push_back(mt::thread(mt::ref(*operators[i]))); } (*binWriter)(); readerThread.join(); for (mt::thread& t : opThreadGroup) { t.join(); } #endif for (uint32 i = 0; i < threadNum_; ++i) { delete operators[i]; } TFREE(binWriter); TFREE(fastqReader); TFREE(binQueue); TFREE(binPool); TFREE(fastqQueue); TFREE(fastqPool); } else { DnaParser parser; DnaCategorizer categorizer(config.minimizer, config.catParams); DnaPacker packer(config.minimizer); DataChunk fastqChunk(config.fastqBlockSize); std::vector<DnaRecord> records; records.resize(1 << 10); DnaBinBlock dnaBins(minimizersCount); BinaryBinBlock binBins; DataChunk dnaBuffer; while (fastqFile->ReadNextChunk(&fastqChunk)) { uint64 recordsCount = 0; parser.ParseFrom(fastqChunk, dnaBuffer, records, recordsCount); ASSERT(recordsCount > 0); categorizer.Categorize(records, recordsCount, dnaBins); packer.PackToBins(dnaBins, binBins); binFile.WriteNextBlock(&binBins); } } binFile.FinishCompress(); if (verboseMode_) { std::vector<uint64> recordCounts; binFile.GetBinStats(recordCounts); std::cout << "Signatures count: " << recordCounts.size() << std::endl; std::cout << "Records distribution in bins by signature:\n"; for (uint32 i = 0; i < recordCounts.size(); ++i) { if (recordCounts[i] > 0) std::cout << i << " : " << recordCounts[i] << '\n'; } std::cout << std::endl; } delete fastqFile; }