bool DsrcCompressorST::Process(const InputParameters &args_) { ASSERT(!IsError()); IFastqStreamReader* reader = NULL; DsrcFileWriter* writer = NULL; // make reusable // DsrcDataChunk* dsrcChunk = NULL; FastqDataChunk* fastqChunk = NULL; // // FastqDatasetType datasetType; CompressionSettings settings = GetCompressionSettings(args_); try { if (args_.useFastqStdIo) reader = new FastqStdIoReader(); else reader = new FastqFileReader(args_.inputFilename); // join into constructor for RAII style writer = new DsrcFileWriter(); writer->StartCompress(args_.outputFilename); dsrcChunk = new DsrcDataChunk(DsrcDataChunk::DefaultBufferSize); fastqChunk = new FastqDataChunk(args_.fastqBufferSizeMB << 20); // analyze the header // const bool findQOffset = args_.qualityOffset == fq::FastqDatasetType::AutoQualityOffset; if (!findQOffset) { datasetType.qualityOffset = args_.qualityOffset; } FastqParser parser; if (!reader->ReadNextChunk(fastqChunk) || parser.Analyze(*fastqChunk, datasetType, findQOffset) == 0) { throw DsrcException("Error analyzing FASTQ dataset"); } writer->SetDatasetType(datasetType); writer->SetCompressionSettings(settings); } catch (const std::runtime_error& e_) { AddError(e_.what()); } if (!IsError()) { BitMemoryWriter bitMemory(dsrcChunk->data); BlockCompressor superblock(datasetType, settings); do { superblock.Store(bitMemory, dsrcChunk->rawStreamsInfo, dsrcChunk->compStreamsInfo, *fastqChunk); bitMemory.Flush(); dsrcChunk->size = bitMemory.Position(); writer->WriteNextChunk(dsrcChunk); if (args_.calculateCrc32) { BitMemoryReader reader(dsrcChunk->data.Pointer(), dsrcChunk->data.Size()); std::fill(fastqChunk->data.Pointer(), fastqChunk->data.Pointer() + fastqChunk->data.Size(), 0xCC); if (!superblock.VerifyChecksum(reader, *fastqChunk)) { AddError("CRC32 checksums mismatch."); break; } } fastqChunk->Reset(); dsrcChunk->Reset(); bitMemory.Reset(); } while (reader->ReadNextChunk(fastqChunk)); reader->Close(); writer->FinishCompress(); // set log // fq::StreamsInfo rawSize = writer->GetFastqStreamInfo(); fq::StreamsInfo compSize = writer->GetDsrcStreamInfo(); std::ostringstream ss; ss << "Compressed streams sizes (in bytes)\n"; ss << "TAG: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::MetaStream] + compSize.sizes[fq::StreamsInfo::TagStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::TagStream] << '\n'; ss << "DNA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::DnaStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::DnaStream] << '\n'; ss << "QUA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::QualityStream] << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::QualityStream] << '\n'; AddLog(ss.str()); } // make reusable // TFree(fastqChunk); TFree(dsrcChunk); // // TFree(writer); TFree(reader); return !IsError(); }
void BinModule::Fastq2Bin(const std::vector<std::string> &inFastqFiles_, const std::string &outBinFile_, uint32 threadNum_, bool compressedInput_, bool verboseMode_) { // TODO: try/catch to free resources // IFastqStreamReader* fastqFile = NULL; if (compressedInput_) fastqFile = new MultiFastqFileReaderGz(inFastqFiles_); else fastqFile = new MultiFastqFileReader(inFastqFiles_); BinFileWriter binFile; binFile.StartCompress(outBinFile_, config); const uint32 minimizersCount = config.minimizer.TotalMinimizersCount(); if (threadNum_ > 1) { FastqChunkPool* fastqPool = NULL; FastqChunkQueue* fastqQueue = NULL; BinaryPartsPool* binPool = NULL; BinaryPartsQueue* binQueue = NULL; FastqChunkReader* fastqReader = NULL; BinChunkWriter* binWriter = NULL; const uint32 partNum = threadNum_ * 4; fastqPool = new FastqChunkPool(partNum, config.fastqBlockSize); fastqQueue = new FastqChunkQueue(partNum, 1); binPool = new BinaryPartsPool(partNum, minimizersCount); binQueue = new BinaryPartsQueue(partNum, threadNum_); fastqReader = new FastqChunkReader(fastqFile, fastqQueue, fastqPool); binWriter = new BinChunkWriter(&binFile, binQueue, binPool); // launch stuff // mt::thread readerThread(mt::ref(*fastqReader)); std::vector<IOperator*> operators; operators.resize(threadNum_); #ifdef USE_BOOST_THREAD boost::thread_group opThreadGroup; for (uint32 i = 0; i < threadNum_; ++i) { operators[i] = new BinEncoder(config.minimizer, config.catParams, fastqQueue, fastqPool, binQueue, binPool); opThreadGroup.create_thread(mt::ref(*operators[i])); } (*binWriter)(); readerThread.join(); opThreadGroup.join_all(); #else std::vector<mt::thread> opThreadGroup; for (uint32 i = 0; i < threadNum_; ++i) { operators[i] = new BinEncoder(config.minimizer, config.catParams, fastqQueue, fastqPool, binQueue, binPool); opThreadGroup.push_back(mt::thread(mt::ref(*operators[i]))); } (*binWriter)(); readerThread.join(); for (mt::thread& t : opThreadGroup) { t.join(); } #endif for (uint32 i = 0; i < threadNum_; ++i) { delete operators[i]; } TFREE(binWriter); TFREE(fastqReader); TFREE(binQueue); TFREE(binPool); TFREE(fastqQueue); TFREE(fastqPool); } else { DnaParser parser; DnaCategorizer categorizer(config.minimizer, config.catParams); DnaPacker packer(config.minimizer); DataChunk fastqChunk(config.fastqBlockSize); std::vector<DnaRecord> records; records.resize(1 << 10); DnaBinBlock dnaBins(minimizersCount); BinaryBinBlock binBins; DataChunk dnaBuffer; while (fastqFile->ReadNextChunk(&fastqChunk)) { uint64 recordsCount = 0; parser.ParseFrom(fastqChunk, dnaBuffer, records, recordsCount); ASSERT(recordsCount > 0); categorizer.Categorize(records, recordsCount, dnaBins); packer.PackToBins(dnaBins, binBins); binFile.WriteNextBlock(&binBins); } } binFile.FinishCompress(); if (verboseMode_) { std::vector<uint64> recordCounts; binFile.GetBinStats(recordCounts); std::cout << "Signatures count: " << recordCounts.size() << std::endl; std::cout << "Records distribution in bins by signature:\n"; for (uint32 i = 0; i < recordCounts.size(); ++i) { if (recordCounts[i] > 0) std::cout << i << " : " << recordCounts[i] << '\n'; } std::cout << std::endl; } delete fastqFile; }