Пример #1
0
bool DsrcCompressorMT::Process(const InputParameters &args_)
{
	IFastqStreamReader* fileReader = NULL;
	DsrcFileWriter* fileWriter = NULL;

	// make reusable
	//
	FastqDataPool* fastqPool = NULL;
	FastqDataQueue* fastqQueue = NULL;
	DsrcDataPool* dsrcPool = NULL;
	DsrcDataQueue* dsrcQueue = NULL;
	ErrorHandler* errorHandler = NULL;
	//
	//

	FastqReader* dataReader = NULL;
	DsrcWriter* dataWriter = NULL;

	FastqDatasetType datasetType;
	CompressionSettings compSettings = GetCompressionSettings(args_);

	try
	{
		if (args_.useFastqStdIo)
			fileReader = new FastqStdIoReader();
		else
			fileReader = new FastqFileReader(args_.inputFilename);

		fileWriter = new DsrcFileWriter();
		fileWriter->StartCompress(args_.outputFilename);

		const uint32 partNum = (args_.fastqBufferSizeMB < 128) ? args_.threadNum * 4 : args_.threadNum * 2;
		fastqPool = new FastqDataPool(partNum, args_.fastqBufferSizeMB << 20);		// maxPart, bufferPartSize
		fastqQueue = new FastqDataQueue(partNum, 1);								// maxPart, threadCount

		dsrcPool = new DsrcDataPool(partNum, args_.fastqBufferSizeMB << 20);
		dsrcQueue = new DsrcDataQueue(partNum, args_.threadNum);

		if (args_.calculateCrc32)
			errorHandler = new MultithreadedErrorHandler();
		else
			errorHandler = new ErrorHandler();

		dataReader = new FastqReader(*fileReader, *fastqQueue, *fastqPool, *errorHandler);
		dataWriter = new DsrcWriter(*fileWriter, *dsrcQueue, *dsrcPool, *errorHandler);

		// analyze file
		//
		const bool findQOffset = args_.qualityOffset == fq::FastqDatasetType::AutoQualityOffset;
		if (!findQOffset)
			datasetType.qualityOffset = args_.qualityOffset;

		if (!dataReader->AnalyzeFirstChunk(datasetType, findQOffset))
		{
			throw DsrcException("Error analyzing FASTQ dataset");
		}

		fileWriter->SetDatasetType(datasetType);
		fileWriter->SetCompressionSettings(compSettings);
	}
	catch (const std::runtime_error& e_)
	{
		AddError(e_.what());
	}

	if (!IsError())
	{
		const uint32 threadsNum = args_.threadNum;

		// launch threads
		//
		th::thread readerThread(th::ref(*dataReader));

		std::vector<DsrcCompressor*> operators;
		operators.resize(threadsNum);

#ifdef USE_BOOST_THREAD
		boost::thread_group opThreadGroup;		// why C++11 does not have thread_group? ://

		for (uint32 i = 0; i < threadsNum; ++i)
		{
			operators[i] = new DsrcCompressor(*fastqQueue, *fastqPool, *dsrcQueue, *dsrcPool, *errorHandler, datasetType, compSettings);
			opThreadGroup.create_thread(th::ref(*operators[i]));
		}

		(*dataWriter)();			// main thread works as writer

		readerThread.join();
		opThreadGroup.join_all();

#else
		std::vector<th::thread> opThreadGroup;

		for (uint32 i = 0; i < threadsNum; ++i)
		{
			operators[i] = new DsrcCompressor(*fastqQueue, *fastqPool, *dsrcQueue, *dsrcPool, *errorHandler, datasetType, compSettings);
			opThreadGroup.push_back(th::thread(th::ref(*operators[i])));
		}

		(*dataWriter)();

		readerThread.join();

		// find difference: 'for (std::vector<th::thread>::iterator i = opThreadGroup.begin(); i != opThreadGroup.end(); ++i)'
		// --> 'for (auto& t : opThreadGroup)'
		for (th::thread& t : opThreadGroup)
		{
			t.join();
		}

#endif

		// check for errors
		//
		if (errorHandler->IsError())
			AddError(errorHandler->GetError());


		// free resources, cleanup
		//
		fastqQueue->Reset();
		dsrcQueue->Reset();

		for (uint32 i = 0; i < threadsNum; ++i)
		{
			delete operators[i];
		}

		fileReader->Close();
		fileWriter->FinishCompress();


		// set log
		//
		fq::StreamsInfo rawSize = fileWriter->GetFastqStreamInfo();
		fq::StreamsInfo compSize = fileWriter->GetDsrcStreamInfo();

		std::ostringstream ss;
		ss << "Compressed streams sizes (in bytes)\n";
		ss << "TAG: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::MetaStream] + compSize.sizes[fq::StreamsInfo::TagStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::TagStream] << '\n';
		ss << "DNA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::DnaStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::DnaStream] << '\n';
		ss << "QUA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::QualityStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::QualityStream] << '\n';
		AddLog(ss.str());
	}

	TFree(dataWriter);
	TFree(dataReader);
	TFree(errorHandler);

	// make reusable
	//
	TFree(dsrcQueue);
	TFree(dsrcPool);
	TFree(fastqQueue);
	TFree(fastqPool);
	//
	//

	TFree(fileWriter);
	TFree(fileReader);

	return !IsError();
}
Пример #2
0
bool DsrcCompressorST::Process(const InputParameters &args_)
{
	ASSERT(!IsError());

	IFastqStreamReader* reader = NULL;
	DsrcFileWriter* writer = NULL;

	// make reusable
	//
	DsrcDataChunk* dsrcChunk = NULL;
	FastqDataChunk* fastqChunk = NULL;
	//
	//

	FastqDatasetType datasetType;
	CompressionSettings settings = GetCompressionSettings(args_);

	try
	{
		if (args_.useFastqStdIo)
			reader = new FastqStdIoReader();
		else
			reader = new FastqFileReader(args_.inputFilename);

		// join into constructor for RAII style
		writer = new DsrcFileWriter();
		writer->StartCompress(args_.outputFilename);

		dsrcChunk = new DsrcDataChunk(DsrcDataChunk::DefaultBufferSize);
		fastqChunk = new FastqDataChunk(args_.fastqBufferSizeMB << 20);

		// analyze the header
		//
		const bool findQOffset = args_.qualityOffset == fq::FastqDatasetType::AutoQualityOffset;
		if (!findQOffset)
		{
			datasetType.qualityOffset = args_.qualityOffset;
		}

		FastqParser parser;
		if (!reader->ReadNextChunk(fastqChunk) || parser.Analyze(*fastqChunk, datasetType, findQOffset) == 0)
		{
			throw DsrcException("Error analyzing FASTQ dataset");
		}

		writer->SetDatasetType(datasetType);
		writer->SetCompressionSettings(settings);
	}
	catch (const std::runtime_error& e_)
	{
		AddError(e_.what());
	}

	if (!IsError())
	{
		BitMemoryWriter bitMemory(dsrcChunk->data);
		BlockCompressor superblock(datasetType, settings);

		do
		{
			superblock.Store(bitMemory, dsrcChunk->rawStreamsInfo, dsrcChunk->compStreamsInfo, *fastqChunk);

			bitMemory.Flush();
			dsrcChunk->size = bitMemory.Position();

			writer->WriteNextChunk(dsrcChunk);

			if (args_.calculateCrc32)
			{
				BitMemoryReader reader(dsrcChunk->data.Pointer(), dsrcChunk->data.Size());
				std::fill(fastqChunk->data.Pointer(), fastqChunk->data.Pointer() + fastqChunk->data.Size(), 0xCC);

				if (!superblock.VerifyChecksum(reader, *fastqChunk))
				{
					AddError("CRC32 checksums mismatch.");
					break;
				}
			}

			fastqChunk->Reset();
			dsrcChunk->Reset();
			bitMemory.Reset();
		}
		while (reader->ReadNextChunk(fastqChunk));

		reader->Close();
		writer->FinishCompress();


		// set log
		//
		fq::StreamsInfo rawSize = writer->GetFastqStreamInfo();
		fq::StreamsInfo compSize = writer->GetDsrcStreamInfo();

		std::ostringstream ss;
		ss << "Compressed streams sizes (in bytes)\n";
		ss << "TAG: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::MetaStream] + compSize.sizes[fq::StreamsInfo::TagStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::TagStream] << '\n';
		ss << "DNA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::DnaStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::DnaStream] << '\n';
		ss << "QUA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::QualityStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::QualityStream] << '\n';
		AddLog(ss.str());
	}

	// make reusable
	//
	TFree(fastqChunk);
	TFree(dsrcChunk);
	//
	//

	TFree(writer);
	TFree(reader);

	return !IsError();
}
Пример #3
0
// Reads FASTQ data from the given input files and writes it to a bin file.
//
// inFastqFiles_    - input paths, handed to the multi-file FASTQ reader
// outBinFile_      - output path, handed to BinFileWriter::StartCompress()
// threadNum_       - when greater than 1, a reader thread plus threadNum_ BinEncoder
//                    threads are used and the calling thread acts as the writer;
//                    otherwise everything runs inline on the calling thread
// compressedInput_ - selects MultiFastqFileReaderGz instead of MultiFastqFileReader
// verboseMode_     - when set, per-signature record counts are printed to std::cout
//
// Any exception raised while processing is propagated unchanged. The FASTQ reader
// is released before the exception leaves this function, except while background
// threads may still be using it (see readerSharedWithThreads below).
void BinModule::Fastq2Bin(const std::vector<std::string> &inFastqFiles_, const std::string &outBinFile_,
						  uint32 threadNum_,  bool compressedInput_, bool verboseMode_)
{
	IFastqStreamReader* fastqFile = NULL;
	if (compressedInput_)
		fastqFile = new MultiFastqFileReaderGz(inFastqFiles_);
	else
		fastqFile = new MultiFastqFileReader(inFastqFiles_);

	// True from just before the reader thread is launched until every thread has been
	// joined. While it is set, *fastqFile must not be deleted on the error path:
	// depending on the thread library, unwinding may leave threads running that still
	// dereference it, so leaking (as before this fix) is the safe choice there.
	bool readerSharedWithThreads = false;

	try
	{
		BinFileWriter binFile;
		binFile.StartCompress(outBinFile_, config);

		const uint32 minimizersCount = config.minimizer.TotalMinimizersCount();
		if (threadNum_ > 1)
		{
			FastqChunkPool* fastqPool = NULL;
			FastqChunkQueue* fastqQueue = NULL;
			BinaryPartsPool* binPool = NULL;
			BinaryPartsQueue* binQueue = NULL;

			FastqChunkReader* fastqReader = NULL;
			BinChunkWriter* binWriter = NULL;

			// NOTE(review): these pipeline objects are still released only on the normal
			// path; an exception thrown while allocating them leaks the earlier ones.
			const uint32 partNum = threadNum_ * 4;
			fastqPool = new FastqChunkPool(partNum, config.fastqBlockSize);
			fastqQueue = new FastqChunkQueue(partNum, 1);

			binPool = new BinaryPartsPool(partNum, minimizersCount);
			binQueue = new BinaryPartsQueue(partNum, threadNum_);

			fastqReader = new FastqChunkReader(fastqFile, fastqQueue, fastqPool);
			binWriter = new BinChunkWriter(&binFile, binQueue, binPool);

			// launch stuff
			//
			readerSharedWithThreads = true;
			mt::thread readerThread(mt::ref(*fastqReader));

			std::vector<IOperator*> operators;
			operators.resize(threadNum_);

#ifdef USE_BOOST_THREAD
			boost::thread_group opThreadGroup;

			for (uint32 i = 0; i < threadNum_; ++i)
			{
				operators[i] = new BinEncoder(config.minimizer, config.catParams,
											  fastqQueue, fastqPool,
											  binQueue, binPool);
				opThreadGroup.create_thread(mt::ref(*operators[i]));
			}

			// main thread works as writer
			(*binWriter)();

			readerThread.join();
			opThreadGroup.join_all();

#else
			std::vector<mt::thread> opThreadGroup;

			for (uint32 i = 0; i < threadNum_; ++i)
			{
				operators[i] = new BinEncoder(config.minimizer, config.catParams,
											  fastqQueue, fastqPool, binQueue, binPool);
				opThreadGroup.push_back(mt::thread(mt::ref(*operators[i])));
			}

			// main thread works as writer
			(*binWriter)();

			readerThread.join();

			for (mt::thread& t : opThreadGroup)
			{
				t.join();
			}

#endif
			// every thread has been joined: the reader is exclusively ours again
			readerSharedWithThreads = false;

			for (uint32 i = 0; i < threadNum_; ++i)
			{
				delete operators[i];
			}

			TFREE(binWriter);
			TFREE(fastqReader);

			TFREE(binQueue);
			TFREE(binPool);
			TFREE(fastqQueue);
			TFREE(fastqPool);
		}
		else
		{
			DnaParser parser;
			DnaCategorizer categorizer(config.minimizer, config.catParams);
			DnaPacker packer(config.minimizer);

			DataChunk fastqChunk(config.fastqBlockSize);
			std::vector<DnaRecord> records;
			records.resize(1 << 10);

			DnaBinBlock dnaBins(minimizersCount);
			BinaryBinBlock binBins;
			DataChunk dnaBuffer;

			// parse -> categorize -> pack -> write, one chunk at a time
			while (fastqFile->ReadNextChunk(&fastqChunk))
			{
				uint64 recordsCount = 0;
				parser.ParseFrom(fastqChunk, dnaBuffer, records, recordsCount);

				ASSERT(recordsCount > 0);
				categorizer.Categorize(records, recordsCount, dnaBins);

				packer.PackToBins(dnaBins, binBins);

				binFile.WriteNextBlock(&binBins);
			}
		}

		binFile.FinishCompress();

		if (verboseMode_)
		{
			std::vector<uint64> recordCounts;
			binFile.GetBinStats(recordCounts);

			std::cout << "Signatures count: " << recordCounts.size() << std::endl;
			std::cout << "Records distribution in bins by signature:\n";
			for (uint32 i = 0; i < recordCounts.size(); ++i)
			{
				// only non-empty bins are listed
				if (recordCounts[i] > 0)
					std::cout << i << " : " << recordCounts[i] << '\n';
			}
			std::cout << std::endl;
		}

		// Normal path: release the reader while binFile is still alive, as before.
		// Clearing the pointer makes the handler below a no-op should anything
		// throw after this point.
		delete fastqFile;
		fastqFile = NULL;
	}
	catch (...)
	{
		// Error path: do not leak the reader (and whatever inputs it holds open),
		// unless background threads might still be reading through it.
		if (!readerSharedWithThreads)
			delete fastqFile;
		throw;
	}
}