示例#1
0
bool DsrcCompressorST::Process(const InputParameters &args_)
{
	ASSERT(!IsError());

	IFastqStreamReader* reader = NULL;
	DsrcFileWriter* writer = NULL;

	// make reusable
	//
	DsrcDataChunk* dsrcChunk = NULL;
	FastqDataChunk* fastqChunk = NULL;
	//
	//

	FastqDatasetType datasetType;
	CompressionSettings settings = GetCompressionSettings(args_);

	try
	{
		if (args_.useFastqStdIo)
			reader = new FastqStdIoReader();
		else
			reader = new FastqFileReader(args_.inputFilename);

		// join into constructor for RAII style
		writer = new DsrcFileWriter();
		writer->StartCompress(args_.outputFilename);

		dsrcChunk = new DsrcDataChunk(DsrcDataChunk::DefaultBufferSize);
		fastqChunk = new FastqDataChunk(args_.fastqBufferSizeMB << 20);

		// analyze the header
		//
		const bool findQOffset = args_.qualityOffset == fq::FastqDatasetType::AutoQualityOffset;
		if (!findQOffset)
		{
			datasetType.qualityOffset = args_.qualityOffset;
		}

		FastqParser parser;
		if (!reader->ReadNextChunk(fastqChunk) || parser.Analyze(*fastqChunk, datasetType, findQOffset) == 0)
		{
			throw DsrcException("Error analyzing FASTQ dataset");
		}

		writer->SetDatasetType(datasetType);
		writer->SetCompressionSettings(settings);
	}
	catch (const std::runtime_error& e_)
	{
		AddError(e_.what());
	}

	if (!IsError())
	{
		BitMemoryWriter bitMemory(dsrcChunk->data);
		BlockCompressor superblock(datasetType, settings);

		do
		{
			superblock.Store(bitMemory, dsrcChunk->rawStreamsInfo, dsrcChunk->compStreamsInfo, *fastqChunk);

			bitMemory.Flush();
			dsrcChunk->size = bitMemory.Position();

			writer->WriteNextChunk(dsrcChunk);

			if (args_.calculateCrc32)
			{
				BitMemoryReader reader(dsrcChunk->data.Pointer(), dsrcChunk->data.Size());
				std::fill(fastqChunk->data.Pointer(), fastqChunk->data.Pointer() + fastqChunk->data.Size(), 0xCC);

				if (!superblock.VerifyChecksum(reader, *fastqChunk))
				{
					AddError("CRC32 checksums mismatch.");
					break;
				}
			}

			fastqChunk->Reset();
			dsrcChunk->Reset();
			bitMemory.Reset();
		}
		while (reader->ReadNextChunk(fastqChunk));

		reader->Close();
		writer->FinishCompress();


		// set log
		//
		fq::StreamsInfo rawSize = writer->GetFastqStreamInfo();
		fq::StreamsInfo compSize = writer->GetDsrcStreamInfo();

		std::ostringstream ss;
		ss << "Compressed streams sizes (in bytes)\n";
		ss << "TAG: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::MetaStream] + compSize.sizes[fq::StreamsInfo::TagStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::TagStream] << '\n';
		ss << "DNA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::DnaStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::DnaStream] << '\n';
		ss << "QUA: " << std::setw(16) << compSize.sizes[fq::StreamsInfo::QualityStream]
					  << " / " << std::setw(16) << rawSize.sizes[fq::StreamsInfo::QualityStream] << '\n';
		AddLog(ss.str());
	}

	// make reusable
	//
	TFree(fastqChunk);
	TFree(dsrcChunk);
	//
	//

	TFree(writer);
	TFree(reader);

	return !IsError();
}
示例#2
0
文件: BinModule.cpp 项目: lrog/orcom
// Converts a set of FASTQ files into a single bin file.
//
// inFastqFiles_    - input FASTQ files, read through one multi-file reader
// outBinFile_      - path of the bin file to create
// threadNum_       - values greater than 1 select the threaded pipeline (one
//                    reader thread, threadNum_ encoder threads, writing done on
//                    the calling thread); otherwise everything runs in a single
//                    loop on the calling thread
// compressedInput_ - selects MultiFastqFileReaderGz instead of
//                    MultiFastqFileReader
// verboseMode_     - when set, prints the per-signature record counts to
//                    std::cout after the file has been finished
//
// The input reader is released on every exit path; an exception raised while
// processing is rethrown to the caller after the reader has been deleted.
void BinModule::Fastq2Bin(const std::vector<std::string> &inFastqFiles_, const std::string &outBinFile_,
						  uint32 threadNum_,  bool compressedInput_, bool verboseMode_)
{
	IFastqStreamReader* fastqFile = NULL;
	if (compressedInput_)
		fastqFile = new MultiFastqFileReaderGz(inFastqFiles_);
	else
		fastqFile = new MultiFastqFileReader(inFastqFiles_);

	try
	{
		BinFileWriter binFile;
		binFile.StartCompress(outBinFile_, config);

		const uint32 minimizersCount = config.minimizer.TotalMinimizersCount();
		if (threadNum_ > 1)
		{
			// NOTE(review): this branch is still not exception-safe. The pools,
			// queues and operators are raw owning pointers, and if mt::thread
			// behaves like std::thread, unwinding past a joinable thread ends in
			// std::terminate.
			FastqChunkPool* fastqPool = NULL;
			FastqChunkQueue* fastqQueue = NULL;
			BinaryPartsPool* binPool = NULL;
			BinaryPartsQueue* binQueue = NULL;

			FastqChunkReader* fastqReader = NULL;
			BinChunkWriter* binWriter = NULL;

			// Four parts per worker thread are provisioned in each pool and queue.
			const uint32 partNum = threadNum_ * 4;
			fastqPool = new FastqChunkPool(partNum, config.fastqBlockSize);
			fastqQueue = new FastqChunkQueue(partNum, 1);

			binPool = new BinaryPartsPool(partNum, minimizersCount);
			binQueue = new BinaryPartsQueue(partNum, threadNum_);

			fastqReader = new FastqChunkReader(fastqFile, fastqQueue, fastqPool);
			binWriter = new BinChunkWriter(&binFile, binQueue, binPool);

			// launch stuff
			//
			mt::thread readerThread(mt::ref(*fastqReader));

			std::vector<IOperator*> operators;
			operators.resize(threadNum_);

#ifdef USE_BOOST_THREAD
			boost::thread_group opThreadGroup;

			for (uint32 i = 0; i < threadNum_; ++i)
			{
				operators[i] = new BinEncoder(config.minimizer, config.catParams,
											  fastqQueue, fastqPool,
											  binQueue, binPool);
				opThreadGroup.create_thread(mt::ref(*operators[i]));
			}

			// The writer runs on the calling thread while the reader and the
			// encoders run on their own threads.
			(*binWriter)();

			readerThread.join();
			opThreadGroup.join_all();


#else
			std::vector<mt::thread> opThreadGroup;

			for (uint32 i = 0; i < threadNum_; ++i)
			{
				operators[i] = new BinEncoder(config.minimizer, config.catParams,
											  fastqQueue, fastqPool, binQueue, binPool);
				opThreadGroup.push_back(mt::thread(mt::ref(*operators[i])));
			}

			// The writer runs on the calling thread while the reader and the
			// encoders run on their own threads.
			(*binWriter)();

			readerThread.join();

			for (mt::thread& t : opThreadGroup)
			{
				t.join();
			}

#endif

			for (uint32 i = 0; i < threadNum_; ++i)
			{
				delete operators[i];
			}

			TFREE(binWriter);
			TFREE(fastqReader);

			TFREE(binQueue);
			TFREE(binPool);
			TFREE(fastqQueue);
			TFREE(fastqPool);
		}
		else
		{
			DnaParser parser;
			DnaCategorizer categorizer(config.minimizer, config.catParams);
			DnaPacker packer(config.minimizer);

			DataChunk fastqChunk(config.fastqBlockSize);
			std::vector<DnaRecord> records;
			records.resize(1 << 10);

			DnaBinBlock dnaBins(minimizersCount);
			BinaryBinBlock binBins;
			DataChunk dnaBuffer;

			// Per input chunk: parse, categorize into bins, pack, write.
			while (fastqFile->ReadNextChunk(&fastqChunk))
			{
				uint64 recordsCount = 0;
				parser.ParseFrom(fastqChunk, dnaBuffer, records, recordsCount);

				ASSERT(recordsCount > 0);
				categorizer.Categorize(records, recordsCount, dnaBins);

				packer.PackToBins(dnaBins, binBins);

				binFile.WriteNextBlock(&binBins);
			}
		}

		binFile.FinishCompress();

		if (verboseMode_)
		{
			std::vector<uint64> recordCounts;
			binFile.GetBinStats(recordCounts);

			std::cout << "Signatures count: " << recordCounts.size() << std::endl;
			std::cout << "Records distribution in bins by signature:\n";
			// Only signatures that received at least one record are listed.
			for (uint32 i = 0; i < recordCounts.size(); ++i)
			{
				if (recordCounts[i] > 0)
					std::cout << i << " : " << recordCounts[i] << '\n';
			}
			std::cout << std::endl;
		}
	}
	catch (...)
	{
		// Release the reader before handing the failure on to the caller.
		delete fastqFile;
		throw;
	}

	delete fastqFile;
}