Esempio n. 1
0
void normalisefastaBgzf(libmaus::util::ArgInfo const & arginfo, std::ostream & out)
{
	libmaus::fastx::StreamFastAReaderWrapper in(std::cin);
	libmaus::fastx::StreamFastAReaderWrapper::pattern_type pattern;
	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue("level",getDefaultLevel()));
	std::string const indexfn = arginfo.getUnparsedValue("index","");

	::libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(level);

	libmaus::lz::BgzfDeflate<std::ostream> defl(out,level,false /* full flush */);
	uint64_t const inbufsize = defl.getInputBufferSize();
	uint64_t zoffset = 0;
	uint64_t ioffset = 0;
	std::vector<libmaus::fastx::BgzfFastAIndexEntry> index;
	std::ostringstream indexstr;
	
	ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,inbufsize);
	uint64_t patid = 0;
	
	while ( in.getNextPatternUnlocked(pattern) )
	{
		std::string const name = pattern.getStringId();
		std::string const shortname = stripName(name);
		std::string const & spat = pattern.spattern;
		char const * cpat = spat.c_str();
		uint64_t const patlen = spat.size();
		uint64_t const numblocks = (patlen + inbufsize - 1)/inbufsize;

		index.push_back(libmaus::fastx::BgzfFastAIndexEntry(shortname,patid++,ioffset));
		
		ioffset += libmaus::util::StringSerialisation::serialiseString(indexstr,name);
		ioffset += libmaus::util::StringSerialisation::serialiseString(indexstr,shortname);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,patlen);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,numblocks);
		
		std::ostringstream nameostr;
		nameostr << '>' << name << '\n';
		std::string const nameser = nameostr.str();
				
		std::pair<uint64_t,uint64_t> const P0 = defl.writeSyncedCount(nameser.c_str(),nameser.size());
		zoffset += P0.second;
		
		uint64_t o = 0;
		while ( o != patlen )
		{
			assert ( o % inbufsize == 0 );
			uint64_t const towrite = std::min(patlen-o,inbufsize);
			std::pair<uint64_t,uint64_t> const P1 = defl.writeSyncedCount(cpat,towrite);
			
			ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);

			zoffset += P1.second;
			o += towrite;
			cpat += towrite;
		}		

		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);

		std::pair<uint64_t,uint64_t> const Pn = defl.writeSyncedCount("\n",1);
		zoffset += Pn.second;
	}

	defl.flush();
	out << std::flush;
	
	uint64_t const imetaoffset = ioffset;

	ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,index.size());
	for ( uint64_t i = 0; i < index.size(); ++i )
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,index[i].ioffset);
	
	libmaus::util::NumberSerialisation::serialiseNumber(indexstr,imetaoffset);

	if ( indexfn.size() )
	{
		std::string const & sindex = indexstr.str();
		libmaus::aio::CheckedOutputStream indexCOS(indexfn);
		indexCOS.write(sindex.c_str(),sindex.size());
		indexCOS.flush();
		indexCOS.close();	
	}
}
Esempio n. 2
0
int bamheap2(libmaus::util::ArgInfo const & arginfo)
{
	bool const verbose = arginfo.getValue("verbose",getDefaultVerbose());
	std::string const reference = arginfo.getUnparsedValue("reference",std::string());
	std::string const outputprefix = arginfo.getUnparsedValue("outputprefix",std::string());
	
	libmaus::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));
	::libmaus::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus::bambam::BamHeader const & header = dec.getHeader();	
	::libmaus::bambam::BamAlignment const & algn = dec.getAlignment();
	
	double const damult = arginfo.getValue<double>("amult",1);
	double const dcmult = arginfo.getValue<double>("cmult",1);
	double const dgmult = arginfo.getValue<double>("gmult",1);
	double const dtmult = arginfo.getValue<double>("tmult",1);
	double const dpadmult = arginfo.getValue<double>("padmult",1);
	
	double maxmult = 0;
	maxmult = std::max(damult,maxmult);
	maxmult = std::max(dcmult,maxmult);
	maxmult = std::max(dgmult,maxmult);
	maxmult = std::max(dtmult,maxmult);
	maxmult = std::max(dpadmult,maxmult);
	
	uint64_t const amult = std::floor((damult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const cmult = std::floor((dcmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const gmult = std::floor((dgmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const tmult = std::floor((dtmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const padmult = std::floor((dpadmult / maxmult) * (1ull<<16) + 0.5);
	
	libmaus::fastx::FastAIndex::unique_ptr_type Pindex;
	libmaus::aio::CheckedInputStream::unique_ptr_type PCIS;
	if ( reference.size() )
	{
		libmaus::fastx::FastAIndex::unique_ptr_type Tindex(
			libmaus::fastx::FastAIndex::load(reference+".fai")
		);
		Pindex = UNIQUE_PTR_MOVE(Tindex);
		
		libmaus::aio::CheckedInputStream::unique_ptr_type TCIS(new libmaus::aio::CheckedInputStream(reference));
		PCIS = UNIQUE_PTR_MOVE(TCIS);
	}

	libmaus::autoarray::AutoArray<libmaus::bambam::cigar_operation> cigop;
	libmaus::autoarray::AutoArray<char> bases;
	
	int64_t prevrefid = -1;
	std::string refidname = "*";
	
	std::map< uint64_t, HeapEntry > M;
	uint64_t alcnt = 0;
	std::vector< std::pair<char,uint8_t> > pendinginserts;
	int64_t loadedRefId = -1;
	int64_t streamRefId = -1;
	libmaus::autoarray::AutoArray<char> refseqbases;
	ConsensusAccuracy * consacc = 0;
	std::map<uint64_t,ConsensusAccuracy> Mconsacc;
	typedef libmaus::util::shared_ptr<std::ostringstream>::type stream_ptr_type;
	stream_ptr_type Pstream;
	ConsensusAux Caux;
	
	Caux.M['a'] = Caux.M['A'] = amult;
	Caux.M['c'] = Caux.M['C'] = cmult;
	Caux.M['g'] = Caux.M['G'] = gmult;
	Caux.M['t'] = Caux.M['T'] = tmult;
	Caux.M[padsym] = padmult;
	
	while ( dec.readAlignment() )
	{
		if ( algn.isMapped() && (!algn.isQCFail()) )
		{
			assert ( ! pendinginserts.size() );
		
			uint32_t const numcigop = algn.getCigarOperations(cigop);
			uint64_t readpos = 0;
			uint64_t refpos = algn.getPos();
			uint64_t const seqlen = algn.decodeRead(bases);
			uint8_t const * qual = libmaus::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin());
			
			// handle finished columns
			if ( algn.getRefID() != prevrefid )
			{
				while ( M.size() )
				{
					HeapEntry & H = M.begin()->second;
					
					if ( outputprefix.size() && (streamRefId != prevrefid) )
					{
						if ( Pstream )
						{
							std::ostringstream fnostr;
							fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
							libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
							PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
							PFOS << Pstream->str() << '\n';
							
							Pstream.reset();
						}
						
						stream_ptr_type Tstream(new std::ostringstream);
						Pstream = Tstream;
						streamRefId = prevrefid;
					}
					
					if ( Pindex && (loadedRefId != prevrefid) )
					{
						refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
						loadedRefId = prevrefid;
						
						if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
							Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());
						
						consacc = &(Mconsacc[loadedRefId]);
					}
					
					H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());
					
					M.erase(M.begin());
				}
			
				prevrefid = algn.getRefID();
				refidname = header.getRefIDName(prevrefid);
			}
			else
			{
				while ( M.size() && M.begin()->first < refpos )
				{
					HeapEntry & H = M.begin()->second;

					if ( outputprefix.size() && (streamRefId != prevrefid) )
					{
						if ( Pstream )
						{
							std::ostringstream fnostr;
							fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
							libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
							PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
							PFOS << Pstream->str() << '\n';

							Pstream.reset();
						}
						
						stream_ptr_type Tstream(new std::ostringstream);
						Pstream = Tstream;
						streamRefId = prevrefid;
					}

					if ( Pindex && (loadedRefId != prevrefid) )
					{
						refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
						loadedRefId = prevrefid;

						if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
							Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());

						consacc = &(Mconsacc[loadedRefId]);
					}
					
					H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());
					
					M.erase(M.begin());				
				}
			}
			
			for ( uint64_t ci = 0; ci < numcigop; ++ci )
			{
				uint64_t const ciglen = cigop[ci].second;
				
				switch ( cigop[ci].first )
				{
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CMATCH:
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CEQUAL:
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CDIFF:
					{
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}
					
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							M[refpos].V.push_back(std::make_pair(bases[readpos],qual[readpos]));
							readpos++;
							refpos++;
						}
						break;
					}
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CINS:
					{
						for ( uint64_t i = 0; i < ciglen; ++i, ++readpos )
							pendinginserts.push_back(std::make_pair(bases[readpos],qual[readpos]));
						break;
					}
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CDEL:
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}
						
						// deleting bases from the reference
						for ( uint64_t i = 0; i < ciglen; ++i, ++refpos )
							M[refpos].V.push_back(std::make_pair(padsym,0));
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CREF_SKIP:
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}

						// skip bases on reference
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							refpos++;
						}
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CSOFT_CLIP:
						// skip bases on read
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							readpos++;
						}
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CHARD_CLIP:
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CPAD:
					{
						for ( uint64_t i = 0; i < ciglen; ++i, ++readpos )
							pendinginserts.push_back(std::make_pair(padsym,0));
						break;
					}
				}
			}

			if ( pendinginserts.size() )
			{
				M[refpos].I.push_back(pendinginserts);
				M[refpos].iadd++;
				pendinginserts.resize(0);
			}

			assert ( readpos == seqlen );
		}
		
		if ( verbose && ((++alcnt % (1024*1024)) == 0) )
			std::cerr << "[V] " << alcnt << std::endl;
	}

	while ( M.size() )
	{
		HeapEntry & H = M.begin()->second;

		if ( outputprefix.size() && (streamRefId != prevrefid) )
		{
			if ( Pstream )
			{
				std::ostringstream fnostr;
				fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
				libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
				PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
				PFOS << Pstream->str() << '\n';

				Pstream.reset();
			}
			
			stream_ptr_type Tstream(new std::ostringstream);
			Pstream = Tstream;
			streamRefId = prevrefid;
		}

		if ( Pindex && (loadedRefId != prevrefid) )
		{
			refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
			loadedRefId = prevrefid;

			if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
				Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());

			consacc = &(Mconsacc[loadedRefId]);
		}
			
		H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());
		
		M.erase(M.begin());
	}
	
	if ( Pstream )
	{
		std::ostringstream fnostr;
		fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
		libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
		PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
		PFOS << Pstream->str() << '\n';

		Pstream.reset();
	}
	
	ConsensusAccuracy constotal;
	for ( std::map<uint64_t,ConsensusAccuracy>::const_iterator ita = Mconsacc.begin(); ita != Mconsacc.end(); ++ita )
	{
		std::cerr << header.getRefIDName(ita->first) << "\t" << ita->second << std::endl;

		std::map<uint64_t,uint64_t> const M = ita->second.depthhistogram.get();
		uint64_t total = 0;
		uint64_t preavg = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			total += aita->second;
			preavg += aita->first * aita->second;
		}

		uint64_t acc = 0;		
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",+]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",-]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		
		std::cerr << "H[" << header.getRefIDName(ita->first) << ",avg]\t" << 
			static_cast<double>(preavg)/total << std::endl;
		
		constotal += ita->second;
	}
	if ( Mconsacc.size() )
	{
		std::cerr << "all\t" << constotal << std::endl;

		std::map<uint64_t,uint64_t> const M = constotal.depthhistogram.get();
		uint64_t total = 0;
		uint64_t preavg = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			total += aita->second;
			preavg += aita->first * aita->second;
		}

		uint64_t acc = 0;		
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << "all" << "," << aita->first << ",+]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << "all" << "," << aita->first << ",-]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		
		std::cerr << "H[all,avg]\t" << static_cast<double>(preavg) / total << std::endl;
		
	}

	return EXIT_SUCCESS;
}