コード例 #1
0
/**
 * Rewrite a BAM file so that its header is restricted to the used sequences.
 *
 * The input file is opened and read twice, once to compute the vector of
 * used sequences and once to produce the filtered output on standard output.
 *
 * @param arginfo parsed command line; unparsed rest argument 0 is the input file name
 * @return EXIT_SUCCESS
 **/
int bamfilterheader2(libmaus::util::ArgInfo const & arginfo)
{
	std::string const inputfilename = arginfo.getUnparsedRestArg(0);

	// filled in by the first pass, consumed by the second
	::libmaus::bitio::IndexedBitVector::unique_ptr_type usedseq;

	// first pass: determine the vector of used sequences
	{
		libmaus::aio::PosixFdInputStream firstpass(inputfilename);
		::libmaus::bitio::IndexedBitVector::unique_ptr_type computed(getUsedSeqVector(arginfo,firstpass));
		usedseq = UNIQUE_PTR_MOVE(computed);
	}

	// second pass: copy the file while dropping all unused sequences from the header
	{
		libmaus::aio::PosixFdInputStream secondpass(inputfilename);
		filterBamUsedSequences(arginfo,secondpass,*usedseq,std::cout);
	}

	return EXIT_SUCCESS;
}
コード例 #2
0
/**
 * Collate the BAM stream on standard input into read pairs, harmonise the
 * coordinates of pairs for which getMapCnt counts exactly one end, and write
 * the result as BAM to standard output.
 *
 * For such pairs the reference id and position of the counted end are written
 * to the own and the mate coordinate fields of every end that is present.
 *
 * Options read from arginfo: verbose, colhashbits, collistsize, level, tmpfile,
 * md5 / md5filename and index / indexfilename.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws ::libmaus::exception::LibMausException if level is not one of
 *         Z_NO_COMPRESSION, Z_BEST_SPEED, Z_BEST_COMPRESSION, Z_DEFAULT_COMPRESSION
 **/
int bamfixmatecoordinates(::libmaus::util::ArgInfo const & arginfo)
{
	::libmaus::util::TempFileRemovalContainer::setup();
	::libmaus::timing::RealTimeClock rtc; rtc.start();
	
	bool const verbose = arginfo.getValue<unsigned int>("verbose",getDefaultVerbose());
	unsigned int const colhashbits = arginfo.getValue<unsigned int>("colhashbits",getDefaultColHashBits());
	unsigned int const collistsize = arginfo.getValue<unsigned int>("collistsize",getDefaultColListSize());
	int const level = arginfo.getValue<int>("level",getDefaultLevel());
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	
	// only the four named zlib levels are accepted
	switch ( level )
	{
		case Z_NO_COMPRESSION:
		case Z_BEST_SPEED:
		case Z_BEST_COMPRESSION:
		case Z_DEFAULT_COMPRESSION:
			break;
		default:
		{
			::libmaus::exception::LibMausException se;
			se.getStream()
				<< "Unknown compression level, please use"
				<< " level=" << Z_DEFAULT_COMPRESSION << " (default) or"
				<< " level=" << Z_BEST_SPEED << " (fast) or"
				<< " level=" << Z_BEST_COMPRESSION << " (best) or"
				<< " level=" << Z_NO_COMPRESSION << " (no compression)" << std::endl;
			se.finish();
			throw se;
		}
	}

	// temporary file used by the collating decoder, registered for removal
	std::string const tmpfilename = tmpfilenamebase + "_bamcollate";
	::libmaus::util::TempFileRemovalContainer::addTempFile(tmpfilename);
	
	::libmaus::bambam::CollatingBamDecoder CBD(std::cin,tmpfilename,false /* put rank */,colhashbits/*hash bits*/,collistsize/*size of output list*/);
	::libmaus::bambam::BamHeader const & bamheader = CBD.getHeader();
	
	// add PG line to header
	std::string const upheadtext = ::libmaus::bambam::ProgramHeaderLineSet::addProgramLine(
		bamheader.text,
		"bamfixmatecoordinates", // ID
		"bamfixmatecoordinates", // PN
		arginfo.commandline, // CL
		::libmaus::bambam::ProgramHeaderLineSet(bamheader.text).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN			
	);
	// construct new header
	::libmaus::bambam::BamHeader uphead(upheadtext);
	
	// the output is in collation order, so any sort order other than queryname is no longer valid
	if ( uphead.getSortOrder() != "queryname" )
		uphead.changeSortOrder("unknown");

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	// the writer takes a null pointer when no callback is requested
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */
	
	// setup bam writer
	::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,uphead,level,Pcbs));

	typedef ::libmaus::bambam::CollatingBamDecoder::alignment_ptr_type alignment_ptr_type;
	std::pair<alignment_ptr_type,alignment_ptr_type> P;
	uint64_t const mod = 1024*1024;
	uint64_t proc = 0;
	uint64_t lastproc = 0;
	uint64_t paircnt = 0;
	
	while ( CBD.tryPair(P) )
	{
		uint64_t const mapcnt = getMapCnt(P.first) + getMapCnt(P.second);
		
		if ( mapcnt == 1 )
		{
			// exactly one end contributed to mapcnt; that end supplies the coordinates.
			// Picking P.first merely because it is present would propagate the
			// coordinates of the other (uncounted) end.
			alignment_ptr_type const & mapped = (getMapCnt(P.first) != 0) ? P.first : P.second;
			assert ( mapped );

			int32_t const refid = mapped->getRefID();
			int32_t const pos = mapped->getPos();
			
			// either end may be missing, so only touch the ones we actually have
			if ( P.first )
			{
				P.first->putRefId(refid);
				P.first->putPos(pos);
				P.first->putNextRefId(refid);
				P.first->putNextPos(pos);
			}
			if ( P.second )
			{
				P.second->putRefId(refid);
				P.second->putPos(pos);
				P.second->putNextRefId(refid);
				P.second->putNextPos(pos);
			}
		}
		
		if ( P.first )
		{
			P.first->serialise(writer->getStream());
			++proc;
		}
		if ( P.second )
		{
			P.second->serialise(writer->getStream());
			++proc;
		}
		if ( P.first && P.second )
		{
			paircnt++;
		}
		
		// progress report each time proc crosses a multiple of mod
		if ( verbose && (proc/mod != lastproc/mod) )
		{
			std::cerr 
				<< "Processed " << proc << " fragments, " << paircnt << " pairs, " 
				<< proc/rtc.getElapsedSeconds() << " al/s"
				<< std::endl;
			lastproc = proc;
		}
	}		

	if ( verbose )
		std::cerr 	
			<< "Processed " << proc << " fragments, " << paircnt << " pairs, " 
			<< proc/rtc.getElapsedSeconds() << " al/s"
			<< std::endl;

	// destroy the writer before the callbacks are asked for their results
	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
コード例 #3
0
/**
 * Read FastA records from standard input and write them through a BgzfDeflate
 * object to out, one record per "&gt;name" line followed by a single sequence line.
 *
 * While writing, an index is serialised into memory. It starts with the
 * deflater's input buffer size, followed for every record by: name, short name,
 * sequence length, value of zoffset at the record start, number of blocks,
 * the value of zoffset before each sequence block and once more after the
 * last block. It ends with the number of records, the index offset of each
 * record and the offset of this trailing table. The index is stored to the
 * file named by the "index" argument if that argument is non-empty.
 *
 * zoffset accumulates the second component returned by writeSyncedCount
 * (presumably the number of compressed bytes written; confirm against libmaus).
 *
 * @param arginfo parsed command line (keys "level" and "index" are read)
 * @param out destination stream for the compressed data
 **/
void normalisefastaBgzf(libmaus::util::ArgInfo const & arginfo, std::ostream & out)
{
	libmaus::fastx::StreamFastAReaderWrapper fastain(std::cin);
	libmaus::fastx::StreamFastAReaderWrapper::pattern_type record;
	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue("level",getDefaultLevel()));
	std::string const indexfilename = arginfo.getUnparsedValue("index","");

	::libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(level);

	libmaus::lz::BgzfDeflate<std::ostream> defl(out,level,false /* full flush */);
	uint64_t const blocksize = defl.getInputBufferSize();
	// running offset fed by writeSyncedCount
	uint64_t zoffset = 0;
	// number of index bytes serialised so far
	uint64_t ioffset = 0;
	std::vector<libmaus::fastx::BgzfFastAIndexEntry> entries;
	std::ostringstream indexstr;

	// index header: size of one uncompressed block
	ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,blocksize);
	uint64_t seqid = 0;

	while ( fastain.getNextPatternUnlocked(record) )
	{
		std::string const name = record.getStringId();
		std::string const shortname = stripName(name);
		std::string const & sequence = record.spattern;
		char const * const seqdata = sequence.c_str();
		uint64_t const seqlen = sequence.size();
		// number of blocksize sized pieces needed for the sequence (rounded up)
		uint64_t const numblocks = (seqlen + blocksize - 1)/blocksize;

		// remember where this record's index data starts
		entries.push_back(libmaus::fastx::BgzfFastAIndexEntry(shortname,seqid,ioffset));
		++seqid;

		ioffset += libmaus::util::StringSerialisation::serialiseString(indexstr,name);
		ioffset += libmaus::util::StringSerialisation::serialiseString(indexstr,shortname);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,seqlen);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,numblocks);

		// FastA header line
		std::string const headerline = std::string(">") + name + "\n";
		std::pair<uint64_t,uint64_t> const headerinfo = defl.writeSyncedCount(headerline.c_str(),headerline.size());
		zoffset += headerinfo.second;

		// sequence data, written in pieces of at most blocksize symbols
		for ( uint64_t written = 0; written != seqlen; )
		{
			assert ( written % blocksize == 0 );
			uint64_t const chunk = std::min(seqlen-written,blocksize);
			std::pair<uint64_t,uint64_t> const blockinfo = defl.writeSyncedCount(seqdata+written,chunk);

			// offset at which this block starts
			ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);

			zoffset += blockinfo.second;
			written += chunk;
		}

		// offset behind the last block of the sequence
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);

		// line terminator following the sequence
		std::pair<uint64_t,uint64_t> const newlineinfo = defl.writeSyncedCount("\n",1);
		zoffset += newlineinfo.second;
	}

	defl.flush();
	out << std::flush;

	// trailing table: record count and the index offset of each record
	uint64_t const imetaoffset = ioffset;

	ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,entries.size());
	for ( uint64_t e = 0; e < entries.size(); ++e )
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,entries[e].ioffset);

	// last number in the index: where the trailing table starts
	libmaus::util::NumberSerialisation::serialiseNumber(indexstr,imetaoffset);

	if ( !indexfilename.empty() )
	{
		std::string const serialised = indexstr.str();
		libmaus::aio::CheckedOutputStream indexCOS(indexfilename);
		indexCOS.write(serialised.c_str(),serialised.size());
		indexCOS.flush();
		indexCOS.close();
	}
}
コード例 #4
0
ファイル: bamheap2.cpp プロジェクト: allenday/biobambam
/**
 * Build per reference position columns (HeapEntry objects) from the mapped,
 * non QC-fail alignments delivered by the decoder and print every finished
 * column via HeapEntry::toStream on standard output.
 *
 * Options read from arginfo:
 *  - verbose: print the number of processed alignments every 1024*1024 records
 *  - reference: FastA file; its index is loaded from reference+".fai" and the
 *    reference symbol of each column as well as a per sequence
 *    ConsensusAccuracy object are handed to toStream
 *  - outputprefix: if non-empty, an additional string stream is handed to
 *    toStream per reference sequence and its content is stored in the file
 *    outputprefix_refname, preceded by a "&gt;refname" line
 *  - amult, cmult, gmult, tmult, padmult: symbol multipliers; they are divided
 *    by their maximum and scaled to 16 bit fixed point before being stored in
 *    the ConsensusAux table
 *
 * Per sequence and total ConsensusAccuracy values and depth histograms are
 * printed on standard error at the end.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 **/
int bamheap2(libmaus::util::ArgInfo const & arginfo)
{
	bool const verbose = arginfo.getValue("verbose",getDefaultVerbose());
	std::string const reference = arginfo.getUnparsedValue("reference",std::string());
	std::string const outputprefix = arginfo.getUnparsedValue("outputprefix",std::string());
	
	libmaus::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));
	::libmaus::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus::bambam::BamHeader const & header = dec.getHeader();	
	::libmaus::bambam::BamAlignment const & algn = dec.getAlignment();
	
	double const damult = arginfo.getValue<double>("amult",1);
	double const dcmult = arginfo.getValue<double>("cmult",1);
	double const dgmult = arginfo.getValue<double>("gmult",1);
	double const dtmult = arginfo.getValue<double>("tmult",1);
	double const dpadmult = arginfo.getValue<double>("padmult",1);
	
	double maxmult = 0;
	maxmult = std::max(damult,maxmult);
	maxmult = std::max(dcmult,maxmult);
	maxmult = std::max(dgmult,maxmult);
	maxmult = std::max(dtmult,maxmult);
	maxmult = std::max(dpadmult,maxmult);
	
	// normalise by the largest multiplier and convert to 16 bit fixed point (rounded)
	// NOTE(review): if no multiplier is positive, maxmult is 0 and the divisions below are by zero
	uint64_t const amult = std::floor((damult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const cmult = std::floor((dcmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const gmult = std::floor((dgmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const tmult = std::floor((dtmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const padmult = std::floor((dpadmult / maxmult) * (1ull<<16) + 0.5);
	
	// optional reference: FastA index and input stream on the FastA file
	libmaus::fastx::FastAIndex::unique_ptr_type Pindex;
	libmaus::aio::CheckedInputStream::unique_ptr_type PCIS;
	if ( reference.size() )
	{
		libmaus::fastx::FastAIndex::unique_ptr_type Tindex(
			libmaus::fastx::FastAIndex::load(reference+".fai")
		);
		Pindex = UNIQUE_PTR_MOVE(Tindex);
		
		libmaus::aio::CheckedInputStream::unique_ptr_type TCIS(new libmaus::aio::CheckedInputStream(reference));
		PCIS = UNIQUE_PTR_MOVE(TCIS);
	}

	libmaus::autoarray::AutoArray<libmaus::bambam::cigar_operation> cigop;
	libmaus::autoarray::AutoArray<char> bases;
	
	// reference id and name of the columns currently stored in M
	int64_t prevrefid = -1;
	std::string refidname = "*";
	
	// open columns, keyed by reference position
	std::map< uint64_t, HeapEntry > M;
	uint64_t alcnt = 0;
	// inserted symbols waiting for the reference position they are attached to
	std::vector< std::pair<char,uint8_t> > pendinginserts;
	// reference id for which refseqbases / consacc are currently loaded
	int64_t loadedRefId = -1;
	// reference id Pstream currently collects output for
	int64_t streamRefId = -1;
	libmaus::autoarray::AutoArray<char> refseqbases;
	ConsensusAccuracy * consacc = 0;
	std::map<uint64_t,ConsensusAccuracy> Mconsacc;
	typedef libmaus::util::shared_ptr<std::ostringstream>::type stream_ptr_type;
	stream_ptr_type Pstream;
	ConsensusAux Caux;
	
	Caux.M['a'] = Caux.M['A'] = amult;
	Caux.M['c'] = Caux.M['C'] = cmult;
	Caux.M['g'] = Caux.M['G'] = gmult;
	Caux.M['t'] = Caux.M['T'] = tmult;
	Caux.M[padsym] = padmult;
	
	while ( dec.readAlignment() )
	{
		if ( algn.isMapped() && (!algn.isQCFail()) )
		{
			assert ( ! pendinginserts.size() );
		
			uint32_t const numcigop = algn.getCigarOperations(cigop);
			uint64_t readpos = 0;
			uint64_t refpos = algn.getPos();
			uint64_t const seqlen = algn.decodeRead(bases);
			uint8_t const * qual = libmaus::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin());
			
			// handle finished columns: all of them if the reference sequence changes,
			// otherwise those in front of the start of the current alignment
			bool const refchange = ( algn.getRefID() != prevrefid );

			while ( M.size() && (refchange || M.begin()->first < refpos) )
			{
				HeapEntry & H = M.begin()->second;
				
				if ( outputprefix.size() && (streamRefId != prevrefid) )
				{
					// store the text collected for the previous reference sequence
					if ( Pstream )
					{
						std::ostringstream fnostr;
						fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
						libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
						PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
						PFOS << Pstream->str() << '\n';
						
						Pstream.reset();
					}
					
					stream_ptr_type Tstream(new std::ostringstream);
					Pstream = Tstream;
					streamRefId = prevrefid;
				}
				
				if ( Pindex && (loadedRefId != prevrefid) )
				{
					// load the reference sequence the columns in M belong to
					refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
					loadedRefId = prevrefid;
					
					if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
						Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());
					
					consacc = &(Mconsacc[loadedRefId]);
				}
				
				// -1 is passed as reference symbol if the position is not covered by refseqbases
				H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());
				
				M.erase(M.begin());
			}

			if ( refchange )
			{
				prevrefid = algn.getRefID();
				refidname = header.getRefIDName(prevrefid);
			}
			
			for ( uint64_t ci = 0; ci < numcigop; ++ci )
			{
				uint64_t const ciglen = cigop[ci].second;
				
				switch ( cigop[ci].first )
				{
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CMATCH:
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CEQUAL:
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CDIFF:
					{
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}
					
						// consumes read and reference
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							M[refpos].V.push_back(std::make_pair(bases[readpos],qual[readpos]));
							readpos++;
							refpos++;
						}
						break;
					}
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CINS:
					{
						// consumes read only
						for ( uint64_t i = 0; i < ciglen; ++i, ++readpos )
							pendinginserts.push_back(std::make_pair(bases[readpos],qual[readpos]));
						break;
					}
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CDEL:
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}
						
						// deleting bases from the reference
						for ( uint64_t i = 0; i < ciglen; ++i, ++refpos )
							M[refpos].V.push_back(std::make_pair(padsym,0));
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CREF_SKIP:
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}

						// skip bases on reference
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							refpos++;
						}
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CSOFT_CLIP:
						// skip bases on read
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							readpos++;
						}
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CHARD_CLIP:
						break;
					case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CPAD:
					{
						// padding consumes neither read nor reference bases (SAM CIGAR 'P'),
						// so readpos must not be advanced here
						for ( uint64_t i = 0; i < ciglen; ++i )
							pendinginserts.push_back(std::make_pair(padsym,0));
						break;
					}
				}
			}

			// inserts at the end of the alignment, not followed by a reference consuming operation
			if ( pendinginserts.size() )
			{
				M[refpos].I.push_back(pendinginserts);
				M[refpos].iadd++;
				pendinginserts.resize(0);
			}

			assert ( readpos == seqlen );
		}
		
		if ( verbose && ((++alcnt % (1024*1024)) == 0) )
			std::cerr << "[V] " << alcnt << std::endl;
	}

	// flush the columns remaining after the last alignment
	while ( M.size() )
	{
		HeapEntry & H = M.begin()->second;

		if ( outputprefix.size() && (streamRefId != prevrefid) )
		{
			if ( Pstream )
			{
				std::ostringstream fnostr;
				fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
				libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
				PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
				PFOS << Pstream->str() << '\n';

				Pstream.reset();
			}
			
			stream_ptr_type Tstream(new std::ostringstream);
			Pstream = Tstream;
			streamRefId = prevrefid;
		}

		if ( Pindex && (loadedRefId != prevrefid) )
		{
			refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
			loadedRefId = prevrefid;

			if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
				Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());

			consacc = &(Mconsacc[loadedRefId]);
		}
			
		H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());
		
		M.erase(M.begin());
	}
	
	// store the text collected for the last reference sequence
	if ( Pstream )
	{
		std::ostringstream fnostr;
		fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
		libmaus::aio::PosixFdOutputStream PFOS(fnostr.str());
		PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
		PFOS << Pstream->str() << '\n';

		Pstream.reset();
	}
	
	// per sequence statistics on standard error, accumulated into constotal
	ConsensusAccuracy constotal;
	for ( std::map<uint64_t,ConsensusAccuracy>::const_iterator ita = Mconsacc.begin(); ita != Mconsacc.end(); ++ita )
	{
		std::cerr << header.getRefIDName(ita->first) << "\t" << ita->second << std::endl;

		std::map<uint64_t,uint64_t> const hist = ita->second.depthhistogram.get();
		uint64_t total = 0;
		uint64_t preavg = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = hist.begin(); aita != hist.end(); ++aita )
		{
			total += aita->second;
			preavg += aita->first * aita->second;
		}

		// histogram with counts accumulated from the smallest key upwards
		uint64_t acc = 0;		
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = hist.begin(); aita != hist.end(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",+]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		// histogram with counts accumulated from the largest key downwards
		acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = hist.rbegin(); aita != hist.rend(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",-]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		
		std::cerr << "H[" << header.getRefIDName(ita->first) << ",avg]\t" << 
			static_cast<double>(preavg)/total << std::endl;
		
		constotal += ita->second;
	}
	// the same statistics over all sequences
	if ( Mconsacc.size() )
	{
		std::cerr << "all\t" << constotal << std::endl;

		std::map<uint64_t,uint64_t> const hist = constotal.depthhistogram.get();
		uint64_t total = 0;
		uint64_t preavg = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = hist.begin(); aita != hist.end(); ++aita )
		{
			total += aita->second;
			preavg += aita->first * aita->second;
		}

		uint64_t acc = 0;		
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = hist.begin(); aita != hist.end(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << "all" << "," << aita->first << ",+]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = hist.rbegin(); aita != hist.rend(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << "all" << "," << aita->first << ",-]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		
		std::cerr << "H[all,avg]\t" << static_cast<double>(preavg) / total << std::endl;
		
	}

	return EXIT_SUCCESS;
}
コード例 #5
0
ファイル: bamfilter.cpp プロジェクト: KateTaylor/biobambam
/**
 * Copy the BAM stream on standard input to standard output, keeping only the
 * alignments passing the minmapped, maxmapped and minlen thresholds.
 *
 * Each alignment gets a score between 0 and 3: one point if the unmapped flag
 * is not set, one if the mate unmapped flag is not set and one if the proper
 * pair flag is set. An alignment is kept if minmapped <= score <= maxmapped
 * and its getLseq() value is at least minlen.
 *
 * Further options read from arginfo: level, tmpfile, md5 / md5filename and
 * index / indexfilename.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 **/
int bamfilter(libmaus::util::ArgInfo const & arginfo)
{
	uint64_t const lowerbound = arginfo.getValue<uint64_t>("minmapped",getDefaultMinMapped());
	uint64_t const upperbound = arginfo.getValue<uint64_t>("maxmapped",getDefaultMaxMapped());
	uint64_t const shortest = arginfo.getValue<uint64_t>("minlen",getDefaultMinLen());
	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));

	::libmaus::bambam::BamDecoder BD(std::cin);
	::libmaus::bambam::BamHeader const & inheader = BD.getHeader();
	::libmaus::bambam::BamAlignment & alignment = BD.getAlignment();

	// temporary file handed to the index callback, registered for removal
	std::string const tmpbase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const indextmp = tmpbase + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(indextmp);

	std::string md5filename;
	std::string indexfilename;
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > callbacks;

	// optional md5 callback, only set up if a non-empty file name was given
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		bool const havename = arginfo.hasArg("md5filename") && arginfo.getUnparsedValue("md5filename","") != "";

		if ( havename )
		{
			md5filename = arginfo.getUnparsedValue("md5filename","");

			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			callbacks.push_back(Pmd5cb.get());
		}
		else
		{
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		}
	}

	// optional index callback, only set up if a non-empty file name was given
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		bool const havename = arginfo.hasArg("indexfilename") && arginfo.getUnparsedValue("indexfilename","") != "";

		if ( havename )
		{
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(indextmp));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			callbacks.push_back(Pindex.get());
		}
		else
		{
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		}
	}

	// the writer takes a null pointer when there is no callback at all
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * const Pcallbacks = callbacks.empty() ? 0 : &callbacks;

	::libmaus::bambam::BamHeader::unique_ptr_type outheader(libmaus::bambam::BamHeaderUpdate::updateHeader(arginfo,inheader,"bamfilter",std::string(PACKAGE_VERSION)));
	::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,*outheader,level,Pcallbacks));

	while ( BD.readAlignment() )
	{
		// flag based score, see function description
		uint64_t score = 0;

		if ( !(alignment.getFlags() & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FUNMAP) )
			score += 1;
		if ( !(alignment.getFlags() & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FMUNMAP) )
			score += 1;
		if ( alignment.getFlags() & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FPROPER_PAIR )
			score += 1;

		// drop everything outside the requested score range or below the length threshold
		if ( score < lowerbound )
			continue;
		if ( score > upperbound )
			continue;
		if ( alignment.getLseq() < static_cast<int64_t>(shortest) )
			continue;

		alignment.serialise(writer->getStream());
	}

	// destroy the writer before the callbacks are asked for their results
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);

	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
コード例 #6
0
/**
 * Copy a BAM file from in to out, writing a header restricted to the sequences
 * marked in IBV and renumbering the reference ids of the alignments accordingly.
 *
 * The new id of a sequence is the number of set bits in IBV up to and including
 * its old id, minus one (IBV.rank1(refid)-1).
 *
 * Options read from arginfo: verbose, level, md5 / md5filename.
 *
 * @param arginfo parsed command line
 * @param in BAM input stream (bgzf compressed)
 * @param IBV bit vector over the reference ids of the input header, set for
 *        every sequence to be kept
 * @param out stream receiving the filtered BAM file
 **/
static void filterBamUsedSequences(
	libmaus::util::ArgInfo const & arginfo,
	std::istream & in,
	::libmaus::bitio::IndexedBitVector const & IBV,
	std::ostream & out
)
{
	libmaus::lz::BgzfInflateStream bgzfin(in);
	libmaus::bambam::BamHeaderLowMem::unique_ptr_type PBHLM ( libmaus::bambam::BamHeaderLowMem::constructFromBAM(bgzfin));

	bool const verbose = arginfo.getValue<unsigned int>("verbose",getDefaultVerbose());
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	std::string md5filename;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	libmaus::lz::BgzfDeflate<std::ostream>::unique_ptr_type Pbgzfout(
		new libmaus::lz::BgzfDeflate<std::ostream>(
			out,level
		)
	);
	libmaus::lz::BgzfDeflate<std::ostream> & bgzfout = *Pbgzfout;

	// hand the collected callbacks to the compressor; without this the md5
	// callback never sees any output block and the digest saved below does
	// not cover the written data
	for ( uint64_t i = 0; i < cbs.size(); ++i )
		bgzfout.registerBlockOutputCallback(cbs[i]);
	
	if ( verbose )
		std::cerr << "[V] writing filtered header...";
	PBHLM->serialiseSequenceSubset(bgzfout,IBV,"bamfilterheader2" /* id */,"bamfilterheader2" /* pn */,
		arginfo.commandline /* pgCL */, PACKAGE_VERSION /* pgVN */
	);
	if ( verbose )
		std::cerr << "done." << std::endl;

	::libmaus::bambam::BamAlignment algn;
	uint64_t c = 0;
	while ( libmaus::bambam::BamAlignmentDecoder::readAlignmentGz(bgzfin,algn) )
	{
		// own reference id: renumber for mapped reads, clear for unmapped ones
		if ( algn.isMapped() )
		{
			int64_t const refid = algn.getRefID();
			assert ( refid >= 0 );
			assert ( IBV.get(refid) );
			algn.putRefId(IBV.rank1(refid)-1);
		}
		else
		{
			algn.putRefId(-1);
		}
		
		// mate reference id: renumber for paired reads which are mapped themselves
		// NOTE(review): this tests the mapping state of the read, not of its mate;
		// confirm this matches the way IBV is computed before changing it
		if ( algn.isPaired() && algn.isMapped() )
		{
			int64_t const refid = algn.getNextRefID();
			assert ( refid >= 0 );
			assert ( IBV.get(refid) );
			algn.putNextRefId(IBV.rank1(refid)-1);
		}
		else
		{
			algn.putNextRefId(-1);
		}
		
		algn.serialise(bgzfout);
		
		// progress report every 1024*1024 alignments
		if ( verbose && ( ((++c) & (1024*1024-1)) == 0 ) )
			std::cerr << "[V] " << c/(1024*1024) << std::endl;
	}
	
	bgzfout.flush();
	bgzfout.addEOFBlock();	
		
	// destroy the compressor before the digest is stored
	Pbgzfout.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
}
コード例 #7
0
/**
 * Fix up mate coordinates in a BAM stream in which mates are adjacent.
 *
 * Reads a BAM file from standard input, adds a PG line for this program to
 * the header text and writes the result as BAM to standard output. If the
 * sort order of the header is not "queryname" it is changed to "unknown".
 *
 * Two consecutive records are treated as a pair if both carry the paired
 * flag and have the same name. If exactly one record of such a pair is
 * mapped, the reference id and position of the mapped record are stored as
 * own and as mate coordinates in both records. All other records are
 * written as they were read.
 *
 * Keys read from arginfo: verbose, level, tmpfile, md5, md5filename, index,
 * indexfilename.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 */
int bamfixmatecoordinatesnamesorted(::libmaus::util::ArgInfo const & arginfo)
{
	bool const verbose = arginfo.getValue<unsigned int>("verbose",getDefaultVerbose());
	
	::libmaus::timing::RealTimeClock rtc; rtc.start();
	
	// gzip compression level for output
	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	
	::libmaus::bambam::BamDecoder bamfile(std::cin);
	std::string const headertext(bamfile.getHeader().text);

	// add PG line to header
	std::string const upheadtext = ::libmaus::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamfixmatecoordinatesnamesorted", // ID
		"bamfixmatecoordinatesnamesorted", // PN
		arginfo.commandline, // CL
		::libmaus::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN			
	);
	// construct new header
	::libmaus::bambam::BamHeader uphead(upheadtext);
	
	// a queryname sorted input keeps its sort order, anything else is declared unknown
	if ( uphead.getSortOrder() != "queryname" )
		uphead.changeSortOrder("unknown");
		
	// rebuild the header object from the (possibly modified) header text
	std::string const & finalheadtext = uphead.text;
	::libmaus::bambam::BamHeader finalheader(finalheadtext);

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	// callbacks handed to the writer; the objects are owned by Pmd5cb and Pindex
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	// the writer receives a null pointer if no callback was requested
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,finalheader,level,Pcbs));
	// look-ahead window of two records; the bool of each entry states whether the record is valid
	std::pair< std::pair< ::libmaus::bambam::BamAlignment::shared_ptr_type, bool> , std::pair< ::libmaus::bambam::BamAlignment::shared_ptr_type, bool> > 
		P(std::pair< ::libmaus::bambam::BamAlignment::shared_ptr_type, bool>(::libmaus::bambam::BamAlignment::shared_ptr_type(),false),std::pair< ::libmaus::bambam::BamAlignment::shared_ptr_type, bool>(::libmaus::bambam::BamAlignment::shared_ptr_type(),false));
	
	// try to read two alignments	
	P.first.second  = bamfile.readAlignment();
	if ( P.first.second )
	{
		P.first.first   = bamfile.salignment();
		P.second.second = P.first.second && bamfile.readAlignment();
		P.second.first  = bamfile.salignment();
	}
	
	uint64_t single = 0, pairs = 0;
	uint64_t proc = 0;
	uint64_t lastproc = 0;
	uint64_t const mod = 1024*1024;
	
	// while we have two alignments
	while ( P.first.second && P.second.second )
	{
		uint32_t const aflags = P.first.first->getFlags();
		uint32_t const bflags = P.second.first->getFlags();
	
		// same name?
		if ( 
			(aflags & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FPAIRED)
			&&
			(bflags & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FPAIRED)
			&&
			(! strcmp(P.first.first->getName(),P.second.first->getName()))
		)
		{			
			unsigned int const amap = (aflags & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FUNMAP) ? 0 : 1;
			unsigned int const bmap = (bflags & ::libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FUNMAP) ? 0 : 1;

			// if exactly one of the two is mapped
			if ( amap + bmap == 1 )
			{
				::libmaus::bambam::BamAlignment::shared_ptr_type mapped = amap ? P.first.first : P.second.first;
				int64_t const tid = mapped->getRefID();
				int64_t const pos = mapped->getPos();
				
				// set all tid and pos values
				P.first.first->putRefId(tid);
				P.first.first->putPos(pos);
				P.first.first->putNextRefId(tid);
				P.first.first->putNextPos(pos);
				P.second.first->putRefId(tid);
				P.second.first->putPos(pos);
				P.second.first->putNextRefId(tid);
				P.second.first->putNextPos(pos);
			}
		
			// write alignments
			P.first.first->serialise(writer->getStream());
			P.second.first->serialise(writer->getStream());
			// read new alignments
			P.first.second = bamfile.readAlignment();
			if ( P.first.second )
			{
				P.first.first = bamfile.salignment();
				P.second.second = bamfile.readAlignment();
				P.second.first = bamfile.salignment();
			}
			else
			{
				// end of input directly after a pair: the second slot was already
				// written, so mark it invalid as well. Leaving it set would trip
				// the final assertion for every input ending in a complete pair.
				P.second.second = false;
			}
			
			pairs++;
			proc += 2;
		}
		// different names
		else
		{
			// write first alignment
			P.first.first->serialise(writer->getStream());
			// move second to first
			std::swap(P.first,P.second);
			// read new second
			P.second.second = P.first.second && bamfile.readAlignment();
			if ( P.second.second )
				P.second.first = bamfile.salignment();
			
			single++;
			proc += 1;
		}
		
		// progress report each time proc crosses a multiple of mod
		if ( verbose && (proc/mod != lastproc/mod) )
		{
			std::cerr << proc << "\t" << single << "\t" << pairs << "\t" <<
				proc/rtc.getElapsedSeconds() << "al/s"
				<< std::endl;
			lastproc = proc;
		}
	}
	
	// a single record may be left over when the loop ends
	if ( P.first.second )
	{
		P.first.first->serialise(writer->getStream());
		single++;
		proc += 1;
	}

	if ( verbose )
		std::cerr << proc << "\t" << single << "\t" << pairs << "\t" <<
			proc/rtc.getElapsedSeconds() << "al/s"
			<< std::endl;
		
	assert ( ! P.second.second );

	// destroy the writer before the callbacks are asked for their results
	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}	
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
コード例 #8
0
ファイル: bamsplitdiv.cpp プロジェクト: RAkers/biobambam
// Default value for the "prefix" key: the default temporary file name reported by arginfo.
static std::string getDefaultFilePrefix(::libmaus::util::ArgInfo const & arginfo)
{
    std::string defaultprefix = arginfo.getDefaultTmpFileName();
    return defaultprefix;
}
コード例 #9
0
ファイル: bamsplitdiv.cpp プロジェクト: RAkers/biobambam
/**
 * Distribute the alignments of a BAM stream over div output BAM files.
 *
 * Reads a BAM file from standard input and writes alignment number c
 * (counting from 0) to output file number c % div. The output files are
 * named <prefix>_<i>.bam where i is the file number zero padded to six
 * digits. Each output file receives the header produced by updateHeader().
 *
 * Keys read from arginfo: level, verbose, div, prefix.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws ::libmaus::exception::LibMausException if standard input is a
 *         terminal or if div is zero or negative
 */
int bamsplitmod(libmaus::util::ArgInfo const & arginfo)
{
    if ( isatty(STDIN_FILENO) )
    {
        ::libmaus::exception::LibMausException se;
        se.getStream() << "Refusing read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
        se.finish();
        throw se;
    }

    int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
    int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
    // keep the signed value so that a negative number can be rejected below
    // instead of silently turning into a huge unsigned file count
    int const divarg = arginfo.getValue<int>("div",getDefaultDiv());
    std::string const prefix = arginfo.getUnparsedValue("prefix",getDefaultFilePrefix(arginfo));

    if ( ! divarg )
    {
        ::libmaus::exception::LibMausException se;
        se.getStream() << "div cannot be 0." << std::endl;
        se.finish();
        throw se;
    }
    if ( divarg < 0 )
    {
        ::libmaus::exception::LibMausException se;
        se.getStream() << "div cannot be negative." << std::endl;
        se.finish();
        throw se;
    }

    uint64_t const div = static_cast<uint64_t>(divarg);

    libmaus::bambam::BamDecoder bamdec(std::cin);
    libmaus::bambam::BamAlignment const & algn = bamdec.getAlignment();
    libmaus::bambam::BamHeader const & header = bamdec.getHeader();
    ::libmaus::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

    // one output stream and one BAM writer per output file
    libmaus::autoarray::AutoArray<libmaus::aio::CheckedOutputStream::unique_ptr_type> COS(div);
    libmaus::autoarray::AutoArray<libmaus::bambam::BamWriter::unique_ptr_type> writers(div);
    for ( uint64_t i = 0; i < div; ++i )
    {
        // file name is <prefix>_<six digit file number>.bam
        std::ostringstream ostr;
        ostr << prefix << "_" << std::setw(6) << std::setfill('0') << i << std::setw(0) << ".bam";

        libmaus::aio::CheckedOutputStream::unique_ptr_type tCOS(new libmaus::aio::CheckedOutputStream(ostr.str()));
        COS[i] = UNIQUE_PTR_MOVE(tCOS);
        libmaus::bambam::BamWriter::unique_ptr_type twriter(new libmaus::bambam::BamWriter(*COS[i],*uphead,level));
        writers[i] = UNIQUE_PTR_MOVE(twriter);
    }

    // number of alignments written so far
    uint64_t c = 0;
    if ( verbose )
    {
        while ( bamdec.readAlignment() )
        {
            algn.serialise ( writers [ (c++) % div ] -> getStream() );

            // progress report every 2^20 alignments
            if ( ((c) & ((1ull<<20)-1)) == 0 )
                std::cerr << "[V] " << c << std::endl;
        }
        std::cerr << "[V] " << c << std::endl;
    }
    else
    {
        while ( bamdec.readAlignment() )
            algn.serialise ( writers [ (c++) % div ] -> getStream() );
    }

    // destroy each writer before flushing and closing the stream it writes to
    for ( uint64_t i = 0; i < div; ++i )
    {
        writers[i].reset();
        COS[i]->flush();
        COS[i].reset();
    }

    return EXIT_SUCCESS;
}
コード例 #10
0
ファイル: bamcat.cpp プロジェクト: dozy/biobambam
/**
 * Concatenate BAM files and write the result as BAM to standard output.
 *
 * The input files are all values given as I=<file> followed by the unparsed
 * rest arguments. The output header is the one produced by updateHeader()
 * from the header of the concatenating decoder.
 *
 * Keys read from arginfo: level, verbose, I, tmpfile, md5, md5filename,
 * index, indexfilename.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws ::libmaus::exception::LibMausException if standard output is a
 *         terminal or the compression level is not one of the accepted values
 */
int bamcat(libmaus::util::ArgInfo const & arginfo)
{
	// binary output must not go to a terminal
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		ex.finish();
		throw ex;
	}

	int const level = arginfo.getValue<int>("level",getDefaultLevel());
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	
	// only the four Z_* constants below are accepted as compression level
	bool const levelok =
		(level == Z_NO_COMPRESSION) ||
		(level == Z_BEST_SPEED) ||
		(level == Z_BEST_COMPRESSION) ||
		(level == Z_DEFAULT_COMPRESSION);

	if ( ! levelok )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream()
			<< "Unknown compression level, please use"
			<< " level=" << Z_DEFAULT_COMPRESSION << " (default) or"
			<< " level=" << Z_BEST_SPEED << " (fast) or"
			<< " level=" << Z_BEST_COMPRESSION << " (best) or"
			<< " level=" << Z_NO_COMPRESSION << " (no compression)" << std::endl;
		ex.finish();
		throw ex;
	}

	// collect input file names: I=<file> values first, then the rest arguments
	std::vector<std::string> inputfilenames = arginfo.getPairValues("I");
	for ( uint64_t r = 0; r < arginfo.restargs.size(); ++r )
		inputfilenames.push_back(arginfo.restargs[r]);
	
	libmaus::bambam::BamCat catdec(inputfilenames /* ,true */);
	libmaus::bambam::BamAlignment const & record = catdec.getAlignment();
	libmaus::bambam::BamHeader const & inheader = catdec.getHeader();
	::libmaus::bambam::BamHeader::unique_ptr_type outheader(updateHeader(arginfo,inheader));

	/*
	 * set up the optional md5 and index callbacks
	 */
	std::string const tmpprefix = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const indextmpfile = tmpprefix + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(indextmpfile);

	std::string md5filename;
	std::string indexfilename;

	// raw pointers handed to the writer; ownership stays with Pmd5cb and Pindex
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > callbacks;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( (! arginfo.hasArg("md5filename")) || arginfo.getUnparsedValue("md5filename","") == "" )
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		else
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( ! md5filename.empty() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			callbacks.push_back(Pmd5cb.get());
		}
	}
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( (! arginfo.hasArg("indexfilename")) || arginfo.getUnparsedValue("indexfilename","") == "" )
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		else
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( ! indexfilename.empty() )
		{
			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(indextmpfile));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			callbacks.push_back(Pindex.get());
		}
	}
	// the writer gets a null pointer when no callback was requested
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcallbacks = callbacks.empty() ? 0 : &callbacks;

	::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,*outheader,level,Pcallbacks));
	libmaus::bambam::BamWriter::stream_type & bamoutstr = writer->getStream();

	// copy all alignments; in verbose mode report progress every 2^20 alignments
	uint64_t written = 0;
	while ( catdec.readAlignment() )
	{
		record.serialise(bamoutstr);

		if ( verbose && ( ((++written) & ((1ull<<20)-1)) == 0 ) )
			std::cerr << "[V] " << written << std::endl;
	}
	if ( verbose )
		std::cerr << "[V] " << written << std::endl;

	// destroy the writer before the callbacks are asked for their results
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
コード例 #11
0
ファイル: bam12split.cpp プロジェクト: KateTaylor/biobambam
/**
 * Filter a BAM stream through split12().
 *
 * Reads a BAM file from standard input, adds a PG line for this program to
 * the header, sets the sort order to "unknown" and writes every alignment
 * for which split12() returns true to standard output as BAM.
 *
 * Keys read from arginfo: level, verbose, tmpfile, md5, md5filename, index,
 * indexfilename. The keys keep and remove are only checked for being given
 * together; this function does not read their values.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws ::libmaus::exception::LibMausException if standard input or
 *         standard output is a terminal or if both keep and remove are given
 */
int bam12split(::libmaus::util::ArgInfo const & arginfo)
{
	::libmaus::util::TempFileRemovalContainer::setup();

	// binary data is neither read from nor written to a terminal
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		ex.finish();
		throw ex;
	}
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		ex.finish();
		throw ex;
	}
	
	bool const keepandremove = arginfo.hasArg("keep") && arginfo.hasArg("remove");
	if ( keepandremove )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream() << "The keep and remove keys are mutually exclusive." << std::endl;
		ex.finish();
		throw ex;
	}

	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());

	::libmaus::bambam::BamDecoder dec(std::cin,false);
	::libmaus::bambam::BamHeader const & inheader = dec.getHeader();

	std::string const inheadertext(inheader.text);

	// header text with an additional PG line for this program
	std::string const outheadertext = ::libmaus::bambam::ProgramHeaderLineSet::addProgramLine(
		inheadertext,
		"bam12split", // ID
		"bam12split", // PN
		arginfo.commandline, // CL
		::libmaus::bambam::ProgramHeaderLineSet(inheadertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);
	// output header, declared as unsorted
	libmaus::bambam::BamHeader outheader(outheadertext);
	outheader.changeSortOrder("unknown");

	/*
	 * set up the optional md5 and index callbacks
	 */
	std::string const tmpprefix = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const indextmpfile = tmpprefix + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(indextmpfile);

	std::string md5filename;
	std::string indexfilename;

	// raw pointers handed to the writer; ownership stays with Pmd5cb and Pindex
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > callbacks;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( (! arginfo.hasArg("md5filename")) || arginfo.getUnparsedValue("md5filename","") == "" )
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		else
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( ! md5filename.empty() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			callbacks.push_back(Pmd5cb.get());
		}
	}
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( (! arginfo.hasArg("indexfilename")) || arginfo.getUnparsedValue("indexfilename","") == "" )
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		else
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( ! indexfilename.empty() )
		{
			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(indextmpfile));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			callbacks.push_back(Pindex.get());
		}
	}
	// the writer gets a null pointer when no callback was requested
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcallbacks = callbacks.empty() ? 0 : &callbacks;
		
	::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,outheader,level,Pcallbacks));
	libmaus::bambam::BamAlignment & record = dec.getAlignment();
	// counts alignments read; only advanced in verbose mode, where it drives the progress output
	uint64_t seen = 0;

	while ( dec.readAlignment() )
	{
		// split12() works on the record in place and decides whether it is written
		if ( split12(record) )
			record.serialise(writer->getStream());

		if ( verbose && (++seen & (1024*1024-1)) == 0 )
			std::cerr << "[V] " << seen/(1024*1024) << std::endl;
	}

	// destroy the writer before the callbacks are asked for their results
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));
	
	return EXIT_SUCCESS;
}
コード例 #12
0
ファイル: bamclipreinsert.cpp プロジェクト: dozy/biobambam
/**
 * Apply clipReinsert() to every alignment of a BAM stream.
 *
 * Reads a BAM file from standard input, adds a PG line for this program to
 * the header, passes each alignment through clipReinsert() and writes it to
 * standard output as BAM.
 *
 * Keys read from arginfo: level, verbose, tmpfile, md5, md5filename, index,
 * indexfilename.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws ::libmaus::exception::LibMausException if standard input or
 *         standard output is a terminal or the compression level is not one
 *         of the accepted values
 */
int bamclipreinsert(::libmaus::util::ArgInfo const & arginfo)
{
	// binary data is neither read from nor written to a terminal
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		ex.finish();
		throw ex;
	}
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		ex.finish();
		throw ex;
	}
	
	int const level = arginfo.getValue<int>("level",getDefaultLevel());
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	
	// only the four Z_* constants below are accepted as compression level
	bool const levelok =
		(level == Z_NO_COMPRESSION) ||
		(level == Z_BEST_SPEED) ||
		(level == Z_BEST_COMPRESSION) ||
		(level == Z_DEFAULT_COMPRESSION);

	if ( ! levelok )
	{
		::libmaus::exception::LibMausException ex;
		ex.getStream()
			<< "Unknown compression level, please use"
			<< " level=" << Z_DEFAULT_COMPRESSION << " (default) or"
			<< " level=" << Z_BEST_SPEED << " (fast) or"
			<< " level=" << Z_BEST_COMPRESSION << " (best) or"
			<< " level=" << Z_NO_COMPRESSION << " (no compression)" << std::endl;
		ex.finish();
		throw ex;
	}

	::libmaus::bambam::BamDecoder dec(std::cin,false);
	::libmaus::bambam::BamHeader const & inheader = dec.getHeader();

	std::string const inheadertext(inheader.text);

	// header text with an additional PG line for this program
	std::string const outheadertext = ::libmaus::bambam::ProgramHeaderLineSet::addProgramLine(
		inheadertext,
		"bamclipreinsert", // ID
		"bamclipreinsert", // PN
		arginfo.commandline, // CL
		::libmaus::bambam::ProgramHeaderLineSet(inheadertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);
		
	// output header
	libmaus::bambam::BamHeader const outheader(outheadertext);

	/*
	 * set up the optional md5 and index callbacks
	 */
	std::string const tmpprefix = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const indextmpfile = tmpprefix + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(indextmpfile);

	std::string md5filename;
	std::string indexfilename;

	// raw pointers handed to the writer; ownership stays with Pmd5cb and Pindex
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > callbacks;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( (! arginfo.hasArg("md5filename")) || arginfo.getUnparsedValue("md5filename","") == "" )
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		else
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( ! md5filename.empty() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			callbacks.push_back(Pmd5cb.get());
		}
	}
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( (! arginfo.hasArg("indexfilename")) || arginfo.getUnparsedValue("indexfilename","") == "" )
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		else
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( ! indexfilename.empty() )
		{
			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(indextmpfile));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			callbacks.push_back(Pindex.get());
		}
	}
	// the writer gets a null pointer when no callback was requested
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcallbacks = callbacks.empty() ? 0 : &callbacks;

	::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,outheader,level,Pcallbacks));
	// aux filter passed to clipReinsert(); no tag is set in it here
	libmaus::bambam::BamAuxFilterVector bafv;
 	
	libmaus::bambam::BamAlignment & record = dec.getAlignment();
	// number of alignments processed
	uint64_t processed = 0;

	// scratch objects handed to clipReinsert() for every record
	libmaus::autoarray::AutoArray < std::pair<uint8_t,uint8_t> > auxtags;
	libmaus::autoarray::AutoArray<libmaus::bambam::cigar_operation> cigop;
	std::stack < libmaus::bambam::cigar_operation > hardstack;
	libmaus::bambam::BamAlignment::D_array_type Tcigar;

	while ( dec.readAlignment() )
	{
		// reinsert clipped parts and attach soft clipping cigar operations as needed
		clipReinsert(record,auxtags,bafv,cigop,Tcigar,hardstack);

		record.serialise(writer->getStream());

		processed += 1;
		
		// progress report every 2^20 alignments
		if ( verbose && (processed & (1024*1024-1)) == 0 )
 			std::cerr << "[V] " << processed/(1024*1024) << std::endl;
	}

	// destroy the writer before the callbacks are asked for their results
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
コード例 #13
0
ファイル: bam12auxmerge.cpp プロジェクト: RAkers/biobambam
/**
 * Merge auxiliary fields of a BAM file into a second BAM stream by rank.
 *
 * The first rest argument names the BAM file the auxiliary fields are taken
 * from (called the pre file below). The BAM stream to be extended is read
 * from standard input and the result is written to standard output as BAM.
 *
 * Header: the header text of the pre file without its sequence lines is
 * extended by the PG lines of the standard input header, the roots of which
 * are attached to the last id in the PG chain of the pre file. The SQ lines
 * of the standard input header are appended and the sort order is set to
 * "unknown".
 *
 * Alignments: the name of each input alignment is expected to start with a
 * decimal rank, terminated by '_' or the end of the name. The rank is the
 * zero based number of the corresponding record in the pre file. Alignments
 * without such a rank are written unchanged. For all others the pre file is
 * read forward up to the record with that rank, the auxiliary tags of that
 * record which the input alignment does not have are copied over and the
 * QC fail flag of the pre record is carried over. Depending on the keys
 * ranksplit, rankstrip, clipreinsert and zztoname the helpers split12(),
 * strip12(), clipReinsert() and zzToRank() are applied.
 *
 * Keys read from arginfo: level, verbose, ranksplit, rankstrip,
 * clipreinsert, zztoname, mod, tmpfile, md5, md5filename, index,
 * indexfilename.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws ::libmaus::exception::LibMausException if standard input or
 *         standard output is a terminal or if the pre file ends before the
 *         requested rank is reached
 */
int bam12auxmerge(::libmaus::util::ArgInfo const & arginfo)
{
    if ( isatty(STDIN_FILENO) )
    {
        ::libmaus::exception::LibMausException se;
        se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
        se.finish();
        throw se;
    }

    if ( isatty(STDOUT_FILENO) )
    {
        ::libmaus::exception::LibMausException se;
        se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
        se.finish();
        throw se;
    }

    std::string const prefilename = arginfo.getRestArg<std::string>(0);
    libmaus::bambam::BamDecoder bampredec(prefilename);

    int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
    int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
    int const ranksplit = arginfo.getValue<int>("ranksplit",getDefaultRankSplit());
    // NOTE(review): the default for rankstrip is taken from getDefaultRankSplit() - confirm this is intended
    int const rankstrip = arginfo.getValue<int>("rankstrip",getDefaultRankSplit());
    int const clipreinsert = arginfo.getValue<int>("clipreinsert",getDefaultClipReinsert());
    int const zztoname = arginfo.getValue<int>("zztoname",getDefaultZZToName());
    // verbosity interval; bmod is computed from mod by nextTwoPow so that bmask can be used as a bit mask
    uint64_t const mod = arginfo.getValue<int>("mod",getDefaultMod());
    uint64_t const bmod = libmaus::math::nextTwoPow(mod);
    uint64_t const bmask = bmod-1;

    ::libmaus::bambam::BamDecoder bamdec(std::cin,false);
    ::libmaus::bambam::BamHeader const & header = bamdec.getHeader();
    ::libmaus::bambam::BamHeader const & preheader = bampredec.getHeader();

    std::string const headertext(header.text);
    std::string const preheadertext(libmaus::bambam::HeaderLine::removeSequenceLines(preheader.text));

    libmaus::bambam::ProgramHeaderLineSet headerlines(headertext);
    libmaus::bambam::ProgramHeaderLineSet preheaderlines(preheadertext);

    std::vector<libmaus::bambam::HeaderLine> allheaderlines = libmaus::bambam::HeaderLine::extractLines(headertext);

    std::string const lastid = preheaderlines.getLastIdInChain();

    // stack of (PG line index, id of parent program) still to be added;
    // the roots of the input PG lines are attached to the end of the pre file chain
    std::stack < std::pair<uint64_t,std::string> > pgtodo;
    for ( uint64_t i = 0; i < headerlines.roots.size(); ++i )
        pgtodo.push(std::pair<uint64_t,std::string>(headerlines.roots[i],lastid));

    std::string upheadtext = preheadertext;
    while ( pgtodo.size() )
    {
        uint64_t const hid = pgtodo.top().first;
        std::string const PP = pgtodo.top().second;
        pgtodo.pop();
        libmaus::bambam::HeaderLine const & line = headerlines.lines[hid];

        // ID, PP, PN, CL, VN
        // ID is not const as it is passed to addProgramLineRef and used as parent id for the children below
        std::string       ID = (line.M.find("ID") != line.M.end()) ? line.M.find("ID")->second : "";
        std::string const PN = (line.M.find("PN") != line.M.end()) ? line.M.find("PN")->second : "";
        std::string const CL = (line.M.find("CL") != line.M.end()) ? line.M.find("CL")->second : "";
        std::string const VN = (line.M.find("VN") != line.M.end()) ? line.M.find("VN")->second : "";

        upheadtext = ::libmaus::bambam::ProgramHeaderLineSet::addProgramLineRef(
                         upheadtext,
                         ID,
                         PN,
                         CL,
                         PP,
                         VN
                     );

        // schedule the children of this PG line with this line as parent
        if ( headerlines.edges.find(hid) != headerlines.edges.end() )
        {
            std::vector<uint64_t> const & children = headerlines.edges.find(hid)->second;

            for ( uint64_t j = 0; j < children.size(); ++j )
                pgtodo.push(std::pair<uint64_t,std::string>(children[j],ID));
        }
    }

    /* copy SQ lines */
    std::ostringstream sqconcstr;
    sqconcstr << upheadtext;
    for ( uint64_t i = 0; i < allheaderlines.size(); ++i )
        if ( allheaderlines[i].type == "SQ" )
            sqconcstr << allheaderlines[i].line << "\n";
    upheadtext = sqconcstr.str();

    ::libmaus::bambam::BamHeader uphead(upheadtext);
    uphead.changeSortOrder("unknown");

    /*
     * start index/md5 callbacks
     */
    std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
    std::string const tmpfileindex = tmpfilenamebase + "_index";
    ::libmaus::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

    std::string md5filename;
    std::string indexfilename;

    // callbacks handed to the writer; the objects are owned by Pmd5cb and Pindex
    std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > cbs;
    ::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
    if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
    {
        if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
            md5filename = arginfo.getUnparsedValue("md5filename","");
        else
            std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

        if ( md5filename.size() )
        {
            ::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
            Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
            cbs.push_back(Pmd5cb.get());
        }
    }
    libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
    if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
    {
        if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
            indexfilename = arginfo.getUnparsedValue("indexfilename","");
        else
            std::cerr << "[V] no filename for index given, not creating index" << std::endl;

        if ( indexfilename.size() )
        {
            libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
            Pindex = UNIQUE_PTR_MOVE(Tindex);
            cbs.push_back(Pindex.get());
        }
    }
    // the writer receives a null pointer if no callback was requested
    std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
    if ( cbs.size() )
        Pcbs = &cbs;
    /*
     * end md5/index callbacks
     */

    ::libmaus::bambam::BamWriter::unique_ptr_type writer(new ::libmaus::bambam::BamWriter(std::cout,uphead,level,Pcbs));

    ::libmaus::bambam::BamAlignment & algn = bamdec.getAlignment();
    ::libmaus::bambam::BamAlignment & prealgn = bampredec.getAlignment();
    // rank of the record currently held in prealgn, -1 before the first read
    int64_t curid = -1;

    libmaus::autoarray::AutoArray< std::pair<uint8_t,uint8_t> > auxpre;
    libmaus::autoarray::AutoArray< std::pair<uint8_t,uint8_t> > auxnew;

    libmaus::bambam::BamAuxFilterVector auxfilter;

    // helpers for clipReinsert
    libmaus::autoarray::AutoArray < std::pair<uint8_t,uint8_t> > auxtags;
    libmaus::autoarray::AutoArray<libmaus::bambam::cigar_operation> cigop;
    std::stack < libmaus::bambam::cigar_operation > hardstack;
    libmaus::bambam::BamAlignment::D_array_type Tcigar;
    libmaus::bambam::BamAuxFilterVector bafv;
    libmaus::bambam::BamAuxFilterVector auxfilterout;
    auxfilterout.set('q','s');
    auxfilterout.set('q','q');

    // helpers for zztoname
    libmaus::bambam::BamAuxFilterVector zzbafv;
    zzbafv.set('z','z');

    // loop over aligned BAM file
    while ( bamdec.readAlignment() )
    {
        if ( ranksplit )
            split12(algn);

        // extract rank
        char const * name = algn.getName();
        char const * u1 = name;
        bool ok = true;
        uint64_t rank = 0;
        while ( *u1 && *u1 != '_' )
        {
            // isdigit needs a value representable as unsigned char; a plain
            // char may be negative, which is undefined behaviour
            ok = ok && isdigit(static_cast<unsigned char>(*u1));
            rank *= 10;
            rank += (*u1-'0');
            ++u1;
        }
        // a name without any character in front of the separator carries no rank
        ok = ok && (u1 != name);

        // unable to find rank?	write out as is and continue
        if ( ! ok )
        {
            algn.serialise(writer->getStream());
            continue;
        }

        // loop over unaligned BAM file
        while ( curid != static_cast<int64_t>(rank) )
        {
            bool const a_ok = bampredec.readAlignment();

            if ( ! a_ok )
            {
                libmaus::exception::LibMausException se;
                se.getStream() << "Found unexpected EOF on file " << prefilename << std::endl;
                se.finish();
                throw se;
            }
            assert ( a_ok );
            ++curid;

            if ( verbose && (! (curid & bmask)) )
                std::cerr << "[V] " << (curid / bmod) << std::endl;
        }

        if ( verbose > 1 )
            std::cerr << "Merging:\n" << algn.formatAlignment(header) << "\n" << prealgn.formatAlignment(preheader) << std::endl;

        // sorted lists of the aux tags present in the two records
        uint64_t pretagnum = prealgn.enumerateAuxTags(auxpre);
        uint64_t newtagnum = algn.enumerateAuxTags(auxnew);

        std::sort(auxpre.begin(),auxpre.begin()+pretagnum);
        std::sort(auxnew.begin(),auxnew.begin()+newtagnum);

        if ( verbose > 1 )
            std::cerr << "pretagnum=" << pretagnum << " newtagnum=" << newtagnum << std::endl;

        // merge the two sorted lists in place: auxpre keeps only the tags
        // which do not appear in auxnew, auxnew keeps all of its tags
        std::pair<uint8_t,uint8_t> * prec = auxpre.begin();
        std::pair<uint8_t,uint8_t> * pree = prec + pretagnum;
        std::pair<uint8_t,uint8_t> * preo = prec;

        std::pair<uint8_t,uint8_t> * newc = auxnew.begin();
        std::pair<uint8_t,uint8_t> * newe = newc + newtagnum;
        std::pair<uint8_t,uint8_t> * newo = newc;

        while ( prec != pree && newc != newe )
        {
            // pre which is not in new
            if ( *prec < *newc )
            {
                *(preo++) = *(prec++);
            }
            // tag in both, drop pre
            else if ( *prec == *newc )
            {
                *(newo++) = *(newc++);
                prec++;
            }
            // new not in pre
            else
            {
                *(newo++) = *(newc++);
            }
        }

        while ( prec != pree )
            *(preo++) = *(prec++);
        while ( newc != newe )
            *(newo++) = *(newc++);

        pretagnum = preo-auxpre.begin();
        newtagnum = newo-auxnew.begin();

        // mark the remaining pre tags in the filter, copy them, then unmark
        // them again so that the filter is empty for the next record
        for ( uint64_t i = 0; i < pretagnum; ++i )
            auxfilter.set(auxpre[i].first,auxpre[i].second);

        algn.copyAuxTags(prealgn, auxfilter);

        for ( uint64_t i = 0; i < pretagnum; ++i )
            auxfilter.clear(auxpre[i].first,auxpre[i].second);

        if ( verbose > 1 )
        {
            std::cerr << "pretagnum=" << pretagnum << " newtagnum=" << newtagnum << std::endl;
            std::cerr << "result: " << algn.formatAlignment(header) << std::endl;
        }

        // copy QC fail flag from original file to aligner output
        if ( prealgn.isQCFail() )
            algn.putFlags( algn.getFlags() | libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_FQCFAIL );

        if ( rankstrip )
            strip12(algn);

        if ( clipreinsert )
            clipReinsert(algn,auxtags,bafv,cigop,Tcigar,hardstack,auxfilterout);

        if ( zztoname )
            zzToRank(algn,zzbafv);

        algn.serialise(writer->getStream());
    }

    // destroy the writer before the callbacks are asked for their results
    writer.reset();

    if ( Pmd5cb )
    {
        Pmd5cb->saveDigestAsFile(md5filename);
    }
    if ( Pindex )
    {
        Pindex->flush(std::string(indexfilename));
    }

    return EXIT_SUCCESS;
}
コード例 #14
0
ファイル: bamsort.cpp プロジェクト: dozy/biobambam
int bamsort(::libmaus::util::ArgInfo const & arginfo)
{
	::libmaus::util::TempFileRemovalContainer::setup();
	
	bool const inputisstdin = (!arginfo.hasArg("I")) || (arginfo.getUnparsedValue("I","-") == "-");
	bool const outputisstdout = (!arginfo.hasArg("O")) || (arginfo.getUnparsedValue("O","-") == "-");

	if ( isatty(STDIN_FILENO) && inputisstdin && (arginfo.getValue<std::string>("inputformat","bam") != "sam") )
	{
		::libmaus::exception::LibMausException se;
		se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	if ( isatty(STDOUT_FILENO) && outputisstdout && (arginfo.getValue<std::string>("outputformat","bam") != "sam") )
	{
		::libmaus::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	bool const disablevalidation = arginfo.getValue<int>("disablevalidation",getDefaultDisableValidation());

	std::string const inputformat = arginfo.getUnparsedValue("inputformat",getDefaultInputFormat());
	int const level = arginfo.getValue<int>("level",getDefaultLevel());
	switch ( level )
	{
		case Z_NO_COMPRESSION:
		case Z_BEST_SPEED:
		case Z_BEST_COMPRESSION:
		case Z_DEFAULT_COMPRESSION:
			break;
		default:
		{
			::libmaus::exception::LibMausException se;
			se.getStream()
				<< "Unknown compression level, please use"
				<< " level=" << Z_DEFAULT_COMPRESSION << " (default) or"
				<< " level=" << Z_BEST_SPEED << " (fast) or"
				<< " level=" << Z_BEST_COMPRESSION << " (best) or"
				<< " level=" << Z_NO_COMPRESSION << " (no compression)" << std::endl;
			se.finish();
			throw se;
		}
			break;
	}

	// prefix for tmp files
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfilenameout = tmpfilenamebase + "_bamsort";
	::libmaus::util::TempFileRemovalContainer::addTempFile(tmpfilenameout);
	uint64_t blockmem = arginfo.getValue<uint64_t>("blockmb",getDefaultBlockSize())*1024*1024;
	std::string const sortorder = arginfo.getValue<std::string>("SO","coordinate");
	bool const fixmates = arginfo.getValue<int>("fixmates",getDefaultFixMates());
	uint64_t sortthreads = arginfo.getValue<uint64_t>("sortthreads",getDefaultSortThreads());

	// input decoder wrapper
	libmaus::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus::bambam::BamMultiAlignmentDecoderFactory::construct(
			arginfo,false // put rank
		)
	);
	::libmaus::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus::bambam::BamAlignmentDecoder & dec = *ppdec;
	if ( disablevalidation )
		dec.disableValidation();
	::libmaus::bambam::BamHeader const & header = dec.getHeader();

	std::string const headertext(header.text);

	// add PG line to header
	std::string const upheadtext = ::libmaus::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamsort", // ID
		"bamsort", // PN
		arginfo.commandline, // CL
		::libmaus::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN			
	);
	// construct new header
	::libmaus::bambam::BamHeader uphead(upheadtext);

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */
	if ( sortorder != "queryname" )
		uphead.changeSortOrder("coordinate");
	else
		uphead.changeSortOrder("queryname");

	libmaus::bambam::BamBlockWriterBase::unique_ptr_type Pout ( libmaus::bambam::BamBlockWriterBaseFactory::construct(uphead, arginfo, Pcbs) );

	if ( fixmates )
	{
		if ( sortorder != "queryname" )
		{
			::libmaus::bambam::BamEntryContainer< ::libmaus::bambam::BamAlignmentPosComparator > 
				BEC(blockmem,tmpfilenameout,sortthreads);

			if ( verbose )
				std::cerr << "[V] Reading alignments from source." << std::endl;
			uint64_t incnt = 0;

			// current alignment
			libmaus::bambam::BamAlignment & curalgn = dec.getAlignment();
			// previous alignment
			libmaus::bambam::BamAlignment prevalgn;
			// previous alignment valid
			bool prevalgnvalid = false;
			// MQ field filter
			libmaus::bambam::BamAuxFilterVector MQfilter;
			MQfilter.set("MQ");
			
			while ( dec.readAlignment() )
			{
				if ( curalgn.isSecondary() || curalgn.isSupplementary() )
				{
					BEC.putAlignment(curalgn);
				}
				else if ( prevalgnvalid )
				{
					// different name
					if ( strcmp(curalgn.getName(),prevalgn.getName()) )
					{
						BEC.putAlignment(prevalgn);
						curalgn.swap(prevalgn);
					}
					// same name
					else
					{
						libmaus::bambam::BamAlignment::fixMateInformation(prevalgn,curalgn,MQfilter);
						BEC.putAlignment(prevalgn);
						BEC.putAlignment(curalgn);
						prevalgnvalid = false;
					}
				}
				else
				{
					prevalgn.swap(curalgn);
					prevalgnvalid = true;
				}
				
				if ( verbose && ( ( ++incnt & ((1ull<<20)-1) ) == 0 ) )
					std::cerr << "[V] " << incnt << std::endl;
			}
			
			if ( prevalgnvalid )
			{
				BEC.putAlignment(prevalgn);
				prevalgnvalid = false;
			}

			if ( verbose )
				std::cerr << "[V] read " << incnt << " alignments" << std::endl;

			// BEC.createOutput(std::cout, uphead, level, verbose, Pcbs);
			BEC.createOutput(*Pout, verbose);
		}
		else
		{
			::libmaus::bambam::BamEntryContainer< ::libmaus::bambam::BamAlignmentNameComparator > 
				BEC(blockmem,tmpfilenameout,sortthreads);
			
			if ( verbose )
				std::cerr << "[V] Reading alignments from source." << std::endl;
			uint64_t incnt = 0;
			
			// current alignment
			libmaus::bambam::BamAlignment & curalgn = dec.getAlignment();
			// previous alignment
			libmaus::bambam::BamAlignment prevalgn;
			// previous alignment valid
			bool prevalgnvalid = false;
			// MQ field filter
			libmaus::bambam::BamAuxFilterVector MQfilter;
			MQfilter.set("MQ");
			
			while ( dec.readAlignment() )
			{
				if ( curalgn.isSecondary() || curalgn.isSupplementary() )
				{
					BEC.putAlignment(curalgn);
				}
				else if ( prevalgnvalid )
				{
					// different name
					if ( strcmp(curalgn.getName(),prevalgn.getName()) )
					{
						BEC.putAlignment(prevalgn);
						curalgn.swap(prevalgn);
					}
					// same name
					else
					{
						libmaus::bambam::BamAlignment::fixMateInformation(prevalgn,curalgn,MQfilter);
						BEC.putAlignment(prevalgn);
						BEC.putAlignment(curalgn);
						prevalgnvalid = false;
					}
				}
				else
				{
					prevalgn.swap(curalgn);
					prevalgnvalid = true;
				}
				
				if ( verbose && ( ( ++incnt & ((1ull<<20)-1) ) == 0 ) )
					std::cerr << "[V] " << incnt << std::endl;
			}
			
			if ( prevalgnvalid )
			{
				BEC.putAlignment(prevalgn);
				prevalgnvalid = false;
			}
			
			if ( verbose )
				std::cerr << "[V] read " << incnt << " alignments" << std::endl;

			// BEC.createOutput(std::cout, uphead, level, verbose, Pcbs);
			BEC.createOutput(*Pout, verbose);
		}
	}
	else
	{
		if ( sortorder != "queryname" )
		{
			::libmaus::bambam::BamEntryContainer< ::libmaus::bambam::BamAlignmentPosComparator > BEC(blockmem,tmpfilenameout,sortthreads);

			if ( verbose )
				std::cerr << "[V] Reading alignments from source." << std::endl;
			uint64_t incnt = 0;
			
			while ( dec.readAlignment() )
			{
				BEC.putAlignment(dec.getAlignment());
				incnt++;
				if ( verbose && (incnt % (1024*1024) == 0) )
					std::cerr << "[V] " << incnt/(1024*1024) << "M" << std::endl;
			}

			if ( verbose )
				std::cerr << "[V] read " << incnt << " alignments" << std::endl;

			// BEC.createOutput(std::cout, uphead, level, verbose, Pcbs);
			BEC.createOutput(*Pout, verbose);
		}
		else
		{
			::libmaus::bambam::BamEntryContainer< ::libmaus::bambam::BamAlignmentNameComparator > BEC(blockmem,tmpfilenameout,sortthreads);
			
			if ( verbose )
				std::cerr << "[V] Reading alignments from source." << std::endl;
			uint64_t incnt = 0;
			
			while ( dec.readAlignment() )
			{
				BEC.putAlignment(dec.getAlignment());
				incnt++;
				if ( verbose && (incnt % (1024*1024) == 0) )
					std::cerr << "[V] " << incnt/(1024*1024) << "M" << std::endl;
			}
			
			if ( verbose )
				std::cerr << "[V] read " << incnt << " alignments" << std::endl;

			// BEC.createOutput(std::cout, uphead, level, verbose, Pcbs);
			BEC.createOutput(*Pout, verbose);
		}
	}

	// flush encoder so callbacks see all output data
	Pout.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}