Beispiel #1
0
int bamsplit(libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	uint64_t const n = arginfo.getValue<int>("n",getDefaultN());
	std::string const prefix = arginfo.getUnparsedValue("prefix",getDefaultFilePrefix(arginfo));

	libmaus2::bambam::BamDecoder bamdec(std::cin);
	libmaus2::bambam::BamAlignment const & algn = bamdec.getAlignment();
	libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

	libmaus2::aio::OutputStreamInstance::unique_ptr_type COS;
	libmaus2::bambam::BamWriter::unique_ptr_type writer;

	uint64_t c = 0;
	uint64_t f = 0;
	while ( bamdec.readAlignment() )
	{
		if ( c++ % n == 0 )
		{
			writer.reset();
			if ( COS )
				COS->flush();
			COS.reset();

			std::ostringstream fnostr;
			fnostr << prefix << "_" << std::setw(6) << std::setfill('0') << f++ << std::setw(0) << ".bam";
			std::string const fn = fnostr.str();

			libmaus2::aio::OutputStreamInstance::unique_ptr_type tCOS(new libmaus2::aio::OutputStreamInstance(fn));
			COS = UNIQUE_PTR_MOVE(tCOS);

			libmaus2::bambam::BamWriter::unique_ptr_type twriter(new libmaus2::bambam::BamWriter(*COS,*uphead,level));
			writer = UNIQUE_PTR_MOVE(twriter);

			if ( verbose )
				std::cerr << "[V] opened file " << fn << std::endl;
		}

		algn.serialise(writer->getStream());
	}

	writer.reset();
	if ( COS )
		COS->flush();
	COS.reset();

	return EXIT_SUCCESS;
}
Beispiel #2
0
int bamexplode(libmaus2::util::ArgInfo const & arginfo)
{
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type Preader(libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));

	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pwriter;

	libmaus2::bambam::BamAlignmentDecoder & decoder = Preader->getDecoder();
	libmaus2::bambam::BamHeader const & header = decoder.getHeader();
	libmaus2::bambam::BamAlignment const & algn = decoder.getAlignment();
	uint64_t nextfn = 0;
	uint64_t written = std::numeric_limits<uint64_t>::max();
	int32_t prevrefid = std::numeric_limits<int32_t>::max();
	std::string const outputformat = arginfo.getUnparsedValue("outputformat",libmaus2::bambam::BamBlockWriterBaseFactory::getDefaultOutputFormat());
	std::string const prefix = arginfo.getUnparsedValue("prefix",getDefaultPrefix());
	uint64_t const thres = arginfo.getValueUnsignedNumeric("sizethres",getDefaultSizeThres());

	while ( decoder.readAlignment() )
	{
		int32_t const refid = algn.getRefID();

		if ( refid != prevrefid && written > thres )
		{
			Pwriter.reset();
			libmaus2::util::ArgInfo argcopy(arginfo);
			std::ostringstream fnostr;
			fnostr << prefix << std::setw(6) << std::setfill('0') << nextfn++ << std::setw(0) << "." << outputformat;
			argcopy.replaceKey("O",fnostr.str());
			libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Twriter(libmaus2::bambam::BamBlockWriterBaseFactory::construct(header,argcopy));
			Pwriter = UNIQUE_PTR_MOVE(Twriter);
			written = 0;
		}

		Pwriter->writeAlignment(algn);

		prevrefid = refid;
		written ++;
	}

	Pwriter.reset();

	return EXIT_SUCCESS;
}
uint64_t bamrecompress(libmaus2::util::ArgInfo const & arginfo)
{
	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const numthreads = std::max(1,arginfo.getValue<int>("numthreads",getDefaultNumThreads()));

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	/*
	 * end md5/index callbacks
	 */

	libmaus2::lz::BgzfInflateDeflateParallel::unique_ptr_type BIDP(new libmaus2::lz::BgzfInflateDeflateParallel(std::cin,std::cout,level,numthreads,4*numthreads));

	for ( uint64_t i = 0; i < cbs.size(); ++i )
		BIDP->registerBlockOutputCallback(cbs[i]);

	libmaus2::autoarray::AutoArray<char> B(64*1024,false);
	int r;
	uint64_t t = 0;
	uint64_t last = std::numeric_limits<uint64_t>::max();
	uint64_t lcnt = 0;
	uint64_t const mod = 64*1024*1024;
	libmaus2::timing::RealTimeClock rtc; rtc.start();
	libmaus2::timing::RealTimeClock lrtc; lrtc.start();

	while ( (r = BIDP->read(B.begin(),B.size())) )
	{
		BIDP->write(B.begin(),r);

		lcnt += r;
		t += r;

		if ( t/mod != last/mod )
		{
			if ( verbose )
			{
				if ( isatty(STDERR_FILENO) )
					std::cerr
						<< "\r" << std::string(60,' ') << "\r";

				std::cerr
						<< rtc.formatTime(rtc.getElapsedSeconds()) << " " << t/(1024*1024) << "MB, " << (lcnt/lrtc.getElapsedSeconds())/(1024.0*1024.0) << "MB/s";

				if ( isatty(STDERR_FILENO) )
					std::cerr << std::flush;
				else
					std::cerr << std::endl;
			}

			lrtc.start();
			last = t;
			lcnt = 0;
		}
	}

	if ( verbose )
	{
		if ( isatty(STDERR_FILENO) )
			std::cerr
				<< "\r" << std::string(60,' ') << "\r";

		std::cerr
				<< rtc.formatTime(rtc.getElapsedSeconds()) << " " << t/(1024*1024) << "MB, " << (t/rtc.getElapsedSeconds())/(1024.0*1024.0) << "MB/s";

		std::cerr << std::endl;
	}

	BIDP.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return 0;
}
int bam12auxmerge(::libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	std::string const prefilename = arginfo.getRestArg<std::string>(0);
	libmaus2::bambam::BamDecoder bampredec(prefilename);

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const ranksplit = arginfo.getValue<int>("ranksplit",getDefaultRankSplit());
	int const rankstrip = arginfo.getValue<int>("rankstrip",getDefaultRankSplit());
	int const clipreinsert = arginfo.getValue<int>("clipreinsert",getDefaultClipReinsert());
	int const zztoname = arginfo.getValue<int>("zztoname",getDefaultZZToName());
	int const sanity = arginfo.getValue<int>("sanity",getDefaultSanity());
	uint64_t const mod = arginfo.getValue<int>("mod",getDefaultMod());
	uint64_t const bmod = libmaus2::math::nextTwoPow(mod);
	uint64_t const bmask = bmod-1;

	libmaus2::autoarray::AutoArray<char> Aread;

	::libmaus2::bambam::BamDecoder bamdec(std::cin,false);
	::libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader const & preheader = bampredec.getHeader();

	std::string const headertext(header.text);
	std::string const preheadertext(libmaus2::bambam::HeaderLine::removeSequenceLines(preheader.text));

	libmaus2::bambam::ProgramHeaderLineSet headerlines(headertext);
	libmaus2::bambam::ProgramHeaderLineSet preheaderlines(preheadertext);

	std::vector<libmaus2::bambam::HeaderLine> allheaderlines = libmaus2::bambam::HeaderLine::extractLines(headertext);

	std::string const lastid = preheaderlines.getLastIdInChain();

	std::stack < std::pair<uint64_t,std::string> > pgtodo;
	for ( uint64_t i = 0; i < headerlines.roots.size(); ++i )
		pgtodo.push(std::pair<uint64_t,std::string>(headerlines.roots[i],lastid));

	std::string upheadtext = preheadertext;
	while ( pgtodo.size() )
	{
		uint64_t const hid = pgtodo.top().first;
		std::string const PP = pgtodo.top().second;
		pgtodo.pop();
		libmaus2::bambam::HeaderLine const & line = headerlines.lines[hid];

		// ID, PP, PN, CL, VN
		std::string       ID = (line.M.find("ID") != line.M.end()) ? line.M.find("ID")->second : "";
		std::string const PN = (line.M.find("PN") != line.M.end()) ? line.M.find("PN")->second : "";
		std::string const CL = (line.M.find("CL") != line.M.end()) ? line.M.find("CL")->second : "";
		std::string const VN = (line.M.find("VN") != line.M.end()) ? line.M.find("VN")->second : "";

		upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLineRef(
			upheadtext,
			ID,
			PN,
			CL,
			PP,
			VN
		);

		if ( headerlines.edges.find(hid) != headerlines.edges.end() )
		{
			std::vector<uint64_t> const & children = headerlines.edges.find(hid)->second;

			for ( uint64_t j = 0; j < children.size(); ++j )
				pgtodo.push(std::pair<uint64_t,std::string>(children[j],ID));
		}
	}

	/* copy SQ lines */
	std::ostringstream sqconcstr;
	sqconcstr << upheadtext;
	for ( uint64_t i = 0; i < allheaderlines.size(); ++i )
		if ( allheaderlines[i].type == "SQ" )
			sqconcstr << allheaderlines[i].line << "\n";
	upheadtext = sqconcstr.str();

	::libmaus2::bambam::BamHeader uphead(upheadtext);
	uphead.changeSortOrder("unknown");

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));

	::libmaus2::bambam::BamAlignment & algn = bamdec.getAlignment();
	::libmaus2::bambam::BamAlignment & prealgn = bampredec.getAlignment();
	int64_t curid = -1;

	libmaus2::autoarray::AutoArray< std::pair<uint8_t,uint8_t> > auxpre;
	libmaus2::autoarray::AutoArray< std::pair<uint8_t,uint8_t> > auxnew;

	libmaus2::bambam::BamAuxFilterVector auxfilter;

	// helpers for clipReinsert
	libmaus2::autoarray::AutoArray < std::pair<uint8_t,uint8_t> > auxtags;
	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop;
	std::stack < libmaus2::bambam::cigar_operation > hardstack;
	libmaus2::bambam::BamAlignment::D_array_type Tcigar;
	libmaus2::bambam::BamAuxFilterVector bafv;
	libmaus2::bambam::BamAuxFilterVector auxfilterout;
	auxfilterout.set('q','s');
	auxfilterout.set('q','q');

	// helpers for zztoname
 	libmaus2::bambam::BamAuxFilterVector zzbafv;
 	zzbafv.set('z','z');

    	// tag filters for secondary/supplementary reads
	libmaus2::bambam::BamAuxFilterVector auxfiltersec;

	auxfiltersec.set('q','s');
	auxfiltersec.set('q','q');
	auxfiltersec.set('a','s');
	auxfiltersec.set('a','h');
	auxfiltersec.set('a','a');
	auxfiltersec.set('a','f');
	auxfiltersec.set('a','r');
	auxfiltersec.set('a','3');

	// loop over aligned BAM file
	while ( bamdec.readAlignment() )
	{
		if ( ranksplit )
			split12(algn);

		// extract rank
		char const * name = algn.getName();
		char const * u1 = name;
		bool ok = true;
		uint64_t rank = 0;
		while ( *u1 && *u1 != '_' )
		{
			rank *= 10;
			rank += (*u1-'0');
			ok = ok && isdigit(*u1);
			++u1;
		}

		// unable to find rank?	write out as is and continue
		if ( ! ok )
		{
			algn.serialise(writer->getStream());
			continue;
		}

		// loop over unaligned BAM file
		while ( curid != static_cast<int64_t>(rank) )
		{
			bool const a_ok = bampredec.readAlignment();

			if ( ! a_ok )
			{
				libmaus2::exception::LibMausException se;
				se.getStream() << "Found unexpected EOF on file " << prefilename << std::endl;
				se.finish();
				throw se;
			}
			assert ( a_ok );
			++curid;

			if ( verbose && (! (curid & bmask)) )
				std::cerr << "[V] " << (curid / bmod) << std::endl;
		}

		if ( verbose > 1 )
			std::cerr << "Merging:\n" << algn.formatAlignment(header) << "\n" << prealgn.formatAlignment(preheader) << std::endl;

		uint64_t pretagnum = prealgn.enumerateAuxTags(auxpre);
		uint64_t newtagnum = algn.enumerateAuxTags(auxnew);

		// do some sanity checking
		if ( sanity )
		{
		    	// first do a name check
			char const * prename = prealgn.getName();
			u1++; // put on the first letter of readname

			if ( verbose > 1 )
    	    	    	    	std::cerr << "Sanity: comparing " << name << " and " << prename << std::endl;

			if ( !is_suffix(prename, u1) ) // names do not match
			{
			    	libmaus2::exception::LibMausException se;
			    	se.getStream() << "Sanity check failed on read names, found " << name << " and " << prename << std::endl;
				se.finish();
				throw se;
			}

			// now the names match so try the flags

			if ( !(algn.isPaired() == prealgn.isPaired() &&
			     algn.isRead1() == prealgn.isRead1() &&
			     algn.isRead2() == prealgn.isRead2()) )
			{
			    	libmaus2::exception::LibMausException se;
				se.getStream() << "Sanity check failed on flags, " << std::endl
				    	       << "Aligned " << name << " paired " << algn.isPaired() << " first " << algn.isRead1() << " last " << algn.isRead2() << std::endl
			    	    	       << "Unaligned " << prename << " paired " << prealgn.isPaired() << " first " << prealgn.isRead1() << " last " << prealgn.isRead2() << std::endl;
				se.finish();
				throw se;
			}

			if ( verbose > 1 )
			    std::cerr << "Sanity check on flags: " << std::endl
				    	       << "Aligned " << name << " paired " << algn.isPaired() << " first " << algn.isRead1() << " last " << algn.isRead2() << std::endl
			    	    	       << "Unaligned " << prename << " paired " << prealgn.isPaired() << " first " << prealgn.isRead1() << " last " << prealgn.isRead2() << std::endl;


		}

		std::sort(auxpre.begin(),auxpre.begin()+pretagnum);
		std::sort(auxnew.begin(),auxnew.begin()+newtagnum);

		if ( verbose > 1 )
			std::cerr << "pretagnum=" << pretagnum << " newtagnum=" << newtagnum << std::endl;

		std::pair<uint8_t,uint8_t> * prec = auxpre.begin();
		std::pair<uint8_t,uint8_t> * pree = prec + pretagnum;
		std::pair<uint8_t,uint8_t> * preo = prec;

		std::pair<uint8_t,uint8_t> * newc = auxnew.begin();
		std::pair<uint8_t,uint8_t> * newe = newc + newtagnum;
		std::pair<uint8_t,uint8_t> * newo = newc;

		while ( prec != pree && newc != newe )
		{
			// pre which is not in new
			if ( *prec < *newc )
			{
				*(preo++) = *(prec++);
			}
			// tag in both, drop pre
			else if ( *prec == *newc )
			{
				*(newo++) = *(newc++);
				prec++;
			}
			// new not in pre
			else
			{
				*(newo++) = *(newc++);
			}
		}

		while ( prec != pree )
			*(preo++) = *(prec++);
		while ( newc != newe )
			*(newo++) = *(newc++);

		pretagnum = preo-auxpre.begin();
		newtagnum = newo-auxnew.begin();

		for ( uint64_t i = 0; i < pretagnum; ++i )
			auxfilter.set(auxpre[i].first,auxpre[i].second);

		algn.copyAuxTags(prealgn, auxfilter);

		for ( uint64_t i = 0; i < pretagnum; ++i )
			auxfilter.clear(auxpre[i].first,auxpre[i].second);

		if ( verbose > 1 )
		{
			std::cerr << "pretagnum=" << pretagnum << " newtagnum=" << newtagnum << std::endl;
			std::cerr << "result: " << algn.formatAlignment(header) << std::endl;
		}


		if ( algn.isSecondary() || algn.isSupplementary() )
		{
		    	// adding adapter clip data to secondary/supplementary reads
			// can lead to incorrect clip reinserts so remove these tags

			algn.filterOutAux(auxfiltersec);
		}


		// copy QC fail flag from original file to aligner output
		if ( prealgn.isQCFail() )
			algn.putFlags( algn.getFlags() | libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_FQCFAIL );

		if ( rankstrip )
			strip12(algn);

		if ( clipreinsert )
			clipReinsert(algn,auxtags,bafv,cigop,Tcigar,hardstack,auxfilterout);

		if ( zztoname )
			zzToRank(algn,zzbafv);

		algn.serialise(writer->getStream());
	}

	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Beispiel #5
0
int bamreset(::libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}
	
	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const resetsortorder = arginfo.getValue<int>("resetsortorder",getDefaultResetSortOrder());
	
	::libmaus2::bambam::BamDecoder dec(std::cin,false);
	::libmaus2::bambam::BamHeader const & header = dec.getHeader();

	std::string headertext = header.text;

	// no replacement header file given
	if ( ! arginfo.hasArg("resetheadertext") )
	{
		// remove SQ lines
		std::vector<libmaus2::bambam::HeaderLine> allheaderlines = libmaus2::bambam::HeaderLine::extractLines(headertext);

		std::ostringstream upheadstr;
		for ( uint64_t i = 0; i < allheaderlines.size(); ++i )
			if ( allheaderlines[i].type != "SQ" )
				upheadstr << allheaderlines[i].line << std::endl;

		headertext = upheadstr.str();
	}
	// replace header given in file
	else
	{
		std::string const headerfilename = arginfo.getUnparsedValue("resetheadertext","");
		uint64_t const headerlen = libmaus2::util::GetFileSize::getFileSize(headerfilename);
		libmaus2::aio::CheckedInputStream CIS(headerfilename);
		libmaus2::autoarray::AutoArray<char> ctext(headerlen,false);
		CIS.read(ctext.begin(),headerlen);
		headertext = std::string(ctext.begin(),ctext.end());		
	}

	// add PG line to header
	headertext = libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamreset", // ID
		"bamreset", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN			
	);
	
	// construct new header
	libmaus2::bambam::BamHeader uphead(headertext);
	if ( resetsortorder )
		uphead.changeSortOrder("unknown");

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);
	uint32_t const excludeflags = libmaus2::bambam::BamFlagBase::stringToFlags(
		arginfo.getValue<std::string>("exclude",getDefaultExcludeFlags()));

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));
 	libmaus2::timing::RealTimeClock rtc; rtc.start();
 	
	libmaus2::bambam::BamAlignment & algn = dec.getAlignment();
	uint64_t c = 0;

	bool const resetaux = arginfo.getValue<int>("resetaux",getDefaultResetAux());
	libmaus2::bambam::BamAuxFilterVector::unique_ptr_type const prgfilter(libmaus2::bambam::BamAuxFilterVector::parseAuxFilterList(arginfo));
	libmaus2::bambam::BamAuxFilterVector const * rgfilter = prgfilter.get();

	while ( dec.readAlignment() )
	{
		bool const keep = resetAlignment(algn,resetaux /* reset aux */,excludeflags,rgfilter);
		
		if ( keep )
			algn.serialise(writer->getStream());

		if ( verbose && (++c & (1024*1024-1)) == 0 )
 			std::cerr << "[V] " << c/(1024*1024) << " " << (c / rtc.getElapsedSeconds()) << std::endl;
	}
	
	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Beispiel #6
0
int bammaskflags(::libmaus2::util::ArgInfo const & arginfo)
{
	uint64_t const maskpos = arginfo.getValue<uint64_t>("maskpos",0xFFFFUL);
	uint64_t const maskneg = arginfo.getValue<uint64_t>("maskneg",getDefaultMaskNeg());
	uint64_t const mask = maskpos & (~maskneg);

	if ( mask )
	{
		std::cerr << "Keeping flags ";
		for ( uint64_t i = 1; i <= ::libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_FSUPPLEMENTARY; i <<= 1 )
			if ( mask & i )
				std::cerr << static_cast< ::libmaus2::bambam::BamFlagBase::bam_flags >(i) << ";";
		std::cerr << std::endl;
		std::cerr << "Erasing flags ";
		for ( uint64_t i = 1; i <= ::libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_FSUPPLEMENTARY; i <<= 1 )
			if ( !(mask & i) )
				std::cerr << static_cast< ::libmaus2::bambam::BamFlagBase::bam_flags >(i) << ";";
		std::cerr << std::endl;
	}
	else
	{
		std::cerr << "Erasing all flags." << std::endl;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const resetmatecoord = arginfo.getValue<int>("resetmatecoord",0);

	::libmaus2::bambam::BamDecoder BD(std::cin);
	::libmaus2::bambam::BamHeader const & bamheader = BD.getHeader();

	std::string const headertext(bamheader.text);

	// add PG line to header
	std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bammaskflags", // ID
		"bammaskflags", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);
	// construct new header
	::libmaus2::bambam::BamHeader uphead(upheadtext);

	::libmaus2::bambam::BamAlignment & alignment = BD.getAlignment();

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));

	while ( BD.readAlignment() )
	{
		alignment.putFlags(alignment.getFlags() & mask);
		if ( resetmatecoord )
		{
			alignment.putNextRefId(-1);
			alignment.putNextPos(-1);
		}
		alignment.serialise(writer->getStream());
	}

	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Beispiel #7
0
uint64_t bamheaderfilter(libmaus2::util::ArgInfo const & arginfo)
{
	std::string const inputfilename = arginfo.getUnparsedValue("I","");

	if ( ! inputfilename.size() || inputfilename == "-" )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "No input filename given, please set the I key appropriately." << std::endl;
		se.finish();
		throw se;
	}

	libmaus2::bitio::IndexedBitVector::unique_ptr_type usedrefseq;
	libmaus2::bitio::IndexedBitVector::unique_ptr_type usedrg;
	libmaus2::bambam::BamHeader::unique_ptr_type uheader;

	getUsedRefSeqs(arginfo,usedrefseq,usedrg,uheader);

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	/*
	 * end md5/index callbacks
	 */

	std::string headertext(uheader->text);
	std::vector<libmaus2::bambam::HeaderLine> hl = libmaus2::bambam::HeaderLine::extractLines(headertext);
	
	std::ostringstream headertextostr;
	uint64_t rscnt = 0;
	uint64_t rgcnt = 0;
	for ( uint64_t i = 0; i < hl.size(); ++i )
	{
		if ( hl[i].type == "SQ" )
		{
			if ( usedrefseq->get(rscnt) )
				headertextostr << hl[i].line << std::endl;

			rscnt += 1;
		}
		else if ( hl[i].type == "RG" )
		{
			if ( usedrg->get(rgcnt) )
				headertextostr << hl[i].line << std::endl;
			
			rgcnt += 1;
		}
		else
		{
			headertextostr << hl[i].line << std::endl;
		}
	}
	headertext = headertextostr.str();

	// add PG line to header
	std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamheaderfilter", // ID
		"bamheaderfilter", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN			
	);
	// construct new header
	::libmaus2::bambam::BamHeader uphead(upheadtext);
	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pout ( libmaus2::bambam::BamBlockWriterBaseFactory::construct(uphead, arginfo, &cbs) );

	// input decoder wrapper
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(
			arginfo,false // put rank
		)
	);
	::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus2::bambam::BamAlignment & algn = dec.getAlignment();
	
	while ( dec.readAlignment() )
	{
		if ( (!algn.isPaired()) && algn.isMapped() )
		{
			assert ( algn.getRefID() >= 0 );
			assert ( algn.getRefID() < static_cast<int64_t>(usedrefseq->size()) );
			assert ( usedrefseq->get(algn.getRefID()) );
			assert ( usedrefseq->rank1(algn.getRefID())-1 < uphead.getNumRef() );
			algn.putRefId(usedrefseq->rank1(algn.getRefID())-1);
		}
		if ( algn.isPaired() && algn.isMapped() )
		{
			assert ( algn.getRefID() >= 0 );
			assert ( algn.getRefID() < static_cast<int64_t>(usedrefseq->size()) );
			assert ( usedrefseq->get(algn.getRefID()) );
			assert ( usedrefseq->rank1(algn.getRefID())-1 < uphead.getNumRef() );
			algn.putRefId(usedrefseq->rank1(algn.getRefID())-1);
		}
		if ( algn.isPaired() && algn.isMateMapped() )
		{
			assert ( algn.getNextRefID() >= 0 );
			assert ( algn.getNextRefID() < static_cast<int64_t>(usedrefseq->size()) );
			assert ( usedrefseq->get(algn.getNextRefID()) );
			assert ( usedrefseq->rank1(algn.getNextRefID())-1 < uphead.getNumRef() );
			algn.putNextRefId(usedrefseq->rank1(algn.getNextRefID())-1);
		}
		
		// erase unmapped refid and pos
		if ( algn.isUnmap() )
		{
			algn.putRefId(-1);
			algn.putPos(-1);
		}
		if ( algn.isMateUnmap() )
		{
			algn.putNextRefId(-1);
			algn.putNextPos(-1);
		}

		Pout->writeAlignment(algn);
	}


	Pout.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}
	
	return 0;
}
Beispiel #8
0
int bamvalidateTemplate(::libmaus2::util::ArgInfo const & arginfo)
{
	libmaus2::timing::RealTimeClock rtc; rtc.start();
	bool const verbose = arginfo.getValue("verbose",getDefaultVerbose());
	bool const basequalhist = arginfo.getValue("basequalhist",getDefaultBaseQualHist());

	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(
			arginfo,false // put rank
		)
	);
	::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus2::bambam::BamHeader const & header = dec.getHeader();
	::libmaus2::bambam::BamAlignment const & algn = dec.getAlignment();

	// add PG line to header
	std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		header.text,
		"bamvalidate", // ID
		"bamvalidate", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(header.text).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);
	// construct new header
	::libmaus2::bambam::BamHeader uphead(upheadtext);

	/*
	 * start index/md5 callbacks and alignment writer
	 */
	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pout;

	if ( passthrough )
	{
		std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
		std::string const tmpfileindex = tmpfilenamebase + "_index";
		::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

		if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
		{
			if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
				md5filename = arginfo.getUnparsedValue("md5filename","");
			else
				std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

			if ( md5filename.size() )
			{
				::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
				Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
				cbs.push_back(Pmd5cb.get());
			}
		}
		if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
		{
			if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
				indexfilename = arginfo.getUnparsedValue("indexfilename","");
			else
				std::cerr << "[V] no filename for index given, not creating index" << std::endl;

			if ( indexfilename.size() )
			{
				libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
				Pindex = UNIQUE_PTR_MOVE(Tindex);
				cbs.push_back(Pindex.get());
			}
		}
		std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
		if ( cbs.size() )
			Pcbs = &cbs;


		libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Tout (
			libmaus2::bambam::BamBlockWriterBaseFactory::construct(uphead, arginfo, Pcbs)
		);
		Pout = UNIQUE_PTR_MOVE(Tout);
	}

	libmaus2::autoarray::AutoArray<char> lastvalidname(256); // max valid read name is 255 bytes
	uint64_t alsok = 0;

	::libmaus2::autoarray::AutoArray<char> qual;
	libmaus2::autoarray::AutoArray<uint64_t> H(static_cast<uint64_t>(std::numeric_limits<uint8_t>::max())+1);
	std::fill(H.begin(),H.end(),0ull);

	try
	{
		while ( dec.readAlignment() )
		{
			if ( passthrough )
				Pout->writeAlignment(algn);

			if ( basequalhist )
			{
				uint64_t const l = algn.getLseq();
				uint8_t const * Qc = libmaus2::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin());
				uint8_t const * const Qe = Qc + l;

				while ( Qc != Qe )
					H[*(Qc++)]++;
			}

			uint64_t const lname = algn.getLReadName();
			char const * name = algn.getName();
			std::copy(name,name+lname+1,lastvalidname.begin());

			alsok += 1;
		}
	}
	catch(std::exception const & ex)
	{
		std::cerr << "[E] name of last valid alignment was " << lastvalidname.begin() << std::endl;
		std::cerr << "[E] read " << alsok << " valid alignments" << std::endl;
		throw;
	}

	Pout.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	if ( verbose )
		std::cerr << "[V] checked " << alsok << " alignments in " << rtc.formatTime(rtc.getElapsedSeconds())
			<< " (" << alsok / rtc.getElapsedSeconds() << " al/s)" << std::endl;

	if ( basequalhist )
	{
		uint64_t const s = std::accumulate(H.begin(),H.end(),0ull);

		uint64_t a = 0;
		uint64_t minq = std::numeric_limits<uint64_t>::max();
		uint64_t maxq = 0;

		for ( uint64_t i = 0; i < H.size(); ++i )
			if ( H[i] )
			{
				minq = std::min(minq,i);
				maxq = std::max(maxq,i);

				a += H[i];

				std::cerr
					<< "[H]\t" << i << "\t";

				if ( ( static_cast<uint64_t>(i+33) < static_cast<uint64_t>(std::numeric_limits<char>::max()) && isprint(i+33)) )
					std::cerr << static_cast<char>(i+33);

				std::cerr << "\t"
					<< H[i] << "\t"
					<< (H[i] / static_cast<double>(s)) << "\t"
					<< (a / static_cast<double>(s))
					<< std::endl;
			}

		if ( s )
		{
			std::cerr << "[H]\tmin\t" << minq << "\t";
			if ( ( static_cast<uint64_t>(minq+33) < static_cast<uint64_t>(std::numeric_limits<char>::max()) && isprint(minq+33)) )
				std::cerr << static_cast<char>(minq+33);
			std::cerr << std::endl;
			std::cerr << "[H]\tmax\t" << maxq << "\t";
			if ( ( static_cast<uint64_t>(maxq+33) < static_cast<uint64_t>(std::numeric_limits<char>::max()) && isprint(maxq+33)) )
				std::cerr << static_cast<char>(maxq+33);
			std::cerr << std::endl;
		}
	}

	return EXIT_SUCCESS;
}
Beispiel #9
0
static std::string getDefaultFilePrefix(::libmaus2::util::ArgInfo const & arginfo) { return arginfo.getDefaultTmpFileName(); }
Beispiel #10
0
int bamfiltermc(libmaus2::util::ArgInfo const & arginfo)
{
	bool const verbose = arginfo.getValue("verbose",getDefaultVerbose());

	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));
	::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus2::bambam::BamHeader const & header = dec.getHeader();
	::libmaus2::bambam::BamAlignment & algn = dec.getAlignment();
	std::string const tmpfilenamebase = arginfo.getUnparsedValue("tmpfile",arginfo.getDefaultTmpFileName());
	uint64_t const numthreads = arginfo.getValueUnsignedNumeric<uint64_t>("numthreads",getDefaultNumThreads());

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( libmaus2::bambam::BamBlockWriterBaseFactory::getMD5FileName(arginfo) != std::string() )
			md5filename = libmaus2::bambam::BamBlockWriterBaseFactory::getMD5FileName(arginfo);
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( libmaus2::bambam::BamBlockWriterBaseFactory::getIndexFileName(arginfo) != std::string() )
			indexfilename = libmaus2::bambam::BamBlockWriterBaseFactory::getIndexFileName(arginfo);
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamHeader::unique_ptr_type genuphead(
		libmaus2::bambam::BamHeaderUpdate::updateHeader(arginfo,header,"bamfiltermc",std::string(PACKAGE_VERSION))
	);
	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pwriter(libmaus2::bambam::BamBlockWriterBaseFactory::construct(*genuphead,arginfo,Pcbs));
	libmaus2::bambam::BamBlockWriterBase & wr = *Pwriter;

	// freelist size
	uint64_t const flsize = 16*1024;
	libmaus2::util::FreeList < libmaus2::bambam::BamAlignment, BamAlignmentFreeListDefaultAllocator, BamAlignmentFreeListDefaultTypeInfo > FL(flsize);
	libmaus2::util::SimpleQueue < libmaus2::bambam::BamAlignment::shared_ptr_type > Q;

	libmaus2::bambam::BamAuxFilterVector auxvec;
	auxvec.set('M','C');

	uint64_t alcnt = 0;
	while ( dec.readAlignment() )
	{
		if ( FL.empty() )
			handleQueue(Q,FL,wr,auxvec,numthreads);
		assert ( ! FL.empty() );

		libmaus2::bambam::BamAlignment::shared_ptr_type P = FL.get();
		P->swap(algn);

		Q.push_back(P);

		if ( verbose && ((++alcnt % (1024*1024)) == 0) )
			std::cerr << "[V] " << alcnt << std::endl;
	}

	handleQueue(Q,FL,wr,auxvec,numthreads);

	// reset BAM writer
	Pwriter.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
Beispiel #11
0
int bamfilterflags(::libmaus2::util::ArgInfo const & arginfo)
{
	uint32_t const excludeflags = libmaus2::bambam::BamFlagBase::stringToFlags(arginfo.getValue<std::string>("exclude",""));

	std::cerr << "[V] excluding " << excludeflags << std::endl;

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",Z_DEFAULT_COMPRESSION));
	
	uint64_t const numthreads = arginfo.getValue<uint64_t>("numthreads",1);
	uint64_t cnt = 0;
	uint64_t kept = 0;

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */
	 

	if ( numthreads == 1 )
	{
		::libmaus2::bambam::BamDecoder BD(std::cin);
		::libmaus2::bambam::BamHeader const & bamheader = BD.getHeader();
		::libmaus2::bambam::BamHeader::unique_ptr_type uphead(libmaus2::bambam::BamHeaderUpdate::updateHeader(arginfo,bamheader,"bamfilterflags",std::string(PACKAGE_VERSION)));
		::libmaus2::bambam::BamAlignment & alignment = BD.getAlignment();
		::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,*uphead,level,Pcbs));
	
		for ( ; BD.readAlignment(); ++cnt )
		{
			if ( cnt % (1024*1024) == 0 )
				std::cerr << "[V] processed " << cnt << " kept " << kept << " removed " << (cnt-kept) << std::endl;
			if ( ! (alignment.getFlags() & excludeflags) )			
			{
				alignment.serialise(writer->getStream());
				++kept;
			}
		}

		std::cerr << "[V] " << cnt << std::endl;	
	}
	else
	{
		::libmaus2::bambam::BamHeaderUpdate UH(arginfo,"bamfilterflags",std::string(PACKAGE_VERSION));
		libmaus2::bambam::BamParallelRewrite BPR(std::cin,UH,std::cout,Z_DEFAULT_COMPRESSION,numthreads,4 /* blocks per thread */,Pcbs);
		libmaus2::bambam::BamAlignmentDecoder & dec = BPR.getDecoder();
		libmaus2::bambam::BamParallelRewrite::writer_type & writer = BPR.getWriter();

		libmaus2::bambam::BamAlignment const & algn = dec.getAlignment();
		for ( ; dec.readAlignment(); ++cnt )
		{
			if ( cnt % (1024*1024) == 0 )
				std::cerr << "[V] processed " << cnt << " kept " << kept << " removed " << (cnt-kept) << std::endl;
			if ( ! (algn.getFlags() & excludeflags) )
			{
				algn.serialise(writer.getStream());	
				++kept;
			}
		}
		
		std::cerr << "[V] " << cnt << std::endl;	
	}
	
	std::cerr << "[V] kept " << kept << " removed " << cnt-kept << std::endl;

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Beispiel #12
0
int bamheap2(libmaus2::util::ArgInfo const & arginfo)
{
	bool const verbose = arginfo.getValue("verbose",getDefaultVerbose());
	std::string const reference = arginfo.getUnparsedValue("reference",std::string());
	std::string const outputprefix = arginfo.getUnparsedValue("outputprefix",std::string());

	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));
	::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus2::bambam::BamHeader const & header = dec.getHeader();
	::libmaus2::bambam::BamAlignment const & algn = dec.getAlignment();

	double const damult = arginfo.getValue<double>("amult",1);
	double const dcmult = arginfo.getValue<double>("cmult",1);
	double const dgmult = arginfo.getValue<double>("gmult",1);
	double const dtmult = arginfo.getValue<double>("tmult",1);
	double const dpadmult = arginfo.getValue<double>("padmult",1);

	double maxmult = 0;
	maxmult = std::max(damult,maxmult);
	maxmult = std::max(dcmult,maxmult);
	maxmult = std::max(dgmult,maxmult);
	maxmult = std::max(dtmult,maxmult);
	maxmult = std::max(dpadmult,maxmult);

	uint64_t const amult = std::floor((damult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const cmult = std::floor((dcmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const gmult = std::floor((dgmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const tmult = std::floor((dtmult / maxmult) * (1ull<<16) + 0.5);
	uint64_t const padmult = std::floor((dpadmult / maxmult) * (1ull<<16) + 0.5);

	libmaus2::fastx::FastAIndex::unique_ptr_type Pindex;
	libmaus2::aio::InputStreamInstance::unique_ptr_type PCIS;
	if ( reference.size() )
	{
		libmaus2::fastx::FastAIndex::unique_ptr_type Tindex(
			libmaus2::fastx::FastAIndex::load(reference+".fai")
		);
		Pindex = UNIQUE_PTR_MOVE(Tindex);

		libmaus2::aio::InputStreamInstance::unique_ptr_type TCIS(new libmaus2::aio::InputStreamInstance(reference));
		PCIS = UNIQUE_PTR_MOVE(TCIS);
	}

	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop;
	libmaus2::autoarray::AutoArray<char> bases;

	int64_t prevrefid = -1;
	std::string refidname = "*";

	std::map< uint64_t, HeapEntry > M;
	uint64_t alcnt = 0;
	std::vector< std::pair<char,uint8_t> > pendinginserts;
	int64_t loadedRefId = -1;
	int64_t streamRefId = -1;
	libmaus2::autoarray::AutoArray<char> refseqbases;
	ConsensusAccuracy * consacc = 0;
	std::map<uint64_t,ConsensusAccuracy> Mconsacc;
	typedef libmaus2::util::shared_ptr<std::ostringstream>::type stream_ptr_type;
	stream_ptr_type Pstream;
	ConsensusAux Caux;

	Caux.M['a'] = Caux.M['A'] = amult;
	Caux.M['c'] = Caux.M['C'] = cmult;
	Caux.M['g'] = Caux.M['G'] = gmult;
	Caux.M['t'] = Caux.M['T'] = tmult;
	Caux.M[padsym] = padmult;

	while ( dec.readAlignment() )
	{
		if (
			algn.isMapped() && (!algn.isQCFail()) &&
			algn.getLseq()
		)
		{
			assert ( ! pendinginserts.size() );

			uint32_t const numcigop = algn.getCigarOperations(cigop);
			uint64_t readpos = 0;
			uint64_t refpos = algn.getPos();
			uint64_t const seqlen = algn.decodeRead(bases);
			uint8_t const * qual = libmaus2::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin());

			// handle finished columns
			if ( algn.getRefID() != prevrefid )
			{
				while ( M.size() )
				{
					HeapEntry & H = M.begin()->second;

					if ( outputprefix.size() && (streamRefId != prevrefid) )
					{
						if ( Pstream )
						{
							std::ostringstream fnostr;
							fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
							libmaus2::aio::OutputStreamInstance PFOS(fnostr.str());
							PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
							PFOS << Pstream->str() << '\n';

							Pstream.reset();
						}

						stream_ptr_type Tstream(new std::ostringstream);
						Pstream = Tstream;
						streamRefId = prevrefid;
					}

					if ( Pindex && (loadedRefId != prevrefid) )
					{
						refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
						loadedRefId = prevrefid;

						if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
							Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());

						consacc = &(Mconsacc[loadedRefId]);
					}

					H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());

					M.erase(M.begin());
				}

				prevrefid = algn.getRefID();
				refidname = header.getRefIDName(prevrefid);
			}
			else
			{
				while ( M.size() && M.begin()->first < refpos )
				{
					HeapEntry & H = M.begin()->second;

					if ( outputprefix.size() && (streamRefId != prevrefid) )
					{
						if ( Pstream )
						{
							std::ostringstream fnostr;
							fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
							libmaus2::aio::OutputStreamInstance PFOS(fnostr.str());
							PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
							PFOS << Pstream->str() << '\n';

							Pstream.reset();
						}

						stream_ptr_type Tstream(new std::ostringstream);
						Pstream = Tstream;
						streamRefId = prevrefid;
					}

					if ( Pindex && (loadedRefId != prevrefid) )
					{
						refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
						loadedRefId = prevrefid;

						if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
							Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());

						consacc = &(Mconsacc[loadedRefId]);
					}

					H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());

					M.erase(M.begin());
				}
			}

			for ( uint64_t ci = 0; ci < numcigop; ++ci )
			{
				uint64_t const ciglen = cigop[ci].second;

				switch ( cigop[ci].first )
				{
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CMATCH:
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CEQUAL:
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDIFF:
					{
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}

						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							M[refpos].V.push_back(std::make_pair(bases[readpos],qual[readpos]));
							readpos++;
							refpos++;
						}
						break;
					}
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CINS:
					{
						for ( uint64_t i = 0; i < ciglen; ++i, ++readpos )
							pendinginserts.push_back(std::make_pair(bases[readpos],qual[readpos]));
						break;
					}
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDEL:
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}

						// deleting bases from the reference
						for ( uint64_t i = 0; i < ciglen; ++i, ++refpos )
							M[refpos].V.push_back(std::make_pair(padsym,0));
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CREF_SKIP:
						// handle pending inserts
						if ( pendinginserts.size() )
						{
							M[refpos].I.push_back(pendinginserts);
							pendinginserts.resize(0);
						}

						// skip bases on reference
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							refpos++;
						}
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CSOFT_CLIP:
						// skip bases on read
						for ( uint64_t i = 0; i < ciglen; ++i )
						{
							readpos++;
						}
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CHARD_CLIP:
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CPAD:
					{
						for ( uint64_t i = 0; i < ciglen; ++i, ++readpos )
							pendinginserts.push_back(std::make_pair(padsym,0));
						break;
					}
				}
			}

			if ( pendinginserts.size() )
			{
				M[refpos].I.push_back(pendinginserts);
				M[refpos].iadd++;
				pendinginserts.resize(0);
			}

			assert ( readpos == seqlen );
		}

		if ( verbose && ((++alcnt % (1024*1024)) == 0) )
			std::cerr << "[V] " << alcnt << std::endl;
	}

	while ( M.size() )
	{
		HeapEntry & H = M.begin()->second;

		if ( outputprefix.size() && (streamRefId != prevrefid) )
		{
			if ( Pstream )
			{
				std::ostringstream fnostr;
				fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
				libmaus2::aio::OutputStreamInstance PFOS(fnostr.str());
				PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
				PFOS << Pstream->str() << '\n';

				Pstream.reset();
			}

			stream_ptr_type Tstream(new std::ostringstream);
			Pstream = Tstream;
			streamRefId = prevrefid;
		}

		if ( Pindex && (loadedRefId != prevrefid) )
		{
			refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname));
			loadedRefId = prevrefid;

			if ( Mconsacc.find(loadedRefId) == Mconsacc.end() )
				Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size());

			consacc = &(Mconsacc[loadedRefId]);
		}

		H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get());

		M.erase(M.begin());
	}

	if ( Pstream )
	{
		std::ostringstream fnostr;
		fnostr << outputprefix << "_" << header.getRefIDName(streamRefId);
		libmaus2::aio::OutputStreamInstance PFOS(fnostr.str());
		PFOS << ">" << header.getRefIDName(streamRefId) << '\n';
		PFOS << Pstream->str() << '\n';

		Pstream.reset();
	}

	ConsensusAccuracy constotal;
	for ( std::map<uint64_t,ConsensusAccuracy>::const_iterator ita = Mconsacc.begin(); ita != Mconsacc.end(); ++ita )
	{
		std::cerr << header.getRefIDName(ita->first) << "\t" << ita->second << std::endl;

		std::map<uint64_t,uint64_t> const M = ita->second.depthhistogram.get();
		uint64_t total = 0;
		uint64_t preavg = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			total += aita->second;
			preavg += aita->first * aita->second;
		}

		uint64_t acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",+]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",-]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}

		std::cerr << "H[" << header.getRefIDName(ita->first) << ",avg]\t" <<
			static_cast<double>(preavg)/total << std::endl;

		constotal += ita->second;
	}
	if ( Mconsacc.size() )
	{
		std::cerr << "all\t" << constotal << std::endl;

		std::map<uint64_t,uint64_t> const M = constotal.depthhistogram.get();
		uint64_t total = 0;
		uint64_t preavg = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			total += aita->second;
			preavg += aita->first * aita->second;
		}

		uint64_t acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << "all" << "," << aita->first << ",+]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}
		acc = 0;
		for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita )
		{
			acc += aita->second;
			std::cerr << "H[" << "all" << "," << aita->first << ",-]"
				<< "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total
				<< "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl;
		}

		std::cerr << "H[all,avg]\t" << static_cast<double>(preavg) / total << std::endl;

	}

	return EXIT_SUCCESS;
}
Beispiel #13
0
int fagzToCompact4(libmaus2::util::ArgInfo const & arginfo)
{
	bool const rc = arginfo.getValue<unsigned int>("rc",1);
	bool const gz = arginfo.getValue<unsigned int>("gz",1);

	std::vector<std::string> inputfilenames;
	inputfilenames = arginfo.restargs;

	if ( arginfo.hasArg("inputfilenames") )
	{
		std::string const inf = arginfo.getUnparsedValue("inputfilenames",std::string());
		libmaus2::aio::InputStream::unique_ptr_type Pinf(libmaus2::aio::InputStreamFactoryContainer::constructUnique(inf));
		while ( *Pinf )
		{
			std::string line;
			std::getline(*Pinf,line);
			if ( line.size() )
				inputfilenames.push_back(line);
		}
	}

	std::string const inlcp = libmaus2::util::OutputFileNameTools::lcp(inputfilenames);
	std::string defout = inlcp;
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".gz");
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fasta");
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fa");

	std::string const outputfilename = arginfo.getUnparsedValue("outputfilename",defout + ".compact");
	std::string const metaoutputfilename = outputfilename + ".meta";
	int const verbose = arginfo.getValue<int>("verbose",1);
	libmaus2::autoarray::AutoArray<char> B(8*1024,false);
	libmaus2::bitio::CompactArrayWriterFile compactout(outputfilename,2 /* bits per symbol */);

	if ( ! rc )
		std::cerr << "[V] not storing reverse complements" << std::endl;

	// forward mapping table
	libmaus2::autoarray::AutoArray<uint8_t> ftable(256,false);
	// rc mapping for mapped symbols
	libmaus2::autoarray::AutoArray<uint8_t> ctable(256,false);

	std::fill(ftable.begin(),ftable.end(),4);
	std::fill(ctable.begin(),ctable.end(),4);
	ftable['a'] = ftable['A'] = 0;
	ftable['c'] = ftable['C'] = 1;
	ftable['g'] = ftable['G'] = 2;
	ftable['t'] = ftable['T'] = 3;
	uint64_t insize = 0;

	ctable[0] = 3; // A->T
	ctable[1] = 2; // C->G
	ctable[2] = 1; // G->C
	ctable[3] = 0; // T->A

	libmaus2::aio::OutputStreamInstance::unique_ptr_type metaOut(new libmaus2::aio::OutputStreamInstance(metaoutputfilename));
	libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);
	uint64_t nseq = 0;
	std::vector<uint64_t> lvec;

	for ( uint64_t i = 0; i < inputfilenames.size(); ++i )
	{
		std::string const fn = inputfilenames[i];
		libmaus2::aio::InputStreamInstance CIS(fn);
		libmaus2::lz::BufferedGzipStream::unique_ptr_type BGS;
		std::istream * istr = 0;
		if ( gz )
		{
			libmaus2::lz::BufferedGzipStream::unique_ptr_type tBGS(
				new libmaus2::lz::BufferedGzipStream(CIS));
			BGS = UNIQUE_PTR_MOVE(tBGS);
			istr = BGS.get();
		}
		else
		{
			istr = &CIS;
		}
		libmaus2::fastx::StreamFastAReaderWrapper fain(*istr);
		libmaus2::fastx::StreamFastAReaderWrapper::pattern_type pattern;

		while ( fain.getNextPatternUnlocked(pattern) )
		{
			if ( verbose )
				std::cerr << (i+1) << " " << stripAfterDot(basename(fn)) << " " << pattern.sid << "...";

			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,pattern.spattern.size());
			lvec.push_back(pattern.spattern.size());
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);

			// map symbols
			for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
				pattern.spattern[j] = ftable[static_cast<uint8_t>(pattern.spattern[j])];

			// replace blocks of N symbols by random bases
			uint64_t l = 0;
			// number of replaced blocks
			uint64_t nr = 0;
			while ( l < pattern.spattern.size() )
			{
				// skip regular bases
				while ( l < pattern.spattern.size() && pattern.spattern[l] < 4 )
					++l;
				assert ( l == pattern.spattern.size() || pattern.spattern[l] == 4 );

				// go to end of non regular bases block
				uint64_t h = l;
				while ( h < pattern.spattern.size() && pattern.spattern[h] == 4 )
					++h;

				// if non regular block is not empty
				if ( h-l )
				{
					// replace by random bases
					for ( uint64_t j = l; j < h; ++j )
						pattern.spattern[j] = (libmaus2::random::Random::rand8() & 3);

					// write bounds
					libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,l);
					libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,h);
					// add to interval counter
					nr += 1;
				}

				l = h;
			}

			// make sure there are no more irregular bases
			for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
				assert ( pattern.spattern[j] < 4 );

			// go back to start of meta data
			metaOut->seekp( - static_cast<int64_t>(2*nr+1)*sizeof(uint64_t), std::ios::cur );
			// write number of intervals replaced
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nr);
			// skip interval bounds already written
			metaOut->seekp(   static_cast<int64_t>(2*nr  )*sizeof(uint64_t), std::ios::cur );

			// write bases
			compactout.write(pattern.spattern.c_str(),pattern.spattern.size());

			// write reverse complement if requested
			if ( rc )
			{
				// reverse complement
				std::reverse(pattern.spattern.begin(),pattern.spattern.end());
				for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
					pattern.spattern[j] = ctable[static_cast<uint8_t>(pattern.spattern[j])];

				// write
				compactout.write(pattern.spattern.c_str(),pattern.spattern.size());
			}

			insize += pattern.spattern.size()+1;
			nseq += 1;

			if ( verbose )
				std::cerr << "done, input size " << formatBytes(pattern.spattern.size()+1) << " acc " << formatBytes(insize) << std::endl;
		}
	}

	metaOut->seekp(0);
	libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nseq);
	metaOut->flush();
	metaOut.reset();

	libmaus2::aio::InputStreamInstance::unique_ptr_type metaISI(new libmaus2::aio::InputStreamInstance(metaoutputfilename));
	// number of sequences
	uint64_t const rnseq = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
	assert ( nseq == rnseq );
	for ( uint64_t i = 0; i < nseq; ++i )
	{
		// length of sequence
		uint64_t const l = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
		assert ( l == lvec[i] );
		uint64_t const nr = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
		// skip replaced intervals
		metaISI->ignore(2*nr*sizeof(uint64_t));
	}
	assert ( metaISI->peek() == std::istream::traits_type::eof() );

	std::cerr << "Done, total input size " << insize << std::endl;

	compactout.flush();

	return EXIT_SUCCESS;
}
Beispiel #14
0
int bamcat(libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const streaming = arginfo.getValue<int>("streaming",getDefaultStreaming());

	std::vector<std::string> inputfilenames = arginfo.getPairValues("I");
	
	for ( uint64_t i = 0; i < arginfo.restargs.size(); ++i )
		inputfilenames.push_back(arginfo.restargs[i]);
	
	libmaus2::bambam::BamCat bamdec(inputfilenames, false /* put rank */, streaming);
	libmaus2::bambam::BamAlignment const & algn = bamdec.getAlignment();
	libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */


	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,*uphead,level,Pcbs));
	libmaus2::bambam::BamWriter::stream_type & bamoutstr = writer->getStream();
	if ( verbose )
	{
		uint64_t c = 0;
		while ( bamdec.readAlignment() )
		{
			algn.serialise(bamoutstr);
			
			if ( ((++c) & ((1ull<<20)-1)) == 0 )
				std::cerr << "[V] " << c << std::endl;
		}
		
		std::cerr << "[V] " << c << std::endl;
	}
	else
		while ( bamdec.readAlignment() )
			algn.serialise(bamoutstr);

	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Beispiel #15
0
void bamalignfrac(::libmaus2::util::ArgInfo const & arginfo)
{
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));
	::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder());
	::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec;
	::libmaus2::bambam::BamAlignment const & algn = dec.getAlignment();
        libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop;

        uint64_t basealgn = 0;
        uint64_t clip = 0;
        uint64_t totalbases = 0;

        #if defined(LIBMAUS2_HAVE_REGEX_H)
        std::string const regexs = arginfo.getUnparsedValue("name","");
        libmaus2::util::unique_ptr<libmaus2::regex::PosixRegex>::type regex_ptr;
        if ( regexs.size() )
	{
	        libmaus2::util::unique_ptr<libmaus2::regex::PosixRegex>::type tregex_ptr(new libmaus2::regex::PosixRegex(regexs));
	        regex_ptr = UNIQUE_PTR_MOVE(tregex_ptr);
	}
	#endif

	while ( dec.readAlignment() )
	{
		if ( 
			algn.isMapped()
			#if defined(LIBMAUS2_HAVE_REGEX_H)
			&&
			(
				(!regex_ptr)
				||
				(regex_ptr->findFirstMatch(algn.getName()) != -1)
			)
			#endif
		)
	        {
		        uint32_t const numcig = algn.getCigarOperations(cigop);
		        
		        totalbases += algn.getLseq();
		        
		        for ( uint64_t i = 0; i < numcig; ++i )
		        {
		        	switch ( cigop[i].first )
		        	{
		        		case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CMATCH:
		        		case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CINS:
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CEQUAL:
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDIFF:
						basealgn += cigop[i].second;
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CSOFT_CLIP:
						clip += cigop[i].second;
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CHARD_CLIP:
						totalbases += cigop[i].second;
						clip += cigop[i].second;
						break;
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDEL:
					case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CREF_SKIP:
						break;
		        	}
		        }
	        }                                                                        
	}
	
	std::cerr << "total bases in mapped reads\t" << totalbases << std::endl;
	std::cerr << "clipped (hard and soft) bases in mapped reads\t" << clip << std::endl;
	std::cerr << "aligned bases in mapped reads\t" << basealgn << std::endl;
}
int bamrecalculatecigar(libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	// input decoder wrapper
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(
			arginfo,false // put rank
		)
	);

	libmaus2::bambam::BamAlignmentDecoder & bamdec = decwrapper->getDecoder();
	libmaus2::bambam::BamAlignment & algn = bamdec.getAlignment();
	libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type writer(
		libmaus2::bambam::BamBlockWriterBaseFactory::construct(*uphead, arginfo, Pcbs)
	);

	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigopin;
	libmaus2::autoarray::AutoArray<char> readdata;
	libmaus2::bambam::BamAlignment::D_array_type T;

	if ( ! arginfo.hasArg("reference") )
	{
		libmaus2::exception::LibMausException se;
		se.getStream() << "reference key is missing." << std::endl;
		se.finish();
		throw se;
	}

	std::string const reference = arginfo.getUnparsedValue("reference","");

	if ( ! libmaus2::util::GetFileSize::fileExists(reference) )
	{
		libmaus2::exception::LibMausException se;
		se.getStream() << "file " << reference << " does not exist." << std::endl;
		se.finish();
		throw se;
	}

	libmaus2::fastx::FastAIndex::unique_ptr_type FAindex(libmaus2::fastx::FastAIndex::load(reference + ".fai"));
	libmaus2::aio::InputStreamInstance FAISI(reference);

	uint64_t c = 0;
	libmaus2::autoarray::AutoArray<char> ref;
	int64_t refloaded = -1;

	while ( bamdec.readAlignment() )
	{
		if ( algn.isMapped() )
		{
			assert ( algn.getRefID() >= 0 );
			if ( algn.getRefID() != refloaded )
			{
				if ( algn.getRefID() < refloaded )
				{
					libmaus2::exception::LibMausException lme;
					lme.getStream() << "bamrecalculatecigar: file is not sorted by coordinate" << std::endl;
					lme.finish();
					throw lme;
				}

				ref = FAindex->readSequence(FAISI,algn.getRefID());
				refloaded = algn.getRefID();
			}

			uint64_t const numcig = libmaus2::bambam::BamAlignmentDecoderBase::recalculateCigar(
				algn.D.begin(),
				ref.begin() + algn.getPos(),
				cigopin,
				readdata
			);
			algn.replaceCigarString(cigopin,numcig,T);
		}

		writer->writeAlignment(algn);

		if ( ((++c) & ((1ull<<20)-1)) == 0 && verbose )
			std::cerr << "[V] " << c << std::endl;
	}

	if ( verbose )
		std::cerr << "[V] " << c << std::endl;

	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
int bamclipreinsert(::libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());

	::libmaus2::bambam::BamDecoder dec(std::cin,false);
	::libmaus2::bambam::BamHeader const & header = dec.getHeader();

	std::string const headertext(header.text);

	// add PG line to header
	std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamclipreinsert", // ID
		"bamclipreinsert", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);

	// construct new header
	libmaus2::bambam::BamHeader const uphead(upheadtext);

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") &&  arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") &&  arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));
	libmaus2::bambam::BamAuxFilterVector bafv;
 	// bafv.set('z','z');
 	// std::vector<uint8_t> R(8);
 	// std::string const zz("zz");

	libmaus2::bambam::BamAlignment & algn = dec.getAlignment();
	uint64_t c = 0;

	libmaus2::autoarray::AutoArray < std::pair<uint8_t,uint8_t> > auxtags;
	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop;
	std::stack < libmaus2::bambam::cigar_operation > hardstack;
	libmaus2::bambam::BamAlignment::D_array_type Tcigar;
	libmaus2::bambam::BamAuxFilterVector auxfilterout;
	auxfilterout.set('q','s');
	auxfilterout.set('q','q');

	while ( dec.readAlignment() )
	{
		// reinsert clipped parts and attach soft clipping cigar operations as needed
		clipReinsert(algn,auxtags,bafv,cigop,Tcigar,hardstack,auxfilterout);

		algn.serialise(writer->getStream());

		++c;

		if ( verbose && (c & (1024*1024-1)) == 0 )
 			std::cerr << "[V] " << c/(1024*1024) << std::endl;
	}

	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
Beispiel #18
0
static int call_damapper(libmaus2::util::ArgParser const & arg, libmaus2::util::ArgInfo const & arginfo)
{
	unsigned int const damapper_arg_k = arg.uniqueArgPresent("k") ? arg.getUnsignedNumericArg<uint64_t>("k") : 20;
	         int const damapper_arg_t = arg.uniqueArgPresent("t") ? arg.getParsedArg<int>("t") : -1;
	         int const damapper_arg_M = arg.uniqueArgPresent("M") ? arg.getParsedArg<int>("M") : -1;
	double const damapper_arg_e = arg.uniqueArgPresent("e") ? arg.getParsedArg<double>("e") : std::numeric_limits<double>::min();
	         int const damapper_arg_s = arg.uniqueArgPresent("s") ? arg.getParsedArg<int>("s") : -1;
	double const damapper_arg_n = arg.uniqueArgPresent("n") ? arg.getParsedArg<double>("n") : std::numeric_limits<double>::min();
	         int const damapper_arg_B = arg.uniqueArgPresent("B") ? arg.getParsedArg<int>("B") : -1;
	         int const damapper_arg_T = arg.uniqueArgPresent("T") ? arg.getParsedArg<int>("T") : -1;
	         int const damapper_arg_b = arg.uniqueArgPresent("b") ? 1 :  0; // comp bias
	         int const damapper_arg_v = arg.uniqueArgPresent("v") ? 1 :  0; // verbose
	         int const damapper_arg_p = arg.uniqueArgPresent("p") ? 1 :  0; // repeat profile (not used)

	unsigned int const numthreads = damapper_arg_T < 0 ? 4 : damapper_arg_T;

	unsigned int const refblocksize = arg.uniqueArgPresent("refblocksize") ? arg.getUnsignedNumericArg<uint64_t>("refblocksize") : 250;
	unsigned int const readblocksize = arg.uniqueArgPresent("readblocksize") ? arg.getUnsignedNumericArg<uint64_t>("readblocksize") : 250;

	std::string const argindexdir = arg.uniqueArgPresent("I") ? arg["I"] :  std::string();
	std::string const argworkdir = arg.uniqueArgPresent("W") ? arg["W"] :  std::string();
	std::string const workdir = makeAbsolute(argworkdir.size() ? argworkdir : ".");

	mkdirP(workdir);

	unsigned int const fastacolwidth = 80;
	bool const reformatref = true;
	bool const reformatreads = true;

	std::string progname = arg.progname;
	ProgramDescriptor PD_fasta2DAM("fasta2DAM",progname);
	ProgramDescriptor PD_DBsplit("DBsplit",progname);
	ProgramDescriptor PD_HPC_damapper("HPC.damapper",progname);
	ProgramDescriptor PD_damapper("damapper",progname);
	ProgramDescriptor PD_lascat("lascat",progname);
	ProgramDescriptor PD_laschainsort("laschainsort",progname);
	ProgramDescriptor PD_lastobam("lastobam",progname);
	ProgramDescriptor PD_LAsort("LAsort",progname);
	ProgramDescriptor PD_LAcat("LAcat",progname);
	ProgramDescriptor PD_LAmerge("LAmerge",progname);

	std::vector<std::string> Vpathadd;
	Vpathadd.push_back(PD_LAsort.dir);
	Vpathadd.push_back(PD_LAcat.dir);
	Vpathadd.push_back(PD_LAmerge.dir);
	std::sort(Vpathadd.begin(),Vpathadd.end());
	Vpathadd.resize ( std::unique(Vpathadd.begin(),Vpathadd.end()) - Vpathadd.begin() );

	std::ostringstream npathstr;
	npathstr << getenv("PATH");
	for ( uint64_t i = 0; i < Vpathadd.size(); ++i )
		npathstr << ":" << Vpathadd[i];
	std::string const npath = npathstr.str();
	libmaus2::util::WriteableString Wnpath(npath);
	setenv("PATH",Wnpath.A.begin(),1 /* overwrite */);

	std::string const reffn = arg[0];

	if ( ! libmaus2::util::GetFileSize::fileExists(reffn) )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: reference file " << reffn << " does not exist" << std::endl;
		lme.finish();
		throw lme;
	}

	static char const * faend[] = { ".fasta",".fa",0 };

	std::string const refdir = sdirname(reffn);
	std::string const indexdir = makeAbsolute(argindexdir.size() ? argindexdir : refdir);
	std::string const reffile = sbasename(reffn);
	std::string const reffileclipped = libmaus2::util::OutputFileNameTools::endClip(reffile,&faend[0]);

	mkdirP(indexdir);

	std::string const refdam = indexdir + "/"  + reffileclipped + ".dam";
	std::string const refidx = indexdir + "/." + reffileclipped + ".idx";
	std::string const refhdr = indexdir + "/." + reffileclipped + ".hdr";
	std::string const refbps = indexdir + "/." + reffileclipped + ".bps";
	std::string const indexfasta = indexdir + "/"  + reffileclipped + ".dam.fasta";

	if (
		! libmaus2::util::GetFileSize::fileExists(refdam) || libmaus2::util::GetFileSize::isOlder(refdam,reffn) ||
		! libmaus2::util::GetFileSize::fileExists(refidx) || libmaus2::util::GetFileSize::isOlder(refidx,reffn) ||
		! libmaus2::util::GetFileSize::fileExists(refhdr) || libmaus2::util::GetFileSize::isOlder(refhdr,reffn) ||
		! libmaus2::util::GetFileSize::fileExists(refbps) || libmaus2::util::GetFileSize::isOlder(refbps,reffn) ||
		! libmaus2::util::GetFileSize::fileExists(indexfasta) || libmaus2::util::GetFileSize::isOlder(indexfasta,reffn)
	)
	{
		std::string const tmpbase = arginfo.getDefaultTmpFileName();
		std::string const reffastatmp  = indexdir + "/"  + tmpbase + ".fasta";
		std::string const refdamtmp    = indexdir + "/"  + tmpbase + ".dam";
		std::string const refdamidxtmp = indexdir + "/." + tmpbase + ".idx";
		std::string const refdamhdrtmp = indexdir + "/." + tmpbase + ".hdr";
		std::string const refdambpstmp = indexdir + "/." + tmpbase + ".bps";

		libmaus2::util::TempFileRemovalContainer::addTempFile(refdamtmp);
		libmaus2::util::TempFileRemovalContainer::addTempFile(refdamidxtmp);
		libmaus2::util::TempFileRemovalContainer::addTempFile(refdamhdrtmp);
		libmaus2::util::TempFileRemovalContainer::addTempFile(refdambpstmp);
		libmaus2::util::TempFileRemovalContainer::addTempFile(reffastatmp);

		if ( reformatref )
		{
			libmaus2::fastx::FastAReader FAreader(reffn);
			libmaus2::fastx::FastAReader::pattern_type pat;
			libmaus2::aio::OutputStreamInstance::unique_ptr_type OSI(new libmaus2::aio::OutputStreamInstance(reffastatmp));
			while ( FAreader.getNextPatternUnlocked(pat) )
				pat.printMultiLine(*OSI,fastacolwidth);
			OSI->flush();
			OSI.reset();
		}
		else
		{
			if ( symlink(reffn.c_str(),reffastatmp.c_str()) )
			{
				int const error = errno;

				libmaus2::exception::LibMausException lme;
				lme.getStream() << "call.damapper: symlink(" << reffn << "," << reffastatmp << "): " << strerror(error) << std::endl;
				lme.finish();
				throw lme;
			}
		}

		std::string const fasta2DAMcom = PD_fasta2DAM.path + " " + refdamtmp + " " + reffastatmp;

		int const fasta2DAMcomres = callsystem(fasta2DAMcom.c_str());

		if ( damapper_arg_v )
			std::cerr << "[V] " << fasta2DAMcom << std::endl;

		if ( fasta2DAMcomres != 0 )
		{
			libmaus2::exception::LibMausException lme;
			lme.getStream() << "call.damapper: " << fasta2DAMcom << " failed" << std::endl;
			lme.finish();
			throw lme;
		}

		std::ostringstream splitstr;
		splitstr << PD_DBsplit.path << " -s" << refblocksize << " -x" << damapper_arg_k << " " << refdamtmp;
		std::string const splitcom = splitstr.str();

		int const splitcomres = callsystem(splitcom.c_str());

		if ( damapper_arg_v )
			std::cerr << "[V] " << splitcom << std::endl;

		if ( splitcomres != 0 )
		{
			libmaus2::exception::LibMausException lme;
			lme.getStream() << "call.damapper: " << splitcomres << " failed" << std::endl;
			lme.finish();
			throw lme;
		}

		callrename(refdamtmp,refdam);
		callrename(refdamidxtmp,refidx);
		callrename(refdamhdrtmp,refhdr);
		callrename(refdambpstmp,refbps);
		callrename(reffastatmp,indexfasta);
	}

	std::string const tmpbase = arginfo.getDefaultTmpFileName();
	std::string const readsfastatmp = workdir + "/" + tmpbase + "_reads" + ".fasta";
	std::string const readsdamtmp = workdir + "/" + tmpbase + "_reads" + ".dam";
	std::string const readsidxtmp = workdir + "/." + tmpbase + "_reads" + ".idx";
	std::string const readsbpstmp = workdir + "/." + tmpbase + "_reads" + ".bps";
	std::string const readshdrtmp = workdir + "/." + tmpbase + "_reads" + ".hdr";
	std::string const readsdamtmpscript = workdir + "/" + tmpbase + "_reads" + ".dam.script";
	std::string const readslastmp = workdir + "/" + tmpbase + "_reads" + ".las";

	libmaus2::util::TempFileRemovalContainer::addTempFile(readsfastatmp);
	libmaus2::util::TempFileRemovalContainer::addTempFile(readsdamtmp);
	libmaus2::util::TempFileRemovalContainer::addTempFile(readsidxtmp);
	libmaus2::util::TempFileRemovalContainer::addTempFile(readsbpstmp);
	libmaus2::util::TempFileRemovalContainer::addTempFile(readshdrtmp);
	libmaus2::util::TempFileRemovalContainer::addTempFile(readsdamtmpscript);
	libmaus2::util::TempFileRemovalContainer::addTempFile(readslastmp);

	libmaus2::aio::OutputStreamInstance::unique_ptr_type OSI(new libmaus2::aio::OutputStreamInstance(readsfastatmp));
	if ( reformatreads )
	{
		libmaus2::fastx::StreamFastAReaderWrapper SFAR(std::cin);
		libmaus2::fastx::StreamFastAReaderWrapper::pattern_type pat;
		while ( SFAR.getNextPatternUnlocked(pat) )
			pat.printMultiLine(*OSI,fastacolwidth);

	}
	else
	{
		libmaus2::autoarray::AutoArray<char> A(4096,false);
		while ( std::cin )
		{
			std::cin.read(A.begin(),A.size());
			uint64_t const r = std::cin.gcount();
			OSI->write(A.begin(),r);
		}
	}
	OSI->flush();
	OSI.reset();

	std::string const fasta2DAMcom = PD_fasta2DAM.path + " " + readsdamtmp + " " + readsfastatmp;

	int const fasta2DAMcomres = callsystem(fasta2DAMcom.c_str());

	if ( damapper_arg_v )
		std::cerr << "[V] " << fasta2DAMcom << std::endl;

	if ( fasta2DAMcomres != 0 )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: " << fasta2DAMcom << " failed" << std::endl;
		lme.finish();
		throw lme;
	}

	std::ostringstream splitstr;
	splitstr << PD_DBsplit.path << " -s" << readblocksize << " -x" << damapper_arg_k << " " << readsdamtmp;
	std::string const splitcom = splitstr.str();

	int const splitcomres = callsystem(splitcom.c_str());

	if ( damapper_arg_v )
		std::cerr << "[V] " << splitcom << std::endl;

	if ( splitcomres != 0 )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: " << splitcomres << " failed" << std::endl;
		lme.finish();
		throw lme;
	}

	std::string const calldir = getCurrentDir();
	if ( chdir(workdir.c_str()) )
	{
		int const error = errno;
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: chdir(" << workdir << "): " << strerror(error) << std::endl;
		lme.finish();
		throw lme;
	}

	std::ostringstream HPC1str;
	HPC1str << PD_HPC_damapper.path;

	HPC1str << " -k" << damapper_arg_k;
	if ( damapper_arg_t != -1 )
		HPC1str << " -t" << damapper_arg_t;
	if ( damapper_arg_M != -1 )
		HPC1str << " -M" << damapper_arg_M;
	if ( damapper_arg_e != std::numeric_limits<double>::min() )
		HPC1str << " -e" << damapper_arg_e;
	if ( damapper_arg_s != -1 )
		HPC1str << " -s" << damapper_arg_s;
	if ( damapper_arg_n != std::numeric_limits<double>::min() )
		HPC1str << " -n" << damapper_arg_n;
	if ( damapper_arg_B != -1 )
		HPC1str << " -B" << damapper_arg_B;
	HPC1str << " -T" << numthreads;
	if ( damapper_arg_b )
		HPC1str << " -b";
	if ( damapper_arg_v )
		HPC1str << " -v";
	if ( damapper_arg_p )
		HPC1str << " -p";

	HPC1str << " -C -N " << refdam << " " << readsdamtmp << " >" << readsdamtmpscript;
	std::string const HPC1 = HPC1str.str();

	int const HPC1res = system(HPC1.c_str());

	if ( damapper_arg_v )
		std::cerr << "[V] " << HPC1 << std::endl;

	if ( HPC1res != 0 )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: " << HPC1 << " failed" << std::endl;
		lme.finish();
		throw lme;
	}

	std::vector<std::string> Voutput;
	std::vector<std::string> Vdamapper;
	libmaus2::aio::InputStreamInstance::unique_ptr_type HPCisi(new libmaus2::aio::InputStreamInstance(readsdamtmpscript));
	std::string const exp = "# Catenate jobs (2)";
	std::string const expdamapper = "# Damapper jobs (1)";
	std::string const expsub = "LAsort -a ";
	std::string const expcheck = "# Check all .las files (optional but recommended)";
	std::string const expchecksub = "LAcheck -vS ";

	bool activedamapper = false;
	bool activeoutput = false;
	bool activationfound = false;
	bool activationdamapperfound = false;

	while ( *HPCisi )
	{
		std::string line;
		std::getline(*HPCisi,line);

		// std::cerr << line << std::endl;

		if ( line.size() )
		{
			if ( line[0] == '#' )
			{
				activeoutput = false;
				activedamapper = false;

				if ( line == expcheck )
				{
					activeoutput = true;
					activationfound = true;
				}
				else if ( line == expdamapper )
				{
					activedamapper = true;
					activationdamapperfound = true;
				}
			}
			else if ( activeoutput )
			{
				if (
					line.size() >= expchecksub.size()
					&&
					line.substr(0,expchecksub.size()) == expchecksub
				)
				{
					line = line.substr(expchecksub.size());

					std::vector<std::string> tokens;
					uint64_t l = 0;
					while ( l < line.size() )
					{
						while ( l < line.size() && ::std::isspace(static_cast<unsigned char>(line[l])) )
							++l;

						uint64_t h = l;
						while ( h < line.size() && ! ::std::isspace(static_cast<unsigned char>(line[h])) )
							++h;

						if ( h > l )
						{
							tokens.push_back(line.substr(l,h-l));
							// std::cerr << "token " << tokens.back() << std::endl;
						}

						l = h;
					}

					if ( tokens.size() == 3 )
					{
						Voutput.push_back(tokens[2] + ".las");
					}
					else
					{
						libmaus2::exception::LibMausException lme;
						lme.getStream() << "call.damapper: cannot understand line " << line << std::endl;
						lme.finish();
						throw lme;
					}
				}
				else
				{
					libmaus2::exception::LibMausException lme;
					lme.getStream() << "call.damapper: cannot understand line " << line << std::endl;
					lme.finish();
					throw lme;
				}
			}
			else if ( activedamapper )
			{
				if ( startsWith(line,"damapper ") )
				{
					line = line.substr(std::string("damapper ").size());
					line = PD_damapper.path + " " + line;
					Vdamapper.push_back(line);
				}
				else
				{
					libmaus2::exception::LibMausException lme;
					lme.getStream() << "call.damapper: cannot understand line " << line << std::endl;
					lme.finish();
					throw lme;
				}
			}
		}
	}
	HPCisi.reset();

	if ( ! activationfound )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: activation line for cat jobs not found" << std::endl;
		lme.finish();
		throw lme;
	}
	if ( ! activationdamapperfound )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: activation line for damapper not found" << std::endl;
		lme.finish();
		throw lme;
	}

	for ( uint64_t i = 0; i < Voutput.size(); ++i )
		libmaus2::util::TempFileRemovalContainer::addTempFile(Voutput[i]);

	/* call damapper */
	for ( uint64_t i = 0; i < Vdamapper.size(); ++i )
	{
		int const damapperret = callsystem(Vdamapper[i].c_str());

		if ( damapper_arg_v )
			std::cerr << "[V] " << Vdamapper[i] << std::endl;

		if ( damapperret != 0 )
		{
			libmaus2::exception::LibMausException lme;
			lme.getStream() << "call.damapper: damapper call failed" << std::endl;
			lme.finish();
			throw lme;
		}
	}

	/* concatenate all output .las files */
	std::ostringstream catostr;
	catostr << PD_laschainsort.path << " -sba " << readslastmp;
	for ( uint64_t i = 0; i < Voutput.size(); ++i )
	{
		catostr << " " << Voutput[i];
	}
	std::string const catcom = catostr.str();

	if ( damapper_arg_v )
		std::cerr << "[V] " << catcom << std::endl;

	if ( system(catcom.c_str()) != 0 )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: laschainsort call " << catcom << " failed" << std::endl;
		lme.finish();
		throw lme;
	}

	for ( uint64_t i = 0; i < Voutput.size(); ++i )
		::remove(Voutput[i].c_str());

	/* convert .las file to BAM */
	std::ostringstream lastobamstr;
	lastobamstr << PD_lastobam.path << " -t" << numthreads << " -snone " << refdam << " " << indexfasta << " " << readsdamtmp << " " << readsfastatmp << " " << readslastmp;
	std::string const lastobamcom = lastobamstr.str();

	if ( damapper_arg_v )
		std::cerr << "[V] " << lastobamcom << std::endl;

	if ( system(lastobamcom.c_str()) != 0 )
	{
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: lastobam call " << lastobamcom << " failed" << std::endl;
		lme.finish();
		throw lme;
	}

	if ( chdir(calldir.c_str()) )
	{
		int const error = errno;
		libmaus2::exception::LibMausException lme;
		lme.getStream() << "call.damapper: chdir(" << calldir << "): " << strerror(error) << std::endl;
		lme.finish();
		throw lme;
	}


	return EXIT_SUCCESS;
}