/**
 * Split the BAM stream on standard input into a sequence of BAM files, each of which
 * receives at most n consecutive alignments.
 *
 * Output files are named <prefix>_<index>.bam where index is a running number
 * (starting at 0) zero padded to 6 digits. Every output file is written with the
 * header returned by updateHeader().
 *
 * Keys read from arginfo: level, verbose, n, prefix.
 *
 * @param arginfo parsed command line arguments
 * @return EXIT_SUCCESS
 * @throws libmaus2::exception::LibMausException if standard input is a terminal
 *         or if n is zero
 **/
int bamsplit(libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	uint64_t const n = arginfo.getValue<int>("n",getDefaultN());

	// n is used as a modulus in the loop below, so zero would be a division by zero.
	// Reject it up front, before any input is consumed or any file is created.
	if ( n == 0 )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Invalid value 0 for n, the number of alignments per output file must be positive." << std::endl;
		se.finish();
		throw se;
	}

	std::string const prefix = arginfo.getUnparsedValue("prefix",getDefaultFilePrefix(arginfo));

	libmaus2::bambam::BamDecoder bamdec(std::cin);
	libmaus2::bambam::BamAlignment const & algn = bamdec.getAlignment();
	libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

	// stream and writer for the output file currently being filled
	libmaus2::aio::OutputStreamInstance::unique_ptr_type COS;
	libmaus2::bambam::BamWriter::unique_ptr_type writer;

	// c: number of alignments seen so far, f: index of the next output file
	uint64_t c = 0;
	uint64_t f = 0;

	while ( bamdec.readAlignment() )
	{
		// every n'th alignment (including the very first) starts a new output file
		if ( c++ % n == 0 )
		{
			// destroy the writer before flushing and closing the stream it writes to
			writer.reset();
			if ( COS )
				COS->flush();
			COS.reset();

			std::ostringstream fnostr;
			fnostr << prefix << "_" << std::setw(6) << std::setfill('0') << f++ << std::setw(0) << ".bam";
			std::string const fn = fnostr.str();

			libmaus2::aio::OutputStreamInstance::unique_ptr_type tCOS(new libmaus2::aio::OutputStreamInstance(fn));
			COS = UNIQUE_PTR_MOVE(tCOS);

			libmaus2::bambam::BamWriter::unique_ptr_type twriter(new libmaus2::bambam::BamWriter(*COS,*uphead,level));
			writer = UNIQUE_PTR_MOVE(twriter);

			if ( verbose )
				std::cerr << "[V] opened file " << fn << std::endl;
		}

		algn.serialise(writer->getStream());
	}

	// finish the last file (if any); same order as above
	writer.reset();
	if ( COS )
		COS->flush();
	COS.reset();

	return EXIT_SUCCESS;
}
/**
 * Distribute the alignments delivered by the decoder constructed from arginfo over
 * a sequence of output files.
 *
 * A new output file is started when the reference id differs from the one of the
 * previous alignment and more than sizethres alignments have been written to the
 * current file. Output files are named <prefix><index>.<outputformat> where index
 * is a running number (starting at 0) zero padded to 6 digits; the name is handed
 * to the writer factory via the key "O" in a copy of arginfo.
 *
 * Keys read from arginfo here: outputformat, prefix, sizethres (the decoder and
 * writer factories receive arginfo as well).
 *
 * @param arginfo parsed command line arguments
 * @return EXIT_SUCCESS
 **/
int bamexplode(libmaus2::util::ArgInfo const & arginfo)
{
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type Preader(libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo));
	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pwriter;

	libmaus2::bambam::BamAlignmentDecoder & decoder = Preader->getDecoder();
	libmaus2::bambam::BamHeader const & header = decoder.getHeader();
	libmaus2::bambam::BamAlignment const & algn = decoder.getAlignment();

	// index of the next output file
	uint64_t nextfn = 0;
	// number of alignments written to the current output file
	uint64_t written = std::numeric_limits<uint64_t>::max();
	// reference id of the previous alignment (initial value differs from any id seen first)
	int32_t prevrefid = std::numeric_limits<int32_t>::max();

	std::string const outputformat = arginfo.getUnparsedValue("outputformat",libmaus2::bambam::BamBlockWriterBaseFactory::getDefaultOutputFormat());
	std::string const prefix = arginfo.getUnparsedValue("prefix",getDefaultPrefix());
	uint64_t const thres = arginfo.getValueUnsignedNumeric("sizethres",getDefaultSizeThres());

	while ( decoder.readAlignment() )
	{
		int32_t const refid = algn.getRefID();

		// Open a new file if none is open yet or if the reference changed and the
		// current file is large enough. The explicit check for a missing writer
		// guarantees that Pwriter is valid below even if sizethres is set to the
		// maximum 64 bit value (in which case written > thres can never hold).
		if ( (!Pwriter) || ( refid != prevrefid && written > thres ) )
		{
			// close the current file before opening the next one
			Pwriter.reset();

			libmaus2::util::ArgInfo argcopy(arginfo);
			std::ostringstream fnostr;
			fnostr << prefix << std::setw(6) << std::setfill('0') << nextfn++ << std::setw(0) << "." << outputformat;
			argcopy.replaceKey("O",fnostr.str());

			libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Twriter(libmaus2::bambam::BamBlockWriterBaseFactory::construct(header,argcopy));
			Pwriter = UNIQUE_PTR_MOVE(Twriter);

			written = 0;
		}

		Pwriter->writeAlignment(algn);

		prevrefid = refid;
		written ++;
	}

	Pwriter.reset();

	return EXIT_SUCCESS;
}
int bamclipreinsert(::libmaus2::util::ArgInfo const & arginfo) { if ( isatty(STDIN_FILENO) ) { ::libmaus2::exception::LibMausException se; se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl; se.finish(); throw se; } if ( isatty(STDOUT_FILENO) ) { ::libmaus2::exception::LibMausException se; se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl; se.finish(); throw se; } int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel())); int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose()); ::libmaus2::bambam::BamDecoder dec(std::cin,false); ::libmaus2::bambam::BamHeader const & header = dec.getHeader(); std::string const headertext(header.text); // add PG line to header std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine( headertext, "bamclipreinsert", // ID "bamclipreinsert", // PN arginfo.commandline, // CL ::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP std::string(PACKAGE_VERSION) // VN ); // construct new header libmaus2::bambam::BamHeader const uphead(upheadtext); /* * start index/md5 callbacks */ std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName()); std::string const tmpfileindex = tmpfilenamebase + "_index"; ::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex); std::string md5filename; std::string indexfilename; std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs; ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb; if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) ) { if ( arginfo.hasArg("md5filename") && arginfo.getUnparsedValue("md5filename","") != "" ) md5filename = arginfo.getUnparsedValue("md5filename",""); else std::cerr << "[V] no filename for md5 
given, not creating hash" << std::endl; if ( md5filename.size() ) { ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5); Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb); cbs.push_back(Pmd5cb.get()); } } libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex; if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) ) { if ( arginfo.hasArg("indexfilename") && arginfo.getUnparsedValue("indexfilename","") != "" ) indexfilename = arginfo.getUnparsedValue("indexfilename",""); else std::cerr << "[V] no filename for index given, not creating index" << std::endl; if ( indexfilename.size() ) { libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex)); Pindex = UNIQUE_PTR_MOVE(Tindex); cbs.push_back(Pindex.get()); } } std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0; if ( cbs.size() ) Pcbs = &cbs; /* * end md5/index callbacks */ ::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs)); libmaus2::bambam::BamAuxFilterVector bafv; // bafv.set('z','z'); // std::vector<uint8_t> R(8); // std::string const zz("zz"); libmaus2::bambam::BamAlignment & algn = dec.getAlignment(); uint64_t c = 0; libmaus2::autoarray::AutoArray < std::pair<uint8_t,uint8_t> > auxtags; libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop; std::stack < libmaus2::bambam::cigar_operation > hardstack; libmaus2::bambam::BamAlignment::D_array_type Tcigar; libmaus2::bambam::BamAuxFilterVector auxfilterout; auxfilterout.set('q','s'); auxfilterout.set('q','q'); while ( dec.readAlignment() ) { // reinsert clipped parts and attach soft clipping cigar operations as needed clipReinsert(algn,auxtags,bafv,cigop,Tcigar,hardstack,auxfilterout); algn.serialise(writer->getStream()); ++c; if ( verbose && (c & (1024*1024-1)) == 0 ) std::cerr << "[V] " << 
c/(1024*1024) << std::endl; } writer.reset(); if ( Pmd5cb ) { Pmd5cb->saveDigestAsFile(md5filename); } if ( Pindex ) { Pindex->flush(std::string(indexfilename)); } return EXIT_SUCCESS; }
void bamalignfrac(::libmaus2::util::ArgInfo const & arginfo) { libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper( libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo)); ::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder()); ::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec; ::libmaus2::bambam::BamAlignment const & algn = dec.getAlignment(); libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop; uint64_t basealgn = 0; uint64_t clip = 0; uint64_t totalbases = 0; #if defined(LIBMAUS2_HAVE_REGEX_H) std::string const regexs = arginfo.getUnparsedValue("name",""); libmaus2::util::unique_ptr<libmaus2::regex::PosixRegex>::type regex_ptr; if ( regexs.size() ) { libmaus2::util::unique_ptr<libmaus2::regex::PosixRegex>::type tregex_ptr(new libmaus2::regex::PosixRegex(regexs)); regex_ptr = UNIQUE_PTR_MOVE(tregex_ptr); } #endif while ( dec.readAlignment() ) { if ( algn.isMapped() #if defined(LIBMAUS2_HAVE_REGEX_H) && ( (!regex_ptr) || (regex_ptr->findFirstMatch(algn.getName()) != -1) ) #endif ) { uint32_t const numcig = algn.getCigarOperations(cigop); totalbases += algn.getLseq(); for ( uint64_t i = 0; i < numcig; ++i ) { switch ( cigop[i].first ) { case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CMATCH: case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CINS: case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CEQUAL: case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDIFF: basealgn += cigop[i].second; break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CSOFT_CLIP: clip += cigop[i].second; break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CHARD_CLIP: totalbases += cigop[i].second; clip += cigop[i].second; break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDEL: case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CREF_SKIP: break; } } } } std::cerr << "total bases in mapped reads\t" << totalbases << std::endl; std::cerr << "clipped (hard and soft) bases in mapped 
reads\t" << clip << std::endl; std::cerr << "aligned bases in mapped reads\t" << basealgn << std::endl; }
/**
 * Copy the data read through a BgzfInflateDeflateParallel object from standard
 * input to standard output, recompressing it with the given level and number of
 * threads.
 *
 * If requested via md5/md5filename and index/indexfilename, a checksum file and an
 * index file are produced for the output through block output callbacks. In verbose
 * mode progress (elapsed time, amount of data, throughput) is printed on standard
 * error each time the amount of data copied crosses a multiple of 64MB.
 *
 * Keys read from arginfo: level, verbose, numthreads, tmpfile, md5, md5filename,
 * index, indexfilename.
 *
 * @param arginfo parsed command line arguments
 * @return 0
 **/
uint64_t bamrecompress(libmaus2::util::ArgInfo const & arginfo)
{
	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(
		arginfo.getValue<int>("level",getDefaultLevel())
	);
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	// at least one thread is used
	int const numthreads = std::max(1,arginfo.getValue<int>("numthreads",getDefaultNumThreads()));

	// temporary file used by the index callback, registered for removal
	std::string const tmpfileindex =
		arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName()) + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;

	// checksum callback: only installed if md5 is switched on and a non empty file name is given
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) != 0 )
	{
		if ( arginfo.hasArg("md5filename") )
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( md5filename.empty() )
		{
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		}
		else
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(
				new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5
			);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	// index callback: only installed if index is switched on and a non empty file name is given
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) != 0 )
	{
		if ( arginfo.hasArg("indexfilename") )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( indexfilename.empty() )
		{
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		}
		else
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(
				new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex)
			);
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}

	libmaus2::lz::BgzfInflateDeflateParallel::unique_ptr_type BIDP(
		new libmaus2::lz::BgzfInflateDeflateParallel(std::cin,std::cout,level,numthreads,4*numthreads)
	);
	for (
		std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * >::const_iterator it = cbs.begin();
		it != cbs.end();
		++it
	)
		BIDP->registerBlockOutputCallback(*it);

	// copy buffer
	libmaus2::autoarray::AutoArray<char> B(64*1024,false);

	// a report is due whenever the total crosses a multiple of reportstep
	uint64_t const reportstep = 64*1024*1024;
	// total number of bytes copied
	uint64_t total = 0;
	// value of total at the time of the previous report
	uint64_t reported = std::numeric_limits<uint64_t>::max();
	// bytes copied since the previous report
	uint64_t sincereport = 0;

	// clock for the whole run and clock for the interval since the previous report
	libmaus2::timing::RealTimeClock rtc;
	rtc.start();
	libmaus2::timing::RealTimeClock lrtc;
	lrtc.start();

	for ( ; ; )
	{
		int const got = BIDP->read(B.begin(),B.size());

		// a return value of zero ends the copy loop
		if ( ! got )
			break;

		BIDP->write(B.begin(),got);

		sincereport += got;
		total += got;

		if ( total/reportstep == reported/reportstep )
			continue;

		if ( verbose )
		{
			// on a terminal the previous progress line is overwritten
			if ( isatty(STDERR_FILENO) )
				std::cerr << "\r" << std::string(60,' ') << "\r";

			std::cerr
				<< rtc.formatTime(rtc.getElapsedSeconds()) << " "
				<< total/(1024*1024) << "MB, "
				<< (sincereport/lrtc.getElapsedSeconds())/(1024.0*1024.0) << "MB/s";

			if ( isatty(STDERR_FILENO) )
				std::cerr << std::flush;
			else
				std::cerr << std::endl;
		}

		// start the next reporting interval
		lrtc.start();
		reported = total;
		sincereport = 0;
	}

	// final summary over the whole run
	if ( verbose )
	{
		if ( isatty(STDERR_FILENO) )
			std::cerr << "\r" << std::string(60,' ') << "\r";

		std::cerr
			<< rtc.formatTime(rtc.getElapsedSeconds()) << " "
			<< total/(1024*1024) << "MB, "
			<< (total/rtc.getElapsedSeconds())/(1024.0*1024.0) << "MB/s";
		std::cerr << std::endl;
	}

	// destroy the recompressor before the callback results are stored
	BIDP.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);

	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return 0;
}
/**
 * Merge information from an unaligned BAM file (first rest argument) into the
 * aligned BAM stream read from standard input and write the result as BAM to
 * standard output.
 *
 * The name of each aligned read is expected to start with a decimal rank followed
 * by '_'. The rank selects the record of the unaligned file (counted from 0) the
 * read belongs to; the unaligned file is read forward until that record is reached.
 * Aligned reads whose name prefix is not numeric are written out unchanged.
 *
 * For each matched pair
 *  - aux tags present in the unaligned record but not in the aligned one are copied
 *    to the aligned record,
 *  - for secondary/supplementary alignments the tags in auxfiltersec are removed,
 *  - the QC fail flag is set on the aligned record if it is set on the unaligned one,
 *  - strip12, clipReinsert and zzToRank are applied if switched on.
 *
 * The output header consists of the header text of the unaligned file without
 * sequence lines, the PG lines of the aligned file (re-chained behind the last PG
 * id of the unaligned file) and the SQ lines of the aligned file. Its sort order is
 * set to "unknown".
 *
 * Keys read from arginfo: level, verbose, ranksplit, rankstrip, clipreinsert,
 * zztoname, sanity, mod, tmpfile, md5, md5filename, index, indexfilename.
 *
 * @param arginfo parsed command line arguments
 * @return EXIT_SUCCESS
 * @throws libmaus2::exception::LibMausException if standard input or output is a
 *         terminal, if the unaligned file ends before the required rank is reached
 *         or if a sanity check fails
 **/
int bam12auxmerge(::libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	// unaligned input file
	std::string const prefilename = arginfo.getRestArg<std::string>(0);
	libmaus2::bambam::BamDecoder bampredec(prefilename);

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const ranksplit = arginfo.getValue<int>("ranksplit",getDefaultRankSplit());
	// NOTE(review): the default for rankstrip is taken from getDefaultRankSplit();
	// this looks like a copy and paste slip, confirm whether a separate default was intended
	int const rankstrip = arginfo.getValue<int>("rankstrip",getDefaultRankSplit());
	int const clipreinsert = arginfo.getValue<int>("clipreinsert",getDefaultClipReinsert());
	int const zztoname = arginfo.getValue<int>("zztoname",getDefaultZZToName());
	int const sanity = arginfo.getValue<int>("sanity",getDefaultSanity());
	// progress is reported whenever the rank reaches a multiple of bmod
	uint64_t const mod = arginfo.getValue<int>("mod",getDefaultMod());
	uint64_t const bmod = libmaus2::math::nextTwoPow(mod);
	uint64_t const bmask = bmod-1;

	// aligned input
	::libmaus2::bambam::BamDecoder bamdec(std::cin,false);
	::libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader const & preheader = bampredec.getHeader();

	std::string const headertext(header.text);
	std::string const preheadertext(libmaus2::bambam::HeaderLine::removeSequenceLines(preheader.text));

	libmaus2::bambam::ProgramHeaderLineSet headerlines(headertext);
	libmaus2::bambam::ProgramHeaderLineSet preheaderlines(preheadertext);

	std::vector<libmaus2::bambam::HeaderLine> allheaderlines = libmaus2::bambam::HeaderLine::extractLines(headertext);

	// id the root PG lines of the aligned file are attached to
	std::string const lastid = preheaderlines.getLastIdInChain();

	// stack of (PG line index in headerlines, id of parent PG line)
	std::stack < std::pair<uint64_t,std::string> > pgtodo;
	for ( uint64_t i = 0; i < headerlines.roots.size(); ++i )
		pgtodo.push(std::pair<uint64_t,std::string>(headerlines.roots[i],lastid));

	std::string upheadtext = preheadertext;

	// traverse the PG lines of the aligned file from the roots down and add them to the new header text
	while ( pgtodo.size() )
	{
		uint64_t const hid = pgtodo.top().first;
		std::string const PP = pgtodo.top().second;
		pgtodo.pop();

		libmaus2::bambam::HeaderLine const & line = headerlines.lines[hid];

		// ID, PP, PN, CL, VN (missing fields are replaced by the empty string)
		// ID is not const as it is handed to addProgramLineRef as a modifiable object
		std::string ID = (line.M.find("ID") != line.M.end()) ? line.M.find("ID")->second : "";
		std::string const PN = (line.M.find("PN") != line.M.end()) ? line.M.find("PN")->second : "";
		std::string const CL = (line.M.find("CL") != line.M.end()) ? line.M.find("CL")->second : "";
		std::string const VN = (line.M.find("VN") != line.M.end()) ? line.M.find("VN")->second : "";

		upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLineRef(
			upheadtext,
			ID,
			PN,
			CL,
			PP,
			VN
		);

		// children of this line are attached to ID
		if ( headerlines.edges.find(hid) != headerlines.edges.end() )
		{
			std::vector<uint64_t> const & children = headerlines.edges.find(hid)->second;

			for ( uint64_t j = 0; j < children.size(); ++j )
				pgtodo.push(std::pair<uint64_t,std::string>(children[j],ID));
		}
	}

	/* copy SQ lines */
	std::ostringstream sqconcstr;
	sqconcstr << upheadtext;
	for ( uint64_t i = 0; i < allheaderlines.size(); ++i )
		if ( allheaderlines[i].type == "SQ" )
			sqconcstr << allheaderlines[i].line << "\n";
	upheadtext = sqconcstr.str();

	::libmaus2::bambam::BamHeader uphead(upheadtext);
	uphead.changeSortOrder("unknown");

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") && arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") && arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));

	::libmaus2::bambam::BamAlignment & algn = bamdec.getAlignment();
	::libmaus2::bambam::BamAlignment & prealgn = bampredec.getAlignment();
	// rank of the record currently held in prealgn (-1: nothing read yet)
	int64_t curid = -1;

	// aux tag lists of the unaligned and the aligned record and filter used for copying tags
	libmaus2::autoarray::AutoArray< std::pair<uint8_t,uint8_t> > auxpre;
	libmaus2::autoarray::AutoArray< std::pair<uint8_t,uint8_t> > auxnew;
	libmaus2::bambam::BamAuxFilterVector auxfilter;

	// helpers for clipReinsert
	libmaus2::autoarray::AutoArray < std::pair<uint8_t,uint8_t> > auxtags;
	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop;
	std::stack < libmaus2::bambam::cigar_operation > hardstack;
	libmaus2::bambam::BamAlignment::D_array_type Tcigar;
	libmaus2::bambam::BamAuxFilterVector bafv;
	libmaus2::bambam::BamAuxFilterVector auxfilterout;
	auxfilterout.set('q','s');
	auxfilterout.set('q','q');

	// helpers for zztoname
	libmaus2::bambam::BamAuxFilterVector zzbafv;
	zzbafv.set('z','z');

	// tag filters for secondary/supplementary reads
	libmaus2::bambam::BamAuxFilterVector auxfiltersec;
	auxfiltersec.set('q','s');
	auxfiltersec.set('q','q');
	auxfiltersec.set('a','s');
	auxfiltersec.set('a','h');
	auxfiltersec.set('a','a');
	auxfiltersec.set('a','f');
	auxfiltersec.set('a','r');
	auxfiltersec.set('a','3');

	// loop over aligned BAM file
	while ( bamdec.readAlignment() )
	{
		if ( ranksplit )
			split12(algn);

		// extract rank (decimal prefix of the name up to the first '_' or the end of the name)
		char const * name = algn.getName();
		char const * u1 = name;
		bool ok = true;
		uint64_t rank = 0;

		while ( *u1 && *u1 != '_' )
		{
			// isdigit is only defined for values representable as unsigned char
			unsigned char const ch = static_cast<unsigned char>(*u1);

			if ( isdigit(ch) )
			{
				rank *= 10;
				rank += (ch-'0');
			}
			else
			{
				ok = false;
			}

			++u1;
		}

		// unable to find rank? write out as is and continue
		if ( ! ok )
		{
			algn.serialise(writer->getStream());
			continue;
		}

		// loop over unaligned BAM file
		while ( curid != static_cast<int64_t>(rank) )
		{
			bool const a_ok = bampredec.readAlignment();

			if ( ! a_ok )
			{
				libmaus2::exception::LibMausException se;
				se.getStream() << "Found unexpected EOF on file " << prefilename << std::endl;
				se.finish();
				throw se;
			}

			++curid;

			if ( verbose && (! (curid & bmask)) )
				std::cerr << "[V] " << (curid / bmod) << std::endl;
		}

		if ( verbose > 1 )
			std::cerr << "Merging:\n" << algn.formatAlignment(header) << "\n" << prealgn.formatAlignment(preheader) << std::endl;

		uint64_t pretagnum = prealgn.enumerateAuxTags(auxpre);
		uint64_t newtagnum = algn.enumerateAuxTags(auxnew);

		// do some sanity checking
		if ( sanity )
		{
			// first do a name check
			char const * prename = prealgn.getName();

			// put on the first letter of readname; only step over the separator if there
			// is one, otherwise u1 is on the terminator and must not be moved past it
			if ( *u1 == '_' )
				u1++;

			if ( verbose > 1 )
				std::cerr << "Sanity: comparing " << name << " and " << prename << std::endl;

			if ( !is_suffix(prename, u1) ) // names do not match
			{
				libmaus2::exception::LibMausException se;
				se.getStream() << "Sanity check failed on read names, found " << name << " and " << prename << std::endl;
				se.finish();
				throw se;
			}

			// now the names match so try the flags
			if ( !(algn.isPaired() == prealgn.isPaired() && algn.isRead1() == prealgn.isRead1() && algn.isRead2() == prealgn.isRead2()) )
			{
				libmaus2::exception::LibMausException se;
				se.getStream() << "Sanity check failed on flags, " << std::endl
					<< "Aligned " << name << " paired " << algn.isPaired() << " first " << algn.isRead1() << " last " << algn.isRead2() << std::endl
					<< "Unaligned " << prename << " paired " << prealgn.isPaired() << " first " << prealgn.isRead1() << " last " << prealgn.isRead2() << std::endl;
				se.finish();
				throw se;
			}

			if ( verbose > 1 )
				std::cerr << "Sanity check on flags: " << std::endl
					<< "Aligned " << name << " paired " << algn.isPaired() << " first " << algn.isRead1() << " last " << algn.isRead2() << std::endl
					<< "Unaligned " << prename << " paired " << prealgn.isPaired() << " first " << prealgn.isRead1() << " last " << prealgn.isRead2() << std::endl;
		}

		std::sort(auxpre.begin(),auxpre.begin()+pretagnum);
		std::sort(auxnew.begin(),auxnew.begin()+newtagnum);

		if ( verbose > 1 )
			std::cerr << "pretagnum=" << pretagnum << " newtagnum=" << newtagnum << std::endl;

		// Merge the two sorted tag lists in place: afterwards auxpre holds the tags
		// which appear in the unaligned record only, auxnew is kept as it is.
		// prec/newc are the read positions, pree/newe the ends, preo/newo the write positions.
		std::pair<uint8_t,uint8_t> * prec = auxpre.begin();
		std::pair<uint8_t,uint8_t> * pree = prec + pretagnum;
		std::pair<uint8_t,uint8_t> * preo = prec;
		std::pair<uint8_t,uint8_t> * newc = auxnew.begin();
		std::pair<uint8_t,uint8_t> * newe = newc + newtagnum;
		std::pair<uint8_t,uint8_t> * newo = newc;

		while ( prec != pree && newc != newe )
		{
			// pre which is not in new
			if ( *prec < *newc )
			{
				*(preo++) = *(prec++);
			}
			// tag in both, drop pre
			else if ( *prec == *newc )
			{
				*(newo++) = *(newc++);
				prec++;
			}
			// new not in pre
			else
			{
				*(newo++) = *(newc++);
			}
		}

		while ( prec != pree )
			*(preo++) = *(prec++);
		while ( newc != newe )
			*(newo++) = *(newc++);

		pretagnum = preo-auxpre.begin();
		newtagnum = newo-auxnew.begin();

		// copy the tags remaining in auxpre from the unaligned to the aligned record;
		// the filter is cleared again afterwards so it can be reused for the next record
		for ( uint64_t i = 0; i < pretagnum; ++i )
			auxfilter.set(auxpre[i].first,auxpre[i].second);

		algn.copyAuxTags(prealgn, auxfilter);

		for ( uint64_t i = 0; i < pretagnum; ++i )
			auxfilter.clear(auxpre[i].first,auxpre[i].second);

		if ( verbose > 1 )
		{
			std::cerr << "pretagnum=" << pretagnum << " newtagnum=" << newtagnum << std::endl;
			std::cerr << "result: " << algn.formatAlignment(header) << std::endl;
		}

		if ( algn.isSecondary() || algn.isSupplementary() )
		{
			// adding adapter clip data to secondary/supplementary reads
			// can lead to incorrect clip reinserts so remove these tags
			algn.filterOutAux(auxfiltersec);
		}

		// copy QC fail flag from original file to aligner output
		if ( prealgn.isQCFail() )
			algn.putFlags( algn.getFlags() | libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_FQCFAIL );

		if ( rankstrip )
			strip12(algn);

		if ( clipreinsert )
			clipReinsert(algn,auxtags,bafv,cigop,Tcigar,hardstack,auxfilterout);

		if ( zztoname )
			zzToRank(algn,zzbafv);

		algn.serialise(writer->getStream());
	}

	// destroy the writer before the callback results are stored
	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
/**
 * Read a BAM stream from standard input, pass every alignment through
 * resetAlignment() and write those alignments for which it returns true as BAM to
 * standard output.
 *
 * Header handling: if the key resetheadertext is given, the header text is replaced
 * by the contents of the file it names; otherwise the SQ lines are removed from the
 * input header text. A PG line with id and name "bamreset" is appended and the sort
 * order is set to "unknown" if resetsortorder is switched on.
 *
 * If requested via md5/md5filename and index/indexfilename, a checksum file and an
 * index file are produced for the output through block output callbacks.
 *
 * Keys read from arginfo here: level, verbose, resetsortorder, resetheadertext,
 * tmpfile, exclude, md5, md5filename, index, indexfilename, resetaux (arginfo is
 * also handed to BamAuxFilterVector::parseAuxFilterList).
 *
 * @param arginfo parsed command line arguments
 * @return EXIT_SUCCESS
 * @throws libmaus2::exception::LibMausException if standard input or standard
 *         output is a terminal
 **/
int bamreset(::libmaus2::util::ArgInfo const & arginfo)
{
	// never read binary data from a terminal
	if ( isatty(STDIN_FILENO) )
	{
		::libmaus2::exception::LibMausException inex;
		inex.getStream()
			<< "Refusing to read binary data from terminal, please redirect standard input to pipe or file."
			<< std::endl;
		inex.finish();
		throw inex;
	}

	// never write binary data to a terminal
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException outex;
		outex.getStream()
			<< "Refusing write binary data to terminal, please redirect standard output to pipe or file."
			<< std::endl;
		outex.finish();
		throw outex;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(
		arginfo.getValue<int>("level",getDefaultLevel())
	);
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const resetsortorder = arginfo.getValue<int>("resetsortorder",getDefaultResetSortOrder());

	::libmaus2::bambam::BamDecoder dec(std::cin,false);
	std::string headertext = dec.getHeader().text;

	if ( arginfo.hasArg("resetheadertext") )
	{
		// replace the header text by the contents of the given file
		std::string const headerfilename = arginfo.getUnparsedValue("resetheadertext","");
		uint64_t const headerlen = libmaus2::util::GetFileSize::getFileSize(headerfilename);
		libmaus2::aio::CheckedInputStream CIS(headerfilename);
		libmaus2::autoarray::AutoArray<char> ctext(headerlen,false);
		CIS.read(ctext.begin(),headerlen);
		headertext.assign(ctext.begin(),ctext.end());
	}
	else
	{
		// no replacement header given: keep all lines but the SQ lines
		std::vector<libmaus2::bambam::HeaderLine> const lines =
			libmaus2::bambam::HeaderLine::extractLines(headertext);

		std::ostringstream kept;
		for ( uint64_t i = 0; i < lines.size(); ++i )
		{
			if ( lines[i].type == "SQ" )
				continue;

			kept << lines[i].line << '\n';
		}

		headertext = kept.str();
	}

	// add PG line to header, chained to the last PG id found in the text
	headertext = libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamreset", // ID
		"bamreset", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);

	// construct new header
	libmaus2::bambam::BamHeader uphead(headertext);
	if ( resetsortorder )
		uphead.changeSortOrder("unknown");

	// temporary file used by the index callback, registered for removal
	std::string const tmpfileindex =
		arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName()) + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	// flag mask handed to resetAlignment
	uint32_t const excludeflags = libmaus2::bambam::BamFlagBase::stringToFlags(
		arginfo.getValue<std::string>("exclude",getDefaultExcludeFlags())
	);

	std::string md5filename;
	std::string indexfilename;
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;

	// checksum callback: only installed if md5 is switched on and a non empty file name is given
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) != 0 )
	{
		if ( arginfo.hasArg("md5filename") )
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( md5filename.empty() )
		{
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		}
		else
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(
				new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5
			);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	// index callback: only installed if index is switched on and a non empty file name is given
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) != 0 )
	{
		if ( arginfo.hasArg("indexfilename") )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( indexfilename.empty() )
		{
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		}
		else
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(
				new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex)
			);
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}

	// the writer gets a null pointer if there are no callbacks
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * const Pcbs = cbs.size() ? &cbs : 0;

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(
		new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs)
	);

	libmaus2::timing::RealTimeClock rtc;
	rtc.start();

	libmaus2::bambam::BamAlignment & algn = dec.getAlignment();

	bool const resetaux = arginfo.getValue<int>("resetaux",getDefaultResetAux());
	libmaus2::bambam::BamAuxFilterVector::unique_ptr_type const prgfilter(
		libmaus2::bambam::BamAuxFilterVector::parseAuxFilterList(arginfo)
	);
	libmaus2::bambam::BamAuxFilterVector const * rgfilter = prgfilter.get();

	// number of alignments seen; only maintained in verbose mode
	uint64_t seen = 0;

	while ( dec.readAlignment() )
	{
		// resetAlignment modifies algn and decides whether it is kept
		if ( resetAlignment(algn,resetaux /* reset aux */,excludeflags,rgfilter) )
			algn.serialise(writer->getStream());

		// progress report (count and rate) once per 2^20 alignments
		if ( verbose )
		{
			++seen;

			if ( (seen & (1024*1024-1)) == 0 )
				std::cerr << "[V] " << seen/(1024*1024) << " " << (seen / rtc.getElapsedSeconds()) << std::endl;
		}
	}

	// destroy the writer before the callback results are stored
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);

	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
/**
 * Read a BAM stream from standard input, mask the flag field of every
 * alignment and write the result as BAM to standard output.
 *
 * The mask applied to each record is (maskpos & ~maskneg). Which flags are
 * kept and which are erased is reported on standard error before the input
 * is opened. If resetmatecoord is non-zero, the mate reference id and mate
 * position of every record are set to -1. A PG line with id "bammaskflags"
 * is added to the header text. An MD5 digest and/or BAM index is written
 * after the output is complete when the md5/index keys are non-zero and the
 * matching md5filename/indexfilename key is non-empty.
 *
 * @param arginfo parsed command line (keys read here: maskpos, maskneg,
 *        level, resetmatecoord, tmpfile, md5, md5filename, index,
 *        indexfilename)
 * @return EXIT_SUCCESS
 **/
int bammaskflags(::libmaus2::util::ArgInfo const & arginfo)
{
	typedef ::libmaus2::bambam::BamFlagBase flag_base_type;
	typedef ::libmaus2::lz::BgzfDeflateOutputCallback callback_type;
	typedef ::libmaus2::lz::BgzfDeflateOutputCallbackMD5 md5_callback_type;
	typedef libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex index_callback_type;

	// bits set in maskpos survive unless they are also set in maskneg
	uint64_t const keepbits = arginfo.getValue<uint64_t>("maskpos",0xFFFFUL);
	uint64_t const dropbits = arginfo.getValue<uint64_t>("maskneg",getDefaultMaskNeg());
	uint64_t const mask = keepbits & (~dropbits);

	if ( ! mask )
	{
		std::cerr << "Erasing all flags." << std::endl;
	}
	else
	{
		uint64_t const topflag = flag_base_type::LIBMAUS2_BAMBAM_FSUPPLEMENTARY;

		std::cerr << "Keeping flags ";
		for ( uint64_t bit = 1; bit <= topflag; bit <<= 1 )
			if ( (mask & bit) != 0 )
				std::cerr << static_cast< flag_base_type::bam_flags >(bit) << ";";
		std::cerr << std::endl;

		std::cerr << "Erasing flags ";
		for ( uint64_t bit = 1; bit <= topflag; bit <<= 1 )
			if ( (mask & bit) == 0 )
				std::cerr << static_cast< flag_base_type::bam_flags >(bit) << ";";
		std::cerr << std::endl;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(
		arginfo.getValue<int>("level",getDefaultLevel())
	);
	int const resetmatecoord = arginfo.getValue<int>("resetmatecoord",0);

	// open the input and fetch the text of its header
	::libmaus2::bambam::BamDecoder BD(std::cin);
	std::string const headertext(BD.getHeader().text);

	// header text extended by a PG line for this program
	std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bammaskflags", // ID
		"bammaskflags", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);
	::libmaus2::bambam::BamHeader uphead(upheadtext);
	::libmaus2::bambam::BamAlignment & alignment = BD.getAlignment();

	/*
	 * set up the optional md5 and index callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;
	std::vector< callback_type * > cbs;

	md5_callback_type::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") )
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( md5filename.empty() )
		{
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		}
		else
		{
			md5_callback_type::unique_ptr_type Tmd5cb(new md5_callback_type);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	index_callback_type::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( indexfilename.empty() )
		{
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		}
		else
		{
			index_callback_type::unique_ptr_type Tindex(new index_callback_type(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}

	// the writer receives a null pointer when no callback was registered
	std::vector< callback_type * > * Pcbs = cbs.size() ? &cbs : 0;

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(
		new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs)
	);

	while ( BD.readAlignment() )
	{
		alignment.putFlags(alignment.getFlags() & mask);

		if ( resetmatecoord )
		{
			alignment.putNextRefId(-1);
			alignment.putNextPos(-1);
		}

		alignment.serialise(writer->getStream());
	}

	// destroy the writer before the digest/index are stored
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
/**
 * Rewrite a BAM file so that its header only contains the SQ and RG lines
 * flagged as used, renumbering the reference ids of the alignments to
 * match the reduced header.
 *
 * getUsedRefSeqs() supplies two bit vectors (one bit per SQ line and one
 * bit per RG line, in header order) and the original header. SQ/RG lines
 * whose bit is not set are dropped, all other header lines are copied, and
 * a PG line with id "bamheaderfilter" is added. For mapped records (and
 * mapped mates of paired records) the reference id is replaced by
 * rank1(id)-1 on the SQ bit vector; unmapped records/mates get reference
 * id and position -1.
 *
 * @param arginfo parsed command line (keys read here: I, tmpfile, md5,
 *        md5filename, index, indexfilename; further keys are read by the
 *        helpers and factories called)
 * @return 0
 * @throws libmaus2::exception::LibMausException if the I key is empty or "-"
 **/
uint64_t bamheaderfilter(libmaus2::util::ArgInfo const & arginfo)
{
	typedef ::libmaus2::lz::BgzfDeflateOutputCallback callback_type;
	typedef ::libmaus2::lz::BgzfDeflateOutputCallbackMD5 md5_callback_type;
	typedef libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex index_callback_type;

	std::string const inputfilename = arginfo.getUnparsedValue("I","");

	if ( inputfilename.empty() || inputfilename == "-" )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "No input filename given, please set the I key appropriately." << std::endl;
		se.finish();
		throw se;
	}

	libmaus2::bitio::IndexedBitVector::unique_ptr_type usedrefseq;
	libmaus2::bitio::IndexedBitVector::unique_ptr_type usedrg;
	libmaus2::bambam::BamHeader::unique_ptr_type uheader;
	getUsedRefSeqs(arginfo,usedrefseq,usedrg,uheader);

	/*
	 * set up the optional md5 and index callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;
	std::vector< callback_type * > cbs;

	md5_callback_type::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") )
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( md5filename.empty() )
		{
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		}
		else
		{
			md5_callback_type::unique_ptr_type Tmd5cb(new md5_callback_type);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	index_callback_type::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( indexfilename.empty() )
		{
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		}
		else
		{
			index_callback_type::unique_ptr_type Tindex(new index_callback_type(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}

	/*
	 * build the filtered header text
	 */
	std::string const origtext(uheader->text);
	std::vector<libmaus2::bambam::HeaderLine> hl = libmaus2::bambam::HeaderLine::extractLines(origtext);

	std::ostringstream filtered;
	uint64_t rscnt = 0; // number of SQ lines seen so far
	uint64_t rgcnt = 0; // number of RG lines seen so far

	for ( uint64_t i = 0; i < hl.size(); ++i )
	{
		libmaus2::bambam::HeaderLine const & cur = hl[i];
		bool keep = true;

		// SQ and RG lines are kept only if their bit is set
		if ( cur.type == "SQ" )
			keep = usedrefseq->get(rscnt++);
		else if ( cur.type == "RG" )
			keep = usedrg->get(rgcnt++);

		if ( keep )
			filtered << cur.line << std::endl;
	}

	std::string const headertext = filtered.str();

	// filtered header text extended by a PG line for this program
	std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
		headertext,
		"bamheaderfilter", // ID
		"bamheaderfilter", // PN
		arginfo.commandline, // CL
		::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
		std::string(PACKAGE_VERSION) // VN
	);
	::libmaus2::bambam::BamHeader uphead(upheadtext);

	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pout (
		libmaus2::bambam::BamBlockWriterBaseFactory::construct(uphead, arginfo, &cbs)
	);

	// input decoder
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(
			arginfo,false // put rank
		)
	);
	::libmaus2::bambam::BamAlignmentDecoder & dec = decwrapper->getDecoder();
	::libmaus2::bambam::BamAlignment & algn = dec.getAlignment();

	while ( dec.readAlignment() )
	{
		// mapped record (paired or not): renumber its reference id
		if ( algn.isMapped() )
		{
			assert ( algn.getRefID() >= 0 );
			assert ( algn.getRefID() < static_cast<int64_t>(usedrefseq->size()) );
			assert ( usedrefseq->get(algn.getRefID()) );
			assert ( usedrefseq->rank1(algn.getRefID())-1 < uphead.getNumRef() );
			algn.putRefId(usedrefseq->rank1(algn.getRefID())-1);
		}

		// paired record with mapped mate: renumber the mate reference id
		if ( algn.isPaired() && algn.isMateMapped() )
		{
			assert ( algn.getNextRefID() >= 0 );
			assert ( algn.getNextRefID() < static_cast<int64_t>(usedrefseq->size()) );
			assert ( usedrefseq->get(algn.getNextRefID()) );
			assert ( usedrefseq->rank1(algn.getNextRefID())-1 < uphead.getNumRef() );
			algn.putNextRefId(usedrefseq->rank1(algn.getNextRefID())-1);
		}

		// erase reference id and position of unmapped records and mates
		if ( algn.isUnmap() )
		{
			algn.putRefId(-1);
			algn.putPos(-1);
		}
		if ( algn.isMateUnmap() )
		{
			algn.putNextRefId(-1);
			algn.putNextPos(-1);
		}

		Pout->writeAlignment(algn);
	}

	// destroy the writer before the digest/index are stored
	Pout.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return 0;
}
int bamvalidateTemplate(::libmaus2::util::ArgInfo const & arginfo) { libmaus2::timing::RealTimeClock rtc; rtc.start(); bool const verbose = arginfo.getValue("verbose",getDefaultVerbose()); bool const basequalhist = arginfo.getValue("basequalhist",getDefaultBaseQualHist()); libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper( libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct( arginfo,false // put rank ) ); ::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder()); ::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec; ::libmaus2::bambam::BamHeader const & header = dec.getHeader(); ::libmaus2::bambam::BamAlignment const & algn = dec.getAlignment(); // add PG line to header std::string const upheadtext = ::libmaus2::bambam::ProgramHeaderLineSet::addProgramLine( header.text, "bamvalidate", // ID "bamvalidate", // PN arginfo.commandline, // CL ::libmaus2::bambam::ProgramHeaderLineSet(header.text).getLastIdInChain(), // PP std::string(PACKAGE_VERSION) // VN ); // construct new header ::libmaus2::bambam::BamHeader uphead(upheadtext); /* * start index/md5 callbacks and alignment writer */ std::string md5filename; std::string indexfilename; std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs; ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb; libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex; libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pout; if ( passthrough ) { std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName()); std::string const tmpfileindex = tmpfilenamebase + "_index"; ::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex); if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) ) { if ( arginfo.hasArg("md5filename") && arginfo.getUnparsedValue("md5filename","") != "" ) md5filename = arginfo.getUnparsedValue("md5filename",""); else std::cerr << "[V] no filename for md5 given, 
not creating hash" << std::endl; if ( md5filename.size() ) { ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5); Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb); cbs.push_back(Pmd5cb.get()); } } if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) ) { if ( arginfo.hasArg("indexfilename") && arginfo.getUnparsedValue("indexfilename","") != "" ) indexfilename = arginfo.getUnparsedValue("indexfilename",""); else std::cerr << "[V] no filename for index given, not creating index" << std::endl; if ( indexfilename.size() ) { libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex)); Pindex = UNIQUE_PTR_MOVE(Tindex); cbs.push_back(Pindex.get()); } } std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0; if ( cbs.size() ) Pcbs = &cbs; libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Tout ( libmaus2::bambam::BamBlockWriterBaseFactory::construct(uphead, arginfo, Pcbs) ); Pout = UNIQUE_PTR_MOVE(Tout); } libmaus2::autoarray::AutoArray<char> lastvalidname(256); // max valid read name is 255 bytes uint64_t alsok = 0; ::libmaus2::autoarray::AutoArray<char> qual; libmaus2::autoarray::AutoArray<uint64_t> H(static_cast<uint64_t>(std::numeric_limits<uint8_t>::max())+1); std::fill(H.begin(),H.end(),0ull); try { while ( dec.readAlignment() ) { if ( passthrough ) Pout->writeAlignment(algn); if ( basequalhist ) { uint64_t const l = algn.getLseq(); uint8_t const * Qc = libmaus2::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin()); uint8_t const * const Qe = Qc + l; while ( Qc != Qe ) H[*(Qc++)]++; } uint64_t const lname = algn.getLReadName(); char const * name = algn.getName(); std::copy(name,name+lname+1,lastvalidname.begin()); alsok += 1; } } catch(std::exception const & ex) { std::cerr << "[E] name of last valid alignment was " << lastvalidname.begin() << std::endl; std::cerr << "[E] read " << alsok << " valid 
alignments" << std::endl; throw; } Pout.reset(); if ( Pmd5cb ) { Pmd5cb->saveDigestAsFile(md5filename); } if ( Pindex ) { Pindex->flush(std::string(indexfilename)); } if ( verbose ) std::cerr << "[V] checked " << alsok << " alignments in " << rtc.formatTime(rtc.getElapsedSeconds()) << " (" << alsok / rtc.getElapsedSeconds() << " al/s)" << std::endl; if ( basequalhist ) { uint64_t const s = std::accumulate(H.begin(),H.end(),0ull); uint64_t a = 0; uint64_t minq = std::numeric_limits<uint64_t>::max(); uint64_t maxq = 0; for ( uint64_t i = 0; i < H.size(); ++i ) if ( H[i] ) { minq = std::min(minq,i); maxq = std::max(maxq,i); a += H[i]; std::cerr << "[H]\t" << i << "\t"; if ( ( static_cast<uint64_t>(i+33) < static_cast<uint64_t>(std::numeric_limits<char>::max()) && isprint(i+33)) ) std::cerr << static_cast<char>(i+33); std::cerr << "\t" << H[i] << "\t" << (H[i] / static_cast<double>(s)) << "\t" << (a / static_cast<double>(s)) << std::endl; } if ( s ) { std::cerr << "[H]\tmin\t" << minq << "\t"; if ( ( static_cast<uint64_t>(minq+33) < static_cast<uint64_t>(std::numeric_limits<char>::max()) && isprint(minq+33)) ) std::cerr << static_cast<char>(minq+33); std::cerr << std::endl; std::cerr << "[H]\tmax\t" << maxq << "\t"; if ( ( static_cast<uint64_t>(maxq+33) < static_cast<uint64_t>(std::numeric_limits<char>::max()) && isprint(maxq+33)) ) std::cerr << static_cast<char>(maxq+33); std::cerr << std::endl; } } return EXIT_SUCCESS; }
/**
 * Recompute the CIGAR string of every mapped alignment against a FastA
 * reference and write all alignments (mapped or not) to the output writer.
 *
 * The reference sequence for a record is loaded through the FastA index
 * stored at "<reference>.fai" whenever the record's reference id differs
 * from the one currently loaded. A reference id lower than the loaded one
 * is treated as an unsorted input and raises an exception.
 *
 * @param arginfo parsed command line (keys read here: verbose, reference,
 *        tmpfile, md5, md5filename, index, indexfilename; further keys are
 *        read by the helpers and factories called)
 * @return EXIT_SUCCESS
 * @throws libmaus2::exception::LibMausException if standard output is a
 *         terminal, the reference key is missing, the reference file does
 *         not exist, or the reference ids of mapped records decrease
 **/
int bamrecalculatecigar(libmaus2::util::ArgInfo const & arginfo)
{
	typedef ::libmaus2::lz::BgzfDeflateOutputCallback callback_type;
	typedef ::libmaus2::lz::BgzfDeflateOutputCallbackMD5 md5_callback_type;
	typedef libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex index_callback_type;

	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());

	// input decoder
	libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper(
		libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(
			arginfo,false // put rank
		)
	);
	libmaus2::bambam::BamAlignmentDecoder & bamdec = decwrapper->getDecoder();
	libmaus2::bambam::BamAlignment & algn = bamdec.getAlignment();
	libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

	/*
	 * set up the optional md5 and index callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;
	std::vector< callback_type * > cbs;

	md5_callback_type::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") )
			md5filename = arginfo.getUnparsedValue("md5filename","");

		if ( md5filename.empty() )
		{
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
		}
		else
		{
			md5_callback_type::unique_ptr_type Tmd5cb(new md5_callback_type);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	index_callback_type::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");

		if ( indexfilename.empty() )
		{
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;
		}
		else
		{
			index_callback_type::unique_ptr_type Tindex(new index_callback_type(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}

	// the writer receives a null pointer when no callback was registered
	std::vector< callback_type * > * Pcbs = cbs.size() ? &cbs : 0;

	libmaus2::bambam::BamBlockWriterBase::unique_ptr_type writer(
		libmaus2::bambam::BamBlockWriterBaseFactory::construct(*uphead, arginfo, Pcbs)
	);

	// scratch buffers reused for every alignment
	libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigopin;
	libmaus2::autoarray::AutoArray<char> readdata;
	libmaus2::bambam::BamAlignment::D_array_type T;

	if ( ! arginfo.hasArg("reference") )
	{
		libmaus2::exception::LibMausException se;
		se.getStream() << "reference key is missing." << std::endl;
		se.finish();
		throw se;
	}

	std::string const reference = arginfo.getUnparsedValue("reference","");

	if ( ! libmaus2::util::GetFileSize::fileExists(reference) )
	{
		libmaus2::exception::LibMausException se;
		se.getStream() << "file " << reference << " does not exist." << std::endl;
		se.finish();
		throw se;
	}

	libmaus2::fastx::FastAIndex::unique_ptr_type FAindex(libmaus2::fastx::FastAIndex::load(reference + ".fai"));
	libmaus2::aio::InputStreamInstance FAISI(reference);

	uint64_t c = 0; // number of alignments processed
	libmaus2::autoarray::AutoArray<char> ref; // currently loaded reference sequence
	int64_t refloaded = -1; // id of the sequence held in ref, -1 for none

	while ( bamdec.readAlignment() )
	{
		if ( algn.isMapped() )
		{
			int32_t const refid = algn.getRefID();
			assert ( refid >= 0 );

			if ( refid != refloaded )
			{
				// reference ids may only grow
				if ( refid < refloaded )
				{
					libmaus2::exception::LibMausException lme;
					lme.getStream() << "bamrecalculatecigar: file is not sorted by coordinate" << std::endl;
					lme.finish();
					throw lme;
				}

				ref = FAindex->readSequence(FAISI,refid);
				refloaded = refid;
			}

			uint64_t const numcig = libmaus2::bambam::BamAlignmentDecoderBase::recalculateCigar(
				algn.D.begin(),
				ref.begin() + algn.getPos(),
				cigopin,
				readdata
			);

			algn.replaceCigarString(cigopin,numcig,T);
		}

		writer->writeAlignment(algn);

		// progress report every 2^20 alignments
		++c;
		if ( verbose && (c & ((1ull<<20)-1)) == 0 )
			std::cerr << "[V] " << c << std::endl;
	}

	if ( verbose )
		std::cerr << "[V] " << c << std::endl;

	// destroy the writer before the digest/index are stored
	writer.reset();

	if ( Pmd5cb )
		Pmd5cb->saveDigestAsFile(md5filename);
	if ( Pindex )
		Pindex->flush(std::string(indexfilename));

	return EXIT_SUCCESS;
}
int bamfiltermc(libmaus2::util::ArgInfo const & arginfo) { bool const verbose = arginfo.getValue("verbose",getDefaultVerbose()); libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper( libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo)); ::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder()); ::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec; ::libmaus2::bambam::BamHeader const & header = dec.getHeader(); ::libmaus2::bambam::BamAlignment & algn = dec.getAlignment(); std::string const tmpfilenamebase = arginfo.getUnparsedValue("tmpfile",arginfo.getDefaultTmpFileName()); uint64_t const numthreads = arginfo.getValueUnsignedNumeric<uint64_t>("numthreads",getDefaultNumThreads()); /* * start index/md5 callbacks */ std::string const tmpfileindex = tmpfilenamebase + "_index"; ::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex); std::string md5filename; std::string indexfilename; std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs; ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb; if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) ) { if ( libmaus2::bambam::BamBlockWriterBaseFactory::getMD5FileName(arginfo) != std::string() ) md5filename = libmaus2::bambam::BamBlockWriterBaseFactory::getMD5FileName(arginfo); else std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl; if ( md5filename.size() ) { ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5); Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb); cbs.push_back(Pmd5cb.get()); } } libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex; if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) ) { if ( libmaus2::bambam::BamBlockWriterBaseFactory::getIndexFileName(arginfo) != std::string() ) indexfilename = libmaus2::bambam::BamBlockWriterBaseFactory::getIndexFileName(arginfo); else std::cerr << "[V] no 
filename for index given, not creating index" << std::endl; if ( indexfilename.size() ) { libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex)); Pindex = UNIQUE_PTR_MOVE(Tindex); cbs.push_back(Pindex.get()); } } std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0; if ( cbs.size() ) Pcbs = &cbs; /* * end md5/index callbacks */ ::libmaus2::bambam::BamHeader::unique_ptr_type genuphead( libmaus2::bambam::BamHeaderUpdate::updateHeader(arginfo,header,"bamfiltermc",std::string(PACKAGE_VERSION)) ); libmaus2::bambam::BamBlockWriterBase::unique_ptr_type Pwriter(libmaus2::bambam::BamBlockWriterBaseFactory::construct(*genuphead,arginfo,Pcbs)); libmaus2::bambam::BamBlockWriterBase & wr = *Pwriter; // freelist size uint64_t const flsize = 16*1024; libmaus2::util::FreeList < libmaus2::bambam::BamAlignment, BamAlignmentFreeListDefaultAllocator, BamAlignmentFreeListDefaultTypeInfo > FL(flsize); libmaus2::util::SimpleQueue < libmaus2::bambam::BamAlignment::shared_ptr_type > Q; libmaus2::bambam::BamAuxFilterVector auxvec; auxvec.set('M','C'); uint64_t alcnt = 0; while ( dec.readAlignment() ) { if ( FL.empty() ) handleQueue(Q,FL,wr,auxvec,numthreads); assert ( ! FL.empty() ); libmaus2::bambam::BamAlignment::shared_ptr_type P = FL.get(); P->swap(algn); Q.push_back(P); if ( verbose && ((++alcnt % (1024*1024)) == 0) ) std::cerr << "[V] " << alcnt << std::endl; } handleQueue(Q,FL,wr,auxvec,numthreads); // reset BAM writer Pwriter.reset(); if ( Pmd5cb ) Pmd5cb->saveDigestAsFile(md5filename); if ( Pindex ) Pindex->flush(std::string(indexfilename)); return EXIT_SUCCESS; }
/**
 * Copy a BAM stream from standard input to standard output, dropping every
 * alignment that has at least one of the excluded flags set.
 *
 * With numthreads equal to 1 the stream is decoded and re-encoded by a
 * BamDecoder/BamWriter pair; for any other value a BamParallelRewrite
 * object is used. The compression level validated from the "level" key is
 * handed to the writer in both cases. Progress is reported on standard
 * error every 1024*1024 records (including record 0), followed by final
 * kept/removed counts. An MD5 digest and/or BAM index is stored at the end
 * when the md5/index keys are non-zero and the matching
 * md5filename/indexfilename key is non-empty.
 *
 * @param arginfo parsed command line (keys read here: exclude, level,
 *        numthreads, tmpfile, md5, md5filename, index, indexfilename)
 * @return EXIT_SUCCESS
 **/
int bamfilterflags(::libmaus2::util::ArgInfo const & arginfo)
{
	uint32_t const excludeflags = libmaus2::bambam::BamFlagBase::stringToFlags(arginfo.getValue<std::string>("exclude",""));

	std::cerr << "[V] excluding " << excludeflags << std::endl;

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",Z_DEFAULT_COMPRESSION));
	uint64_t const numthreads = arginfo.getValue<uint64_t>("numthreads",1);
	uint64_t cnt = 0;  // records read
	uint64_t kept = 0; // records written

	/*
	 * start index/md5 callbacks
	 */
	std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;
	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") && arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}
	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") && arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}
	// writers receive a null pointer when no callback was registered
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	if ( numthreads == 1 )
	{
		::libmaus2::bambam::BamDecoder BD(std::cin);
		::libmaus2::bambam::BamHeader const & bamheader = BD.getHeader();
		::libmaus2::bambam::BamHeader::unique_ptr_type uphead(libmaus2::bambam::BamHeaderUpdate::updateHeader(arginfo,bamheader,"bamfilterflags",std::string(PACKAGE_VERSION)));

		::libmaus2::bambam::BamAlignment & alignment = BD.getAlignment();
		// writer goes out of scope at the end of this branch, i.e. before
		// the digest/index are stored below
		::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,*uphead,level,Pcbs));

		for ( ; BD.readAlignment(); ++cnt )
		{
			if ( cnt % (1024*1024) == 0 )
				std::cerr << "[V] processed " << cnt << " kept " << kept << " removed " << (cnt-kept) << std::endl;

			if ( ! (alignment.getFlags() & excludeflags) )
			{
				alignment.serialise(writer->getStream());
				++kept;
			}
		}

		std::cerr << "[V] " << cnt << std::endl;
	}
	else
	{
		::libmaus2::bambam::BamHeaderUpdate UH(arginfo,"bamfilterflags",std::string(PACKAGE_VERSION));
		// use the validated user level here as well; this call used to be
		// hard-wired to Z_DEFAULT_COMPRESSION, ignoring the level key
		libmaus2::bambam::BamParallelRewrite BPR(std::cin,UH,std::cout,level,numthreads,4 /* blocks per thread */,Pcbs);
		libmaus2::bambam::BamAlignmentDecoder & dec = BPR.getDecoder();
		libmaus2::bambam::BamParallelRewrite::writer_type & writer = BPR.getWriter();
		libmaus2::bambam::BamAlignment const & algn = dec.getAlignment();

		for ( ; dec.readAlignment(); ++cnt )
		{
			if ( cnt % (1024*1024) == 0 )
				std::cerr << "[V] processed " << cnt << " kept " << kept << " removed " << (cnt-kept) << std::endl;

			if ( ! (algn.getFlags() & excludeflags) )
			{
				algn.serialise(writer.getStream());
				++kept;
			}
		}

		std::cerr << "[V] " << cnt << std::endl;
	}

	std::cerr << "[V] kept " << kept << " removed " << cnt-kept << std::endl;

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}
int bamheap2(libmaus2::util::ArgInfo const & arginfo) { bool const verbose = arginfo.getValue("verbose",getDefaultVerbose()); std::string const reference = arginfo.getUnparsedValue("reference",std::string()); std::string const outputprefix = arginfo.getUnparsedValue("outputprefix",std::string()); libmaus2::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper( libmaus2::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo)); ::libmaus2::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder()); ::libmaus2::bambam::BamAlignmentDecoder & dec = *ppdec; ::libmaus2::bambam::BamHeader const & header = dec.getHeader(); ::libmaus2::bambam::BamAlignment const & algn = dec.getAlignment(); double const damult = arginfo.getValue<double>("amult",1); double const dcmult = arginfo.getValue<double>("cmult",1); double const dgmult = arginfo.getValue<double>("gmult",1); double const dtmult = arginfo.getValue<double>("tmult",1); double const dpadmult = arginfo.getValue<double>("padmult",1); double maxmult = 0; maxmult = std::max(damult,maxmult); maxmult = std::max(dcmult,maxmult); maxmult = std::max(dgmult,maxmult); maxmult = std::max(dtmult,maxmult); maxmult = std::max(dpadmult,maxmult); uint64_t const amult = std::floor((damult / maxmult) * (1ull<<16) + 0.5); uint64_t const cmult = std::floor((dcmult / maxmult) * (1ull<<16) + 0.5); uint64_t const gmult = std::floor((dgmult / maxmult) * (1ull<<16) + 0.5); uint64_t const tmult = std::floor((dtmult / maxmult) * (1ull<<16) + 0.5); uint64_t const padmult = std::floor((dpadmult / maxmult) * (1ull<<16) + 0.5); libmaus2::fastx::FastAIndex::unique_ptr_type Pindex; libmaus2::aio::InputStreamInstance::unique_ptr_type PCIS; if ( reference.size() ) { libmaus2::fastx::FastAIndex::unique_ptr_type Tindex( libmaus2::fastx::FastAIndex::load(reference+".fai") ); Pindex = UNIQUE_PTR_MOVE(Tindex); libmaus2::aio::InputStreamInstance::unique_ptr_type TCIS(new libmaus2::aio::InputStreamInstance(reference)); PCIS = 
UNIQUE_PTR_MOVE(TCIS); } libmaus2::autoarray::AutoArray<libmaus2::bambam::cigar_operation> cigop; libmaus2::autoarray::AutoArray<char> bases; int64_t prevrefid = -1; std::string refidname = "*"; std::map< uint64_t, HeapEntry > M; uint64_t alcnt = 0; std::vector< std::pair<char,uint8_t> > pendinginserts; int64_t loadedRefId = -1; int64_t streamRefId = -1; libmaus2::autoarray::AutoArray<char> refseqbases; ConsensusAccuracy * consacc = 0; std::map<uint64_t,ConsensusAccuracy> Mconsacc; typedef libmaus2::util::shared_ptr<std::ostringstream>::type stream_ptr_type; stream_ptr_type Pstream; ConsensusAux Caux; Caux.M['a'] = Caux.M['A'] = amult; Caux.M['c'] = Caux.M['C'] = cmult; Caux.M['g'] = Caux.M['G'] = gmult; Caux.M['t'] = Caux.M['T'] = tmult; Caux.M[padsym] = padmult; while ( dec.readAlignment() ) { if ( algn.isMapped() && (!algn.isQCFail()) && algn.getLseq() ) { assert ( ! pendinginserts.size() ); uint32_t const numcigop = algn.getCigarOperations(cigop); uint64_t readpos = 0; uint64_t refpos = algn.getPos(); uint64_t const seqlen = algn.decodeRead(bases); uint8_t const * qual = libmaus2::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin()); // handle finished columns if ( algn.getRefID() != prevrefid ) { while ( M.size() ) { HeapEntry & H = M.begin()->second; if ( outputprefix.size() && (streamRefId != prevrefid) ) { if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus2::aio::OutputStreamInstance PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } stream_ptr_type Tstream(new std::ostringstream); Pstream = Tstream; streamRefId = prevrefid; } if ( Pindex && (loadedRefId != prevrefid) ) { refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname)); loadedRefId = prevrefid; if ( Mconsacc.find(loadedRefId) == Mconsacc.end() ) Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size()); consacc = 
&(Mconsacc[loadedRefId]); } H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get()); M.erase(M.begin()); } prevrefid = algn.getRefID(); refidname = header.getRefIDName(prevrefid); } else { while ( M.size() && M.begin()->first < refpos ) { HeapEntry & H = M.begin()->second; if ( outputprefix.size() && (streamRefId != prevrefid) ) { if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus2::aio::OutputStreamInstance PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } stream_ptr_type Tstream(new std::ostringstream); Pstream = Tstream; streamRefId = prevrefid; } if ( Pindex && (loadedRefId != prevrefid) ) { refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname)); loadedRefId = prevrefid; if ( Mconsacc.find(loadedRefId) == Mconsacc.end() ) Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size()); consacc = &(Mconsacc[loadedRefId]); } H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? 
static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get()); M.erase(M.begin()); } } for ( uint64_t ci = 0; ci < numcigop; ++ci ) { uint64_t const ciglen = cigop[ci].second; switch ( cigop[ci].first ) { case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CMATCH: case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CEQUAL: case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDIFF: { if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); pendinginserts.resize(0); } for ( uint64_t i = 0; i < ciglen; ++i ) { M[refpos].V.push_back(std::make_pair(bases[readpos],qual[readpos])); readpos++; refpos++; } break; } case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CINS: { for ( uint64_t i = 0; i < ciglen; ++i, ++readpos ) pendinginserts.push_back(std::make_pair(bases[readpos],qual[readpos])); break; } case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CDEL: // handle pending inserts if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); pendinginserts.resize(0); } // deleting bases from the reference for ( uint64_t i = 0; i < ciglen; ++i, ++refpos ) M[refpos].V.push_back(std::make_pair(padsym,0)); break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CREF_SKIP: // handle pending inserts if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); pendinginserts.resize(0); } // skip bases on reference for ( uint64_t i = 0; i < ciglen; ++i ) { refpos++; } break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CSOFT_CLIP: // skip bases on read for ( uint64_t i = 0; i < ciglen; ++i ) { readpos++; } break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CHARD_CLIP: break; case libmaus2::bambam::BamFlagBase::LIBMAUS2_BAMBAM_CPAD: { for ( uint64_t i = 0; i < ciglen; ++i, ++readpos ) pendinginserts.push_back(std::make_pair(padsym,0)); break; } } } if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); M[refpos].iadd++; pendinginserts.resize(0); } assert ( readpos == seqlen ); } if ( verbose && 
((++alcnt % (1024*1024)) == 0) ) std::cerr << "[V] " << alcnt << std::endl; } while ( M.size() ) { HeapEntry & H = M.begin()->second; if ( outputprefix.size() && (streamRefId != prevrefid) ) { if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus2::aio::OutputStreamInstance PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } stream_ptr_type Tstream(new std::ostringstream); Pstream = Tstream; streamRefId = prevrefid; } if ( Pindex && (loadedRefId != prevrefid) ) { refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname)); loadedRefId = prevrefid; if ( Mconsacc.find(loadedRefId) == Mconsacc.end() ) Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size()); consacc = &(Mconsacc[loadedRefId]); } H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get()); M.erase(M.begin()); } if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus2::aio::OutputStreamInstance PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } ConsensusAccuracy constotal; for ( std::map<uint64_t,ConsensusAccuracy>::const_iterator ita = Mconsacc.begin(); ita != Mconsacc.end(); ++ita ) { std::cerr << header.getRefIDName(ita->first) << "\t" << ita->second << std::endl; std::map<uint64_t,uint64_t> const M = ita->second.depthhistogram.get(); uint64_t total = 0; uint64_t preavg = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { total += aita->second; preavg += aita->first * aita->second; } uint64_t acc = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { acc += aita->second; std::cerr << "H[" << 
header.getRefIDName(ita->first) << "," << aita->first << ",+]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } acc = 0; for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita ) { acc += aita->second; std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",-]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } std::cerr << "H[" << header.getRefIDName(ita->first) << ",avg]\t" << static_cast<double>(preavg)/total << std::endl; constotal += ita->second; } if ( Mconsacc.size() ) { std::cerr << "all\t" << constotal << std::endl; std::map<uint64_t,uint64_t> const M = constotal.depthhistogram.get(); uint64_t total = 0; uint64_t preavg = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { total += aita->second; preavg += aita->first * aita->second; } uint64_t acc = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { acc += aita->second; std::cerr << "H[" << "all" << "," << aita->first << ",+]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } acc = 0; for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita ) { acc += aita->second; std::cerr << "H[" << "all" << "," << aita->first << ",-]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } std::cerr << "H[all,avg]\t" << static_cast<double>(preavg) / total << std::endl; } return EXIT_SUCCESS; }
/**
 * Convert a set of FastA files to a compact array using 2 bits per symbol.
 *
 * Input file names are taken from the non key=value arguments (restargs) and, if the
 * key "inputfilenames" is given, additionally from the non-empty lines of the file it names.
 *
 * Arguments read from arginfo:
 *  - rc (default 1): if non-zero, the reverse complement of each sequence is stored after the sequence
 *  - gz (default 1): if non-zero, each input file is read through a BufferedGzipStream
 *  - outputfilename: name of the compact output file; the default is the longest common prefix of the
 *    input file names with trailing ".gz", ".fasta" and ".fa" clipped off, followed by ".compact"
 *  - verbose (default 1): if non-zero, print progress for each sequence on std::cerr
 *
 * Symbols a,c,g,t (either case) are mapped to 0,1,2,3. Each maximal run of any other symbols is
 * replaced by random bases and its bounds [l,h) are recorded in the meta file.
 *
 * Layout of the meta file (outputfilename + ".meta"), all entries written via NumberSerialisation:
 *   number of sequences
 *   for each sequence: sequence length, number nr of replaced intervals, nr pairs (l,h)
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 **/
int fagzToCompact4(libmaus2::util::ArgInfo const & arginfo)
{
	bool const rc = arginfo.getValue<unsigned int>("rc",1);
	bool const gz = arginfo.getValue<unsigned int>("gz",1);

	std::vector<std::string> inputfilenames;
	inputfilenames = arginfo.restargs;

	// optionally append file names listed (one per line) in a file
	if ( arginfo.hasArg("inputfilenames") )
	{
		std::string const inf = arginfo.getUnparsedValue("inputfilenames",std::string());
		libmaus2::aio::InputStream::unique_ptr_type Pinf(libmaus2::aio::InputStreamFactoryContainer::constructUnique(inf));

		std::string line;
		while ( std::getline(*Pinf,line) )
			if ( line.size() )
				inputfilenames.push_back(line);
	}

	// derive the default output name from the common prefix of all input names
	std::string const inlcp = libmaus2::util::OutputFileNameTools::lcp(inputfilenames);
	std::string defout = inlcp;
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".gz");
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fasta");
	defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fa");

	std::string const outputfilename = arginfo.getUnparsedValue("outputfilename",defout + ".compact");
	std::string const metaoutputfilename = outputfilename + ".meta";

	int const verbose = arginfo.getValue<int>("verbose",1);
	libmaus2::bitio::CompactArrayWriterFile compactout(outputfilename,2 /* bits per symbol */);

	if ( ! rc )
		std::cerr << "[V] not storing reverse complements" << std::endl;

	// forward mapping table (character -> code, 4 marks a non regular symbol)
	libmaus2::autoarray::AutoArray<uint8_t> ftable(256,false);
	// rc mapping for mapped symbols (code -> code of complement)
	libmaus2::autoarray::AutoArray<uint8_t> ctable(256,false);

	std::fill(ftable.begin(),ftable.end(),4);
	std::fill(ctable.begin(),ctable.end(),4);

	ftable['a'] = ftable['A'] = 0;
	ftable['c'] = ftable['C'] = 1;
	ftable['g'] = ftable['G'] = 2;
	ftable['t'] = ftable['T'] = 3;

	ctable[0] = 3; // A->T
	ctable[1] = 2; // C->G
	ctable[2] = 1; // G->C
	ctable[3] = 0; // T->A

	// accumulated input size (sequence length plus one per sequence)
	uint64_t insize = 0;

	libmaus2::aio::OutputStreamInstance::unique_ptr_type metaOut(new libmaus2::aio::OutputStreamInstance(metaoutputfilename));
	// placeholder for the number of sequences, patched once all input has been processed
	libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);
	uint64_t nseq = 0;
	// sequence lengths, kept for checking the meta file after writing
	std::vector<uint64_t> lvec;

	// size of one serialised number as a signed quantity, so that seek offsets are computed
	// in signed arithmetic (int64_t * size_t would be evaluated as unsigned on LP64 systems)
	int64_t const numbersize = static_cast<int64_t>(sizeof(uint64_t));

	for ( uint64_t i = 0; i < inputfilenames.size(); ++i )
	{
		std::string const fn = inputfilenames[i];
		libmaus2::aio::InputStreamInstance CIS(fn);
		libmaus2::lz::BufferedGzipStream::unique_ptr_type BGS;
		std::istream * istr = 0;

		if ( gz )
		{
			libmaus2::lz::BufferedGzipStream::unique_ptr_type tBGS(
				new libmaus2::lz::BufferedGzipStream(CIS));
			BGS = UNIQUE_PTR_MOVE(tBGS);
			istr = BGS.get();
		}
		else
		{
			istr = &CIS;
		}

		libmaus2::fastx::StreamFastAReaderWrapper fain(*istr);
		libmaus2::fastx::StreamFastAReaderWrapper::pattern_type pattern;

		while ( fain.getNextPatternUnlocked(pattern) )
		{
			if ( verbose )
				std::cerr << (i+1) << " " << stripAfterDot(basename(fn)) << " " << pattern.sid << "...";

			// sequence length
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,pattern.spattern.size());
			lvec.push_back(pattern.spattern.size());
			// placeholder for the number of replaced intervals, patched below
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);

			// map symbols
			for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
				pattern.spattern[j] = ftable[static_cast<uint8_t>(pattern.spattern[j])];

			// replace blocks of N symbols by random bases
			uint64_t l = 0;
			// number of replaced blocks
			uint64_t nr = 0;
			while ( l < pattern.spattern.size() )
			{
				// skip regular bases
				while ( l < pattern.spattern.size() && pattern.spattern[l] < 4 )
					++l;
				assert ( l == pattern.spattern.size() || pattern.spattern[l] == 4 );

				// go to end of non regular bases block
				uint64_t h = l;
				while ( h < pattern.spattern.size() && pattern.spattern[h] == 4 )
					++h;

				// if non regular block is not empty
				if ( h-l )
				{
					// replace by random bases
					for ( uint64_t j = l; j < h; ++j )
						pattern.spattern[j] = (libmaus2::random::Random::rand8() & 3);

					// write bounds
					libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,l);
					libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,h);

					// add to interval counter
					nr += 1;
				}

				l = h;
			}

			// make sure there are no more irregular bases
			for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
				assert ( pattern.spattern[j] < 4 );

			// go back to the interval count placeholder (nr pairs of bounds plus the placeholder itself)
			metaOut->seekp( -(static_cast<int64_t>(2*nr+1) * numbersize), std::ios::cur );
			// write number of intervals replaced
			libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nr);
			// skip interval bounds already written
			metaOut->seekp( static_cast<int64_t>(2*nr) * numbersize, std::ios::cur );

			// write bases
			compactout.write(pattern.spattern.c_str(),pattern.spattern.size());

			// write reverse complement if requested
			if ( rc )
			{
				// reverse complement
				std::reverse(pattern.spattern.begin(),pattern.spattern.end());
				for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
					pattern.spattern[j] = ctable[static_cast<uint8_t>(pattern.spattern[j])];

				// write
				compactout.write(pattern.spattern.c_str(),pattern.spattern.size());
			}

			insize += pattern.spattern.size()+1;
			nseq += 1;

			if ( verbose )
				std::cerr << "done, input size " << formatBytes(pattern.spattern.size()+1) << " acc " << formatBytes(insize) << std::endl;
		}
	}

	// patch the number of sequences at the start of the meta file
	metaOut->seekp(0);
	libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nseq);
	metaOut->flush();
	metaOut.reset();

	// read the meta file back and check it against what was written
	libmaus2::aio::InputStreamInstance::unique_ptr_type metaISI(new libmaus2::aio::InputStreamInstance(metaoutputfilename));
	// number of sequences
	uint64_t const rnseq = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
	assert ( nseq == rnseq );
	for ( uint64_t i = 0; i < nseq; ++i )
	{
		// length of sequence
		uint64_t const l = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
		assert ( l == lvec[i] );
		// number of replaced intervals
		uint64_t const nr = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
		// skip replaced intervals
		metaISI->ignore(2*nr*sizeof(uint64_t));
	}
	assert ( metaISI->peek() == std::istream::traits_type::eof() );

	std::cerr << "Done, total input size " << insize << std::endl;

	compactout.flush();

	return EXIT_SUCCESS;
}
/**
 * Concatenate BAM files and write the result as a single BAM stream to standard output.
 *
 * Input file names are taken from all I=<file> arguments followed by the non key=value
 * arguments (restargs). The output header is produced by updateHeader() from the header
 * provided by the BamCat decoder.
 *
 * Arguments read from arginfo:
 *  - level: compression level, checked by BamBlockWriterBaseFactory::checkCompressionLevel
 *  - verbose: if non-zero, print the number of alignments processed on std::cerr
 *  - streaming: passed on to the BamCat decoder
 *  - tmpfile: base name for temporary files (the index callback uses <tmpfile>_index)
 *  - md5, md5filename: if md5 is non-zero and md5filename is non-empty, store an MD5 digest of the output
 *  - index, indexfilename: if index is non-zero and indexfilename is non-empty, store a BAM index
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 * @throws libmaus2::exception::LibMausException if standard output is a terminal
 **/
int bamcat(libmaus2::util::ArgInfo const & arginfo)
{
	if ( isatty(STDOUT_FILENO) )
	{
		::libmaus2::exception::LibMausException se;
		se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
		se.finish();
		throw se;
	}

	int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(arginfo.getValue<int>("level",getDefaultLevel()));
	int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
	int const streaming = arginfo.getValue<int>("streaming",getDefaultStreaming());

	// I=<file> arguments first, then the remaining positional arguments
	std::vector<std::string> inputfilenames = arginfo.getPairValues("I");
	inputfilenames.insert(inputfilenames.end(),arginfo.restargs.begin(),arginfo.restargs.end());

	libmaus2::bambam::BamCat bamdec(inputfilenames, false /* put rank */, streaming);
	libmaus2::bambam::BamAlignment const & algn = bamdec.getAlignment();
	libmaus2::bambam::BamHeader const & header = bamdec.getHeader();
	::libmaus2::bambam::BamHeader::unique_ptr_type uphead(updateHeader(arginfo,header));

	/*
	 * start index/md5 callbacks
	 */
	// the temporary file name is read unparsed, in the same way as the md5/index file names below
	std::string const tmpfilenamebase = arginfo.getUnparsedValue("tmpfile",arginfo.getDefaultTmpFileName());
	std::string const tmpfileindex = tmpfilenamebase + "_index";
	::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

	std::string md5filename;
	std::string indexfilename;

	// callbacks handed to the writer; the pointed-to objects are owned by Pmd5cb and Pindex
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;

	::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
	if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
	{
		if ( arginfo.hasArg("md5filename") && arginfo.getUnparsedValue("md5filename","") != "" )
			md5filename = arginfo.getUnparsedValue("md5filename","");
		else
			std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;

		if ( md5filename.size() )
		{
			::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
			Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
			cbs.push_back(Pmd5cb.get());
		}
	}

	libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
	if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
	{
		if ( arginfo.hasArg("indexfilename") && arginfo.getUnparsedValue("indexfilename","") != "" )
			indexfilename = arginfo.getUnparsedValue("indexfilename","");
		else
			std::cerr << "[V] no filename for index given, not creating index" << std::endl;

		if ( indexfilename.size() )
		{
			libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
			Pindex = UNIQUE_PTR_MOVE(Tindex);
			cbs.push_back(Pindex.get());
		}
	}

	// the writer receives a null pointer if no callback was set up
	std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
	if ( cbs.size() )
		Pcbs = &cbs;
	/*
	 * end md5/index callbacks
	 */

	::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,*uphead,level,Pcbs));
	libmaus2::bambam::BamWriter::stream_type & bamoutstr = writer->getStream();

	// copy all alignments; in verbose mode report progress every 2^20 alignments and the final count
	uint64_t const progressmask = (1ull<<20)-1;
	uint64_t c = 0;
	while ( bamdec.readAlignment() )
	{
		algn.serialise(bamoutstr);
		++c;

		if ( verbose && ((c & progressmask) == 0) )
			std::cerr << "[V] " << c << std::endl;
	}
	if ( verbose )
		std::cerr << "[V] " << c << std::endl;

	// destroy the writer before the md5/index results are stored
	writer.reset();

	if ( Pmd5cb )
	{
		Pmd5cb->saveDigestAsFile(md5filename);
	}
	if ( Pindex )
	{
		Pindex->flush(std::string(indexfilename));
	}

	return EXIT_SUCCESS;
}