/**
 * Read FastA records from standard input, write them to out through a
 * libmaus::lz::BgzfDeflate compressor and build a binary index alongside.
 *
 * For every record the header line ('>' + id + '\n'), the sequence (cut into
 * chunks of at most getInputBufferSize() bytes) and a terminating newline are
 * written with writeSyncedCount().
 *
 * Layout of the serialised index:
 *  - the compressor input buffer size
 *  - per record: full id, stripped id, sequence length, the stream offset
 *    before the header line, the number of sequence blocks, the stream offset
 *    before each sequence block and the stream offset after the last block
 *  - trailer: number of records, the index offset of each record entry and
 *    finally the index offset at which the trailer itself starts
 *
 * The "stream offset" is the running sum of the second component of the pairs
 * returned by writeSyncedCount() (presumably compressed byte counts -- confirm
 * against BgzfDeflate).
 *
 * @param arginfo argument container; reads "level" (compression level) and
 *                "index" (index file name, no index file is written if empty)
 * @param out     stream receiving the compressed data
 **/
void normalisefastaBgzf(libmaus::util::ArgInfo const & arginfo, std::ostream & out)
{
	libmaus::fastx::StreamFastAReaderWrapper fastain(std::cin);
	libmaus::fastx::StreamFastAReaderWrapper::pattern_type record;

	int const level = libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(
		arginfo.getValue("level",getDefaultLevel())
	);
	std::string const indexfn = arginfo.getUnparsedValue("index","");
	// NOTE(review): level was already passed through checkCompressionLevel above;
	// this second call is kept as is, it looks redundant -- confirm and drop.
	::libmaus::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(level);

	libmaus::lz::BgzfDeflate<std::ostream> compressor(out,level,false /* full flush */);
	uint64_t const blocksize = compressor.getInputBufferSize();

	// running offset in the compressed stream
	uint64_t zoffset = 0;
	// running offset in the serialised index
	uint64_t ioffset = 0;

	std::vector<libmaus::fastx::BgzfFastAIndexEntry> entries;
	std::ostringstream indexstr;

	ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,blocksize);

	for ( uint64_t recordid = 0; fastain.getNextPatternUnlocked(record); ++recordid )
	{
		std::string const name = record.getStringId();
		std::string const shortname = stripName(name);
		std::string const & sequence = record.spattern;
		uint64_t const seqlen = sequence.size();
		uint64_t const numblocks = (seqlen + blocksize - 1)/blocksize;

		// remember where this record's index entry starts
		entries.push_back(libmaus::fastx::BgzfFastAIndexEntry(shortname,recordid,ioffset));

		ioffset += libmaus::util::StringSerialisation::serialiseString(indexstr,name);
		ioffset += libmaus::util::StringSerialisation::serialiseString(indexstr,shortname);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,seqlen);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,numblocks);

		// header line
		std::string const headerline = std::string(">") + name + "\n";
		std::pair<uint64_t,uint64_t> const headercnt =
			compressor.writeSyncedCount(headerline.c_str(),headerline.size());
		zoffset += headercnt.second;

		// sequence data, one synced write per chunk
		char const * cursor = sequence.c_str();
		uint64_t written = 0;
		while ( written < seqlen )
		{
			assert ( written % blocksize == 0 );

			uint64_t const remaining = seqlen - written;
			uint64_t const chunk = (remaining < blocksize) ? remaining : blocksize;

			std::pair<uint64_t,uint64_t> const chunkcnt = compressor.writeSyncedCount(cursor,chunk);

			// the index stores the offset at which the chunk just written started
			ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);
			zoffset += chunkcnt.second;

			cursor += chunk;
			written += chunk;
		}

		// offset following the last sequence block
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,zoffset);

		// line terminator after the sequence
		std::pair<uint64_t,uint64_t> const newlinecnt = compressor.writeSyncedCount("\n",1);
		zoffset += newlinecnt.second;
	}

	compressor.flush();
	out << std::flush;

	// index trailer
	uint64_t const imetaoffset = ioffset;
	ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,entries.size());
	for (
		std::vector<libmaus::fastx::BgzfFastAIndexEntry>::const_iterator it = entries.begin();
		it != entries.end();
		++it
	)
		ioffset += libmaus::util::NumberSerialisation::serialiseNumber(indexstr,it->ioffset);
	libmaus::util::NumberSerialisation::serialiseNumber(indexstr,imetaoffset);

	if ( ! indexfn.empty() )
	{
		std::string const serialised = indexstr.str();
		libmaus::aio::CheckedOutputStream indexCOS(indexfn);
		indexCOS.write(serialised.c_str(),serialised.size());
		indexCOS.flush();
		indexCOS.close();
	}
}
int bamheap2(libmaus::util::ArgInfo const & arginfo) { bool const verbose = arginfo.getValue("verbose",getDefaultVerbose()); std::string const reference = arginfo.getUnparsedValue("reference",std::string()); std::string const outputprefix = arginfo.getUnparsedValue("outputprefix",std::string()); libmaus::bambam::BamAlignmentDecoderWrapper::unique_ptr_type decwrapper( libmaus::bambam::BamMultiAlignmentDecoderFactory::construct(arginfo)); ::libmaus::bambam::BamAlignmentDecoder * ppdec = &(decwrapper->getDecoder()); ::libmaus::bambam::BamAlignmentDecoder & dec = *ppdec; ::libmaus::bambam::BamHeader const & header = dec.getHeader(); ::libmaus::bambam::BamAlignment const & algn = dec.getAlignment(); double const damult = arginfo.getValue<double>("amult",1); double const dcmult = arginfo.getValue<double>("cmult",1); double const dgmult = arginfo.getValue<double>("gmult",1); double const dtmult = arginfo.getValue<double>("tmult",1); double const dpadmult = arginfo.getValue<double>("padmult",1); double maxmult = 0; maxmult = std::max(damult,maxmult); maxmult = std::max(dcmult,maxmult); maxmult = std::max(dgmult,maxmult); maxmult = std::max(dtmult,maxmult); maxmult = std::max(dpadmult,maxmult); uint64_t const amult = std::floor((damult / maxmult) * (1ull<<16) + 0.5); uint64_t const cmult = std::floor((dcmult / maxmult) * (1ull<<16) + 0.5); uint64_t const gmult = std::floor((dgmult / maxmult) * (1ull<<16) + 0.5); uint64_t const tmult = std::floor((dtmult / maxmult) * (1ull<<16) + 0.5); uint64_t const padmult = std::floor((dpadmult / maxmult) * (1ull<<16) + 0.5); libmaus::fastx::FastAIndex::unique_ptr_type Pindex; libmaus::aio::CheckedInputStream::unique_ptr_type PCIS; if ( reference.size() ) { libmaus::fastx::FastAIndex::unique_ptr_type Tindex( libmaus::fastx::FastAIndex::load(reference+".fai") ); Pindex = UNIQUE_PTR_MOVE(Tindex); libmaus::aio::CheckedInputStream::unique_ptr_type TCIS(new libmaus::aio::CheckedInputStream(reference)); PCIS = UNIQUE_PTR_MOVE(TCIS); } 
libmaus::autoarray::AutoArray<libmaus::bambam::cigar_operation> cigop; libmaus::autoarray::AutoArray<char> bases; int64_t prevrefid = -1; std::string refidname = "*"; std::map< uint64_t, HeapEntry > M; uint64_t alcnt = 0; std::vector< std::pair<char,uint8_t> > pendinginserts; int64_t loadedRefId = -1; int64_t streamRefId = -1; libmaus::autoarray::AutoArray<char> refseqbases; ConsensusAccuracy * consacc = 0; std::map<uint64_t,ConsensusAccuracy> Mconsacc; typedef libmaus::util::shared_ptr<std::ostringstream>::type stream_ptr_type; stream_ptr_type Pstream; ConsensusAux Caux; Caux.M['a'] = Caux.M['A'] = amult; Caux.M['c'] = Caux.M['C'] = cmult; Caux.M['g'] = Caux.M['G'] = gmult; Caux.M['t'] = Caux.M['T'] = tmult; Caux.M[padsym] = padmult; while ( dec.readAlignment() ) { if ( algn.isMapped() && (!algn.isQCFail()) ) { assert ( ! pendinginserts.size() ); uint32_t const numcigop = algn.getCigarOperations(cigop); uint64_t readpos = 0; uint64_t refpos = algn.getPos(); uint64_t const seqlen = algn.decodeRead(bases); uint8_t const * qual = libmaus::bambam::BamAlignmentDecoderBase::getQual(algn.D.begin()); // handle finished columns if ( algn.getRefID() != prevrefid ) { while ( M.size() ) { HeapEntry & H = M.begin()->second; if ( outputprefix.size() && (streamRefId != prevrefid) ) { if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus::aio::PosixFdOutputStream PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } stream_ptr_type Tstream(new std::ostringstream); Pstream = Tstream; streamRefId = prevrefid; } if ( Pindex && (loadedRefId != prevrefid) ) { refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname)); loadedRefId = prevrefid; if ( Mconsacc.find(loadedRefId) == Mconsacc.end() ) Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size()); consacc = &(Mconsacc[loadedRefId]); } 
H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get()); M.erase(M.begin()); } prevrefid = algn.getRefID(); refidname = header.getRefIDName(prevrefid); } else { while ( M.size() && M.begin()->first < refpos ) { HeapEntry & H = M.begin()->second; if ( outputprefix.size() && (streamRefId != prevrefid) ) { if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus::aio::PosixFdOutputStream PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } stream_ptr_type Tstream(new std::ostringstream); Pstream = Tstream; streamRefId = prevrefid; } if ( Pindex && (loadedRefId != prevrefid) ) { refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname)); loadedRefId = prevrefid; if ( Mconsacc.find(loadedRefId) == Mconsacc.end() ) Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size()); consacc = &(Mconsacc[loadedRefId]); } H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? 
static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get()); M.erase(M.begin()); } } for ( uint64_t ci = 0; ci < numcigop; ++ci ) { uint64_t const ciglen = cigop[ci].second; switch ( cigop[ci].first ) { case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CMATCH: case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CEQUAL: case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CDIFF: { if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); pendinginserts.resize(0); } for ( uint64_t i = 0; i < ciglen; ++i ) { M[refpos].V.push_back(std::make_pair(bases[readpos],qual[readpos])); readpos++; refpos++; } break; } case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CINS: { for ( uint64_t i = 0; i < ciglen; ++i, ++readpos ) pendinginserts.push_back(std::make_pair(bases[readpos],qual[readpos])); break; } case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CDEL: // handle pending inserts if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); pendinginserts.resize(0); } // deleting bases from the reference for ( uint64_t i = 0; i < ciglen; ++i, ++refpos ) M[refpos].V.push_back(std::make_pair(padsym,0)); break; case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CREF_SKIP: // handle pending inserts if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); pendinginserts.resize(0); } // skip bases on reference for ( uint64_t i = 0; i < ciglen; ++i ) { refpos++; } break; case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CSOFT_CLIP: // skip bases on read for ( uint64_t i = 0; i < ciglen; ++i ) { readpos++; } break; case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CHARD_CLIP: break; case libmaus::bambam::BamFlagBase::LIBMAUS_BAMBAM_CPAD: { for ( uint64_t i = 0; i < ciglen; ++i, ++readpos ) pendinginserts.push_back(std::make_pair(padsym,0)); break; } } } if ( pendinginserts.size() ) { M[refpos].I.push_back(pendinginserts); M[refpos].iadd++; pendinginserts.resize(0); } assert ( readpos == seqlen ); } if ( verbose && ((++alcnt % 
(1024*1024)) == 0) ) std::cerr << "[V] " << alcnt << std::endl; } while ( M.size() ) { HeapEntry & H = M.begin()->second; if ( outputprefix.size() && (streamRefId != prevrefid) ) { if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus::aio::PosixFdOutputStream PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } stream_ptr_type Tstream(new std::ostringstream); Pstream = Tstream; streamRefId = prevrefid; } if ( Pindex && (loadedRefId != prevrefid) ) { refseqbases = Pindex->readSequence(*PCIS, Pindex->getSequenceIdByName(refidname)); loadedRefId = prevrefid; if ( Mconsacc.find(loadedRefId) == Mconsacc.end() ) Mconsacc[loadedRefId] = ConsensusAccuracy(refseqbases.size()); consacc = &(Mconsacc[loadedRefId]); } H.toStream(std::cout,M.begin()->first,refidname,(M.begin()->first < refseqbases.size()) ? static_cast<int>(refseqbases[M.begin()->first]) : -1,Caux,consacc,Pstream.get()); M.erase(M.begin()); } if ( Pstream ) { std::ostringstream fnostr; fnostr << outputprefix << "_" << header.getRefIDName(streamRefId); libmaus::aio::PosixFdOutputStream PFOS(fnostr.str()); PFOS << ">" << header.getRefIDName(streamRefId) << '\n'; PFOS << Pstream->str() << '\n'; Pstream.reset(); } ConsensusAccuracy constotal; for ( std::map<uint64_t,ConsensusAccuracy>::const_iterator ita = Mconsacc.begin(); ita != Mconsacc.end(); ++ita ) { std::cerr << header.getRefIDName(ita->first) << "\t" << ita->second << std::endl; std::map<uint64_t,uint64_t> const M = ita->second.depthhistogram.get(); uint64_t total = 0; uint64_t preavg = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { total += aita->second; preavg += aita->first * aita->second; } uint64_t acc = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { acc += aita->second; std::cerr << "H[" << 
header.getRefIDName(ita->first) << "," << aita->first << ",+]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } acc = 0; for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita ) { acc += aita->second; std::cerr << "H[" << header.getRefIDName(ita->first) << "," << aita->first << ",-]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } std::cerr << "H[" << header.getRefIDName(ita->first) << ",avg]\t" << static_cast<double>(preavg)/total << std::endl; constotal += ita->second; } if ( Mconsacc.size() ) { std::cerr << "all\t" << constotal << std::endl; std::map<uint64_t,uint64_t> const M = constotal.depthhistogram.get(); uint64_t total = 0; uint64_t preavg = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { total += aita->second; preavg += aita->first * aita->second; } uint64_t acc = 0; for ( std::map<uint64_t,uint64_t>::const_iterator aita = M.begin(); aita != M.end(); ++aita ) { acc += aita->second; std::cerr << "H[" << "all" << "," << aita->first << ",+]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } acc = 0; for ( std::map<uint64_t,uint64_t>::const_reverse_iterator aita = M.rbegin(); aita != M.rend(); ++aita ) { acc += aita->second; std::cerr << "H[" << "all" << "," << aita->first << ",-]" << "\t" << aita->second << "\t" << static_cast<double>(aita->second)/total << "\t" << acc << "\t" << static_cast<double>(acc)/total << std::endl; } std::cerr << "H[all,avg]\t" << static_cast<double>(preavg) / total << std::endl; } return EXIT_SUCCESS; }