/**
 * round trip test for the gzip stream classes: compress the file "configure"
 * into memory, check that the value returned by terminate() equals the number
 * of bytes produced and verify that decompression reproduces the file content
 **/
void testGzip()
{
    libmaus2::aio::CheckedInputStream CIS("configure");
    std::ostringstream ostr;
    uint64_t compressedsize = 0;

    // compression pass
    {
        libmaus2::lz::GzipOutputStream GZOS(ostr);

        for ( int sym = CIS.get(); sym >= 0; sym = CIS.get() )
            GZOS.put(sym);

        compressedsize = GZOS.terminate();
    }

    // rewind the input for the comparison pass
    CIS.clear();
    CIS.seekg(0);

    assert ( compressedsize == ostr.str().size() );

    std::istringstream istr(ostr.str());
    libmaus2::lz::BufferedGzipStream BGS(istr);

    // decompressed data has to match the file byte by byte
    for ( int expected = CIS.get(); expected >= 0; expected = CIS.get() )
    {
        int const decoded = BGS.get();
        assert ( decoded == expected );
    }

    // decompressed stream has to end where the file ends
    assert ( BGS.get() < 0 );
}
static void concatenate(std::vector<std::string> const & infilenames, std::string const & outfilename, bool const removeinput = false) { uint64_t const n = ::libmaus2::gamma::GammaRLDecoder::getLength(infilenames); unsigned int const albits = infilenames.size() ? ::libmaus2::gamma::GammaRLDecoder::getAlBits(infilenames[0]) : 0; ::libmaus2::aio::OutputStreamInstance COS(outfilename); ::libmaus2::aio::SynchronousGenericOutput<uint64_t> SGO(COS,64); SGO.put(n); SGO.put(albits); SGO.flush(); uint64_t const headerlen = 2*sizeof(uint64_t); std::vector < ::libmaus2::huffman::IndexEntry > index; uint64_t ioff = headerlen; for ( uint64_t i = 0; i < infilenames.size(); ++i ) { uint64_t const indexpos = ::libmaus2::huffman::IndexLoaderBase::getIndexPos(infilenames[i]); uint64_t const datalen = indexpos-headerlen; // copy data ::libmaus2::aio::InputStreamInstance CIS(infilenames[i]); CIS.seekg(headerlen); ::libmaus2::util::GetFileSize::copy(CIS,COS,datalen); // add entries to index ::libmaus2::huffman::IndexLoaderSequential indexdata(infilenames[i]); ::libmaus2::huffman::IndexEntry ij = indexdata.getNext(); // ::libmaus2::huffman::IndexDecoderData indexdata(infilenames[i]); for ( uint64_t j = 0; j < indexdata.numentries; ++j ) { ::libmaus2::huffman::IndexEntry ij1 = indexdata.getNext(); /* ::libmaus2::huffman::IndexEntry const ij = indexdata.readEntry(j); ::libmaus2::huffman::IndexEntry const ij1 = indexdata.readEntry(j+1); */ index.push_back(::libmaus2::huffman::IndexEntry((ij.pos - headerlen) + ioff, ij1.kcnt - ij.kcnt, ij1.vcnt - ij.vcnt)); ij = ij1; } // update position pointer ioff += datalen; if ( removeinput ) libmaus2::aio::FileRemoval::removeFile(infilenames[i]); } // write index ::libmaus2::aio::SynchronousGenericOutput<uint8_t> SGO8(COS,64*1024); ::libmaus2::aio::SynchronousGenericOutput<uint8_t>::iterator_type it(SGO8); ::libmaus2::bitio::FastWriteBitWriterStream8Std FWBWS(it); writeIndex(index,FWBWS,ioff); FWBWS.flush(); SGO8.flush(); COS.flush(); }
/**
 * open the file fn and construct an object of this type from the
 * resulting stream and the base layer B
 *
 * @param B base layer object handed to the constructor
 * @param fn name of the input file
 * @return constructed object as unique pointer
 **/
static unique_ptr_type load(base_layer_type const & B, std::string const & fn)
{
    libmaus::aio::CheckedInputStream istr(fn);
    unique_ptr_type tptr(new this_type(istr,B));
    return UNIQUE_PTR_MOVE(tptr);
}
/** * compute character histogram in parallel **/ ::libmaus::autoarray::AutoArray<uint64_t> computeCharHist(std::string const & inputfile) { uint64_t const n = ::libmaus::util::GetFileSize::getFileSize(inputfile); #if defined(_OPENMP) uint64_t const numthreads = omp_get_max_threads(); #else uint64_t const numthreads = 1; #endif uint64_t const packsize = (n + numthreads-1)/numthreads; ::libmaus::parallel::OMPLock lock; ::libmaus::autoarray::AutoArray<uint64_t> ghist(256); #if defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t t = 0; t < static_cast<int64_t>(numthreads); ++t ) { uint64_t const low = std::min(n,t*packsize); uint64_t const high = std::min(n,low+packsize); uint64_t const range = high-low; if ( range ) { ::libmaus::autoarray::AutoArray<uint64_t> lhist(ghist.size()); ::libmaus::aio::CheckedInputStream CIS(inputfile); CIS.seekg(low); uint64_t const blocksize = 8192; uint64_t const numblocks = ((range)+blocksize-1)/blocksize; ::libmaus::autoarray::AutoArray<uint8_t> B(blocksize); for ( uint64_t b = 0; b < numblocks; ++b ) { uint64_t const llow = std::min(low + b*blocksize,high); uint64_t const lhigh = std::min(llow + blocksize,high); uint64_t const lrange = lhigh-llow; CIS.read ( reinterpret_cast<char *>(B.begin()), lrange ); assert ( CIS.gcount() == static_cast<int64_t>(lrange) ); for ( uint64_t i = 0; i < lrange; ++i ) lhist[B[i]]++; } lock.lock(); for ( uint64_t i = 0; i < lhist.size(); ++i ) ghist[i] += lhist[i]; lock.unlock(); } } return ghist; }
/**
 * construct octet string from a part of the file filename. The offset is
 * clipped to the file size and the length is clipped to the number of
 * bytes available behind the offset.
 *
 * @param filename name of the input file
 * @param offset position of the first byte to be read
 * @param blength maximum number of bytes to be read
 **/
libmaus2::util::OctetString::OctetString(
    std::string const & filename,
    uint64_t offset,
    uint64_t blength
)
{
    ::libmaus2::aio::CheckedInputStream CIS(filename);
    uint64_t const fs = ::libmaus2::util::GetFileSize::getFileSize(CIS);

    // clip start position to end of file
    if ( offset > fs )
        offset = fs;

    // clip length to what is left behind offset
    uint64_t const avail = fs - offset;
    if ( blength > avail )
        blength = avail;

    CIS.seekg(offset);
    A = ::libmaus2::autoarray::AutoArray<uint8_t>(blength,false);
    CIS.read(reinterpret_cast<char *>(A.begin()),blength);
}
void mergeToDense(std::string const & outputfilename, uint64_t const n) { std::string const tmpfilename = tmpgen.getFileName(); libmaus::util::TempFileRemovalContainer::addTempFile(tmpfilename); if ( merge(tmpfilename) ) { libmaus::aio::CheckedInputStream CIS(tmpfilename); libmaus::gamma::SparseGammaGapDecoder SGGD(CIS); libmaus::gamma::SparseGammaGapDecoder::iterator it = SGGD.begin(); libmaus::gamma::GammaGapEncoder GGE(outputfilename); GGE.encode(it,n); remove(tmpfilename.c_str()); } }
void testsparsegammamerge() { libmaus::util::TempFileNameGenerator tmpgen("tmp",3); libmaus::gamma::SparseGammaGapFileSet SGGF(tmpgen); std::map<uint64_t,uint64_t> refM; for ( uint64_t i = 0; i < 25; ++i ) { std::string const fn = tmpgen.getFileName(); libmaus::aio::CheckedOutputStream COS(fn); libmaus::gamma::SparseGammaGapEncoder SGE(COS); SGE.encode(2*i,i+1); refM[2*i] += (i+1); SGE.encode(2*i+2,i+1); refM[2*i+2] += (i+1); SGE.encode(2*i+4,i+1); refM[2*i+4] += (i+1); SGE.term(); SGGF.addFile(fn); } std::string const ffn = tmpgen.getFileName(); SGGF.merge(ffn); libmaus::aio::CheckedInputStream CIS(ffn); libmaus::gamma::SparseGammaGapDecoder SGGD(CIS); for ( uint64_t i = 0; i < 60; ++i ) { uint64_t dv = SGGD.decode(); std::cerr << dv; if ( refM.find(i) != refM.end() ) { std::cerr << "(" << refM.find(i)->second << ")"; assert ( refM.find(i)->second == dv ); } else { std::cerr << "(0)"; assert ( dv == 0 ); } std::cerr << ";"; } std::cerr << std::endl; remove(ffn.c_str()); }
/**
 * open the file filename and pass the stream to the stream based loadArray
 *
 * @param filename name of the input file
 * @return array returned by the stream based loadArray
 **/
static ::libmaus2::autoarray::AutoArray<uint64_t> loadArray(std::string const & filename)
{
    ::libmaus2::aio::InputStreamInstance istr(filename);
    return loadArray(istr);
}
int main(int argc, char * argv[]) { try { ::libmaus::util::ArgInfo const arginfo(argc,argv); ::libmaus::util::TempFileRemovalContainer::setup(); ::std::vector<std::string> const & inputfilenames = arginfo.restargs; char const * fasuffixes[] = { ".fa", ".fasta", 0 }; std::string defoutname = libmaus::util::OutputFileNameTools::endClipLcp(inputfilenames,&fasuffixes[0]) + ".fa"; while ( ::libmaus::util::GetFileSize::fileExists(defoutname) ) defoutname += "_"; std::string const fatempfilename = arginfo.getValue<std::string>("fatempfilename",defoutname); ::libmaus::util::TempFileRemovalContainer::addTempFile(fatempfilename); // std::cerr << "output file name " << defoutname << std::endl; ::std::vector< ::libmaus::fastx::FastAReader::RewriteInfo > const info = ::libmaus::fastx::FastAReader::rewriteFiles(inputfilenames,fatempfilename); std::map < std::string, uint64_t > fachr; ::libmaus::autoarray::AutoArray < uint64_t > fapref(info.size()+1); for ( uint64_t i = 0; i < info.size(); ++i ) { // std::cerr << info[i].valid << "\t" << info[i].idlen << "\t" << info[i].seqlen << "\t" << info[i].getIdPrefix() << std::endl; fachr[info[i].getIdPrefix()] = i; fapref [ i ] = info[i].getEntryLength() ; } fapref.prefixSums(); for ( uint64_t i = 0; i < info.size(); ++i ) fapref [ i ] += info[i].idlen + 2; // > + newline ::libmaus::bambam::BamDecoder decoder(std::cin); ::libmaus::bambam::BamHeader const & bamheader = decoder.bamheader; // std::vector< ::libmaus::bambam::Chromosome > chromosomes ::libmaus::autoarray::AutoArray<uint8_t> uptab(256,false); for ( uint64_t j = 0; j < uptab.size(); ++j ) uptab[j] = toupper(j); ::libmaus::autoarray::AutoArray < ::libmaus::autoarray::AutoArray<uint8_t>::unique_ptr_type > text(bamheader.chromosomes.size()); for ( uint64_t i = 0; i < bamheader.chromosomes.size(); ++i ) { std::string const bamchrname = bamheader.chromosomes[i].name; if ( fachr.find(bamchrname) == fachr.end() ) { ::libmaus::exception::LibMausException se; se.getStream() << "Unable to 
find reference sequence " << bamchrname << " in fa file." << std::endl; se.finish(); throw se; } uint64_t const faid = fachr.find(bamchrname)->second; if ( bamheader.chromosomes[i].len != info[faid].seqlen ) { ::libmaus::exception::LibMausException se; se.getStream() << "Reference sequence " << bamchrname << " has len " << bamheader.chromosomes[i].len << " in bam file but " << info[faid].seqlen << " in fa file." << std::endl; se.finish(); throw se; } if ( bamheader.chromosomes.size() < 100 ) std::cerr << "Loading sequence " << bamchrname << " of length " << info[faid].seqlen << std::endl; text [ i ] = UNIQUE_PTR_MOVE(::libmaus::autoarray::AutoArray<uint8_t>::unique_ptr_type(new ::libmaus::autoarray::AutoArray<uint8_t>(info[faid].seqlen,false))); ::libmaus::aio::CheckedInputStream CIS(fatempfilename); CIS.seekg(fapref[faid]); CIS.read(reinterpret_cast<char *>(text[i]->begin()),info[faid].seqlen); // sanity check, next symbol in file should be a newline int c; c = CIS.get(); assert ( c == '\n' ); // convert to upper case for ( uint8_t * pa = text[i]->begin(); pa != text[i]->end(); ++pa ) *pa = uptab[*pa]; } for ( uint64_t i = 0; i < bamheader.chromosomes.size(); ++i ) { assert ( text[i]->size() == bamheader.chromosomes[i].len ); } uint64_t decoded = 0; ::libmaus::bambam::BamWriter BW(std::cout,bamheader); while ( decoder.readAlignment() ) { ++decoded; if ( decoded % (1024*1024) == 0 ) { std::cerr << "[V] " << decoded << std::endl; } ::libmaus::bambam::BamAlignment & alignment = decoder.alignment; bool const cigok = checkCigarValid(alignment,bamheader,text); // if cigar is ok then keep alignment if ( cigok ) { if ( !alignment.isUnmap() ) { uint64_t seqpos = 0; uint64_t refpos = alignment.getPos(); std::string const read = alignment.getRead(); std::string modseq = read; ::libmaus::autoarray::AutoArray<uint8_t> const & ctext = *(text[alignment.getRefID()]); std::ostringstream newcigarstream; for ( uint64_t i = 0; i < alignment.getNCigar(); ++i ) { char const cop = 
alignment.getCigarFieldOpAsChar(i); int64_t const clen = alignment.getCigarFieldLength(i); switch ( cop ) { // match/mismatch, increment both case 'M': { int64_t low = 0; while ( low != clen ) { int64_t high = low; while ( high != clen && ctext[refpos] == read[seqpos] ) { modseq[seqpos] = '='; ++refpos, ++seqpos, ++ high; } if ( high != low ) newcigarstream << high-low << "="; low = high; while ( high != clen && ctext[refpos] != read[seqpos] ) ++refpos, ++seqpos, ++ high; if ( high != low ) newcigarstream << high-low << "X"; low = high; } break; } case '=': { refpos += clen; for ( int64_t j = 0; j < clen; ++j, ++seqpos ) modseq[seqpos] = '='; newcigarstream << clen << cop; break; } case 'X': { refpos += clen; seqpos += clen; newcigarstream << clen << cop; break; } case 'P': case 'I': { seqpos += clen; newcigarstream << clen << cop; break; } case 'N': case 'D': { refpos += clen; newcigarstream << clen << cop; break; } case 'S': { seqpos += clen; newcigarstream << clen << cop; break; } case 'H': { newcigarstream << clen << cop; break; } } } alignment.replaceCigarString(newcigarstream.str()); alignment.replaceSequence(modseq); } alignment.serialise(BW.bgzfos); } } } catch(std::exception const & ex) { std::cerr << ex.what() << std::endl; return EXIT_FAILURE; } }
void testlz4() { std::ostringstream ostr; { libmaus2::lz::Lz4CompressStream compressor(ostr,16*1024); libmaus2::aio::CheckedInputStream CIS("configure"); int c; while ( (c=CIS.get()) > 0 ) compressor.put(c); compressor.writeIndex(); } libmaus2::autoarray::AutoArray<char> const C = libmaus2::autoarray::AutoArray<char>::readFile("configure"); std::istringstream istr(ostr.str()); libmaus2::lz::Lz4Decoder dec(istr); { for ( uint64_t i = 0; i < C.size(); i += 100 ) { if ( i % 16 == 0 ) std::cerr << "i=" <<i << std::endl; int c; dec.clear(); dec.seekg(i); uint64_t j = i; while ( (c=dec.get()) > 0 ) { assert ( c == static_cast<uint8_t>(C[j++]) ); } } uint64_t i = C.size()-1; int c; dec.clear(); dec.seekg(i); uint64_t j = i; while ( (c=dec.get()) > 0 ) { assert ( c == static_cast<uint8_t>(C[j++]) ); } } libmaus2::random::Random::setup(time(0)); dec.clear(); for ( uint64_t j = 0; j < 16384; ++j ) { uint64_t const r = 10; uint64_t const p = libmaus2::random::Random::rand64() % ( C.size()-r ); dec.seekg(p); for ( uint64_t i = 0; i < r; ++i ) { assert ( dec.get() == static_cast<uint8_t>(C[p+i]) ); } } }
void testcompact() { std::string const fn("tmpfile"); #if 0 std::string const fn2("tmpfile2"); std::string const fnm("tmpfile.merged"); #endif ::libmaus::util::TempFileRemovalContainer::setup(); ::libmaus::util::TempFileRemovalContainer::addTempFile(fn); uint64_t n = 1024*1024; unsigned int const b = 3; ::libmaus::bitio::CompactArray CA(n,b); ::libmaus::bitio::CompactArrayWriter CAW(fn,n,b); srand(time(0)); for ( uint64_t i = 0; i < n; ++i ) { CA.set(i,rand() & ((1ull<<b)-1)); CAW.put(CA.get(i)); } CAW.flush(); #if 0 ::libmaus::aio::CheckedOutputStream COS(fn); CA.serialize(COS); COS.flush(); COS.close(); #endif ::libmaus::aio::CheckedInputStream CIS(fn); std::cerr << "compact file size is " << ::libmaus::util::GetFileSize::getFileSize(CIS) << std::endl; assert ( static_cast< ::std::streampos > (CIS.tellg()) == static_cast< ::std::streampos >(0) ); assert ( CIS.get() >= 0 ); ::libmaus::bitio::CompactDecoderWrapper W(fn,4096); W.seekg(0,std::ios::end); int64_t const fs = W.tellg(); W.seekg(0,std::ios::beg); W.clear(); assert ( fs == static_cast<int64_t>(n) ); std::cerr << "n=" << n << " fs=" << fs << std::endl; for ( uint64_t i = 0; i < n; ++i ) { assert ( W.tellg() == static_cast< ::std::streampos >(i) ); int const v = W.get(); assert ( v == static_cast<int>(CA[i]) ); // std::cerr << static_cast<int>(W.get()) << " " << CA[i] << std::endl; } for ( uint64_t i = 0; i < n; i += (rand() % 256) ) { W.clear(); W.seekg(i); std::cerr << "seek to " << W.tellg() << std::endl; for ( uint64_t j = i; j < n; ++j ) { assert ( W.tellg() == static_cast< ::std::streampos >(j) ); int const v = W.get(); assert ( v == static_cast<int>(CA[j]) ); } uint64_t ii = n-i; W.clear(); W.seekg(ii); for ( uint64_t j = ii; j < n; ++j ) { assert ( W.tellg() == static_cast< ::std::streampos >(j) ); int const v = W.get(); assert ( v == static_cast<int>(CA[j]) ); } } }
int main(int argc, char * argv[]) { try { libmaus2::util::ArgInfo const arginfo(argc,argv); libmaus2::timing::RealTimeClock rtc; uint64_t const runs = 10; std::pair <libmaus2::bambam::BamAlignment const *, libmaus2::bambam::BamAlignment const *> P; for ( uint64_t i = 0; i < arginfo.restargs.size(); ++i ) { std::string const fn = arginfo.restargs[i]; double srate = 0, drate = 0; for ( uint64_t j = 0; j < runs; ++j ) { rtc.start(); libmaus2::bambam::BamDecoder bamdec(fn); uint64_t cnt = 0; while ( bamdec.readAlignment() ) ++cnt; double const lela = rtc.getElapsedSeconds(); std::cerr << "[S] " << "cnt=" << cnt << " ela=" << lela << " rate=" << cnt/lela << std::endl; srate += cnt/lela; } for ( uint64_t j = 0; j < runs; ++j ) { rtc.start(); libmaus2::aio::InputStreamInstance CIS(fn); libmaus2::bambam::BamCircularHashCollatingBamDecoder bamdec(CIS,"tmpfile"); uint64_t cnt = 0; while ( bamdec.tryPair(P) ) { if ( P.first ) ++cnt; if ( P.second ) ++cnt; } libmaus2::aio::FileRemoval::removeFile("tmpfile"); double const lela = rtc.getElapsedSeconds(); std::cerr << "[D] " << "cnt=" << cnt << " ela=" << lela << " rate=" << cnt/lela << std::endl; drate += cnt/lela; } srate /= runs; drate /= runs; std::cerr << "[Q] " << srate/drate << std::endl; } } catch(std::exception const & ex) { std::cerr << ex.what() << std::endl; return EXIT_FAILURE; } }
/**
 * open the file filename and pass the stream to load
 *
 * @param filename name of the input file
 * @return object returned by load as unique pointer
 **/
static unique_ptr_type loadFile(std::string const & filename)
{
    libmaus::aio::CheckedInputStream istr(filename);
    unique_ptr_type tptr(load(istr));
    return UNIQUE_PTR_MOVE(tptr);
}
/** * load a serialised vector of FileFragments from file named filename * * @param filename name of file containing a serialised file fragment vector * @return deserialised vector **/ static std::vector<FileFragment> loadVector(std::string const & filename) { libmaus2::aio::CheckedInputStream CIS(filename); std::vector<FileFragment> V = deserialiseVector(CIS); return V; }
/**
 * read a BAM file from standard input, reset its alignments via
 * resetAlignment and write the alignments which are kept as BAM to standard
 * output. The header text is either replaced by the content of the file given
 * by resetheadertext= or stripped of its SQ lines; a PG line for bamreset is
 * added. Optionally an md5 digest and a BAM index are produced via deflate
 * output callbacks.
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 **/
int bamreset(::libmaus2::util::ArgInfo const & arginfo)
{
    // refuse to read binary data from or write binary data to a terminal
    if ( isatty(STDIN_FILENO) )
    {
        ::libmaus2::exception::LibMausException se;
        se.getStream() << "Refusing to read binary data from terminal, please redirect standard input to pipe or file." << std::endl;
        se.finish();
        throw se;
    }
    if ( isatty(STDOUT_FILENO) )
    {
        ::libmaus2::exception::LibMausException se;
        se.getStream() << "Refusing write binary data to terminal, please redirect standard output to pipe or file." << std::endl;
        se.finish();
        throw se;
    }

    int const level = libmaus2::bambam::BamBlockWriterBaseFactory::checkCompressionLevel(
        arginfo.getValue<int>("level",getDefaultLevel())
    );
    int const verbose = arginfo.getValue<int>("verbose",getDefaultVerbose());
    int const resetsortorder = arginfo.getValue<int>("resetsortorder",getDefaultResetSortOrder());

    ::libmaus2::bambam::BamDecoder dec(std::cin,false);
    ::libmaus2::bambam::BamHeader const & header = dec.getHeader();

    std::string headertext = header.text;

    if ( arginfo.hasArg("resetheadertext") )
    {
        // replace header text by the content of the given file
        std::string const headerfilename = arginfo.getUnparsedValue("resetheadertext","");
        uint64_t const headerlen = libmaus2::util::GetFileSize::getFileSize(headerfilename);
        libmaus2::aio::CheckedInputStream CIS(headerfilename);
        libmaus2::autoarray::AutoArray<char> ctext(headerlen,false);
        CIS.read(ctext.begin(),headerlen);
        headertext = std::string(ctext.begin(),ctext.end());
    }
    else
    {
        // no replacement header file given, keep all lines but the SQ lines
        std::vector<libmaus2::bambam::HeaderLine> allheaderlines = libmaus2::bambam::HeaderLine::extractLines(headertext);

        std::ostringstream upheadstr;
        for ( uint64_t i = 0; i < allheaderlines.size(); ++i )
        {
            libmaus2::bambam::HeaderLine const & hl = allheaderlines[i];
            if ( hl.type != "SQ" )
                upheadstr << hl.line << '\n';
        }

        headertext = upheadstr.str();
    }

    // add PG line to header
    headertext = libmaus2::bambam::ProgramHeaderLineSet::addProgramLine(
        headertext,
        "bamreset", // ID
        "bamreset", // PN
        arginfo.commandline, // CL
        ::libmaus2::bambam::ProgramHeaderLineSet(headertext).getLastIdInChain(), // PP
        std::string(PACKAGE_VERSION) // VN
    );

    // construct new header
    libmaus2::bambam::BamHeader uphead(headertext);
    if ( resetsortorder )
        uphead.changeSortOrder("unknown");

    /*
     * start index/md5 callbacks
     */
    std::string const tmpfilenamebase = arginfo.getValue<std::string>("tmpfile",arginfo.getDefaultTmpFileName());
    std::string const tmpfileindex = tmpfilenamebase + "_index";
    ::libmaus2::util::TempFileRemovalContainer::addTempFile(tmpfileindex);

    uint32_t const excludeflags = libmaus2::bambam::BamFlagBase::stringToFlags(
        arginfo.getValue<std::string>("exclude",getDefaultExcludeFlags())
    );

    std::string md5filename;
    std::string indexfilename;

    std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > cbs;

    // md5 callback, only installed if md5 is switched on and a non empty file name is given
    ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Pmd5cb;
    if ( arginfo.getValue<unsigned int>("md5",getDefaultMD5()) )
    {
        std::string const requested =
            arginfo.hasArg("md5filename") ? arginfo.getUnparsedValue("md5filename","") : std::string();

        if ( requested.empty() )
        {
            std::cerr << "[V] no filename for md5 given, not creating hash" << std::endl;
        }
        else
        {
            md5filename = requested;
            ::libmaus2::lz::BgzfDeflateOutputCallbackMD5::unique_ptr_type Tmd5cb(new ::libmaus2::lz::BgzfDeflateOutputCallbackMD5);
            Pmd5cb = UNIQUE_PTR_MOVE(Tmd5cb);
            cbs.push_back(Pmd5cb.get());
        }
    }

    // index callback, only installed if index is switched on and a non empty file name is given
    libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Pindex;
    if ( arginfo.getValue<unsigned int>("index",getDefaultIndex()) )
    {
        std::string const requested =
            arginfo.hasArg("indexfilename") ? arginfo.getUnparsedValue("indexfilename","") : std::string();

        if ( requested.empty() )
        {
            std::cerr << "[V] no filename for index given, not creating index" << std::endl;
        }
        else
        {
            indexfilename = requested;
            libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex::unique_ptr_type Tindex(new libmaus2::bambam::BgzfDeflateOutputCallbackBamIndex(tmpfileindex));
            Pindex = UNIQUE_PTR_MOVE(Tindex);
            cbs.push_back(Pindex.get());
        }
    }

    // pass a null pointer to the writer if no callback is installed
    std::vector< ::libmaus2::lz::BgzfDeflateOutputCallback * > * Pcbs = 0;
    if ( cbs.size() )
        Pcbs = &cbs;
    /*
     * end md5/index callbacks
     */

    ::libmaus2::bambam::BamWriter::unique_ptr_type writer(new ::libmaus2::bambam::BamWriter(std::cout,uphead,level,Pcbs));

    libmaus2::timing::RealTimeClock rtc;
    rtc.start();

    libmaus2::bambam::BamAlignment & algn = dec.getAlignment();
    // number of alignments read (only counted in verbose mode)
    uint64_t c = 0;

    bool const resetaux = arginfo.getValue<int>("resetaux",getDefaultResetAux());
    libmaus2::bambam::BamAuxFilterVector::unique_ptr_type const prgfilter(libmaus2::bambam::BamAuxFilterVector::parseAuxFilterList(arginfo));
    libmaus2::bambam::BamAuxFilterVector const * rgfilter = prgfilter.get();

    uint64_t const progressmask = 1024*1024-1;

    while ( dec.readAlignment() )
    {
        if ( resetAlignment(algn,resetaux /* reset aux */,excludeflags,rgfilter) )
            algn.serialise(writer->getStream());

        // progress report every 2^20 alignments
        if ( verbose )
        {
            ++c;
            if ( (c & progressmask) == 0 )
                std::cerr << "[V] " << c/(1024*1024) << " " << (c / rtc.getElapsedSeconds()) << std::endl;
        }
    }

    // destroy the writer before the callback results are stored
    writer.reset();

    if ( Pmd5cb )
    {
        Pmd5cb->saveDigestAsFile(md5filename);
    }
    if ( Pindex )
    {
        Pindex->flush(std::string(indexfilename));
    }

    return EXIT_SUCCESS;
}
/**
 * convert (optionally gzip compressed) FastA files to a compact array file
 * using 2 bits per base plus a meta data file. Bases A,C,G,T (either case) are
 * mapped to 0,1,2,3; runs of any other symbol are replaced by random bases
 * and the bounds of each such run are recorded in the meta data.
 *
 * Layout of the meta file (outputfilename + ".meta") as written here:
 * number of sequences, then for each sequence its length, the number of
 * replaced intervals and the lower and upper bound of each replaced interval.
 *
 * Used arguments: rc (store reverse complements, default 1), gz (input is
 * gzip compressed, default 1), inputfilenames (file containing additional
 * input file names, one per line), outputfilename, verbose (default 1).
 *
 * @param arginfo parsed command line
 * @return EXIT_SUCCESS
 **/
int fagzToCompact4(libmaus2::util::ArgInfo const & arginfo)
{
    bool const rc = arginfo.getValue<unsigned int>("rc",1);
    bool const gz = arginfo.getValue<unsigned int>("gz",1);

    std::vector<std::string> inputfilenames;
    inputfilenames = arginfo.restargs;

    // append file names listed in the file given by inputfilenames, skipping empty lines
    if ( arginfo.hasArg("inputfilenames") )
    {
        std::string const inf = arginfo.getUnparsedValue("inputfilenames",std::string());
        libmaus2::aio::InputStream::unique_ptr_type Pinf(libmaus2::aio::InputStreamFactoryContainer::constructUnique(inf));
        while ( *Pinf )
        {
            std::string line;
            std::getline(*Pinf,line);
            if ( line.size() )
                inputfilenames.push_back(line);
        }
    }

    // default output name: common prefix of the input names without .gz/.fasta/.fa
    std::string const inlcp = libmaus2::util::OutputFileNameTools::lcp(inputfilenames);
    std::string defout = inlcp;
    defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".gz");
    defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fasta");
    defout = libmaus2::util::OutputFileNameTools::clipOff(defout,".fa");

    std::string const outputfilename = arginfo.getUnparsedValue("outputfilename",defout + ".compact");
    std::string const metaoutputfilename = outputfilename + ".meta";

    int const verbose = arginfo.getValue<int>("verbose",1);
    libmaus2::bitio::CompactArrayWriterFile compactout(outputfilename,2 /* bits per symbol */);

    if ( ! rc )
        std::cerr << "[V] not storing reverse complements" << std::endl;

    // forward mapping table
    libmaus2::autoarray::AutoArray<uint8_t> ftable(256,false);
    // rc mapping for mapped symbols
    libmaus2::autoarray::AutoArray<uint8_t> ctable(256,false);

    // 4 marks symbols which are not regular bases
    std::fill(ftable.begin(),ftable.end(),4);
    std::fill(ctable.begin(),ctable.end(),4);
    ftable['a'] = ftable['A'] = 0;
    ftable['c'] = ftable['C'] = 1;
    ftable['g'] = ftable['G'] = 2;
    ftable['t'] = ftable['T'] = 3;
    uint64_t insize = 0;

    ctable[0] = 3; // A->T
    ctable[1] = 2; // C->G
    ctable[2] = 1; // G->C
    ctable[3] = 0; // T->A

    // size of one serialised number in the meta file as signed value, so that
    // relative seek offsets are computed in signed arithmetic
    int64_t const wordsize = static_cast<int64_t>(sizeof(uint64_t));

    libmaus2::aio::OutputStreamInstance::unique_ptr_type metaOut(new libmaus2::aio::OutputStreamInstance(metaoutputfilename));
    // place holder for the number of sequences, filled in at the end
    libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);
    uint64_t nseq = 0;
    // sequence lengths, used to check the meta file after writing
    std::vector<uint64_t> lvec;

    for ( uint64_t i = 0; i < inputfilenames.size(); ++i )
    {
        std::string const fn = inputfilenames[i];
        libmaus2::aio::InputStreamInstance CIS(fn);
        libmaus2::lz::BufferedGzipStream::unique_ptr_type BGS;
        std::istream * istr = 0;
        if ( gz )
        {
            libmaus2::lz::BufferedGzipStream::unique_ptr_type tBGS(
                new libmaus2::lz::BufferedGzipStream(CIS));
            BGS = UNIQUE_PTR_MOVE(tBGS);
            istr = BGS.get();
        }
        else
        {
            istr = &CIS;
        }
        libmaus2::fastx::StreamFastAReaderWrapper fain(*istr);
        libmaus2::fastx::StreamFastAReaderWrapper::pattern_type pattern;

        while ( fain.getNextPatternUnlocked(pattern) )
        {
            if ( verbose )
                std::cerr << (i+1) << " " << stripAfterDot(basename(fn)) << " " << pattern.sid << "...";

            // sequence length followed by a place holder for the number of replaced intervals
            libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,pattern.spattern.size());
            lvec.push_back(pattern.spattern.size());
            libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,0);

            // map symbols
            for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
                pattern.spattern[j] = ftable[static_cast<uint8_t>(pattern.spattern[j])];

            // replace blocks of N symbols by random bases
            uint64_t l = 0;
            // number of replaced blocks
            uint64_t nr = 0;
            while ( l < pattern.spattern.size() )
            {
                // skip regular bases
                while ( l < pattern.spattern.size() && pattern.spattern[l] < 4 )
                    ++l;
                assert ( l == pattern.spattern.size() || pattern.spattern[l] == 4 );

                // go to end of non regular bases block
                uint64_t h = l;
                while ( h < pattern.spattern.size() && pattern.spattern[h] == 4 )
                    ++h;

                // if non regular block is not empty
                if ( h-l )
                {
                    // replace by random bases
                    for ( uint64_t j = l; j < h; ++j )
                        pattern.spattern[j] = (libmaus2::random::Random::rand8() & 3);

                    // write bounds
                    libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,l);
                    libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,h);

                    // add to interval counter
                    nr += 1;
                }

                l = h;
            }

            // make sure there are no more irregular bases
            for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
                assert ( pattern.spattern[j] < 4 );

            // go back to the place holder written in front of the interval bounds
            metaOut->seekp( -(static_cast<int64_t>(2*nr+1) * wordsize), std::ios::cur );
            // write number of intervals replaced
            libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nr);
            // skip interval bounds already written
            metaOut->seekp( static_cast<int64_t>(2*nr) * wordsize, std::ios::cur );

            // write bases
            compactout.write(pattern.spattern.c_str(),pattern.spattern.size());

            // write reverse complement if requested
            if ( rc )
            {
                // reverse complement
                std::reverse(pattern.spattern.begin(),pattern.spattern.end());
                for ( uint64_t j = 0; j < pattern.spattern.size(); ++j )
                    pattern.spattern[j] = ctable[static_cast<uint8_t>(pattern.spattern[j])];

                // write
                compactout.write(pattern.spattern.c_str(),pattern.spattern.size());
            }

            insize += pattern.spattern.size()+1;
            nseq += 1;

            if ( verbose )
                std::cerr << "done, input size " << formatBytes(pattern.spattern.size()+1) << " acc " << formatBytes(insize) << std::endl;
        }
    }

    // fill in the number of sequences at the start of the meta file
    metaOut->seekp(0);
    libmaus2::util::NumberSerialisation::serialiseNumber(*metaOut,nseq);
    metaOut->flush();
    metaOut.reset();

    // read the meta file back and check it against what was written
    libmaus2::aio::InputStreamInstance::unique_ptr_type metaISI(new libmaus2::aio::InputStreamInstance(metaoutputfilename));
    // number of sequences
    uint64_t const rnseq = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
    assert ( nseq == rnseq );
    for ( uint64_t i = 0; i < nseq; ++i )
    {
        // length of sequence
        uint64_t const l = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
        assert ( l == lvec[i] );
        uint64_t const nr = libmaus2::util::NumberSerialisation::deserialiseNumber(*metaISI);
        // skip replaced intervals
        metaISI->ignore(2*nr*sizeof(uint64_t));
    }
    assert ( metaISI->peek() == std::istream::traits_type::eof() );

    std::cerr << "Done, total input size " << insize << std::endl;

    compactout.flush();

    return EXIT_SUCCESS;
}
/**
 * open the file fs and construct an object of this type from the stream,
 * returned encapsulated in a unique pointer
 *
 * @param fs name of the input file
 * @return constructed object as unique pointer
 **/
static unique_ptr_type load(std::string const & fs)
{
    libmaus2::aio::CheckedInputStream istr(fs);
    unique_ptr_type tptr(new this_type(istr));
    return UNIQUE_PTR_MOVE(tptr);
}
int main(int argc, char * argv[]) { try { ::libmaus::util::ArgInfo arginfo(argc,argv); ::libmaus::util::TempFileRemovalContainer::setup(); ::std::vector<std::string> const & inputfilenames = arginfo.restargs; char const * fasuffixes[] = { ".fa", ".fasta", 0 }; std::string deftmpname = libmaus::util::OutputFileNameTools::endClipLcp(inputfilenames,&fasuffixes[0]) + ".fa.tmp"; while ( ::libmaus::util::GetFileSize::fileExists(deftmpname) ) deftmpname += "_"; std::string defoutname = libmaus::util::OutputFileNameTools::endClipLcp(inputfilenames,&fasuffixes[0]) + ".fa.recoded"; while ( ::libmaus::util::GetFileSize::fileExists(defoutname) ) defoutname += "_"; std::string const tempfilename = arginfo.getValue<std::string>("tempfilename",deftmpname); std::string const outfilename = arginfo.getValue<std::string>("outputfilename",defoutname); std::string const indexfilename = tempfilename + ".index"; unsigned int const addterm = arginfo.getValue<unsigned int>("addterm",0); unsigned int const termadd = addterm ? 
1 : 0; ::libmaus::util::TempFileRemovalContainer::addTempFile(tempfilename); ::libmaus::util::TempFileRemovalContainer::addTempFile(indexfilename); std::cerr << "temp file name " << tempfilename << std::endl; std::cerr << "output file name " << outfilename << std::endl; /* uint64_t const numseq = */ ::libmaus::fastx::FastAReader::rewriteFiles(inputfilenames,tempfilename,indexfilename); uint64_t curpos = 0; ::libmaus::aio::CheckedOutputStream COS(outfilename); // 0,A,C,G,T,N // map forward ::libmaus::autoarray::AutoArray<char> cmap(256,false); std::fill(cmap.begin(),cmap.end(),5+termadd); cmap['\n'] = 0 + termadd; cmap['a'] = cmap['A'] = 1 + termadd; cmap['c'] = cmap['C'] = 2 + termadd; cmap['g'] = cmap['G'] = 3 + termadd; cmap['t'] = cmap['T'] = 4 + termadd; cmap['n'] = cmap['N'] = 5 + termadd; // map to reverse complement ::libmaus::autoarray::AutoArray<char> rmap(256,false); std::fill(rmap.begin(),rmap.end(),5+termadd); rmap['\n'] = 0 + termadd; rmap['a'] = rmap['A'] = 4 + termadd; rmap['c'] = rmap['C'] = 3 + termadd; rmap['g'] = rmap['G'] = 2 + termadd; rmap['t'] = rmap['T'] = 1 + termadd; rmap['n'] = rmap['N'] = 5 + termadd; // reverse complement for mapped data ::libmaus::autoarray::AutoArray<char> xmap(256,false); std::fill(xmap.begin(),xmap.end(),5+termadd); xmap[0] = 0 + termadd; xmap[1] = 4 + termadd; xmap[2] = 3 + termadd; xmap[3] = 2 + termadd; xmap[4] = 1 + termadd; xmap[5] = 5 + termadd; ::libmaus::autoarray::AutoArray<char> imap(256,false); for ( uint64_t i = 0; i < imap.size(); ++i ) imap[i] = static_cast<char>(i); ::libmaus::fastx::FastAReader::RewriteInfoDecoder::unique_ptr_type infodec(new ::libmaus::fastx::FastAReader::RewriteInfoDecoder(indexfilename)); ::libmaus::fastx::FastAReader::RewriteInfo info; uint64_t maxseqlen = 0; while ( infodec->get(info) ) maxseqlen = std::max(maxseqlen,info.seqlen); std::cerr << "[V] max seq len " << maxseqlen << std::endl; ::libmaus::fastx::FastAReader::RewriteInfoDecoder::unique_ptr_type tinfodec(new 
::libmaus::fastx::FastAReader::RewriteInfoDecoder(indexfilename)); infodec = UNIQUE_PTR_MOVE(tinfodec); if ( maxseqlen <= 256*1024 ) { ::libmaus::aio::CheckedInputStream CIS(tempfilename); ::libmaus::autoarray::AutoArray<uint8_t> B(maxseqlen+1,false); while ( infodec->get(info) ) { // skip id CIS.ignore(info.idlen+2); // read sequence plus following terminator CIS.read(reinterpret_cast<char *>(B.begin()), info.seqlen+1); // map for ( uint64_t i = 0; i < info.seqlen+1; ++i ) B[i] = cmap[B[i]]; // write COS.write(reinterpret_cast<char const *>(B.begin()),info.seqlen+1); // remap for ( uint64_t i = 0; i < info.seqlen+1; ++i ) B[i] = xmap[B[i]]; // reverse std::reverse(B.begin(),B.begin()+info.seqlen); // write COS.write(reinterpret_cast<char const *>(B.begin()),info.seqlen+1); } } else { while ( infodec->get(info) ) { // std::cerr << info.valid << "\t" << info.idlen << "\t" << info.seqlen << "\t" << info.getIdPrefix() << std::endl; uint64_t const seqbeg = curpos + (info.idlen+2); uint64_t const seqend = seqbeg + info.seqlen; ::libmaus::aio::CheckedInputStream CIS(tempfilename); CIS.seekg(seqbeg); ::libmaus::util::GetFileSize::copyMap(CIS,COS,cmap.begin(),seqend-seqbeg+1); ::libmaus::aio::CircularReverseWrapper CRW(tempfilename,seqend); ::libmaus::util::GetFileSize::copyMap(CRW,COS,rmap.begin(),seqend-seqbeg+1); curpos += (info.idlen+2) + (info.seqlen+1); } } if ( addterm ) COS.put(0); return EXIT_SUCCESS; } catch(std::exception const & ex) { std::cerr << ex.what() << std::endl; return EXIT_FAILURE; } }
int main(int argc, char * argv[]) { try { ::libmaus::util::ArgInfo const arginfo(argc,argv); std::string const input = arginfo.getRestArg<std::string>(0); std::string const output = arginfo.getRestArg<std::string>(1); unsigned int const verbose = arginfo.getValue<unsigned int>("verbose",1); unsigned int const addterm = arginfo.getValue<unsigned int>("addterm",0) ? 1 : 0; ::libmaus::autoarray::AutoArray<uint64_t> const chist = computeCharHist(input); uint64_t maxsym = 0; for ( uint64_t i = 0; i < chist.size(); ++i ) if ( chist[i] ) maxsym = i; if ( addterm ) maxsym += 1; unsigned int const b = maxsym ? (64-::libmaus::bitio::Clz::clz(maxsym)) : 0; uint64_t const n = std::accumulate(chist.begin(),chist.end(),0ull); if ( verbose ) std::cerr << "[V] n=" << n << " maxsym=" << maxsym << " b=" << b << std::endl; uint64_t const blocksize = 8*1024; uint64_t const numblocks = (n+blocksize-1)/blocksize; ::libmaus::autoarray::AutoArray<uint8_t> B(blocksize); ::libmaus::aio::CheckedInputStream CIS(input); ::libmaus::bitio::CompactArrayWriter CAW(output,n+addterm,b); int64_t lastperc = -1; if ( verbose ) std::cerr << "[V] "; for ( uint64_t b = 0; b < numblocks; ++b ) { uint64_t const low = std::min(b*blocksize,n); uint64_t const high = std::min(low+blocksize,n); uint64_t const range = high-low; CIS.read ( reinterpret_cast<char *>(B.begin()), range ); assert ( CIS.gcount() == static_cast<int64_t>(range) ); if ( addterm ) for ( uint64_t i = 0; i < range; ++i ) B[i] += 1; CAW.write(B.begin(),range); int64_t const newperc = (high * 100) / n; if ( verbose && newperc != lastperc ) { lastperc = newperc; std::cerr << "(" << newperc << ")"; } } if ( addterm ) CAW.put(0); if ( verbose ) std::cerr << std::endl; CAW.flush(); #if 0 ::libmaus::bitio::CompactDecoderWrapper CDW(output); for ( uint64_t i = 0; i < n+addterm; ++i ) std::cerr << CDW.get(); std::cerr << std::endl; #endif } catch(std::exception const & ex) { std::cerr << ex.what() << std::endl; } }
/**
 * open the file filename and construct an object of this type from the stream
 *
 * @param filename name of the input file
 * @return constructed object as unique pointer
 **/
static unique_ptr_type load(std::string const & filename)
{
    libmaus2::aio::InputStreamInstance istr(filename);
    unique_ptr_type uptr(new this_type(istr));
    return UNIQUE_PTR_MOVE(uptr);
}
/**
 * open the file fn and construct an object of this type from the
 * pointer lf and the resulting stream
 *
 * @param lf pointer handed to the constructor
 * @param fn name of the input file
 * @return constructed object as unique pointer
 **/
static unique_ptr_type load(lf_type const * lf, std::string const & fn)
{
    libmaus::aio::CheckedInputStream istr(fn);
    unique_ptr_type tptr(new this_type(lf,istr));
    return UNIQUE_PTR_MOVE(tptr);
}