/**
 * @return C.begin() if the current pointer pc has moved away from the start
 *         of buffer C, a null pointer otherwise
 **/
char const * getStart() const
{
	// pc still at the start of C: report "no data" as a null pointer
	if ( pc == C.begin() )
		return 0;

	return C.begin();
}
/**
 * Refill the get area from the underlying stream once it is exhausted.
 *
 * Up to pushbackspace already consumed bytes are moved in front of the
 * freshly read data so they remain available for putback.
 *
 * @return next available character, or traits_type::eof() if the stream
 *         delivered no more bytes
 **/
int_type underflow()
{
	// unread characters left: return the current one without refilling
	if ( gptr() < egptr() )
		return static_cast<int_type>(*uptr());

	assert ( gptr() == egptr() );

	// first byte of the region new data is read into
	char * const readstart = buffer.begin() + pushbackspace;

	// bytes read previously
	uint64_t const consumed = static_cast<uint64_t>(gptr()-eback());
	// space available in front of readstart
	uint64_t const room = static_cast<uint64_t>(readstart-buffer.begin());
	// number of bytes preserved for putback
	uint64_t const keep = std::min(consumed,room);

	// source and destination may overlap, hence memmove; this has to
	// happen before the read below overwrites the buffer
	::std::memmove(readstart-keep,gptr()-keep,keep);

	stream.read(readstart, buffer.end()-readstart);
	size_t const numread = stream.gcount();
	streamreadpos += numread;

	setg(readstart-keep, readstart, readstart+numread);

	if ( numread == 0 )
		return traits_type::eof();

	return static_cast<int_type>(*uptr());
}
/**
 * Constructor. Sets up the two 256 entry symbol tables T0 and T1 and the
 * auxiliary tag filter.
 *
 * Both tables map A/C/G/T (either case) to 0/1/2/3. All other symbols map
 * to 4 in T0 and to 5 in T1, so a symbol outside ACGT never yields the
 * same code in both tables.
 **/
MdStringComputationContext()
: T0(256,false), T1(256,false), nm(0)
{
	// default codes for everything which is not A,C,G,T
	std::fill(T0.begin(),T0.end(),4);
	std::fill(T1.begin(),T1.end(),5);

	// base symbols in code order
	static char const upper[4] = { 'A', 'C', 'G', 'T' };
	static char const lower[4] = { 'a', 'c', 'g', 't' };

	for ( int code = 0; code < 4; ++code )
	{
		T0[upper[code]] = code;
		T0[lower[code]] = code;
		T1[upper[code]] = code;
		T1[lower[code]] = code;
	}

	// register the MD and NM tags with the aux filter
	auxvec.set("MD");
	auxvec.set("NM");
}
void fillBuffer() { assert ( pc == pe ); if ( setpos ) { // std::cerr << "Seeking to " << readpos << std::endl; in.seekg(readpos); in.clear(); } if ( in.peek() >= 0 && readpos < endpos ) { #if 0 std::cerr << "Filling block, readpos " << readpos << " stream at pos " << in.tellg() << " endpos " << endpos << std::endl; #endif uint64_t blocksize = sizeof(uint64_t) + ( bigbuf ? sizeof(uint64_t) : 0 ); // size of uncompressed buffer uint64_t const n = bigbuf ? ::libmaus::util::NumberSerialisation::deserialiseNumber(in) : ::libmaus::util::UTF8::decodeUTF8(in,blocksize) ; // size of compressed data uint64_t const datasize = ::libmaus::util::NumberSerialisation::deserialiseNumber(in); // add to block size blocksize += datasize; if ( n > B.size() ) { B = ::libmaus::autoarray::AutoArray<char>(0,false); B = ::libmaus::autoarray::AutoArray<char>(n,false); } pa = B.begin(); pc = pa; pe = pa + n; ::libmaus::aio::IStreamWrapper wrapper(in); ::libmaus::lz::IstreamSource< ::libmaus::aio::IStreamWrapper> insource(wrapper,datasize); SnappyCompress::uncompress(insource,B.begin(),n); readpos += blocksize; } }
/**
 * Decode the pattern with id i.
 *
 * @param pat object the decoded pattern is stored in
 * @param i absolute pattern id; the dictionary is indexed by i - dict->FI.low
 **/
void getPattern(pattern_type & pat, uint64_t i)
{
	// index of the pattern relative to the first one stored in this container
	uint64_t const relid = i - dict->FI.low;

	// byte source positioned at the offset the dictionary gives for relid
	GetObject G(T.begin()+(*dict)[relid]);

	// the shared decoding context is given the id of the pattern being decoded
	C.nextid = i;
	::libmaus::fastx::CompactFastQDecoderBase::decodePattern<GetObject>(G,*H,C,pat);

	pat.patid = i;
}
/**
 * Constructor.
 *
 * Sets up the temporary files derived from the index file name, opens the
 * output streams for them and creates the BGZF encoder writing to out.
 * Which encoder is used (parallel or serial) is decided at compile time by
 * LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL.
 *
 * @param rindexfilename name of the index file; temporary file names are
 *        built by appending suffixes to it
 * @param rpatperblock stored as patperblock
 * @param out stream the BGZF encoder writes to
 * @param level compression level handed to the BGZF encoder
 **/
FastQBgzfWriter( ::std::string rindexfilename, uint64_t const rpatperblock, std::ostream & out, int level = Z_DEFAULT_COMPRESSION )
: indexfilename(rindexfilename), patperblock(rpatperblock),
  fifilename(setupTempFile(indexfilename + ".tmp.fi")),
  #if defined(LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL)
  bgzfidxfilename(setupTempFile(indexfilename + ".tmp.bgzfidx")),
  bgzfidxcntfilename(setupTempFile(indexfilename + ".tmp.bgzfidx.cnt")),
  bgzfidoutstr(new libmaus::aio::CheckedOutputStream(bgzfidxfilename)),
  bgzfidxcntoutstr(new libmaus::aio::CheckedOutputStream(bgzfidxcntfilename)),
  #endif
  fioutstr(new libmaus::aio::CheckedOutputStream(fifilename)),
  // the pattern buffer starts out empty
  C(0,false),
  patlow(0),
  blockcnt(0),
  #if defined(LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL)
  bgzfenc(new libmaus::lz::BgzfDeflateParallel(out,32,128,level,bgzfidoutstr.get())),
  #else
  bgzfenc(new libmaus::lz::BgzfDeflate<std::ostream>(out,level)),
  #endif
  lnumsyms(0),
  minlen(std::numeric_limits<uint64_t>::max()),
  maxlen(0),
  // NOTE(review): initialised from patlow, so patlow needs to be declared before pathigh - confirm
  pathigh(patlow),
  pc(C.begin()),
  p(0),
  cacc(0)
{ }
/**
 * @return the current pointer pc if it has moved away from the start of
 *         buffer C, a null pointer otherwise
 **/
char const * getEnd() const
{
	// pc still at the start of C: report "no data" as a null pointer
	if ( pc == C.begin() )
		return 0;

	return pc;
}
/**
 * Decode the element with id i using a context local to this call, so no
 * member state is modified.
 *
 * @param pat object the decoded element is stored in
 * @param i absolute element id; the dictionary is indexed by i - dict->FI.low
 **/
void getElement(element_type & pat, uint64_t i) const
{
	// index of the element relative to the first one stored in this container
	uint64_t const relid = i - dict->FI.low;

	// byte source positioned at the offset the dictionary gives for relid
	GetObject G(T.begin()+(*dict)[relid]);

	// fresh decoding context carrying the id of the element being decoded
	::libmaus::fastx::CompactFastQContext ctx;
	ctx.nextid = i;

	::libmaus::fastx::CompactFastQDecoderBase::decodeElement<GetObject>(G,*H,ctx,pat);
}
/**
 * Construct container by receiving its contents via a socket.
 *
 * The text T is read first, then the dictionary; the header H is parsed
 * from the beginning of T afterwards.
 *
 * @param textstr socket the text and the dictionary are read from
 **/
CompactFastQContainer(::libmaus::network::SocketBase * textstr)
:
	T(textstr->readMessageInBlocks<uint8_t,::libmaus::autoarray::alloc_type_cxx>()),
	dict(new ::libmaus::fastx::CompactFastQContainerDictionary(textstr)),
	H(),
	C()
{
	// the header is decoded from the start of the text
	GetObject G(T.begin());

	::libmaus::fastx::CompactFastQHeader::unique_ptr_type tH(
		new ::libmaus::fastx::CompactFastQHeader(G)
	);
	H = UNIQUE_PTR_MOVE(tH);
}
::libmaus::util::Histogram::unique_ptr_type libmaus::util::Utf8String::getHistogram(::libmaus::autoarray::AutoArray<uint8_t> const & A) { #if defined(_OPENMP) uint64_t const numthreads = omp_get_max_threads(); #else uint64_t const numthreads = 1; #endif ::libmaus::autoarray::AutoArray<uint64_t> const partstarts = computePartStarts(A,numthreads); uint64_t const numparts = partstarts.size()-1; ::libmaus::util::Histogram::unique_ptr_type hist(new ::libmaus::util::Histogram); ::libmaus::parallel::OMPLock lock; #if defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t t = 0; t < static_cast<int64_t>(numparts); ++t ) { ::libmaus::util::Histogram::unique_ptr_type lhist(new ::libmaus::util::Histogram); uint64_t codelen = 0; uint64_t const tcodelen = partstarts[t+1]-partstarts[t]; ::libmaus::util::GetObject<uint8_t const *> G(A.begin()+partstarts[t]); while ( codelen != tcodelen ) (*lhist)(::libmaus::util::UTF8::decodeUTF8(G,codelen)); lock.lock(); hist->merge(*lhist); lock.unlock(); } return UNIQUE_PTR_MOVE(hist); }
/**
 * Constructor opening the file fn for output.
 *
 * @param fn name of the file to be opened
 * @param rbuffersize requested buffer size; a negative value selects the
 *        block size returned by getOptimalIOBlockSize for the opened descriptor
 **/
LinuxStreamingPosixFdOutputStreamBuffer(std::string const & fn, int64_t const rbuffersize)
: fd(doOpen(fn)),
  // descriptor was opened here (closefd presumably makes this object close it - confirm)
  closefd(true),
  // NOTE(review): an empty string instead of fn is passed as second argument - confirm this is intended
  optblocksize((rbuffersize < 0) ? getOptimalIOBlockSize(fd,std::string()) : rbuffersize),
  buffersize(optblocksize),
  buffer(buffersize,false),
  prevwrite(0,0)
{
	// the put area ends one byte before the end of the buffer
	// NOTE(review): rbuffersize == 0 would make buffer.end()-1 precede buffer.begin() - confirm callers never pass 0
	setp(buffer.begin(),buffer.end()-1);
}
/**
 * Compress inputLength bytes starting at input into output.
 *
 * output is replaced by a larger array if it is smaller than the bound
 * SnappyCompress::compressBound reports for inputLength.
 *
 * @param input data to be compressed
 * @param inputLength number of bytes at input
 * @param output array receiving the compressed data
 * @return value returned by SnappyCompress::rawcompress (presumably the
 *         number of compressed bytes)
 **/
virtual size_t compress(char const * input, size_t inputLength, libmaus::autoarray::AutoArray<char> & output)
{
	// space which has to be available in output
	uint64_t const needed = SnappyCompress::compressBound(inputLength);

	if ( needed > output.size() )
		output = libmaus::autoarray::AutoArray<char>(needed,false);

	return SnappyCompress::rawcompress(input,inputLength,output.begin());
}
/**
 * Reset the per block state: the buffer pointer returns to the start of C,
 * p and the symbol count are zeroed and the length statistics are put back
 * to their neutral values.
 **/
void reset()
{
	// rewind buffer
	pc = C.begin();
	p = 0;

	// statistics
	lnumsyms = 0;
	maxlen = 0;
	// neutral element for taking minima
	minlen = std::numeric_limits<uint64_t>::max();
}
/**
 * Constructor. Allocates buffer B with getBgzfMaxBlockSize() elements and
 * sets the start pointer pa and the current pointer pc to its beginning and
 * the end pointer pe to its end.
 **/
BgzfParallelRecodeDeflateBase()
: B(getBgzfMaxBlockSize(),false),
  pa(B.begin()),
  pc(B.begin()),
  pe(B.end())
{ }
/**
 * Constructor. Opens the output file, sets up the encoding buffer and
 * writes n and the alphabet bits as the first two values of the output.
 *
 * @param filename name of the output file
 * @param ralbits stored as albits and written to the output
 * @param n value written as the first number of the output (presumably the
 *        length of the encoded sequence - confirm)
 * @param rblocksize block size, also used as size of buffer A
 * @param rbufsize buffer size handed to the output object SGO
 **/
GammaRLEncoder(std::string const & filename, unsigned int const ralbits, uint64_t const n, uint64_t const rblocksize, uint64_t const rbufsize = 64*1024)
: blocksize(rblocksize),
  COS(filename),
  SGO(COS,rbufsize),
  GE(SGO),
  A(blocksize),
  // buffer pointers: start, current, end
  pa(A.begin()),
  pc(pa),
  pe(A.end()),
  cursym(0),
  curcnt(0),
  indexwritten(false),
  albits(ralbits)
{
	// file header
	SGO.put(n);
	SGO.put(albits);
}
/**
 * Get an element from the free list. If the free list is empty, then the
 * allocation list is grown to max(1,twice its size), the additional slots
 * are filled with newly allocated elements and those are put on the free
 * list first.
 *
 * @return pointer to an element taken off the free list
 **/
element_type * get()
{
	if ( freelistfill == 0 )
	{
		// number of elements allocated so far
		uint64_t const oldalloc = alloclist.size();

		// allocate more alignment objects
		libmaus::autoarray::AutoArray<element_type *> nalloclist(
			std::max(
				static_cast<uint64_t>(1),
				static_cast<uint64_t>(2*oldalloc)
			)
			,false
		);

		// keep the pointers we already have
		std::copy(alloclist.begin(),alloclist.end(),nalloclist.begin());

		// range of slots added by the growth step
		element_type ** const freshbegin = nalloclist.begin()+oldalloc;
		element_type ** const freshend = nalloclist.end();

		// null the new slots before any allocation takes place
		element_type * nullp = 0;
		std::fill(freshbegin,freshend,nullp);

		for ( element_type ** slot = freshbegin; slot != freshend; ++slot )
			*slot = new element_type;

		// grow the free list in the same way
		libmaus::autoarray::AutoArray<element_type *> nfreelist(
			std::max(
				static_cast<uint64_t>(1),
				static_cast<uint64_t>(2*freelist.size())
			)
			,false
		);

		std::copy(freelist.begin(),freelist.end(),nfreelist.begin());
		std::fill(nfreelist.begin()+freelist.size(),nfreelist.end(),nullp);

		freelist = nfreelist;

		// all new elements are free
		for ( element_type ** slot = freshbegin; slot != freshend; ++slot )
			freelist[freelistfill++] = *slot;

		// install the new allocation list last, the loops above still read nalloclist
		alloclist = nalloclist;
	}

	return freelist[--freelistfill];
}
/**
 * Find the start of the four line record ending directly in front of e.
 *
 * The scan relies on the preceding data holding at least four newline
 * terminated lines; no bounds are checked while searching for newlines.
 *
 * @param e pointer just behind the terminating newline of a record
 * @return pointer to the first character of the record's first line, or a
 *         null pointer if e equals C.begin()
 **/
char const * prevStart(char const * e) const
{
	// nothing in front of the start of the buffer
	if ( e == C.begin() )
		return 0;

	assert ( e[-1] == '\n' );

	// step over last/quality line's newline
	--e;

	// move back to the newlines ending the plus line, the sequence line
	// and the id line (in this order)
	for ( unsigned int line = 0; line < 3; ++line )
	{
		do
		{
			--e;
		} while ( *e != '\n' );
	}

	// search for start of the id line
	while ( e != C.begin() && e[-1] != '\n' )
		--e;

	return e;
}
/**
 * Assignment operator. Copies the members all and low of o.
 *
 * @param o histogram to be copied
 * @return *this
 **/
Histogram & operator=(Histogram const & o)
{
	// nothing to do for self assignment
	if ( this == &o )
		return *this;

	all = o.all;

	// reallocate low only if the sizes differ
	if ( low.size() != o.low.size() )
		low = ::libmaus::autoarray::AutoArray<uint64_t>(o.low.size(),false);

	std::copy(o.low.begin(),o.low.end(),low.begin());

	return *this;
}
/**
 * Append a FastQ entry to the buffer C and update the block statistics.
 * The block is flushed via internalFlush() once it holds patperblock entries.
 *
 * @param pattern entry to be appended
 **/
void put(libmaus::fastx::FastQReader::pattern_type const & pattern)
{
	// space required for the entry
	uint64_t const patlen = getFastQLength(pattern);

	// double the buffer until the entry fits, keeping the write position
	while ( (C.end() - pc) < static_cast<ptrdiff_t>(patlen) )
	{
		uint64_t const used = pc-C.begin();
		uint64_t const grown = std::max(2*C.size(),static_cast<uint64_t>(1ull));
		C.resize(grown);
		pc = C.begin()+used;
	}

	// id line
	*pc++ = '@';
	std::copy(pattern.sid.begin(),pattern.sid.end(),pc);
	pc += pattern.sid.size();
	*pc++ = '\n';

	// sequence line
	std::copy(pattern.spattern.begin(),pattern.spattern.end(),pc);
	pc += pattern.spattern.size();
	*pc++ = '\n';

	// plus line
	*pc++ = '+';
	std::copy(pattern.plus.begin(),pattern.plus.end(),pc);
	pc += pattern.plus.size();
	*pc++ = '\n';

	// quality line
	std::copy(pattern.quality.begin(),pattern.quality.end(),pc);
	pc += pattern.quality.size();
	*pc++ = '\n';

	assert ( pc <= C.end() );

	// statistics for the current block
	uint64_t const seqlen = static_cast<uint64_t>(pattern.spattern.size());
	lnumsyms += seqlen;
	minlen = std::min(minlen,seqlen);
	maxlen = std::max(maxlen,seqlen);

	pathigh++;

	// block complete?
	if ( pathigh - patlow == patperblock )
		internalFlush();
}
/**
 * Refill the get area from stream once it is exhausted. Up to
 * putbackspace already consumed bytes are kept in front of the new data.
 *
 * @return next available character, or traits_type::eof() if stream.read
 *         delivered no data
 **/
int_type underflow()
{
	// if there is still data, then return it
	if ( gptr() < egptr() )
		return static_cast<int_type>(*uptr());

	assert ( gptr() == egptr() );

	// number of bytes for putback buffer: what was consumed, limited by
	// the space reserved for putback
	uint64_t const consumed = static_cast<uint64_t>(gptr() - eback());
	uint64_t const keep = std::min(consumed,putbackspace);

	// copy bytes so they end directly in front of the data region
	std::copy(
		gptr()-keep,
		gptr(),
		buffer.begin() + putbackspace - keep
	);

	// load data behind the putback region
	uint64_t const numread = stream.read(
		buffer.begin()+putbackspace,
		buffer.size()-putbackspace
	);

	// set buffer pointers
	setg(
		buffer.begin()+putbackspace-keep,
		buffer.begin()+putbackspace,
		buffer.begin()+putbackspace+numread
	);

	symsread += numread;

	if ( numread == 0 )
		return traits_type::eof();

	return static_cast<int_type>(*uptr());
}
void checkSpace(uint64_t const outlen) { // buffer overflow? if ( freeSpace() < outlen ) { flush(); assert ( opc == opa ); if ( outlen > outbuf.size() ) { ::libmaus::autoarray::AutoArray<uint8_t> newbuf(outlen); std::copy( outbuf.begin(), outbuf.end(), newbuf.begin() ); outbuf = newbuf; opa = outbuf.begin(); opc = opa; ope = outbuf.end(); } } assert ( freeSpace() >= outlen ); }
void internalFlush() { if ( pathigh != patlow ) { #if defined(LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL) uint64_t const bcnt = bgzfenc->writeSyncedCount(C.begin(),pc-C.begin()); libmaus::util::UTF8::encodeUTF8(bcnt,*bgzfidxcntoutstr); libmaus::fastx::FastInterval const FI(patlow,pathigh,0,0,lnumsyms,minlen,maxlen); #else std::pair<uint64_t,uint64_t> bcntccnt = bgzfenc->writeSyncedCount(C.begin(),pc-C.begin()); libmaus::fastx::FastInterval const FI(patlow,pathigh,cacc,cacc+bcntccnt.second,lnumsyms,minlen,maxlen); cacc += bcntccnt.second; #endif (*fioutstr) << FI.serialise(); blockcnt += 1; std::cerr << FI << std::endl; reset(); patlow = pathigh; } }
/**
 * Compute the symbol histogram of the UTF-8 coded array A as a sorted
 * array of (symbol,count) pairs.
 *
 * A is split into parts via computePartStarts. One HistogramThread object
 * is created per part, all sharing one counting hash and one mutex; the
 * threads are joined before the hash is read.
 *
 * @param A UTF-8 coded byte sequence
 * @return array of (symbol,count) pairs sorted via std::sort
 **/
::libmaus::autoarray::AutoArray< std::pair<int64_t,uint64_t> > libmaus::util::Utf8String::getHistogramAsArray(::libmaus::autoarray::AutoArray<uint8_t> const & A)
{
	typedef ::libmaus::util::GetObject<uint8_t const *> getter_type;
	typedef getter_type::unique_ptr_type getter_ptr_type;
	typedef HistogramThread< getter_type > thread_type;
	typedef thread_type::unique_ptr_type thread_ptr_type;
	typedef ::libmaus::util::ExtendingSimpleCountingHash<uint64_t,uint64_t> hash_type;

	#if defined(_OPENMP)
	uint64_t const numthreads = omp_get_max_threads();
	#else
	uint64_t const numthreads = 1;
	#endif

	::libmaus::autoarray::AutoArray<uint64_t> const partstarts = computePartStarts(A,numthreads);
	uint64_t const numparts = partstarts.size()-1;

	// lock is not referenced below; the object is retained as in the previous version
	::libmaus::parallel::OMPLock lock;
	::libmaus::parallel::PosixMutex mutex;
	hash_type ESCH(8u);

	// the getters are referenced by the threads, so they are kept in an
	// array living until after the joins
	::libmaus::autoarray::AutoArray< getter_ptr_type > getters(numparts);
	::libmaus::autoarray::AutoArray< thread_ptr_type > threads(numparts);

	for ( uint64_t part = 0; part < numparts; ++part )
	{
		getter_ptr_type tgetter(new getter_type(A.begin()+partstarts[part]));
		getters[part] = UNIQUE_PTR_MOVE(tgetter);

		uint64_t const partlen = partstarts[part+1]-partstarts[part];
		thread_ptr_type tthread(new thread_type(*getters[part],partlen,mutex,ESCH,part));
		threads[part] = UNIQUE_PTR_MOVE(tthread);
	}

	// wait for all threads and release them
	for ( uint64_t part = 0; part < numparts; ++part )
	{
		threads[part]->join();
		threads[part].reset();
	}

	// collect all used slots of the hash
	::libmaus::autoarray::AutoArray< std::pair<int64_t,uint64_t> > R(ESCH.size(),false);
	uint64_t numstored = 0;

	for ( hash_type::key_type const * slot = ESCH.begin(); slot != ESCH.end(); ++slot )
		if ( *slot != hash_type::unused() )
			R [ numstored++ ] = std::pair<int64_t,uint64_t>(*slot,ESCH.getCount(*slot));

	std::sort(R.begin(),R.end());

	return R;
}
/* decode next block */ bool decodeBlock() { /* open new file if necessary */ bool changedfile = false; while ( fileptr < idda.data.size() && blockptr == idda.data[fileptr].numentries ) { fileptr++; blockptr = 0; changedfile = true; } if ( fileptr == idda.data.size() ) return false; if ( changedfile ) openNewFile(); /* align to word boundary */ GD->flush(); /* read block size */ uint64_t const blocksize = GD->decodeWord(32); /* increase size of memory buffer if necessary */ if ( blocksize > decodebuf.size() ) decodebuf.resize(blocksize); /* set buffer pointers */ pa = decodebuf.begin(); pc = pa; pe = pa + blocksize; /* decode block */ for ( uint64_t i = 0; i < blocksize; ++i ) decodebuf[i] = GD->decode(); /* increment block pointer */ blockptr++; return true; }
/**
 * Construct a collision free hash table for the elements in [ita,ite).
 *
 * Table sizes n = 1,2,4,... up to maxn are tried in turn; the first size
 * for which the values hash() & (n-1) are pairwise distinct is used. The
 * table H stores for each used slot the rank of its element in [ita,ite)
 * and -1 for unused slots.
 *
 * The search stops at the first size that works. k, n and m are not
 * advanced beyond it, so the table never exceeds maxn entries.
 *
 * @param ita start iterator of the element range
 * @param ite end iterator of the element range
 * @param maxn maximum table size
 * @throws libmaus::exception::LibMausException if no table size <= maxn
 *         yields a collision free mapping
 **/
ConstantStringHash(iterator ita, iterator ite, uint64_t const maxn = 64*1024)
{
	k = 0;
	n = (1 << k);
	m = 0;

	bool ok = false;

	while ( n <= maxn )
	{
		// count the elements falling into each slot for the current mask
		libmaus::autoarray::AutoArray<uint64_t> C(n);
		for ( iterator it = ita; it != ite; ++it )
			C [ it->hash() & m ] ++;

		// the mask works if no slot received more than one element
		ok = true;
		for ( uint64_t i = 0; ok && i < n; ++i )
			ok = (C[i] <= 1);

		// keep k, n and m at the values which were just verified
		if ( ok )
			break;

		// next larger table
		++k;
		n <<= 1;
		m = (m << 1)|1;
	}

	if ( ! ok )
	{
		libmaus::exception::LibMausException se;
		se.getStream() << "Cannot create perfect hash of size <= " << maxn << " for " << ite-ita << " elements" << std::endl;
		se.finish();
		throw se;
	}

	// fill the table with the element ranks
	H = libmaus::autoarray::AutoArray<int64_t>(n);
	std::fill(H.begin(),H.end(),-1);
	for ( iterator it = ita; it != ite; ++it )
		H [ it->hash() & m ] = it-ita;

	// each element needs to find its own rank
	for ( iterator it = ita; it != ite; ++it )
		assert ( H [ it->hash() & m ] == it-ita );
}
/**
 * Split the UTF-8 coded array A into at most tnumparts parts of roughly
 * equal size such that no part starts on a continuation byte.
 *
 * An empty array yields zero parts (the returned array then holds the
 * single entry 0), and a request for zero parts is treated like a request
 * for one part. Neither case divides by zero.
 *
 * @param A UTF-8 coded byte sequence
 * @param tnumparts requested number of parts
 * @return array of numparts+1 offsets; part i covers [R[i],R[i+1]) and the
 *         last entry equals A.size()
 **/
::libmaus::autoarray::AutoArray<uint64_t> libmaus::util::Utf8String::computePartStarts(
	::libmaus::autoarray::AutoArray<uint8_t> const & A, uint64_t const tnumparts
)
{
	uint64_t const fs = A.size();

	// avoid a division by zero if no parts were requested
	uint64_t const reqparts = tnumparts ? tnumparts : 1;
	uint64_t const tpartsize = (fs + reqparts-1)/reqparts;

	// tpartsize is zero iff A is empty, in which case there are no parts
	uint64_t const numparts = tpartsize ? ((fs + tpartsize-1)/tpartsize) : 0;

	::libmaus::autoarray::AutoArray<uint64_t> partstarts(numparts+1,false);

	for ( uint64_t i = 0; i < numparts; ++i )
	{
		// nominal start of part i
		uint64_t j = std::min(i*tpartsize,fs);
		::libmaus::util::GetObject<uint8_t const *> G(A.begin()+j);

		// skip continuation bytes (bit pattern 10xxxxxx) so the part starts
		// on the first byte of a code; G.get() presumably yields the next
		// byte on each call
		while ( j != fs && ((G.get() & 0xc0) == 0x80) )
			++j;

		partstarts[i] = j;
	}

	partstarts[numparts] = fs;

	return partstarts;
}
/**
 * Constructor. Allocates the 256 entry tables M and C and sets every entry
 * of M to 1 and every entry of C to 0.
 **/
ConsensusAux()
: M(256), C(256)
{
	std::fill(C.begin(),C.end(),0);
	std::fill(M.begin(),M.end(),1);
}
/**
 * Copy constructor. Copies k, n and m and creates a separate copy of the
 * table H.
 *
 * @param O object to be copied
 **/
ConstantStringHash(ConstantStringHash const & O)
: k(O.k), n(O.n), m(O.m), H(O.H.size(),false)
{
	// H was allocated without initialisation, fill in all entries from O
	for ( uint64_t i = 0; i < O.H.size(); ++i )
		H[i] = O.H[i];
}
/**
 * constructor
 *
 * Opens one input stream per non empty block, reads the first alignment
 * of each such block and puts the block id into the merge queue Q.
 *
 * First a separate file handle per block is tried. If this throws a
 * std::exception, then all blocks are read through one shared file handle
 * (Psingle) instead.
 *
 * @param rindex block index; for each block first is handed to the stream
 *        constructor as position and second is decremented for each
 *        alignment read (presumably file offset and number of alignments - confirm)
 * @param fn file name
 **/
SnappyAlignmentMergeInput(
	std::vector < std::pair < uint64_t, uint64_t > > const & rindex,
	std::string const & fn)
: index(rindex), streams(index.size()), data(index.size()), namecomp(static_cast<uint8_t const *>(0)), heapcomp(namecomp,data.begin()), Q(heapcomp)
{
	bool openok = true;

	// first attempt: one file handle per non empty block
	try
	{
		for ( uint64_t i = 0; i < index.size(); ++i )
			if ( index[i].second )
			{
				libmaus::lz::SnappyOffsetFileInputStream::unique_ptr_type tstreamsi(
					new libmaus::lz::SnappyOffsetFileInputStream(fn,index[i].first)
				);
				streams [ i ] = UNIQUE_PTR_MOVE(tstreamsi);
			}
	}
	catch(std::exception const & ex)
	{
		// the exception itself is not inspected, it only triggers the fallback below
		openok = false;
	}

	// fallback: drop all streams opened so far and share a single file handle
	if ( ! openok )
	{
		std::cerr << "[V] failed to open a file handle for each single collation block, trying to merge through a single file handle" << std::endl;

		for ( uint64_t i = 0; i < index.size(); ++i )
			if ( index[i].second )
				streams[i].reset();

		libmaus::aio::CheckedInputStream::unique_ptr_type TCIS(new libmaus::aio::CheckedInputStream(fn));
		Psingle = UNIQUE_PTR_MOVE(TCIS);

		for ( uint64_t i = 0; i < index.size(); ++i )
			if ( index[i].second )
			{
				libmaus::lz::SnappyOffsetFileInputStream::unique_ptr_type tstreamsi (
					new libmaus::lz::SnappyOffsetFileInputStream(*Psingle,index[i].first)
				);
				streams [ i ] = UNIQUE_PTR_MOVE(tstreamsi);
			}
	}

	// read the first alignment of every non empty block and enqueue the block
	for ( uint64_t i = 0; i < index.size(); ++i )
		if ( index[i].second )
		{
			index[i].second -= 1;

			// the result of the read is only checked in builds with assertions enabled
			#if !defined(NDEBUG)
			bool const alok =
			#endif
				libmaus::bambam::BamDecoder::readAlignmentGz(*(streams[i]),data[i],0,false);
			#if !defined(NDEBUG)
			assert ( alok );
			#endif

			Q.push(i);
		}
}
/**
 * @return pointer to the first entry of the table H
 **/
pair_type const * begin() const { return H.begin(); }