Exemple #1
0
			/**
			 * @return C.begin(), or a null pointer if pc still equals C.begin()
			 **/
			char const* getStart() const
			{
				// pc has not moved away from the start of C: report "no data"
				if ( pc == C.begin() )
					return 0;

				return C.begin();
			}
Exemple #2
0
			/**
			 * Refill the get area from stream once it is exhausted. Up to
			 * pushbackspace previously read bytes are preserved directly in
			 * front of the newly read data.
			 *
			 * @return next available character or traits_type::eof() if
			 *         the stream delivered no more data
			 **/
			int_type underflow()
			{
				// unread characters are still available
				if ( gptr() < egptr() )
					return static_cast<int_type>(*uptr());

				assert ( gptr() == egptr() );

				// position in the buffer where newly read data is stored
				char * const readstart = buffer.begin() + pushbackspace;
				// number of bytes already handed out from the current get area
				uint64_t const consumed = static_cast<uint64_t>(gptr()-eback());
				// number of bytes available in front of readstart
				uint64_t const room = static_cast<uint64_t>(readstart-buffer.begin());
				// number of bytes we keep for putback
				uint64_t const keep = std::min(consumed,room);

				// regions may overlap, so memmove is required here
				::std::memmove(readstart-keep,gptr()-keep,keep);

				stream.read(readstart, buffer.end()-readstart);
				size_t const got = stream.gcount();
				streamreadpos += got;

				setg(readstart-keep, readstart, readstart+got);

				if ( got )
					return static_cast<int_type>(*uptr());
				else
					return traits_type::eof();
			}
			/**
			 * Constructor. Sets up the two 256 entry symbol tables T0 and T1:
			 * A/a, C/c, G/g and T/t map to 0, 1, 2 and 3 in both tables, every
			 * other symbol maps to 4 in T0 and to 5 in T1. Also registers
			 * the "MD" and "NM" tags in auxvec.
			 **/
			MdStringComputationContext()
			: T0(256,false), T1(256,false), nm(0)
			{
				// default codes for all symbols which are not A,C,G,T
				std::fill(T0.begin(),T0.end(),4);
				std::fill(T1.begin(),T1.end(),5);

				// upper and lower case bases share a code in both tables
				char const upper[] = { 'A', 'C', 'G', 'T' };
				char const lower[] = { 'a', 'c', 'g', 't' };
				for ( int code = 0; code < 4; ++code )
					T0[upper[code]] = T0[lower[code]] = T1[upper[code]] = T1[lower[code]] = code;

				auxvec.set("MD");
				auxvec.set("NM");
			}
			void fillBuffer()
			{
				assert ( pc == pe );
				
				if ( setpos )
				{
					// std::cerr << "Seeking to " << readpos << std::endl;
					in.seekg(readpos);
					in.clear();
				}

				if ( in.peek() >= 0 && readpos < endpos )
				{
					#if 0
					std::cerr << "Filling block, readpos " << readpos 
						<< " stream at pos " << in.tellg() 
						<< " endpos " << endpos
						<< std::endl;
					#endif
				
					uint64_t blocksize = sizeof(uint64_t) + ( bigbuf ? sizeof(uint64_t) : 0 );
					
					// size of uncompressed buffer
					uint64_t const n = 
						bigbuf ?
							::libmaus::util::NumberSerialisation::deserialiseNumber(in)
							:
							::libmaus::util::UTF8::decodeUTF8(in,blocksize)
						;

					// size of compressed data
					uint64_t const datasize = ::libmaus::util::NumberSerialisation::deserialiseNumber(in);
					// add to block size
					blocksize += datasize;
						
					if ( n > B.size() )
					{
						B = ::libmaus::autoarray::AutoArray<char>(0,false);
						B = ::libmaus::autoarray::AutoArray<char>(n,false);
					}
					
					pa = B.begin();
					pc = pa;
					pe = pa + n;

					::libmaus::aio::IStreamWrapper wrapper(in);
					::libmaus::lz::IstreamSource< ::libmaus::aio::IStreamWrapper> insource(wrapper,datasize);

					SnappyCompress::uncompress(insource,B.begin(),n);

					readpos += blocksize;
				}
			}
			/**
			 * Decode pattern number i into pat.
			 *
			 * @param pat object receiving the decoded pattern
			 * @param i pattern id; the dictionary is indexed by i - dict->FI.low
			 **/
			void getPattern(pattern_type & pat, uint64_t i)
			{
				// the member context carries the id for the decoder
				C.nextid = i;

				// reader positioned at the offset the dictionary stores for i
				GetObject reader(T.begin()+(*dict)[i - dict->FI.low]);
				::libmaus::fastx::CompactFastQDecoderBase::decodePattern<GetObject>(reader,*H,C,pat);

				pat.patid = i;
			}
Exemple #6
0
			/**
			 * Constructor. Stores the index file name and block size, sets up
			 * temporary files derived from the index file name, opens output
			 * streams on them and creates the BGZF encoder writing to out.
			 * All counters start at zero and the write pointer pc is placed
			 * at the start of the (initially empty) buffer C.
			 *
			 * @param rindexfilename name of the index file; temporary file names
			 *        are built by appending suffixes to it
			 * @param rpatperblock number of patterns per block
			 * @param out stream handed to the BGZF encoder
			 * @param level compression level handed to the BGZF encoder
			 **/
			FastQBgzfWriter(
				::std::string rindexfilename,
				uint64_t const rpatperblock,
				std::ostream & out,
				int level = Z_DEFAULT_COMPRESSION
			) : indexfilename(rindexfilename), patperblock(rpatperblock),
			    fifilename(setupTempFile(indexfilename + ".tmp.fi")), 
			    #if defined(LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL)
			    // the parallel encoder additionally gets two temporary index files
			    bgzfidxfilename(setupTempFile(indexfilename + ".tmp.bgzfidx")),
			    bgzfidxcntfilename(setupTempFile(indexfilename + ".tmp.bgzfidx.cnt")),
			    bgzfidoutstr(new libmaus::aio::CheckedOutputStream(bgzfidxfilename)),
			    bgzfidxcntoutstr(new libmaus::aio::CheckedOutputStream(bgzfidxcntfilename)),
			    #endif
			    fioutstr(new libmaus::aio::CheckedOutputStream(fifilename)),
			    C(0,false), patlow(0), blockcnt(0),
			    #if defined(LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL)
			    bgzfenc(new libmaus::lz::BgzfDeflateParallel(out,32,128,level,bgzfidoutstr.get())),
			    #else
			    bgzfenc(new libmaus::lz::BgzfDeflate<std::ostream>(out,level)),
			    #endif
			    lnumsyms(0),
			    // minlen starts at the maximum so any pattern length lowers it
			    minlen(std::numeric_limits<uint64_t>::max()),
			    maxlen(0),
			    // NOTE(review): reads patlow, so patlow must be declared before pathigh - confirm
			    pathigh(patlow),
			    pc(C.begin()),
			    p(0),
			    cacc(0)
			{
			}
Exemple #7
0
			/**
			 * @return the current pointer pc, or a null pointer if pc still
			 *         equals C.begin()
			 **/
			char const* getEnd() const
			{
				// pc has not moved away from the start of C: report "no data"
				if ( pc == C.begin() )
					return 0;

				return pc;
			}
			/**
			 * Decode element number i into pat using a local decoding
			 * context, so the object itself is not modified.
			 *
			 * @param pat object receiving the decoded element
			 * @param i element id; the dictionary is indexed by i - dict->FI.low
			 **/
			void getElement(element_type & pat, uint64_t i) const
			{
				// local context carrying the id for the decoder
				::libmaus::fastx::CompactFastQContext context;
				context.nextid = i;

				// reader positioned at the offset the dictionary stores for i
				GetObject reader(T.begin()+(*dict)[i - dict->FI.low]);
				::libmaus::fastx::CompactFastQDecoderBase::decodeElement<GetObject>(reader,*H,context,pat);
			}
			/**
			 * Construct container from a socket: first the text is read in
			 * blocks into T, then the dictionary is constructed from the same
			 * socket, finally the header is parsed from the start of T.
			 *
			 * @param textstr socket the data is received from
			 **/
			CompactFastQContainer(::libmaus::network::SocketBase * textstr)
			: T(textstr->readMessageInBlocks<uint8_t,::libmaus::autoarray::alloc_type_cxx>()), 
			  dict(new ::libmaus::fastx::CompactFastQContainerDictionary(textstr)), H(), C()
			{
				// the header is decoded from the beginning of the text
				GetObject headersource(T.begin());
				::libmaus::fastx::CompactFastQHeader::unique_ptr_type theader(
					new ::libmaus::fastx::CompactFastQHeader(headersource)
				);
				H = UNIQUE_PTR_MOVE(theader);
			}
Exemple #10
0
/**
 * Compute a histogram of the symbols decoded from the UTF-8 coded array A.
 * The array is split into parts by computePartStarts, each part is decoded
 * into its own histogram (in parallel if OpenMP is available) and the
 * partial histograms are merged under a lock.
 *
 * @param A UTF-8 coded text
 * @return histogram of the decoded symbols
 **/
::libmaus::util::Histogram::unique_ptr_type libmaus::util::Utf8String::getHistogram(::libmaus::autoarray::AutoArray<uint8_t> const & A)
{
	#if defined(_OPENMP)
	uint64_t const numthreads = omp_get_max_threads();
	#else
	uint64_t const numthreads = 1;
	#endif

	// part boundaries, one more entry than there are parts
	::libmaus::autoarray::AutoArray<uint64_t> const bounds = computePartStarts(A,numthreads);
	uint64_t const numparts = bounds.size()-1;
	int64_t const snumparts = static_cast<int64_t>(numparts);

	::libmaus::util::Histogram::unique_ptr_type result(new ::libmaus::util::Histogram);
	::libmaus::parallel::OMPLock mergelock;

	#if defined(_OPENMP)
	#pragma omp parallel for
	#endif
	for ( int64_t part = 0; part < snumparts; ++part )
	{
		// histogram for this part only
		::libmaus::util::Histogram::unique_ptr_type parthist(new ::libmaus::util::Histogram);

		// number of code bytes in this part and number decoded so far
		uint64_t const partbytes = bounds[part+1]-bounds[part];
		uint64_t decodedbytes = 0;
		::libmaus::util::GetObject<uint8_t const *> reader(A.begin()+bounds[part]);

		// decodeUTF8 advances decodedbytes by the length of each code
		while ( decodedbytes != partbytes )
			(*parthist)(::libmaus::util::UTF8::decodeUTF8(reader,decodedbytes));

		// merge into the global histogram, one thread at a time
		mergelock.lock();
		result->merge(*parthist);
		mergelock.unlock();
	}

	return UNIQUE_PTR_MOVE(result);
}
			/**
			 * Constructor opening file fn for output.
			 *
			 * @param fn name of the file passed to doOpen
			 * @param rbuffersize buffer size; a negative value selects the
			 *        value returned by getOptimalIOBlockSize for the opened
			 *        file descriptor
			 **/
			LinuxStreamingPosixFdOutputStreamBuffer(std::string const & fn, int64_t const rbuffersize)
			: fd(doOpen(fn)), closefd(true), 
			  optblocksize((rbuffersize < 0) ? getOptimalIOBlockSize(fd,std::string()) : rbuffersize),
			  buffersize(optblocksize),
			  buffer(buffersize,false), prevwrite(0,0)
			{
				// the put area excludes the last byte of the buffer
				// NOTE(review): a buffer size of 0 would make buffer.end()-1 precede buffer.begin() - confirm callers never pass 0
				setp(buffer.begin(),buffer.end()-1);
			}
			/**
			 * Compress inputLength bytes starting at input into output.
			 * The output array is reallocated if it is smaller than
			 * SnappyCompress::compressBound(inputLength).
			 *
			 * @param input data to be compressed
			 * @param inputLength number of bytes to be compressed
			 * @param output array receiving the compressed data
			 * @return value returned by SnappyCompress::rawcompress
			 **/
			virtual size_t compress(char const * input, size_t inputLength, libmaus::autoarray::AutoArray<char> & output)
			{
				uint64_t const needed = SnappyCompress::compressBound(inputLength);
				bool const toosmall = output.size() < needed;

				// previous contents are not kept when reallocating
				if ( toosmall )
					output = libmaus::autoarray::AutoArray<char>(needed,false);

				return SnappyCompress::rawcompress(input,inputLength,output.begin());
			}
Exemple #13
0
			/**
			 * Reset the per block state: rewind the write pointer to the
			 * start of C and clear the symbol count, length bounds and p.
			 **/
			void reset()
			{
				// rewind buffer
				pc = C.begin();
				p = 0;

				// reset statistics; minlen starts at the maximum so that
				// any pattern length lowers it
				lnumsyms = 0;
				maxlen = 0;
				minlen = std::numeric_limits<uint64_t>::max();
			}
			/**
			 * Constructor. Allocates buffer B with getBgzfMaxBlockSize()
			 * elements and sets pa and pc to its start and pe to its end.
			 **/
			BgzfParallelRecodeDeflateBase()
			: B(getBgzfMaxBlockSize(),false), 
			  pa(B.begin()), 
			  pc(B.begin()), 
			  pe(B.end())
			{
			
			}
Exemple #15
0
			/**
			 * Constructor. Opens the output on filename, sets up the block
			 * buffer A with rblocksize entries and writes n followed by
			 * ralbits to the output.
			 *
			 * @param filename name of the output file
			 * @param ralbits value stored in albits and written after n
			 * @param n first value written to the output
			 * @param rblocksize block size (size of the buffer A)
			 * @param rbufsize buffer size passed to SGO
			 **/
			GammaRLEncoder(std::string const & filename, unsigned int const ralbits, uint64_t const n, uint64_t const rblocksize, uint64_t const rbufsize = 64*1024)
			: 
			  blocksize(rblocksize),
			  // NOTE(review): SGO wraps COS and GE wraps SGO, so the members must be declared in this order - confirm
			  COS(filename), SGO(COS,rbufsize), GE(SGO), 
			  A(blocksize), pa(A.begin()), pc(pa), pe(A.end()), 
			  cursym(0), curcnt(0), indexwritten(false), albits(ralbits)
			{
				// header: n first, then albits
				SGO.put(n);
				SGO.put(albits);
			}
Exemple #16
0
			/**
			 * Get an element from the free list. If the free list is empty,
			 * then the allocation list is grown to at least twice its size
			 * (minimum 1), new elements are allocated for the added slots
			 * and put on the (equally grown) free list.
			 *
			 * @return pointer to an element taken from the free list
			 **/
			element_type * get()
			{
				if ( freelistfill == 0 )
				{
					uint64_t const oldallocsize = alloclist.size();
					uint64_t const newallocsize = std::max(
						static_cast<uint64_t>(1),
						static_cast<uint64_t>(2*oldallocsize)
					);

					// larger allocation list: old pointers first, null pointers behind
					libmaus::autoarray::AutoArray<element_type *> grownalloc(newallocsize,false);
					std::copy(alloclist.begin(),alloclist.end(),grownalloc.begin());
					element_type * nullp = 0;
					std::fill(grownalloc.begin()+oldallocsize,grownalloc.end(),nullp);

					// allocate an object for each newly added slot
					for ( element_type ** slot = grownalloc.begin()+oldallocsize;
						slot != grownalloc.end(); ++slot )
						*slot = new element_type;

					uint64_t const oldfreesize = freelist.size();
					uint64_t const newfreesize = std::max(
						static_cast<uint64_t>(1),
						static_cast<uint64_t>(2*oldfreesize)
					);

					// larger free list: old entries first, null pointers behind
					libmaus::autoarray::AutoArray<element_type *> grownfree(newfreesize,false);
					std::copy(freelist.begin(),freelist.end(),grownfree.begin());
					std::fill(grownfree.begin()+oldfreesize,grownfree.end(),nullp);

					freelist = grownfree;

					// put the newly allocated objects on the free list
					for ( element_type ** slot = grownalloc.begin()+oldallocsize;
						slot != grownalloc.end(); ++slot )
						freelist[freelistfill++] = *slot;

					alloclist = grownalloc;
				}

				// hand out the top most free element
				return freelist[--freelistfill];
			}
Exemple #17
0
			/**
			 * Given a pointer e just behind the end of a four line entry
			 * (id, sequence, plus and quality line, each terminated by a
			 * newline) stored in C, find the start of that entry.
			 *
			 * @param e pointer just behind the newline of the quality line
			 * @return pointer to the first character of the entry's id line,
			 *         or a null pointer if e equals C.begin()
			 **/
			char const * prevStart(char const * e) const
			{
				// there is nothing in front of the start of the buffer
				if ( e == C.begin() )
					return 0;

				assert ( e[-1] == '\n' );
				// step over last/quality line's newline
				--e;

				// walk back over the newlines terminating the plus line,
				// the sequence line and the id line (in this order)
				for ( unsigned int line = 0; line < 3; ++line )
					while ( *--e != '\n' ) {}

				// e now points at the newline ending the id line, move on to
				// the first character of that line
				while ( e != C.begin() && e[-1] != '\n' )
					--e;

				return e;
			}
Exemple #18
0
			/**
			 * Assignment operator, copies all and low from o.
			 *
			 * @param o histogram to be copied
			 * @return *this
			 **/
			Histogram & operator=(Histogram const & o)
			{
				// self assignment, nothing to do
				if ( this == &o )
					return *this;

				all = o.all;

				// reallocate low only if the sizes differ
				bool const samesize = ( low.size() == o.low.size() );
				if ( ! samesize )
					low = ::libmaus::autoarray::AutoArray<uint64_t>(o.low.size(),false);

				std::copy(o.low.begin(),o.low.end(),low.begin());

				return *this;
			}
Exemple #19
0
			/**
			 * Append pattern in FastQ format (id line starting with '@',
			 * sequence line, plus line starting with '+', quality line) to
			 * buffer C and update the block statistics. The block is flushed
			 * once it contains patperblock patterns.
			 *
			 * @param pattern pattern to be appended
			 **/
			void put(libmaus::fastx::FastQReader::pattern_type const & pattern)
			{
				uint64_t const needed = getFastQLength(pattern);

				// double the buffer (minimum size 1) until the pattern fits
				while ( (C.end() - pc) < static_cast<ptrdiff_t>(needed) )
				{
					uint64_t const used = pc-C.begin();
					C.resize(std::max(2*C.size(),static_cast<uint64_t>(1ull)));
					// resize may have moved the data, recompute pointer
					pc = C.begin()+used;
				}

				// id line
				*pc++ = '@';
				pc = std::copy(pattern.sid.begin(),pattern.sid.end(),pc);
				*pc++ = '\n';

				// sequence line
				pc = std::copy(pattern.spattern.begin(), pattern.spattern.end(),pc);
				*pc++ = '\n';

				// plus line
				*pc++ = '+';
				pc = std::copy(pattern.plus.begin(), pattern.plus.end(),pc);
				*pc++ = '\n';

				// quality line
				pc = std::copy(pattern.quality.begin(), pattern.quality.end(),pc);
				*pc++ = '\n';

				assert ( pc <= C.end() );

				// update statistics for the current block
				uint64_t const seqlen = static_cast<uint64_t>(pattern.spattern.size());
				lnumsyms += pattern.spattern.size();
				minlen = std::min(minlen,seqlen);
				maxlen = std::max(maxlen,seqlen);
				++pathigh;

				// block is complete
				if ( pathigh - patlow == patperblock )
					internalFlush();
			}
			/**
			 * Refill the get area from stream once it is exhausted. Up to
			 * putbackspace previously read bytes are preserved directly in
			 * front of the newly loaded data.
			 *
			 * @return next available character or traits_type::eof() if
			 *         no more data could be loaded
			 **/
			int_type underflow()
			{
				// if there is still data, then return it
				if ( gptr() < egptr() )
					return static_cast<int_type>(*uptr());

				assert ( gptr() == egptr() );

				// number of bytes for putback buffer
				uint64_t const putbackcopy = std::min(
					static_cast<uint64_t>(gptr() - eback()),
					putbackspace
				);

				// copy bytes; after a read which delivered no data the get
				// pointer sits at buffer.begin()+putbackspace, so source and
				// destination coincide. std::copy does not allow the
				// destination to lie inside the source range, and the bytes
				// are already in place in that case, so skip the copy.
				char * const putbacksrc = gptr()-putbackcopy;
				char * const putbackdst = buffer.begin() + putbackspace - putbackcopy;
				if ( putbacksrc != putbackdst )
					std::copy(putbacksrc,gptr(),putbackdst);

				// load data
				uint64_t const uncompressedsize = stream.read(
						buffer.begin()+putbackspace,
						buffer.size()-putbackspace
					);

				// set buffer pointers
				setg(
					buffer.begin()+putbackspace-putbackcopy,
					buffer.begin()+putbackspace,
					buffer.begin()+putbackspace+uncompressedsize);

				symsread += uncompressedsize;

				if ( uncompressedsize )
					return static_cast<int_type>(*uptr());
				else
					return traits_type::eof();
			}
Exemple #21
0
		void checkSpace(uint64_t const outlen)
		{
			// buffer overflow?
			if ( freeSpace() < outlen )
			{
				flush();
				assert ( opc == opa );
			
				if ( outlen > outbuf.size() )
				{
					::libmaus::autoarray::AutoArray<uint8_t> newbuf(outlen);	
					std::copy( outbuf.begin(), outbuf.end(), newbuf.begin() );
					
					outbuf = newbuf;
					opa = outbuf.begin();
					opc = opa;
					ope = outbuf.end();
				}
			}
			
			assert ( freeSpace() >= outlen );		
		}
Exemple #22
0
			void internalFlush()
			{
				if ( pathigh != patlow )
				{
					#if defined(LIBMAUS_FASTX_FASTQBGZFWRITER_PARALLEL)
					uint64_t const bcnt = bgzfenc->writeSyncedCount(C.begin(),pc-C.begin());
					libmaus::util::UTF8::encodeUTF8(bcnt,*bgzfidxcntoutstr);
					libmaus::fastx::FastInterval const FI(patlow,pathigh,0,0,lnumsyms,minlen,maxlen);
					#else
					std::pair<uint64_t,uint64_t> bcntccnt = bgzfenc->writeSyncedCount(C.begin(),pc-C.begin());
					libmaus::fastx::FastInterval const FI(patlow,pathigh,cacc,cacc+bcntccnt.second,lnumsyms,minlen,maxlen);
					cacc += bcntccnt.second;
					#endif
					
					(*fioutstr) << FI.serialise();				
					blockcnt += 1;
						
					std::cerr << FI << std::endl;
					
					reset();
					patlow = pathigh;
				}	
			}
Exemple #23
0
/**
 * Compute a histogram of the symbols decoded from the UTF-8 coded array A
 * and return it as an array of (symbol,count) pairs sorted by std::sort.
 * The array is split into parts by computePartStarts; one HistogramThread
 * is created per part, all of them sharing one counting hash and one mutex.
 *
 * @param A UTF-8 coded text
 * @return sorted array of (symbol,count) pairs
 **/
::libmaus::autoarray::AutoArray< std::pair<int64_t,uint64_t> > libmaus::util::Utf8String::getHistogramAsArray(::libmaus::autoarray::AutoArray<uint8_t> const & A)
{			
	#if defined(_OPENMP)
	uint64_t const numthreads = omp_get_max_threads();
	#else
	uint64_t const numthreads = 1;
	#endif
	
	// part boundaries, one more entry than there are parts
	::libmaus::autoarray::AutoArray<uint64_t> const partstarts = computePartStarts(A,numthreads);
	uint64_t const numparts = partstarts.size()-1;

	// mutex and hash shared by all threads
	::libmaus::parallel::PosixMutex mutex;
	::libmaus::util::ExtendingSimpleCountingHash<uint64_t,uint64_t> ESCH(8u);
	
	typedef HistogramThread< ::libmaus::util::GetObject<uint8_t const *> > thread_type;
	typedef thread_type::unique_ptr_type thread_ptr_type;
	::libmaus::autoarray::AutoArray< ::libmaus::util::GetObject<uint8_t const *>::unique_ptr_type > getters(numparts);
	::libmaus::autoarray::AutoArray<thread_ptr_type> threads(numparts);

	// create one reader and one thread object per part
	for ( uint64_t i = 0; i < numparts; ++i )
	{
		::libmaus::util::GetObject<uint8_t const *>::unique_ptr_type tgettersi(
                                new ::libmaus::util::GetObject<uint8_t const *>(A.begin()+partstarts[i])
                        );
		getters[i] = UNIQUE_PTR_MOVE(tgettersi);
		thread_ptr_type tthreadsi(new thread_type(*getters[i],
                                partstarts[i+1]-partstarts[i],mutex,ESCH,i));
		threads[i] = UNIQUE_PTR_MOVE(tthreadsi);
	}
	// wait for all threads and release them
	for ( uint64_t i = 0; i < numparts; ++i )
	{
		threads[i]->join();
		threads[i].reset();
	}	

	// extract the used slots of the hash as (symbol,count) pairs
	// NOTE(review): R has ESCH.size() entries but only p of them are assigned
	// below; this is only correct if size() is the number of used keys - confirm
	::libmaus::autoarray::AutoArray< std::pair<int64_t,uint64_t> > R(ESCH.size(),false);
	uint64_t p = 0;
	for ( ::libmaus::util::ExtendingSimpleCountingHash<uint64_t,uint64_t>::key_type const * ita =
		ESCH.begin(); ita != ESCH.end(); ++ita )
		if ( *ita != ::libmaus::util::ExtendingSimpleCountingHash<uint64_t,uint64_t>::unused() )
			R [ p++ ] = std::pair<int64_t,uint64_t>(*ita,ESCH.getCount(*ita));
			
	std::sort(R.begin(),R.end());
	
	return R;
}
Exemple #24
0
			/**
			 * Decode the next block into decodebuf and set pa, pc and pe to
			 * the decoded data. Moves on to the next file with remaining
			 * blocks if the current one is exhausted.
			 *
			 * @return false if there are no more files, true otherwise
			 **/
			bool decodeBlock()
			{
				/* skip files which have no blocks left */
				bool switched = false;
				while ( fileptr < idda.data.size() && blockptr == idda.data[fileptr].numentries )
				{
					++fileptr;
					blockptr = 0;
					switched = true;
				}

				/* no more files */
				if ( fileptr == idda.data.size() )
					return false;

				/* open new file if we moved to another one */
				if ( switched )
					openNewFile();

				/* align to word boundary */
				GD->flush();
				/* number of values in the block is stored as a 32 bit word */
				uint64_t const numvalues = GD->decodeWord(32);

				/* increase size of memory buffer if necessary */
				if ( decodebuf.size() < numvalues )
					decodebuf.resize(numvalues);

				/* set buffer pointers */
				pa = decodebuf.begin();
				pc = pa;
				pe = pa + numvalues;

				/* decode the values of the block */
				for ( uint64_t idx = 0; idx < numvalues; ++idx )
					decodebuf[idx] = GD->decode();

				/* block done */
				++blockptr;

				return true;
			}
			/**
			 * Construct a collision free hash table for the elements in
			 * [ita,ite). Table sizes n = 1,2,4,... up to maxn are tried in
			 * turn; the slot of an element is hash() & m with m = n-1. The
			 * smallest size without collisions is used. H maps each slot to
			 * the index of its element in [ita,ite) or -1 if it is unused.
			 *
			 * The search stops at the first collision free size without
			 * stepping k, n and m once more, so the table never exceeds maxn.
			 *
			 * @param ita start iterator of the elements
			 * @param ite end iterator of the elements
			 * @param maxn maximum table size
			 * @throws libmaus::exception::LibMausException if no table size
			 *         up to maxn is collision free
			 **/
			ConstantStringHash(iterator ita, iterator ite, uint64_t const maxn = 64*1024)
			{
				k = 0;
				n = (1 << k);
				m = 0;
				bool ok = false;
				
				while ( n <= maxn )
				{
					// number of elements mapped to each slot for this table size
					libmaus::autoarray::AutoArray<uint64_t> C(n);

					for ( iterator it = ita; it != ite; ++it )
						C [ it->hash() & m ] ++;
						
					// size is fine if no slot is hit more than once
					ok = true;
					for ( uint64_t i = 0; ok && i < n; ++i )
						ok = C[i] <= 1;

					// keep k, n and m as they are if this size works
					if ( ok )
						break;

					// try next power of two
					++k;
					n <<= 1;
					m = (m << 1)|1;
				}
				
				if ( ! ok )
				{
					libmaus::exception::LibMausException se;
					se.getStream() << "Cannot create perfect hash of size <= " << maxn << " for " << ite-ita << " elements" << std::endl;
					se.finish();
					throw se;
				}
				
				// -1 marks unused slots
				H = libmaus::autoarray::AutoArray<int64_t>(n);
				std::fill(H.begin(),H.end(),-1);
				
				for ( iterator it = ita; it != ite; ++it )
					H [ it->hash() & m ] = it-ita;

				// sanity check: every element is found in its slot
				for ( iterator it = ita; it != ite; ++it )
					assert ( H [ it->hash() & m ] == it-ita );
			}
Exemple #26
0
/**
 * Split the UTF-8 coded array A into at most tnumparts parts of roughly
 * equal size such that no part starts on a continuation byte (10xxxxxx).
 *
 * The returned array has one more entry than there are parts; entry i is
 * the start of part i and the last entry is A.size(). For an empty array
 * the result is the single entry 0 (no parts). A value of 0 for tnumparts
 * is treated like 1.
 *
 * @param A UTF-8 coded text
 * @param tnumparts requested number of parts
 * @return array of part start offsets terminated by A.size()
 **/
::libmaus::autoarray::AutoArray<uint64_t> libmaus::util::Utf8String::computePartStarts(
	::libmaus::autoarray::AutoArray<uint8_t> const & A, uint64_t const tnumparts
)
{
	uint64_t const fs = A.size();

	// empty input: no parts, avoid dividing by a part size of zero below
	if ( fs == 0 )
	{
		::libmaus::autoarray::AutoArray<uint64_t> emptystarts(1,false);
		emptystarts[0] = 0;
		return emptystarts;
	}

	// avoid division by zero if no parts were requested
	uint64_t const reqparts = std::max(tnumparts,static_cast<uint64_t>(1));
	// fs >= 1 and reqparts >= 1, so tpartsize >= 1
	uint64_t const tpartsize = (fs + reqparts-1)/reqparts;
	uint64_t const numparts = (fs + tpartsize-1)/tpartsize;
	::libmaus::autoarray::AutoArray<uint64_t> partstarts(numparts+1,false);

	for ( uint64_t i = 0; i < numparts; ++i )
	{
		uint64_t j = std::min(i*tpartsize,fs);
		::libmaus::util::GetObject<uint8_t const *> G(A.begin()+j);
		
		// move forward while we are on a continuation byte
		while ( j != fs && ((G.get() & 0xc0) == 0x80) )
			++j;
			
		partstarts[i] = j;					
	}
	
	partstarts[numparts] = fs;

	return partstarts;
}
Exemple #27
0
	/**
	 * Constructor. Sets all 256 entries of C to 0 and all 256 entries
	 * of M to 1.
	 **/
	ConsensusAux() : M(256), C(256)
	{
		std::fill(C.begin(),C.end(),0);
		std::fill(M.begin(),M.end(),1);
	}
			/**
			 * Copy constructor. Copies k, n and m and creates an element
			 * wise copy of the table H.
			 *
			 * @param O object to be copied
			 **/
			ConstantStringHash(ConstantStringHash const & O)
			: k(O.k), n(O.n), m(O.m), H(O.H.size(),false)
			{
				// H was allocated uninitialised, fill it from O.H
				for ( uint64_t slot = 0; slot < O.H.size(); ++slot )
					H[slot] = O.H[slot];
			} 
			/**
			 * constructor
			 *
			 * @param rindex block index
			 * @param fn file name
			 **/
			SnappyAlignmentMergeInput(
				std::vector < std::pair < uint64_t, uint64_t > > const & rindex,
				std::string const & fn)
			: index(rindex), streams(index.size()), data(index.size()), namecomp(static_cast<uint8_t const *>(0)), heapcomp(namecomp,data.begin()), Q(heapcomp)
			{
				bool openok = true;
			
				try
				{
					for ( uint64_t i = 0; i < index.size(); ++i )
						if ( index[i].second )
						{
							libmaus::lz::SnappyOffsetFileInputStream::unique_ptr_type tstreamsi(
                                	                                        new libmaus::lz::SnappyOffsetFileInputStream(fn,index[i].first)
                                        	                        );
							streams [ i ] = UNIQUE_PTR_MOVE(tstreamsi);
						}
				}
				catch(std::exception const & ex)
				{
					openok = false;
				}
				
				if ( ! openok )
				{
					std::cerr << "[V] failed to open a file handle for each single collation block, trying to merge through a single file handle" << std::endl;

					for ( uint64_t i = 0; i < index.size(); ++i )
						if ( index[i].second )
							streams[i].reset();
					
					libmaus::aio::CheckedInputStream::unique_ptr_type TCIS(new libmaus::aio::CheckedInputStream(fn));
					Psingle	= UNIQUE_PTR_MOVE(TCIS);

					for ( uint64_t i = 0; i < index.size(); ++i )
						if ( index[i].second )
						{
							libmaus::lz::SnappyOffsetFileInputStream::unique_ptr_type tstreamsi
							(
                                	                	new libmaus::lz::SnappyOffsetFileInputStream(*Psingle,index[i].first)
                                        	        );
							streams [ i ] = UNIQUE_PTR_MOVE(tstreamsi);
						}
				}
					
				for ( uint64_t i = 0; i < index.size(); ++i )
					if ( index[i].second )
					{
						index[i].second -= 1;

						#if !defined(NDEBUG)
						bool const alok = 
						#endif
						        libmaus::bambam::BamDecoder::readAlignmentGz(*(streams[i]),data[i],0,false);
						        
						#if !defined(NDEBUG)
						assert ( alok );
						#endif
						
						Q.push(i);
					}
			}
Exemple #30
0
			/**
			 * @return pointer to the first element of H
			 **/
			pair_type const * begin() const
			{
				return H.begin();
			}