示例#1
0
/**
 * Small round trip test for the serialised Huffman shaped wavelet tree.
 *
 * Builds a Huffman tree for a short text, feeds the text symbol by symbol into
 * part 0 of an ImpExternalWaveletGeneratorHuffmanParallel, writes the result to
 * the file "hufwuf" and loads it back as an ImpHuffmanWaveletTree. The decoded
 * text and, per position, the result of inverseSelect and rank are printed on
 * std::cerr; access, inverseSelect and rank are checked via assert.
 *
 * The file "hufwuf" is left in the current directory.
 **/
void testHuffmanWaveletSer()
{
	// std::string text = "Hello world.";
	std::string text = "fischers fritze fischt frische fische";
	::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end());
	
	::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3);
	// ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen);
	// NOTE(review): the third argument looks like the number of independently
	// writable parts (only exgen[0] is used here) -- confirm against the generator class
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, 1);
	for ( uint64_t i = 0; i < text.size(); ++i )
		exgen[0].putSymbol(text[i]);
		// exgen.putSymbol(text[i]);
	exgen.createFinalStream("hufwuf");

	std::ifstream istr("hufwuf",std::ios::binary);
	// fail early if the file just written cannot be opened
	assert ( istr.is_open() );
	::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr);
	
	// the loops below index text by tree position, so both need to have the same length;
	// this also catches a tree which is missing symbols at the end
	assert ( IHWT.size() == text.size() );
	
	// print the text as decoded from the tree
	for ( uint64_t i = 0; i < IHWT.size(); ++i )
		std::cerr << static_cast<char>(IHWT[i]);
	std::cerr << std::endl;
	
	// print symbol(inverseSelect count)[rank] for each position and cross check the values
	for ( uint64_t i = 0; i < IHWT.size(); ++i )
	{
		// query the tree once per position instead of once per use
		std::pair<int64_t,uint64_t> const is = IHWT.inverseSelect(i);
		uint64_t const r = IHWT.rank(text[i],i);
		
		std::cerr 
			<< static_cast<char>(is.first)
			<< "("
			<< is.second
			<< ")"
			<< "["
			<< r
			<< "]";
			
		// the count returned by inverseSelect is one less than the rank of text[i] at position i
		assert ( is.second + 1 == r );
		// random access returns the original symbol
		assert ( static_cast<int64_t>(IHWT[i]) == text[i] );
	}
	std::cerr << std::endl;
}
示例#2
0
/**
 * Test a Huffman shaped wavelet tree built over the BWT of a text.
 *
 * Constructs a Utf8String from fn (presumably the name of a UTF-8 encoded text
 * file), computes its suffix array, turns the suffix array in place into the BWT
 * (using -1 as terminator symbol), writes a Huffman shaped wavelet tree of the
 * BWT to fn+".hwt" and loads it back. It then checks the total rank of each
 * symbol against the symbol histogram and decodes the text backwards from the
 * tree, comparing each symbol with the original text. All checks use assert.
 *
 * The file fn+".hwt" is removed at the end. An empty text is skipped without
 * performing any checks.
 *
 * @param fn input name passed to Utf8String; also used as prefix for temporary and output file names
 **/
void testUtf8Bwt(std::string const & fn)
{
	::libmaus::util::Utf8String us(fn);

	// an empty text has no last symbol; us[us.size()-1] below would index
	// at an underflowed position, so there is nothing we can check
	if ( us.size() == 0 )
		return;

	typedef ::libmaus::util::Utf8String::saidx_t saidx_t;
	::libmaus::autoarray::AutoArray<saidx_t,::libmaus::autoarray::alloc_type_c> SA = us.computeSuffixArray32();

	// produce bwt in place: each suffix array entry is replaced by the symbol
	// preceding the suffix, or by the terminator -1 for the suffix starting at 0
	for ( uint64_t i = 0; i < SA.size(); ++i )
		if ( SA[i] )
			SA[i] = us[SA[i]-1];
		else
			SA[i] = -1;

	// produce huffman shaped wavelet tree of bwt
	::std::map<int64_t,uint64_t> chist = us.getHistogramAsMap();
	// the terminator occurs exactly once
	chist[-1] = 1;
	::libmaus::huffman::HuffmanTreeNode::shared_ptr_type htree = ::libmaus::huffman::HuffmanBase::createTree(chist);

	::libmaus::util::TempFileNameGenerator tmpgen(fn+"_tmp",3);
	::libmaus::util::FileTempFileContainer tmpcnt(tmpgen);
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman IEWGH(htree.get(),tmpcnt);
	
	// the first bwt symbol is the last text symbol (it precedes the terminator),
	// followed by the SA.size() symbols computed above
	IEWGH.putSymbol(us[us.size()-1]);
	for ( uint64_t i = 0; i < SA.size(); ++i )
		IEWGH.putSymbol(SA[i]);
	IEWGH.createFinalStream(fn+".hwt");

	// load huffman shaped wavelet tree of bwt
	::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IHWT
		(::libmaus::wavelet::ImpHuffmanWaveletTree::load(fn+".hwt"));
		
	// check rank counts: the rank at the last position (index SA.size())
	// has to match the histogram count of each symbol
	for ( ::std::map<int64_t,uint64_t>::const_iterator ita = chist.begin(); ita != chist.end(); ++ita )
		assert ( IHWT->rank(ita->first,SA.size()) == ita->second );
		
	/* cumulative symbol freqs, shifted by 1 to accommodate for terminator -1 */
	// chist is never empty here as it contains at least the terminator
	int64_t const maxsym = chist.rbegin()->first;
	int64_t const shiftedmaxsym = maxsym+1;
	// NOTE(review): relies on AutoArray zero initialising its elements, as symbols
	// missing from chist are never written -- confirm
	::libmaus::autoarray::AutoArray<uint64_t> D(shiftedmaxsym+1);
	for ( ::std::map<int64_t,uint64_t>::const_iterator ita = chist.begin(); ita != chist.end(); ++ita )
		D [ ita->first + 1 ] = ita->second;
	// NOTE(review): presumably turns the counts into exclusive prefix sums, i.e.
	// D[c+1] becomes the number of symbols smaller than c -- confirm
	D.prefixSums();
	
	// terminator has rank 0 and is at position us.size()
	uint64_t rank = 0;
	int64_t pos = us.size();
	
	// decode text backward from bwt
	while ( --pos >= 0 )
	{
		// is.first is the bwt symbol at index rank, is.second is added to the
		// cumulative count of that symbol to obtain the next index to visit
		std::pair< int64_t,uint64_t> const is = IHWT->inverseSelect(rank);
		rank = D[is.first+1] + is.second;		
		assert ( is.first == us[pos] );
	}
	
	// remove huffman shaped wavelet tree
	remove ((fn+".hwt").c_str());
}
示例#3
0
void testUtf8ToImpHuffmanWaveletTree(std::string const & fn)
{
	{
		::libmaus::wavelet::Utf8ToImpHuffmanWaveletTree::constructWaveletTree<true>(fn,fn+".hwt");
		// load huffman shaped wavelet tree of bwt
		::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IHWT
			(::libmaus::wavelet::ImpHuffmanWaveletTree::load(fn+".hwt"));
		::libmaus::util::Utf8String::shared_ptr_type us = ::libmaus::util::Utf8String::constructRaw(fn);
		std::cerr << "checking length " << us->size() << std::endl;
		for ( uint64_t i = 0; i < us->size(); ++i )
			assert ( (*us)[i] == (*IHWT)[i] );
	}
	
	{
		::libmaus::wavelet::Utf8ToImpCompactHuffmanWaveletTree::constructWaveletTree<true>(fn,fn+".hwt");
		// load huffman shaped wavelet tree of bwt
		::libmaus::wavelet::ImpCompactHuffmanWaveletTree::unique_ptr_type IHWT
			(::libmaus::wavelet::ImpCompactHuffmanWaveletTree::load(fn+".hwt"));
		::libmaus::util::Utf8String::shared_ptr_type us = ::libmaus::util::Utf8String::constructRaw(fn);
		std::cerr << "checking length " << us->size() << "," << IHWT->size() << std::endl;
		for ( uint64_t i = 0; i < us->size(); ++i )
			assert ( (*us)[i] == (*IHWT)[i] );		
	}
}
示例#4
0
/**
 * Large test for the Huffman shaped wavelet tree.
 *
 * A sample sentence is doubled 16 times and cut to 1572929 symbols. The text is
 * split into numfrags fragments which are written (in parallel if PAR and
 * OpenMP are available) into an external wavelet tree generator. The resulting
 * file "hufwuf" is loaded back as ImpHuffmanWaveletTree and its encode table is
 * printed. For every position the functions operator[], rank, rankm and
 * inverseSelect are compared (via assert) with each other and with running
 * symbol counts kept in a std::map.
 *
 * The file "hufwuf" is left in the current directory and the macro PAR stays
 * defined after this function.
 **/
void testHuffmanWavelet()
{
	// std::string text = "Hello world.";
	std::string text = "fischers fritze fischt frische fische der biber schwimmt im fluss und bleibt immer treu";
	
	#if 1
	// double the text 16 times
	for ( uint64_t i = 0; i < 16; ++i )
		text = text+text;
	#endif
	
	#if 1	
	// 1572929 is not a multiple of numfrags (128), so the last fragment is shorter than the others
	text = text.substr(0,1572929);
	#endif
		
	std::cerr << "Checking text of size " << text.size() << std::endl;
	
	::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end());
	
	::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3);
	uint64_t const numfrags = 128;
	// fragment length (rounded up), the same for every fragment so computed once
	uint64_t const symsperfrag = (text.size() + numfrags-1)/numfrags;

	#define PAR
	#if defined(PAR)	
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, numfrags);
	#else
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen);
	#endif
	
	// each fragment f writes the symbols in [low,high) to its own generator part
	#if defined(PAR) && defined(_OPENMP)
	#pragma omp parallel for
	#endif
	for ( int64_t f = 0; f < static_cast<int64_t>(numfrags); ++f )
	{	
		// both bounds are clipped to the text length
		uint64_t const low = std::min(static_cast<uint64_t>(f*symsperfrag),static_cast<uint64_t>(text.size()));
		uint64_t const high = std::min(static_cast<uint64_t>(low+symsperfrag),static_cast<uint64_t>(text.size()));

		// std::cerr << "f=" << f << " low=" << low << " high=" << high << std::endl;
		
		for ( uint64_t i = low; i < high; ++i )
			#if defined(PAR)
			exgen[f].putSymbol(text[i]);
			#else
			exgen.putSymbol(text[i]);
			#endif
	}
	exgen.createFinalStream("hufwuf");

	std::ifstream istr("hufwuf",std::ios::binary);
	// fail early if the file just written cannot be opened
	assert ( istr.is_open() );
	::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr);
	::libmaus::autoarray::AutoArray<int64_t> symar = sroot->symbolArray();
	uint64_t const numsyms = symar.size();
	
	// the loop below indexes text by tree position, so both need to have the same length;
	// this also catches a tree which is missing symbols at the end
	assert ( IHWT.size() == text.size() );
	
	::libmaus::huffman::EncodeTable<1> E(IHWT.sroot.get());
	E.print();
	
	#if 0
	for ( uint64_t i = 0; i < IHWT.size(); ++i )
		std::cerr << static_cast<char>(IHWT[i]);
	std::cerr << std::endl;
	#endif
	
	// rmap[sym] is the number of occurrences of sym in the positions processed so far
	std::map<int64_t, uint64_t> rmap;
	
	#if 0
	for ( uint64_t i = 0; i < text.size(); ++i )
	{
		std::cerr << static_cast<char>(IHWT[i]);
	}
	std::cerr << std::endl;
	#endif
	
	for ( uint64_t i = 0; i < IHWT.size(); ++i )
	{
		#if 0
		std::cerr 
			<< static_cast<char>(IHWT.inverseSelect(i).first)
			<< "("
			<< IHWT.inverseSelect(i).second
			<< ")"
			<< "["
			<< IHWT.rank(text[i],i)
			<< "]";
		#endif

		// expected and decoded symbol at position i
		int64_t const expected = text[i];
		int64_t const decoded = static_cast<int64_t>(IHWT[i]);

		/**
		 * check symbol
		 **/
		if ( decoded != expected )
			std::cerr << "Failure for i=" << i << " expected " << static_cast<int>(text[i]) << " got " << IHWT[i] << std::endl;
		assert ( decoded == expected );

		/**
		 * compare rank to rankm
		 **/
		// rankm(sym,i) has to equal rank(sym,i-1), and 0 for i=0
		for ( uint64_t j = 0; j < numsyms; ++j )
		{
			int64_t const sym = symar[j];
			uint64_t const ra = i ? IHWT.rank(sym,i-1) : 0;
			uint64_t const rb = IHWT.rankm(sym,i);
			assert ( ra == rb );
		}

		// rankm has to match the counts before position i is added
		for ( uint64_t j = 0; j < numsyms; ++j )
		{
			int64_t const sym = symar[j];
			assert ( IHWT.rankm(sym,i) == rmap[sym] );
		}

		// query inverseSelect once per position instead of once per check
		std::pair<int64_t,uint64_t> const is = IHWT.inverseSelect(i);

		assert ( is.second == IHWT.rankm(expected,i) );
		assert ( is.second == rmap[expected] );
		
		// count the symbol at position i
		++rmap [ decoded ];
		
		// std::cerr << "i=" << i << " IHWT[i]=" << IHWT[i] << " r=" << r << " IHWT.rank()=" << IHWT.rank(IHWT[i],i) << std::endl;
		
		// rank has to match the counts after position i was added
		for ( uint64_t j = 0; j < numsyms; ++j )
		{
			int64_t const sym = symar[j];
			assert ( IHWT.rank(sym,i) == rmap[sym] );
		}
		
		assert ( is.first == expected );
		
		// std::cerr << IHWT.inverseSelect(i).second << "\t" << rmap[text[i]] << std::endl;

		assert ( is.second + 1 == IHWT.rank(expected,i) );
		assert ( is.second + 1 == rmap[expected] );

		// assert ( IHWT.select ( text[i], IHWT.rank(text[i],i)-1 ) == i );
	}
	#if 0
	std::cerr << std::endl;
	#endif
}