void testHuffmanWaveletSer() { // std::string text = "Hello world."; std::string text = "fischers fritze fischt frische fische"; ::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end()); ::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3); // ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen); ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, 1); for ( uint64_t i = 0; i < text.size(); ++i ) exgen[0].putSymbol(text[i]); // exgen.putSymbol(text[i]); exgen.createFinalStream("hufwuf"); std::ifstream istr("hufwuf",std::ios::binary); ::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr); for ( uint64_t i = 0; i < IHWT.size(); ++i ) std::cerr << static_cast<char>(IHWT[i]); std::cerr << std::endl; for ( uint64_t i = 0; i < IHWT.size(); ++i ) { std::cerr << static_cast<char>(IHWT.inverseSelect(i).first) << "(" << IHWT.inverseSelect(i).second << ")" << "[" << IHWT.rank(text[i],i) << "]"; assert ( IHWT.inverseSelect(i).second + 1 == IHWT.rank(text[i],i) ); assert ( static_cast<int64_t>(IHWT[i]) == text[i] ); } std::cerr << std::endl; }
void testUtf8Bwt(std::string const & fn) { ::libmaus::util::Utf8String us(fn); typedef ::libmaus::util::Utf8String::saidx_t saidx_t; ::libmaus::autoarray::AutoArray<saidx_t,::libmaus::autoarray::alloc_type_c> SA = us.computeSuffixArray32(); // produce bwt for ( uint64_t i = 0; i < SA.size(); ++i ) if ( SA[i] ) SA[i] = us[SA[i]-1]; else SA[i] = -1; // produce huffman shaped wavelet tree of bwt ::std::map<int64_t,uint64_t> chist = us.getHistogramAsMap(); chist[-1] = 1; ::libmaus::huffman::HuffmanTreeNode::shared_ptr_type htree = ::libmaus::huffman::HuffmanBase::createTree(chist); ::libmaus::util::TempFileNameGenerator tmpgen(fn+"_tmp",3); ::libmaus::util::FileTempFileContainer tmpcnt(tmpgen); ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman IEWGH(htree.get(),tmpcnt); IEWGH.putSymbol(us[us.size()-1]); for ( uint64_t i = 0; i < SA.size(); ++i ) IEWGH.putSymbol(SA[i]); IEWGH.createFinalStream(fn+".hwt"); // load huffman shaped wavelet tree of bwt ::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IHWT (::libmaus::wavelet::ImpHuffmanWaveletTree::load(fn+".hwt")); // check rank counts for ( ::std::map<int64_t,uint64_t>::const_iterator ita = chist.begin(); ita != chist.end(); ++ita ) assert ( IHWT->rank(ita->first,SA.size()) == ita->second ); /* cumulative symbol freqs, shifted by 1 to accomodate for terminator -1 */ int64_t const maxsym = chist.rbegin()->first; int64_t const shiftedmaxsym = maxsym+1; ::libmaus::autoarray::AutoArray<uint64_t> D(shiftedmaxsym+1); for ( ::std::map<int64_t,uint64_t>::const_iterator ita = chist.begin(); ita != chist.end(); ++ita ) D [ ita->first + 1 ] = ita->second; D.prefixSums(); // terminator has rank 0 and is at position us.size() uint64_t rank = 0; int64_t pos = us.size(); // decode text backward from bwt while ( --pos >= 0 ) { std::pair< int64_t,uint64_t> const is = IHWT->inverseSelect(rank); rank = D[is.first+1] + is.second; assert ( is.first == us[pos] ); } // remove huffman shaped wavelet tree remove ((fn+".hwt").c_str()); }
void testUtf8ToImpHuffmanWaveletTree(std::string const & fn) { { ::libmaus::wavelet::Utf8ToImpHuffmanWaveletTree::constructWaveletTree<true>(fn,fn+".hwt"); // load huffman shaped wavelet tree of bwt ::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IHWT (::libmaus::wavelet::ImpHuffmanWaveletTree::load(fn+".hwt")); ::libmaus::util::Utf8String::shared_ptr_type us = ::libmaus::util::Utf8String::constructRaw(fn); std::cerr << "checking length " << us->size() << std::endl; for ( uint64_t i = 0; i < us->size(); ++i ) assert ( (*us)[i] == (*IHWT)[i] ); } { ::libmaus::wavelet::Utf8ToImpCompactHuffmanWaveletTree::constructWaveletTree<true>(fn,fn+".hwt"); // load huffman shaped wavelet tree of bwt ::libmaus::wavelet::ImpCompactHuffmanWaveletTree::unique_ptr_type IHWT (::libmaus::wavelet::ImpCompactHuffmanWaveletTree::load(fn+".hwt")); ::libmaus::util::Utf8String::shared_ptr_type us = ::libmaus::util::Utf8String::constructRaw(fn); std::cerr << "checking length " << us->size() << "," << IHWT->size() << std::endl; for ( uint64_t i = 0; i < us->size(); ++i ) assert ( (*us)[i] == (*IHWT)[i] ); } }
void testHuffmanWavelet() { // std::string text = "Hello world."; std::string text = "fischers fritze fischt frische fische der biber schwimmt im fluss und bleibt immer treu"; #if 1 for ( uint64_t i = 0; i < 16; ++i ) text = text+text; #endif #if 1 text = text.substr(0,1572929); #endif std::cerr << "Checking text of size " << text.size() << std::endl; ::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end()); ::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3); uint64_t const numfrags = 128; #define PAR #if defined(PAR) ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, numfrags); #else ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen); #endif #if defined(PAR) && defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t f = 0; f < static_cast<int64_t>(numfrags); ++f ) { uint64_t const symsperfrag = (text.size() + numfrags-1)/numfrags; uint64_t const low = std::min(static_cast<uint64_t>(f*symsperfrag),static_cast<uint64_t>(text.size())); uint64_t const high = std::min(static_cast<uint64_t>(low+symsperfrag),static_cast<uint64_t>(text.size())); // std::cerr << "f=" << f << " low=" << low << " high=" << high << std::endl; for ( uint64_t i = low; i < high; ++i ) #if defined(PAR) exgen[f].putSymbol(text[i]); #else exgen.putSymbol(text[i]); #endif } exgen.createFinalStream("hufwuf"); std::ifstream istr("hufwuf",std::ios::binary); ::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr); ::libmaus::autoarray::AutoArray<int64_t> symar = sroot->symbolArray(); ::libmaus::huffman::EncodeTable<1> E(IHWT.sroot.get()); E.print(); #if 0 for ( uint64_t i = 0; i < IHWT.size(); ++i ) std::cerr << static_cast<char>(IHWT[i]); std::cerr << std::endl; #endif std::map<int64_t, uint64_t> rmap; #if 0 for ( uint64_t i = 0; i < text.size(); ++i ) { std::cerr << static_cast<char>(IHWT[i]); } std::cerr << std::endl; #endif for ( uint64_t i = 0; i < IHWT.size(); ++i ) { #if 0 std::cerr << static_cast<char>(IHWT.inverseSelect(i).first) << "(" << IHWT.inverseSelect(i).second << ")" << "[" << IHWT.rank(text[i],i) << "]"; #endif /** * check symbol **/ if ( static_cast<int64_t>(IHWT[i]) != text[i] ) std::cerr << "Failure for i=" << i << " expected " << static_cast<int>(text[i]) << " got " << IHWT[i] << std::endl; assert ( static_cast<int64_t>(IHWT[i]) == text[i] ); /** * compare rank to rankm **/ for ( uint64_t j = 0; j < symar.size(); ++j ) { int64_t const sym = symar[j]; uint64_t const ra = i ? IHWT.rank(sym,i-1) : 0; uint64_t const rb = IHWT.rankm(sym,i); assert ( ra == rb ); } for ( uint64_t j = 0; j < symar.size(); ++j ) { int64_t const sym = symar[j]; assert ( IHWT.rankm(sym,i) == rmap[sym] ); } assert ( IHWT.inverseSelect(i).second == IHWT.rankm(text[i],i) ); assert ( IHWT.inverseSelect(i).second == rmap[text[i]] ); ++rmap [ IHWT[i] ]; // std::cerr << "i=" << i << " IHWT[i]=" << IHWT[i] << " r=" << r << " IHWT.rank()=" << IHWT.rank(IHWT[i],i) << std::endl; for ( uint64_t j = 0; j < symar.size(); ++j ) { int64_t const sym = symar[j]; assert ( IHWT.rank(sym,i) == rmap[sym] ); } assert ( IHWT.inverseSelect(i).first == text[i] ); // std::cerr << IHWT.inverseSelect(i).second << "\t" << rmap[text[i]] << std::endl; assert ( IHWT.inverseSelect(i).second + 1 == IHWT.rank(text[i],i) ); assert ( IHWT.inverseSelect(i).second + 1 == rmap[text[i]] ); // assert ( IHWT.select ( text[i], IHWT.rank(text[i],i)-1 ) == i ); } #if 0 std::cerr << std::endl; #endif }