void testHuffmanWaveletSer() { // std::string text = "Hello world."; std::string text = "fischers fritze fischt frische fische"; ::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end()); ::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3); // ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen); ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, 1); for ( uint64_t i = 0; i < text.size(); ++i ) exgen[0].putSymbol(text[i]); // exgen.putSymbol(text[i]); exgen.createFinalStream("hufwuf"); std::ifstream istr("hufwuf",std::ios::binary); ::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr); for ( uint64_t i = 0; i < IHWT.size(); ++i ) std::cerr << static_cast<char>(IHWT[i]); std::cerr << std::endl; for ( uint64_t i = 0; i < IHWT.size(); ++i ) { std::cerr << static_cast<char>(IHWT.inverseSelect(i).first) << "(" << IHWT.inverseSelect(i).second << ")" << "[" << IHWT.rank(text[i],i) << "]"; assert ( IHWT.inverseSelect(i).second + 1 == IHWT.rank(text[i],i) ); assert ( static_cast<int64_t>(IHWT[i]) == text[i] ); } std::cerr << std::endl; }
int main() { testCompactHuffmanPar(); return 0; #if 0 ::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IMP(new ::libmaus::wavelet::ImpHuffmanWaveletTree(std::cin)); ::libmaus::autoarray::AutoArray<uint32_t>::unique_ptr_type Z(new ::libmaus::autoarray::AutoArray<uint32_t>(64)); ::libmaus::lf::LFZeroImp L(IMP,Z,0); #endif #if 0 LFZeroTemplate ( wt_ptr_type & rW, z_array_ptr_type & rZ, uint64_t const rp0rank ) #endif // testImpExternalWaveletGenerator(); testHuffmanWavelet(); testHuffmanWaveletSer(); #if 0 srand(time(0)); uint64_t const b = 5; ::libmaus::util::TempFileNameGenerator tmpgen(std::string("tmp"),3); ::libmaus::wavelet::ExternalWaveletGenerator ex(b,tmpgen); std::vector < uint64_t > V; for ( uint64_t i = 0; i < 381842; ++i ) { uint64_t const v = rand() % (1ull<<b); // uint64_t const v = i % (1ull<<b); ex.putSymbol(v); V.push_back(v); } std::string const outfilename = "ex"; uint64_t const n = ex.createFinalStream(outfilename); ::std::ifstream istr(outfilename.c_str(), std::ios::binary); ::libmaus::wavelet::WaveletTree < ::libmaus::rank::ERank222B, uint64_t > WT(istr); std::cerr << "Checking..."; for ( uint64_t i = 0; i < n; ++i ) assert ( WT[i] == V[i] ); std::cerr << "done." << std::endl; if ( n < 256 ) { for ( uint64_t i = 0; i < n; ++i ) std::cerr << WT[i] << ";"; std::cerr << std::endl; } #endif }
void testImpExternalWaveletGenerator() { ::libmaus::util::TempFileNameGenerator tmpgen("tmpdir",1); uint64_t const b = 3; ::libmaus::wavelet::ImpExternalWaveletGenerator IEWG(b,tmpgen); #if 0 IEWG.putSymbol(0); IEWG.putSymbol(1); IEWG.putSymbol(2); IEWG.putSymbol(3); IEWG.putSymbol(4); IEWG.putSymbol(5); IEWG.putSymbol(6); IEWG.putSymbol(7); #endif std::vector < uint64_t > V; for ( uint64_t i = 0; i < 381842*41; ++i ) { uint64_t const v = rand() % (1ull<<b); // uint64_t const v = i % (1ull<<b); IEWG.putSymbol(v); V.push_back(v); } std::ostringstream ostr; IEWG.createFinalStream(ostr); std::istringstream istr(ostr.str()); ::libmaus::wavelet::ImpWaveletTree IWT(istr); std::cerr << "Testing..."; std::vector<uint64_t> R(1ull << b,0); for ( uint64_t i = 0; i < IWT.size(); ++i ) { std::pair<uint64_t,uint64_t> IS = IWT.inverseSelect(i); assert ( IS.first == V[i] ); assert ( IS.second == R[V[i]] ); uint64_t const s = IWT.select(V[i],R[V[i]]); // std::cerr << "expect " << i << " got " << s << std::endl; assert ( s == i ); R [ V[i] ] ++; assert ( IWT.rank(V[i],i) == R[V[i]] ); assert ( IWT[i] == V[i] ); } std::cerr << "done." << std::endl; #if 0 for ( uint64_t i = 0; i < IWT.n; ++i ) { std::cerr << "IWT[" << i << "]=" << IWT[i] << std::endl; std::cerr << "rank(" << IWT[i] << "," << i << ")" << "=" << IWT.rank(IWT[i],i) << std::endl; } #endif }
void testUtf8Bwt(std::string const & fn) { ::libmaus::util::Utf8String us(fn); typedef ::libmaus::util::Utf8String::saidx_t saidx_t; ::libmaus::autoarray::AutoArray<saidx_t,::libmaus::autoarray::alloc_type_c> SA = us.computeSuffixArray32(); // produce bwt for ( uint64_t i = 0; i < SA.size(); ++i ) if ( SA[i] ) SA[i] = us[SA[i]-1]; else SA[i] = -1; // produce huffman shaped wavelet tree of bwt ::std::map<int64_t,uint64_t> chist = us.getHistogramAsMap(); chist[-1] = 1; ::libmaus::huffman::HuffmanTreeNode::shared_ptr_type htree = ::libmaus::huffman::HuffmanBase::createTree(chist); ::libmaus::util::TempFileNameGenerator tmpgen(fn+"_tmp",3); ::libmaus::util::FileTempFileContainer tmpcnt(tmpgen); ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman IEWGH(htree.get(),tmpcnt); IEWGH.putSymbol(us[us.size()-1]); for ( uint64_t i = 0; i < SA.size(); ++i ) IEWGH.putSymbol(SA[i]); IEWGH.createFinalStream(fn+".hwt"); // load huffman shaped wavelet tree of bwt ::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IHWT (::libmaus::wavelet::ImpHuffmanWaveletTree::load(fn+".hwt")); // check rank counts for ( ::std::map<int64_t,uint64_t>::const_iterator ita = chist.begin(); ita != chist.end(); ++ita ) assert ( IHWT->rank(ita->first,SA.size()) == ita->second ); /* cumulative symbol freqs, shifted by 1 to accomodate for terminator -1 */ int64_t const maxsym = chist.rbegin()->first; int64_t const shiftedmaxsym = maxsym+1; ::libmaus::autoarray::AutoArray<uint64_t> D(shiftedmaxsym+1); for ( ::std::map<int64_t,uint64_t>::const_iterator ita = chist.begin(); ita != chist.end(); ++ita ) D [ ita->first + 1 ] = ita->second; D.prefixSums(); // terminator has rank 0 and is at position us.size() uint64_t rank = 0; int64_t pos = us.size(); // decode text backward from bwt while ( --pos >= 0 ) { std::pair< int64_t,uint64_t> const is = IHWT->inverseSelect(rank); rank = D[is.first+1] + is.second; assert ( is.first == us[pos] ); } // remove huffman shaped wavelet tree remove 
((fn+".hwt").c_str()); }
void testsparsegammamultifilesetmergedense() { libmaus::util::TempFileNameGenerator tmpgen("tmp",3); libmaus::gamma::SparseGammaGapMultiFileLevelSet SGGF(tmpgen,4); std::map<uint64_t,uint64_t> refM; for ( uint64_t i = 0; i < 25; ++i ) { std::string const fn = tmpgen.getFileName(); std::string const indexfn = fn+".idx"; libmaus::aio::CheckedOutputStream COS(fn); libmaus::aio::CheckedInputOutputStream indexCIOS(indexfn); libmaus::gamma::SparseGammaGapBlockEncoder SGE(COS,indexCIOS); remove(indexfn.c_str()); SGE.encode(2*i,i+1); refM[2*i] += (i+1); SGE.encode(2*i+2,i+1); refM[2*i+2] += (i+1); SGE.encode(2*i+4,i+1); refM[2*i+4] += (i+1); SGE.term(); SGGF.addFile(fn); } uint64_t const maxval = refM.size() ? (refM.rbegin())->first : 0; std::string const ffn = tmpgen.getFileName(); std::vector<std::string> const fno = SGGF.mergeToDense(ffn,maxval+1); // libmaus::aio::CheckedInputStream CIS(ffn); libmaus::gamma::GammaGapDecoder SGGD(fno); for ( uint64_t i = 0; i < maxval+1; ++i ) { uint64_t dv = SGGD.decode(); std::cerr << dv; if ( refM.find(i) != refM.end() ) { std::cerr << "(" << refM.find(i)->second << ")"; assert ( refM.find(i)->second == dv ); } else { std::cerr << "(0)"; assert ( dv == 0 ); } std::cerr << ";"; } std::cerr << std::endl; for ( uint64_t i = 0; i < fno.size(); ++i ) { // std::cerr << fno[i] << std::endl; remove(fno[i].c_str()); } }
void testsparsegammamerge() { libmaus::util::TempFileNameGenerator tmpgen("tmp",3); libmaus::gamma::SparseGammaGapFileSet SGGF(tmpgen); std::map<uint64_t,uint64_t> refM; for ( uint64_t i = 0; i < 25; ++i ) { std::string const fn = tmpgen.getFileName(); libmaus::aio::CheckedOutputStream COS(fn); libmaus::gamma::SparseGammaGapEncoder SGE(COS); SGE.encode(2*i,i+1); refM[2*i] += (i+1); SGE.encode(2*i+2,i+1); refM[2*i+2] += (i+1); SGE.encode(2*i+4,i+1); refM[2*i+4] += (i+1); SGE.term(); SGGF.addFile(fn); } std::string const ffn = tmpgen.getFileName(); SGGF.merge(ffn); libmaus::aio::CheckedInputStream CIS(ffn); libmaus::gamma::SparseGammaGapDecoder SGGD(CIS); for ( uint64_t i = 0; i < 60; ++i ) { uint64_t dv = SGGD.decode(); std::cerr << dv; if ( refM.find(i) != refM.end() ) { std::cerr << "(" << refM.find(i)->second << ")"; assert ( refM.find(i)->second == dv ); } else { std::cerr << "(0)"; assert ( dv == 0 ); } std::cerr << ";"; } std::cerr << std::endl; remove(ffn.c_str()); }
void testHuffmanWavelet() { // std::string text = "Hello world."; std::string text = "fischers fritze fischt frische fische der biber schwimmt im fluss und bleibt immer treu"; #if 1 for ( uint64_t i = 0; i < 16; ++i ) text = text+text; #endif #if 1 text = text.substr(0,1572929); #endif std::cerr << "Checking text of size " << text.size() << std::endl; ::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end()); ::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3); uint64_t const numfrags = 128; #define PAR #if defined(PAR) ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, numfrags); #else ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen); #endif #if defined(PAR) && defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t f = 0; f < static_cast<int64_t>(numfrags); ++f ) { uint64_t const symsperfrag = (text.size() + numfrags-1)/numfrags; uint64_t const low = std::min(static_cast<uint64_t>(f*symsperfrag),static_cast<uint64_t>(text.size())); uint64_t const high = std::min(static_cast<uint64_t>(low+symsperfrag),static_cast<uint64_t>(text.size())); // std::cerr << "f=" << f << " low=" << low << " high=" << high << std::endl; for ( uint64_t i = low; i < high; ++i ) #if defined(PAR) exgen[f].putSymbol(text[i]); #else exgen.putSymbol(text[i]); #endif } exgen.createFinalStream("hufwuf"); std::ifstream istr("hufwuf",std::ios::binary); ::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr); ::libmaus::autoarray::AutoArray<int64_t> symar = sroot->symbolArray(); ::libmaus::huffman::EncodeTable<1> E(IHWT.sroot.get()); E.print(); #if 0 for ( uint64_t i = 0; i < IHWT.size(); ++i ) std::cerr << static_cast<char>(IHWT[i]); std::cerr << std::endl; #endif std::map<int64_t, uint64_t> rmap; #if 0 for ( uint64_t i = 0; i < text.size(); ++i ) { std::cerr << static_cast<char>(IHWT[i]); } std::cerr << std::endl; #endif for 
( uint64_t i = 0; i < IHWT.size(); ++i ) { #if 0 std::cerr << static_cast<char>(IHWT.inverseSelect(i).first) << "(" << IHWT.inverseSelect(i).second << ")" << "[" << IHWT.rank(text[i],i) << "]"; #endif /** * check symbol **/ if ( static_cast<int64_t>(IHWT[i]) != text[i] ) std::cerr << "Failure for i=" << i << " expected " << static_cast<int>(text[i]) << " got " << IHWT[i] << std::endl; assert ( static_cast<int64_t>(IHWT[i]) == text[i] ); /** * compare rank to rankm **/ for ( uint64_t j = 0; j < symar.size(); ++j ) { int64_t const sym = symar[j]; uint64_t const ra = i ? IHWT.rank(sym,i-1) : 0; uint64_t const rb = IHWT.rankm(sym,i); assert ( ra == rb ); } for ( uint64_t j = 0; j < symar.size(); ++j ) { int64_t const sym = symar[j]; assert ( IHWT.rankm(sym,i) == rmap[sym] ); } assert ( IHWT.inverseSelect(i).second == IHWT.rankm(text[i],i) ); assert ( IHWT.inverseSelect(i).second == rmap[text[i]] ); ++rmap [ IHWT[i] ]; // std::cerr << "i=" << i << " IHWT[i]=" << IHWT[i] << " r=" << r << " IHWT.rank()=" << IHWT.rank(IHWT[i],i) << std::endl; for ( uint64_t j = 0; j < symar.size(); ++j ) { int64_t const sym = symar[j]; assert ( IHWT.rank(sym,i) == rmap[sym] ); } assert ( IHWT.inverseSelect(i).first == text[i] ); // std::cerr << IHWT.inverseSelect(i).second << "\t" << rmap[text[i]] << std::endl; assert ( IHWT.inverseSelect(i).second + 1 == IHWT.rank(text[i],i) ); assert ( IHWT.inverseSelect(i).second + 1 == rmap[text[i]] ); // assert ( IHWT.select ( text[i], IHWT.rank(text[i],i)-1 ) == i ); } #if 0 std::cerr << std::endl; #endif }
void selfie(libmaus2::util::ArgParser const & arg, std::string const & fn) { std::string const compactfn = fn + ".compact"; std::string const compactmetafn = compactfn + ".meta"; if ( ! libmaus2::util::GetFileSize::fileExists(compactfn) || libmaus2::util::GetFileSize::isOlder(compactfn,fn) ) { libmaus2::fastx::FastAToCompact4BigBandBiDir::fastaToCompact4BigBandBiDir( std::vector<std::string>(1,fn), &(std::cerr), false /* single strand */, compactfn ); } uint64_t const numthreads = arg.uniqueArgPresent("t") ? arg.getUnsignedNumericArg<uint64_t>("t") : libmaus2::suffixsort::bwtb3m::BwtMergeSortOptions::getDefaultNumThreads(); std::string const bwtfn = fn + ".bwt"; std::string const bwtmetafn = bwtfn + ".meta"; libmaus2::suffixsort::bwtb3m::BwtMergeSortResult res; if ( ! libmaus2::util::GetFileSize::fileExists(bwtmetafn) || libmaus2::util::GetFileSize::isOlder(bwtmetafn,compactfn) ) { libmaus2::suffixsort::bwtb3m::BwtMergeSortOptions options( compactfn, 16*1024ull*1024ull*1024ull, // mem // libmaus2::suffixsort::bwtb3m::BwtMergeSortOptions::getDefaultMem(), numthreads, "compactstream", false /* bwtonly */, std::string("mem:tmp_"), std::string(), // sparse bwtfn, 16 /* isa */, 16 /* sa */ ); res = libmaus2::suffixsort::bwtb3m::BwtMergeSort::computeBwt(options,&std::cerr); res.serialise(bwtmetafn); } else { res.deserialise(bwtmetafn); } //libmaus2::fastx::FastAIndex::unique_ptr_type PFAI(libmaus2::fastx::FastAIndex::load(fn+".fai")); libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::unique_ptr_type Pmeta(libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::load(compactmetafn)); libmaus2::rank::DNARank::unique_ptr_type Prank(res.loadDNARank(numthreads)); libmaus2::suffixsort::bwtb3m::BwtMergeSortResult::BareSimpleSampledSuffixArray BSSSA(res.loadBareSimpleSuffixArray()); uint64_t const n = Prank->size(); libmaus2::autoarray::AutoArray<char> A(n,false); libmaus2::bitio::CompactDecoderWrapper CDW(compactfn); CDW.read(A.begin(),n); assert ( CDW.gcount() == static_cast<int64_t>(n) ); 
uint64_t const minfreq = 2; uint64_t const minlen = 20; uint64_t const limit = 32; uint64_t const minsplitlength = 28; uint64_t const minsplitsize = 10; uint64_t const maxxdist = 1000; uint64_t const activemax = 1; uint64_t const fracmul = 95; uint64_t const fracdiv = 100; bool const selfcheck = true; uint64_t const chainminscore = arg.uniqueArgPresent("chainminscore") ? arg.getUnsignedNumericArg<uint64_t>("chainminscore") : 20; uint64_t const maxocc = 500; uint64_t const minprintlength = 1024; uint64_t const algndommul = 95; uint64_t const algndomdiv = 100; uint64_t const chaindommul = 95; uint64_t const chaindomdiv = 100; double const maxerr = arg.uniqueArgPresent("maxerr") ? arg.getParsedArg<double>("maxerr") : std::numeric_limits<double>::max(); uint64_t const cachek = arg.uniqueArgPresent("K") ? arg.getUnsignedNumericArg<uint64_t>("K") : 12; uint64_t const maxpacksize = arg.uniqueArgPresent("P") ? arg.getUnsignedNumericArg<uint64_t>("P") : 128ull*1024ull*1024ull; std::cerr << "[V] generating " << cachek << "-mer cache..."; libmaus2::rank::DNARankKmerCache::unique_ptr_type Pcache(new libmaus2::rank::DNARankKmerCache(*Prank,cachek,numthreads)); std::cerr << "done." 
<< std::endl; std::string const deftmp = libmaus2::util::ArgInfo::getDefaultTmpFileName(arg.progname); libmaus2::util::TempFileNameGenerator tmpgen(deftmp,3); std::string const sorttmp = tmpgen.getFileName(); libmaus2::util::TempFileRemovalContainer::addTempFile(sorttmp); libmaus2::sorting::SortingBufferedOutputFile<CoordinatePair> CPS(sorttmp); libmaus2::parallel::PosixSpinLock CPSlock; uint64_t acc_s = 0; for ( uint64_t zz = 0; zz < Pmeta->S.size(); ) { uint64_t zze = zz; uint64_t pack_s = Pmeta->S[zze++].l; while ( zze < Pmeta->S.size() && pack_s + Pmeta->S[zze].l <= maxpacksize ) pack_s += Pmeta->S[zze++].l; // std::cerr << "[V] " << zz << "-" << zze << " pack_s=" << pack_s << std::endl; zz = zze; uint64_t const low = acc_s; uint64_t const high = acc_s + pack_s; std::cerr << "[V] low=" << low << " high=" << high << " acc_s=" << acc_s << " pack_s=" << pack_s << std::endl; std::string const activefn = libmaus2::rank::DNARankSMEMComputation::activeParallel(tmpgen,*Pcache,A.begin(),low,high,minfreq,minlen,numthreads,maxxdist + 2*(minlen-1)); libmaus2::gamma::GammaIntervalDecoder::unique_ptr_type Pdec(new libmaus2::gamma::GammaIntervalDecoder(std::vector<std::string>(1,activefn),0/*offset */,1 /* numthreads */)); std::string const sortinfn = tmpgen.getFileName(true); libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::unique_ptr_type sptr( new libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>(sortinfn) ); { std::pair<uint64_t,uint64_t> P; while ( Pdec->getNext(P) ) { sptr->put( GammaInterval(P.first,P.second) ); } } libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::merger_ptr_type Pmerger( sptr->getMerger() ); struct LockedGet { libmaus2::parallel::PosixSpinLock lock; // libmaus2::gamma::GammaIntervalDecoder& dec; libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::merger_ptr_type & Pmerger; 
LockedGet(libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::merger_ptr_type & rPmerger) : Pmerger(rPmerger) { } bool getNext(std::pair<uint64_t,uint64_t> & P) { bool ok = false; { libmaus2::parallel::ScopePosixSpinLock slock(lock); GammaInterval Q; ok = Pmerger->getNext(Q); if ( ok ) { P.first = Q.first; P.second = Q.second; } } return ok; } }; libmaus2::autoarray::AutoArray < std::pair<uint64_t,uint64_t> > VP(numthreads); LockedGet LG(Pmerger); libmaus2::fastx::CoordinateCacheBiDir cocache(*Prank,*Pmeta,16 /* blockshfit */); typedef libmaus2::suffixsort::bwtb3m::BwtMergeSortResult::BareSimpleSampledSuffixArray sa_type; typedef libmaus2::lcs::SMEMProcessor<sa_type> smem_proc_type; libmaus2::autoarray::AutoArray < smem_proc_type::unique_ptr_type > Aproc(numthreads); for ( uint64_t i = 0; i < numthreads; ++i ) { smem_proc_type::unique_ptr_type proc(new smem_proc_type( *Pmeta,cocache,*Prank,BSSSA,A.begin(),maxxdist,activemax,fracmul,fracdiv,selfcheck,chainminscore,maxocc,algndommul,algndomdiv,chaindommul,chaindomdiv, libmaus2::lcs::NNP::getDefaultMaxWindowError(),libmaus2::lcs::NNP::getDefaultMaxBack(),false /* domsameref */ ) ); Aproc[i] = UNIQUE_PTR_MOVE(proc); } stateVec.resize(numthreads); for ( uint64_t i = 0; i < numthreads; ++i ) setState(i,"idle"); #if defined(_OPENMP) #pragma omp parallel num_threads(numthreads) #endif { uint64_t const tid = #if defined(_OPENMP) omp_get_thread_num() #else 0 #endif ; std::pair<uint64_t,uint64_t> & P = VP[tid]; smem_proc_type & proc = *(Aproc[tid]); struct SelfieVerbosity : public smem_proc_type::Verbosity { uint64_t tid; std::string prefix; SelfieVerbosity(uint64_t const rtid, std::string const & rprefix) : tid(rtid), prefix(rprefix) { } void operator()(libmaus2::rank::DNARankMEM const & smem, uint64_t const z) const { std::ostringstream ostr; ostr << prefix << "\t" << z << "\t" << smem; setState(tid,ostr.str()); printState(); } }; while ( LG.getNext(P) ) { uint64_t const smemleft = 
std::max(static_cast<int64_t>(0),static_cast<int64_t>(P.first)-static_cast<int64_t>(minlen-1)); uint64_t const smemright = std::min(P.second+minlen,n); std::ostringstream msgstr; msgstr << "[" << smemleft << "," << smemright << ")"; setState(tid,msgstr.str()); printState(); libmaus2::rank::DNARankSMEMComputation::SMEMEnumerator<char const *> senum( *Prank,A.begin(), smemleft, smemright, minfreq, minlen, limit, minsplitlength, minsplitsize); SelfieVerbosity SV(tid,msgstr.str()); proc.process(senum,A.begin(),n,minprintlength,maxerr,SV); // proc.printAlignments(minprintlength); std::pair<libmaus2::lcs::ChainAlignment const *, libmaus2::lcs::ChainAlignment const *> const AP = proc.getAlignments(); for ( libmaus2::lcs::ChainAlignment const * it = AP.first; it != AP.second; ++it ) { libmaus2::lcs::ChainAlignment const & CA = *it; libmaus2::lcs::NNPAlignResult const & res = CA.res; std::vector<libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::Coordinates> const VA = Pmeta->mapCoordinatePairToList(res.abpos,res.aepos); std::vector<libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::Coordinates> const VB = Pmeta->mapCoordinatePairToList(res.bbpos,res.bepos); if ( VA.size() == 1 && VB.size() == 1 ) { CoordinatePair CP(VA[0],VB[0],res); libmaus2::parallel::ScopePosixSpinLock slock(CPSlock); CPS.put(CP); } } setState(tid,"idle"); printState(); #if 0 std::cerr << "P=[" << P.first << "," << P.second << ")" << std::endl; { std::vector<libmaus2::rank::DNARankMEM> SMEM; libmaus2::rank::DNARankSMEMComputation::smemLimitedParallel( *Prank, *Pcache, A.begin(), P.first, P.second, n, minfreq, minlen, limit, SMEM, 1 /* threads */); std::cerr << "[V] number of SMEMs is " << SMEM.size() << std::endl; // deallocate k-mer cache // Pcache.reset(); std::vector<libmaus2::rank::DNARankMEM> SMEMsplit; libmaus2::rank::DNARankSMEMComputation::smemLimitedParallelSplit(*Prank,A.begin(),P.first,P.second,minlen,limit,minsplitlength,minsplitsize,SMEM,SMEMsplit,1 /* threads */); std::cerr << "[V] number of split 
SMEMs is " << SMEMsplit.size() << std::endl; // insert split SMEMs into regular SMEMs std::copy(SMEMsplit.begin(),SMEMsplit.end(),std::back_insert_iterator< std::vector<libmaus2::rank::DNARankMEM> >(SMEM)); //libmaus2::sorting::InPlaceParallelSort::inplacesort2(SMEM.begin(),SMEM.end(),numthreads,libmaus2::rank::DNARankMEMPosComparator()); std::sort(SMEM.begin(),SMEM.end(),libmaus2::rank::DNARankMEMPosComparator()); SMEM.resize(std::unique(SMEM.begin(),SMEM.end())-SMEM.begin()); libmaus2::rank::DNARankSMEMComputation::SMEMEnumerator<char const *> senum( *Prank,A.begin(), std::max(static_cast<int64_t>(0),static_cast<int64_t>(P.first)-static_cast<int64_t>(minlen-1)), std::min(P.second+minlen,n), minfreq, minlen, limit, minsplitlength, minsplitsize); libmaus2::rank::DNARankMEM smem; uint64_t c = 0; while ( senum.getNext(smem) ) { // std::cerr << "ccc=" << smem << std::endl; if ( c >= SMEM.size() || smem != SMEM[c] ) { std::cerr << "mismatch " << c << " " << smem; if ( c < SMEM.size() ) std::cerr << " != " << SMEM[c]; else std::cerr << " ???"; std::cerr << std::endl; } else { std::cerr << "match " << c << " " << smem << " " << SMEM[c] << std::endl; } ++c; } std::cerr << "c=" << c << " V=" << SMEM.size() << std::endl; } #endif } } acc_s += pack_s; } libmaus2::sorting::SortingBufferedOutputFile<CoordinatePair>::merger_ptr_type Pmerger(CPS.getMerger()); CoordinatePair CP; while ( Pmerger->getNext(CP) ) { std::ostringstream ostr; CP.A.print(ostr); ostr << " "; CP.B.print(ostr); ostr << " "; ostr << CP.res.getErrorRate(); std::cout << ostr.str() << std::endl; } }
/**
 * Construct an object of this_type from a set of run length encoded files.
 *
 * @param filenames names of the files handed to the RLDecoder
 * @param maxval    maximum symbol value; the object is constructed with
 *                  ::libmaus2::math::numbits(maxval) as its bit width argument
 * @param tmpprefix prefix used by the temporary file name generator
 * @return pointer to the newly allocated object
 **/
static unique_ptr_type constructFromRL(std::vector<std::string> const & filenames, uint64_t const maxval, std::string const & tmpprefix )
{
    ::libmaus2::util::TempFileNameGenerator tmpgen(tmpprefix,2);
    ::libmaus2::huffman::RLDecoder decoder(filenames);

    uint64_t const bits = ::libmaus2::math::numbits(maxval);

    return unique_ptr_type(
        new this_type(decoder,bits,tmpgen)
    );
}
void testCompactHuffmanPar() { std::vector<uint8_t> A; std::map<int64_t,uint64_t> F; // uint64_t const n = 1024*1024; uint64_t const n = 64*1024*1024; for ( uint64_t i = 0; i < n; ++i ) { A.push_back(libmaus::random::Random::rand8() & 0xFF); F[A.back()]++; } libmaus::huffman::HuffmanTree H(F.begin(),F.size(),false,true); std::cerr << H; libmaus::util::MemTempFileContainer MTFC; // libmaus::wavelet::ImpExternalWaveletGeneratorCompactHuffman IEWGHN(H,MTFC); libmaus::util::TempFileNameGenerator tmpgen("tmpdir",2); #if defined(_OPENMP) uint64_t const numthreads = omp_get_max_threads(); #else uint64_t const numthreads = 1; #endif libmaus::wavelet::ImpExternalWaveletGeneratorCompactHuffmanParallel IEWGHN(H,tmpgen,numthreads); // std::cerr << "left construction." << std::endl; #if 0 uint64_t A[] = { 0,0,0,0,0,0,3,3,1,3,2,1,2,1,1,2,1,1,1,2,1 }; uint64_t const n = sizeof(A)/sizeof(A[0]); // uint64_t A[] = { 0,0,0,0,0,0 }; #endif #if 0 uint64_t const perthread = (n + numthreads-1)/numthreads; #endif #if defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t i = 0; i < static_cast<int64_t>(n); ++i ) { #if defined(_OPENMP) uint64_t const tid = omp_get_thread_num(); #else uint64_t const tid = 0; #endif IEWGHN[tid].putSymbol(A[i]); } std::string tmpfilename = "tmp.hwt"; // std::ostringstream ostr; libmaus::aio::CheckedOutputStream COS(tmpfilename); IEWGHN.createFinalStream(COS); COS.close(); // std::istringstream istr(ostr.str()); // libmaus::wavelet::ImpCompactHuffmanWaveletTree IHWTN(tmpfilename); libmaus::wavelet::ImpCompactHuffmanWaveletTree::unique_ptr_type pIHWTN(libmaus::wavelet::ImpCompactHuffmanWaveletTree::load(tmpfilename)); libmaus::wavelet::ImpCompactHuffmanWaveletTree const & IHWTN = *pIHWTN; // libmaus::wavelet::ImpCompactHuffmanWaveletTree IHWTN(istr); remove(tmpfilename.c_str()); // std::cerr << IHWTN.size() << std::endl; assert ( IHWTN.size() == n ); std::map<uint64_t,uint64_t> R; for ( uint64_t i = 0; i < IHWTN.size(); ++i ) { if ( i % (32*1024) == 0 ) 
std::cerr << static_cast<double>(i) / n << std::endl; // std::cerr << IHWTN[i] << ";"; assert ( IHWTN[i] == A[i] ); // std::cerr << "[" << i << "," << IHWTN.select(A[i],R[A[i]]) << "]" << ";"; assert ( i == IHWTN.select(A[i],R[A[i]]) ); assert ( IHWTN.rankm(A[i],i) == R[A[i]] ); R[A[i]]++; assert ( IHWTN.rank (A[i],i) == R[A[i]] ); } std::cerr << std::endl; if ( n <= 128 ) { #if defined(_OPENMP) #pragma omp parallel for #endif for ( uint64_t i = 0; i <= IHWTN.size(); ++i ) for ( uint64_t j = i; j <= IHWTN.size(); ++j ) { assert ( IHWTN.enumerateSymbolsInRange(i,j) == IHWTN.enumerateSymbolsInRangeSlow(i,j) ); } } // ImpExternalWaveletGeneratorCompactHuffman(libmaus::huffman::HuffmanTree const & rH, ::libmaus::util::TempFileContainer & rtmpcnt) }