Пример #1
0
/**
 * Round trip test for the serialised Huffman shaped wavelet tree on a short text.
 *
 * A Huffman tree is built for the text, all symbols are fed through
 * fragment 0 of a parallel external generator set up with a single
 * fragment, the result is written to the file "hufwuf" and loaded again
 * as ImpHuffmanWaveletTree. The decoded text and, for each position,
 * the results of inverseSelect and rank are printed on std::cerr and
 * checked against the input text via assert.
 *
 * The file "hufwuf" is removed again before the function returns.
 **/
void testHuffmanWaveletSer()
{
	// std::string text = "Hello world.";
	std::string text = "fischers fritze fischt frische fische";
	std::string const hwtfn = "hufwuf";

	::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end());

	::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3);
	// ::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen);
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, 1);
	for ( uint64_t i = 0; i < text.size(); ++i )
		exgen[0].putSymbol(text[i]);
		// exgen.putSymbol(text[i]);
	exgen.createFinalStream(hwtfn);

	// stream and tree live in their own scope so that both are gone
	// before the file is removed below
	{
		std::ifstream istr(hwtfn.c_str(),std::ios::binary);
		::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr);

		// print the decoded text
		for ( uint64_t i = 0; i < IHWT.size(); ++i )
			std::cerr << static_cast<char>(IHWT[i]);
		std::cerr << std::endl;

		// print symbol(inverseSelect rank)[rank] for each position and check it
		for ( uint64_t i = 0; i < IHWT.size(); ++i )
		{
			std::pair<int64_t,uint64_t> const is = IHWT.inverseSelect(i);
			uint64_t const r = IHWT.rank(text[i],i);

			std::cerr
				<< static_cast<char>(is.first)
				<< "("
				<< is.second
				<< ")"
				<< "["
				<< r
				<< "]";

			// the test expects rank to count position i itself,
			// while the second component of inverseSelect does not
			assert ( is.second + 1 == r );
			assert ( static_cast<int64_t>(IHWT[i]) == text[i] );
		}
		std::cerr << std::endl;
	}

	// do not leave the serialised tree behind
	remove(hwtfn.c_str());
}
Пример #2
0
/**
 * Test driver: runs testCompactHuffmanPar() and returns 0.
 *
 * All other tests referenced in this function are compiled out via
 * "#if 0" instead of being hidden behind an early return, so it is
 * visible at a glance which tests are executed.
 **/
int main()
{
	testCompactHuffmanPar();

	// Disabled: Huffman shaped wavelet tree tests.
	#if 0
	// testImpExternalWaveletGenerator();
	testHuffmanWavelet();
	testHuffmanWaveletSer();
	#endif

	// Disabled: construction of an LFZeroImp object from a wavelet tree
	// read from standard input. For reference, the LFZeroTemplate
	// constructor signature noted here is
	//   LFZeroTemplate(wt_ptr_type & rW, z_array_ptr_type & rZ, uint64_t const rp0rank)
	#if 0
	::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IMP(new ::libmaus::wavelet::ImpHuffmanWaveletTree(std::cin));
	::libmaus::autoarray::AutoArray<uint32_t>::unique_ptr_type Z(new ::libmaus::autoarray::AutoArray<uint32_t>(64));
	::libmaus::lf::LFZeroImp L(IMP,Z,0);
	#endif

	// Disabled: random test for ExternalWaveletGenerator and WaveletTree.
	#if 0
	srand(time(0));

	uint64_t const b = 5;
	::libmaus::util::TempFileNameGenerator tmpgen(std::string("tmp"),3);
	::libmaus::wavelet::ExternalWaveletGenerator ex(b,tmpgen);

	std::vector < uint64_t > V;
	for ( uint64_t i = 0; i < 381842; ++i )
	{
		uint64_t const v = rand() % (1ull<<b);
		// uint64_t const v = i % (1ull<<b);
		ex.putSymbol(v);
		V.push_back(v);
	}

	std::string const outfilename = "ex";
	uint64_t const n = ex.createFinalStream(outfilename);

	::std::ifstream istr(outfilename.c_str(), std::ios::binary);
	::libmaus::wavelet::WaveletTree < ::libmaus::rank::ERank222B, uint64_t > WT(istr);

	std::cerr << "Checking...";
	for ( uint64_t i = 0; i < n; ++i )
		assert ( WT[i] == V[i] );
	std::cerr << "done." << std::endl;

	if ( n < 256 )
	{
		for ( uint64_t i = 0; i < n; ++i )
			std::cerr << WT[i] << ";";
		std::cerr << std::endl;
	}
	#endif

	return 0;
}
Пример #3
0
/**
 * Test ImpExternalWaveletGenerator and ImpWaveletTree with random symbols.
 *
 * 381842*41 symbols of b=3 bits each are drawn via rand(), passed to the
 * generator and remembered in a vector. The generator output is written
 * to a string stream, loaded as ImpWaveletTree and then access,
 * inverseSelect, select and rank are checked for each position against
 * the remembered sequence and running per symbol counts.
 **/
void testImpExternalWaveletGenerator()
{
	uint64_t const b = 3;
	uint64_t const alphabetsize = 1ull << b;
	uint64_t const numsyms = 381842*41;

	::libmaus::util::TempFileNameGenerator tmpgen("tmpdir",1);
	::libmaus::wavelet::ImpExternalWaveletGenerator IEWG(b,tmpgen);

	#if 0
	for ( uint64_t s = 0; s < 8; ++s )
		IEWG.putSymbol(s);
	#endif

	// reference copy of the symbol sequence
	std::vector < uint64_t > V;
	uint64_t generated = 0;
	while ( generated < numsyms )
	{
		uint64_t const v = rand() % alphabetsize;
		// uint64_t const v = generated % alphabetsize;
		IEWG.putSymbol(v);
		V.push_back(v);
		++generated;
	}

	// serialise to memory and load the tree from there
	std::ostringstream ostr;
	IEWG.createFinalStream(ostr);
	std::istringstream istr(ostr.str());

	::libmaus::wavelet::ImpWaveletTree IWT(istr);

	std::cerr << "Testing...";
	// seen[s] is the number of occurences of s at positions already handled
	std::vector<uint64_t> seen(alphabetsize,0);
	for ( uint64_t pos = 0; pos < IWT.size(); ++pos )
	{
		uint64_t const sym = V[pos];
		uint64_t & cnt = seen[sym];

		std::pair<uint64_t,uint64_t> IS = IWT.inverseSelect(pos);
		assert ( IS.first == sym );
		assert ( IS.second == cnt );

		uint64_t const s = IWT.select(sym,cnt);
		// std::cerr << "expect " << pos << " got " << s << std::endl;
		assert ( s == pos );

		// rank is expected to include position pos
		++cnt;
		assert ( IWT.rank(sym,pos) == cnt );
		assert ( IWT[pos] == sym );
	}
	std::cerr << "done." << std::endl;

	#if 0
	for ( uint64_t i = 0; i < IWT.n; ++i )
	{
		std::cerr << "IWT[" << i << "]=" << IWT[i] << std::endl;
		std::cerr << "rank(" << IWT[i] << "," << i << ")" << "=" << IWT.rank(IWT[i],i) << std::endl;
	}
	#endif
}
Пример #4
0
/**
 * Test BWT construction and backward decoding for the UTF-8 text in file fn.
 *
 * The suffix array of the text is computed and turned in place into the
 * BWT (with -1 as terminator symbol). A Huffman shaped wavelet tree of
 * the last text symbol followed by the BWT is written to fn + ".hwt",
 * loaded again and used to
 *  - check the total rank of each symbol against the text histogram and
 *  - decode the text back to front, comparing each symbol with the text.
 * The file fn + ".hwt" is removed at the end.
 *
 * @param fn name of the input text file
 **/
void testUtf8Bwt(std::string const & fn)
{
	typedef ::libmaus::util::Utf8String::saidx_t saidx_t;
	typedef ::std::map<int64_t,uint64_t> hist_type;
	typedef hist_type::const_iterator hist_iterator;

	std::string const hwtfn = fn+".hwt";

	::libmaus::util::Utf8String us(fn);
	::libmaus::autoarray::AutoArray<saidx_t,::libmaus::autoarray::alloc_type_c> SA = us.computeSuffixArray32();

	// overwrite each suffix array entry by the symbol preceding the suffix,
	// suffix 0 gets the terminator -1
	for ( uint64_t i = 0; i < SA.size(); ++i )
	{
		if ( SA[i] == 0 )
			SA[i] = -1;
		else
			SA[i] = us[SA[i]-1];
	}

	// symbol histogram including the terminator, used for the Huffman tree
	hist_type chist = us.getHistogramAsMap();
	chist[-1] = 1;
	::libmaus::huffman::HuffmanTreeNode::shared_ptr_type htree = ::libmaus::huffman::HuffmanBase::createTree(chist);

	::libmaus::util::TempFileNameGenerator tmpgen(fn+"_tmp",3);
	::libmaus::util::FileTempFileContainer tmpcnt(tmpgen);
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman IEWGH(htree.get(),tmpcnt);

	// last text symbol first, then the BWT computed above
	IEWGH.putSymbol(us[us.size()-1]);
	for ( uint64_t i = 0; i < SA.size(); ++i )
		IEWGH.putSymbol(SA[i]);
	IEWGH.createFinalStream(hwtfn);

	// read the tree back from disk
	::libmaus::wavelet::ImpHuffmanWaveletTree::unique_ptr_type IHWT
		(::libmaus::wavelet::ImpHuffmanWaveletTree::load(hwtfn));

	// rank over the whole sequence has to reproduce the histogram
	for ( hist_iterator hit = chist.begin(); hit != chist.end(); ++hit )
		assert ( IHWT->rank(hit->first,SA.size()) == hit->second );

	/* cumulative symbol freqs, shifted by 1 to accommodate the terminator -1 */
	int64_t const maxsym = chist.rbegin()->first;
	int64_t const shiftedmaxsym = maxsym+1;
	::libmaus::autoarray::AutoArray<uint64_t> D(shiftedmaxsym+1);
	for ( hist_iterator hit = chist.begin(); hit != chist.end(); ++hit )
		D [ hit->first + 1 ] = hit->second;
	D.prefixSums();

	// decode the text backward from the bwt, starting at rank 0
	// (the rank of the terminator)
	uint64_t rank = 0;
	for ( int64_t pos = static_cast<int64_t>(us.size())-1; pos >= 0; --pos )
	{
		std::pair< int64_t,uint64_t> const is = IHWT->inverseSelect(rank);
		rank = D[is.first+1] + is.second;
		assert ( is.first == us[pos] );
	}

	// remove huffman shaped wavelet tree
	remove (hwtfn.c_str());
}
Пример #5
0
/**
 * Test SparseGammaGapMultiFileLevelSet::mergeToDense.
 *
 * 25 sparse gamma gap files are written; file i stores the value i+1 for
 * the keys 2*i, 2*i+2 and 2*i+4. The expected per key sums are kept in a
 * map. All files are added to the level set, merged to a dense
 * representation and the merged values for the keys 0..maxkey are decoded,
 * printed on std::cerr as "value(expected);" and compared with the map
 * (keys absent from the map are expected to decode as 0). The files
 * returned by mergeToDense are removed at the end.
 **/
void testsparsegammamultifilesetmergedense()
{
	typedef std::map<uint64_t,uint64_t> ref_map_type;

	libmaus::util::TempFileNameGenerator tmpgen("tmp",3);
	libmaus::gamma::SparseGammaGapMultiFileLevelSet SGGF(tmpgen,4);
	ref_map_type refM;

	for ( uint64_t i = 0; i < 25;  ++i )
	{
		std::string const fn = tmpgen.getFileName();
		std::string const indexfn = fn+".idx";
		libmaus::aio::CheckedOutputStream COS(fn);
		libmaus::aio::CheckedInputOutputStream indexCIOS(indexfn);
		libmaus::gamma::SparseGammaGapBlockEncoder SGE(COS,indexCIOS);
		// the index file name is removed right away while its stream stays open
		remove(indexfn.c_str());

		// keys 2*i, 2*i+2, 2*i+4 all carry the value i+1
		uint64_t const val = i+1;
		for ( uint64_t j = 0; j < 3; ++j )
		{
			uint64_t const key = 2*i + 2*j;
			SGE.encode(key,val);
			refM[key] += val;
		}
		SGE.term();

		SGGF.addFile(fn);
	}

	uint64_t const maxval = refM.empty() ? 0 : (refM.rbegin())->first;

	std::string const ffn = tmpgen.getFileName();
	std::vector<std::string> const fno = SGGF.mergeToDense(ffn,maxval+1);

	// libmaus::aio::CheckedInputStream CIS(ffn);
	libmaus::gamma::GammaGapDecoder SGGD(fno);
	for ( uint64_t key = 0; key < maxval+1; ++key )
	{
		uint64_t const dv = SGGD.decode();
		ref_map_type::const_iterator const it = refM.find(key);
		uint64_t const expected = (it != refM.end()) ? it->second : 0;

		std::cerr << dv << "(" << expected << ")";
		assert ( expected == dv );
		std::cerr << ";";
	}
	std::cerr << std::endl;

	// clean up the merged output files
	for ( std::vector<std::string>::const_iterator fit = fno.begin(); fit != fno.end(); ++fit )
	{
		// std::cerr << *fit << std::endl;
		remove(fit->c_str());
	}
}
Пример #6
0
/**
 * Test SparseGammaGapFileSet::merge.
 *
 * 25 sparse gamma gap files are written; file i stores the value i+1 for
 * the keys 2*i, 2*i+2 and 2*i+4. The expected per key sums are kept in a
 * map. The files are merged into a single file from which 60 values are
 * decoded (this is more than the largest key written, 52, plus one),
 * printed on std::cerr as "value(expected);" and compared with the map.
 * Keys absent from the map are expected to decode as 0. The merged file
 * is removed at the end.
 **/
void testsparsegammamerge()
{
	typedef std::map<uint64_t,uint64_t> ref_map_type;

	libmaus::util::TempFileNameGenerator tmpgen("tmp",3);
	libmaus::gamma::SparseGammaGapFileSet SGGF(tmpgen);
	ref_map_type refM;

	for ( uint64_t i = 0; i < 25;  ++i )
	{
		std::string const fn = tmpgen.getFileName();
		libmaus::aio::CheckedOutputStream COS(fn);
		libmaus::gamma::SparseGammaGapEncoder SGE(COS);

		// keys 2*i, 2*i+2, 2*i+4 all carry the value i+1
		uint64_t const val = i+1;
		for ( uint64_t j = 0; j < 3; ++j )
		{
			uint64_t const key = 2*i + 2*j;
			SGE.encode(key,val);
			refM[key] += val;
		}
		SGE.term();

		SGGF.addFile(fn);
	}

	std::string const ffn = tmpgen.getFileName();
	SGGF.merge(ffn);

	libmaus::aio::CheckedInputStream CIS(ffn);
	libmaus::gamma::SparseGammaGapDecoder SGGD(CIS);
	for ( uint64_t key = 0; key < 60; ++key )
	{
		uint64_t const dv = SGGD.decode();
		ref_map_type::const_iterator const it = refM.find(key);
		uint64_t const expected = (it != refM.end()) ? it->second : 0;

		std::cerr << dv << "(" << expected << ")";
		assert ( expected == dv );
		std::cerr << ";";
	}
	std::cerr << std::endl;

	remove(ffn.c_str());
}
Пример #7
0
/**
 * Exhaustive test of ImpHuffmanWaveletTree on a long repetitive text.
 *
 * A base sentence is doubled 16 times and cut to 1572929 symbols. The
 * text is split into numfrags fragments which are (if PAR is defined)
 * fed to the fragments of an ImpExternalWaveletGeneratorHuffmanParallel
 * object, in parallel if OpenMP is available. The result is written to
 * the file "hufwuf", loaded as ImpHuffmanWaveletTree and for each
 * position access, rank, rankm and inverseSelect are checked against the
 * text and running symbol counts.
 *
 * The file "hufwuf" is removed again before the function returns.
 **/
void testHuffmanWavelet()
{
	// std::string text = "Hello world.";
	std::string text = "fischers fritze fischt frische fische der biber schwimmt im fluss und bleibt immer treu";

	// double the text 16 times, then truncate it
	for ( uint64_t i = 0; i < 16; ++i )
		text = text+text;
	text = text.substr(0,1572929);

	std::cerr << "Checking text of size " << text.size() << std::endl;

	::libmaus::util::shared_ptr< ::libmaus::huffman::HuffmanTreeNode >::type sroot = ::libmaus::huffman::HuffmanBase::createTree(text.begin(),text.end());

	::libmaus::util::TempFileNameGenerator tmpgen("tmphuf",3);
	uint64_t const numfrags = 128;
	std::string const hwtfn = "hufwuf";

	// PAR selects the parallel (fragmented) generator
	#define PAR
	#if defined(PAR)
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffmanParallel exgen(sroot.get(), tmpgen, numfrags);
	#else
	::libmaus::wavelet::ImpExternalWaveletGeneratorHuffman exgen(sroot.get(), tmpgen);
	#endif

	// fragment f covers [f*symsperfrag,(f+1)*symsperfrag) clipped to the text
	uint64_t const textsize = text.size();
	uint64_t const symsperfrag = (textsize + numfrags-1)/numfrags;

	#if defined(PAR) && defined(_OPENMP)
	#pragma omp parallel for
	#endif
	for ( int64_t f = 0; f < static_cast<int64_t>(numfrags); ++f )
	{
		uint64_t const low = std::min(static_cast<uint64_t>(f)*symsperfrag,textsize);
		uint64_t const high = std::min(low+symsperfrag,textsize);

		// std::cerr << "f=" << f << " low=" << low << " high=" << high << std::endl;

		for ( uint64_t i = low; i < high; ++i )
			#if defined(PAR)
			exgen[f].putSymbol(text[i]);
			#else
			exgen.putSymbol(text[i]);
			#endif
	}
	exgen.createFinalStream(hwtfn);

	// stream and tree live in their own scope so that both are gone
	// before the file is removed below
	{
		std::ifstream istr(hwtfn.c_str(),std::ios::binary);
		::libmaus::wavelet::ImpHuffmanWaveletTree IHWT(istr);
		::libmaus::autoarray::AutoArray<int64_t> symar = sroot->symbolArray();

		::libmaus::huffman::EncodeTable<1> E(IHWT.sroot.get());
		E.print();

		#if 0
		for ( uint64_t i = 0; i < IHWT.size(); ++i )
			std::cerr << static_cast<char>(IHWT[i]);
		std::cerr << std::endl;
		#endif

		// rmap[s] is the number of occurences of s at positions already handled
		std::map<int64_t, uint64_t> rmap;

		for ( uint64_t i = 0; i < IHWT.size(); ++i )
		{
			int64_t const tsym = text[i];

			#if 0
			std::cerr
				<< static_cast<char>(IHWT.inverseSelect(i).first)
				<< "("
				<< IHWT.inverseSelect(i).second
				<< ")"
				<< "["
				<< IHWT.rank(text[i],i)
				<< "]";
			#endif

			/**
			 * check symbol
			 **/
			if ( static_cast<int64_t>(IHWT[i]) != tsym )
				std::cerr << "Failure for i=" << i << " expected " << static_cast<int>(text[i]) << " got " << IHWT[i] << std::endl;
			assert ( static_cast<int64_t>(IHWT[i]) == tsym );

			/**
			 * before position i is counted: rankm(sym,i) has to match
			 * rank(sym,i-1) (0 for i=0) and the running count
			 **/
			for ( uint64_t j = 0; j < symar.size(); ++j )
			{
				int64_t const sym = symar[j];
				uint64_t const ra = i ? IHWT.rank(sym,i-1) : 0;
				uint64_t const rb = IHWT.rankm(sym,i);
				assert ( ra == rb );
				assert ( rb == rmap[sym] );
			}

			assert ( IHWT.inverseSelect(i).second == IHWT.rankm(tsym,i) );
			assert ( IHWT.inverseSelect(i).second == rmap[tsym] );

			++rmap [ IHWT[i] ];

			/**
			 * after position i is counted: rank(sym,i) has to match the running count
			 **/
			for ( uint64_t j = 0; j < symar.size(); ++j )
			{
				int64_t const sym = symar[j];
				assert ( IHWT.rank(sym,i) == rmap[sym] );
			}

			assert ( IHWT.inverseSelect(i).first == tsym );

			// std::cerr << IHWT.inverseSelect(i).second << "\t" << rmap[text[i]] << std::endl;

			assert ( IHWT.inverseSelect(i).second + 1 == IHWT.rank(tsym,i) );
			assert ( IHWT.inverseSelect(i).second + 1 == rmap[tsym] );

			// assert ( IHWT.select ( text[i], IHWT.rank(text[i],i)-1 ) == i );
		}
	}

	// do not leave the serialised tree behind
	remove(hwtfn.c_str());
}
Пример #8
0
/**
 * Compute alignments of the sequences in the FastA file fn against themselves
 * and print them on std::cout.
 *
 * Steps performed:
 *  - create fn + ".compact" via fastaToCompact4BigBandBiDir if it does not
 *    exist or isOlder(compactfn,fn) holds
 *  - compute the BWT for the compact file (storing the result meta data in
 *    fn + ".bwt.meta") or load existing meta data
 *  - load the rank index, the sampled suffix array and the text
 *  - process the sequences in packs: compute active intervals for the pack,
 *    sort them and let the threads pull intervals, enumerate SMEMs on them
 *    and run an SMEMProcessor; alignments for which both coordinate ranges
 *    map to exactly one entry are stored in a sorting output file
 *  - merge the stored coordinate pairs and print each one together with
 *    its error rate
 *
 * Arguments read from arg: t (number of threads), chainminscore, maxerr,
 * K (k-mer cache size), P (maximum pack size).
 *
 * @param arg parsed command line arguments
 * @param fn name of the input FastA file
 **/
void selfie(libmaus2::util::ArgParser const & arg, std::string const & fn)
{
	std::string const compactfn = fn + ".compact";
	std::string const compactmetafn = compactfn + ".meta";

	// (re)create the compact file if it is missing or isOlder(compactfn,fn) holds
	// (presumably: the compact file is older than the FastA file)
	if (
		! libmaus2::util::GetFileSize::fileExists(compactfn)
		||
		libmaus2::util::GetFileSize::isOlder(compactfn,fn)
	)
	{
		libmaus2::fastx::FastAToCompact4BigBandBiDir::fastaToCompact4BigBandBiDir(
			std::vector<std::string>(1,fn),
			&(std::cerr),
			false /* single strand */,
			compactfn
		);
	}

	// number of threads: argument t if given, library default otherwise
	uint64_t const numthreads =
		arg.uniqueArgPresent("t") ? arg.getUnsignedNumericArg<uint64_t>("t") : libmaus2::suffixsort::bwtb3m::BwtMergeSortOptions::getDefaultNumThreads();

	std::string const bwtfn = fn + ".bwt";
	std::string const bwtmetafn = bwtfn + ".meta";

	// compute the BWT and serialise the result meta data, or load the
	// meta data if the file exists and isOlder(bwtmetafn,compactfn) does not hold
	libmaus2::suffixsort::bwtb3m::BwtMergeSortResult res;
	if (
		! libmaus2::util::GetFileSize::fileExists(bwtmetafn)
		||
		libmaus2::util::GetFileSize::isOlder(bwtmetafn,compactfn)
	)
	{
		libmaus2::suffixsort::bwtb3m::BwtMergeSortOptions options(
			compactfn,
			16*1024ull*1024ull*1024ull, // mem
			// libmaus2::suffixsort::bwtb3m::BwtMergeSortOptions::getDefaultMem(),
			numthreads,
			"compactstream",
			false /* bwtonly */,
			std::string("mem:tmp_"),
			std::string(), // sparse
			bwtfn,
			16 /* isa */,
			16 /* sa */
		);

		res = libmaus2::suffixsort::bwtb3m::BwtMergeSort::computeBwt(options,&std::cerr);
		res.serialise(bwtmetafn);
	}
	else
	{
		res.deserialise(bwtmetafn);
	}

	//libmaus2::fastx::FastAIndex::unique_ptr_type PFAI(libmaus2::fastx::FastAIndex::load(fn+".fai"));
	libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::unique_ptr_type Pmeta(libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::load(compactmetafn));

	// rank index and sampled suffix array as described by the BWT result
	libmaus2::rank::DNARank::unique_ptr_type Prank(res.loadDNARank(numthreads));
	libmaus2::suffixsort::bwtb3m::BwtMergeSortResult::BareSimpleSampledSuffixArray BSSSA(res.loadBareSimpleSuffixArray());

	// read the n text symbols from the compact file into A
	uint64_t const n = Prank->size();
	libmaus2::autoarray::AutoArray<char> A(n,false);
	libmaus2::bitio::CompactDecoderWrapper CDW(compactfn);
	CDW.read(A.begin(),n);
	assert ( CDW.gcount() == static_cast<int64_t>(n) );

	// fixed parameters passed on to the SMEM enumeration and the SMEM processor,
	// chainminscore and maxerr can be set via arguments of the same name
	uint64_t const minfreq = 2;
	uint64_t const minlen = 20;
	uint64_t const limit = 32;
	uint64_t const minsplitlength = 28;
	uint64_t const minsplitsize = 10;
	uint64_t const maxxdist = 1000;
	uint64_t const activemax = 1;
	uint64_t const fracmul = 95;
	uint64_t const fracdiv = 100;
	bool const selfcheck = true;
	uint64_t const chainminscore = arg.uniqueArgPresent("chainminscore") ? arg.getUnsignedNumericArg<uint64_t>("chainminscore") : 20;
	uint64_t const maxocc = 500;
	uint64_t const minprintlength = 1024;
	uint64_t const algndommul = 95;
	uint64_t const algndomdiv = 100;
	uint64_t const chaindommul = 95;
	uint64_t const chaindomdiv = 100;
	double const maxerr = arg.uniqueArgPresent("maxerr") ? arg.getParsedArg<double>("maxerr") : std::numeric_limits<double>::max();

	// k-mer cache size (argument K, default 12) and maximum pack size (argument P, default 128 Mi)
	uint64_t const cachek = arg.uniqueArgPresent("K") ? arg.getUnsignedNumericArg<uint64_t>("K") : 12;
	uint64_t const maxpacksize = arg.uniqueArgPresent("P") ? arg.getUnsignedNumericArg<uint64_t>("P") : 128ull*1024ull*1024ull;
	std::cerr << "[V] generating " << cachek << "-mer cache...";
	libmaus2::rank::DNARankKmerCache::unique_ptr_type Pcache(new libmaus2::rank::DNARankKmerCache(*Prank,cachek,numthreads));
	std::cerr << "done." << std::endl;

	std::string const deftmp = libmaus2::util::ArgInfo::getDefaultTmpFileName(arg.progname);
	libmaus2::util::TempFileNameGenerator tmpgen(deftmp,3);

	// sorting output file collecting the coordinate pairs of all packs,
	// CPSlock serialises the put calls of the worker threads
	std::string const sorttmp = tmpgen.getFileName();
	libmaus2::util::TempFileRemovalContainer::addTempFile(sorttmp);
	libmaus2::sorting::SortingBufferedOutputFile<CoordinatePair> CPS(sorttmp);
	libmaus2::parallel::PosixSpinLock CPSlock;

	// acc_s is the sum of the lengths of all packs handled so far
	uint64_t acc_s = 0;
	for ( uint64_t zz = 0; zz < Pmeta->S.size(); )
	{
		// form a pack S[zz..zze): it always contains S[zz], further
		// entries are added as long as the sum of lengths stays <= maxpacksize
		uint64_t zze = zz;
		uint64_t pack_s = Pmeta->S[zze++].l;

		while ( zze < Pmeta->S.size() && pack_s + Pmeta->S[zze].l <= maxpacksize )
			pack_s += Pmeta->S[zze++].l;

		// std::cerr << "[V] " << zz << "-" << zze << " pack_s=" << pack_s << std::endl;

		zz = zze;

		// text interval [low,high) covered by this pack
		uint64_t const low = acc_s;
		uint64_t const high = acc_s + pack_s;

		std::cerr << "[V] low=" << low << " high=" << high << " acc_s=" << acc_s << " pack_s=" << pack_s << std::endl;

		// compute the active intervals for [low,high), the result is the name of a file
		std::string const activefn =
			libmaus2::rank::DNARankSMEMComputation::activeParallel(tmpgen,*Pcache,A.begin(),low,high,minfreq,minlen,numthreads,maxxdist + 2*(minlen-1));

		libmaus2::gamma::GammaIntervalDecoder::unique_ptr_type Pdec(new libmaus2::gamma::GammaIntervalDecoder(std::vector<std::string>(1,activefn),0/*offset */,1 /* numthreads */));

		// copy all decoded intervals into a sorting output file
		std::string const sortinfn = tmpgen.getFileName(true);
		libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::unique_ptr_type sptr(
			new libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>(sortinfn)
		);

		{
			std::pair<uint64_t,uint64_t> P;
			while ( Pdec->getNext(P) )
			{
				sptr->put(
					GammaInterval(P.first,P.second)
				);
			}
		}

		// merger returning the intervals stored above
		libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::merger_ptr_type Pmerger(
			sptr->getMerger()
		);

		// wrapper around the merger which fetches the next interval
		// while holding a spin lock, so that several threads can pull from it
		struct LockedGet
		{
			libmaus2::parallel::PosixSpinLock lock;
			// libmaus2::gamma::GammaIntervalDecoder& dec;
			libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::merger_ptr_type & Pmerger;

			LockedGet(libmaus2::sorting::SerialisingSortingBufferedOutputFile<GammaInterval>::merger_ptr_type & rPmerger) : Pmerger(rPmerger)
			{
			}

			// store the next interval in P and return true, or return false
			// (leaving P untouched) if the merger has no more intervals
			bool getNext(std::pair<uint64_t,uint64_t> & P)
			{
				bool ok = false;
				{
					libmaus2::parallel::ScopePosixSpinLock slock(lock);
					GammaInterval Q;
					ok = Pmerger->getNext(Q);
					if ( ok )
					{
						P.first = Q.first;
						P.second = Q.second;
					}
				}
				return ok;
			}
		};

		// one interval slot per thread
		libmaus2::autoarray::AutoArray < std::pair<uint64_t,uint64_t> > VP(numthreads);
		LockedGet LG(Pmerger);

		libmaus2::fastx::CoordinateCacheBiDir cocache(*Prank,*Pmeta,16 /* blockshift */);

		// one SMEM processor per thread
		typedef libmaus2::suffixsort::bwtb3m::BwtMergeSortResult::BareSimpleSampledSuffixArray sa_type;
		typedef libmaus2::lcs::SMEMProcessor<sa_type> smem_proc_type;
		libmaus2::autoarray::AutoArray < smem_proc_type::unique_ptr_type > Aproc(numthreads);
		for ( uint64_t i = 0; i < numthreads; ++i )
		{
			smem_proc_type::unique_ptr_type proc(new smem_proc_type(
				*Pmeta,cocache,*Prank,BSSSA,A.begin(),maxxdist,activemax,fracmul,fracdiv,selfcheck,chainminscore,maxocc,algndommul,algndomdiv,chaindommul,chaindomdiv,
				libmaus2::lcs::NNP::getDefaultMaxWindowError(),libmaus2::lcs::NNP::getDefaultMaxBack(),false /* domsameref */
				)
			);
			Aproc[i] = UNIQUE_PTR_MOVE(proc);
		}

		// stateVec, setState and printState are defined outside this function,
		// every thread state is reset to "idle" at the start of a pack
		stateVec.resize(numthreads);
		for ( uint64_t i = 0; i < numthreads; ++i )
			setState(i,"idle");


		// worker threads: each one repeatedly fetches an interval and processes it
		#if defined(_OPENMP)
		#pragma omp parallel num_threads(numthreads)
		#endif
		{
			// thread id, 0 if compiled without OpenMP
			uint64_t const tid =
				#if defined(_OPENMP)
				omp_get_thread_num()
				#else
				0
				#endif
				;
			std::pair<uint64_t,uint64_t> & P = VP[tid];
			smem_proc_type & proc = *(Aproc[tid]);

			// callback handed to proc.process: stores prefix, z and the SMEM
			// as state string of thread tid and prints the states
			struct SelfieVerbosity : public smem_proc_type::Verbosity
			{
				uint64_t tid;
				std::string prefix;

				SelfieVerbosity(uint64_t const rtid, std::string const & rprefix)
				: tid(rtid), prefix(rprefix)
				{

				}

				void operator()(libmaus2::rank::DNARankMEM const & smem, uint64_t const z) const
				{
					std::ostringstream ostr;
					ostr << prefix << "\t" << z << "\t" << smem;
					setState(tid,ostr.str());
					printState();
				}
			};

			while ( LG.getNext(P) )
			{
				// extend the interval by minlen-1 to the left (not below 0)
				// and by minlen to the right (not above n)
				uint64_t const smemleft = std::max(static_cast<int64_t>(0),static_cast<int64_t>(P.first)-static_cast<int64_t>(minlen-1));
				uint64_t const smemright = std::min(P.second+minlen,n);

				std::ostringstream msgstr;
				msgstr << "[" << smemleft << "," << smemright << ")";

				setState(tid,msgstr.str());
				printState();

				// SMEM enumerator for the extended interval
				libmaus2::rank::DNARankSMEMComputation::SMEMEnumerator<char const *> senum(
					*Prank,A.begin(),
					smemleft,
					smemright,
					minfreq,
					minlen,
					limit,
					minsplitlength,
					minsplitsize);

				SelfieVerbosity SV(tid,msgstr.str());

				proc.process(senum,A.begin(),n,minprintlength,maxerr,SV);
				// proc.printAlignments(minprintlength);

				// range of alignments held by the processor after the process call
				std::pair<libmaus2::lcs::ChainAlignment const *, libmaus2::lcs::ChainAlignment const *> const AP =
					proc.getAlignments();

				for ( libmaus2::lcs::ChainAlignment const * it = AP.first; it != AP.second; ++it )
				{
					libmaus2::lcs::ChainAlignment const & CA = *it;
					// note: this res is the alignment result and hides the BWT result res declared above
					libmaus2::lcs::NNPAlignResult const & res = CA.res;

					std::vector<libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::Coordinates> const VA = Pmeta->mapCoordinatePairToList(res.abpos,res.aepos);
					std::vector<libmaus2::fastx::DNAIndexMetaDataBigBandBiDir::Coordinates> const VB = Pmeta->mapCoordinatePairToList(res.bbpos,res.bepos);

					// keep the alignment only if both ranges map to exactly one
					// coordinate entry, the output file is shared so put is locked
					if ( VA.size() == 1 && VB.size() == 1 )
					{
						CoordinatePair CP(VA[0],VB[0],res);
						libmaus2::parallel::ScopePosixSpinLock slock(CPSlock);
						CPS.put(CP);
					}
				}

				setState(tid,"idle");
				printState();

				// disabled consistency check: compares the output of the SMEM enumerator
				// with the SMEMs computed by smemLimitedParallel and smemLimitedParallelSplit
				#if 0
				std::cerr << "P=[" << P.first << "," << P.second << ")" << std::endl;

				{
					std::vector<libmaus2::rank::DNARankMEM> SMEM;
					libmaus2::rank::DNARankSMEMComputation::smemLimitedParallel(
						*Prank,
						*Pcache,
						A.begin(),
						P.first,
						P.second,
						n,
						minfreq,
						minlen,
						limit,
						SMEM,
						1 /* threads */);
					std::cerr << "[V] number of SMEMs is " << SMEM.size() << std::endl;

					// deallocate k-mer cache
					// Pcache.reset();

					std::vector<libmaus2::rank::DNARankMEM> SMEMsplit;
					libmaus2::rank::DNARankSMEMComputation::smemLimitedParallelSplit(*Prank,A.begin(),P.first,P.second,minlen,limit,minsplitlength,minsplitsize,SMEM,SMEMsplit,1 /* threads */);
					std::cerr << "[V] number of split SMEMs is " << SMEMsplit.size() << std::endl;

					// insert split SMEMs into regular SMEMs
					std::copy(SMEMsplit.begin(),SMEMsplit.end(),std::back_insert_iterator< std::vector<libmaus2::rank::DNARankMEM> >(SMEM));
					//libmaus2::sorting::InPlaceParallelSort::inplacesort2(SMEM.begin(),SMEM.end(),numthreads,libmaus2::rank::DNARankMEMPosComparator());
					std::sort(SMEM.begin(),SMEM.end(),libmaus2::rank::DNARankMEMPosComparator());

					SMEM.resize(std::unique(SMEM.begin(),SMEM.end())-SMEM.begin());

					libmaus2::rank::DNARankSMEMComputation::SMEMEnumerator<char const *> senum(
						*Prank,A.begin(),
						std::max(static_cast<int64_t>(0),static_cast<int64_t>(P.first)-static_cast<int64_t>(minlen-1)),
						std::min(P.second+minlen,n),
						minfreq,
						minlen,
						limit,
						minsplitlength,
						minsplitsize);

					libmaus2::rank::DNARankMEM smem;
					uint64_t c = 0;
					while ( senum.getNext(smem) )
					{
						// std::cerr << "ccc=" << smem << std::endl;

						if ( c >= SMEM.size() || smem != SMEM[c] )
						{
							std::cerr << "mismatch " << c << " " << smem;
							if ( c < SMEM.size() )
								std::cerr << " != " << SMEM[c];
							else
								std::cerr << " ???";
							std::cerr << std::endl;
						}
						else
						{
							std::cerr << "match " << c << " " << smem << " " << SMEM[c] << std::endl;
						}

						++c;
					}

					std::cerr << "c=" << c << " V=" << SMEM.size() << std::endl;
				}
				#endif
			}
		}

		// the next pack starts where this one ended
		acc_s += pack_s;
	}

	// print all stored coordinate pairs as "A B errorrate", one per line,
	// in the order produced by the merger
	libmaus2::sorting::SortingBufferedOutputFile<CoordinatePair>::merger_ptr_type Pmerger(CPS.getMerger());
	CoordinatePair CP;
	while ( Pmerger->getNext(CP) )
	{
		std::ostringstream ostr;
		CP.A.print(ostr);
		ostr << " ";
		CP.B.print(ostr);
		ostr << " ";
		ostr << CP.res.getErrorRate();

		std::cout << ostr.str() << std::endl;
	}
}
Пример #9
0
			/**
			 * Construct an object of this type from run length encoded files.
			 *
			 * @param filenames names of the files handed to the RLDecoder
			 * @param maxval value passed to numbits to obtain the
			 *        second constructor argument
			 * @param tmpprefix prefix for the temporary file name generator
			 * @return pointer to the newly constructed object
			 **/
			static unique_ptr_type constructFromRL(std::vector<std::string> const & filenames, uint64_t const maxval, std::string const & tmpprefix )
			{
				// temporary file names are generated below tmpprefix
				::libmaus2::util::TempFileNameGenerator tmpgen(tmpprefix,2);
				// decoder reading the input files
				::libmaus2::huffman::RLDecoder decoder(filenames);

				unique_ptr_type ptr(
					new this_type(
						decoder,
						::libmaus2::math::numbits(maxval),
						tmpgen
					)
				);

				return UNIQUE_PTR_MOVE(ptr);
			}
Пример #10
0
/**
 * Test the parallel construction of a compact Huffman shaped wavelet tree.
 *
 * 64*1024*1024 random byte symbols are generated, a Huffman tree is built
 * from their histogram and the symbols are fed to the per thread
 * generators of ImpExternalWaveletGeneratorCompactHuffmanParallel. The
 * result is written to "tmp.hwt", loaded as ImpCompactHuffmanWaveletTree
 * (the file is removed right after loading) and access, select, rankm and
 * rank are checked for every position. For very small n the fast and slow
 * variants of enumerateSymbolsInRange are compared in addition.
 **/
void testCompactHuffmanPar()
{
	// uint64_t const n = 1024*1024;
	uint64_t const n = 64*1024*1024;

	// random input and its symbol histogram
	std::vector<uint8_t> A;
	A.reserve(n);
	std::map<int64_t,uint64_t> F;
	for ( uint64_t i = 0; i < n; ++i )
	{
		A.push_back(libmaus::random::Random::rand8() & 0xFF);
		F[A.back()]++;
	}
	libmaus::huffman::HuffmanTree H(F.begin(),F.size(),false,true);

	std::cerr << H;

	// libmaus::wavelet::ImpExternalWaveletGeneratorCompactHuffman IEWGHN(H,MTFC);
	libmaus::util::TempFileNameGenerator tmpgen("tmpdir",2);
	#if defined(_OPENMP)
	uint64_t const numthreads = omp_get_max_threads();
	#else
	uint64_t const numthreads = 1;
	#endif
	libmaus::wavelet::ImpExternalWaveletGeneratorCompactHuffmanParallel IEWGHN(H,tmpgen,numthreads);

	// std::cerr << "left construction." << std::endl;

	// Each thread writes to its own generator. The check below expects the
	// final tree to contain A in its original order, so (assuming the
	// generator concatenates the per thread streams by thread id) every
	// thread has to receive one contiguous chunk of A with the chunks
	// following the thread ids. The default loop schedule is implementation
	// defined, so the static schedule is requested explicitly.
	#if defined(_OPENMP)
	#pragma omp parallel for schedule(static)
	#endif
	for ( int64_t i = 0; i < static_cast<int64_t>(n); ++i )
	{
		#if defined(_OPENMP)
		uint64_t const tid = omp_get_thread_num();
		#else
		uint64_t const tid = 0;
		#endif
		IEWGHN[tid].putSymbol(A[i]);
	}

	// serialise to a file, load the tree from it and drop the file
	std::string tmpfilename = "tmp.hwt";
	libmaus::aio::CheckedOutputStream COS(tmpfilename);
	IEWGHN.createFinalStream(COS);
	COS.close();
	libmaus::wavelet::ImpCompactHuffmanWaveletTree::unique_ptr_type pIHWTN(libmaus::wavelet::ImpCompactHuffmanWaveletTree::load(tmpfilename));
	libmaus::wavelet::ImpCompactHuffmanWaveletTree const & IHWTN = *pIHWTN;
	remove(tmpfilename.c_str());

	// std::cerr << IHWTN.size() << std::endl;
	assert ( IHWTN.size() == n );

	// R[s] is the number of occurences of s at positions already handled
	std::map<uint64_t,uint64_t> R;

	for ( uint64_t i = 0; i < IHWTN.size(); ++i )
	{
		// progress output
		if ( i % (32*1024) == 0 )
			std::cerr << static_cast<double>(i) / n << std::endl;
		// std::cerr << IHWTN[i] << ";";
		assert ( IHWTN[i] == A[i] );

		// std::cerr << "[" << i << "," << IHWTN.select(A[i],R[A[i]]) << "]" << ";";
		assert ( i == IHWTN.select(A[i],R[A[i]]) );

		// rankm is expected to exclude position i, rank to include it
		assert ( IHWTN.rankm(A[i],i) == R[A[i]] );
		R[A[i]]++;
		assert ( IHWTN.rank (A[i],i) == R[A[i]] );
	}
	std::cerr << std::endl;

	// quadratic check, only feasible for tiny inputs
	if ( n <= 128 )
	{
		// signed loop index as in the other OpenMP loops
		int64_t const isize = static_cast<int64_t>(IHWTN.size());

		#if defined(_OPENMP)
		#pragma omp parallel for
		#endif
		for ( int64_t si = 0; si <= isize; ++si )
		{
			uint64_t const i = static_cast<uint64_t>(si);

			for ( uint64_t j = i; j <= IHWTN.size(); ++j )
			{
				assert ( IHWTN.enumerateSymbolsInRange(i,j) == IHWTN.enumerateSymbolsInRangeSlow(i,j) );
			}
		}
	}

	// ImpExternalWaveletGeneratorCompactHuffman(libmaus::huffman::HuffmanTree const & rH, ::libmaus::util::TempFileContainer & rtmpcnt)

}