::libmaus2::util::Histogram::unique_ptr_type libmaus2::util::Utf8String::getHistogram(::libmaus2::autoarray::AutoArray<uint8_t> const & A) { #if defined(_OPENMP) uint64_t const numthreads = omp_get_max_threads(); #else uint64_t const numthreads = 1; #endif ::libmaus2::autoarray::AutoArray<uint64_t> const partstarts = computePartStarts(A,numthreads); uint64_t const numparts = partstarts.size()-1; ::libmaus2::util::Histogram::unique_ptr_type hist(new ::libmaus2::util::Histogram); ::libmaus2::parallel::OMPLock lock; #if defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t t = 0; t < static_cast<int64_t>(numparts); ++t ) { ::libmaus2::util::Histogram::unique_ptr_type lhist(new ::libmaus2::util::Histogram); uint64_t codelen = 0; uint64_t const tcodelen = partstarts[t+1]-partstarts[t]; ::libmaus2::util::GetObject<uint8_t const *> G(A.begin()+partstarts[t]); while ( codelen != tcodelen ) (*lhist)(::libmaus2::util::UTF8::decodeUTF8(G,codelen)); lock.lock(); hist->merge(*lhist); lock.unlock(); } return UNIQUE_PTR_MOVE(hist); }
/** * compute character histogram in parallel **/ ::libmaus::autoarray::AutoArray<uint64_t> computeCharHist(std::string const & inputfile) { uint64_t const n = ::libmaus::util::GetFileSize::getFileSize(inputfile); #if defined(_OPENMP) uint64_t const numthreads = omp_get_max_threads(); #else uint64_t const numthreads = 1; #endif uint64_t const packsize = (n + numthreads-1)/numthreads; ::libmaus::parallel::OMPLock lock; ::libmaus::autoarray::AutoArray<uint64_t> ghist(256); #if defined(_OPENMP) #pragma omp parallel for #endif for ( int64_t t = 0; t < static_cast<int64_t>(numthreads); ++t ) { uint64_t const low = std::min(n,t*packsize); uint64_t const high = std::min(n,low+packsize); uint64_t const range = high-low; if ( range ) { ::libmaus::autoarray::AutoArray<uint64_t> lhist(ghist.size()); ::libmaus::aio::CheckedInputStream CIS(inputfile); CIS.seekg(low); uint64_t const blocksize = 8192; uint64_t const numblocks = ((range)+blocksize-1)/blocksize; ::libmaus::autoarray::AutoArray<uint8_t> B(blocksize); for ( uint64_t b = 0; b < numblocks; ++b ) { uint64_t const llow = std::min(low + b*blocksize,high); uint64_t const lhigh = std::min(llow + blocksize,high); uint64_t const lrange = lhigh-llow; CIS.read ( reinterpret_cast<char *>(B.begin()), lrange ); assert ( CIS.gcount() == static_cast<int64_t>(lrange) ); for ( uint64_t i = 0; i < lrange; ++i ) lhist[B[i]]++; } lock.lock(); for ( uint64_t i = 0; i < lhist.size(); ++i ) ghist[i] += lhist[i]; lock.unlock(); } } return ghist; }