/* Demo driver: runs the sequential `reduce` and the `parallel_reduce`
 * implementation over the same ten-element array, once with `max` and once
 * with `sum`, and prints each pair of results. */
int main()
{
    int data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    const int count = 10; /* number of elements handed to every reduction */

    /* Sequential results. */
    const int seq_max = reduce(max, data, count);
    const int seq_sum = reduce(sum, data, count);
    printf("max : %i; sum: %i\n", seq_max, seq_sum);

    /* Same two reductions through the parallel implementation. */
    const int par_max = parallel_reduce(max, data, count);
    const int par_sum = parallel_reduce(sum, data, count);
    printf("parallel max : %i; parallel sum: %i\n", par_max, par_sum);

    return 0;
}
void dot( const ConstVectorType & X , const Finalize & finalize ) { typedef DotSingle< ConstVectorType > functor ; parallel_reduce( X.dimension_0() , functor( X ) , finalize ); }
void dot( const ConstVectorType & X , const Finalize & finalize ) { typedef DotSingle< ConstVectorType > functor ; parallel_reduce( X.extent(0) , functor( X ) , finalize ); }
double parallelPearsonCorrelationCoefficient(double *a, double *b) { double meanA = parallelMean(a); double meanB = parallelMean(b); double standardDeviationA = parallelStandardDeviation(a,&meanA); double standardDeviationB = parallelStandardDeviation(b,&meanB); // TOP of fraction double value = parallel_reduce(blocked_range<size_t>(0, vec_size, 10000), double(0), [=](blocked_range<size_t> &r, double sum) -> double { for(size_t i=r.begin();i!=r.end();++i){ sum += ((a[i]-meanA) * (b[i]-meanB)); } return sum; }, [=](double a, double b){ return a+b; }); value *= 1.0/vec_size; // BOTTOM of fraction value /= (standardDeviationA*standardDeviationB); return value; }
// Runs LIFE_ITERATIONS generations of the cellular automaton held by a
// GameOfLife object built over `input` and `output` (nr x nc cells).
//
// Each generation: one parallel_reduce over the full 2D range, a call to
// no_cells_alive() when the game reports no live cells, then a buffer swap.
// When the iteration count is even, `input` is copied into `output` at the end.
void CowichanTBB::life(BoolMatrix input, BoolMatrix output)
{
  GameOfLife game(input, output, nr, nc);

  index_t generation = 0;
  while (generation < LIFE_ITERATIONS) {
    // advance the simulation by one step
    parallel_reduce(Range2D(0, nr, 0, nc), game, auto_partitioner());

    // report when nothing is left alive
    if (!game.isAlive()) {
      no_cells_alive();
    }

    // ping-pong the two arrays
    game.swap();
    ++generation;
  }

  // odd iteration count: nothing to copy
  if (LIFE_ITERATIONS % 2 != 0) {
    return;
  }

  // even iteration count: the final result is in input, so copy it to output
  for (index_t row = 0; row < nr; row++) {
    for (index_t col = 0; col < nc; col++) {
      MATRIX_RECT(output, row, col) = MATRIX_RECT(input, row, col);
    }
  }
}
// Computes the density of `particle` and stores it via SetDensity().
//
// The density is FluidParticle::mass times the sum, over the particle's
// `neighborsCount` entries in `neighbors`, of
//   KernelPoly6(distance(neighbor position, particle position), m_kernelRadius).
//
// With USE_TBB defined the sum is a parallel_reduce over the neighbor pointer
// range; otherwise it is a plain sequential loop.
void SPHSolver::CalculateDensity( FluidParticle* particle )
{
	// -- Compute density over a kernel function of neighbors
#ifdef USE_TBB
	float density = parallel_reduce(
		blocked_range<FluidParticle**>( particle->neighbors, particle->neighbors + particle->neighborsCount ),
		0.f,
		// Partial sum of the kernel contributions for one sub-range.
		[&](const blocked_range<FluidParticle**>& range, float init)->float {
			for( FluidParticle** neighbor = range.begin(); neighbor != range.end(); ++neighbor) {
				init += KernelPoly6(glm::distance((*neighbor)->Position(), particle->Position()), m_kernelRadius);
			}
			return init;
		},
		// Join of two partial sums.
		[]( float x, float y )->float { return x + y; }
	);
#else
	float density = 0.0f;
	for (size_t i = 0; i < particle->neighborsCount; ++i) {
		FluidParticle* neighbor = particle->neighbors[i];
		float tempDensity = KernelPoly6(glm::distance(neighbor->Position(), particle->Position()), m_kernelRadius);
		density += tempDensity;
	}
#endif
	// No token may follow #endif on its line; the former "#endif;" was
	// non-conforming and produced extra-token warnings.
	density = FluidParticle::mass * density;
	particle->SetDensity(density);
}
void testFakeAttributeRead() { task_scheduler_init scheduler( 100 ); TestSceneCache task( "fake" ); parallel_reduce( blocked_range<size_t>( 0, 100 ), task ); BOOST_CHECK( task.errors() == 100000 ); }
// Builds a Dot functor whose `x` member is set to the given vector, runs
// parallel_reduce over `n` work items with it, and returns the value that
// parallel_reduce wrote into the zero-initialized accumulator.
inline static value_type apply( const size_t n , const scalar_vector & x )
{
  Dot functor ;
  functor.x = x ;

  value_type total = 0 ;
  parallel_reduce( n , functor , total );
  return total ;
}
void InitThreadPool() { boost::uint32_t systemCores = Threading::GetAvailableCoresMask(); boost::uint32_t mainAffinity = systemCores; #ifndef UNIT_TEST mainAffinity &= configHandler->GetUnsigned("SetCoreAffinity"); #endif boost::uint32_t ompAvailCores = systemCores & ~mainAffinity; { #ifndef UNIT_TEST int workerCount = std::min(ThreadPool::GetMaxThreads() - 1, configHandler->GetUnsigned("WorkerThreadCount")); ThreadPool::SetThreadSpinTime(configHandler->GetUnsigned("WorkerThreadSpinTime")); #else int workerCount = -1; #endif const int numCores = ThreadPool::GetMaxThreads(); // For latency reasons our worker threads yield rarely and so eat a lot cputime with idleing. // So it's better we always leave 1 core free for our other threads, drivers & OS if (workerCount < 0) { if (numCores == 2) { workerCount = numCores; } else if (numCores < 6) { workerCount = numCores - 1; } else { workerCount = numCores / 2; } } if (workerCount > numCores) { LOG_L(L_WARNING, "Set ThreadPool workers to %i, but there are just %i cores!", workerCount, numCores); workerCount = numCores; } ThreadPool::SetThreadCount(workerCount); } // set affinity of worker threads boost::uint32_t ompCores = 0; ompCores = parallel_reduce([&]() -> boost::uint32_t { const int i = ThreadPool::GetThreadNum(); // 0 is the source thread, skip if (i == 0) return 0; boost::uint32_t ompCore = GetCpuCoreForWorkerThread(i - 1, ompAvailCores, mainAffinity); //boost::uint32_t ompCore = ompAvailCores; Threading::SetAffinity(ompCore); return ompCore; }, [](boost::uint32_t a, boost::unique_future<boost::uint32_t>& b) -> boost::uint32_t { return a | b.get(); }); // affinity of mainthread boost::uint32_t nonOmpCores = ~ompCores; if (mainAffinity == 0) mainAffinity = systemCores; Threading::SetAffinityHelper("Main", mainAffinity & nonOmpCores); }
// Computes the pressure force on `particle` and stores it via
// SetPressureForce().
//
// For every neighbor the term
//   (P_particle / rho_particle^2 + P_neighbor / rho_neighbor^2)
//     * GradKernelSpiky(r, |r|, m_kernelRadius),   r = particle pos - neighbor pos
// is summed; the sum is then negated and multiplied by FluidParticle::mass
// twice. With USE_TBB defined the sum is a parallel_reduce over the neighbor
// pointer range, otherwise a sequential loop.
void SPHSolver::CalculatePressureForceField( FluidParticle* particle )
{
	// -- Compute pressure gradient
	glm::vec3 pressureGrad(0.0f);
	float particleDensitySquared = particle->Density() * particle->Density();

#ifdef USE_TBB
	pressureGrad = parallel_reduce(
		blocked_range<FluidParticle**>( particle->neighbors, particle->neighbors + particle->neighborsCount ),
		glm::vec3(0.0),
		// Partial gradient sum for one sub-range of neighbors.
		[&](const blocked_range<FluidParticle**>& range, glm::vec3 init)->glm::vec3 {
			for( FluidParticle** it = range.begin(); it != range.end(); ++it ) {
				FluidParticle* other = *it;

				glm::vec3 offset = particle->Position() - other->Position();
				float dist = glm::distance(other->Position(), particle->Position());
				glm::vec3 kernelGrad = GradKernelSpiky(offset, dist, m_kernelRadius);

				float otherDensitySquared = other->Density() * other->Density();
				float pressureTerm = ( (particle->Pressure() / particleDensitySquared) + (other->Pressure() / otherDensitySquared) );
				init += pressureTerm * kernelGrad;
			}
			return init;
		},
		// Join of two partial sums.
		[]( glm::vec3 x, glm::vec3 y )->glm::vec3 { return x + y; }
	);
#else
	for (size_t idx = 0; idx < particle->neighborsCount; ++idx) {
		FluidParticle* other = particle->neighbors[idx];

		glm::vec3 offset = particle->Position() - other->Position();
		float dist = glm::distance(other->Position(), particle->Position());
		glm::vec3 kernelGrad = GradKernelSpiky(offset, dist, m_kernelRadius);

		float otherDensitySquared = other->Density() * other->Density();
		float pressureTerm = ( particle->Pressure() / particleDensitySquared + other->Pressure() / otherDensitySquared );
		pressureGrad += pressureTerm * kernelGrad;
	}
#endif

	pressureGrad = -pressureGrad * FluidParticle::mass * FluidParticle::mass;
	particle->SetPressureForce(pressureGrad);
}
double parallelStandardDeviation(double *a, double *meanValue) { // reduce by pointing to a place in memory for i in a[] then remove the meanValue and then square // emnumerate the above for all a[] then squareroot and divide by array size // This is the Lambda syntax, return sqrt(parallel_reduce(blocked_range<double*>(a, a+vec_size),0.0, [&](const blocked_range<double*>& r, double sum)->double { for(double* i=r.begin(); i!=r.end(); ++i) sum += pow(*i-*meanValue,2.0); return sum; }, [](double x, double y)->double {return x+y;} )/vec_size); }
void buildGraph(std::vector<std::pair<datadim_t, mdMap_t>> data, std::vector<std::pair<discretedim_t, discretedim_t>> dataDiscrete, std::shared_ptr<gc::Graph> graph, data_t threshold) { std::cout << "Build initial graph: " << std::flush; long edgeCount = 0; data_t xMax = std::numeric_limits<data_t>::lowest(); for (std::size_t i = 0; i < data.size(); i++) { std::list<std::size_t> refs; // reverse search existing vertices TBBSearchHelper helper1(i, graph); parallel_reduce(tbb::blocked_range<std::size_t>(0, i), helper1); refs.splice(refs.end(), helper1.refs); // do not add self reference (=i) // test edges too non exisiting vertices TBBEdgeHelper helper2(data[i].first, data[i].second, dataDiscrete[i].first, dataDiscrete[i].second, &data, &dataDiscrete, threshold); parallel_reduce(tbb::blocked_range<std::size_t>(i + 1, data.size()), helper2); xMax = std::max(xMax, helper2.xMax); refs.splice(refs.end(), helper2.refs); // store graph->add(refs); edgeCount += refs.size(); // report progress if (i % 100 == 0) { std::cout << i << std::flush; } else if (i % 10 == 0) { std::cout << "." << std::flush; } } std::cout << "done (" << (edgeCount / 2) << " edges, max="<< xMax << ")" << std::endl; }
/* Regression test for parallel_reduce: for a growing series of sizes N the
 * sum of squares of [0, N) is computed sequentially and compared against M
 * parallel_reduce runs (block size 1024) of the same sum. Throughput is
 * printed per size, followed by "[passed]" or "[failed]".
 *
 * Returns true only if every parallel run of every size matched the
 * sequential result. */
bool operator() ()
{
  bool passed = true;
  printf("%s::%s ... ",TOSTRING(isa),name);
  fflush(stdout);

  const size_t M = 10;
  for (size_t N=10; N<10000000; N*=2.1f)
  {
    /* sequentially calculate sum of squares */
    size_t sum0 = 0;
    for (size_t i=0; i<N; i++) {
      sum0 += i*i;
    }

    /* parallel calculation of sum of squares */
    double t0 = getSeconds();
    for (size_t m=0; m<M; m++)
    {
      size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0),
        [&](const range<size_t>& r) -> size_t
        {
          size_t s = 0;
          for (size_t i=r.begin(); i<r.end(); i++)
            s += i*i;
          return s;
        },
        [](const size_t v0, const size_t v1) {
          return v0+v1;
        });

      /* a mismatch must stay recorded: plain assignment here would let a
         later matching run overwrite an earlier failure */
      if (sum0 != sum1)
        passed = false;
    }
    double t1 = getSeconds();
    printf("%zu/%3.2fM ",N,1E-6*double(N*M)/(t1-t0));
  }

  /* output if test passed or not */
  if (passed) printf("[passed]\n");
  else        printf("[failed]\n");

  return passed;
}
void InitThreadPool() { boost::uint32_t systemCores = Threading::GetAvailableCoresMask(); boost::uint32_t mainAffinity = systemCores; boost::uint32_t ompAvailCores = systemCores & ~mainAffinity; #ifndef UNIT_TEST mainAffinity = systemCores & configHandler->GetUnsigned("SetCoreAffinity"); #endif { int workerCount = -1; #ifndef UNIT_TEST workerCount = configHandler->GetUnsigned("WorkerThreadCount"); ThreadPool::SetThreadSpinTime(configHandler->GetUnsigned("WorkerThreadSpinTime")); #endif // For latency reasons our worker threads yield rarely and so eat a lot cputime with idleing. // So it's better we always leave 1 core free for our other threads, drivers & OS if (workerCount < 0) workerCount = ThreadPool::GetMaxThreads() - 1; //if (workerCount > ThreadPool::GetMaxThreads()) LOG_L(L_WARNING, ""); ThreadPool::SetThreadCount(workerCount); } // set affinity of worker threads boost::uint32_t ompCores = 0; ompCores = parallel_reduce([&]() -> boost::uint32_t { const int i = ThreadPool::GetThreadNum(); if (i != 0) { // 0 is the source thread, skip boost::uint32_t ompCore = GetCpuCoreForWorkerThread(i - 1, ompAvailCores, mainAffinity); Threading::SetAffinity(ompCore); return ompCore; } return 0; }, [](boost::uint32_t a, boost::unique_future<boost::uint32_t>& b) -> boost::uint32_t { return a | b.get(); }); // affinity of mainthread boost::uint32_t nonOmpCores = ~ompCores; if (mainAffinity == 0) mainAffinity = systemCores; Threading::SetAffinityHelper("Main", mainAffinity & nonOmpCores); }
// Normalizes the n points of `pointsIn` into `pointsOut`.
//
// A MinMaxReducer is run over [0, n) to obtain the minimum and maximum
// points; per axis the scaling factor is 1 / (max - min), or 0 when max equals
// min. A Normalizer built from the minimum point and the two factors is then
// applied with parallel_for over the same range.
void CowichanTBB::norm(PointVector pointsIn, PointVector pointsOut)
{
  // find min/max coordinates
  MinMaxReducer minmax(pointsIn);
  parallel_reduce(Range(0, n), minmax, auto_partitioner());

  Point lo = minmax.getMinimum();
  Point hi = minmax.getMaximum();

  // compute scaling factors (a degenerate axis gets factor 0)
  real xfactor = static_cast<real>((hi.x != lo.x) ? 1.0 / (hi.x - lo.x) : 0.0);
  real yfactor = static_cast<real>((hi.y != lo.y) ? 1.0 / (hi.y - lo.y) : 0.0);

  // normalize the vector
  Normalizer normalizer(pointsIn, pointsOut, lo.x, lo.y, xfactor, yfactor);
  parallel_for(Range(0, n), normalizer, auto_partitioner());
}
typename XVector::scalar_type V_Dot( const XVector & x, const YVector & y) { V_DotFunctor<XVector,YVector> f(x,y); return parallel_reduce(x.dimension_0(),f); }
/* Rebuilds the two-level BVH for the scene.
 *
 * Phases, in order:
 *   1. delete per-object BVHs and builders for slots at or beyond scene->size()
 *   2. reset the allocator; an empty scene installs the empty node and returns
 *   3. grow the per-object arrays, create missing per-object BVHs/builders
 *   4. build the per-object BVHs in parallel and collect one BuildRef per
 *      object with non-empty bounds
 *   5. open large nodes (open_sequential), convert the refs into PrimRefs and
 *      build the top-level hierarchy with BVHBuilderBinnedSAH
 *
 * threadIndex and threadCount are not read anywhere in this body.
 *
 * NOTE(review): when PROFILE is not set, the early `return`s in the fast
 * paths below leave build() before bvh->alloc.cleanup() and
 * bvh->postBuild(t0) run. When PROFILE is set, everything between the two
 * PROFILE blocks is the body of a lambda passed to profile(), so those
 * `return`s only leave the lambda -- confirm that both behaviours are
 * intended. */
void BVH4BuilderTwoLevel::build(size_t threadIndex, size_t threadCount)
{
  /* delete some objects */
  /* NOTE(review): builders[] is indexed up to objects.size() here; assumes
     builders is at least as large as objects -- TODO confirm */
  size_t N = scene->size();
  if (N < objects.size()) {
    parallel_for(N, objects.size(), [&] (const range<size_t>& r) {
      for (size_t i=r.begin(); i<r.end(); i++) {
        delete builders[i]; builders[i] = nullptr;
        delete objects[i]; objects[i] = nullptr;
      }
    });
  }

  /* reset memory allocator */
  bvh->alloc.reset();

  /* skip build for empty scene */
  const size_t numPrimitives = scene->getNumPrimitives<TriangleMesh,1>();
  if (numPrimitives == 0) {
    prims.resize(0);
    bvh->set(BVH4::emptyNode,empty,0);
    return;
  }

  double t0 = bvh->preBuild(TOSTRING(isa) "::BVH4BuilderTwoLevel");

  /* in PROFILE builds the build body below becomes a lambda handed to profile() */
#if PROFILE
  profile(2,20,numPrimitives,[&] (ProfileTimer& timer) {
#endif

  /* resize object array if scene got larger */
  if (objects.size() < N) objects.resize(N);
  if (builders.size() < N) builders.resize(N);
  if (refs.size() < N) refs.resize(N);
  nextRef = 0;

  /* create acceleration structures */
  parallel_for(size_t(0), N, [&] (const range<size_t>& r) {
    for (size_t objectID=r.begin(); objectID<r.end(); objectID++) {
      TriangleMesh* mesh = scene->getTriangleMeshSafe(objectID);

      /* verify meshes got deleted properly */
      if (mesh == nullptr || mesh->numTimeSteps != 1) {
        assert(objectID < objects.size () && objects[objectID] == nullptr);
        assert(objectID < builders.size() && builders[objectID] == nullptr);
        continue;
      }

      /* create BVH and builder for new meshes */
      if (objects[objectID] == nullptr)
        createTriangleMeshAccel(mesh,(AccelData*&)objects[objectID],builders[objectID]);
    }
  });

  /* parallel build of acceleration structures */
  parallel_for(size_t(0), N, [&] (const range<size_t>& r) {
    for (size_t objectID=r.begin(); objectID<r.end(); objectID++) {
      /* ignore if no triangle mesh or not enabled */
      TriangleMesh* mesh = scene->getTriangleMeshSafe(objectID);
      if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1)
        continue;

      BVH4* object = objects [objectID]; assert(object);
      Builder* builder = builders[objectID]; assert(builder);

      /* build object if it got modified (PROFILE builds rebuild unconditionally) */
#if !PROFILE
      if (mesh->isModified())
#endif
        builder->build(0,0);

      /* create build primitive */
      /* NOTE(review): nextRef++ runs inside parallel_for; assumes nextRef is
         an atomic counter -- its declaration is not visible here, TODO confirm */
      if (!object->bounds.empty())
        refs[nextRef++] = BVH4BuilderTwoLevel::BuildRef(object->bounds,object->root);
    }
  });

  /* fast path for single geometry scenes */
  if (nextRef == 1) {
    bvh->set(refs[0].node,refs[0].bounds(),numPrimitives);
    return;
  }

  /* open all large nodes */
  refs.resize(nextRef);
  open_sequential(numPrimitives);

  /* fast path for small geometries */
  if (refs.size() == 1) {
    bvh->set(refs[0].node,refs[0].bounds(),numPrimitives);
    return;
  }

  /* compute PrimRefs: one PrimRef per build reference, built from the
     reference's bounds and its node value cast to size_t; the bounds are also
     accumulated into a PrimInfo, with partial results merged pairwise */
  prims.resize(refs.size());
  const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), size_t(1024), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
    PrimInfo pinfo(empty);
    for (size_t i=r.begin(); i<r.end(); i++) {
      pinfo.add(refs[i].bounds());
      prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node);
    }
    return pinfo;
  }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });

  /* skip if all objects were empty */
  if (pinfo.size() == 0)
    bvh->set(BVH4::emptyNode,empty,0);

  /* otherwise build toplevel hierarchy */
  else
  {
    BVH4::NodeRef root;
    BVHBuilderBinnedSAH::build<BVH4::NodeRef>
      (root,
       /* callback: fetch the thread-local allocator */
       [&] { return bvh->alloc.threadLocal2(); },
       /* callback for inner nodes: allocates and clears a BVH4::Node, stores
          each child's geometry bounds in it, points each child record's parent
          at the matching child slot, and writes the encoded node through
          current.parent. Note that the parameter N shadows the outer N. */
       [&] (const isa::BVHBuilderBinnedSAH::BuildRecord& current, BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, FastAllocator::ThreadLocal2* alloc) -> int
       {
         BVH4::Node* node = (BVH4::Node*) alloc->alloc0.malloc(sizeof(BVH4::Node)); node->clear();
         for (size_t i=0; i<N; i++) {
           node->set(i,children[i].pinfo.geomBounds);
           children[i].parent = (size_t*)&node->child(i);
         }
         *current.parent = bvh->encodeNode(node);
         return 0;
       },
       /* callback for leaves: expects exactly one primitive and writes that
          primitive's ID(), cast to a NodeRef, through current.parent
          (presumably the node value stored when prims[] was filled above --
          verify against PrimRef) */
       [&] (const BVHBuilderBinnedSAH::BuildRecord& current, FastAllocator::ThreadLocal2* alloc) -> int
       {
         assert(current.prims.size() == 1);
         *current.parent = (BVH4::NodeRef) prims[current.prims.begin()].ID();
         return 1;
       },
       /* progress callback: dn is ignored, progressMonitor is always called with 0 */
       [&] (size_t dn) { bvh->scene->progressMonitor(0); },
       prims.data(),pinfo,BVH4::N,BVH4::maxBuildDepthLeaf,4,1,1,1.0f,1.0f);

    bvh->set(root,pinfo.geomBounds,numPrimitives);
  }

  /* closes the profile() lambda opened above */
#if PROFILE
  });
#endif

  bvh->alloc.cleanup();
  bvh->postBuild(t0);
}
double ParallelTBBSclMlt(const double* A, const double* B, const int len){ ScalarMultiplicator mul(A, B); parallel_reduce(blocked_range<int>(0, len), mul); return mul.Result(); }
double parallelMean(double *a) { return parallel_reduce(blocked_range<double*>(a,a+vec_size),0.0, [](const blocked_range<double*>& r, double value)->double { return accumulate(r.begin(),r.end(),value); },plus<double>())/vec_size; }
// Runs parallel_reduce over m_bitset.m_blocks.dimension_0() work items with
// this object as the functor and returns the value the reduction wrote into
// the zero-initialized accumulator.
size_type apply() const
{
  size_type total = 0u;
  parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, total);
  return total;
}
// Reduces `coords` to a BBox<dim> using a BBoxFunctor<dim>.
//
// CHECKs that coords.size() is a multiple of dim; the reduction runs over
// coords.size() / dim work items.
BBox<dim> find_bounding_box(Reals coords)
{
  CHECK(coords.size() % dim == 0);
  auto const npoints = coords.size() / dim;
  return parallel_reduce(npoints, BBoxFunctor<dim>(coords));
}
double ParallelTBBSum(const double* src, const int len){ VectorSummator sum(src); parallel_reduce(blocked_range<int>(0, len), sum); return sum.Result(); }