int main() { std::cout << "Parallel Framework: " << PARALLEL_FRAMEWORK << std::endl; std::size_t length(1000); std::vector<std::vector<double>> a(length, std::vector<double>(length)), b(length, std::vector<double>(length)), c(length, std::vector<double>(length)); fillMatrix(a); fillMatrix(b); std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); parallel_for_each(par, 0, length, [&a, &b, &c, length](std::size_t i) { for (std::size_t j = 0; j < length; ++j) { for (std::size_t k = 0; k < length; ++k) { c[i][j] += a[i][k] * b[k][j]; } } }); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1); std::cout << "It took me " << time_span.count() << " seconds." << std::endl; std::cout << "serial fallback: " << std::endl; t1 = std::chrono::high_resolution_clock::now(); parallel_for_each(seq, 0, length, [&a, &b, &c, length](std::size_t i) { for (std::size_t j = 0; j < length; ++j) { for (std::size_t k = 0; k < length; ++k) { c[i][j] += a[i][k] * b[k][j]; } } }); t2 = std::chrono::high_resolution_clock::now(); time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1); std::cout << "It took me " << time_span.count() << " seconds." << std::endl; std::cout << "serial old: " << std::endl; t1 = std::chrono::high_resolution_clock::now(); for (std::size_t i = 0; i < length; ++i) { for (std::size_t j = 0; j < length; ++j) { for (std::size_t k = 0; k < length; ++k) { c[i][j] += a[i][k] * b[k][j]; } } } t2 = std::chrono::high_resolution_clock::now(); time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1); std::cout << "It took me " << time_span.count() << " seconds." << std::endl; return 0; }
void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, array_view<const float>& A, int lda, array_view<const float>& B, int ldb, float beta, array_view<float>& C, int ldc ) { if ((transa != 'N') && (transa != 'n')) { std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; return; } if ((transb != 'T') && (transb != 't')) { std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; return; } // In this code we assume the matrix sizes are multiple of tile size if ((m%TILE_SZ) || (n%TILE_SZ)) { std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ << "; n should be multiple of " << TILE_SZ << std::endl; } int threads[2] = {TILE_SZ, TILE_SZ}; int grid[2] = {m/TILE_SZ*threads[0], n/TILE_SZ*threads[1]}; parallel_for_each(extent<2>(grid).tile(threads[0], threads[1]), [=] (tiled_index<2> tidx) [[hc]] { mysgemmNT(tidx, A, lda, B, ldb, C, ldc, k, alpha, beta); }); }
void kmeans::update_clusters(const dataset & p_centers, cluster_sequence & p_clusters) { const dataset & data = *m_ptr_data; p_clusters.clear(); p_clusters.resize(p_centers.size()); /* fill clusters again in line with centers. */ if (m_ptr_indexes->empty()) { std::vector<std::size_t> winners(data.size(), 0); parallel_for(std::size_t(0), data.size(), [this, &p_centers, &winners](std::size_t p_index) { assign_point_to_cluster(p_index, p_centers, winners); }); for (std::size_t index_point = 0; index_point < winners.size(); index_point++) { const std::size_t suitable_index_cluster = winners[index_point]; p_clusters[suitable_index_cluster].push_back(index_point); } } else { /* This part of code is used by X-Means and in case of parallel implementation of this part in scope of X-Means performance is slightly reduced. Experiments has been performed our implementation and Intel TBB library. But in K-Means case only - it works perfectly and increase performance. */ std::vector<std::size_t> winners(data.size(), 0); parallel_for_each(*m_ptr_indexes, [this, &p_centers, &winners](std::size_t p_index) { assign_point_to_cluster(p_index, p_centers, winners); }); for (std::size_t index_point : *m_ptr_indexes) { const std::size_t suitable_index_cluster = winners[index_point]; p_clusters[suitable_index_cluster].push_back(index_point); } } erase_empty_clusters(p_clusters); }
/**
 * Applies `f` to every element of [first, last) by recursively halving the
 * range: the lower half is handed to a new asynchronous task, the upper half
 * is processed on the calling thread. Recursion stops at single elements.
 *
 * @tparam Iterator  Iterator supporting `last - first` and `first + n`.
 * @tparam Func      Callable invocable with `*first`; copied into each task.
 * @param first, last  Range to visit.
 * @param f          Action applied to each element.
 *
 * An exception thrown by `f` propagates to the caller. If the foreground half
 * throws, the background task is waited for first so it cannot outlive the
 * range it works on.
 */
template <typename Iterator, typename Func>
void parallel_for_each(Iterator first, Iterator last, Func f) {
    std::ptrdiff_t const range_length = last - first;
    if (!range_length)
        return;

    if (range_length == 1) {
        f(*first);
        return;
    }

    Iterator const mid = first + (range_length / 2);

    // The template arguments are spelled out so the address of one concrete
    // specialisation can be passed to std::async.
    std::future<void> bgtask = std::async(std::launch::async,
                                          &parallel_for_each<Iterator, Func>,
                                          first, mid, f);
    try {
        parallel_for_each(mid, last, f);
    }
    catch (...) {
        bgtask.wait();
        throw;
    }

    // Rethrows anything the background half raised.
    bgtask.get();
}
// Merge identical COMDAT sections. // Two sections are considered the same if their section headers, // contents and relocations are all the same. void ICF::run(const std::vector<Chunk *> &Vec) { // Collect only mergeable sections and group by hash value. parallel_for_each(Vec.begin(), Vec.end(), [&](Chunk *C) { if (auto *SC = dyn_cast<SectionChunk>(C)) { bool Global = SC->Sym && SC->Sym->isExternal(); bool Writable = SC->getPermissions() & llvm::COFF::IMAGE_SCN_MEM_WRITE; if (SC->isCOMDAT() && SC->isLive() && Global && !Writable) SC->GroupID = getHash(SC) | (uint64_t(1) << 63); } }); std::vector<SectionChunk *> Chunks; for (Chunk *C : Vec) { if (auto *SC = dyn_cast<SectionChunk>(C)) { if (SC->GroupID) { Chunks.push_back(SC); } else { SC->GroupID = NextID++; } } } // From now on, sections in Chunks are ordered so that sections in // the same group are consecutive in the vector. std::sort(Chunks.begin(), Chunks.end(), [](SectionChunk *A, SectionChunk *B) { return A->GroupID < B->GroupID; }); // Split groups until we get a convergence. int Cnt = 1; forEachGroup(Chunks, equalsConstant); for (;;) { if (!forEachGroup(Chunks, equalsVariable)) break; ++Cnt; } if (Config->Verbose) llvm::outs() << "\nICF needed " << Cnt << " iterations.\n"; // Merge sections in the same group. for (auto It = Chunks.begin(), End = Chunks.end(); It != End;) { SectionChunk *Head = *It++; auto Bound = std::find_if(It, End, [&](SectionChunk *SC) { return Head->GroupID != SC->GroupID; }); if (It == Bound) continue; if (Config->Verbose) llvm::outs() << "Selected " << Head->getDebugName() << "\n"; while (It != Bound) { SectionChunk *SC = *It++; if (Config->Verbose) llvm::outs() << " Removed " << SC->getDebugName() << "\n"; Head->replace(SC); } } }
// Runs ten tasks under the `par` policy; each prints a greeting carrying its
// index. Every task holds `aMutex` while it writes to std::cout.
int main() {
    std::mutex aMutex;

    const auto greet = [&aMutex](std::size_t taskIndex) {
        std::lock_guard<std::mutex> guard(aMutex);
        std::cout << "Hello from task " << taskIndex << std::endl;
    };

    parallel_for_each(par, 0, 10, greet);
    return 0;
}
// Invokes `action` on the nodes below this one and then on this node itself.
// The recursion into `_children` goes through parallel_for_each; the call on
// `*this` is issued only after that call has returned.
// NOTE(review): `Function` is not declared in the visible source -- presumably
// a template parameter whose `template <...>` header is outside this view;
// confirm.
void tree::for_all(Function& action) {
    // Recurse into one child subtree.
    auto visit_subtree = [&action](tree& subtree) {
        subtree.for_all(action);
    };
    parallel_for_each(begin(_children), end(_children), visit_subtree);

    // Finally handle the current node.
    action(*this);
}
// Fills a 100-element vector with 0..99, then prints every value produced by
// Range(1000) through parallel_for_each, passing 10 as its last argument.
// NOTE(review): `values` is filled but never read afterwards.
int main() {
    std::vector<int> values(100);
    int next = 0;
    for (int& slot : values) {
        slot = next;
        ++next;
    }

    const auto print_value = [](int a) { printf("%d\n", a); };
    parallel_for_each(Range(1000), print_value, 10);

    return 0;
}
/**
 * Applies `f` to every element of [first, last).
 *
 * Ranges shorter than 2 * min_per_thread (50) elements are handled with a
 * plain std::for_each. Longer ranges are split in the middle: the lower half
 * is passed to std::async (default launch policy), the upper half is processed
 * recursively on the calling thread, and the lower half's result is collected
 * before returning.
 *
 * @tparam Iterator  Iterator usable with std::distance and `first + n`.
 * @tparam Func      Callable invocable with `*first`; copied into each task.
 * @param first, last  Range to visit.
 * @param f          Action applied to each element.
 *
 * An exception stored by the asynchronous half is rethrown by `get()`.
 */
template <typename Iterator, typename Func>
void parallel_for_each(Iterator first, Iterator last, Func f) {
    const unsigned long length = std::distance(first, last);
    if (!length)
        return;

    const unsigned long min_per_thread = 25;
    if (length < 2 * min_per_thread) {
        std::for_each(first, last, f);
    }
    else {
        const Iterator mid_point = first + length / 2;

        // The template arguments are spelled out so the address of one
        // concrete specialisation can be passed to std::async.
        std::future<void> first_half =
            std::async(&parallel_for_each<Iterator, Func>, first, mid_point, f);
        parallel_for_each(mid_point, last, f);
        first_half.get();
    }
}
// Finds all prime factors of the given value. concurrent_vector<int> prime_factors_of(int n, const concurrent_vector<int>& primes) { // Holds the prime factors of n. concurrent_vector<int> prime_factors; // Use trial division to find the prime factors of n. // Every prime number that divides evenly into n is a prime factor of n. const int max = sqrt(static_cast<double>(n)); parallel_for_each(begin(primes), end(primes), [&](int prime) { if (prime <= max) { if ((n % prime) == 0) prime_factors.push_back(prime); } }); return prime_factors; }
// Range convenience overload: forwards std::begin(range) / std::end(range),
// together with `callback` and `numSegments`, to the four-argument
// parallel_for_each overload.
// `range` is received by value and the iterators passed on refer to that
// local object.
// NOTE(review): `T` and `F` are not declared in the visible source --
// presumably template parameters whose `template <...>` header is outside
// this view; confirm.
void parallel_for_each(T range, F callback, int numSegments) {
    parallel_for_each(std::begin(range), std::end(range), callback, numSegments);
}
/*
 * Whole-program optimization driver.
 *
 * Analyzes work items in rounds until the work list is empty, feeding each
 * round's inferred return types back into `index`, and then runs
 * optimize_func() on a fresh analysis of every item returned by
 * initial_work(program).
 *
 * index    Queried by analyze_func() and refined between rounds through
 *          refine_return_type().
 * program  Source of the work items; state_after("optimize", program) runs
 *          on scope exit.
 */
void optimize(Index& index, php::Program& program) {
  assert(check(program));
  trace_time tracer("optimize");
  SCOPE_EXIT { state_after("optimize", program); };

  // Counters, just for debug printing.
  std::atomic<uint32_t> total_funcs{0};
  auto round = uint32_t{0};

  /*
   * Algorithm:
   *
   * Start by running an analyze pass on every function.  During
   * analysis, information about functions or classes will be
   * requested from the Index, which initially won't really know much,
   * but will record a dependency.  This part is done in parallel: no
   * passes are mutating anything, just reading from the Index.
   *
   * After a pass, we do a single-threaded "update" step to prepare
   * for the next pass: for each function that was analyzed, note the
   * facts we learned that may aid analyzing other functions in the
   * program, and register them in the index.  At this point, if any
   * of these facts are more useful than they used to be, add all the
   * Contexts that had a dependency on the new information to the work
   * list again, in case they can do better based on the new fact.
   *
   * Repeat until the work list is empty.
   */
  auto work = initial_work(program);
  while (!work.empty()) {
    auto const results = [&] {
      trace_time trace(
        "analyzing",
        folly::format("round {} -- {} work items", round, work.size()).str()
      );
      return parallel_map(
        work,
        [&] (const Context& ctx) -> folly::Optional<FuncAnalysis> {
          total_funcs.fetch_add(1, std::memory_order_relaxed);
          return analyze_func(index, ctx);
        }
      );
    }();
    ++round;

    trace_time update_time("updating");

    std::set<Context> revisit;
    for (auto i = size_t{0}; i < results.size(); ++i) {
      auto& result = *results[i];

      // results[i] must correspond to work[i]; the work list therefore has
      // to stay intact until these checks are done.
      assert(result.ctx.func == work[i].func);
      assert(result.ctx.cls == work[i].cls);
      assert(result.ctx.unit == work[i].unit);

      auto deps = index.refine_return_type(
        result.ctx.func, result.inferredReturn
      );
      for (auto& d : deps) revisit.insert(d);
    }

    // Only now drop the processed items: clearing before the loop above made
    // the asserts index into an empty vector.
    work.clear();
    std::copy(begin(revisit), end(revisit), std::back_inserter(work));
  }

  if (Trace::moduleEnabledRelease(Trace::hhbbc_time, 1)) {
    Trace::traceRelease("total function visits %u\n", total_funcs.load());
  }

  /*
   * Finally, use the results of all these iterations to perform
   * optimization.  This reanalyzes every function using our
   * now-very-updated Index, and then runs optimize_func with the
   * results.
   *
   * We do this in parallel: all the shared information is queried out
   * of the index, and each thread is allowed to modify the bytecode
   * for the function it is looking at.
   *
   * NOTE: currently they can't modify anything other than the
   * bytecode/Blocks, because other threads may be doing unlocked
   * queries to php::Func and php::Class structures.
   */
  trace_time final_pass("final pass");
  // The work list is built once, directly as the argument; `work` is empty
  // here and is not needed again.
  parallel_for_each(
    initial_work(program),
    [&] (Context ctx) {
      optimize_func(index, analyze_func(index, ctx));
    }
  );
}
/*
 * Container convenience overload: forwards std::begin(p_container) /
 * std::end(p_container) and `p_task` to the iterator-pair overload of
 * parallel_for_each.
 *
 * p_container  Container whose elements are visited; taken by const reference.
 * p_task       Action passed through unchanged.
 *
 * NOTE(review): `TypeContainer` and `TypeAction` are not declared in the
 * visible source -- presumably template parameters whose `template <...>`
 * header is outside this view; confirm.
 */
void parallel_for_each(const TypeContainer & p_container, const TypeAction & p_task) {
    parallel_for_each(std::begin(p_container), std::end(p_container), p_task);
}