void test_count_down_and_wait(hpx::lcos::local::latch& l)
{
    ++num_threads;
    HPX_TEST(!l.is_ready());
    l.count_down_and_wait();
}

void wait_for_latch(hpx::lcos::local::latch& l)
{
    l.count_down_and_wait();
}
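// A minimal usage sketch for the two helpers above (not part of the original
// test driver): a latch initialized to N+1 is counted down by N spawned
// threads plus the launching thread, so no participant passes the barrier
// until all have arrived. The worker count, the name latch_usage_sketch, and
// the assumption that num_threads is a global atomic counter (reset before
// use) are illustrative only.
void latch_usage_sketch()
{
    std::size_t const n = 4;    // assumed number of worker threads
    num_threads = 0;            // assumed global atomic counter used above
    hpx::lcos::local::latch l(n + 1);

    std::vector<hpx::future<void> > results;
    for (std::size_t i = 0; i != n; ++i)
        results.push_back(hpx::async(&test_count_down_and_wait, std::ref(l)));

    // the launching thread contributes the final count_down
    wait_for_latch(l);

    hpx::wait_all(results);
    HPX_TEST_EQ(num_threads.load(), n);
}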
///////////////////////////////////////////////////////////////////////////////
// Per-NUMA-domain worker: initializes its partition of the three arrays,
// reports timer granularity (domain 0 only), synchronizes with the other
// domains on the latch, and then runs the four STREAM kernels (Copy, Scale,
// Add, Triad), returning the per-iteration timings.
template <typename Policy, typename Vector>
std::vector<std::vector<double> >
numa_domain_worker(std::size_t domain, Policy policy,
    hpx::lcos::local::latch& l, std::size_t part_size, std::size_t offset,
    std::size_t iterations, Vector& a, Vector& b, Vector& c)
{
    typedef typename Vector::iterator iterator;

    iterator a_begin = a.begin() + offset;
    iterator b_begin = b.begin() + offset;
    iterator c_begin = c.begin() + offset;

    iterator a_end = a_begin + part_size;
    iterator b_end = b_begin + part_size;
    iterator c_end = c_begin + part_size;

    // Initialize arrays
    hpx::parallel::fill(policy, a_begin, a_end, 1.0);
    hpx::parallel::fill(policy, b_begin, b_end, 2.0);
    hpx::parallel::fill(policy, c_begin, c_end, 0.0);

    // Scale a in place; in debug builds also verify that each element was
    // placed on memory local to the worker thread touching it.
    double t = mysecond();
    hpx::parallel::for_each(policy, a_begin, a_end,
        [&policy](STREAM_TYPE& v)
        {
            v = 2.0 * v;

#if defined(HPX_DEBUG)
            // make sure memory was placed appropriately
            hpx::threads::topology& topo = retrieve_topology();
            hpx::threads::mask_cref_type mem_mask =
                topo.get_thread_affinity_mask_from_lva(
                    reinterpret_cast<hpx::naming::address_type>(&v));

            typedef typename Policy::executor_type executor_type;
            typedef hpx::parallel::executor_information_traits<executor_type>
                traits;

            std::size_t thread_num = hpx::get_worker_thread_num();
            hpx::threads::mask_cref_type thread_mask =
                traits::get_pu_mask(policy.executor(), topo, thread_num);

            HPX_ASSERT(hpx::threads::mask_size(mem_mask) ==
                hpx::threads::mask_size(thread_mask));
            HPX_ASSERT(hpx::threads::bit_and(mem_mask, thread_mask,
                hpx::threads::mask_size(mem_mask)));
#endif
        });
    t = 1.0E6 * (mysecond() - t);

    if (domain == 0)
    {
        // Get initial value for system clock.
        int quantum = checktick();
        if (quantum >= 1)
        {
            std::cout << "Your clock granularity/precision appears to be "
                << quantum << " microseconds.\n";
        }
        else
        {
            std::cout << "Your clock granularity appears to be less than one "
                   "microsecond.\n";
            quantum = 1;
        }

        std::cout
            << "Each test below will take on the order of "
            << (int) t << " microseconds.\n"
            << "   (= " << (int) (t / quantum) << " clock ticks)\n"
            << "Increase the size of the arrays if this shows that\n"
            << "you are not getting at least 20 clock ticks per test.\n"
            << "-------------------------------------------------------------\n";

        std::cout
            << "WARNING -- The above is only a rough guideline.\n"
            << "For best results, please be sure you know the\n"
            << "precision of your system timer.\n"
            << "-------------------------------------------------------------\n";
    }

    // synchronize across NUMA domains
    l.count_down_and_wait();

    ///////////////////////////////////////////////////////////////////////////
    // Main Loop
    std::vector<std::vector<double> > timing(4, std::vector<double>(iterations));

    double scalar = 3.0;
    for (std::size_t iteration = 0; iteration != iterations; ++iteration)
    {
        // Copy: c = a
        timing[0][iteration] = mysecond();
        hpx::parallel::copy(policy, a_begin, a_end, c_begin);
        timing[0][iteration] = mysecond() - timing[0][iteration];

        // Scale: b = scalar * c
        timing[1][iteration] = mysecond();
        hpx::parallel::transform(policy, c_begin, c_end, b_begin,
            [scalar](STREAM_TYPE val)
            {
                return scalar * val;
            });
        timing[1][iteration] = mysecond() - timing[1][iteration];

        // Add: c = a + b
        timing[2][iteration] = mysecond();
        hpx::parallel::transform(policy, a_begin, a_end, b_begin, b_end,
            c_begin,
            [](STREAM_TYPE val1, STREAM_TYPE val2)
            {
                return val1 + val2;
            });
        timing[2][iteration] = mysecond() - timing[2][iteration];

        // Triad: a = b + scalar * c
        timing[3][iteration] = mysecond();
        hpx::parallel::transform(policy, b_begin, b_end, c_begin, c_end,
            a_begin,
            [scalar](STREAM_TYPE val1, STREAM_TYPE val2)
            {
                return val1 + scalar * val2;
            });
        timing[3][iteration] = mysecond() - timing[3][iteration];
    }

    return timing;
}
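// A minimal driver sketch (not the benchmark's actual hpx_main): it splits the
// three shared arrays evenly across a given number of NUMA domains, runs
// numa_domain_worker once per domain, and relies on the shared latch so the
// timed main loops start together. The function name run_stream_sketch, the
// even partitioning, and the use of the plain hpx::parallel::par policy
// (instead of NUMA-bound executors) are illustrative assumptions only.
std::vector<std::vector<std::vector<double> > > run_stream_sketch(
    std::size_t num_domains, std::size_t size, std::size_t iterations)
{
    typedef std::vector<STREAM_TYPE> vector_type;

    vector_type a(size), b(size), c(size);

    // one count per domain: all workers are released together once each has
    // called count_down_and_wait()
    hpx::lcos::local::latch l(num_domains);

    std::size_t part_size = size / num_domains;    // assumes an even split

    std::vector<hpx::future<std::vector<std::vector<double> > > > workers;
    for (std::size_t domain = 0; domain != num_domains; ++domain)
    {
        std::size_t offset = domain * part_size;
        workers.push_back(hpx::async(
            [&, domain, offset]()
            {
                return numa_domain_worker(domain, hpx::parallel::par,
                    l, part_size, offset, iterations, a, b, c);
            }));
    }

    // collect the per-domain timing matrices
    std::vector<std::vector<std::vector<double> > > timings;
    for (std::size_t domain = 0; domain != num_domains; ++domain)
        timings.push_back(workers[domain].get());

    return timings;
}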