int main(int argc, char *argv[]) { size_t size = 1000; if(argc >= 2){ size = boost::lexical_cast<size_t>(argv[1]); } std::cout << "size: " << size << std::endl; // setup context and queue for the default device boost::compute::device device = boost::compute::system::default_device(); boost::compute::context context(device); boost::compute::command_queue queue(context, device); // create vector of random numbers on the host std::vector<float> host_x(size); std::vector<float> host_y(size); std::generate(host_x.begin(), host_x.end(), rand_float); std::generate(host_y.begin(), host_y.end(), rand_float); // create vector on the device and copy the data boost::compute::vector<float> device_x(host_x.begin(), host_x.end(), context); boost::compute::vector<float> device_y(host_y.begin(), host_y.end(), context); boost::compute::detail::timer t; boost::compute::blas::axpy(static_cast<int>(size), 2.5f, &device_x[0], 1, &device_y[0], 1, queue); queue.finish(); std::cout << "time: " << t.elapsed() << " ms" << std::endl; // perform saxpy on host serial_saxpy(size, 2.5f, &host_x[0], &host_y[0]); // copy device_y to host_x boost::compute::copy(device_y.begin(), device_y.end(), host_x.begin(), queue); for(size_t i = 0; i < size; i++){ float host_value = host_y[i]; float device_value = host_x[i]; if(std::abs(device_value - host_value) > 1e-3){ std::cout << "ERROR: " << "value at " << i << " " << "device_value (" << device_value << ") " << "!= " << "host_value (" << host_value << ")" << std::endl; return -1; } } return 0; }
int main() { constexpr int N = 1024 * 1024 * 256; constexpr float a = 100.0f; std::vector<float> host_x(N); std::vector<float> host_y(N); // initialize the input data std::default_random_engine random_gen; std::uniform_real_distribution<float> distribution(-N, N); std::generate(host_x.begin(), host_x.end(), [&]() { return distribution(random_gen); }); std::generate(host_y.begin(), host_y.end(), [&]() { return distribution(random_gen); }); // CPU implementation of saxpy std::vector<float> host_result_y(N); for (int i = 0; i < N; i++) { host_result_y[i] = a * host_x[i] + host_y[i]; } std::vector<hc::accelerator> all_accelerators = hc::accelerator::get_all(); std::vector<hc::accelerator> accelerators; for (auto a = all_accelerators.begin(); a != all_accelerators.end(); a++) { // only pick accelerators supported by the HSA runtime if (a->is_hsa_accelerator()) { accelerators.push_back(*a); } } constexpr int numViewPerAcc = 2; int numSaxpyPerView = N/(accelerators.size() * numViewPerAcc); std::vector<hc::accelerator_view> acc_views; std::vector<hc::array_view<float,1>> x_views; std::vector<hc::array_view<float,1>> y_views; std::vector<hc::completion_future> futures; int dataCursor = 0; for (auto acc = accelerators.begin(); acc != accelerators.end(); acc++) { for (int i = 0; i < numViewPerAcc; i++) { // create a new accelerator_view acc_views.push_back(acc->create_view()); // create array_views that only covers the data portion needed by this accelerator_view x_views.push_back(hc::array_view<float,1>(numSaxpyPerView, host_x.data() + dataCursor)); y_views.push_back(hc::array_view<float,1>(numSaxpyPerView, host_y.data() + dataCursor)); dataCursor+=numSaxpyPerView; auto& x_av = x_views.back(); auto& y_av = y_views.back(); hc::completion_future f; f = hc::parallel_for_each(acc_views.back(), x_av.get_extent() , [=](hc::index<1> i) [[hc]] { y_av[i] = a * x_av[i] + y_av[i]; }); futures.push_back(f); //printf("dataCursor: %d\n",dataCursor); } } // If N is not a multiple of the number of acc_views, // calculate the remaining saxpy on the host for (; dataCursor!=N; dataCursor++) { host_y[dataCursor] = a * host_x[dataCursor] + host_y[dataCursor]; } // synchronize all the results back to the host for(auto v = y_views.begin(); v != y_views.end(); v++) { v->synchronize(); } // verify the results int errors = 0; for (int i = 0; i < N; i++) { if (fabs(host_y[i] - host_result_y[i]) > fabs(host_result_y[i] * 0.0001f)) errors++; } std::cout << errors << " errors" << std::endl; return errors; }
int main(int argc, char **argv) { Kokkos::initialize(); try { // Create random field int M = 10; Teuchos::ParameterList solverParams; solverParams.set("Number of KL Terms", M); solverParams.set("Mean", 1.0); solverParams.set("Standard Deviation", 0.1); int ndim = 3; Teuchos::Array<double> domain_upper(ndim), domain_lower(ndim), correlation_length(ndim); for (int i=0; i<ndim; i++) { domain_upper[i] = 1.0; domain_lower[i] = 0.0; correlation_length[i] = 10.0; } solverParams.set("Domain Upper Bounds", domain_upper); solverParams.set("Domain Lower Bounds", domain_lower); solverParams.set("Correlation Lengths", correlation_length); Stokhos::KL::ExponentialRandomField<double> rf(solverParams); rf.print(std::cout); // Evaluate random field at a point Teuchos::Array<double> x(ndim); for (int i=0; i<ndim; i++) x[i] = (domain_upper[i] + domain_lower[i])/2.0 + 0.1*(domain_upper[i] - domain_lower[i])/2.0; Teuchos::Array<double> rvar(M); for (int i=0; i<M; i++) rvar[i] = 1.5; double result = rf.evaluate(x, rvar); std::cout << "result (host) = " << result << std::endl; // Evaluate random field in a functor on device typedef Kokkos::View<double*> view_type; typedef view_type::HostMirror host_view_type; view_type x_view("x", ndim); host_view_type host_x = Kokkos::create_mirror_view(x_view); for (int i=0; i<ndim; i++) host_x(i) = x[i]; Kokkos::deep_copy(x_view, host_x); view_type rvar_view("rvar", M); host_view_type host_rvar = Kokkos::create_mirror_view(rvar_view); for (int i=0; i<M; i++) host_rvar(i) = rvar[i]; Kokkos::deep_copy(rvar_view, host_rvar); RF<double> rf_func(rf, x_view, rvar_view); host_view_type host_y = Kokkos::create_mirror_view(rf_func.y); Kokkos::deep_copy(host_y, rf_func.y); double result2 = host_y(0); std::cout << "result (device)= " << result2 << std::endl; } catch (std::exception& e) { std::cout << e.what() << std::endl; } Kokkos::finalize(); }
void nbody_engine_cuda_bh_tex::fcompute(const nbcoord_t& t, const memory* _y, memory* _f) { Q_UNUSED(t); const smemory* y = dynamic_cast<const smemory*>(_y); smemory* f = dynamic_cast<smemory*>(_f); if(y == NULL) { qDebug() << "y is not smemory"; return; } if(f == NULL) { qDebug() << "f is not smemory"; return; } advise_compute_count(); size_t count = m_data->get_count(); std::vector<nbcoord_t> host_y(y->size() / sizeof(nbcoord_t)); std::vector<nbcoord_t> host_mass(count); read_buffer(host_y.data(), y); read_buffer(host_mass.data(), m_mass); const nbcoord_t* rx = host_y.data(); const nbcoord_t* ry = rx + count; const nbcoord_t* rz = rx + 2 * count; const nbcoord_t* mass = host_mass.data(); nbody_space_heap heap; heap.build(count, rx, ry, rz, mass, m_distance_to_node_radius_ratio); size_t tree_size = heap.get_radius_sqr().size(); if(m_dev_indites == NULL) { m_dev_tree_xyzr = dynamic_cast<smemory*>(create_buffer(tree_size * sizeof(nbcoord_t) * 4)); m_dev_tree_mass = dynamic_cast<smemory*>(create_buffer(tree_size * sizeof(nbcoord_t))); m_dev_indites = dynamic_cast<smemory*>(create_buffer(tree_size * sizeof(int))); } const nbcoord_t* dev_y = static_cast<const nbcoord_t*>(y->data()); nbcoord_t* dev_f = static_cast<nbcoord_t*>(f->data()); int* dev_indites = static_cast<int*>(m_dev_indites->data()); static_assert(sizeof(vertex4<nbcoord_t>) == sizeof(nbcoord_t) * 4, "sizeof(vertex4) must be equal to sizeof(nbcoord_t)*4"); std::vector<vertex4<nbcoord_t>> host_tree_xyzr(tree_size); std::vector<int> host_indites(tree_size); #pragma omp parallel for for(size_t n = 0; n < tree_size; ++n) { host_tree_xyzr[n].x = heap.get_mass_center()[n].x; host_tree_xyzr[n].y = heap.get_mass_center()[n].y; host_tree_xyzr[n].z = heap.get_mass_center()[n].z; host_tree_xyzr[n].w = heap.get_radius_sqr()[n]; host_indites[n] = static_cast<int>(heap.get_body_n()[n]); } write_buffer(m_dev_tree_xyzr, host_tree_xyzr.data()); write_buffer(m_dev_tree_mass, heap.get_mass().data()); write_buffer(m_dev_indites, host_indites.data()); if(m_tree_layout == etl_heap) { fcompute_heap_bh_tex(0, static_cast<int>(count), static_cast<int>(tree_size), dev_f, m_dev_tree_xyzr->tex(4), m_dev_tree_mass->tex(), dev_indites, get_block_size()); } else if(m_tree_layout == etl_heap_stackless) { fcompute_heap_bh_stackless(0, static_cast<int>(count), static_cast<int>(tree_size), dev_f, m_dev_tree_xyzr->tex(4), m_dev_tree_mass->tex(), dev_indites, get_block_size()); } fcompute_xyz(dev_y, dev_f, static_cast<int>(count), get_block_size()); }