Esempio n. 1
0
int main(int argc, char *argv[])
{
    size_t size = 1000;
    if(argc >= 2){
        size = boost::lexical_cast<size_t>(argv[1]);
    }

    std::cout << "size: " << size << std::endl;

    // setup context and queue for the default device
    boost::compute::device device = boost::compute::system::default_device();

    boost::compute::context context(device);
    boost::compute::command_queue queue(context, device);

    // create vector of random numbers on the host
    std::vector<float> host_x(size);
    std::vector<float> host_y(size);
    std::generate(host_x.begin(), host_x.end(), rand_float);
    std::generate(host_y.begin(), host_y.end(), rand_float);

    // create vector on the device and copy the data
    boost::compute::vector<float> device_x(host_x.begin(), host_x.end(), context);
    boost::compute::vector<float> device_y(host_y.begin(), host_y.end(), context);

    boost::compute::detail::timer t;
    boost::compute::blas::axpy(static_cast<int>(size), 2.5f, &device_x[0], 1, &device_y[0], 1, queue);
    queue.finish();
    std::cout << "time: " << t.elapsed() << " ms" << std::endl;

    // perform saxpy on host
    serial_saxpy(size, 2.5f, &host_x[0], &host_y[0]);

    // copy device_y to host_x
    boost::compute::copy(device_y.begin(), device_y.end(), host_x.begin(), queue);

    for(size_t i = 0; i < size; i++){
        float host_value = host_y[i];
        float device_value = host_x[i];

        if(std::abs(device_value - host_value) > 1e-3){
            std::cout << "ERROR: "
                      << "value at " << i << " "
                      << "device_value (" << device_value << ") "
                      << "!= "
                      << "host_value (" << host_value << ")"
                      << std::endl;
            return -1;
        }
    }

    return 0;
}
Esempio n. 2
0
int main() {

  constexpr int N = 1024 * 1024 * 256;
  constexpr float a = 100.0f;

  std::vector<float> host_x(N);
  std::vector<float> host_y(N);

  // initialize the input data
  std::default_random_engine random_gen;
  std::uniform_real_distribution<float> distribution(-N, N);
  std::generate(host_x.begin(), host_x.end(), [&]() { return distribution(random_gen); });
  std::generate(host_y.begin(), host_y.end(), [&]() { return distribution(random_gen); });

  // CPU implementation of saxpy
  std::vector<float> host_result_y(N);
  for (int i = 0; i < N; i++) {
    host_result_y[i] = a * host_x[i] + host_y[i];
  }
  
  std::vector<hc::accelerator> all_accelerators = hc::accelerator::get_all();
  std::vector<hc::accelerator> accelerators;
  for (auto a = all_accelerators.begin(); a != all_accelerators.end(); a++) {

    // only pick accelerators supported by the HSA runtime
    if (a->is_hsa_accelerator()) {
      accelerators.push_back(*a);
    }
  }

  constexpr int numViewPerAcc = 2;
  int numSaxpyPerView = N/(accelerators.size() * numViewPerAcc);

  std::vector<hc::accelerator_view> acc_views;
  std::vector<hc::array_view<float,1>> x_views;
  std::vector<hc::array_view<float,1>> y_views;
  std::vector<hc::completion_future> futures;

  int dataCursor = 0;
  for (auto acc = accelerators.begin(); acc != accelerators.end(); acc++) {
    for (int i = 0; i < numViewPerAcc; i++) {

      // create a new accelerator_view
      acc_views.push_back(acc->create_view());

      // create array_views that only covers the data portion needed by this accelerator_view
      x_views.push_back(hc::array_view<float,1>(numSaxpyPerView, host_x.data() + dataCursor));
      y_views.push_back(hc::array_view<float,1>(numSaxpyPerView, host_y.data() + dataCursor));
      dataCursor+=numSaxpyPerView;


      auto& x_av = x_views.back();
      auto& y_av = y_views.back();
      hc::completion_future f;
      f = hc::parallel_for_each(acc_views.back(), x_av.get_extent()
                            , [=](hc::index<1> i) [[hc]] {
        y_av[i] = a * x_av[i] + y_av[i];
      });
      futures.push_back(f);


      //printf("dataCursor: %d\n",dataCursor);
    }
  }

  // If N is not a multiple of the number of acc_views,
  // calculate the remaining saxpy on the host
  for (; dataCursor!=N; dataCursor++) {
    host_y[dataCursor] = a * host_x[dataCursor] + host_y[dataCursor];
  }

  // synchronize all the results back to the host
  for(auto v = y_views.begin(); v != y_views.end(); v++) {
    v->synchronize();
  }
  
  // verify the results
  int errors = 0;
  for (int i = 0; i < N; i++) {
    if (fabs(host_y[i] - host_result_y[i]) > fabs(host_result_y[i] * 0.0001f))
      errors++;
  }
  std::cout << errors << " errors" << std::endl;

  return errors;
}
Esempio n. 3
0
int main(int argc, char **argv)
{
  Kokkos::initialize();

  try {

    // Create random field
    int M = 10;
    Teuchos::ParameterList solverParams;
    solverParams.set("Number of KL Terms", M);
    solverParams.set("Mean", 1.0);
    solverParams.set("Standard Deviation", 0.1);
    int ndim = 3;
    Teuchos::Array<double> domain_upper(ndim), domain_lower(ndim),
      correlation_length(ndim);
    for (int i=0; i<ndim; i++) {
      domain_upper[i] = 1.0;
      domain_lower[i] = 0.0;
      correlation_length[i] = 10.0;
    }
    solverParams.set("Domain Upper Bounds", domain_upper);
    solverParams.set("Domain Lower Bounds", domain_lower);
    solverParams.set("Correlation Lengths", correlation_length);
    Stokhos::KL::ExponentialRandomField<double> rf(solverParams);
    rf.print(std::cout);

    // Evaluate random field at a point
    Teuchos::Array<double> x(ndim);
    for (int i=0; i<ndim; i++)
      x[i] = (domain_upper[i] + domain_lower[i])/2.0 +
        0.1*(domain_upper[i] - domain_lower[i])/2.0;
    Teuchos::Array<double> rvar(M);
    for (int i=0; i<M; i++)
      rvar[i] = 1.5;
    double result = rf.evaluate(x, rvar);
    std::cout << "result (host)  = " << result << std::endl;

    // Evaluate random field in a functor on device
    typedef Kokkos::View<double*> view_type;
    typedef view_type::HostMirror host_view_type;
    view_type x_view("x", ndim);
    host_view_type host_x = Kokkos::create_mirror_view(x_view);
    for (int i=0; i<ndim; i++)
      host_x(i) = x[i];
    Kokkos::deep_copy(x_view, host_x);
    view_type rvar_view("rvar", M);
    host_view_type host_rvar = Kokkos::create_mirror_view(rvar_view);
    for (int i=0; i<M; i++)
      host_rvar(i) = rvar[i];
    Kokkos::deep_copy(rvar_view, host_rvar);
    RF<double> rf_func(rf, x_view, rvar_view);
    host_view_type host_y = Kokkos::create_mirror_view(rf_func.y);
    Kokkos::deep_copy(host_y, rf_func.y);
    double result2 = host_y(0);
    std::cout << "result (device)= " << result2 << std::endl;
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }

  Kokkos::finalize();
}
Esempio n. 4
0
void nbody_engine_cuda_bh_tex::fcompute(const nbcoord_t& t, const memory* _y, memory* _f)
{
	Q_UNUSED(t);
	const smemory*	y = dynamic_cast<const smemory*>(_y);
	smemory*		f = dynamic_cast<smemory*>(_f);

	if(y == NULL)
	{
		qDebug() << "y is not smemory";
		return;
	}

	if(f == NULL)
	{
		qDebug() << "f is not smemory";
		return;
	}

	advise_compute_count();

	size_t					count = m_data->get_count();
	std::vector<nbcoord_t>	host_y(y->size() / sizeof(nbcoord_t));
	std::vector<nbcoord_t>	host_mass(count);

	read_buffer(host_y.data(), y);
	read_buffer(host_mass.data(), m_mass);

	const nbcoord_t*	rx = host_y.data();
	const nbcoord_t*	ry = rx + count;
	const nbcoord_t*	rz = rx + 2 * count;
	const nbcoord_t*	mass = host_mass.data();

	nbody_space_heap	heap;
	heap.build(count, rx, ry, rz, mass, m_distance_to_node_radius_ratio);

	size_t			tree_size = heap.get_radius_sqr().size();

	if(m_dev_indites == NULL)
	{
		m_dev_tree_xyzr = dynamic_cast<smemory*>(create_buffer(tree_size * sizeof(nbcoord_t) * 4));
		m_dev_tree_mass = dynamic_cast<smemory*>(create_buffer(tree_size * sizeof(nbcoord_t)));
		m_dev_indites = dynamic_cast<smemory*>(create_buffer(tree_size * sizeof(int)));
	}

	const nbcoord_t*	dev_y = static_cast<const nbcoord_t*>(y->data());
	nbcoord_t*			dev_f = static_cast<nbcoord_t*>(f->data());
	int*				dev_indites = static_cast<int*>(m_dev_indites->data());

	static_assert(sizeof(vertex4<nbcoord_t>) == sizeof(nbcoord_t) * 4,
				  "sizeof(vertex4) must be equal to sizeof(nbcoord_t)*4");

	std::vector<vertex4<nbcoord_t>>	host_tree_xyzr(tree_size);
	std::vector<int>				host_indites(tree_size);

	#pragma omp parallel for
	for(size_t n = 0; n < tree_size; ++n)
	{
		host_tree_xyzr[n].x = heap.get_mass_center()[n].x;
		host_tree_xyzr[n].y = heap.get_mass_center()[n].y;
		host_tree_xyzr[n].z = heap.get_mass_center()[n].z;
		host_tree_xyzr[n].w = heap.get_radius_sqr()[n];
		host_indites[n] = static_cast<int>(heap.get_body_n()[n]);
	}

	write_buffer(m_dev_tree_xyzr, host_tree_xyzr.data());
	write_buffer(m_dev_tree_mass, heap.get_mass().data());
	write_buffer(m_dev_indites, host_indites.data());

	if(m_tree_layout == etl_heap)
	{
		fcompute_heap_bh_tex(0, static_cast<int>(count), static_cast<int>(tree_size), dev_f,
							 m_dev_tree_xyzr->tex(4), m_dev_tree_mass->tex(),
							 dev_indites, get_block_size());
	}
	else if(m_tree_layout == etl_heap_stackless)
	{
		fcompute_heap_bh_stackless(0, static_cast<int>(count), static_cast<int>(tree_size), dev_f,
								   m_dev_tree_xyzr->tex(4), m_dev_tree_mass->tex(),
								   dev_indites, get_block_size());
	}

	fcompute_xyz(dev_y, dev_f, static_cast<int>(count), get_block_size());
}