예제 #1
0
int main(int argc, char **argv) {
  read_args(argc, argv);
  counters timer;
  start_measure(timer);

  // declarations
  Complex ioB(1.0, 1.0);
  ioBuffer = cl::sycl::buffer<Complex,2>(cl::sycl::range<2> {M, N});
  ioABuffer = cl::sycl::buffer<Complex,2>(cl::sycl::range<2> {M, N}); 
  ioBBuffer = cl::sycl::buffer<Complex,1>(&ioB, cl::sycl::range<1> {1}); 

  // initialization
  for (size_t i = 0; i < M; ++i){
    for (size_t j = 0; j < N; ++j){
      float tmp = (float) (i*(j+2) + 10) / N;
      Complex value(tmp, tmp);
      cl::sycl::id<2> id = {i, j};
      ioBuffer.get_access<cl::sycl::access::mode::write>()[id] = value;
      ioABuffer.get_access<cl::sycl::access::mode::write>()[id] = value;
    }
  }

  // our work
  coef_var2D<0, 0> c1;  
  coef_var2D<1, 0> c2;
  coef_var2D<0, 1> c3;
  coef_var2D<-1, 0> c4;
  coef_var2D<0, -1> c5;

  auto st = c1+c2+c3+c4+c5;
  input_var2D<Complex, &ioABuffer, &ioBBuffer, &fdl_in, &fac> work_in;
  output_2D<Complex, &ioBuffer, &fdl_out> work_out;
  auto op_work = work_out << st << work_in;

  auto st_id = c1.toStencil();
  input_var2D<Complex, &ioBuffer, &ioBBuffer, &fdl_in, &fac_id> copy_in;
  output_2D<Complex, &ioABuffer, &fdl_out> copy_out;
  auto op_copy = copy_out << st_id << copy_in;

  end_init(timer);
  auto begin_op = counters::clock_type::now();

  // compute result with "gpu"
  {   
    cl::sycl::queue myQueue; 
    for (unsigned int i = 0; i < NB_ITER; ++i){      
      //op_work.doComputation(myQueue);
      op_work.doLocalComputation(myQueue);
      op_copy.doComputation(myQueue);
    }
  }

  auto end_op = counters::clock_type::now();
  timer.stencil_time = std::chrono::duration_cast<counters::duration_type>(end_op - begin_op);
  // loading time is not watched
  end_measure(timer);

  return 0;
}
예제 #2
0
int main(int argc, char **argv) {
  read_args(argc, argv);
  struct counters timer;
  start_measure(timer);

  // declarations
  float tab_var = 1.0;
  float *ioB = &tab_var;
  ioBuffer = cl::sycl::buffer<float,2>(cl::sycl::range<2> {M, N});
  ioABuffer = cl::sycl::buffer<float,2>(cl::sycl::range<2> {M, N}); 
  ioBBuffer = cl::sycl::buffer<float,1>(ioB, cl::sycl::range<1> {1}); 
#if DEBUG_STENCIL
  float *a_test = (float *) malloc(sizeof(float)*M*N);
  float *b_test = (float *) malloc(sizeof(float)*M*N);
#endif

  // initialization
  for (size_t i = 0; i < M; ++i){
    for (size_t j = 0; j < N; ++j){
      float value = ((float) i*(j+2) + 10) / N;
      cl::sycl::id<2> id = {i, j};
      ioBuffer.get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>()[id] = value;
      ioABuffer.get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>()[id] = value;
#if DEBUG_STENCIL
      a_test[i*N+j] = value;
      b_test[i*N+j] = value;
#endif
    }
  }

  // our work
  coef_var2D<0, 0> c1;  
  coef_var2D<1, 0> c2;
  coef_var2D<0, 1> c3;
  coef_var2D<-1, 0> c4;
  coef_var2D<0, -1> c5;

  auto st = c1+c2+c3+c4+c5;
  input_var2D<float, &ioABuffer, &ioBBuffer, &fdl_in, &fac> work_in;
  output_2D<float, &ioBuffer, &fdl_out> work_out;
  auto op_work = work_out << st << work_in;

  auto st_id = c1.toStencil();
  input_var2D<float, &ioBuffer, &ioBBuffer, &fdl_in, &fac_id> copy_in;
  output_2D<float, &ioABuffer, &fdl_out> copy_out;
  auto op_copy = copy_out << st_id << copy_in;

  end_init(timer);
  struct op_time time_op;
  begin_op(time_op);

  // compute result with "gpu"
  {   
    cl::sycl::queue myQueue; 
    for (unsigned int i = 0; i < NB_ITER; ++i){      
      //op_work.doComputation(myQueue);
      op_work.doLocalComputation(myQueue);
      op_copy.doComputation(myQueue);
    }
  }

  end_op(time_op, timer.stencil_time);
  // loading time is not watched
  end_measure(timer);

#if DEBUG_STENCIL
  // get the gpu result
  auto C = (ioABuffer).get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
  ute_and_are(a_test,b_test,C);
  free(a_test);
  free(b_test);
#endif

  return 0;
}