Beispiel #1
0
int main(int argc, char **argv)
#endif
{
  int i, niter, step;
  double mflops, t, tmax;
  logical verified;
  char class;
  double tsum[t_last+2], t1[t_last+2],
         tming[t_last+2], tmaxg[t_last+2];
  char *t_recs[t_last+2] = {
       "total", "rhs", "xsolve", "ysolve", "zsolve", 
       "bpack", "exch", "xcomm", "ycomm", "zcomm",
       " totcomp", " totcomm" };

  //---------------------------------------------------------------------
  // Root node reads input file (if it exists) else takes
  // defaults from parameters
  //---------------------------------------------------------------------
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL-MD) - SP Benchmark\n\n");

  FILE *fp;
  fp = fopen("timer.flag", "r");
  timeron = false;
  if (fp != NULL) {
    timeron = true;
    fclose(fp);
  }

  if ((fp = fopen("inputsp.data", "r")) != NULL) {
    int result;
    printf(" Reading from input file inputsp.data\n");
    result = fscanf(fp, "%d", &niter);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%*f");
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%d%d%d", &grid_points[0], &grid_points[1], 
                                  &grid_points[2]);
    fclose(fp);
  } else {
    printf(" No input file inputsp.data. Using compiled defaults\n");
    niter = NITER_DEFAULT;
    grid_points[0] = PROBLEM_SIZE;
    grid_points[1] = PROBLEM_SIZE;
    grid_points[2] = PROBLEM_SIZE;
  }

  setup_opencl(argc, argv);

  printf(" Size: %4dx%4dx%4d\n", 
      grid_points[0], grid_points[1], grid_points[2]);
  printf(" Iterations: %4d", niter);
  if (num_devices != MAXCELLS*MAXCELLS) 
    printf(" WARNING: compiled for %5d devices \n", MAXCELLS*MAXCELLS);
  printf(" Number of active devices: %5d\n\n", num_devices);

  make_set();

  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  set_constants();

  initialize();

  lhsinit();

  exact_rhs();

  compute_buffer_size(5);

  set_kernel_args();

  //---------------------------------------------------------------------
  // do one time step to touch all code, and reinitialize
  //---------------------------------------------------------------------
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  // set cmd queue property
  for(i = 0; i < num_devices; i++) {
  	clSetCommandQueueProperty(cmd_queue[i], 
			CL_QUEUE_AUTO_DEVICE_SELECTION | 
			//CL_QUEUE_ITERATIVE | 
			CL_QUEUE_COMPUTE_INTENSIVE,
			true,
			NULL);
  }
#endif
  adi();
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  for(i = 0; i < num_devices; i++) {
  	clSetCommandQueueProperty(cmd_queue[i], 
			0,
			true,
			NULL);
  }
#endif

  initialize();

  //---------------------------------------------------------------------
  // Synchronize before placing time stamp
  //---------------------------------------------------------------------
  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  timer_clear(0);
  timer_start(0);

  for (step = 1; step <= niter; step++) {

    if ((step % 20) == 0 || step == 1) {
      printf(" Time step %4d\n", step);
    }

    adi();

  }

  timer_stop(0);
  t = timer_read(0);

  verify(niter, &class, &verified);

  tmax = t;

  if( tmax != 0.0 ) {
    mflops = (881.174*(double)( PROBLEM_SIZE*PROBLEM_SIZE*PROBLEM_SIZE )
             -4683.91*(double)( PROBLEM_SIZE*PROBLEM_SIZE )
             +11484.5*(double)( PROBLEM_SIZE )
             -19272.4) * (double)( niter ) / (tmax*1000000.0);
  } else {
    mflops = 0.0;
  }

  c_print_results("SP", class, grid_points[0], 
      grid_points[1], grid_points[2], niter,
      tmax, mflops, "          floating point", 
      verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, 
      CS6, CS7, clu_GetDeviceTypeName(device_type), device_name, num_devices);

  if (timeron) {
/*
    for (i = 0; i < t_last; i++) {
      t1[i] = timer_read(i);
    }
    t1[t_xsolve] = t1[t_xsolve] - t1[t_xcomm];
    t1[t_ysolve] = t1[t_ysolve] - t1[t_ycomm];
    t1[t_zsolve] = t1[t_zsolve] - t1[t_zcomm];
    t1[t_last+2] = t1[t_xcomm]+t1[t_ycomm]+t1[t_zcomm]+t1[t_exch];
    t1[t_last+1] = t1[t_total]  - t1[t_last+2];

    MPI_Reduce(&t1, tsum,  t_last+2, dp_type, MPI_SUM, 0, comm_setup);
    MPI_Reduce(&t1, tming, t_last+2, dp_type, MPI_MIN, 0, comm_setup);
    MPI_Reduce(&t1, tmaxg, t_last+2, dp_type, MPI_MAX, 0, comm_setup);

    if (node == 0) {
      printf(" nprocs =%6d           minimum     maximum     average\n",
          total_nodes);
      for (i = 0; i < t_last+2; i++) {
        tsum[i] = tsum[i] / total_nodes;
          printf(" timer %2d(%8s) :  %10.4f  %10.4f  %10.4f\n",
              i+1, t_recs[i], tming[i], tmaxg[i], tsum[i]);
      }
    }
*/
  }

  release_opencl();

  return 0;
}
Beispiel #2
0
void sph_simulation::simulate_single_frame(particle* in_particles,
                                           particle* out_particles) {
  // Calculate the optimal size for workgroups

  // Start groups size at their maximum, make them smaller if necessary
  // Optimally parameters.particles_count should be devisible by
  // CL_DEVICE_MAX_WORK_GROUP_SIZE
  // Refer to CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
  unsigned int size_of_groups =
      running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
  while (parameters.particles_count % size_of_groups != 0) {
    size_of_groups /= 2;
  }

  // Make sure that the workgroups are small enough and that the particle data
  // will fit in local memory
  assert(size_of_groups <=
         running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
  assert(size_of_groups * sizeof(particle) <=
         running_device->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>());

  // Initial transfer to the GPU
  check_cl_error(queue_.enqueueWriteBuffer(
      front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      in_particles));

  // Recalculate the boundaries of the grid since the particles probably moved
  // since the last frame.
  cl_float min_x, max_x, min_y, max_y, min_z, max_z;
  cl_float grid_cell_side_length = (parameters.h * 2);

  min_x = min_y = min_z = std::numeric_limits<cl_int>::max();
  max_x = max_y = max_z = std::numeric_limits<cl_int>::min();

  for (size_t i = 0; i < parameters.particles_count; ++i) {
    cl_float3 pos = in_particles[i].position;

    if (pos.s[0] < min_x) min_x = pos.s[0];
    if (pos.s[1] < min_y) min_y = pos.s[1];
    if (pos.s[2] < min_z) min_z = pos.s[2];

    if (pos.s[0] > max_x) max_x = pos.s[0];
    if (pos.s[1] > max_y) max_y = pos.s[1];
    if (pos.s[2] > max_z) max_z = pos.s[2];
  }

  // Add or subtracts a cell length to all sides to create a padding layer
  // This simplifies calculations further down the line
  min_x -= grid_cell_side_length * 2;
  min_y -= grid_cell_side_length * 2;
  min_z -= grid_cell_side_length * 2;

  max_x += grid_cell_side_length * 2;
  max_y += grid_cell_side_length * 2;
  max_z += grid_cell_side_length * 2;

  parameters.min_point.s[0] = min_x;
  parameters.min_point.s[1] = min_y;
  parameters.min_point.s[2] = min_z;

  parameters.max_point.s[0] = max_x;
  parameters.max_point.s[1] = max_y;
  parameters.max_point.s[2] = max_z;

  parameters.grid_size_x =
      static_cast<cl_uint>((max_x - min_x) / grid_cell_side_length);
  parameters.grid_size_y =
      static_cast<cl_uint>((max_y - min_y) / grid_cell_side_length);
  parameters.grid_size_z =
      static_cast<cl_uint>((max_z - min_z) / grid_cell_side_length);

  // The Z-curve uses interleaving of bits in a uint to caculate the index.
  // This means we have floor(32/dimension_count) bits to represent each
  // dimension.
  assert(parameters.grid_size_x < 1024);
  assert(parameters.grid_size_y < 1024);
  assert(parameters.grid_size_z < 1024);

  parameters.grid_cell_count = get_grid_index_z_curve(
      parameters.grid_size_x, parameters.grid_size_y, parameters.grid_size_z);

  // Locate each particle in the grid and build the grid count table
  unsigned int* cell_table = new unsigned int[parameters.grid_cell_count];

  set_kernel_args(kernel_locate_in_grid_, front_buffer_, back_buffer_,
                  parameters);

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_locate_in_grid_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  check_cl_error(queue_.enqueueReadBuffer(
      back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  sort_particles(out_particles, back_buffer_, front_buffer_, cell_table);

  cl::Buffer cell_table_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(unsigned int) * parameters.grid_cell_count);

  check_cl_error(queue_.enqueueWriteBuffer(
      cell_table_buffer, CL_TRUE, 0,
      sizeof(unsigned int) * parameters.grid_cell_count, cell_table));

  check_cl_error(queue_.enqueueWriteBuffer(
      front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  // Compute the density and the pressure term at every particle.
  check_cl_error(kernel_density_pressure_.setArg(0, front_buffer_));
  check_cl_error(kernel_density_pressure_.setArg(
      1, size_of_groups * sizeof(particle),
      nullptr));  // Declare local memory in arguments
  check_cl_error(kernel_density_pressure_.setArg(2, back_buffer_));
  check_cl_error(kernel_density_pressure_.setArg(3, parameters));
  check_cl_error(kernel_density_pressure_.setArg(4, precomputed_terms));
  check_cl_error(kernel_density_pressure_.setArg(5, cell_table_buffer));

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_density_pressure_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  cl::Buffer face_normals_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(float) * current_scene.face_normals.size());
  cl::Buffer vertices_buffer(context_,
                             CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                             sizeof(float) * current_scene.vertices.size());
  cl::Buffer indices_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(unsigned int) * current_scene.indices.size());

  check_cl_error(queue_.enqueueWriteBuffer(
      face_normals_buffer, CL_TRUE, 0,
      sizeof(float) * current_scene.face_normals.size(),
      current_scene.face_normals.data()));

  check_cl_error(
      queue_.enqueueWriteBuffer(vertices_buffer, CL_TRUE, 0,
                                sizeof(float) * current_scene.vertices.size(),
                                current_scene.vertices.data()));

  check_cl_error(queue_.enqueueWriteBuffer(
      indices_buffer, CL_TRUE, 0,
      sizeof(unsigned int) * current_scene.indices.size(),
      current_scene.indices.data()));

  // Compute the density-forces at every particle.
  set_kernel_args(kernel_forces_, back_buffer_, front_buffer_, parameters,
                  precomputed_terms, cell_table_buffer);

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_forces_, cl::NullRange, cl::NDRange(parameters.particles_count),
      cl::NDRange(size_of_groups)));

  // Advect particles and resolve collisions with scene geometry.
  set_kernel_args(kernel_advection_collision_, front_buffer_, back_buffer_,
                  parameters, precomputed_terms, cell_table_buffer,
                  face_normals_buffer, vertices_buffer, indices_buffer,
                  current_scene.face_count);

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_advection_collision_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  check_cl_error(queue_.enqueueReadBuffer(
      back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  delete[] cell_table;
}
 static void set(const constant_vector<T> &rv,
                 backend::kernel &kernel, unsigned device, size_t index_offset,
                 detail::kernel_generator_state_ptr state)
 {
     set_kernel_args(rv.v, kernel, device, index_offset, state);
 }
Beispiel #4
0
/**
 * @brief Sorts the particles according to their grid index using Radix Sort
 *
 * @param[in,out] particles      The array that contains the actual particle
 *data
 * @param[in] first_buffer       The first OpenCL buffer used
 * @param[in] second_buffer      The second OpenCL buffer used
 * @param[out] cell_table        The array that contains the start indexes of
 *the cell in the sorted array
 *
 */
void sph_simulation::sort_particles(particle* particles,
                                    cl::Buffer& first_buffer,
                                    cl::Buffer& second_buffer,
                                    unsigned int* cell_table) {
  cl::Buffer* current_input_buffer = &first_buffer;
  cl::Buffer* current_output_buffer = &second_buffer;

  for (int pass_number = 0; pass_number < 4; ++pass_number) {
    unsigned int zero = 0;
    check_cl_error(queue_.enqueueFillBuffer(
        sort_count_buffer_, zero, 0,
        kSortThreadCount * kBucketCount * sizeof(int)));

    set_kernel_args(kernel_sort_count_, *current_input_buffer,
                    sort_count_buffer_, parameters, kSortThreadCount,
                    pass_number, kRadixWidth);

    check_cl_error(queue_.enqueueNDRangeKernel(
        kernel_sort_count_, cl::NullRange, cl::NDRange(kSortThreadCount),
        cl::NullRange));

    check_cl_error(queue_.enqueueReadBuffer(
        sort_count_buffer_, CL_TRUE, 0,
        sizeof(unsigned int) * kSortThreadCount * kBucketCount,
        sort_count_array_.data()));

    unsigned int running_count = 0;
    for (int i = 0; i < kSortThreadCount * kBucketCount; ++i) {
      unsigned int tmp = sort_count_array_[i];
      sort_count_array_[i] = running_count;
      running_count += tmp;
    }

    check_cl_error(queue_.enqueueWriteBuffer(
        sort_count_buffer_, CL_TRUE, 0,
        sizeof(unsigned int) * kSortThreadCount * kBucketCount,
        sort_count_array_.data()));

    set_kernel_args(kernel_sort_, *current_input_buffer, *current_output_buffer,
                    sort_count_buffer_, parameters, kSortThreadCount,
                    pass_number, kRadixWidth);

    check_cl_error(queue_.enqueueNDRangeKernel(kernel_sort_, cl::NullRange,
                                               cl::NDRange(kSortThreadCount),
                                               cl::NullRange));

    cl::Buffer* tmp = current_input_buffer;
    current_input_buffer = current_output_buffer;
    current_output_buffer = tmp;
  }

  check_cl_error(queue_.enqueueReadBuffer(
      *current_input_buffer, CL_TRUE, 0,
      sizeof(particle) * parameters.particles_count, particles));

  // Build the cell table by computing the cumulative sum at every cell.
  unsigned int current_index = 0;
  for (unsigned int i = 0; i < parameters.grid_cell_count; ++i) {
    cell_table[i] = current_index;
    while (current_index != parameters.particles_count &&
           particles[current_index].grid_index == i) {
      current_index++;
    }
  }
}