Example 1
//------------------------------------------------------------------------------
void release_clenv(CLEnv& e) {
    check_cl_error(clReleaseCommandQueue(e.commandQueue),
                                         "clReleaseCommandQueue");
    check_cl_error(clReleaseKernel(e.kernel), "clReleaseKernel");
    check_cl_error(clReleaseProgram(e.program), "clReleaseProgram");
    check_cl_error(clReleaseContext(e.context), "clReleaseContext");
}
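The CLEnv struct and the check_cl_error helper used throughout these listings are not shown. A minimal sketch consistent with how they are called (field names and behaviour are assumptions, not the original definitions) could be:

#include <cstdlib>
#include <iostream>

struct CLEnv {
    cl_context       context;
    cl_command_queue commandQueue;
    cl_program       program;
    cl_kernel        kernel;
};

//abort with a message when an OpenCL call does not return CL_SUCCESS;
//the default argument also covers the single-argument calls made through
//the C++ wrapper in the examples below
void check_cl_error(cl_int status, const char* msg = "") {
    if(status != CL_SUCCESS) {
        std::cerr << "ERROR - " << msg << " (" << status << ')' << std::endl;
        exit(EXIT_FAILURE);
    }
}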
Example 2
void sph_simulation::simulate(int frame_count) {
  if (frame_count == 0) {
    frame_count = (int)ceil(parameters.simulation_time * parameters.target_fps);
  }

  cl_int cl_error;

  std::vector<cl::Device> device_array;
  check_cl_error(init_cl_single_device(&context_, device_array, "", "", true));

  queue_ = cl::CommandQueue(context_, device_array[0], 0, &cl_error);
  check_cl_error(cl_error);

  running_device = &device_array[0];

  std::string source = readKernelFile(BUFFER_KERNEL_FILE_NAME);
  cl::Program program;
  check_cl_error(make_program(&program, context_, device_array, source, true,
                              "-I ./kernels/ -I ./common/"));

  kernel_density_pressure_ = make_kernel(program, "density_pressure");
  kernel_advection_collision_ = make_kernel(program, "advection_collision");
  kernel_forces_ = make_kernel(program, "forces");
  kernel_locate_in_grid_ = make_kernel(program, "locate_in_grid");
  kernel_sort_count_ = make_kernel(program, "sort_count");
  kernel_sort_ = make_kernel(program, "sort");

  front_buffer_ =
      cl::Buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                 sizeof(particle) * parameters.particles_count);
  back_buffer_ = cl::Buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                            sizeof(particle) * parameters.particles_count);
  sort_count_buffer_ =
      cl::Buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                 sizeof(unsigned int) * kSortThreadCount * kBucketCount);

  particle* particles = new particle[parameters.particles_count];
  init_particles(particles, parameters);

  for (int i = 0; i < frame_count; ++i) {
    if (pre_frame) {
      pre_frame(particles, parameters, true);
    }

    for (int j = 0; (float)j < (1.f / parameters.simulation_scale); ++j) {
      if (pre_frame) pre_frame(particles, parameters, false);

      simulate_single_frame(particles, particles);

      if (post_frame) post_frame(particles, parameters, false);
    }

    if (post_frame) {
      post_frame(particles, parameters, true);
    }
  }
  delete[] particles;
}
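The particle type and helpers such as readKernelFile and init_particles are defined elsewhere. The host code in these examples only touches a position member of type cl_float3 and a grid_index member, so a hypothetical minimal layout would be the following (the real struct must match the one used by the OpenCL kernels and likely also carries velocity, density, pressure, etc.):

struct particle {
  cl_float3    position;    //read when recomputing the grid bounds each frame
  unsigned int grid_index;  //written by locate_in_grid, read to build cell_table
};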
Example 3
cl::Kernel make_kernel(cl::Program& p, const char* name) {
  cl_int cl_error;
  cl::Kernel k = cl::Kernel(p, name, &cl_error);
  check_cl_error(cl_error);

  return k;
}
Example 4
//------------------------------------------------------------------------------
double get_cl_time(cl_event ev) {
    cl_ulong startTime = cl_ulong(0);
    cl_ulong endTime   = cl_ulong(0);
    size_t retBytes = size_t(0);
    cl_int status = clGetEventProfilingInfo(ev,
                   CL_PROFILING_COMMAND_QUEUED,
                   sizeof(cl_ulong),
                   &startTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    status = clGetEventProfilingInfo(ev,
                   CL_PROFILING_COMMAND_END,
                   sizeof(cl_ulong),
                   &endTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    //event timing is reported in nanoseconds: divide by 1e6 to get
    //time in milliseconds
    return double((endTime - startTime) / 1E6);    
}
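Note that the interval above runs from CL_PROFILING_COMMAND_QUEUED to CL_PROFILING_COMMAND_END, so it includes queueing and submission overhead, not just execution time, and it is only valid if the queue was created with CL_QUEUE_PROFILING_ENABLE (the mains below request this through create_clenv's enableProfiling flag). A variant that measures only the execution time would query CL_PROFILING_COMMAND_START instead; a sketch (the name get_cl_exec_time is made up):

double get_cl_exec_time(cl_event ev) {
    cl_ulong startTime = 0, endTime = 0;
    size_t retBytes = 0;
    check_cl_error(clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                                           sizeof(cl_ulong), &startTime,
                                           &retBytes),
                   "clGetEventProfilingInfo");
    check_cl_error(clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                                           sizeof(cl_ulong), &endTime,
                                           &retBytes),
                   "clGetEventProfilingInfo");
    return double(endTime - startTime) / 1E6; //nanoseconds -> milliseconds
}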
Example 5
//------------------------------------------------------------------------------
cl_device_id get_device_id(cl_context ctx) {
    cl_device_id deviceID; 
    // retrieve actual device id from context
    cl_int status = clGetContextInfo(ctx,
                                     CL_CONTEXT_DEVICES,
                                     sizeof(cl_device_id),
                                     &deviceID, 0);
    check_cl_error(status, "clGetContextInfo");
    return deviceID;
}
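The helper above assumes the context owns exactly one device. When that is not guaranteed, the device count can be queried first; a sketch (the name get_device_ids is hypothetical):

std::vector<cl_device_id> get_device_ids(cl_context ctx) {
    cl_uint numDevices = 0;
    cl_int status = clGetContextInfo(ctx, CL_CONTEXT_NUM_DEVICES,
                                     sizeof(cl_uint), &numDevices, 0);
    check_cl_error(status, "clGetContextInfo");
    std::vector<cl_device_id> ids(numDevices);
    status = clGetContextInfo(ctx, CL_CONTEXT_DEVICES,
                              numDevices * sizeof(cl_device_id), &ids[0], 0);
    check_cl_error(status, "clGetContextInfo");
    return ids;
}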
Example 6
//------------------------------------------------------------------------------
double timeEnqueueNDRangeKernel(cl_command_queue command_queue,
                                cl_kernel kernel,
                                cl_uint work_dim,
                                const size_t *global_work_offset,
                                const size_t *global_work_size,
                                const size_t *local_work_size,
                                cl_uint num_events_in_wait_list,
                                const cl_event *event_wait_list) {
    cl_int status = clFinish(command_queue);
    check_cl_error(status, "clFinish");
    cl_event profilingEvent;
    status = clEnqueueNDRangeKernel(command_queue,
                                    kernel,              
                                    work_dim,
                                    global_work_offset,
                                    global_work_size,
                                    local_work_size,
                                    num_events_in_wait_list,
                                    event_wait_list,
                                    &profilingEvent);
    check_cl_error(status, "clEnqueueNDRangeKernel");
    status = clFinish(command_queue); //ensure kernel execution has
    //completed; needed for timing purposes only; there is no need to enforce
    //completion when issuing a subsequent blocking data transfer operation
    check_cl_error(status, "clFinish");
    status = clWaitForEvents(1, &profilingEvent);
    check_cl_error(status, "clWaitForEvents");
    cl_ulong kernelStartTime = cl_ulong(0);
    cl_ulong kernelEndTime   = cl_ulong(0);
    size_t retBytes = size_t(0);
    double kernelElapsedTime_ms = double(0); 
   
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_QUEUED,
                                     sizeof(cl_ulong),
                                     &kernelStartTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_END,
                                     sizeof(cl_ulong),
                                     &kernelEndTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    //event timing is reported in nanoseconds: divide by 1e6 to get
    //time in milliseconds
    kernelElapsedTime_ms =  (double)(kernelEndTime - kernelStartTime) / 1E6;
    return kernelElapsedTime_ms;
}
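A hypothetical call site for the helper above, assuming a CLEnv created with profiling enabled and a one-dimensional launch:

const size_t globalWorkSize[1] = {1024};
const size_t localWorkSize[1]  = {64};
const double ms = timeEnqueueNDRangeKernel(clenv.commandQueue,
                                           clenv.kernel,
                                           1,      //work dimensions
                                           0,      //no global offset
                                           globalWorkSize,
                                           localWorkSize,
                                           0, 0);  //no wait list
std::cout << "kernel time: " << ms << " ms" << std::endl;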
//------------------------------------------------------------------------------
int main(int argc, char** argv) {

    if(argc < 9) {
        std::cerr << "usage: " << argv[0]
                  << " <platform name> <device type = default | cpu | gpu "
                     "| acc | all>  <device num> <OpenCL source file path>"
                     " <kernel name> <size> <local size> <vec element width>"
                  << std::endl;
        exit(EXIT_FAILURE);   
    }
    const int SIZE = atoi(argv[argc - 3]); // number of elements
    const int CL_ELEMENT_SIZE = atoi(argv[argc - 1]); // number of per-element
                                                      // components
    const int CPU_BLOCK_SIZE = 16384; //use block dot product if SIZE divisible
                                  //by this value
    const size_t BYTE_SIZE = SIZE * sizeof(real_t);
    const int BLOCK_SIZE = atoi(argv[argc - 2]); //local cache for reduction
                                                 //equal to local workgroup size
    const int REDUCED_SIZE = SIZE / BLOCK_SIZE;
    const int REDUCED_BYTE_SIZE = REDUCED_SIZE * sizeof(real_t);
    //setup text header that will be prefixed to opencl code
    std::ostringstream clheaderStream;
    clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE      << '\n';
    clheaderStream << "#define VEC_WIDTH "  << CL_ELEMENT_SIZE << '\n';
#ifdef USE_DOUBLE    
    clheaderStream << "#define DOUBLE\n";
    const double EPS = 0.000000001;
#else
    const float EPS = 0.00001;
#endif
    const bool PROFILE_ENABLE_OPTION = true;    
    CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]),
                               PROFILE_ENABLE_OPTION,
                               argv[4], argv[5], clheaderStream.str());
   
    cl_int status;
    //create input and output matrices
    std::vector<real_t> V1 = create_vector(SIZE);
    std::vector<real_t> V2 = create_vector(SIZE);
    real_t hostDot = std::numeric_limits< real_t >::quiet_NaN();
    real_t deviceDot = std::numeric_limits< real_t >::quiet_NaN();      
//ALLOCATE DATA AND COPY TO DEVICE    
    //allocate output buffer on OpenCL device
    //the partialReduction array contains a sequence of dot products
    //computed on sub-arrays of size BLOCK_SIZE
    cl_mem partialReduction = clCreateBuffer(clenv.context,
                                             CL_MEM_WRITE_ONLY,
                                             REDUCED_BYTE_SIZE,
                                             0,
                                             &status);
    check_cl_error(status, "clCreateBuffer");

    //allocate input buffers on OpenCL devices and copy data
    cl_mem devV1 = clCreateBuffer(clenv.context,
                                  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  BYTE_SIZE,
                                  &V1[0], //<-- copy data from V1
                                  &status);
    check_cl_error(status, "clCreateBuffer");                              
    cl_mem devV2 = clCreateBuffer(clenv.context,
                                  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  BYTE_SIZE,
                                  &V2[0], //<-- copy data from V2
                                  &status);
    check_cl_error(status, "clCreateBuffer");                              

    //set kernel parameters
    status = clSetKernelArg(clenv.kernel, //kernel
                            0,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devV1); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(V1)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            1,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devV2); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(V2)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            2,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &partialReduction); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(devOut)");
   

    //setup kernel launch configuration
    //total number of threads == number of array elements
    const size_t globalWorkSize[1] = {SIZE / CL_ELEMENT_SIZE};
    //number of per-workgroup local threads
    const size_t localWorkSize[1] = {BLOCK_SIZE}; 
//LAUNCH KERNEL
    // make sure all work on the OpenCL device is finished
    status = clFinish(clenv.commandQueue);
    check_cl_error(status, "clFinish");
    cl_event profilingEvent;
    timespec kernelStart = {0,  0};
    timespec kernelEnd = {0, 0};
    clock_gettime(CLOCK_MONOTONIC, &kernelStart);
    //launch kernel
    status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue
                                    clenv.kernel, //kernel                                   
                                    1, //number of dimensions for work-items
                                    0, //global work offset
                                    globalWorkSize, //total number of threads
                                    localWorkSize, //threads per workgroup
                                    0, //number of events that need to
                                       //complete before kernel executed
                                    0, //list of events that need to complete
                                       //before kernel executed
                                    &profilingEvent); //event object associated
                                                      // with this particular
                                                      // kernel execution
                                                      // instance

    check_cl_error(status, "clEnqueueNDRangeKernel");
    status = clFinish(clenv.commandQueue); //ensure kernel execution has
    //completed; needed for timing purposes only; there is no need to enforce
    //completion when issuing a subsequent blocking data transfer operation
    check_cl_error(status, "clFinish");
    status = clWaitForEvents(1, &profilingEvent);
    clock_gettime(CLOCK_MONOTONIC, &kernelEnd);
    check_cl_error(status, "clWaitForEvents");
    //get_cl_time(profilingEvent);  //gives similar results to the following 
    const double kernelElapsedTime_ms = time_diff_ms(kernelStart, kernelEnd);
//READ DATA FROM DEVICE
    //read back and print results
    std::vector< real_t > partialDot(REDUCED_SIZE); 
    status = clEnqueueReadBuffer(clenv.commandQueue,
                                 partialReduction,
                                 CL_TRUE, //blocking read
                                 0, //offset
                                 REDUCED_BYTE_SIZE, //byte size of data
                                 &partialDot[0], //destination buffer in host
                                                 //memory
                                 0, //number of events that need to
                                    //complete before transfer executed
                                 0, //list of events that need to complete
                                    //before transfer executed
                                 &profilingEvent); //event identifying this
                                                   //specific operation
    check_cl_error(status, "clEnqueueReadBuffer");

    const double dataTransferTime_ms = get_cl_time(profilingEvent);

    timespec accStart = {0, 0};
    timespec accEnd   = {0, 0};

//FINAL REDUCTION ON HOST    
    clock_gettime(CLOCK_MONOTONIC, &accStart);
    deviceDot = std::accumulate(partialDot.begin(),
                                partialDot.end(), real_t(0));
    clock_gettime(CLOCK_MONOTONIC, &accEnd);
    const double accTime_ms = time_diff_ms(accStart, accEnd);

//COMPUTE DOT PRODUCT ON HOST
    timespec hostStart = {0, 0};
    timespec hostEnd = {0, 0};
    clock_gettime(CLOCK_MONOTONIC, &hostStart);
    //note: the 'true ||' below disables the blocked variant and always uses
    //the plain host dot product
    if(true || SIZE % CPU_BLOCK_SIZE != 0) hostDot = host_dot_product(V1, V2);
    else hostDot = host_dot_block(&V1[0], &V2[0], SIZE, CPU_BLOCK_SIZE);
    clock_gettime(CLOCK_MONOTONIC, &hostEnd);
    const double host_time = time_diff_ms(hostStart, hostEnd);
//PRINT RESULTS
    std::cout << deviceDot << ' ' << hostDot << std::endl;

    if(check_result(hostDot, deviceDot, EPS)) {
        std::cout << "PASSED" << std::endl;
        std::cout << "kernel:         " << kernelElapsedTime_ms << "ms\n"
                  << "host reduction: " << accTime_ms << "ms\n"
                  << "total:          " << (kernelElapsedTime_ms + accTime_ms)  
                  << "ms" << std::endl;
        std::cout << "transfer:       " << dataTransferTime_ms 
                  << "ms\n" << std::endl;
        if(true || SIZE % CPU_BLOCK_SIZE != 0) {         
            std::cout << "host:              " << host_time << "ms" << std::endl;
        } else {
            std::cout << "host (16k blocks): " << host_time << "ms" << std::endl; 
        }    
       
    } else {
        std::cout << "FAILED" << std::endl;
    }   

    check_cl_error(clReleaseMemObject(devV1), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(devV2), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(partialReduction), "clReleaseMemObject");
    release_clenv(clenv);
   
    return 0;
}
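time_diff_ms, create_vector, host_dot_product and check_result are helpers defined elsewhere; for the timing part, a straightforward implementation of time_diff_ms consistent with the clock_gettime usage above would be (an assumption, not the original code):

double time_diff_ms(const timespec& start, const timespec& end) {
    return (end.tv_sec  - start.tv_sec)  * 1E3 +  //seconds     -> milliseconds
           (end.tv_nsec - start.tv_nsec) / 1E6;   //nanoseconds -> milliseconds
}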
Example 8
template <typename T1, typename... TArgs>
void set_kernel_args_internal(int index, cl::Kernel& kernel, T1 a,
                              TArgs... args) {
  check_cl_error(kernel.setArg(index, a));
  set_kernel_args_internal(index + 1, kernel, args...);
}
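For the recursion above to compile and terminate, the original source presumably also declares a terminating overload before the template, plus a public entry point that starts numbering at argument 0. A sketch of those assumed companions:

//base case: no arguments left to set (declared before the template above)
void set_kernel_args_internal(int, cl::Kernel&) {}

//public wrapper used by the simulation code, e.g.
//set_kernel_args(kernel_forces_, back_buffer_, front_buffer_, parameters, ...)
template <typename... TArgs>
void set_kernel_args(cl::Kernel& kernel, TArgs... args) {
  set_kernel_args_internal(0, kernel, args...);
}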
Example 9
void sph_simulation::simulate_single_frame(particle* in_particles,
                                           particle* out_particles) {
  // Calculate the optimal size for workgroups

  // Start the group size at its maximum and shrink it if necessary.
  // Ideally parameters.particles_count should be divisible by
  // CL_DEVICE_MAX_WORK_GROUP_SIZE.
  // Refer to CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE.
  unsigned int size_of_groups =
      running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
  while (parameters.particles_count % size_of_groups != 0) {
    size_of_groups /= 2;
  }

  // Make sure that the workgroups are small enough and that the particle data
  // will fit in local memory
  assert(size_of_groups <=
         running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
  assert(size_of_groups * sizeof(particle) <=
         running_device->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>());

  // Initial transfer to the GPU
  check_cl_error(queue_.enqueueWriteBuffer(
      front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      in_particles));

  // Recalculate the boundaries of the grid since the particles probably moved
  // since the last frame.
  cl_float min_x, max_x, min_y, max_y, min_z, max_z;
  cl_float grid_cell_side_length = (parameters.h * 2);

  min_x = min_y = min_z = std::numeric_limits<cl_float>::max();
  max_x = max_y = max_z = -std::numeric_limits<cl_float>::max();

  for (size_t i = 0; i < parameters.particles_count; ++i) {
    cl_float3 pos = in_particles[i].position;

    if (pos.s[0] < min_x) min_x = pos.s[0];
    if (pos.s[1] < min_y) min_y = pos.s[1];
    if (pos.s[2] < min_z) min_z = pos.s[2];

    if (pos.s[0] > max_x) max_x = pos.s[0];
    if (pos.s[1] > max_y) max_y = pos.s[1];
    if (pos.s[2] > max_z) max_z = pos.s[2];
  }

  // Add or subtract two cell lengths on every side to create a padding layer.
  // This simplifies calculations further down the line.
  min_x -= grid_cell_side_length * 2;
  min_y -= grid_cell_side_length * 2;
  min_z -= grid_cell_side_length * 2;

  max_x += grid_cell_side_length * 2;
  max_y += grid_cell_side_length * 2;
  max_z += grid_cell_side_length * 2;

  parameters.min_point.s[0] = min_x;
  parameters.min_point.s[1] = min_y;
  parameters.min_point.s[2] = min_z;

  parameters.max_point.s[0] = max_x;
  parameters.max_point.s[1] = max_y;
  parameters.max_point.s[2] = max_z;

  parameters.grid_size_x =
      static_cast<cl_uint>((max_x - min_x) / grid_cell_side_length);
  parameters.grid_size_y =
      static_cast<cl_uint>((max_y - min_y) / grid_cell_side_length);
  parameters.grid_size_z =
      static_cast<cl_uint>((max_z - min_z) / grid_cell_side_length);

  // The Z-curve uses interleaving of bits in a uint to calculate the index.
  // This means we have floor(32/dimension_count) bits to represent each
  // dimension.
  assert(parameters.grid_size_x < 1024);
  assert(parameters.grid_size_y < 1024);
  assert(parameters.grid_size_z < 1024);

  parameters.grid_cell_count = get_grid_index_z_curve(
      parameters.grid_size_x, parameters.grid_size_y, parameters.grid_size_z);

  // Locate each particle in the grid and build the grid count table
  unsigned int* cell_table = new unsigned int[parameters.grid_cell_count];

  set_kernel_args(kernel_locate_in_grid_, front_buffer_, back_buffer_,
                  parameters);

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_locate_in_grid_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  check_cl_error(queue_.enqueueReadBuffer(
      back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  sort_particles(out_particles, back_buffer_, front_buffer_, cell_table);

  cl::Buffer cell_table_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(unsigned int) * parameters.grid_cell_count);

  check_cl_error(queue_.enqueueWriteBuffer(
      cell_table_buffer, CL_TRUE, 0,
      sizeof(unsigned int) * parameters.grid_cell_count, cell_table));

  check_cl_error(queue_.enqueueWriteBuffer(
      front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  // Compute the density and the pressure term at every particle.
  check_cl_error(kernel_density_pressure_.setArg(0, front_buffer_));
  check_cl_error(kernel_density_pressure_.setArg(
      1, size_of_groups * sizeof(particle),
      nullptr));  // Declare local memory in arguments
  check_cl_error(kernel_density_pressure_.setArg(2, back_buffer_));
  check_cl_error(kernel_density_pressure_.setArg(3, parameters));
  check_cl_error(kernel_density_pressure_.setArg(4, precomputed_terms));
  check_cl_error(kernel_density_pressure_.setArg(5, cell_table_buffer));

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_density_pressure_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  cl::Buffer face_normals_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(float) * current_scene.face_normals.size());
  cl::Buffer vertices_buffer(context_,
                             CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                             sizeof(float) * current_scene.vertices.size());
  cl::Buffer indices_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(unsigned int) * current_scene.indices.size());

  check_cl_error(queue_.enqueueWriteBuffer(
      face_normals_buffer, CL_TRUE, 0,
      sizeof(float) * current_scene.face_normals.size(),
      current_scene.face_normals.data()));

  check_cl_error(
      queue_.enqueueWriteBuffer(vertices_buffer, CL_TRUE, 0,
                                sizeof(float) * current_scene.vertices.size(),
                                current_scene.vertices.data()));

  check_cl_error(queue_.enqueueWriteBuffer(
      indices_buffer, CL_TRUE, 0,
      sizeof(unsigned int) * current_scene.indices.size(),
      current_scene.indices.data()));

  // Compute the density-forces at every particle.
  set_kernel_args(kernel_forces_, back_buffer_, front_buffer_, parameters,
                  precomputed_terms, cell_table_buffer);

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_forces_, cl::NullRange, cl::NDRange(parameters.particles_count),
      cl::NDRange(size_of_groups)));

  // Advect particles and resolve collisions with scene geometry.
  set_kernel_args(kernel_advection_collision_, front_buffer_, back_buffer_,
                  parameters, precomputed_terms, cell_table_buffer,
                  face_normals_buffer, vertices_buffer, indices_buffer,
                  current_scene.face_count);

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_advection_collision_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  check_cl_error(queue_.enqueueReadBuffer(
      back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  delete[] cell_table;
}
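get_grid_index_z_curve is not listed. Given the asserts above (each grid dimension below 1024, i.e. 10 bits per axis packed into a 32-bit index), a typical Morton-code implementation based on bit interleaving would look like the following sketch (the helper name spread_bits_10 is made up; the real function may differ):

//spread the lower 10 bits of x so two zero bits separate consecutive bits
inline unsigned int spread_bits_10(unsigned int x) {
  x &= 0x000003FF;
  x = (x | (x << 16)) & 0x030000FF;
  x = (x | (x <<  8)) & 0x0300F00F;
  x = (x | (x <<  4)) & 0x030C30C3;
  x = (x | (x <<  2)) & 0x09249249;
  return x;
}

unsigned int get_grid_index_z_curve(unsigned int x, unsigned int y,
                                    unsigned int z) {
  return spread_bits_10(x) | (spread_bits_10(y) << 1) |
         (spread_bits_10(z) << 2);
}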
Example 10
/**
 * @brief Sorts the particles according to their grid index using Radix Sort
 *
 * @param[in,out] particles   The array that contains the actual particle data
 * @param[in] first_buffer    The first OpenCL buffer used
 * @param[in] second_buffer   The second OpenCL buffer used
 * @param[out] cell_table     The array that contains the start indexes of the
 *                            cells in the sorted array
 */
void sph_simulation::sort_particles(particle* particles,
                                    cl::Buffer& first_buffer,
                                    cl::Buffer& second_buffer,
                                    unsigned int* cell_table) {
  cl::Buffer* current_input_buffer = &first_buffer;
  cl::Buffer* current_output_buffer = &second_buffer;

  for (int pass_number = 0; pass_number < 4; ++pass_number) {
    unsigned int zero = 0;
    check_cl_error(queue_.enqueueFillBuffer(
        sort_count_buffer_, zero, 0,
        kSortThreadCount * kBucketCount * sizeof(int)));

    set_kernel_args(kernel_sort_count_, *current_input_buffer,
                    sort_count_buffer_, parameters, kSortThreadCount,
                    pass_number, kRadixWidth);

    check_cl_error(queue_.enqueueNDRangeKernel(
        kernel_sort_count_, cl::NullRange, cl::NDRange(kSortThreadCount),
        cl::NullRange));

    check_cl_error(queue_.enqueueReadBuffer(
        sort_count_buffer_, CL_TRUE, 0,
        sizeof(unsigned int) * kSortThreadCount * kBucketCount,
        sort_count_array_.data()));

    unsigned int running_count = 0;
    for (int i = 0; i < kSortThreadCount * kBucketCount; ++i) {
      unsigned int tmp = sort_count_array_[i];
      sort_count_array_[i] = running_count;
      running_count += tmp;
    }

    check_cl_error(queue_.enqueueWriteBuffer(
        sort_count_buffer_, CL_TRUE, 0,
        sizeof(unsigned int) * kSortThreadCount * kBucketCount,
        sort_count_array_.data()));

    set_kernel_args(kernel_sort_, *current_input_buffer, *current_output_buffer,
                    sort_count_buffer_, parameters, kSortThreadCount,
                    pass_number, kRadixWidth);

    check_cl_error(queue_.enqueueNDRangeKernel(kernel_sort_, cl::NullRange,
                                               cl::NDRange(kSortThreadCount),
                                               cl::NullRange));

    cl::Buffer* tmp = current_input_buffer;
    current_input_buffer = current_output_buffer;
    current_output_buffer = tmp;
  }

  check_cl_error(queue_.enqueueReadBuffer(
      *current_input_buffer, CL_TRUE, 0,
      sizeof(particle) * parameters.particles_count, particles));

  // Build the cell table by computing the cumulative sum at every cell.
  unsigned int current_index = 0;
  for (unsigned int i = 0; i < parameters.grid_cell_count; ++i) {
    cell_table[i] = current_index;
    while (current_index != parameters.particles_count &&
           particles[current_index].grid_index == i) {
      current_index++;
    }
  }
}
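The sort_count and sort kernels themselves are not shown. Each of the four passes presumably buckets particles by one radix digit of grid_index; assuming kBucketCount == (1 << kRadixWidth) (e.g. kRadixWidth == 8 for a 32-bit key sorted in four passes), the digit examined in a given pass would be extracted roughly like this:

//hypothetical digit extraction shared by the counting and scatter kernels
unsigned int radix_digit(unsigned int grid_index, int pass_number,
                         int radix_width) {
  return (grid_index >> (pass_number * radix_width)) &
         ((1u << radix_width) - 1u);
}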
//------------------------------------------------------------------------------
int main(int argc, char** argv) {
    if(argc < 6) {
        std::cerr << "usage: " << argv[0]
                  << " <platform name> <device type = default | cpu | gpu "
                     "| acc | all>  <device num> <OpenCL source file path>"
                     " <kernel name>"
                  << std::endl;
        exit(EXIT_FAILURE);  
    }
    std::string platformName = argv[ 1 ];
    std::string deviceType = argv[2]; 
    int deviceNum = atoi(argv[3]);

    log_thread_count("\n\nstart");
#ifdef _OPENMP    
    {
        // const int CHUNKSIZE = 100;
        // const int N = 1000;    
        // int i, chunk;
        // float a[N], b[N], c[N];
        // for (i=0; i < N; i++)
        //     a[i] = b[i] = i * 1.0;
        // chunk = CHUNKSIZE;
        // printf("%d", omp_get_max_threads());
        #pragma omp parallel //shared(a,b,c,chunk) private(i)
        {
            if(omp_get_thread_num() == 0) {
                log_thread_count("OMP PARALLEL");
            }
        // printf("%d", omp_get_max_threads());
        // #pragma omp for schedule(dynamic,chunk) nowait
        // for (i=0; i < N; i++) {
        //     c[i] = a[i] + b[i];
        // }
        }  
        #pragma omp barrier
    }
#endif    

    //1)create context
    cl_context ctx = create_cl_context(platformName, deviceType, deviceNum);
    std::cout << "OpenCL context created" << std::endl;
    
    log_thread_count("CL context created");


    //2)load kernel source
    const std::string programSource = load_text(argv[4]);
    std::cout << "OpenCL source code loaded" << std::endl;
    const char* src = programSource.c_str();
    const size_t sourceLength = programSource.length();

    //3)build program and create kernel
    cl_int status;
    cl_program program = clCreateProgramWithSource(ctx, //context
                                                   1,   //number of strings
                                                   &src, //source
                                                   &sourceLength, // size 
                                                   &status);  // status 
    check_cl_error(status, "clCreateProgramWithSource");
    cl_device_id deviceID; //only a single device was selected
    // retrieve actual device id from context
    status = clGetContextInfo(ctx,
                              CL_CONTEXT_DEVICES,
                              sizeof(cl_device_id),
                              &deviceID, 0);
    check_cl_error(status, "clGetContextInfo");
    cl_int buildStatus = clBuildProgram(program, //program
                                        1, //number of devices
                                        &deviceID, //array of device ids
                                        0, //program options as passed on
                                           //the command line to regular
                                           //compilers e.g. -DUSE_DOUBLE
                                        0, 0);
    //log output if any
    char buffer[0x10000] = "";
    size_t len = 0;
    status = clGetProgramBuildInfo(program,
                                   deviceID,
                                   CL_PROGRAM_BUILD_LOG,
                                   sizeof(buffer),
                                   buffer,
                                   &len);
    check_cl_error(status, "clBuildProgramInfo");
    if(len > 1) std::cout << "Build output: " << buffer << std::endl;
    check_cl_error(buildStatus, "clBuildProgram");

    std::cout << "Built OpenCL program" << std::endl;
    const char* kernelName = argv[5];
    cl_kernel kernel = clCreateKernel(program, kernelName, &status);
    check_cl_error(status, "clCreateKernel"); 
   
    //4)allocate output buffer on OpenCL device
    typedef float real_t;
    const size_t ARRAY_LENGTH = 16;
    const size_t ARRAY_BYTE_LENGTH = ARRAY_LENGTH * sizeof(real_t);
    cl_mem outputCLBuffer = clCreateBuffer(ctx,
                                           CL_MEM_WRITE_ONLY,
                                           ARRAY_BYTE_LENGTH,
                                           0,
                                           &status);
    check_cl_error(status, "clCreateBuffer");

    //5)create command queue
    cl_command_queue commands = clCreateCommandQueue(ctx, deviceID, 0, &status);
    check_cl_error(status, "clCreateCommandQueue");
   
    //6)set kernel parameters
    const real_t value = real_t(3.14);
    //first parameter: output array
    status = clSetKernelArg(kernel, //kernel
                            0,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &outputCLBuffer); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(0)");
    //second parameter: value to assign to each array element
    status = clSetKernelArg(kernel, //kernel
                            1,      //parameter id
                            sizeof(real_t), //size of parameter
                            &value); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(1)");

    //7)setup kernel launch configuration
    //total number of threads == number of array elements
    const size_t globalWorkSize[1] = {ARRAY_LENGTH};
    //number of per-workgroup local threads
    const size_t localWorkSize[1] = {1}; 

    //8)launch kernel
    status = clEnqueueNDRangeKernel(commands, //queue
                                    kernel, //kernel                                   
                                    1, //number of dimensions for work-items
                                    0, //global work offset
                                    globalWorkSize, //total number of threads
                                    localWorkSize, //threads per workgroup
                                    0, //number of events that need to
                                       //complete before kernel executed
                                    0, //list of events that need to complete
                                       //before kernel executed
                                    0); //event object identifying this
                                        //particular kernel execution instance

    check_cl_error(status, "clEnqueueNDRangeKernel");
    log_thread_count("kernel launched");
    std::cout << "Lunched OpenCL kernel - setting all array elements to "
              << value << std::endl;

    //9)read back and print results
    std::vector< real_t > hostArray(ARRAY_LENGTH, real_t(0));
    status = clEnqueueReadBuffer(commands,
                                 outputCLBuffer,
                                 CL_TRUE, //blocking read
                                 0, //offset
                                 ARRAY_BYTE_LENGTH, //byte size of data
                                 &hostArray[0], //destination buffer in host
                                                // memory
                                 0, //number of events that need to
                                    //complete before transfer executed
                                 0, //list of events that need to complete
                                    //before transfer executed
                                 0); //event identifying this specific operation
    check_cl_error(status, "clEnqueueReadBuffer");
    log_thread_count("device -> host transfer");
    std::cout << "Output array: " << std::endl;
    std::ostream_iterator<real_t> out_it(std::cout, " ");
    std::copy(hostArray.begin(), hostArray.end(), out_it);
    std::cout << std::endl;

    //10)release resources
    check_cl_error(clReleaseMemObject(outputCLBuffer), "clReleaseMemObject");
    check_cl_error(clReleaseCommandQueue(commands),"clReleaseCommandQueue");
    check_cl_error(clReleaseKernel(kernel), "clReleaseKernel");
    check_cl_error(clReleaseProgram(program), "clReleaseProgram");
    check_cl_error(clReleaseContext(ctx), "clReleaseContext");
    std::cout << "Released OpenCL resources" << std::endl;
    log_thread_count("released resources");

    print_thread_count(threadlog.begin(), threadlog.end());
    std::cout << std::endl;
    return 0;
}
//------------------------------------------------------------------------------
// returns context associated with single device only,
// to make it support multiple devices, a list of
// <device type, device num> pairs is required
cl_context create_cl_context(const std::string& platformName,
                             const std::string& deviceTypeName,
                             int deviceNum) {
    cl_int status = 0;
    //1) get platforms and search for the platform matching platformName
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, 0, &numPlatforms);
    check_cl_error(status, "clGetPlatformIDs");
    if(numPlatforms < 1) {
        std::cout << "No OpenCL platforms found" << std::endl;
        exit(EXIT_SUCCESS);
    }
    typedef std::vector< cl_platform_id > PlatformIDs;
    PlatformIDs platformIDs(numPlatforms);
    status = clGetPlatformIDs(numPlatforms, &platformIDs[0], 0);
    check_cl_error(status, "clGetPlatformIDs");
    std::vector< char > buf(0x10000, char(0));
    cl_platform_id platformID;
    PlatformIDs::const_iterator pi = platformIDs.begin();
    for(; pi != platformIDs.end(); ++pi) {
        status = clGetPlatformInfo(*pi, CL_PLATFORM_NAME,
                                 buf.size(), &buf[0], 0);
        check_cl_error(status, "clGetPlatformInfo");
        if(platformName == &buf[0]) {
            platformID = *pi;
            break; 
        }
    } 
    if(pi == platformIDs.end()) {
        std::cerr << "ERROR - Couldn't find platform " 
                  << platformName << std::endl;
        exit(EXIT_FAILURE);
    }
    //2) get devices of deviceTypeName type and store their ids into
    //   an array then select device id at position deviceNum
    cl_device_type deviceType;
    if(deviceTypeName == "default") 
        deviceType = CL_DEVICE_TYPE_DEFAULT;
    else if(deviceTypeName == "cpu")
        deviceType = CL_DEVICE_TYPE_CPU;
    else if(deviceTypeName == "gpu")
        deviceType = CL_DEVICE_TYPE_GPU;
    else if(deviceTypeName == "acc")
        deviceType = CL_DEVICE_TYPE_ACCELERATOR; 
    else if(deviceTypeName == "all")
        deviceType = CL_DEVICE_TYPE_CPU;
    else {
        std::cerr << "ERROR - device type " << deviceTypeName << " unknown"
                  << std::endl;
        exit(EXIT_FAILURE);          
    }                      
    cl_uint numDevices = 0; 
    status = clGetDeviceIDs(platformID, deviceType, 0, 0, &numDevices);
    check_cl_error(status, "clGetDeviceIDs");
    if(numDevices < 1) {
        std::cerr << "ERROR - Cannot find device of type " 
                  << deviceTypeName << std::endl;
        exit(EXIT_FAILURE);          
    }
    typedef std::vector< cl_device_id > DeviceIDs;
    DeviceIDs deviceIDs(numDevices);
    status = clGetDeviceIDs(platformID, deviceType, numDevices,
                            &deviceIDs[0], 0);
    check_cl_error(status, "clGetDeviceIDs");
    if(deviceNum < 0 || deviceNum >= numDevices) {
        std::cerr << "ERROR - device number out of range: [0," 
                  << (numDevices - 1) << ']' << std::endl;
        exit(EXIT_FAILURE);
    }
    cl_device_id deviceID = deviceIDs[deviceNum]; 
    //3) create and return context
    cl_context_properties ctxProps[] = {
        CL_CONTEXT_PLATFORM,
        cl_context_properties(platformID),
        0
    };
    //only a single device supported
    cl_context ctx = clCreateContext(ctxProps, 1, &deviceID,
                                     &context_callback, 0, &status);
    check_cl_error(status, "clCreateContext");
    return ctx;
}
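context_callback, passed to clCreateContext above, is not listed; its signature is fixed by the OpenCL API, so a minimal version (an assumption) would simply log the error string reported by the runtime:

void CL_CALLBACK context_callback(const char* errinfo,
                                  const void* /*private_info*/,
                                  size_t /*cb*/,
                                  void* /*user_data*/) {
    std::cerr << "OpenCL context error: " << errinfo << std::endl;
}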
Example 13
//------------------------------------------------------------------------------
int main(int argc, char** argv) {
    if(argc < 6) {
        std::cerr << "usage: " << argv[0]
                  << " <platform name> <device type = default | cpu | gpu "
                     "| acc | all>  <device num> <OpenCL source file path>"
                     " <kernel name>"
                  << std::endl;
        exit(EXIT_FAILURE);   
    }
    const int SIZE = 256;
    const size_t BYTE_SIZE = SIZE * sizeof(real_t);
    const int BLOCK_SIZE = 16; 
    const int REDUCED_SIZE = SIZE / BLOCK_SIZE;
    const int REDUCED_BYTE_SIZE = REDUCED_SIZE * sizeof(real_t);
    //setup text header that will be prefixed to opencl code
    std::ostringstream clheaderStream;
    clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n';
#ifdef USE_DOUBLE    
    clheaderStream << "#define DOUBLE\n";
    const double EPS = 0.000000001;
#else
    const double EPS = 0.00001;
#endif    
    CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), false,
                               argv[4], argv[5], clheaderStream.str());
   
    cl_int status;
    //create input and output matrices
    std::vector<real_t> V1 = create_vector(SIZE);
    std::vector<real_t> V2 = create_vector(SIZE);
    real_t hostDot = std::numeric_limits< real_t >::quiet_NaN();
    real_t deviceDot = std::numeric_limits< real_t >::quiet_NaN();      
    
    //allocate output buffer on OpenCL device
    //the partialReduction array contains a sequence of dot products
    //computed on sub-arrays of size BLOCK_SIZE
    cl_mem partialReduction = clCreateBuffer(clenv.context,
                                             CL_MEM_WRITE_ONLY,
                                             REDUCED_BYTE_SIZE,
                                             0,
                                             &status);
    check_cl_error(status, "clCreateBuffer");

    //allocate input buffers on OpenCL devices and copy data
    cl_mem devV1 = clCreateBuffer(clenv.context,
                                  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  BYTE_SIZE,
                                  &V1[0], //<-- copy data from V1
                                  &status);
    check_cl_error(status, "clCreateBuffer");                              
    cl_mem devV2 = clCreateBuffer(clenv.context,
                                  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  BYTE_SIZE,
                                  &V2[0], //<-- copy data from V2
                                  &status);
    check_cl_error(status, "clCreateBuffer");                              

    //set kernel parameters
    status = clSetKernelArg(clenv.kernel, //kernel
                            0,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devV1); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(V1)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            1,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devV2); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(V2)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            2,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &partialReduction); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(devOut)");
   

    //setup kernel launch configuration
    //total number of threads == number of array elements
    const size_t globalWorkSize[1] = {SIZE};
    //number of per-workgroup local threads
    const size_t localWorkSize[1] = {BLOCK_SIZE}; 

    //launch kernel
    status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue
                                    clenv.kernel, //kernel                                   
                                    1, //number of dimensions for work-items
                                    0, //global work offset
                                    globalWorkSize, //total number of threads
                                    localWorkSize, //threads per workgroup
                                    0, //number of events that need to
                                       //complete before kernel executed
                                    0, //list of events that need to complete
                                       //before kernel executed
                                    0); //event object identifying this
                                        //particular kernel execution instance

    check_cl_error(status, "clEnqueueNDRangeKernel");
    
    //read back and print results
    std::vector< real_t > partialDot(REDUCED_SIZE); 
    status = clEnqueueReadBuffer(clenv.commandQueue,
                                 partialReduction,
                                 CL_TRUE, //blocking read
                                 0, //offset
                                 REDUCED_BYTE_SIZE, //byte size of data
                                 &partialDot[0], //destination buffer in host memory
                                 0, //number of events that need to
                                    //complete before transfer executed
                                 0, //list of events that need to complete
                                    //before transfer executed
                                 0); //event identifying this specific operation
    check_cl_error(status, "clEnqueueReadBuffer");
    
    deviceDot = std::accumulate(partialDot.begin(),
                                partialDot.end(), real_t(0));
    hostDot = host_dot_product(V1, V2);

    std::cout << deviceDot << ' ' << hostDot << std::endl;

    if(check_result(hostDot, deviceDot, EPS)) {
        std::cout << "PASSED" << std::endl;
    } else {
        std::cout << "FAILED" << std::endl;
    }   

    check_cl_error(clReleaseMemObject(devV1), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(devV2), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(partialReduction), "clReleaseMemObject");
    release_clenv(clenv);
   
    return 0;
}
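host_dot_product and check_result are host-side helpers defined elsewhere; plausible minimal versions consistent with how they are called here (the tolerance policy in check_result is an assumption) are:

//requires <vector> and <cmath>
real_t host_dot_product(const std::vector<real_t>& a,
                        const std::vector<real_t>& b) {
    real_t dot = real_t(0);
    for(size_t i = 0; i != a.size(); ++i) dot += a[i] * b[i];
    return dot;
}

bool check_result(real_t reference, real_t value, real_t eps) {
    //relative error check; the original may use an absolute tolerance instead
    return std::abs(reference - value) <= eps * std::abs(reference);
}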
Example 14
//------------------------------------------------------------------------------
CLEnv create_clenv(const std::string& platformName,
                   const std::string& deviceType,
                   int deviceNum,
                   bool enableProfiling,
                   const char* clSourcePath,
                   const char* kernelName, 
                   const std::string& clSourcePrefix, 
                   const std::string& buildOptions) {

    CLEnv rt;
    cl_int status;
    cl_device_id deviceID;

    //1)create context
    rt.context = create_cl_context(platformName, deviceType, deviceNum);
    //only a single device was selected
    //retrieve actual device id from context
    status = clGetContextInfo(rt.context,
                              CL_CONTEXT_DEVICES,
                              sizeof(cl_device_id),
                              &deviceID, 0);
    check_cl_error(status, "clGetContextInfo");
    
    //2)load kernel source
    if(clSourcePath != 0) {
        const std::string programSource = clSourcePrefix 
                                          + "\n" 
                                          + load_text(clSourcePath);
        const char* src = programSource.c_str();
        const size_t sourceLength = programSource.length();

        //3)build program and create kernel
        
        rt.program = clCreateProgramWithSource(rt.context, //context
                                               1,   //number of strings
                                               &src, //lines
                                               &sourceLength, // size 
                                               &status);  // status 
        check_cl_error(status, "clCreateProgramWithSource");
        
        cl_int buildStatus = buildOptions.size() ?
                             clBuildProgram(rt.program, 1, &deviceID,
                                buildOptions.c_str(), 0, 0)
                             : clBuildProgram(rt.program, 1, &deviceID,
                                0, 0, 0);
        //log output if any
        char buffer[0x10000] = "";
        size_t len = 0;
        status = clGetProgramBuildInfo(rt.program,
                                       deviceID,
                                       CL_PROGRAM_BUILD_LOG,
                                       sizeof(buffer),
                                       buffer,
                                       &len);
        check_cl_error(status, "clGetProgramBuildInfo");
        if(len > 1) std::cout << "Build output: " << buffer << std::endl;
        check_cl_error(buildStatus, "clBuildProgram");
        if(kernelName != 0) {
            rt.kernel = clCreateKernel(rt.program, kernelName, &status);
            check_cl_error(status, "clCreateKernel"); 
        }
    }

    rt.commandQueue = enableProfiling ?
                      clCreateCommandQueue(rt.context, deviceID,
                                           CL_QUEUE_PROFILING_ENABLE, &status)
                      : clCreateCommandQueue(rt.context, deviceID, 0, &status);
    check_cl_error(status, "clCreateCommandQueue");

    return rt;
}
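load_text reads the whole OpenCL source file into a string; a plain implementation (an assumption, requires <fstream>, <iterator> and <string>) could be:

std::string load_text(const char* filePath) {
    std::ifstream in(filePath);
    if(!in) {
        std::cerr << "ERROR - cannot open file " << filePath << std::endl;
        exit(EXIT_FAILURE);
    }
    return std::string(std::istreambuf_iterator<char>(in),
                       std::istreambuf_iterator<char>());
}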
//------------------------------------------------------------------------------
int main(int argc, char** argv) {
    if(argc < 8) {
        std::cerr << "usage: " << argv[0]
                  << " <platform name> <device type = default | cpu | gpu "
                     "| acc | all>  <device num> <OpenCL source file path>"
                     " <kernel name> <matrix size> <workgroup size>"
                  << std::endl;
        exit(EXIT_FAILURE);   
    }
    const int SIZE = atoi(argv[6]);
    const size_t BYTE_SIZE = SIZE * SIZE * sizeof(real_t);
    const int BLOCK_SIZE = atoi(argv[7]); //tile edge length, e.g. 4 for 4 x 4 tiles
    if( SIZE < 1 || BLOCK_SIZE < 1 || (SIZE % BLOCK_SIZE) != 0) {
    	std::cerr << "ERROR - size and block size *must* be greater than zero "
    	             "and size *must* be evenly divsible by block size"
    	          << std::endl;
    	exit(EXIT_FAILURE);
    }
    //setup text header that will be prefixed to opencl code
    std::ostringstream clheaderStream;
    clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n';
#ifdef USE_DOUBLE    
    clheaderStream << "#define DOUBLE\n";
    const double EPS = 0.000000001;
#else
    const double EPS = 0.00001;
#endif
    //enable profiling on queue    
    CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), true,
                               argv[4], argv[5], clheaderStream.str());
   
    cl_int status;
    //create input and output matrices
    std::vector<real_t> A = create_matrix(SIZE, SIZE);
    std::vector<real_t> B = create_matrix(SIZE, SIZE);
    std::vector<real_t> C(SIZE * SIZE,real_t(0));
    std::vector<real_t> refC(SIZE * SIZE,real_t(0));        
    
    //allocate output buffer on OpenCL device
    cl_mem devC = clCreateBuffer(clenv.context,
                                 CL_MEM_WRITE_ONLY,
                                 BYTE_SIZE,
                                 0,
                                 &status);
    check_cl_error(status, "clCreateBuffer");

    //allocate input buffers on OpenCL devices and copy data
    cl_mem devA = clCreateBuffer(clenv.context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 BYTE_SIZE,
                                 &A[0], //<-- copy data from A
                                 &status);
    check_cl_error(status, "clCreateBuffer");                              
    cl_mem devB = clCreateBuffer(clenv.context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 BYTE_SIZE,
                                 &B[0], //<-- copy data from B
                                 &status);
    check_cl_error(status, "clCreateBuffer");                              

    //set kernel parameters
    status = clSetKernelArg(clenv.kernel, //kernel
                            0,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devA); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(A)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            1,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devB); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(B)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            2,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devC); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(C)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            3,      //parameter id
                            sizeof(int), //size of parameter
                            &SIZE); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(SIZE)");


    //setup kernel launch configuration
    //total number of threads == number of array elements
    const size_t globalWorkSize[2] = {SIZE, SIZE};
    //number of per-workgroup local threads
    const size_t localWorkSize[2] = {BLOCK_SIZE, BLOCK_SIZE}; 

    //launch kernel
    //to make sure there are no pending commands in the queue do wait
    //for any commands to finish execution
    status = clFinish(clenv.commandQueue);
    check_cl_error(status, "clFinish");
    cl_event profilingEvent;
    status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue
                                    clenv.kernel, //kernel                                   
                                    2, //number of dimensions for work-items
                                    0, //global work offset
                                    globalWorkSize, //total number of threads
                                    localWorkSize, //threads per workgroup
                                    0, //number of events that need to
                                       //complete before kernel executed
                                    0, //list of events that need to complete
                                       //before kernel executed
                                    &profilingEvent); //event object 
                                                     //identifying this
                                                     //particular kernel
                                                     //execution instance
    check_cl_error(status, "clEnqueueNDRangeKernel");
    status = clFinish(clenv.commandQueue); //ensure kernel execution has
    //completed; needed for timing purposes only; there is no need to enforce
    //completion when issuing a subsequent blocking data transfer operation
    check_cl_error(status, "clFinish");
    status = clWaitForEvents(1, &profilingEvent);
    check_cl_error(status, "clWaitForEvents");
    cl_ulong kernelStartTime = cl_ulong(0);
    cl_ulong kernelEndTime   = cl_ulong(0);
    size_t retBytes = size_t(0);
    double kernelElapsedTime_ms = double(0); 
   
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_QUEUED,
                                     sizeof(cl_ulong),
                                     &kernelStartTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_END,
                                     sizeof(cl_ulong),
                                     &kernelEndTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    //event timing is reported in nanoseconds: divide by 1e6 to get
    //time in milliseconds
    kernelElapsedTime_ms =  (double)(kernelEndTime - kernelStartTime) / 1E6;
    //read back and check results
    status = clEnqueueReadBuffer(clenv.commandQueue,
                                 devC,
                                 CL_TRUE, //blocking read
                                 0, //offset
                                 BYTE_SIZE, //byte size of data
                                 &C[0], //destination buffer in host memory
                                 0, //number of events that need to
                                    //complete before transfer executed
                                 0, //list of events that need to complete
                                    //before transfer executed
                                 0); //event identifying this specific operation
    check_cl_error(status, "clEnqueueReadBuffer");
    
    host_matmul(A, B, refC, SIZE, SIZE);

    if(check_result(refC, C, EPS)) {
    	std::cout << "PASSED" << std::endl;
    	std::cout << "Elapsed time(ms): " << kernelElapsedTime_ms << std::endl;
    } else {
    	std::cout << "FAILED" << std::endl;
    }	

    check_cl_error(clReleaseMemObject(devA), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(devB), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(devC), "clReleaseMemObject");
    release_clenv(clenv);
   
    return 0;
}
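host_matmul and create_matrix are reference helpers defined elsewhere. A plausible naive implementation of host_matmul, assuming row-major storage and that the last two parameters are the row and column counts of the square matrices, is:

void host_matmul(const std::vector<real_t>& A,
                 const std::vector<real_t>& B,
                 std::vector<real_t>& C,
                 int rows, int columns) {
    for(int r = 0; r != rows; ++r) {
        for(int c = 0; c != columns; ++c) {
            real_t acc = real_t(0);
            for(int k = 0; k != columns; ++k)
                acc += A[r * columns + k] * B[k * columns + c];
            C[r * columns + c] = acc;
        }
    }
}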