Example #1
int 
exec_dot_product_kernel(const char *program_source, 
                        int n, cl_float4 *srcA, cl_float4 *srcB, cl_float4 *dst) 
{ 
  cl_context  context; 
  cl_command_queue cmd_queue; 
  cl_device_id  *devices; 
  cl_program  program; 
  cl_kernel  kernel; 
  cl_mem       memobjs[3]; 
  size_t       global_work_size[1]; 
  size_t       local_work_size[1]; 
  size_t       cb; 
  cl_int       err; 
  int          i;
 
  // create the OpenCL context on any available OpenCL device 
  context = clCreateContextFromType(
      NULL, 
      CL_DEVICE_TYPE_ALL,
      NULL, NULL, NULL); 
  if (context == (cl_context)0) 
    return -1; 
 
  // get the list of devices associated with the context 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
  devices = (cl_device_id *) malloc(cb); 
  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
 
  // create a command-queue 
  cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
  if (cmd_queue == (cl_command_queue)0) 
    { 
      clReleaseContext(context); 
      free(devices); 
      return -1; 
    } 

  // If the device's byte order differs from the host's, swap the input
  // data to the device's byte order (poclu endianness helpers).
  for (i = 0; i < n; ++i)
    {
       poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4);
       poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4);
    }

 
  // allocate the buffer memory objects 
  memobjs[0] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcA, NULL); 
  if (memobjs[0] == (cl_mem)0) 
    { 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  memobjs[1] = clCreateBuffer(context, 
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                              sizeof(cl_float4) * n, srcB, NULL); 
  if (memobjs[1] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 1); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1;
    } 
 
  memobjs[2] = clCreateBuffer(context, 
			      CL_MEM_READ_WRITE, 
			      sizeof(cl_float) * n, NULL, NULL); 
  if (memobjs[2] == (cl_mem)0) 
    { 
      delete_memobjs(memobjs, 2); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the program 
  program = clCreateProgramWithSource(context, 
				      1, (const char**)&program_source, NULL, NULL); 
  if (program == (cl_program)0) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // build the program 
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // create the kernel 
  kernel = clCreateKernel(program, "dot_product", NULL); 
  if (kernel == (cl_kernel)0) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set the args values 
  err = clSetKernelArg(kernel,  0,  
		       sizeof(cl_mem), (void *) &memobjs[0]); 
  err |= clSetKernelArg(kernel, 1,  
			sizeof(cl_mem), (void *) &memobjs[1]); 
  err |= clSetKernelArg(kernel, 2,
			sizeof(cl_mem), (void *) &memobjs[2]); 
 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // set work-item dimensions 
  global_work_size[0] = n; 
  local_work_size[0] = 2; 
 
  // execute kernel 
  err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, 
			       global_work_size, local_work_size,  
			       0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
 
  // read the output buffer 
  err = clEnqueueReadBuffer(cmd_queue, memobjs[2], CL_TRUE, 
			    0, n * sizeof(cl_float), dst, 
			    0, NULL, NULL); 
  if (err != CL_SUCCESS) 
    { 
      delete_memobjs(memobjs, 3); 
      clReleaseKernel(kernel); 
      clReleaseProgram(program); 
      clReleaseCommandQueue(cmd_queue); 
      clReleaseContext(context); 
      return -1; 
    } 
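
  // Swap the results (and the previously swapped inputs) back to the
  // host byte order.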
  for (i = 0; i < n; ++i)
    {
      poclu_bswap_cl_float_array(devices[0], (cl_float*)&dst[i], 4);
      poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4);
      poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4);
    }
  free(devices); 


  // release kernel, program, and memory objects 
  delete_memobjs(memobjs, 3); 
  clReleaseKernel(kernel); 
  clReleaseProgram(program); 
  clReleaseCommandQueue(cmd_queue); 
  clReleaseContext(context); 
  return 0; // success... 
}
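
The kernel source arrives through the program_source parameter and is not part of this listing. The host code only assumes a kernel named dot_product that takes the two read-only float4 buffers and the float output buffer, in that order. A minimal sketch of such a kernel (an illustration, not the original source) could be:

__kernel void
dot_product (__global const float4 *srcA,
             __global const float4 *srcB,
             __global float *dst)
{
  int gid = get_global_id (0);
  dst[gid] = dot (srcA[gid], srcB[gid]);
}

The second listing below uses the OpenCL C++ wrapper API. It assumes the usual headers with exceptions enabled (e.g. __CL_ENABLE_EXCEPTIONS for the classic cl.hpp bindings, so that cl::Error is thrown), the poclu byte-swap helpers, and definitions of WORK_ITEMS, BUFFER_SIZE, and kernelSourceCode that are not shown here.
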
int
main(void)
{
    cl_float A[BUFFER_SIZE];
    cl_int R[WORK_ITEMS];

    for (int i = 0; i < BUFFER_SIZE; i++) {
        A[i] = i;
    }

    for (int i = 0; i < WORK_ITEMS; i++) {
        R[i] = i;
    }

    try {
        std::vector<cl::Platform> platformList;

        // Query the available platforms
        cl::Platform::get(&platformList);

        // Pick first platform
        cl_context_properties cprops[] = {
            CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0};
        cl::Context context(CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, cprops);

        // Query the set of devices attached to the context
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create the program from source
        cl::Program::Sources sources(1, std::make_pair(kernelSourceCode, 0));
        cl::Program program(context, sources);

        cl_device_id dev_id = devices.at(0)();

        // Swap the host arrays to the device's byte order where it
        // differs from the host's (poclu endianness helpers).
        poclu_bswap_cl_float_array(dev_id, A, BUFFER_SIZE);
        poclu_bswap_cl_int_array(dev_id, R, WORK_ITEMS);

        // Build program
        program.build(devices);

        // Create buffer for A and copy host contents
        cl::Buffer aBuffer = cl::Buffer(
            context, 
            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
            BUFFER_SIZE * sizeof(float), 
            (void *) &A[0]);

        // Create buffer for R that uses the host pointer directly
        cl::Buffer cBuffer = cl::Buffer(
            context, 
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
            WORK_ITEMS * sizeof(int), 
            (void *) &R[0]);

        // Create kernel object
        cl::Kernel kernel(program, "test_kernel");

        // Set kernel args
        kernel.setArg(0, aBuffer);
        kernel.setArg(1, cBuffer);

        // Create command queue
        cl::CommandQueue queue(context, devices[0], 0);
 
        // Do the work
        queue.enqueueNDRangeKernel(
            kernel, 
            cl::NullRange, 
            cl::NDRange(WORK_ITEMS),
            cl::NullRange);
 

        // Map cBuffer to a host pointer. This enforces a sync with
        // the host backing space; remember we may have chosen a GPU device.
        int * output = (int *) queue.enqueueMapBuffer(
            cBuffer,
            CL_TRUE, // block 
            CL_MAP_READ,
            0,
            WORK_ITEMS * sizeof(int));

        // Check the device results against a host-side recomputation.
        bool ok = true;
        for (int i = 0; i < WORK_ITEMS; i++) {

            float global_sum = 0.0f;
            int j;
            float result;

            result = global_sum;
            for (j=0; j < 32; ++j) {
                float value = poclu_bswap_cl_float (dev_id, A[i+j]);
                global_sum += value;
            }
            result = result + global_sum;
            for (j=0; j < 32; ++j) {
                float value = poclu_bswap_cl_float (dev_id, A[i+j]);
                global_sum += value;
            }
            result = result + global_sum;

            if ((int)result != poclu_bswap_cl_int (dev_id, R[i])) {
                std::cout 
                    << "F(" << i << ": " << (int)result << " != "
                    << poclu_bswap_cl_int (dev_id, R[i]) << ") ";
                ok = false;
            }
        }
        // Finally release our hold on accessing the memory
        queue.enqueueUnmapMemObject(
            cBuffer,
            (void *) output);

        // There is no need to perform a finish on the final unmap
        // or release any objects as this all happens implicitly with
        // the C++ Wrapper API.

        if (!ok)
            return EXIT_FAILURE;
    } 
    catch (cl::Error &err) {
         std::cerr
             << "ERROR: "
             << err.what()
             << "("
             << err.err()
             << ")"
             << std::endl;

         return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}