/**
 * Build and run the "dot_product" OpenCL kernel.
 *
 * Creates a context on any available device, compiles @program_source,
 * uploads @srcA and @srcB (n cl_float4 elements each, byte-swapped to the
 * device's endianness first), runs the kernel over n work-items, and reads
 * the n cl_float results back into @dst (byte-swapped back to host order,
 * along with the inputs).
 *
 * @param program_source  OpenCL C source containing a "dot_product" kernel.
 * @param n               Number of float4 elements (must be even: the
 *                        local work size is hard-coded to 2).
 * @param srcA            First input array (n cl_float4); restored on success.
 * @param srcB            Second input array (n cl_float4); restored on success.
 * @param dst             Output array (room for n cl_float results).
 * @return 0 on success, -1 on any OpenCL failure.
 *
 * NOTE(review): on error paths taken after the byte-swap loop, srcA/srcB are
 * left in device byte order (the original code behaved the same way).
 */
int exec_dot_product_kernel(const char *program_source, int n, cl_float4 *srcA, cl_float4 *srcB, cl_float4 *dst)
{
  cl_context context;
  cl_command_queue cmd_queue;
  cl_device_id *devices;
  cl_program program;
  cl_kernel kernel;
  cl_mem memobjs[3];
  size_t global_work_size[1];
  size_t local_work_size[1];
  size_t cb;
  cl_int err;
  int i;

  /* create the OpenCL context on any available OCL device */
  context = clCreateContextFromType (NULL, CL_DEVICE_TYPE_ALL,
                                     NULL, NULL, NULL);
  if (context == (cl_context)0)
    return -1;

  /* get the list of devices associated with the context */
  clGetContextInfo (context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
  devices = (cl_device_id *) malloc (cb);
  clGetContextInfo (context, CL_CONTEXT_DEVICES, cb, devices, NULL);

  /* create a command-queue on the first device */
  cmd_queue = clCreateCommandQueue (context, devices[0], 0, NULL);
  if (cmd_queue == (cl_command_queue)0)
    {
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* swap host data to the device's byte order before uploading */
  for (i = 0; i < n; ++i)
    {
      poclu_bswap_cl_float_array (devices[0], (cl_float*)&srcA[i], 4);
      poclu_bswap_cl_float_array (devices[0], (cl_float*)&srcB[i], 4);
    }

  /* allocate the buffer memory objects.
     BUG FIX: every error path below now frees `devices`; the original
     leaked the malloc'd device list on all failures past this point. */
  memobjs[0] = clCreateBuffer (context,
                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof (cl_float4) * n, srcA, NULL);
  if (memobjs[0] == (cl_mem)0)
    {
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  memobjs[1] = clCreateBuffer (context,
                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof (cl_float4) * n, srcB, NULL);
  if (memobjs[1] == (cl_mem)0)
    {
      delete_memobjs (memobjs, 1);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  memobjs[2] = clCreateBuffer (context, CL_MEM_READ_WRITE,
                               sizeof (cl_float) * n, NULL, NULL);
  if (memobjs[2] == (cl_mem)0)
    {
      delete_memobjs (memobjs, 2);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* create the program */
  program = clCreateProgramWithSource (context, 1,
                                       (const char**)&program_source,
                                       NULL, NULL);
  if (program == (cl_program)0)
    {
      delete_memobjs (memobjs, 3);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* build the program */
  err = clBuildProgram (program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
    {
      delete_memobjs (memobjs, 3);
      clReleaseProgram (program);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* create the kernel */
  kernel = clCreateKernel (program, "dot_product", NULL);
  if (kernel == (cl_kernel)0)
    {
      delete_memobjs (memobjs, 3);
      clReleaseProgram (program);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* set the kernel arguments */
  err = clSetKernelArg (kernel, 0, sizeof (cl_mem), (void *) &memobjs[0]);
  err |= clSetKernelArg (kernel, 1, sizeof (cl_mem), (void *) &memobjs[1]);
  err |= clSetKernelArg (kernel, 2, sizeof (cl_mem), (void *) &memobjs[2]);
  if (err != CL_SUCCESS)
    {
      delete_memobjs (memobjs, 3);
      clReleaseKernel (kernel);
      clReleaseProgram (program);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* set work-item dimensions: n items in groups of 2 */
  global_work_size[0] = n;
  local_work_size[0] = 2;

  /* execute the kernel */
  err = clEnqueueNDRangeKernel (cmd_queue, kernel, 1, NULL,
                                global_work_size, local_work_size,
                                0, NULL, NULL);
  if (err != CL_SUCCESS)
    {
      delete_memobjs (memobjs, 3);
      clReleaseKernel (kernel);
      clReleaseProgram (program);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* read back the results (blocking read) */
  err = clEnqueueReadBuffer (cmd_queue, memobjs[2], CL_TRUE, 0,
                             n * sizeof (cl_float), dst, 0, NULL, NULL);
  if (err != CL_SUCCESS)
    {
      delete_memobjs (memobjs, 3);
      clReleaseKernel (kernel);
      clReleaseProgram (program);
      clReleaseCommandQueue (cmd_queue);
      clReleaseContext (context);
      free (devices);
      return -1;
    }

  /* swap everything back to host byte order */
  for (i = 0; i < n; ++i)
    {
      poclu_bswap_cl_float_array (devices[0], (cl_float*)&dst[i], 4);
      poclu_bswap_cl_float_array (devices[0], (cl_float*)&srcA[i], 4);
      poclu_bswap_cl_float_array (devices[0], (cl_float*)&srcB[i], 4);
    }
  free (devices);

  /* release kernel, program, and memory objects */
  delete_memobjs (memobjs, 3);
  clReleaseKernel (kernel);
  clReleaseProgram (program);
  clReleaseCommandQueue (cmd_queue);
  clReleaseContext (context);
  return 0; /* success */
}
int main(void) { cl_float A[BUFFER_SIZE]; cl_int R[WORK_ITEMS]; for (int i = 0; i < BUFFER_SIZE; i++) { A[i] = i; } for (int i = 0; i < WORK_ITEMS; i++) { R[i] = i; } try { std::vector<cl::Platform> platformList; // Pick platform cl::Platform::get(&platformList); // Pick first platform cl_context_properties cprops[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0}; cl::Context context(CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, cprops); // Query the set of devices attched to the context std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); // Create and program from source cl::Program::Sources sources(1, std::make_pair(kernelSourceCode, 0)); cl::Program program(context, sources); cl_device_id dev_id = devices.at(0)(); poclu_bswap_cl_float_array(dev_id, A, BUFFER_SIZE); poclu_bswap_cl_int_array(dev_id, R, WORK_ITEMS); // Build program program.build(devices); // Create buffer for A and copy host contents cl::Buffer aBuffer = cl::Buffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BUFFER_SIZE * sizeof(float), (void *) &A[0]); // Create buffer for that uses the host ptr C cl::Buffer cBuffer = cl::Buffer( context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, WORK_ITEMS * sizeof(int), (void *) &R[0]); // Create kernel object cl::Kernel kernel(program, "test_kernel"); // Set kernel args kernel.setArg(0, aBuffer); kernel.setArg(1, cBuffer); // Create command queue cl::CommandQueue queue(context, devices[0], 0); // Do the work queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(WORK_ITEMS), cl::NullRange); // Map cBuffer to host pointer. This enforces a sync with // the host backing space, remember we choose GPU device. 
int * output = (int *) queue.enqueueMapBuffer( cBuffer, CL_TRUE, // block CL_MAP_READ, 0, WORK_ITEMS * sizeof(int)); bool ok = true; for (int i = 0; i < WORK_ITEMS; i++) { float global_sum = 0.0f; int j; float result; result = global_sum; for (j=0; j < 32; ++j) { float value = poclu_bswap_cl_float (dev_id, A[i+j]); global_sum += value; } result = result + global_sum; for (j=0; j < 32; ++j) { float value = poclu_bswap_cl_float (dev_id, A[i+j]); global_sum += value; } result = result + global_sum; if ((int)result != poclu_bswap_cl_int (dev_id, R[i])) { std::cout << "F(" << i << ": " << (int)result << " != " << R[i] << ") "; ok = false; } } if (ok) return EXIT_SUCCESS; else return EXIT_FAILURE; // Finally release our hold on accessing the memory queue.enqueueUnmapMemObject( cBuffer, (void *) output); // There is no need to perform a finish on the final unmap // or release any objects as this all happens implicitly with // the C++ Wrapper API. } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; }