int main(int argc, char **argv) { cl_int err; const char *krn_src; cl_program empty, program; cl_context ctx; cl_device_id did; cl_command_queue queue; cl_uint num_krn; cl_kernel kernels[2]; poclu_get_any_device(&ctx, &did, &queue); TEST_ASSERT( ctx ); TEST_ASSERT( did ); TEST_ASSERT( queue ); /* Test creating a program from an empty source */ empty = clCreateProgramWithSource(ctx, 1, &empty_src, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource"); err = clBuildProgram(empty, 0, NULL, NULL, NULL, NULL); CHECK_OPENCL_ERROR_IN("clBuildProgram"); err = clCreateKernelsInProgram(empty, 0, NULL, &num_krn); CHECK_OPENCL_ERROR_IN("clCreateKernelsInProgram"); TEST_ASSERT(num_krn == 0); krn_src = poclu_read_file(SRCDIR "/tests/runtime/test_clCreateKernelsInProgram.cl"); TEST_ASSERT(krn_src); program = clCreateProgramWithSource(ctx, 1, &krn_src, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource"); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); CHECK_OPENCL_ERROR_IN("clBuildProgram"); err = clCreateKernelsInProgram(program, 0, NULL, &num_krn); CHECK_OPENCL_ERROR_IN("clCreateKernelsInProgram"); // test_clCreateKernelsInProgram.cl has two kernel functions. TEST_ASSERT(num_krn == 2); err = clCreateKernelsInProgram(program, 2, kernels, NULL); CHECK_OPENCL_ERROR_IN("clCreateKernelsInProgram"); // make sure the kernels were actually created // Note: nothing in the specification says which kernel function // is kernels[0], which is kernels[1]. For now assume pocl/LLVM // orders these deterministacally err = clEnqueueTask(queue, kernels[0], 0, NULL, NULL); CHECK_OPENCL_ERROR_IN("clEnqueueTask"); err = clFinish(queue); CHECK_OPENCL_ERROR_IN("clFinish"); err = clEnqueueTask(queue, kernels[1], 0, NULL, NULL); CHECK_OPENCL_ERROR_IN("clEnqueueTask"); err = clFinish(queue); CHECK_OPENCL_ERROR_IN("clFinish"); return EXIT_SUCCESS; }
/// Enqueues a kernel to execute using a single work-item. /// /// \see_opencl_ref{clEnqueueTask} event enqueue_task(const kernel &kernel, const wait_list &events = wait_list()) { BOOST_ASSERT(m_queue != 0); BOOST_ASSERT(kernel.get_context() == this->get_context()); event event_; // clEnqueueTask() was deprecated in OpenCL 2.0. In that case we // just forward to the equivalent clEnqueueNDRangeKernel() call. #ifdef CL_VERSION_2_0 size_t one = 1; cl_int ret = clEnqueueNDRangeKernel( m_queue, kernel, 1, 0, &one, &one, events.size(), events.get_event_ptr(), &event_.get() ); #else cl_int ret = clEnqueueTask( m_queue, kernel, events.size(), events.get_event_ptr(), &event_.get() ); #endif if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } return event_; }
void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue) { // 1D kernel: if (k.local_work_size(1) == 0) { #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl; std::cout << "ViennaCL: Global work size: '" << k.global_work_size() << "'..." << std::endl; std::cout << "ViennaCL: Local work size: '" << k.local_work_size() << "'..." << std::endl; #endif vcl_size_t tmp_global = k.global_work_size(); vcl_size_t tmp_local = k.local_work_size(); cl_int err; if (tmp_global == 1 && tmp_local == 1) err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL); else err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL); if (err != CL_SUCCESS) { std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl; std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl; VIENNACL_ERR_CHECK(err); } } else //2D or 3D kernel { #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) std::cout << "ViennaCL: Starting 2D/3D-kernel '" << k.name() << "'..." << std::endl; std::cout << "ViennaCL: Global work size: '" << k.global_work_size(0) << ", " << k.global_work_size(1) << ", " << k.global_work_size(2) << "'..." << std::endl; std::cout << "ViennaCL: Local work size: '" << k.local_work_size(0) << ", " << k.local_work_size(1) << ", " << k.local_work_size(2) << "'..." << std::endl; #endif vcl_size_t tmp_global[3]; tmp_global[0] = k.global_work_size(0); tmp_global[1] = k.global_work_size(1); tmp_global[2] = k.global_work_size(2); vcl_size_t tmp_local[3]; tmp_local[0] = k.local_work_size(0); tmp_local[1] = k.local_work_size(1); tmp_local[2] = k.local_work_size(2); cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), (tmp_global[2] == 0) ? 2 : 3, NULL, tmp_global, tmp_local, 0, NULL, NULL); if (err != CL_SUCCESS) { //could not start kernel with any parameters std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl; VIENNACL_ERR_CHECK(err); } } #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) queue.finish(); std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl; #endif } //enqueue()
void execute_kernel() { int err; cl_event kernel_event; /* Complete OpenGL processing */ glFinish(); /* Execute the kernel */ err = clEnqueueAcquireGLObjects(queue, 6, mem_objects, 0, NULL, NULL); if(err < 0) { perror("Couldn't acquire the GL objects"); exit(1); } err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } err = clWaitForEvents(1, &kernel_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } clEnqueueReleaseGLObjects(queue, 6, mem_objects, 0, NULL, NULL); clFinish(queue); clReleaseEvent(kernel_event); }
int main(void) { const char *source = "__kernel void main(int in, __global int *out) {\n" " out[0] = in + 1;\n" "}\n"; cl_command_queue command_queue; cl_context context; cl_device_id device; cl_int input = 1; cl_kernel kernel; cl_mem buffer; cl_platform_id platform; cl_program program; clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); command_queue = clCreateCommandQueue(context, device, 0, NULL); buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL); program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); clBuildProgram(program, 1, &device, "", NULL, NULL); kernel = clCreateKernel(program, "main", NULL); clSetKernelArg(kernel, 0, sizeof(cl_int), &input); clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer); clEnqueueTask(command_queue, kernel, 0, NULL, NULL); clFlush(command_queue); clFinish(command_queue); clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL); assert(input == 2); return EXIT_SUCCESS; }
cl_int WINAPI wine_clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, cl_event * event_wait_list, cl_event * event) { cl_int ret; TRACE("\n"); ret = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, event_wait_list, event); return ret; }
inline void Queue::runTask(cl_command_queue queue, cl_kernel kernel, cl_uint waitListSize, const cl_event* waitList, cl_event* event) { cl_int errorCode = clEnqueueTask(queue, kernel, waitListSize, waitList, event); verifyOutputCode(errorCode, "Error launching the task"); }
int main(int argc, char **argv) { cl_int err; const char *krn_src; cl_program program; cl_context ctx; cl_device_id did; cl_command_queue queue; cl_uint num_krn; cl_kernel kernels[2]; poclu_get_any_device(&ctx, &did, &queue); assert( ctx ); assert( did ); assert( queue ); krn_src = poclu_read_file(SRCDIR "/tests/runtime/test_clCreateKernelsInProgram.cl"); assert(krn_src); program = clCreateProgramWithSource(ctx, 1, &krn_src, NULL, NULL); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); assert(err == CL_SUCCESS); err = clCreateKernelsInProgram(program, 0, NULL, &num_krn); assert(err == CL_SUCCESS); // test_clCreateKernelsInProgram.cl has two kernel functions. assert(num_krn == 2); err = clCreateKernelsInProgram(program, 2, kernels, NULL); assert(err == CL_SUCCESS); // make sure the kernels were actually created // Note: nothing in the specification says which kernel function // is kernels[0], which is kernels[1]. For now assume pocl/LLVM // orders these deterministacally err = clEnqueueTask(queue, kernels[0], 0, NULL, NULL); assert(err == CL_SUCCESS); err = clEnqueueTask(queue, kernels[1], 0, NULL, NULL); assert(err == CL_SUCCESS); clFinish(queue); }
void value_profiler::check_value_on_device(ad_rule rule) { cl_mem dest; cl_int status; printf("Not Implemented "); exit(-1); //ad_setKernelArg(test_kernel,0,sizeof(cl_mem),(void *)&(rule.get_target_buff())); //! Action to be done //ad_setKernelArg(test_kernel,1,sizeof(cl_int),(void *)&(rule.get_target_buff())); clEnqueueTask(access_queue,test_kernel,0,NULL,NULL); }
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) { struct bench_args_t *args = (struct bench_args_t *)vargs; // Create device buffers // static unsigned *nzval_buffer = (unsigned int*)clSVMAllocAltera(context, 0, sizeof(args->nzval), 1024); static unsigned *cols_buffer = (unsigned int*)clSVMAllocAltera(context, 0, sizeof(args->cols), 1024); static unsigned *vec_buffer = (unsigned int*)clSVMAllocAltera(context, 0, sizeof(args->vec), 1024); static unsigned *out_buffer = (unsigned int*)clSVMAllocAltera(context, 0, sizeof(args->out), 1024); // Write our data set into device buffers // memcpy(nzval_buffer, args->nzval, sizeof(args->nzval)); memcpy(cols_buffer, args->cols, sizeof(args->cols)); memcpy(vec_buffer, args->vec, sizeof(args->vec)); // Set the arguments to our compute kernel // int status; status = clSetKernelArgSVMPointerAltera(kernel, 0, (void*)nzval_buffer); status |= clSetKernelArgSVMPointerAltera(kernel, 1, (void*)cols_buffer); status |= clSetKernelArgSVMPointerAltera(kernel, 2, (void*)vec_buffer); status |= clSetKernelArgSVMPointerAltera(kernel, 3, (void*)out_buffer); if(status != CL_SUCCESS) { dump_error("Failed set args.", status); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef OPENCL_KERNEL status = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else printf("Error: C kernel is not currently supported!\n"); exit(1); #endif if (status) { printf("Error: Failed to execute kernel! %d\n", status); printf("Test failed\n"); exit(1); } clFinish(commands); // Read back the results from the device to verify the output // memcpy(args->out, out_buffer, sizeof(args->out)); }
int oclFluid3D::compile() { clInitFluid = 0; clIntegrateForce = 0; clIntegrateVelocity = 0; clHash = 0; clReorder = 0; clInitBounds = 0; if (!mRadixSort.compile()) { return 0; } if (!oclProgram::compile()) { return 0; } clInitFluid = createKernel("clInitFluid"); KERNEL_VALIDATE(clInitFluid) clIntegrateForce = createKernel("clIntegrateForce"); KERNEL_VALIDATE(clIntegrateForce) clIntegrateVelocity = createKernel("clIntegrateVelocity"); KERNEL_VALIDATE(clIntegrateVelocity) clHash = createKernel("clHash"); KERNEL_VALIDATE(clHash) clReorder = createKernel("clReorder"); KERNEL_VALIDATE(clReorder) clInitBounds = createKernel("clInitBounds"); KERNEL_VALIDATE(clInitBounds) clFindBounds = createKernel("clFindBounds"); KERNEL_VALIDATE(clFindBounds) clCalculateDensity = createKernel("clCalculateDensity"); KERNEL_VALIDATE(clCalculateDensity) clCalculateForces = createKernel("clCalculateForces"); KERNEL_VALIDATE(clCalculateForces) clGravity = createKernel("clGravity"); KERNEL_VALIDATE(clGravity) clClipBox = createKernel("clClipBox"); KERNEL_VALIDATE(clClipBox) // init fluid parameters clSetKernelArg(clInitFluid, 0, sizeof(cl_mem), bfParams); clEnqueueTask(mContext.getDevice(0), clInitFluid, 0, NULL, clInitFluid.getEvent()); bfParams.map(CL_MAP_READ); return bindBuffers(); }
int _tmain(int argc, _TCHAR* argv[]) { cl_int ret; o2o_init(); o2o_create_cmd_queue(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); o2o_create_program_from_source(kernel1); o2o_build_program(); o2o_create_kernel("kernel1"); size_t k_size = strlen(kernel2); cl_program p2 = clCreateProgramWithSource(ocl_ctx, 1, &kernel2, &k_size, &ret); CHECK(ret); ret = clBuildProgram(p2, 1, &d_id, NULL, NULL, NULL); CHECK(ret); cl_kernel k2 = clCreateKernel(p2, "kernel2", &ret); CHECK(ret); ret = clEnqueueTask(cmd_q, kernel, 0, NULL, NULL); CHECK(ret); ret = clEnqueueTask(cmd_q, k2, 0, NULL, NULL); CHECK(ret); o2o_finalize(); printf("... Program Done\n"); getchar(); return 0; }
bool piglit_cl_enqueue_task(cl_command_queue command_queue, cl_kernel kernel) { cl_int errNo; errNo = clEnqueueTask(command_queue, kernel, 0, NULL, NULL); if(!piglit_cl_check_error(errNo, CL_SUCCESS)) { fprintf(stderr, "Could not enqueue task: %s\n", piglit_cl_get_error_name(errNo)); return false; } return true; }
int main(void) { const char *source = /* kernel pointer arguments must be __global, __constant, or __local. */ /* https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/restrictions.html */ "__kernel void increment(__global int *out) {\n" " out[0]++;\n" "}\n"; cl_command_queue command_queue; cl_context context; cl_device_id device; cl_int input = 1; cl_kernel kernel; cl_mem buffer; cl_platform_id platform; cl_program program; /* Run kernel. */ clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); clBuildProgram(program, 1, &device, "", NULL, NULL); /* The name of the kernel function we want to call. */ kernel = clCreateKernel(program, "increment", NULL); buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), &input, NULL); clSetKernelArg(kernel, 0, sizeof(buffer), &buffer); command_queue = clCreateCommandQueue(context, device, 0, NULL); clEnqueueTask(command_queue, kernel, 0, NULL, NULL); clFlush(command_queue); clFinish(command_queue); clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(input), &input, 0, NULL, NULL); /* Asserts. */ assert(input == 2); /* Cleanup. */ clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); clReleaseMemObject(buffer); return EXIT_SUCCESS; }
int main(int argc, char** argv) { if (argc < 2) { printf("Missing required argument input.\n"); printf("Usage: %s input\n", argv[0]); return -1; } int input = atoi(argv[1]); // 8 is an arbitrary maximum number of platforms. cl_uint num_entries = 8; cl_platform_id* platforms = malloc(num_entries * sizeof (cl_platform_id)); cl_uint num_platforms = -1; clGetPlatformIDs(num_entries, platforms, &num_platforms); cl_uint num_devices = -1; cl_device_id* devices = malloc(num_entries * sizeof (cl_device_id)); clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, num_entries, devices, &num_devices); for (int i = 0; i < num_devices; i++) { size_t device_type_size = sizeof(cl_device_type); cl_device_type* device_type = malloc(device_type_size); clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, device_type_size, device_type, NULL); if (device_type[0] == CL_DEVICE_TYPE_GPU) { cl_context context = clCreateContext(NULL, 1, &devices[i], NULL, NULL, NULL); cl_command_queue command_queue = clCreateCommandQueue(context, devices[i], 0, NULL); cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof (cl_int), NULL, NULL); const char* source = "__kernel void increment(int in, __global int* out) { out[0] = in + 1; }"; cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); clBuildProgram(program, 1, &devices[i], "", NULL, NULL); cl_kernel kernel = clCreateKernel(program, "increment", NULL); clSetKernelArg(kernel, 0, sizeof(cl_int), &input); clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer); clEnqueueTask(command_queue, kernel, 0, NULL, NULL); clFlush(command_queue); clFinish(command_queue); cl_int kernel_result = 0; clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof (cl_int), &kernel_result, 0, NULL, NULL); printf("%i\n", kernel_result); } free(device_type); } free(devices); free(platforms); return 0; }
void execute_device(){ int err; #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else size_t global[1]; // global domain size for our calculation size_t local[1]; // local domain size for our calculation global[0] = 1; local[0] = 1; err = clEnqueueNDRangeKernel(commands, kernel_in, 1, NULL, (size_t*)global, (size_t*)local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel_in! %d\n", err); printf("Test failed\n"); exit(1); } err = clEnqueueNDRangeKernel(commands, kernel_inter, 1, NULL, (size_t*)global, (size_t*)local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel_inter! %d\n", err); printf("Test failed\n"); exit(1); } err = clEnqueueNDRangeKernel(commands, kernel_out, 1, NULL, (size_t*)global, (size_t*)local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel_out! %d\n", err); printf("Test failed\n"); exit(1); } #endif clFinish(commands); }
int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, err; /* Data and buffers */ float shuffle1[8]; char shuffle2[16]; cl_mem shuffle1_buffer, shuffle2_buffer; /* Create a context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ shuffle1_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(shuffle1), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; shuffle2_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(shuffle2), NULL, &err); /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &shuffle1_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; clSetKernelArg(kernel, 1, sizeof(cl_mem), &shuffle2_buffer); /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read and print the result */ err = clEnqueueReadBuffer(queue, shuffle1_buffer, CL_TRUE, 0, sizeof(shuffle1), &shuffle1, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } clEnqueueReadBuffer(queue, shuffle2_buffer, CL_TRUE, 0, sizeof(shuffle2), &shuffle2, 0, NULL, NULL); printf("Shuffle1: "); for(i=0; i<7; i++) { printf("%.2f, ", shuffle1[i]); } printf("%.2f\n", shuffle1[7]); printf("Shuffle2: "); for(i=0; i<16; i++) { printf("%c", shuffle2[i]); } printf("\n"); /* Deallocate resources */ clReleaseMemObject(shuffle1_buffer); clReleaseMemObject(shuffle2_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int test_context(cl_context ctx, const char *prog_src, int mul, int ndevs, cl_device_id *devs) { cl_int err; cl_command_queue queue[ndevs]; cl_program prog; cl_kernel krn; cl_mem buf; cl_event evt[ndevs]; cl_int i; prog = clCreateProgramWithSource(ctx, 1, &prog_src, NULL, &err); CHECK_OPENCL_ERROR_IN("create program"); CHECK_CL_ERROR(clBuildProgram(prog, 0, NULL, NULL, NULL, NULL)); krn = clCreateKernel(prog, "setidx", &err); CHECK_OPENCL_ERROR_IN("create kernel"); buf = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, ndevs*sizeof(cl_int), NULL, &err); CHECK_OPENCL_ERROR_IN("create buffer"); CHECK_CL_ERROR(clSetKernelArg(krn, 0, sizeof(cl_mem), &buf)); /* create one queue per device, and submit task, waiting for all * previous */ for (i = 0; i < ndevs; ++i) { queue[i] = clCreateCommandQueue(ctx, devs[i], 0, &err); CHECK_OPENCL_ERROR_IN("create queue"); err = clSetKernelArg(krn, 1, sizeof(i), &i); CHECK_OPENCL_ERROR_IN("set kernel arg 1"); // no wait list for first (root) device err = clEnqueueTask(queue[i], krn, i, i ? evt : NULL, evt + i); CHECK_OPENCL_ERROR_IN("submit task"); } /* enqueue map on last */ cl_int *buf_host = clEnqueueMapBuffer(queue[ndevs - 1], buf, CL_TRUE, CL_MAP_READ, 0, ndevs*sizeof(cl_int), ndevs, evt, NULL, &err); CHECK_OPENCL_ERROR_IN("map buffer"); int mismatch = 0; for (i = 0; i < ndevs; ++i) { mismatch += !!(buf_host[i] != i*mul); } TEST_ASSERT(mismatch == 0); /* enqueue unmap on first */ CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue[0], buf, buf_host, 0, NULL, NULL)); for (i = 0 ; i < ndevs; ++i) { err = clFinish(queue[i]); err |= clReleaseCommandQueue(queue[i]); err |= clReleaseEvent(evt[i]); } err |= clReleaseKernel(krn); err |= clReleaseMemObject(buf); err |= clReleaseProgram(prog); err |= clReleaseContext(ctx); CHECK_OPENCL_ERROR_IN("cleanup"); return CL_SUCCESS; }
int main() { cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; char string[MEM_SIZE]; FILE *fp; char fileName[] = "./hello.cl"; char *source_str; size_t source_size; /* Load the source code containing the kernel*/ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); /* Get Platform and Device Info */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* Create OpenCL context */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* Create Command Queue */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* Create Memory Buffer */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE,MEM_SIZE * sizeof(char), NULL, &ret); /* Create Kernel Program from the source */ program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); /* Build Kernel Program */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create OpenCL Kernel */ kernel = clCreateKernel(program, "hello", &ret); /* Set OpenCL Kernel Arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); /* Execute OpenCL Kernel */ ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL); /* Copy results from the memory buffer */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(char),string, 0, NULL, NULL); /* Display Result */ puts(string); /* Finalization */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(source_str); return 0; }
int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int err; /* Data and buffers */ float reflect[4]; cl_mem reflect_buffer; float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}; float u[4] = {0.0f, 5.0f, 0.0f, 0.0f}; /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program */ program = build_program(context, device, PROGRAM_FILE); /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create buffer */ reflect_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 4*sizeof(float), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(x), x); err |= clSetKernelArg(kernel, 1, sizeof(u), u); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &reflect_buffer); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read and print the result */ err = clEnqueueReadBuffer(queue, reflect_buffer, CL_TRUE, 0, sizeof(reflect), reflect, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } printf("\nResult: %f %f %f %f\n", reflect[0], reflect[1], reflect[2], reflect[3]); /* Deallocate resources */ clReleaseMemObject(reflect_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main(void) { cl_platform_id platform_id = NULL; cl_uint ret_num_platforms; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj_in = NULL; cl_mem memobj_out = NULL; cl_program program = NULL; cl_kernel kernel = NULL; size_t kernel_code_size; char *kernel_src_str; float *result; cl_int ret; FILE *fp; int data_num = sizeof(stock_array1) / sizeof(stock_array1[0]); int window_num = (int)WINDOW_SIZE; int i; /* Allocate space to read in kernel code */ kernel_src_str = (char *)malloc(MAX_SOURCE_SIZE); /* Allocate space for the result on the host side */ result = (float *)malloc(data_num*sizeof(float)); printf("starting/n"); /* Get Platform */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); /* Get Device */ ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); /* status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (numDevices == 0) //no GPU available. { cout << "No GPU device available."<<endl; cout << "Choose CPU as default device."<<endl; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL); } */ /* Create Context */ context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); /* Create Command Queue */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); printf("after create command queue/n"); /* Read Kernel Code */ fp = fopen("moving_average.cl", "r"); kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Create Program Object */ program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret); /* Compile kernel */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create Kernel */ kernel = clCreateKernel(program, "moving_average", &ret); /* Create buffer for the input data on the device */ memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE, data_num * sizeof(int), NULL, &ret); /* Create buffer for the result on the device */ memobj_out = clCreateBuffer(context, CL_MEM_READ_WRITE, data_num * sizeof(float), NULL, &ret); /* Copy input data to the global memory on the device*/ ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, data_num * sizeof(int), stock_array1, 0, NULL, NULL); /* Set kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj_in); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_out); ret = clSetKernelArg(kernel, 2, sizeof(int), (void *)&data_num); ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&window_num); /* Execute the kernel */ ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL); /* Copy result from device to host */ ret = clEnqueueReadBuffer(command_queue, memobj_out, CL_TRUE, 0, data_num * sizeof(float), result, 0, NULL, NULL); /* OpenCL Object Finalization */ ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj_in); ret = clReleaseMemObject(memobj_out); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); /* Display Results */ for (i=0; i < data_num; i++) { printf("result[%d] = %f\n", i, result[i]); } /* Deallocate memory on the host */ free(result); free(kernel_src_str); return 0; }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem Amobj = NULL; cl_mem Bmobj = NULL; cl_mem Cmobj = NULL; cl_program program = NULL; cl_kernel kernel[4] = {NULL, NULL, NULL, NULL}; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; int i, j; float* A; float* B; float* C; A = (float*)malloc(4*4*sizeof(float)); B = (float*)malloc(4*4*sizeof(float)); C = (float*)malloc(4*4*sizeof(float)); FILE *fp; const char fileName[] = "./taskParallel.cl"; size_t source_size; char *source_str; /* Load kernel source file */ fp = fopen(fileName, "rb"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Initialize input data */ for (i=0; i < 4; i++) { for (j=0; j < 4; j++) { A[i*4+j] = i*4+j+1; B[i*4+j] = j*4+i+1; } } /* Get platform/device information */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* Create OpenCL Context */ context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); /* Create command queue */ command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret); /* Create buffer object */ Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); /* Copy input data to memory buffer */ ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL); /* Create kernel from source */ program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create task parallel OpenCL kernel */ kernel[0] = clCreateKernel(program, "taskParallelAdd", &ret); kernel[1] = clCreateKernel(program, "taskParallelSub", &ret); kernel[2] = clCreateKernel(program, "taskParallelMul", &ret); kernel[3] = clCreateKernel(program, "taskParallelDiv", &ret); /* Set OpenCL kernel arguments */ for (i=0; i < 4; i++) { ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj); ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj); ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj); } /* Execute OpenCL kernel as task parallel */ for (i=0; i < 4; i++) { ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL); } /* Copy result to host */ ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL); /* Display result */ for (i=0; i < 4; i++) { for (j=0; j < 4; j++) { printf("%7.2f ", C[i*4+j]); } printf("\n"); } /* Finalization */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel[0]); ret = clReleaseKernel(kernel[1]); ret = clReleaseKernel(kernel[2]); ret = clReleaseKernel(kernel[3]); ret = clReleaseProgram(program); ret = clReleaseMemObject(Amobj); ret = clReleaseMemObject(Bmobj); ret = clReleaseMemObject(Cmobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(source_str); free(A); free(B); free(C); return 0; }
int main(int argc, char *argv[]) { #ifdef DEBUG printf("Argument count = [%d]\n", argc); #endif if(argc!=2) { printf("Expecting one argument!\n"); exit(1); } if(argv[1]==NULL) { printf("Expecting one non-null argument!\n"); exit(1); } char *progName = argv[1]; char fileName[100]; sprintf(fileName, "./target/%s.cl",progName); printf("Using kernel file [%s], with kernel name [%s]\n", fileName, progName); cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float *result; int i; cl_mem image, out; cl_bool support; cl_image_format fmt; int num_out = 9; FILE *fp; char *source_str; size_t source_size, r_size; int mem_size = sizeof(cl_float4) * num_out; /*load the source code containing the kernel*/ fp = fopen (fileName, "r"); if (!fp) { fprintf(stderr, "failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /*Get platform and device info*/ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("ret_num_platforms = %d\n", ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1 ,&device_id, &ret_num_devices); printf("ret_num_platforms = %d\n", ret_num_platforms); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); result = (float*) malloc(mem_size); //check image support clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, &r_size); if (support != CL_TRUE) { puts("image not supported"); return 1; } command_queue = clCreateCommandQueue(context, device_id, 0, &ret); printf("queue ret = %d\n", ret); out = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size, NULL, &ret); printf("create buffer ret = %d\n", ret); fmt.image_channel_order = CL_R; fmt.image_channel_data_type = CL_FLOAT; image = clCreateImage2D(context, CL_MEM_READ_ONLY, &fmt, 4, 4, 0, 0, NULL); size_t origin[] = {0,0,0}; size_t region[] = {4,4,1}; float data[] = { 10,20,30,40, 10,20,30,40, 10,20,30,40, 10,20,30,40 }; clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region, 4*sizeof(float), 0, data, 0, NULL, NULL); program = clCreateProgramWithSource(context, 1, (const char**) &source_str, (const size_t*) &source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); printf("build program ret = %d\n", ret); kernel = clCreateKernel(program, progName, &ret); printf("create kernel ret = %d\n", ret); //How to set int arguments? ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &image); printf("arg 0 ret = %d\n", ret); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &out); printf("arg 1 ret = %d\n", ret); cl_event ev; ret = clEnqueueTask(command_queue, kernel, 0, NULL, &ev); //How to read a int? ret = clEnqueueReadBuffer(command_queue, out, CL_TRUE, 0, mem_size, result, 0, NULL, NULL); for(int i=0; i < num_out; i++) { printf("%f,%f,%f,%f\n", result[i*4+0], result[i*4+1], result[i*4+2], result[i*4+3]); } ret=clFlush(command_queue); ret=clFinish(command_queue); ret=clReleaseKernel(kernel); ret=clReleaseProgram(program); ret=clReleaseMemObject(out); ret=clReleaseMemObject(image); ret=clReleaseCommandQueue(command_queue); ret=clReleaseContext(context); free(source_str); printf("\n"); return 0; }
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) { struct bench_args_t *args = (struct bench_args_t *)vargs; int num_jobs = 1 << 16; char* seqA_batch = (char *)malloc(sizeof(args->seqA) * num_jobs); char* seqB_batch = (char *)malloc(sizeof(args->seqB) * num_jobs); char* alignedA_batch = (char *)malloc(sizeof(args->alignedA) * num_jobs); char* alignedB_batch = (char *)malloc(sizeof(args->alignedB) * num_jobs); int i; for (i=0; i<num_jobs; i++) { memcpy(seqA_batch + i*sizeof(args->seqA), args->seqA, sizeof(args->seqA)); memcpy(seqB_batch + i*sizeof(args->seqB), args->seqB, sizeof(args->seqB)); memcpy(alignedA_batch + i*sizeof(args->alignedA), args->alignedA, sizeof(args->alignedA)); memcpy(alignedB_batch + i*sizeof(args->alignedB), args->alignedB, sizeof(args->alignedB)); } // 0th: initialize the timer at the beginning of the program timespec timer = tic(); // Create device buffers // cl_mem seqA_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->seqA)*num_jobs, NULL, NULL); cl_mem seqB_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->seqB)*num_jobs, NULL, NULL); cl_mem alignedA_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->alignedA)*num_jobs, NULL, NULL); cl_mem alignedB_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->alignedB)*num_jobs, NULL, NULL); if (!seqA_buffer || !seqB_buffer || !alignedA_buffer || !alignedB_buffer) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); exit(1); } // 1st: time of buffer allocation toc(&timer, "buffer allocation"); // Write our data set into device buffers // int err; err = clEnqueueWriteBuffer(commands, seqA_buffer, CL_TRUE, 0, sizeof(args->seqA)*num_jobs, seqA_batch, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, seqB_buffer, CL_TRUE, 0, sizeof(args->seqB)*num_jobs, seqB_batch, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to device memory!\n"); printf("Test failed\n"); exit(1); } // 2nd: time of pageable-pinned memory copy toc(&timer, "memory copy"); // Set the arguments to our compute kernel // err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &seqA_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &seqB_buffer); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &alignedA_buffer); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &alignedB_buffer); err |= clSetKernelArg(kernel, 4, sizeof(int), &num_jobs); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); exit(1); } // 3rd: time of setting arguments toc(&timer, "set arguments"); // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else printf("Error: OpenCL kernel is not currently supported!\n"); exit(1); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); exit(1); } // 4th: time of kernel execution clFinish(commands); toc(&timer, "kernel execution"); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, alignedA_buffer, CL_TRUE, 0, sizeof(args->alignedA)*num_jobs, alignedA_batch, 0, NULL, NULL ); err |= clEnqueueReadBuffer( commands, alignedB_buffer, CL_TRUE, 0, sizeof(args->alignedB)*num_jobs, alignedB_batch, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); exit(1); } // 5th: time of data retrieving (PCIe + memcpy) toc(&timer, "data retrieving"); // memcpy(args->alignedA, alignedA_batch, sizeof(args->alignedA)); // memcpy(args->alignedB, alignedB_batch, sizeof(args->alignedB)); for (i=0; i<sizeof(args->alignedA); i++) { args->alignedA[i] = 'a'; } for (i=0; i<sizeof(args->alignedB); i++) { args->alignedB[i] = 'b'; } free(seqA_batch); free(seqB_batch); free(alignedA_batch); free(alignedB_batch); }
void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue) { // 1D kernel: if (k.local_work_size(1) == 0) { #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl; std::cout << "ViennaCL: Global work size: '" << k.global_work_size() << "'..." << std::endl; std::cout << "ViennaCL: Local work size: '" << k.local_work_size() << "'..." << std::endl; #endif size_t tmp_global = k.global_work_size(); size_t tmp_local = k.local_work_size(); cl_int err; if (tmp_global == 1 && tmp_local == 1) err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL); else err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL); if (err != CL_SUCCESS) //if not successful, try to start with smaller work size { //std::cout << "FAIL: " << std::endl; exit(0); while (err != CL_SUCCESS && tmp_local > 1) { //std::cout << "Flushing queue, then enqueuing again with half the size..." << std::endl; //std::cout << "Error code: " << err << std::endl; tmp_global /= 2; tmp_local /= 2; #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) std::cout << "ViennaCL: Kernel start failed for '" << k.name() << "'." << std::endl; std::cout << "ViennaCL: Global work size: '" << tmp_global << "'..." << std::endl; std::cout << "ViennaCL: Local work size: '" << tmp_local << "'..." << std::endl; #endif queue.finish(); err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL); } if (err != CL_SUCCESS) { //could not start kernel with any parameters std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl; std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl; VIENNACL_ERR_CHECK(err); } else { //remember parameters: k.local_work_size(0, tmp_local); k.global_work_size(0, tmp_global); #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) std::cout << "ViennaCL: Kernel '" << k.name() << "' now uses global work size " << tmp_global << " and local work size " << tmp_local << "." << std::endl; #endif } } } else //2D kernel { #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) std::cout << "ViennaCL: Starting 2D-kernel '" << k.name() << "'..." << std::endl; std::cout << "ViennaCL: Global work size: '" << k.global_work_size(0) << ", " << k.global_work_size(1) << "'..." << std::endl; std::cout << "ViennaCL: Local work size: '" << k.local_work_size(0) << ", " << k.local_work_size(1) << "'..." << std::endl; #endif size_t tmp_global[2]; tmp_global[0] = k.global_work_size(0); tmp_global[1] = k.global_work_size(1); size_t tmp_local[2]; tmp_local[0] = k.local_work_size(0); tmp_local[1] = k.local_work_size(1); cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 2, NULL, tmp_global, tmp_local, 0, NULL, NULL); if (err != CL_SUCCESS) { //could not start kernel with any parameters std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl; VIENNACL_ERR_CHECK(err); } } #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) queue.finish(); std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl; #endif } //enqueue()
int main() { cl_platform_id platform = NULL; cl_device_id device = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_int status = 0; cl_event task_event, map_event; cl_device_type dType = CL_DEVICE_TYPE_GPU; cl_int image_width, image_height; cl_float4 *result; int i, j; cl_mem clImage, out; cl_bool support; int pixels_read = 8; //Setup the OpenCL Platform, //Get the first available platform. Use it as the default platform status = clGetPlatformIDs(1, &platform, NULL); LOG_OCL_ERROR(status, "clGetPlatformIDs Failed" ); //Get the first available device status = clGetDeviceIDs (platform, dType, 1, &device, NULL); LOG_OCL_ERROR(status, "clGetDeviceIDs Failed" ); /*Check if the device support images */ clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, NULL); if (support != CL_TRUE) { std::cout <<"IMAGES not supported\n"; return 1; } //Create an execution context for the selected platform and device. cl_context_properties contextProperty[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( contextProperty, dType, NULL, NULL, &status); LOG_OCL_ERROR(status, "clCreateContextFromType Failed" ); /*Create command queue*/ command_queue = clCreateCommandQueue(context, device, 0, &status); LOG_OCL_ERROR(status, "clCreateCommandQueue Failed" ); /* Create Image Object */ //Create OpenCL device input image with the format and descriptor as below cl_image_format image_format; image_format.image_channel_data_type = CL_FLOAT; image_format.image_channel_order = CL_R; //We create a 5 X 5 2D image image_width = 5; image_height = 5; cl_image_desc image_desc; image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; image_desc.image_width = image_width; image_desc.image_height = image_height; image_desc.image_depth = 1; image_desc.image_array_size = 1; image_desc.image_row_pitch = image_width*sizeof(float); image_desc.image_slice_pitch = 25*sizeof(float); image_desc.num_mip_levels = 0; image_desc.num_samples = 0; image_desc.buffer = NULL; /* Create output buffer */ out = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*pixels_read, NULL, &status); LOG_OCL_ERROR(status, "clCreateBuffer Failed" ); size_t origin[] = {0,0,0}; /* Transfer target coordinate*/ size_t region[] = {image_width,image_height,1}; /* Size of object to be transferred */ float *data = (float *)malloc(image_width*image_height*sizeof(float)); float pixels[] = { /* Transfer Data */ 10, 20, 10, 40, 50, 10, 20, 20, 40, 50, 10, 20, 30, 40, 50, 10, 20, 40, 40, 50, 10, 20, 50, 40, 50 }; memcpy(data, pixels, image_width*image_height*sizeof(float)); clImage = clCreateImage(context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, &image_format, &image_desc, pixels, &status); LOG_OCL_ERROR(status, "clCreateImage Failed" ); /* If the image was not created using CL_MEM_USE_HOST_PTR, then you can write the image data to the device using the clEnqueueWriteImage function. */ //status = clEnqueueWriteImage(command_queue, clImage, CL_TRUE, origin, region, 5*sizeof(float), 25*sizeof(float), data, 0, NULL, NULL); //LOG_OCL_ERROR(status, "clCreateBuffer Failed" ); /* Build program */ program = clCreateProgramWithSource(context, 1, (const char **)&sample_image_kernel, NULL, &status); LOG_OCL_ERROR(status, "clCreateProgramWithSource Failed" ); // Build the program status = clBuildProgram(program, 1, &device, "", NULL, NULL); LOG_OCL_ERROR(status, "clBuildProgram Failed" ); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) LOG_OCL_COMPILER_ERROR(program, device); LOG_OCL_ERROR(status, "clBuildProgram Failed" ); } printf("Printing the image pixels\n"); for (i=0; i<image_height; i++) { for (j=0; j<image_width; j++) { printf("%f ",data[i*image_width +j]); } printf("\n"); } //Create kernel and set the kernel arguments kernel = clCreateKernel(program, "image_test", &status); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&out); /*********Image sampler with image repeated at every 1.0 normalized coordinate***********/ /*If host side sampler is not required the sampler objects can also be created on the kernel code. Don't pass the thirsd argument to the kernel and create a sample object as shown below in the kernel code*/ //const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; cl_sampler sampler = clCreateSampler (context, CL_TRUE, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST, &status); clSetKernelArg(kernel, 2, sizeof(cl_sampler), (void*)&sampler); //Enqueue the kernel status = clEnqueueTask(command_queue, kernel, 0, NULL, &task_event); LOG_OCL_ERROR(status, "clEnqueueTask Failed" ); /* Map the result back to host address */ result = (cl_float4*)clEnqueueMapBuffer(command_queue, out, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float4)*pixels_read, 1, &task_event, &map_event, &status); printf(" SAMPLER mode set to CL_ADDRESS_REPEAT | CL_FILTER_NEAREST\n"); printf("\nPixel values retreived based on the filter and Addressing mode selected\n"); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[0].s[0],result[0].s[1],result[0].s[2],result[0].s[3]); printf("(float2)(0.8f,0.5f) = %f,%f,%f,%f\n",result[1].s[0],result[1].s[1],result[1].s[2],result[1].s[3]); printf("(float2)(1.3f,0.5f) = %f,%f,%f,%f\n",result[2].s[0],result[2].s[1],result[2].s[2],result[2].s[3]); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[3].s[0],result[3].s[1],result[3].s[2],result[3].s[3]); printf("(float2)(0.5f,0.8f) = %f,%f,%f,%f\n",result[4].s[0],result[4].s[1],result[4].s[2],result[4].s[3]); printf("(float2)(0.5f,1.3f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(4.5f,0.5f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(5.0f,0.5f) = %f,%f,%f,%f\n",result[7].s[0],result[7].s[1],result[7].s[2],result[7].s[3]); clEnqueueUnmapMemObject(command_queue, out, result, 0, NULL, NULL); clReleaseSampler(sampler); /*********Image sampler with image mirrored at every 1.0 normalized coordinate***********/ sampler = clCreateSampler (context, CL_TRUE, CL_ADDRESS_MIRRORED_REPEAT, CL_FILTER_LINEAR, &status); clSetKernelArg(kernel, 2, sizeof(cl_sampler), (void*)&sampler); //Enqueue the kernel status = clEnqueueTask(command_queue, kernel, 0, NULL, &task_event); LOG_OCL_ERROR(status, "clEnqueueTask Failed" ); /* Map the result back to host address */ result = (cl_float4*)clEnqueueMapBuffer(command_queue, out, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float4)*pixels_read, 1, &task_event, &map_event, &status); printf(" SAMPLER mode set to CL_ADDRESS_MIRRORED_REPEAT | CL_FILTER_LINEAR\n"); printf("\nPixel values retreived based on the filter and Addressing mode selected\n"); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[0].s[0],result[0].s[1],result[0].s[2],result[0].s[3]); printf("(float2)(0.8f,0.5f) = %f,%f,%f,%f\n",result[1].s[0],result[1].s[1],result[1].s[2],result[1].s[3]); printf("(float2)(1.3f,0.5f) = %f,%f,%f,%f\n",result[2].s[0],result[2].s[1],result[2].s[2],result[2].s[3]); printf("(float2)(0.5f,0.5f) = %f,%f,%f,%f\n",result[3].s[0],result[3].s[1],result[3].s[2],result[3].s[3]); printf("(float2)(0.5f,0.8f) = %f,%f,%f,%f\n",result[4].s[0],result[4].s[1],result[4].s[2],result[4].s[3]); printf("(float2)(0.5f,1.3f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(4.5f,0.5f) = %f,%f,%f,%f\n",result[5].s[0],result[5].s[1],result[5].s[2],result[5].s[3]); printf("(float2)(5.0f,0.5f) = %f,%f,%f,%f\n",result[7].s[0],result[7].s[1],result[7].s[2],result[7].s[3]); clEnqueueUnmapMemObject(command_queue, out, result, 0, NULL, NULL); clReleaseSampler(sampler); /********************/ //Free All OpenCL objects. clReleaseMemObject(out); clReleaseMemObject(clImage); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); return 0; }
void Device::scoreCandidates(eObj *e) { //e->iNumBufferedCandidates = 0; //return; //MEA: static? static cObj* p; //static size_t iNumBlocks; static size_t stGlobalDim; static size_t globalTransDim = Tempest::mround(Tempest::data.iNumMS2Bins, this->transform_size); static float fElapsedTime; long lSpectrumOffset = e->lIndex*Tempest::data.iNumMS2Bins; long lScratchOffset = (long)Tempest::data.iCrossCorrelationWidth; long lNoOffset = 0; int err; cl_ulong start; cl_ulong end; err = clEnqueueWriteBuffer(clCommandQueue, cl_cCandidates, CL_FALSE, 0, sizeof(cObj) * e->iNumBufferedCandidates, e->candidateBuffer, 0, NULL, &(e->clEventSent)); Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to copy candidate data from host to GPU"); stGlobalDim = Tempest::mround(Tempest::data.host_iPeakCounts[e->lIndex], this->build_size); cl_mem spectrumBuffer; std::map<long,cl_mem>::iterator s2bElem = spectrum2buffer.find(e->lIndex); if (s2bElem == spectrum2buffer.end()) { //spectrum not cached if (!unusedBuffers.empty()) { spectrumBuffer = unusedBuffers.top(); unusedBuffers.pop(); } else { spectrumBuffer = spectrum2buffer.begin()->second; spectrum2buffer.erase(spectrum2buffer.begin()); } spectrum2buffer[e->lIndex] = spectrumBuffer; //initialize buffer err = clEnqueueCopyBuffer(clCommandQueue, cl_init_fSpectra, spectrumBuffer, 0, 0, Tempest::data.iNumMS2Bins*sizeof(cl_float), 0, NULL, Tempest::config.profile ? &memsetEvent : NULL); //Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to clear spectrum memory"); if (err != 0) { //memory cap reached. Stop filling new buffers. unusedBuffers = std::stack<cl_mem>(); spectrumBuffer = spectrum2buffer.begin()->second; spectrum2buffer.erase(spectrum2buffer.begin()); spectrum2buffer[e->lIndex] = spectrumBuffer; err = clEnqueueCopyBuffer(clCommandQueue, cl_init_fSpectra, spectrumBuffer, 0, 0, Tempest::data.iNumMS2Bins*sizeof(cl_float), 0, NULL, Tempest::config.profile ? &memsetEvent : NULL); Tempest::check_cl_error(__FILE__, __LINE__, err, "Unable to clear spectrum memory"); } if (Tempest::config.profile) { clFinish(clCommandQueue); clGetEventProfilingInfo(memsetEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(memsetEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); totalMemsetTime += (end-start); clReleaseEvent(memsetEvent); } // build err = clSetKernelArg(__cl_build, 0, sizeof(cl_mem), &spectrumBuffer); err |= clSetKernelArg(__cl_build, 1, sizeof(int), &(Tempest::data.host_iPeakCounts[e->lIndex])); err |= clSetKernelArg(__cl_build, 4, sizeof(long), &(Tempest::data.host_lPeakIndices[e->lIndex])); err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_build, 1, NULL, &stGlobalDim, &(this->build_size), 0, NULL, Tempest::config.profile ? &buildEvent : NULL); Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not build spectrum (cl_build kernel)"); if (Tempest::config.profile) { clFinish(clCommandQueue); clGetEventProfilingInfo(buildEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(buildEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); totalBuildTime += (end-start); buildLaunches += 1; clReleaseEvent(buildEvent); } // transform if (Tempest::params.xcorrTransformWidth) { //size_t localDim = CROSS_CORRELATION_WINDOW * 2; //size_t globalDim = localDim * Tempest::data.iNumMS2Bins; size_t globalDim = Tempest::mround(Tempest::data.iNumMS2Bins, this->transform_size); err = clSetKernelArg(__cl_transform, 0, sizeof(cl_mem), &spectrumBuffer); err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_transform, 1, NULL, &globalDim, &(this->transform_size), 0, NULL, Tempest::config.profile ? & transformEvent : NULL); Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not transform spectrum (cl_transform kernel)"); if (Tempest::config.profile) { clFinish(clCommandQueue); clGetEventProfilingInfo(transformEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(transformEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); totalTransformTime += (end-start); clReleaseEvent(transformEvent); } } } else { //move spectrum entry to end of map by reinserting spectrumBuffer = s2bElem->second; spectrum2buffer.erase(s2bElem); spectrum2buffer[e->lIndex] = spectrumBuffer; } // score err = clSetKernelArg(__cl_score, 0, sizeof(int), &(e->iPrecursorCharge)); err |= clSetKernelArg(__cl_score, 1, sizeof(int), &(e->iNumBufferedCandidates)); err |= clSetKernelArg(__cl_score, 4, sizeof(cl_mem), &spectrumBuffer); err |= clSetKernelArg(__cl_score, 5, sizeof(long), &lNoOffset); err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_score, 1, NULL, &(this->candidateBufferSize), &(this->score_size), 0, NULL, Tempest::config.profile ? &scoreEvent : NULL); Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not score candidates (cl_score kernel)"); if (Tempest::config.profile) { clFinish(clCommandQueue); clGetEventProfilingInfo(scoreEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(scoreEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); totalScoreTime += (end-start); clReleaseEvent(scoreEvent); scoreKernelLaunches++; } // Process Scores // TODO what if buffer size is less than 512? long lPSMsOffset = e->lIndex * Tempest::params.numInternalPSMs; err |= clSetKernelArg(__cl_reduce_scores, 4, sizeof(long), &lPSMsOffset); if (Tempest::config.parallelReduce) err |= clEnqueueNDRangeKernel(clCommandQueue, __cl_reduce_scores, 1, NULL, &(this->reduce_scores_size), &(this->reduce_scores_size), 0, NULL, Tempest::config.profile ? &reduceEvent : NULL); else err |= clEnqueueTask(clCommandQueue, __cl_reduce_scores, 0, NULL, Tempest::config.profile ? &reduceEvent : NULL); Tempest::check_cl_error(__FILE__, __LINE__, err, "Could not process scores (cl_reduce_scores kernel)"); if (Tempest::config.profile) { clFinish(clCommandQueue); clGetEventProfilingInfo(reduceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(reduceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); totalReduceTime += (end-start); clReleaseEvent(reduceEvent); } // reset buffer e->iNumBufferedCandidates = 0; }
int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, err; /* Data and buffers */ unsigned char test[16]; cl_mem test_buffer; /* Create a context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ test_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(test), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read and print the result */ err = clEnqueueReadBuffer(queue, test_buffer, CL_TRUE, 0, sizeof(test), &test, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } for(i=0; i<15; i++) { printf("0x%X, ", test[i]); } printf("0x%X\n", test[15]); /* Deallocate resources */ clReleaseMemObject(test_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
/* * To ease testing, each kernel will be a Task kernel taking a pointer to an * integer and running built-in functions. If an error is encountered, the * integer pointed to by the arg will be set accordingly. If the kernel succeeds, * this integer is set to 0. */ static uint32_t run_kernel(const char *source, TestCaseKind kind) { cl_platform_id platform = 0; cl_device_id device; cl_context ctx; cl_command_queue queue; cl_program program; cl_int result; cl_kernel kernel; cl_event event; cl_mem rs_buf; cl_sampler sampler; cl_mem mem1, mem2, mem3; cl_image_format fmt; unsigned char image2d_data[3*3*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 128, 128, 128, 0, 0, 0, 255, 0, 255, 255, 0, 0, 0, 128, 0, 0, 255, 128, 0, 0, 128, 0, 255, 0, 0, 0, 0, 0 }; uint32_t rs = 0; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); if (result != CL_SUCCESS) return 65536; ctx = clCreateContext(0, 1, &device, 0, 0, &result); if (result != CL_SUCCESS) return 65537; queue = clCreateCommandQueue(ctx, device, 0, &result); if (result != CL_SUCCESS) return 65538; program = clCreateProgramWithSource(ctx, 1, &source, 0, &result); if (result != CL_SUCCESS) return 65539; result = clBuildProgram(program, 1, &device, "", 0, 0); if (result != CL_SUCCESS) { // Print log char *log = 0; size_t len = 0; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, 0, &len); log = (char *)std::malloc(len); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, log, 0); std::cout << log << std::endl; std::free(log); return 65540; } kernel = clCreateKernel(program, "test_case", &result); if (result != CL_SUCCESS) return 65541; // Create the result buffer rs_buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(rs), &rs, &result); if (result != CL_SUCCESS) return 65542; result = clSetKernelArg(kernel, 0, sizeof(cl_mem), &rs_buf); if (result != CL_SUCCESS) return 65543; // Kind switch (kind) { case NormalKind: break; case SamplerKind: sampler = clCreateSampler(ctx, 1, CL_ADDRESS_MIRRORED_REPEAT, CL_FILTER_NEAREST, &result); if (result != CL_SUCCESS) return 65546; result = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler); if (result != CL_SUCCESS) return 65547; break; case ImageKind: fmt.image_channel_data_type = CL_UNORM_INT8; fmt.image_channel_order = CL_RGBA; mem1 = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, 4, 4, 0, 0, &result); if (result != CL_SUCCESS) return 65548; mem3 = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &fmt, 3, 3, 0, image2d_data, &result); if (result != CL_SUCCESS) return 65548; fmt.image_channel_data_type = CL_SIGNED_INT16; mem2 = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, 4, 4, 0, 0, &result); if (result != CL_SUCCESS) return 65548; result = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem1); if (result != CL_SUCCESS) return 65549; result = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem2); if (result != CL_SUCCESS) return 65549; result = clSetKernelArg(kernel, 3, sizeof(cl_mem), &mem3); if (result != CL_SUCCESS) return 65549; break; default: break; } if (kind == BarrierKind) { size_t local_size = 64; size_t global_size = 64; result = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size, &local_size, 0, 0, &event); if (result != CL_SUCCESS) return 65544; } else { result = clEnqueueTask(queue, kernel, 0, 0, &event); if (result != CL_SUCCESS) return 65544; } result = clWaitForEvents(1, &event); if (result != CL_SUCCESS) return 65545; if (kind == SamplerKind) clReleaseSampler(sampler); if (kind == ImageKind) { clReleaseMemObject(mem1); clReleaseMemObject(mem2); clReleaseMemObject(mem3); } clReleaseEvent(event); clReleaseMemObject(rs_buf); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(ctx); return rs; }
//--------------------------------------------------------------------- // this function computes the norm of the difference between the // computed solution and the exact solution //--------------------------------------------------------------------- void error_norm(double rms[5]) { int i, m, d; cl_kernel *k_en; cl_mem *m_rms; double (*g_rms)[5]; cl_int ecode; g_rms = (double (*)[5])malloc(sizeof(double)*5 * num_devices); m_rms = (cl_mem *)malloc(sizeof(cl_mem) * num_devices); k_en = (cl_kernel *)malloc(sizeof(cl_kernel) * num_devices); for (i = 0; i < num_devices; i++) { m_rms[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double) * 5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); k_en[i] = clCreateKernel(p_error[i], "error_norm", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_en[i], 0, sizeof(cl_mem), &m_u[i]); ecode |= clSetKernelArg(k_en[i], 1, sizeof(cl_mem), &m_ce[i]); ecode |= clSetKernelArg(k_en[i], 2, sizeof(cl_mem), &m_rms[i]); ecode |= clSetKernelArg(k_en[i], 3, sizeof(cl_mem), &m_cell_low[i]); ecode |= clSetKernelArg(k_en[i], 4, sizeof(cl_mem), &m_cell_high[i]); ecode |= clSetKernelArg(k_en[i], 5, sizeof(int), &ncells); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueTask(cmd_queue[i], k_en[i], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueTask()"); clFinish(cmd_queue[i]); ecode = clEnqueueReadBuffer(cmd_queue[i], m_rms[i], CL_TRUE, 0, sizeof(double)*5, &g_rms[i], 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer()"); } for (m = 0; m < 5; m++) { rms[m] = 0.0; } for (i = 0; i < num_devices; i++) { ecode = clFinish(cmd_queue[i]); clu_CheckError(ecode, "clFinish()"); } // reduction for (i = 0; i < num_devices; i++) { for (m = 0; m < 5; m++) { rms[m] += g_rms[i][m]; } } for (m = 0; m < 5; m++) { for (d = 0; d < 3; d++) { rms[m] = rms[m] / (double)(grid_points[d]-2); } rms[m] = sqrt(rms[m]); } for (i = 0; i < num_devices; i++) { clReleaseMemObject(m_rms[i]); clReleaseKernel(k_en[i]); } free(g_rms); free(m_rms); free(k_en); }