// main function
//*****************************************************************************
#include <oclUtils.h>   // NVIDIA OpenCL SDK common header (shrLog, oclPrintDevInfo, ...)

int main(int argc, const char **argv)
{
    cl_context cxGPUContext;                      // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command queue(s)
    cl_device_id* cdDevices;                      // OpenCL device list
    cl_int err = 1;                               // Error code var

    shrSetLogFileName("oclHiddenMarkovModel.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]);

    shrLog(LOGBOTH, 0, "Create context\n");
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    shrCheckErrorEX(err, CL_SUCCESS, NULL);

    shrLog(LOGBOTH, 0, "Get device info...\n");
    int nDevice = 0;
    size_t nDeviceBytes;
    err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
    cdDevices = (cl_device_id*)malloc(nDeviceBytes);
    err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, nDeviceBytes, cdDevices, NULL);
    nDevice = (int)(nDeviceBytes / sizeof(cl_device_id));

    shrLog(LOGBOTH, 0, "clCreateCommandQueue\n");
    int id_device;
    if (shrGetCmdLineArgumenti(argc, argv, "device", &id_device))
    {
        // Set up a command queue for the GPU specified on the command line
        // get & log device index # and name
        cl_device_id cdDevice = cdDevices[id_device];

        // create a command queue
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &err);
        shrCheckErrorEX(err, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (int i = 0; i < nDevice; i++)
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &err);
            shrCheckErrorEX(err, CL_SUCCESS, NULL);
        }
        for (int i = 0; i < nDevice; i++)
            oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }
    shrLog(LOGBOTH, 0, "\nUsing %d GPU(s)...\n\n", nDevice);

    int wgSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize))
    {
        wgSize = 256;
    }

    shrLog(LOGBOTH, 0, "Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states
    int nEmit  = 128;    // number of possible observations
    float *initProb = (float*)malloc(sizeof(float)*nState);        // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState);  // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 500; // size of observational sequence
    int **obs             = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU  = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU  = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float));
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }
    shrLog(LOGBOTH, 0, "# of states = %d\n# of possible observations = %d\nSize of observational sequence = %d\n\n",
           nState, nEmit, nObs);

    shrLog(LOGBOTH, 0, "Compute Viterbi path on GPU\n\n");
    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit,
                                nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &err);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &err);
    }

#ifdef GPU_PROFILING
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }
    shrDeltaT(1);
#endif

    size_t szWorkGroup;
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
    }
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLog(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-6 * nDevice * nState * nObs)/dElapsedTime, dElapsedTime,
           (nDevice * nState * nObs), nDevice, szWorkGroup);
#endif

    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        err = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0,
                                  sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog(LOGBOTH, 0, "\nCompute Viterbi path on CPU\n");
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        err = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs,
                         initProb, mtState, nState, mtEmit);
    }
    if (!err)
    {
        shrEXIT(argc, argv);
    }

    bool pass = true;
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i])
            {
                pass = false;
                break;
            }
        }
    }
    shrLog(LOGBOTH, 0, "\nTEST %s\n\n", (pass) ? "PASSED" : "FAILED !!!");

    // NOTE: Most properly this should be done at any of the exit points above,
    // but it is omitted elsewhere for clarity.
    shrLog(LOGBOTH, 0, "Release CPU buffers and OpenCL objects...\n");
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (int iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseMemObject(vProb[iDevice]);
        clReleaseMemObject(vPath[iDevice]);
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(vProb);
    free(vPath);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    shrEXIT(argc, argv);
}
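This listing (and both that follow) calls initHMM() to fill the model parameters, but its definition is never shown. A minimal sketch of what it has to do — fill the initial-probability vector and the transition and emission matrices with normalized random values — could look like the following. The exact matrix layout and normalization scheme used by the original samples are assumptions here, not the shipped code.

#include <stdlib.h>

void initHMM(float *initProb, float *mtState, float *mtEmit,
             const int nState, const int nEmit)
{
    // initial probabilities, normalized to sum to 1
    float sum = 0.0f;
    for (int i = 0; i < nState; i++) {
        initProb[i] = rand() / (float)RAND_MAX;
        sum += initProb[i];
    }
    for (int i = 0; i < nState; i++)
        initProb[i] /= sum;

    // state transition matrix: row i holds P(next = j | current = i),
    // each row normalized to sum to 1 (assumed layout)
    for (int i = 0; i < nState; i++) {
        sum = 0.0f;
        for (int j = 0; j < nState; j++) {
            mtState[i*nState + j] = rand() / (float)RAND_MAX;
            sum += mtState[i*nState + j];
        }
        for (int j = 0; j < nState; j++)
            mtState[i*nState + j] /= sum;
    }

    // emission matrix: P(observation = o | state = j),
    // normalized per state over all observations (assumed layout)
    for (int j = 0; j < nState; j++) {
        sum = 0.0f;
        for (int o = 0; o < nEmit; o++) {
            mtEmit[o*nState + j] = rand() / (float)RAND_MAX;
            sum += mtEmit[o*nState + j];
        }
        for (int o = 0; o < nEmit; o++)
            mtEmit[o*nState + j] /= sum;
    }
}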
// main function
//*****************************************************************************
#include <oclUtils.h>   // NVIDIA OpenCL SDK common header
#include <shrQATest.h>  // shrQAStart / shrQAFinishExit

int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                    // OpenCL platform
    cl_uint nDevice;                              // OpenCL device count
    cl_device_id* cdDevices;                      // OpenCL device list
    cl_context cxGPUContext;                      // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command queue(s)
    cl_int ciErrNum = 1;                          // Error code var

    shrQAStart(argc, (char **)argv);

    shrSetLogFileName("oclHiddenMarkovModel.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // Get the NVIDIA platform
    shrLog("Get platform...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get all the devices
    shrLog("Get devices...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateCommandQueue\n");
    int id_device;
    if (shrGetCmdLineArgumenti(argc, argv, "device", &id_device))
    {
        // Set up a command queue for the GPU specified on the command line
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[id_device], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevices[id_device]);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i],
                                                   CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++)
            oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }
    shrLog("\nUsing %u GPU(s)...\n\n", nDevice);

    int wgSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize))
    {
        wgSize = 256;
    }

    shrLog("Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states, must be a multiple of 256
    int nEmit  = 128;    // number of possible observations
    float *initProb = (float*)malloc(sizeof(float)*nState);        // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState);  // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 100; // size of observational sequence
    int **obs             = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU  = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU  = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }
    shrLog("# of states = %d\n# of possible observations = %d\nSize of observational sequence = %d\n\n",
           nState, nEmit, nObs);

    shrLog("Compute Viterbi path on GPU\n\n");
    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit,
                                nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &ciErrNum);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &ciErrNum);
    }

#ifdef GPU_PROFILING
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }
    shrDeltaT(1);
#endif

    size_t szWorkGroup;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
    }
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLogEx(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f GB/s, Time = %.5f s, Size = %u items, NumDevsUsed = %u, Workgroup = %u\n",
             (1.0e-9 * 2.0 * sizeof(float) * nDevice * nState * nState * (nObs-1))/dElapsedTime,
             dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup);
#endif

    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0,
                                       sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog("\nCompute Viterbi path on CPU\n");
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs,
                              initProb, mtState, nState, mtEmit);
    }
    if (!ciErrNum)
    {
        shrEXIT(argc, argv);
    }

    bool pass = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i])
            {
                pass = false;
                break;
            }
        }
    }

    // NOTE: Most properly this should be done at any of the exit points above,
    // but it is omitted elsewhere for clarity.
    shrLog("Release CPU buffers and OpenCL objects...\n");
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseMemObject(vProb[iDevice]);
        clReleaseMemObject(vPath[iDevice]);
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(vProb);
    free(vPath);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, pass ? QA_PASSED : QA_FAILED);
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

// defined elsewhere in the project
void initHMM(float *init_prob, float *mt_state, float *mt_emit, int n_state, int n_emit);
const char *load_program_source(const char *filename);

int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel_one, kernel_path;
    cl_mem d_mt_state, d_mt_emit, d_max_prob_old;
    cl_mem d_max_prob_new, d_path, v_prob, v_path;

    int wg_size = 256;
    int n_state = 256*16;
    int n_emit  = 128;
    int n_obs   = 100;

    size_t init_prob_size = sizeof(float) * n_state;
    size_t mt_state_size  = sizeof(float) * n_state * n_state;
    size_t mt_emit_size   = sizeof(float) * n_emit * n_state;
    float *init_prob = (float *) malloc(init_prob_size);
    float *mt_state  = (float *) malloc(mt_state_size);
    float *mt_emit   = (float *) malloc(mt_emit_size);
    int *obs         = (int *) malloc(sizeof(int) * n_obs);
    int *viterbi_gpu = (int *) malloc(sizeof(int) * n_obs);

    srand(2012);
    initHMM(init_prob, mt_state, mt_emit, n_state, n_emit);
    int i;
    for (i = 0; i < n_obs; i++) {
        obs[i] = i % 15;
    }

    const char *source = load_program_source("Viterbi.cl");
    size_t source_len = strlen(source);
    cl_int err = 0;
    const char *flags = "-cl-fast-relaxed-math";

    err = clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", (void*)platform, err);
    // this standalone version runs on the CPU device
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    printf("device %p err %d\n", (void*)device, err);
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", (void*)context, err);
    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", (void*)queue, err);
    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", (void*)program, err);
    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);
    /*
    char tmp[102400];
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(tmp), tmp, NULL);
    printf("error %s\n", tmp);
    */
    kernel_one = clCreateKernel(program, "ViterbiOneStep", &err);
    printf("kernel %p err %d\n", (void*)kernel_one, err);
    kernel_path = clCreateKernel(program, "ViterbiPath", &err);
    printf("kernel %p err %d\n", (void*)kernel_path, err);

    d_mt_state = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_state_size, NULL, &err);
    printf("buffer %p\n", (void*)d_mt_state);
    d_mt_emit = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_emit_size, NULL, &err);
    printf("buffer %p\n", (void*)d_mt_emit);
    d_max_prob_new = clCreateBuffer(context, CL_MEM_READ_WRITE, init_prob_size, NULL, &err);
    printf("buffer %p\n", (void*)d_max_prob_new);
    d_max_prob_old = clCreateBuffer(context, CL_MEM_READ_WRITE, init_prob_size, NULL, &err);
    printf("buffer %p\n", (void*)d_max_prob_old);
    d_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*(n_obs-1)*n_state, NULL, &err);
    printf("buffer %p\n", (void*)d_path);
    v_prob = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, &err);
    printf("buffer %p\n", (void*)v_prob);
    v_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*n_obs, NULL, &err);
    printf("buffer %p\n", (void*)v_path);

    err = clEnqueueWriteBuffer(queue, d_mt_state, CL_TRUE, 0, mt_state_size, mt_state, 0, NULL, NULL);
    printf("err %d\n", err);
    err = clEnqueueWriteBuffer(queue, d_mt_emit, CL_TRUE, 0, mt_emit_size, mt_emit, 0, NULL, NULL);
    printf("err %d\n", err);
    err = clEnqueueWriteBuffer(queue, d_max_prob_old, CL_TRUE, 0, init_prob_size, init_prob, 0, NULL, NULL);
    printf("err %d\n", err);

    // max_wg_size is 1024 for an Intel Core 2 CPU
    size_t max_wg_size;
    err = clGetKernelWorkGroupInfo(kernel_one, device, CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof(size_t), &max_wg_size, NULL);
    printf("max_wg_size %zu\n", max_wg_size);

    // One work-group of wg_size items per state: 256 groups along dim 0 and
    // n_state/256 along dim 1, i.e. n_state groups in total.
    size_t local_work_size[2], global_work_size[2];
    local_work_size[0] = wg_size;
    local_work_size[1] = 1;
    global_work_size[0] = local_work_size[0] * 256;
    global_work_size[1] = n_state / 256;

    for (i = 1; i < n_obs; i++) {
        err = clSetKernelArg(kernel_one, 0, sizeof(cl_mem), (void*)&d_max_prob_new); printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 1, sizeof(cl_mem), (void*)&d_path);         printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 2, sizeof(cl_mem), (void*)&d_max_prob_old); printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 3, sizeof(cl_mem), (void*)&d_mt_state);     printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 4, sizeof(cl_mem), (void*)&d_mt_emit);      printf("err %d\n", err);
        // local scratch buffers for the in-group reduction
        err = clSetKernelArg(kernel_one, 5, sizeof(float)*local_work_size[0], NULL); printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 6, sizeof(int)*local_work_size[0], NULL);   printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 7, sizeof(int), (void*)&n_state);           printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 8, sizeof(int), (void*)&(obs[i]));          printf("err %d\n", err);
        err = clSetKernelArg(kernel_one, 9, sizeof(int), (void*)&i);                 printf("err %d\n", err);
        err = clEnqueueNDRangeKernel(queue, kernel_one, 2, NULL, global_work_size, local_work_size,
                                     0, NULL, NULL);
        printf("err %d\n", err);
        // the result of this step becomes the input of the next one
        err = clEnqueueCopyBuffer(queue, d_max_prob_new, d_max_prob_old, 0, 0,
                                  sizeof(float)*n_state, 0, NULL, NULL);
        printf("err %d\n", err);
    }

    // backtracking runs as a single work-item
    local_work_size[0] = 1;
    global_work_size[0] = 1;
    err = clSetKernelArg(kernel_path, 0, sizeof(cl_mem), (void*)&v_prob);         printf("err %d\n", err);
    err = clSetKernelArg(kernel_path, 1, sizeof(cl_mem), (void*)&v_path);         printf("err %d\n", err);
    err = clSetKernelArg(kernel_path, 2, sizeof(cl_mem), (void*)&d_max_prob_new); printf("err %d\n", err);
    err = clSetKernelArg(kernel_path, 3, sizeof(cl_mem), (void*)&d_path);         printf("err %d\n", err);
    err = clSetKernelArg(kernel_path, 4, sizeof(int), (void*)&n_state);           printf("err %d\n", err);
    err = clSetKernelArg(kernel_path, 5, sizeof(int), (void*)&n_obs);             printf("err %d\n", err);
    err = clEnqueueNDRangeKernel(queue, kernel_path, 1, NULL, global_work_size, local_work_size,
                                 0, NULL, NULL);
    printf("err %d\n", err);

    clFinish(queue);
    printf("finish done\n");

    err = clEnqueueReadBuffer(queue, v_path, CL_TRUE, 0, sizeof(int)*n_obs, viterbi_gpu, 0, NULL, NULL);
    printf("err %d\n", err);
    for (i = 0; i < n_obs; i++) {
        printf("%d %d\n", i, viterbi_gpu[i]);
    }

    clReleaseMemObject(d_mt_state);
    clReleaseMemObject(d_mt_emit);
    clReleaseMemObject(d_max_prob_old);
    clReleaseMemObject(d_max_prob_new);
    clReleaseMemObject(d_path);
    clReleaseMemObject(v_prob);
    clReleaseMemObject(v_path);
    clReleaseProgram(program);
    clReleaseKernel(kernel_one);
    clReleaseKernel(kernel_path);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    free((void*)source);
    free(init_prob);
    free(mt_state);
    free(mt_emit);
    free(obs);
    free(viterbi_gpu);

    return 0;
}
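Two pieces this last listing depends on are not shown. load_program_source() only has to read Viterbi.cl into a NUL-terminated heap buffer; a minimal sketch (the real helper may differ):

#include <stdio.h>
#include <stdlib.h>

const char *load_program_source(const char *filename)
{
    FILE *fp = fopen(filename, "rb");
    if (!fp) return NULL;
    fseek(fp, 0, SEEK_END);
    long size = ftell(fp);
    rewind(fp);
    char *buf = (char *)malloc(size + 1);
    if (!buf || fread(buf, 1, size, fp) != (size_t)size) {
        fclose(fp);
        free(buf);
        return NULL;
    }
    buf[size] = '\0';
    fclose(fp);
    return buf;
}

Viterbi.cl itself is not included either. For orientation, here is a plausible sketch of the single-work-item backtracking kernel, with the parameter order inferred from the clSetKernelArg calls above; this is an assumption, not the shipped kernel.

__kernel void ViterbiPath(__global float *vProb,         // best final probability
                          __global int *vPath,           // output path, n_obs entries
                          __global const float *maxProb, // per-state prob at the last step
                          __global const int *path,      // (n_obs-1) x n_state predecessors
                          int nState, int nObs)
{
    // pick the most probable final state ...
    float best = maxProb[0];
    int state = 0;
    for (int i = 1; i < nState; i++)
        if (maxProb[i] > best) { best = maxProb[i]; state = i; }
    *vProb = best;
    vPath[nObs-1] = state;
    // ... then walk the stored predecessor table backwards
    for (int t = nObs - 2; t >= 0; t--) {
        state = path[t*nState + state];
        vPath[t] = state;
    }
}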