int Lucas::normalize2_runCLKernels () { cl_int status; /* * Kernel runs over complete output matrix with blocks of blockSize x blockSize * running concurrently */ size_t globalThreads = Nn / threads; size_t localThreads = 256; cl_int eventStatus = CL_QUEUED; // Set input data to matrix A and matrix B cl::Event ndrEvt; // Set appropriate arguments to the kernel status = normalize2_kernel.setArg (0, g_x); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_x)"); status = normalize2_kernel.setArg (1, threads); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (threads)"); status = normalize2_kernel.setArg (2, bigAB); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (bigAB)"); status = normalize2_kernel.setArg (3, bigAB); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (bigAB)"); status = normalize2_kernel.setArg (4, g_carry); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_carry)"); status = normalize2_kernel.setArg (5, Nn); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (Nn)"); status = normalize2_kernel.setArg (6, g_inv2); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_inv2)"); status = normalize2_kernel.setArg (7, g_ttp2); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_ttp2)"); status = normalize2_kernel.setArg (8, g_ttmp2); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_ttmp2)"); status = normalize2_kernel.setArg (9, g_inv3); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_inv3)"); status = normalize2_kernel.setArg (10, g_ttp3); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_ttp3)"); status = normalize2_kernel.setArg (11, g_ttmp3); CHECK_OPENCL_ERROR (status, "cl::setArg failed. (g_ttmp3)"); /* * Enqueue a kernel run call. */ // Each thread calculates 2 gaussian numbers cl::NDRange gThreads (globalThreads); cl::NDRange lThreads (localThreads); status = commandQueue.enqueueNDRangeKernel (normalize2_kernel, cl::NullRange, gThreads, lThreads, 0, &ndrEvt); CHECK_OPENCL_ERROR (status, "CommandQueue::enqueueNDRangeKernel() failed."); status = commandQueue.flush (); CHECK_OPENCL_ERROR (status, "cl::CommandQueue.flush failed."); return SDK_SUCCESS; }
int MatrixMulDouble::runCLKernels(void) { cl_int status; /* * Kernel runs over complete output matrix with blocks of blockSize x blockSize * running concurrently */ size_t globalThreads[2]= {widthB / 4, heightA / 4}; size_t localThreads[2] = {blockSize, blockSize}; cl_int eventStatus = CL_QUEUED; // Set input data to matrix A and matrix B cl::Event inMapEvtA; cl::Event inMapEvtB; cl::Event inUnmapEvtA; cl::Event inUnmapEvtB; cl::Event outMapEvt; cl::Event outUnmapEvt; cl::Event ndrEvt; void* inMapPtrA = NULL; void* inMapPtrB = NULL; void* outMapPtr = NULL; inMapPtrA = commandQueue.enqueueMapBuffer( inputBufA, CL_FALSE, CL_MAP_WRITE, 0, widthA * heightA * sizeof(cl_double), NULL, &inMapEvtA, &status); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.enqueueMapBuffer failed. (inputBufA)"); inMapPtrB = commandQueue.enqueueMapBuffer( inputBufB, CL_FALSE, CL_MAP_WRITE, 0, widthB * heightB * sizeof(cl_double), NULL, &inMapEvtB, &status); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.enqueueMapBuffer failed. (inputBufB)"); status = commandQueue.flush(); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.flush failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = inMapEvtA.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } memcpy(inMapPtrA, inputA, sizeof(cl_double) * widthA * heightA); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = inMapEvtB.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } memcpy(inMapPtrB, inputB, sizeof(cl_double) * widthB * heightB); status = commandQueue.enqueueUnmapMemObject( inputBufA, inMapPtrA, NULL, &inUnmapEvtA); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.enqueueUnmapMemObject (inputBufA)."); status = commandQueue.enqueueUnmapMemObject( inputBufB, inMapPtrB, NULL, &inUnmapEvtB); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.enqueueUnmapMemObject (inputBufB)"); status = commandQueue.flush(); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.flush failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = inUnmapEvtA.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = inUnmapEvtB.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } // Set appropriate arguments to the kernel // output array as the 1st argument : stores product of inputA and inputB status = kernel.setArg(0, inputBufA); CHECK_OPENCL_ERROR(status, "cl::setArg failed. (inputBufA)"); status = kernel.setArg(1, inputBufB); CHECK_OPENCL_ERROR(status, "cl::setArg failed. (inputBufB)"); status = kernel.setArg(2, outputBuf); CHECK_OPENCL_ERROR(status, "cl::setArg failed. (outputBuf)"); // widthA of the inputA matrix as 4th argument - widthA status = kernel.setArg(3, widthA); CHECK_OPENCL_ERROR(status, "cl::setArg failed. (widthA)"); // Set local memory argument if Scratchpad is available if(lds) { status = kernel.setArg( 4, (blockSize * 4) * (blockSize * 4) * sizeof(cl_double), NULL); CHECK_OPENCL_ERROR(status, "cl::setArg failed. (local memory)"); } else { status = kernel.setArg(4, sizeof(cl_int), &widthB); CHECK_OPENCL_ERROR(status, "cl::setArg failed. (widthB)"); } /* * Enqueue a kernel run call. */ // Each thread calculates 2 gaussian numbers cl::NDRange gThreads(globalThreads[0], globalThreads[1]); cl::NDRange lThreads(localThreads[0], localThreads[1]); status = commandQueue.enqueueNDRangeKernel(kernel, cl::NullRange, gThreads, lThreads, 0, &ndrEvt); CHECK_OPENCL_ERROR(status, "CommandQueue::enqueueNDRangeKernel() failed."); status = commandQueue.flush(); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.flush failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = ndrEvt.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } if(!eAppGFLOPS) { // Calculate performance cl_ulong startTime; cl_ulong endTime; status = ndrEvt.getProfilingInfo<cl_ulong>( CL_PROFILING_COMMAND_START, &startTime); CHECK_OPENCL_ERROR(status, "cl::Event.getProfilingInfo failed.(startTime)."); status = ndrEvt.getProfilingInfo<cl_ulong>( CL_PROFILING_COMMAND_END, &endTime); CHECK_OPENCL_ERROR(status, "cl::Event.getProfilingInfo failed.(endTime)."); // Print performance numbers double sec = 1e-9 * (endTime - startTime); kernelTime += sec; } outMapPtr = commandQueue.enqueueMapBuffer( outputBuf, CL_FALSE, CL_MAP_READ, 0, widthB * heightA * sizeof(cl_double), NULL, &outMapEvt, &status); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.enqueueMapBuffer failed. (outputBuf)."); status = commandQueue.flush(); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.flush failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = outMapEvt.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } memcpy(output, outMapPtr, sizeof(cl_double) * widthB * heightA); status = commandQueue.enqueueUnmapMemObject( outputBuf, outMapPtr, NULL, &outUnmapEvt); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.enqueueUnmapMemObject (outputBuf)"); status = commandQueue.flush(); CHECK_OPENCL_ERROR(status, "cl::CommandQueue.flush failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = outUnmapEvt.getInfo<cl_int>( CL_EVENT_COMMAND_EXECUTION_STATUS, &eventStatus); CHECK_OPENCL_ERROR(status, "cl:Event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS) failed."); } return SDK_SUCCESS; }