//////////////////////////////////////////////////////////////////////////////// // Large scan launcher //////////////////////////////////////////////////////////////////////////////// static void scanExclusiveLocal2( cl_command_queue cqCommandQueue, cl_mem d_Buffer, cl_mem d_Dst, cl_mem d_Src, uint n, uint size ) { cl_int ciErrNum; size_t localWorkSize, globalWorkSize; uint elements = n * size; ciErrNum = clSetKernelArg(ckScanExclusiveLocal2, 0, sizeof(cl_mem), (void *)&d_Buffer); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 1, sizeof(cl_mem), (void *)&d_Dst); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 2, sizeof(cl_mem), (void *)&d_Src); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 3, 2 * WORKGROUP_SIZE * sizeof(uint), NULL); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 4, sizeof(uint), (void *)&elements); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 5, sizeof(uint), (void *)&size); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = WORKGROUP_SIZE; globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal2, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); }
void NuiOpenCLPrefixSum::scanExclusiveLocal2(unsigned int batchSize, cl_mem d_input, cl_kernel scanKernel /*= NULL*/) { // Get the kernel if (!scanKernel) { scanKernel = NuiOpenCLKernelManager::instance().acquireKernel(E_PREFIX_FLAG_SUM_EXCLUSIVE2); assert(scanKernel); if (!scanKernel) { NUI_ERROR("Get kernel 'E_PREFIX_FLAG_SUM_EXCLUSIVE2' failed!\n"); return; } } // OpenCL command queue and device cl_int err = CL_SUCCESS; cl_command_queue queue = NuiOpenCLGlobal::instance().clQueue(); // Set kernel arguments cl_uint idx = 0; err = clSetKernelArg(scanKernel, idx++, sizeof(cl_mem), &d_input); NUI_CHECK_CL_ERR(err); err = clSetKernelArg(scanKernel, idx++, sizeof(cl_mem), &m_buffer); NUI_CHECK_CL_ERR(err); err = clSetKernelArg(scanKernel, idx++, sizeof(cl_mem), &m_outPrefixCL); NUI_CHECK_CL_ERR(err); err = clSetKernelArg(scanKernel, idx++, 2 * WORKGROUP_SIZE * sizeof(cl_uint), NULL); NUI_CHECK_CL_ERR(err); err = clSetKernelArg(scanKernel, idx++, sizeof(cl_uint), &batchSize); NUI_CHECK_CL_ERR(err); // Run kernel to calculate // iSnapUp size_t kernelGlobalSize[1] = { iSnapUp(batchSize, WORKGROUP_SIZE) }; size_t local_ws[1] = { WORKGROUP_SIZE }; err = clEnqueueNDRangeKernel( queue, scanKernel, 1, nullptr, kernelGlobalSize, local_ws, 0, NULL, NULL ); NUI_CHECK_CL_ERR(err); // Debug //unsigned int* pBuffer = new unsigned int[numElements]; //err = clEnqueueReadBuffer( // queue, // m_buffer, // CL_TRUE,//blocking // 0, //offset // numElements * sizeof(cl_uint), // pBuffer, // 0, // NULL, // NULL // ); //NUI_CHECK_CL_ERR(err); //for(unsigned int i = 0; i < numElements; i++) //{ // unsigned int hustztz = pBuffer[i]; // //std::cout << pBuffer[i] << std::endl; //} //delete[] pBuffer; }