////////////////////////////////////////////////////////////////////////////////
// Large scan launcher
////////////////////////////////////////////////////////////////////////////////
static void scanExclusiveLocal2(
    cl_command_queue cqCommandQueue,
    cl_mem d_Buffer,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint n,
    uint size
) {
    cl_int ciErrNum;
    size_t localWorkSize, globalWorkSize;

    uint elements = n * size;
    ciErrNum  = clSetKernelArg(ckScanExclusiveLocal2, 0, sizeof(cl_mem), (void *)&d_Buffer);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 1, sizeof(cl_mem), (void *)&d_Dst);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 2, sizeof(cl_mem), (void *)&d_Src);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 3, 2 * WORKGROUP_SIZE * sizeof(uint), NULL);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 4, sizeof(uint), (void *)&elements);
    ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 5, sizeof(uint), (void *)&size);
    oclCheckError(ciErrNum, CL_SUCCESS);

    localWorkSize = WORKGROUP_SIZE;
    globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);

    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal2, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
}
void NuiOpenCLPrefixSum::scanExclusiveLocal2(unsigned int batchSize, cl_mem d_input, cl_kernel scanKernel /*= NULL*/)
{
	// Get the kernel
	if (!scanKernel)
	{
		scanKernel = NuiOpenCLKernelManager::instance().acquireKernel(E_PREFIX_FLAG_SUM_EXCLUSIVE2);
		assert(scanKernel);
		if (!scanKernel)
		{
			NUI_ERROR("Get kernel 'E_PREFIX_FLAG_SUM_EXCLUSIVE2' failed!\n");
			return;
		}
	}
	// OpenCL command queue and device
	cl_int           err = CL_SUCCESS;
	cl_command_queue queue = NuiOpenCLGlobal::instance().clQueue();

	// Set kernel arguments
	cl_uint idx = 0;
	err = clSetKernelArg(scanKernel, idx++, sizeof(cl_mem), &d_input);
	NUI_CHECK_CL_ERR(err);
	err = clSetKernelArg(scanKernel, idx++, sizeof(cl_mem), &m_buffer);
	NUI_CHECK_CL_ERR(err);
	err = clSetKernelArg(scanKernel, idx++, sizeof(cl_mem), &m_outPrefixCL);
	NUI_CHECK_CL_ERR(err);
	err = clSetKernelArg(scanKernel, idx++, 2 * WORKGROUP_SIZE * sizeof(cl_uint), NULL);
	NUI_CHECK_CL_ERR(err);
	err = clSetKernelArg(scanKernel, idx++, sizeof(cl_uint), &batchSize);
	NUI_CHECK_CL_ERR(err);

	// Run kernel to calculate 
	// iSnapUp
	size_t kernelGlobalSize[1] = { iSnapUp(batchSize, WORKGROUP_SIZE) };
	size_t local_ws[1] = { WORKGROUP_SIZE };
	err = clEnqueueNDRangeKernel(
		queue,
		scanKernel,
		1,
		nullptr,
		kernelGlobalSize,
		local_ws,
		0,
		NULL,
		NULL
		);
	NUI_CHECK_CL_ERR(err);

	// Debug
	//unsigned int* pBuffer = new unsigned int[numElements];
	//err = clEnqueueReadBuffer(
	//	queue,
	//	m_buffer,
	//	CL_TRUE,//blocking
	//	0, //offset
	//	numElements * sizeof(cl_uint),
	//	pBuffer,
	//	0,
	//	NULL,
	//	NULL
	//	);
	//NUI_CHECK_CL_ERR(err);
	//for(unsigned int i = 0; i < numElements; i++)
	//{
	//	unsigned int hustztz = pBuffer[i];
	//	//std::cout << pBuffer[i]  << std::endl;
	//}
	//delete[] pBuffer;
}