/**
 * Drives the prefix-sum pipeline on d_input and reports the resulting count.
 *
 * The stages are issued strictly in this order:
 *   1. scanExclusiveLocal1 on d_input with scan1Kernel
 *   2. scanExclusiveLocal2 on d_input with scan2Kernel
 *   3. uniformUpdate
 *   4. generateValidIDs into d_output
 *
 * @param scan1Kernel kernel handed to scanExclusiveLocal1
 * @param scan2Kernel kernel handed to scanExclusiveLocal2
 * @param d_input     buffer passed to both scan stages
 * @param d_output    buffer passed to generateValidIDs
 * @return whatever getValidCount() reports once all stages have been issued
 *
 * NOTE(review): the batch count is m_numElements divided by
 * (WORKGROUP_SIZE * 8) using integer division, so any remainder is dropped.
 * Presumably m_numElements is always a multiple of that size -- confirm
 * against the code that sets it.
 */
unsigned int NuiOpenCLPrefixSum::prefixSum(
	cl_kernel scan1Kernel,
	cl_kernel scan2Kernel,
	cl_mem d_input,
	cl_mem d_output)
{
	// Stage 1: first-level scan over the input buffer.
	scanExclusiveLocal1(d_input, scan1Kernel);

	// Number of elements grouped into one batch for the second-level scan.
	const unsigned int elementsPerBatch = WORKGROUP_SIZE * 8;
	unsigned int numBatches = m_numElements / elementsPerBatch;

	// Stage 2: second-level scan, sized by the batch count.
	scanExclusiveLocal2(numBatches, d_input, scan2Kernel);

	// Stage 3: uniform update, sized by the same batch count.
	uniformUpdate(numBatches);

	// Stage 4: fill the output buffer, then report the count.
	generateValidIDs(d_output);
	return getValidCount();
}
/**
 * Exclusive scan over a batch of large arrays, issued as three stages:
 * scanExclusiveLocal1, scanExclusiveLocal2 and uniformUpdate.
 *
 * Preconditions, each enforced through oclCheckError:
 *   - factorRadix2(log2L, arrayLength) must return 1 (per the check's
 *     comment, this is the power-of-two test on arrayLength);
 *   - arrayLength must lie in [MIN_LARGE_ARRAY_SIZE, MAX_LARGE_ARRAY_SIZE];
 *   - batchSize * arrayLength must not exceed MAX_BATCH_ELEMENTS.
 *
 * NOTE(review): presumably oclCheckError aborts when a check fails, so the
 * code below only runs with validated sizes -- confirm against its definition.
 *
 * @param cqCommandQueue command queue forwarded to every stage
 * @param d_Dst          destination buffer
 * @param d_Src          source buffer
 * @param batchSize      number of arrays in the batch
 * @param arrayLength    number of elements in each array
 * @return the value returned by uniformUpdate
 *
 * The intermediate buffer d_Buffer is not a parameter; it is declared
 * outside this function.
 */
extern "C" size_t scanExclusiveLarge(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
) {
    //Check power-of-two factorization
    uint log2L;
    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE);

    //Check supported size range
    oclCheckError( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE), shrTRUE );

    //Check total batch size limit.
    //The product is formed in 64 bits: a 32-bit multiply can wrap around
    //(e.g. 65536 * 65536 == 0) and would then slip past the limit check.
    const unsigned long long totalElements =
        static_cast<unsigned long long>(batchSize) * arrayLength;
    oclCheckError( totalElements <= MAX_BATCH_ELEMENTS, shrTRUE );

    //Stage 1: scan blocks of 4 * WORKGROUP_SIZE elements
    scanExclusiveLocal1(
        cqCommandQueue,
        d_Dst,
        d_Src,
        (batchSize * arrayLength) / (4 * WORKGROUP_SIZE),
        4 * WORKGROUP_SIZE
    );

    //Stage 2: second-level scan through the intermediate buffer d_Buffer
    scanExclusiveLocal2(
        cqCommandQueue,
        d_Buffer,
        d_Dst,
        d_Src,
        batchSize,
        arrayLength / (4 * WORKGROUP_SIZE)
    );

    //Stage 3: uniform update of d_Dst using d_Buffer
    return uniformUpdate(
               cqCommandQueue,
               d_Dst,
               d_Buffer,
               (batchSize * arrayLength) / (4 * WORKGROUP_SIZE)
           );
}