/// Runs the prefix-sum pipeline stages in order and returns the valid count.
///
/// The stages are delegated to helpers defined elsewhere in this class:
///   1. scanExclusiveLocal1 on d_input with scan1Kernel,
///   2. scanExclusiveLocal2 on d_input with scan2Kernel and the batch count,
///   3. uniformUpdate with the same batch count,
///   4. generateValidIDs into d_output.
///
/// @param scan1Kernel  kernel handed to the first scan stage.
/// @param scan2Kernel  kernel handed to the second scan stage.
/// @param d_input      buffer passed to both scan stages.
/// @param d_output     buffer passed to generateValidIDs.
/// @return whatever getValidCount() reports once all stages were issued.
unsigned int NuiOpenCLPrefixSum::prefixSum(cl_kernel scan1Kernel,
                                           cl_kernel scan2Kernel,
                                           cl_mem d_input,
                                           cl_mem d_output)
{
    // Stage 1: first scan over the input buffer.
    scanExclusiveLocal1(d_input, scan1Kernel);

    // The batch count is m_numElements divided by a fixed span of
    // WORKGROUP_SIZE * 8. Integer division: any remainder is dropped.
    // NOTE(review): presumably m_numElements is a multiple of this span -- confirm.
    const unsigned int elementsPerBatch = WORKGROUP_SIZE * 8;
    unsigned int batchCount = m_numElements / elementsPerBatch;

    // Stages 2 and 3 both take the batch count.
    scanExclusiveLocal2(batchCount, d_input, scan2Kernel);
    uniformUpdate(batchCount);

    // Stage 4: write the valid IDs to the output buffer.
    generateValidIDs(d_output);

    return getValidCount();
}
/// Validates the requested array geometry, then issues the three scan stages
/// (scanExclusiveLocal1, scanExclusiveLocal2, uniformUpdate) on the queue.
///
/// Checks performed via oclCheckError before any stage is issued:
///   - factorRadix2(log2L, arrayLength) must yield a remainder of 1,
///   - arrayLength must lie in [MIN_LARGE_ARRAY_SIZE, MAX_LARGE_ARRAY_SIZE],
///   - batchSize * arrayLength must not exceed MAX_BATCH_ELEMENTS.
/// What happens when a check fails is decided by oclCheckError, which is
/// defined outside this block.
///
/// @param cqCommandQueue  command queue forwarded to every stage.
/// @param d_Dst           destination buffer forwarded to every stage.
/// @param d_Src           source buffer forwarded to the two scan stages.
/// @param batchSize       number of arrays in the batch.
/// @param arrayLength     length of each array.
/// @return the value returned by uniformUpdate.
///
/// d_Buffer is neither a parameter nor a local; it is declared elsewhere.
extern "C" size_t scanExclusiveLarge(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
)
{
    // Check power-of-two factorization
    uint log2L;
    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError(factorizationRemainder == 1, shrTRUE);

    // Check supported size range
    oclCheckError(
        (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE),
        shrTRUE
    );

    // Check total batch size limit. The product is evaluated in 64 bits so a
    // batchSize * arrayLength that wraps around in uint cannot slip under
    // the limit.
    oclCheckError(
        (static_cast<unsigned long long>(batchSize) * arrayLength) <= MAX_BATCH_ELEMENTS,
        shrTRUE
    );

    // Stage 1: the batch is passed as (total elements / (4 * WORKGROUP_SIZE))
    // pieces of 4 * WORKGROUP_SIZE elements each.
    scanExclusiveLocal1(
        cqCommandQueue,
        d_Dst,
        d_Src,
        (batchSize * arrayLength) / (4 * WORKGROUP_SIZE),
        4 * WORKGROUP_SIZE
    );

    // Stage 2: takes d_Buffer plus both buffers, with
    // arrayLength / (4 * WORKGROUP_SIZE) as the per-array length.
    scanExclusiveLocal2(
        cqCommandQueue,
        d_Buffer,
        d_Dst,
        d_Src,
        batchSize,
        arrayLength / (4 * WORKGROUP_SIZE)
    );

    // Stage 3: takes d_Dst and d_Buffer with the same piece count as stage 1.
    return uniformUpdate(
        cqCommandQueue,
        d_Dst,
        d_Buffer,
        (batchSize * arrayLength) / (4 * WORKGROUP_SIZE)
    );
}