clppSort_RadixSortGPU::clppSort_RadixSortGPU(clppContext* context, unsigned int maxElements, unsigned int bits, bool keysOnly)
{
    _keysOnly = keysOnly;
    _valueSize = 4;
    _keySize = 4;
    _clBuffer_dataSet = 0;
    _clBuffer_dataSetOut = 0;
    _bits = bits;

    if (!compile(context, "clppSort_RadixSortGPU.cl"))
        return;

    //---- Prepare all the kernels
    cl_int clStatus;

    _kernel_RadixLocalSort = clCreateKernel(_clProgram, "kernel__radixLocalSort", &clStatus);
    checkCLStatus(clStatus);

    _kernel_LocalHistogram = clCreateKernel(_clProgram, "kernel__localHistogram", &clStatus);
    checkCLStatus(clStatus);

    _kernel_RadixPermute = clCreateKernel(_clProgram, "kernel__radixPermute", &clStatus);
    checkCLStatus(clStatus);

    //---- Get the workgroup size
    //clGetKernelWorkGroupInfo(_kernel_RadixLocalSort, _context->clDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &_workgroupSize, 0);
    _workgroupSize = 32;

    _scan = clpp::createBestScan(context, sizeof(int), maxElements);

    _clBuffer_radixHist1 = NULL;
    _clBuffer_radixHist2 = NULL;

    _datasetSize = 0;
    _is_clBuffersOwner = false;
}
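// checkCLStatus() is called after every OpenCL API call in this excerpt but is defined
// elsewhere in the library. A minimal sketch of such a helper, under the assumption that
// it simply aborts on error (the real clpp helper may report errors differently):
static inline void checkCLStatus_sketch(cl_int clStatus)
{
    if (clStatus != CL_SUCCESS)
    {
        fprintf(stderr, "OpenCL error: %d\n", clStatus); // hypothetical error reporting
        abort();                                         // assumes <cstdio> and <cstdlib>
    }
}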
void clppSort_RadixSort::pushCLDatas(cl_mem clBuffer_dataSet, size_t datasetSize)
{
    cl_int clStatus;

    _is_clBuffersOwner = false;

    //---- Store some values
    bool reallocate = datasetSize > _datasetSize;
    _datasetSize = datasetSize;

    //---- Prepare some buffers
    if (reallocate)
    {
        //---- Release
        if (_clBuffer_dataSet)
        {
            clReleaseMemObject(_clBuffer_dataSet);
            clReleaseMemObject(_clBuffer_dataSetOut);
            clReleaseMemObject(_clBuffer_radixHist1);
            clReleaseMemObject(_clBuffer_radixHist2);
        }

        //---- Allocate
        unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4);

        // column size = 2^b = 16
        // row size = numBlocks
        _clBuffer_radixHist1 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, sizeof(int) * 16 * numBlocks, NULL, &clStatus);
        checkCLStatus(clStatus);

        _clBuffer_radixHist2 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, (_valueSize + _keySize) * 16 * numBlocks, NULL, &clStatus);
        checkCLStatus(clStatus);
    }

    _clBuffer_dataSet = clBuffer_dataSet;
    _clBuffer_dataSetOut = clBuffer_dataSet;
}
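// roundUpDiv() is used above but not shown in this excerpt; from its use it is a ceiling
// division that yields the number of blocks of (_workgroupSize * 4) elements. A plausible
// sketch (assumed, not necessarily the library's exact definition):
static inline unsigned int roundUpDiv_sketch(unsigned int n, unsigned int divisor)
{
    // e.g. roundUpDiv_sketch(1000, 128) == 8 : eight blocks cover 1000 elements
    return (n + divisor - 1) / divisor;
}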
void clppSort_RadixSort::localHistogram(const size_t* global, const size_t* local, cl_mem data, cl_mem hist, cl_mem blockHists, int bitOffset)
{
    cl_int clStatus;

    clStatus  = clSetKernelArg(_kernel_LocalHistogram, 0, sizeof(cl_mem), (const void*)&data);
    clStatus |= clSetKernelArg(_kernel_LocalHistogram, 1, sizeof(int), (const void*)&bitOffset);
    clStatus |= clSetKernelArg(_kernel_LocalHistogram, 2, sizeof(cl_mem), (const void*)&hist);
    clStatus |= clSetKernelArg(_kernel_LocalHistogram, 3, sizeof(cl_mem), (const void*)&blockHists);
    clStatus |= clSetKernelArg(_kernel_LocalHistogram, 4, sizeof(unsigned int), (const void*)&_datasetSize);

    clStatus |= clEnqueueNDRangeKernel(_context->clQueue, _kernel_LocalHistogram, 1, NULL, global, local, 0, NULL, NULL);

#ifdef BENCHMARK
    clStatus |= clFinish(_context->clQueue);
    checkCLStatus(clStatus);
#endif
}
void clppScan_Default::pushDatas(void* values, size_t datasetSize)
{
    cl_int clStatus;

    //---- Store some values
    _values = values;
    bool reallocate = datasetSize > _datasetSize || !_is_clBuffersOwner;
    bool recompute = datasetSize != _datasetSize;
    _datasetSize = datasetSize;

    //---- Compute the block sizes we can use for '_datasetSize' (which can be < maxElements)
    // Compute the number of levels required to do the scan
    if (recompute)
    {
        _pass = 0;
        unsigned int n = _datasetSize;
        do
        {
            n = (n + _workgroupSize - 1) / _workgroupSize; // round up
            _pass++;
        }
        while (n > 1);

        // Compute the block-sum sizes
        n = _datasetSize;
        for (unsigned int i = 0; i < _pass; i++)
        {
            _blockSumsSizes[i] = n;
            n = (n + _workgroupSize - 1) / _workgroupSize; // round up
        }
        _blockSumsSizes[_pass] = n;
    }

    //---- Copy to the device
    if (reallocate)
    {
        //---- Release
        if (_clBuffer_values)
            clReleaseMemObject(_clBuffer_values);

        _clBuffer_values = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _valueSize * _datasetSize, _values, &clStatus);
        _is_clBuffersOwner = true;
        checkCLStatus(clStatus);
    }
    else
        // Just resend
        clEnqueueWriteBuffer(_context->clQueue, _clBuffer_values, CL_FALSE, 0, _valueSize * _datasetSize, _values, 0, 0, 0);
}
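// Worked example of the pass computation above, assuming _workgroupSize == 256 and
// datasetSize == 1,000,000: the do/while loop yields n = 3907, 16, 1, so _pass == 3 and
// _blockSumsSizes == {1000000, 3907, 16, 1}. clppScan_Default::scan() launches one scan
// level per pass and then propagates the partial block sums back down with the
// uniform-add kernel.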
void clppSort_RadixSortGPU::pushDatas(void* dataSet, size_t datasetSize)
{
    cl_int clStatus;

    //---- Store some values
    _dataSet = dataSet;
    _dataSetOut = dataSet;
    bool reallocate = datasetSize > _datasetSize || !_is_clBuffersOwner;
    _datasetSize = datasetSize;

    //---- Prepare some buffers
    if (reallocate)
    {
        //---- Release
        if (_clBuffer_dataSet)
        {
            clReleaseMemObject(_clBuffer_dataSet);
            clReleaseMemObject(_clBuffer_dataSetOut);
            clReleaseMemObject(_clBuffer_radixHist1);
            clReleaseMemObject(_clBuffer_radixHist2);
        }

        //---- Allocate
        unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4);

        _clBuffer_radixHist1 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * 16 * numBlocks, NULL, &clStatus);
        checkCLStatus(clStatus);

        _clBuffer_radixHist2 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * 2 * 16 * numBlocks, NULL, &clStatus);
        checkCLStatus(clStatus);

        //---- Copy to the device
        if (_keysOnly)
        {
            _clBuffer_dataSet = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _keySize * _datasetSize, _dataSet, &clStatus);
            checkCLStatus(clStatus);

            _clBuffer_dataSetOut = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * _datasetSize, NULL, &clStatus);
            checkCLStatus(clStatus);
        }
        else
        {
            _clBuffer_dataSet = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (_valueSize + _keySize) * _datasetSize, _dataSet, &clStatus);
            checkCLStatus(clStatus);

            _clBuffer_dataSetOut = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, (_valueSize + _keySize) * _datasetSize, NULL, &clStatus);
            checkCLStatus(clStatus);
        }

        _is_clBuffersOwner = true;
    }
    else
    {
        // Just resend. Use the key-only record size when sorting keys only,
        // otherwise the smaller buffer created above would be overrun.
        size_t recordSize = _keysOnly ? _keySize : (_valueSize + _keySize);
        clEnqueueWriteBuffer(_context->clQueue, _clBuffer_dataSet, CL_FALSE, 0, recordSize * _datasetSize, _dataSet, 0, 0, 0);
    }
}
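// A minimal host-side usage sketch for the key/value path above. It assumes a clppContext
// set up elsewhere, and that the sorter exposes sort() and popDatas() counterparts to
// pushDatas(), as the scan primitive further down does; neither is shown in this excerpt.
// Records are 32-bit key/value pairs since _keySize == _valueSize == 4.
void radixSortExample_sketch(clppContext* context, unsigned int* keyValuePairs, size_t count)
{
    clppSort_RadixSortGPU sorter(context, (unsigned int)count, 32, false); // 32 key bits, keys + values
    sorter.pushDatas(keyValuePairs, count); // upload (or wrap) the host data
    sorter.sort();                          // assumed entry point that runs the radix passes
    sorter.popDatas();                      // assumed: reads the sorted data back into 'keyValuePairs'
}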
void clppSort_RadixSort::radixPermute(const size_t* global, const size_t* local, cl_mem* dataIn, cl_mem* dataOut, cl_mem* histScan, cl_mem* blockHists, int bitOffset, unsigned int numBlocks)
{
    cl_int clStatus;

    clStatus  = clSetKernelArg(_kernel_RadixPermute, 0, sizeof(cl_mem), (const void*)dataIn);
    clStatus |= clSetKernelArg(_kernel_RadixPermute, 1, sizeof(cl_mem), (const void*)dataOut);
    clStatus |= clSetKernelArg(_kernel_RadixPermute, 2, sizeof(cl_mem), (const void*)histScan);
    clStatus |= clSetKernelArg(_kernel_RadixPermute, 3, sizeof(cl_mem), (const void*)blockHists);
    clStatus |= clSetKernelArg(_kernel_RadixPermute, 4, sizeof(int), (const void*)&bitOffset);
    clStatus |= clSetKernelArg(_kernel_RadixPermute, 5, sizeof(unsigned int), (const void*)&_datasetSize);
    clStatus |= clSetKernelArg(_kernel_RadixPermute, 6, sizeof(unsigned int), (const void*)&numBlocks);

    clStatus |= clEnqueueNDRangeKernel(_context->clQueue, _kernel_RadixPermute, 1, NULL, global, local, 0, NULL, NULL);

#ifdef BENCHMARK
    clStatus |= clFinish(_context->clQueue);
    checkCLStatus(clStatus);
#endif
}
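// The three launch helpers in this excerpt (radixLocal, localHistogram, radixPermute),
// together with a scan of the per-block histograms, form one radix pass over a 4-bit
// digit (hence the 16 histogram buckets). The driver loop is not shown here, but it
// presumably advances bitOffset by 4 per pass up to the '_bits' requested in the
// constructor, ping-ponging between _clBuffer_dataSet and _clBuffer_dataSetOut.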
void clppSort_RadixSort::radixLocal(const size_t* global, const size_t* local, cl_mem* data, int bitOffset)
{
    cl_int clStatus;

    unsigned int a = 0;
    if (_keysOnly)
        // 2 KV array of 128 items (2 for permutations)
        clStatus = clSetKernelArg(_kernel_RadixLocalSort, a++, _keySize * 2 * 4 * _workgroupSize, (const void*)NULL);
    else
        // 2 KV array of 128 items (2 for permutations)
        clStatus = clSetKernelArg(_kernel_RadixLocalSort, a++, (_valueSize + _keySize) * 2 * 4 * _workgroupSize, (const void*)NULL);

    clStatus |= clSetKernelArg(_kernel_RadixLocalSort, a++, sizeof(cl_mem), (const void*)data);
    clStatus |= clSetKernelArg(_kernel_RadixLocalSort, a++, sizeof(int), (const void*)&bitOffset);
    clStatus |= clSetKernelArg(_kernel_RadixLocalSort, a++, sizeof(unsigned int), (const void*)&_datasetSize);

    clStatus |= clEnqueueNDRangeKernel(_context->clQueue, _kernel_RadixLocalSort, 1, NULL, global, local, 0, NULL, NULL);

#ifdef BENCHMARK
    clStatus |= clFinish(_context->clQueue);
    checkCLStatus(clStatus);
#endif
}
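// Local-memory sizing above: each work-item handles 4 elements, so a workgroup holds
// 4 * _workgroupSize items (128 with the fixed _workgroupSize of 32), and the allocation
// is doubled so the kernel can ping-pong between two copies while permuting. With 4-byte
// keys and values that is 4 * 2 * 4 * 32 = 1024 bytes keys-only, or 2048 bytes for
// key/value pairs.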
void clppScan_GPU::scan()
{
    cl_int clStatus;

    int blockSize = _datasetSize / _workgroupSize;
    int B = blockSize * _workgroupSize;
    if ((_datasetSize % _workgroupSize) > 0)
        blockSize++;

    size_t localWorkSize = _workgroupSize;
    size_t globalWorkSize = toMultipleOf(_datasetSize / blockSize, _workgroupSize);

    clStatus  = clSetKernelArg(kernel__scan, 0, _workgroupSize * _valueSize, 0);
    clStatus |= clSetKernelArg(kernel__scan, 1, sizeof(cl_mem), &_clBuffer_values);
    clStatus |= clSetKernelArg(kernel__scan, 2, sizeof(int), &B);
    clStatus |= clSetKernelArg(kernel__scan, 3, sizeof(int), &_datasetSize);
    clStatus |= clSetKernelArg(kernel__scan, 4, sizeof(int), &blockSize);

    clStatus |= clEnqueueNDRangeKernel(_context->clQueue, kernel__scan, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    checkCLStatus(clStatus);
}
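// toMultipleOf() is used above but not defined in this excerpt; from its use as a global
// work size it rounds a value up to the next multiple of the workgroup size. A plausible
// sketch (assumed, not necessarily the library's exact definition):
static inline size_t toMultipleOf_sketch(size_t n, size_t base)
{
    // e.g. toMultipleOf_sketch(1000, 256) == 1024
    return ((n + base - 1) / base) * base;
}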
clppScan_GPU::clppScan_GPU(clppContext* context, size_t valueSize, unsigned int maxElements) : clppScan(context, valueSize, maxElements)
{
    cl_int clStatus;

    _clBuffer_values = 0;

    //---- Compilation
    if (!compile(context, "clppScan_GPU.cl"))
        return;

    //---- Prepare all the kernels
    kernel__scan = clCreateKernel(_clProgram, "kernel__scan_block_anylength", &clStatus);
    checkCLStatus(clStatus);

    //---- Get the workgroup size
    // ATI: the wavefront size is 64 on the high-end cards (48xx, 57xx, 58xx), but 32 on the mid-range cards and 16 on the low-end cards.
    // NVidia: 32
    clGetKernelWorkGroupInfo(kernel__scan, _context->clDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &_workgroupSize, 0);
    //clGetKernelWorkGroupInfo(kernel__scan, _context->clDevice, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &_workgroupSize, 0);

    _is_clBuffersOwner = false;
}
void clppScan_Default::scan()
{
    cl_int clStatus;

    clStatus = clSetKernelArg(_kernel_Scan, 1, _workgroupSize * _valueSize, 0);
    checkCLStatus(clStatus);

    //---- Apply the scan to each level
    cl_mem clValues = _clBuffer_values;
    for (unsigned int i = 0; i < _pass; i++)
    {
        size_t globalWorkSize = toMultipleOf(_blockSumsSizes[i] / 2, _workgroupSize / 2);
        size_t localWorkSize = _workgroupSize / 2;

        clStatus  = clSetKernelArg(_kernel_Scan, 0, sizeof(cl_mem), &clValues);
        clStatus |= clSetKernelArg(_kernel_Scan, 2, sizeof(cl_mem), &_clBuffer_BlockSums[i]);
        clStatus |= clSetKernelArg(_kernel_Scan, 3, sizeof(int), &_blockSumsSizes[i]);

        clStatus |= clEnqueueNDRangeKernel(_context->clQueue, _kernel_Scan, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        checkCLStatus(clStatus);

        clValues = _clBuffer_BlockSums[i];
    }

    //---- Uniform addition
    for (int i = _pass - 2; i >= 0; i--)
    {
        size_t globalWorkSize = toMultipleOf(_blockSumsSizes[i] / 2, _workgroupSize / 2);
        size_t localWorkSize = _workgroupSize / 2;

        cl_mem dest = (i > 0) ? _clBuffer_BlockSums[i - 1] : _clBuffer_values;

        clStatus = clSetKernelArg(_kernel_UniformAdd, 0, sizeof(cl_mem), &dest);
        checkCLStatus(clStatus);

        clStatus = clSetKernelArg(_kernel_UniformAdd, 1, sizeof(cl_mem), &_clBuffer_BlockSums[i]);
        checkCLStatus(clStatus);

        clStatus = clSetKernelArg(_kernel_UniformAdd, 2, sizeof(int), &_blockSumsSizes[i]);
        checkCLStatus(clStatus);

        clStatus = clEnqueueNDRangeKernel(_context->clQueue, _kernel_UniformAdd, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        checkCLStatus(clStatus);
    }
}
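// Work-size note for the launches above: both the scan and uniform-add kernels are
// launched with half as many work-items as elements at that level (the '/ 2' factors),
// which matches the usual work-efficient scan layout where each work-item loads and
// processes two elements; the excerpt does not show the kernels, so this is inferred
// from the sizing rather than stated by the library.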
void clppScan_Default::popDatas()
{
    cl_int clStatus = clEnqueueReadBuffer(_context->clQueue, _clBuffer_values, CL_TRUE, 0, _valueSize * _datasetSize, _values, 0, NULL, NULL);
    checkCLStatus(clStatus);
}
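// A minimal host-side usage sketch for the scan primitive, based on the pushDatas(),
// scan() and popDatas() methods shown above. It assumes a clppContext created elsewhere
// and a clppScan_Default constructor taking (context, valueSize, maxElements) like the
// clppScan_GPU constructor shown earlier.
void scanExample_sketch(clppContext* context, int* values, size_t count)
{
    clppScan_Default scan(context, sizeof(int), (unsigned int)count);
    scan.pushDatas(values, count); // upload (or wrap) the host array
    scan.scan();                   // run the per-level scans, then the uniform additions
    scan.popDatas();               // blocking read-back of the scanned values into 'values'
}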