void clppSort_RadixSortGPU::radixLocal(const size_t* global, const size_t* local, cl_mem data, cl_mem hist, cl_mem blockHists, int bitOffset) { cl_int clStatus; unsigned int a = 0; int workgroupSize = 128; unsigned int Ndiv = roundUpDiv(_datasetSize, 4); // Each work item handle 4 entries size_t global_128[1] = {toMultipleOf(Ndiv, workgroupSize)}; size_t local_128[1] = {workgroupSize}; /*if (_keysOnly) clStatus = clSetKernelArg(_kernel_RadixLocalSort, a++, _keySize * 2 * 4 * workgroupSize, (const void*)NULL); else clStatus = clSetKernelArg(_kernel_RadixLocalSort, a++, (_valueSize+_keySize) * 2 * 4 * workgroupSize, (const void*)NULL);// 2 KV array of 128 items (2 for permutations)*/ clStatus |= clSetKernelArg(_kernel_RadixLocalSort, a++, sizeof(cl_mem), (const void*)&data); clStatus |= clSetKernelArg(_kernel_RadixLocalSort, a++, sizeof(int), (const void*)&bitOffset); clStatus |= clSetKernelArg(_kernel_RadixLocalSort, a++, sizeof(unsigned int), (const void*)&_datasetSize); clStatus |= clEnqueueNDRangeKernel(_context->clQueue, _kernel_RadixLocalSort, 1, NULL, global_128, local_128, 0, NULL, NULL); #ifdef BENCHMARK clStatus |= clFinish(_context->clQueue); checkCLStatus(clStatus); #endif }
void clppSort_RadixSortGPU::pushDatas(void* dataSet, size_t datasetSize) { cl_int clStatus; //---- Store some values _dataSet = dataSet; _dataSetOut = dataSet; bool reallocate = datasetSize > _datasetSize || !_is_clBuffersOwner; _datasetSize = datasetSize; //---- Prepare some buffers if (reallocate) { //---- Release if (_clBuffer_dataSet) { clReleaseMemObject(_clBuffer_dataSet); clReleaseMemObject(_clBuffer_dataSetOut); clReleaseMemObject(_clBuffer_radixHist1); clReleaseMemObject(_clBuffer_radixHist2); } //---- Allocate unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4); _clBuffer_radixHist1 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * 16 * numBlocks, NULL, &clStatus); checkCLStatus(clStatus); _clBuffer_radixHist2 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * 2 * 16 * numBlocks, NULL, &clStatus); checkCLStatus(clStatus); //---- Copy on the device if (_keysOnly) { _clBuffer_dataSet = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _keySize * _datasetSize, _dataSet, &clStatus); checkCLStatus(clStatus); _clBuffer_dataSetOut = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * _datasetSize, NULL, &clStatus); checkCLStatus(clStatus); } else { _clBuffer_dataSet = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (_valueSize+_keySize) * _datasetSize, _dataSet, &clStatus); checkCLStatus(clStatus); _clBuffer_dataSetOut = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, (_valueSize+_keySize) * _datasetSize, NULL, &clStatus); checkCLStatus(clStatus); } _is_clBuffersOwner = true; } else // Just resend clEnqueueWriteBuffer(_context->clQueue, _clBuffer_dataSet, CL_FALSE, 0, (_valueSize+_keySize) * _datasetSize, _dataSet, 0, 0, 0); }
void clppSort_RadixSort::pushCLDatas(cl_mem clBuffer_dataSet, size_t datasetSize) { cl_int clStatus; _is_clBuffersOwner = false; //---- Store some values bool reallocate = datasetSize > _datasetSize; _datasetSize = datasetSize; //---- Prepare some buffers if (reallocate) { //---- Release if (_clBuffer_dataSet) { clReleaseMemObject(_clBuffer_dataSet); clReleaseMemObject(_clBuffer_dataSetOut); clReleaseMemObject(_clBuffer_radixHist1); clReleaseMemObject(_clBuffer_radixHist2); } //---- Allocate unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4); // column size = 2^b = 16 // row size = numblocks _clBuffer_radixHist1 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, sizeof(int) * 16 * numBlocks, NULL, &clStatus); checkCLStatus(clStatus); _clBuffer_radixHist2 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, sizeof(int) * 16 * numBlocks, NULL, &clStatus); checkCLStatus(clStatus); } // ISSUE : We need 2 different buffers, but // a) when using 32 bits sort(by example) the result buffer is _clBuffer_dataSet // b) when using 28 bits sort(by example) the result buffer is _clBuffer_dataSetOut // Without copy, how can we do to put the result in _clBuffer_dataSet when using 28 bits ? _clBuffer_dataSet = clBuffer_dataSet; if (_keysOnly) _clBuffer_dataSetOut = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, _keySize * _datasetSize, NULL, &clStatus); else _clBuffer_dataSetOut = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, (_valueSize+_keySize) * _datasetSize, NULL, &clStatus); checkCLStatus(clStatus); }
void clppSort_RadixSortGPU::pushCLDatas(cl_mem clBuffer_dataSet, size_t datasetSize) { cl_int clStatus; _is_clBuffersOwner = false; //---- Store some values bool reallocate = datasetSize > _datasetSize; _datasetSize = datasetSize; //---- Prepare some buffers if (reallocate) { //---- Release if (_clBuffer_dataSet) { clReleaseMemObject(_clBuffer_dataSet); clReleaseMemObject(_clBuffer_dataSetOut); clReleaseMemObject(_clBuffer_radixHist1); clReleaseMemObject(_clBuffer_radixHist2); } //---- Allocate unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4); // column size = 2^b = 16 // row size = numblocks _clBuffer_radixHist1 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, sizeof(int) * 16 * numBlocks, NULL, &clStatus); checkCLStatus(clStatus); _clBuffer_radixHist2 = clCreateBuffer(_context->clContext, CL_MEM_READ_WRITE, (_valueSize + _keySize) * 16 * numBlocks, NULL, &clStatus); checkCLStatus(clStatus); } _clBuffer_dataSet = clBuffer_dataSet; _clBuffer_dataSetOut = clBuffer_dataSet; }
void clppSort_RadixSort::sort() { // Satish et al. empirically set b = 4. The size of a work-group is in hundreds of // work-items, depending on the concrete device and each work-item processes more than one // stream element, usually 4, in order to hide latencies. StopWatch sw; cl_int clStatus; unsigned int numBlocks = roundUpDiv(_datasetSize, _workgroupSize * 4); unsigned int Ndiv4 = roundUpDiv(_datasetSize, 4); size_t global[1] = {toMultipleOf(Ndiv4, _workgroupSize)}; size_t local[1] = {_workgroupSize}; cl_mem* dataA = &_clBuffer_dataSet; cl_mem* dataB = &_clBuffer_dataSetOut; for(unsigned int bitOffset = 0; bitOffset < _bits; bitOffset += 4) { // 1) Each workgroup sorts its tile by using local memory // 2) Create an histogram of d=2^b digits entries #ifdef BENCHMARK sw.StartTimer(); #endif radixLocal(global, local, dataA, bitOffset); #ifdef BENCHMARK sw.StopTimer(); cout << "Local sort " << sw.GetElapsedTime() << endl; sw.StartTimer(); #endif localHistogram(global, local, dataA, &_clBuffer_radixHist1, &_clBuffer_radixHist2, bitOffset); #ifdef BENCHMARK sw.StopTimer(); cout << "Local histogram " << sw.GetElapsedTime() << endl; //********** //clEnqueueReadBuffer(_context->clQueue, dataA, CL_TRUE, 0, sizeof(int) * _datasetSize, _dataSetOut, 0, NULL, NULL); //********** // 3) Scan the p*2^b = p*(16) entry histogram table. Stored in column-major order, computes global digit offsets. sw.StartTimer(); #endif _scan->pushCLDatas(_clBuffer_radixHist1, 16 * numBlocks); _scan->scan(); #ifdef BENCHMARK _scan->waitCompletion(); sw.StopTimer(); cout << "Global scan " << sw.GetElapsedTime() << endl; // 4) Prefix sum results are used to scatter each work-group's elements to their correct position. sw.StartTimer(); #endif radixPermute(global, local, dataA, dataB, &_clBuffer_radixHist1, &_clBuffer_radixHist2, bitOffset, numBlocks); #ifdef BENCHMARK sw.StopTimer(); cout << "Global reorder " << sw.GetElapsedTime() << endl; #endif std::swap(dataA, dataB); } }