void ZSort::sort(float* sortArray,int numberofValues){ int ctaSize=128; /*if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize)) { ctaSize = 128; }*/ printf("Running Radix Sort on %d GPU(s) ...\n\n", nDevice); unsigned int numElements = numberofValues; // Alloc and init some data on the host, then alloc and init GPU buffer unsigned int **h_keys = (unsigned int**)malloc(nDevice * sizeof(unsigned int*)); unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*)); cl_mem *d_keys = (cl_mem* )malloc(nDevice * sizeof(cl_mem)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { h_keys[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int)); h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int)); memcpy( h_keys[iDevice], sortArray, sizeof(unsigned int)*numberofValues ); //makeRandomUintVector(h_keys[iDevice], numElements, keybits); d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(unsigned int) * numElements, NULL, &ciErrNum); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL); //oclCheckError(ciErrNum, CL_SUCCESS); } // instantiate RadixSort objects RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*)); //StartCounter(); printf("Number of devices: %i\n",nDevice); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements, "./", ctaSize, true); } for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits); } // copy sorted keys to CPU for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, h_keysSorted[iDevice], 0, NULL, NULL); } //printf("Size %d array sorted in %.6f milliseconds.\n",numElements, GetCounter() ); // Check results bool passed = true; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements); } printf((passed?"Passed":"Failed")); // cleanup allocs for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clReleaseMemObject(d_keys[iDevice]); free(h_keys[iDevice]); free(h_keysSorted[iDevice]); delete radixSort[iDevice]; } free(radixSort); free(h_keys); free(h_keysSorted); // remaining cleanup and exit free(cdDevices); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clReleaseCommandQueue(cqCommandQueue[iDevice]); } clReleaseContext(cxGPUContext); // finish //shrQAFinishExit(argc, (const char **)argv, passed ? QA_PASSED : QA_FAILED); //shrEXIT(argc, argv); }
int main(int argc, const char **argv) { cl_platform_id cpPlatform; // OpenCL platform cl_uint nDevice; // OpenCL device count cl_device_id* cdDevices; // OpenCL device list cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQueue[MAX_GPU_COUNT]; // OpenCL command que cl_int ciErrNum; shrSetLogFileName ("oclRadixSort.txt"); shrLog("%s starting...\n\n", argv[0]); shrLog("clGetPlatformID...\n"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetDeviceIDs...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clCreateContext...\n"); cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("Create command queue...\n\n"); int id_device; if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line { // get & log device index # and name cl_device_id cdDevice = cdDevices[id_device]; // create a command que cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); oclPrintDevInfo(LOGBOTH, cdDevice); nDevice = 1; } else { // create command queues for all available devices for (cl_uint i = 0; i < nDevice; i++) { cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); } for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]); } int ctaSize; if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize)) { ctaSize = 128; } shrLog("Running Radix Sort on %d GPU(s) ...\n\n", nDevice); unsigned int numElements = 1048576;//128*128*128*2; // Alloc and init some data on the host, then alloc and init GPU buffer unsigned int **h_keys = (unsigned int**)malloc(nDevice * sizeof(unsigned int*)); unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*)); cl_mem *d_keys = (cl_mem* )malloc(nDevice * sizeof(cl_mem)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { h_keys[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int)); h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int)); makeRandomUintVector(h_keys[iDevice], numElements, keybits); d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(unsigned int) * numElements, NULL, &ciErrNum); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); } // instantiate RadixSort objects RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements, argv[0], ctaSize, true); } #ifdef GPU_PROFILING int numIterations = 30; for (int i = -1; i < numIterations; i++) { if (i == 0) { for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQueue[iDevice]); } shrDeltaT(1); } #endif for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits); } #ifdef GPU_PROFILING } for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQueue[iDevice]); } double gpuTime = shrDeltaT(1)/(double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclRadixSort, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %d, Workgroup = %d\n", (1.0e-6 * (double)(nDevice * numElements)/gpuTime), gpuTime, nDevice * numElements, nDevice, ctaSize); #endif // copy sorted keys to CPU for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, h_keysSorted[iDevice], 0, NULL, NULL); } // Check results bool passed = true; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements); } shrLog("\n%s\n\n", passed ? "PASSED" : "FAILED"); // cleanup allocs for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clReleaseMemObject(d_keys[iDevice]); free(h_keys[iDevice]); free(h_keysSorted[iDevice]); delete radixSort[iDevice]; } free(radixSort); free(h_keys); free(h_keysSorted); // remaining cleanup and exit free(cdDevices); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clReleaseCommandQueue(cqCommandQueue[iDevice]); } clReleaseContext(cxGPUContext); shrEXIT(argc, argv); }