Esempio n. 1
0
void ZSort::sort(float* sortArray,int numberofValues){
    int ctaSize=128;
	/*if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize)) 
	{
		ctaSize = 128;
	}*/

    printf("Running Radix Sort on %d GPU(s) ...\n\n", nDevice);

	unsigned int numElements = numberofValues;

    // Alloc and init some data on the host, then alloc and init GPU buffer  
    unsigned int **h_keys       = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    cl_mem       *d_keys        = (cl_mem*       )malloc(nDevice * sizeof(cl_mem));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        h_keys[iDevice]       = (unsigned int*)malloc(numElements * sizeof(unsigned int));
	    h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        memcpy( h_keys[iDevice], sortArray, sizeof(unsigned int)*numberofValues );
        //makeRandomUintVector(h_keys[iDevice], numElements, keybits);

        d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, 
            sizeof(unsigned int) * numElements, NULL, &ciErrNum);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, 
            sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL);
        //oclCheckError(ciErrNum, CL_SUCCESS);
    }
	
    // instantiate RadixSort objects
    RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*));
    //StartCounter();
    printf("Number of devices: %i\n",nDevice);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
	    radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements, "./", ctaSize, true);		    
    }
        for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
        {
	        radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits);
        }
    // copy sorted keys to CPU 
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
	    clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, 
            h_keysSorted[iDevice], 0, NULL, NULL);
    }
    //printf("Size %d array sorted in %.6f milliseconds.\n",numElements, GetCounter() );
	// Check results
	bool passed = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
	    passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements);
    }
    printf((passed?"Passed":"Failed"));
    // cleanup allocs
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseMemObject(d_keys[iDevice]);
	    free(h_keys[iDevice]);
	    free(h_keysSorted[iDevice]);
        delete radixSort[iDevice];
    }
    free(radixSort);
    free(h_keys);
    free(h_keysSorted);
    
    // remaining cleanup and exit
	free(cdDevices);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
	    clReleaseCommandQueue(cqCommandQueue[iDevice]);
    }
    clReleaseContext(cxGPUContext);

    // finish
    //shrQAFinishExit(argc, (const char **)argv, passed ? QA_PASSED : QA_FAILED);

    //shrEXIT(argc, argv);
}
Esempio n. 2
0
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                      // OpenCL platform
    cl_uint nDevice;                                // OpenCL device count
    cl_device_id* cdDevices;                        // OpenCL device list
    cl_context cxGPUContext;                        // OpenCL context
    cl_command_queue cqCommandQueue[MAX_GPU_COUNT]; // OpenCL command que
    cl_int ciErrNum;

    shrSetLogFileName ("oclRadixSort.txt");
    shrLog("%s starting...\n\n", argv[0]);

    shrLog("clGetPlatformID...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clGetDeviceIDs...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("Create command queue...\n\n");
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // get & log device index # and name
        cl_device_id cdDevice = cdDevices[id_device];

        // create a command que
        cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    int ctaSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize))
    {
        ctaSize = 128;
    }

    shrLog("Running Radix Sort on %d GPU(s) ...\n\n", nDevice);

    unsigned int numElements = 1048576;//128*128*128*2;

    // Alloc and init some data on the host, then alloc and init GPU buffer
    unsigned int **h_keys       = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    cl_mem       *d_keys        = (cl_mem*       )malloc(nDevice * sizeof(cl_mem));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        h_keys[iDevice]       = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        makeRandomUintVector(h_keys[iDevice], numElements, keybits);

        d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                sizeof(unsigned int) * numElements, NULL, &ciErrNum);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0,
                sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    }

    // instantiate RadixSort objects
    RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements, argv[0], ctaSize, true);
    }

#ifdef GPU_PROFILING
    int numIterations = 30;
    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
            {
                clFinish(cqCommandQueue[iDevice]);
            }
            shrDeltaT(1);
        }
#endif
        for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
        {
            radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits);
        }
#ifdef GPU_PROFILING
    }
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQueue[iDevice]);
    }
    double gpuTime = shrDeltaT(1)/(double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclRadixSort, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %d, Workgroup = %d\n",
            (1.0e-6 * (double)(nDevice * numElements)/gpuTime), gpuTime, nDevice * numElements, nDevice, ctaSize);
#endif

    // copy sorted keys to CPU
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements,
                h_keysSorted[iDevice], 0, NULL, NULL);
    }

    // Check results
    bool passed = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements);
    }
    shrLog("\n%s\n\n", passed ? "PASSED" : "FAILED");

    // cleanup allocs
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseMemObject(d_keys[iDevice]);
        free(h_keys[iDevice]);
        free(h_keysSorted[iDevice]);
        delete radixSort[iDevice];
    }
    free(radixSort);
    free(h_keys);
    free(h_keysSorted);

    // remaining cleanup and exit
    free(cdDevices);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseCommandQueue(cqCommandQueue[iDevice]);
    }
    clReleaseContext(cxGPUContext);
    shrEXIT(argc, argv);
}