Пример #1
0
// Main function 
// *********************************************************************
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);
    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");

    // start logs
	cExecutableName = argv[0];
    shrSetLogFileName ("oclMatVecMul.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // calculate matrix height given GPU memory
    shrLog("Determining Matrix height from available GPU mem...\n");
    memsize_t memsize;
    getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv);
    height = memsize/width/16;
    if (height > MAX_HEIGHT)
        height = MAX_HEIGHT;
    shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height); 

    // Allocate and initialize host arrays
    shrLog("Allocate and Init Host Mem...\n\n");
    unsigned int size = width * height;
    unsigned int mem_size_M = size * sizeof(float);
    M = (float*)malloc(mem_size_M);
    unsigned int mem_size_V = width * sizeof(float);
    V = (float*)malloc(mem_size_V);
    unsigned int mem_size_W = height * sizeof(float);
    W = (float*)malloc(mem_size_W);
    shrFillArray(M, size);
    shrFillArray(V, width);
    Golden = (float*)malloc(mem_size_W);
    MatVecMulHost(M, V, width, height, Golden);

    //Get the NVIDIA platform
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    //Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on targetDevice
    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
    if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE) 
    {
        targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", targetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[targetDevice]);  
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    shrLog("\n # of Compute Units = %u\n\n", num_compute_units); 

    //Create the context
    shrLog("clCreateContext...\n"); 
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n"); 
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M); 
    cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n"); 
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program
    shrLog("clBuildProgram...\n"); 
    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx");
        shrQAFinish(argc, (const char **)argv, QA_FAILED);
        Cleanup(EXIT_FAILURE); 
    }

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (M and V)...\n\n"); 
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Kernels
    const char* kernels[] = {
        "MatVecMulUncoalesced0",
        "MatVecMulUncoalesced1",
        "MatVecMulCoalesced0",
        "MatVecMulCoalesced1",
        "MatVecMulCoalesced2",
        "MatVecMulCoalesced3" };

    for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) {
        shrLog("Running with Kernel %s...\n\n", kernels[k]); 

        // Clear result
        shrLog("  Clear result with clEnqueueWriteBuffer (W)...\n"); 
        memset(W, 0, mem_size_W);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Create the kernel
        shrLog("  clCreateKernel...\n"); 
        if (ckKernel) {
            clReleaseKernel(ckKernel);
            ckKernel = 0;
        }
        ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Set and log Global and Local work size dimensions
        szLocalWorkSize = 256;
        if (k == 0)
            szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height);  // rounded up to the nearest multiple of the LocalWorkSize
        else
            // Some experiments should be done here for determining the best global work size for a given device
            // We will assume here that we can run 2 work-groups per compute unit
            szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize;
        shrLog("  Global Work Size \t\t= %u\n  Local Work Size \t\t= %u\n  # of Work Groups \t\t= %u\n", 
               szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

        // Set the Argument values
        shrLog("  clSetKernelArg...\n\n");
        int n = 0;
        ciErrNum = clSetKernelArg(ckKernel,  n++, sizeof(cl_mem), (void*)&cmM);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW);
        if (k > 1)
            ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0);    
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Launch kernel
        shrLog("  clEnqueueNDRangeKernel (%s)...\n", kernels[k]); 
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Read back results and check accumulated errors
        shrLog("  clEnqueueReadBuffer (W)...\n"); 
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    #ifdef GPU_PROFILING
        // Execution time
        ciErrNum = clWaitForEvents(1, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cl_ulong start, end;
        ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        double dSeconds = 1.0e-9 * (double)(end - start);
        shrLog("  Kernel execution time: %.5f s\n\n", dSeconds);
    #endif

        // Compare results for golden-host and report errors and pass/fail
        shrLog("  Comparing against Host/C++ computation...\n\n"); 
        shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f);
        shrLog("    GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
        bPassFlag &= (res == shrTRUE); 

        // Release event
        ciErrNum = clReleaseEvent(ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ceEvent = 0;
    }

    // Master status Pass/Fail (all tests)
    shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) );

    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}
Пример #2
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for 
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id *cdDevices = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        // User specified GPUs
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   
        ciDeviceCount = 0;
        while(deviceStr != NULL) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
			if( device == (cl_device_id) -1  ) {
				shrLog(" Device %s does not exist!\n", deviceStr);
				return -1;
			}
			
			shrLog("Device %s: ", deviceStr);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");
           
            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
                
            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
	    size_t nDeviceBytes;
	    ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
	    ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        } 

        // create command-queues
        for(unsigned int i = 0; i < ciDeviceCount; ++i) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");

            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }

    // Optional Command-line multiplier for matrix sizes
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); 
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
    uiWA = WA * iSizeMultiple;
    uiHA = HA * iSizeMultiple;
    uiWB = WB * iSizeMultiple;
    uiHB = HB * iSizeMultiple;
    uiWC = WC * iSizeMultiple;
    uiHC = HC * iSizeMultiple;
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", 
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC);

    // allocate host memory for matrices A and B
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A_data = (float*)malloc(mem_size_A);
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B_data = (float*)malloc(mem_size_B);

    // initialize host memory
    srand(2006);
    shrFillArray(h_A_data, size_A);
    shrFillArray(h_B_data, size_B);

    // allocate host memory for result
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // create OpenCL buffer pointing to the host memory
    cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
				    mem_size_A, h_A_data, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: clCreateBuffer\n");
        return ciErrNum;
    }

    // Program Setup
    size_t program_length;
    const char* header_path = shrFindFilePath("matrixMul.h", argv[0]);
    oclCheckError(header_path != NULL, shrTRUE);
    char* header = oclLoadProgSource(header_path, "", &program_length);
    if(!header)
    {
        shrLog("Error: Failed to load the header %s!\n", header_path);
        return -1000;
    }
    const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, header, &program_length);
    if(!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", source_path);
        return -2000;
    }

    // create the program
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, 
                                                    &program_length, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        return ciErrNum;
    }
    free(header);
    free(source);
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
        return ciErrNum;
    }

    // write out PTX if requested on the command line
    if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") )
    {
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
    }

    // Create Kernel
    for(unsigned int i = 0; i < ciDeviceCount; ++i) {
        multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog("Error: Failed to create kernel\n");
            return ciErrNum;
        }
    }
        
    // Run multiplication on 1..deviceCount GPUs to compare improvement
    shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount);
    for(unsigned int k = 1; k <= ciDeviceCount; ++k) 
    {
        matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C);
    }

    // compute reference solution
    shrLog("Comparing results with CPU computation... \n\n");
    float* reference = (float*) malloc(mem_size_C);
    computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB);

    // check result
    shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (res != shrTRUE) 
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }

    // clean up OCL resources
    ciErrNum = clReleaseMemObject(h_A);
    for(unsigned int k = 0; k < ciDeviceCount; ++k) 
    {
        ciErrNum |= clReleaseKernel( multiplicationKernel[k] );
        ciErrNum |= clReleaseCommandQueue( commandQueue[k] );
    }
    ciErrNum |= clReleaseProgram(cpProgram);
    ciErrNum |= clReleaseContext(cxGPUContext);
    if(ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum);
        return ciErrNum;
    }

    // clean up memory
    free(h_A_data);
    free(h_B_data);
    free(h_C);
    free(reference);
    
    return ((shrTRUE == res) ? CL_SUCCESS : -3000);
}
Пример #3
0
int main(int argc, const char** argv) {
	cl_uint platform_count;
	cl_platform_id platforms[5];

	cl_int err = CL_SUCCESS;
	unsigned int i, p;

	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

	void * ptrs[BLOCKS];
	cl_command_queue cqs[BLOCKS];
	cl_mem d_A[BLOCKS];
	cl_mem d_C[BLOCKS];
	cl_mem d_B[BLOCKS];

	cl_event GPUDone[BLOCKS];
	cl_event GPUExecution[BLOCKS];
	struct timeval start, end;

	int workOffset[BLOCKS];
	int workSize[BLOCKS];

	unsigned int sizePerGPU = HC / BLOCKS;
	unsigned int sizeMod = HC % BLOCKS;

	size_t A_size = WA * HA;
	size_t A_mem_size = sizeof(TYPE) * A_size;
	TYPE* A_data;

	size_t B_size = WB * HB;
	size_t B_mem_size = sizeof(TYPE) * B_size;
	TYPE* B_data;

	size_t C_size = WC * HC;
	size_t C_mem_size = sizeof(TYPE) * C_size;
	TYPE* C_data;

	parse_args(argc, argv);

	check(clGetPlatformIDs(5, platforms, &platform_count));
	if (platform_count == 0) {
		printf("No platform found\n");
		exit(77);
	}

	cl_uint device_count;
	cl_uint devs[platform_count];
	cl_device_id * devices[platform_count];
	cl_context ctx[platform_count];
	cl_command_queue * commandQueue[platform_count];

	device_count = 0;
	for (p=0; p<platform_count; p++) {
		cl_platform_id platform = platforms[p];

		err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
		if (err == CL_DEVICE_NOT_FOUND) {
			devs[p] = 0;
			continue;
		}
		if (devs[p] == 0) {
		     printf("No OpenCL device found\n");
		     exit(77);
		}
		if (err != CL_SUCCESS) {
			fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err);
			exit(EXIT_FAILURE);
		}
		if (devs[p] == 0)
			continue;

		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

		for(i = 0; i < devs[p]; ++i)
		{
			cl_device_id device = devices[p][i];
			char name[2048];
			name[0] = '\0';
			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
			printf("Device %d: %s\n", i, name);

			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
		}

		device_count += devs[p];
	}

	if (device_count == 0)
		error("No device found\n");



	cl_kernel multiplicationKernel[platform_count];

	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

	// allocate host memory for matrices A, B and C
	A_data = (TYPE*)malloc(A_mem_size);
	if (A_data == NULL) {
		perror("malloc");
	}

	B_data = (TYPE*)malloc(B_mem_size);
	if (B_data == NULL) {
		perror("malloc");
	}

	C_data = (TYPE*) malloc(C_mem_size);
	if (C_data == NULL) {
		perror("malloc");
	}

	cl_program program[platform_count];

	for (p=0; p<platform_count; p++) {
		if (devs[p] == 0)
			continue;

		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));

		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));

		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
	}

	printf("Initializing data...\n");
	srand(2008);
	fillArray(A_data, A_size);
	fillArray(B_data, B_size);
	memset(C_data, 0, C_size);


	printf("Computing...\n");
	workOffset[0] = 0;
	gettimeofday(&start, NULL);

	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
	int c = 0;
	for (p=0; p<platform_count;p++) {
		for (i=0; i<devs[p]; i++) {
			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
			c++;
		}
	}

	for(i=0; i < BLOCKS; ++i)
	{
		int d = i % device_count;
		cl_uint platform = 0;

		// determine device platform
		int dev = d;
		for (platform = 0; platform < platform_count; platform++) {
			if ((cl_int)(dev - devs[platform]) < 0)
				break;
			dev -= devs[platform];
		}

		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;

		check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
		check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

		check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));
		check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));

		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};

		check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

		// Non-blocking copy of result from device to host
		cqs[i] = commandQueue[platform][dev];
		check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

		if(i+1 < BLOCKS)
			workOffset[i + 1] = workOffset[i] + workSize[i];
	}


	// CPU sync with GPU
	for (p=0; p<platform_count;p++) {
		cl_uint dev;
		for (dev=0; dev<devs[p]; dev++) {
			clFinish(commandQueue[p][dev]);
		}
	}

	gettimeofday(&end, NULL);
	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

	double dSeconds = timing/1000/1000;
	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
	double gflops = 1.0e-9 * dNumOps/dSeconds;

	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

	// compute reference solution
	if (check) {
		printf("Comparing results with CPU computation... ");
		TYPE* reference = (TYPE*)malloc(C_mem_size);
		computeReference(reference, A_data, B_data, HA, WA, WB);

		// check result
		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
		if (res == 0) {
			printf("\n\n");
			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
		}
		else printf("PASSED\n\n");
		free(reference);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clFinish(cqs[i]);
	}

	for (i=0; i<device_count; i++) {
		clReleaseMemObject(d_B[i]);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clReleaseMemObject(d_A[i]);
		clReleaseMemObject(d_C[i]);
		clReleaseEvent(GPUExecution[i]);
		clReleaseEvent(GPUDone[i]);
	}


	for (p=0; p<platform_count;p++) {
		if (devs[p] == 0)
			continue;

		check(clReleaseKernel(multiplicationKernel[p]));
		check(clReleaseProgram(program[p]));
		check(clReleaseContext(ctx[p]));
		cl_uint k;
		for(k = 0; k < devs[p]; ++k)
		{
			check(clReleaseCommandQueue(commandQueue[p][k]));
		}
	}

	free(A_data);
	free(B_data);
	free(C_data);

	return 0;
}