bool HighpassFilter::filter(cl_command_queue GPUCommandQueue)
{

    int iLocalPixPitch = iBlockDimX + 2;
    GPUError = clSetKernelArg(GPUFilter, 0, sizeof(cl_mem), (void*)&GPUTransfer->cmDevBuf);
	GPUError |= clSetKernelArg(GPUFilter, 1, sizeof(cl_mem), (void*)&cmDevBufMaskV);
	GPUError |= clSetKernelArg(GPUFilter, 2, sizeof(cl_mem), (void*)&cmDevBufMaskH);
    GPUError |= clSetKernelArg(GPUFilter, 3, (iLocalPixPitch * (iBlockDimY + 2) * GPUTransfer->nChannels * sizeof(cl_uchar)), NULL);
	GPUError |= clSetKernelArg(GPUFilter, 4, ( 9 * sizeof(int)), NULL);
	GPUError |= clSetKernelArg(GPUFilter, 5, ( 9 * sizeof(int)), NULL);
    GPUError |= clSetKernelArg(GPUFilter, 6, sizeof(cl_int), (void*)&iLocalPixPitch);
    GPUError |= clSetKernelArg(GPUFilter, 7, sizeof(cl_uint), (void*)&GPUTransfer->ImageWidth);
    GPUError |= clSetKernelArg(GPUFilter, 8, sizeof(cl_uint), (void*)&GPUTransfer->ImageHeight);
	GPUError |= clSetKernelArg(GPUFilter, 9, sizeof(cl_int), (void*)&GPUTransfer->nChannels);
    if(GPUError) return false;

	size_t GPULocalWorkSize[2]; 
    GPULocalWorkSize[0] = iBlockDimX;
    GPULocalWorkSize[1] = iBlockDimY;
    GPUGlobalWorkSize[0] = shrRoundUp((int)GPULocalWorkSize[0], GPUTransfer->ImageWidth); 

    GPUGlobalWorkSize[1] = shrRoundUp((int)GPULocalWorkSize[1], (int)GPUTransfer->ImageHeight);

    if(clEnqueueNDRangeKernel( GPUCommandQueue, GPUFilter, 2, NULL, GPUGlobalWorkSize, GPULocalWorkSize, 0, NULL, NULL)) return false;
	return true;
}
// Kernel function
//*****************************************************************************
int executeKernel(cl_int radius)
{

    // set global and local work item dimensions
    szLocalWorkSize[0] = 16;
    szLocalWorkSize[1] = 16;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], image_width);
    szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], image_height);

    // set the args values
    cl_int tilew =  (cl_int)szLocalWorkSize[0]+(2*radius);
    ciErrNum = clSetKernelArg(ckKernel, 4, sizeof(tilew), &tilew);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(radius), &radius);    
    cl_float threshold = 0.8f;
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(threshold), &threshold);        
    cl_float highlight = 4.0f;
    ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(highlight), &highlight);            
    
    // Local memory
    ciErrNum |= clSetKernelArg(ckKernel, 8, (szLocalWorkSize[0]+(2*16))*(szLocalWorkSize[1]+(2*16))*sizeof(int), NULL);

    // launch computation kernel
#ifdef GPU_PROFILING
    int nIter = 30;
    for( int i=-1; i< nIter; ++i) {
        if( i ==0 )
            shrDeltaT(0);
#endif        
    ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL,
                                      szGlobalWorkSize, szLocalWorkSize, 
                                     0, NULL, NULL);
#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumTexels = (double)image_width * (double)image_height;
    double mtexps = 1.0e-6 * dNumTexels/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclPostprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %.0f Texels, NumDevsUsed = %u, Workgroup = %u\n", 
            mtexps, dSeconds, dNumTexels, uiNumDevsUsed, szLocalWorkSize[0] * szLocalWorkSize[1]);

#endif

    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    return 0;
}
bool Subtract::Process()
{
	size_t GPULocalWorkSize[2];
	GPULocalWorkSize[0] = iBlockDimX;
	GPULocalWorkSize[1] = iBlockDimY;
	GPUGlobalWorkSize[0] = shrRoundUp((int)GPULocalWorkSize[0], (int)imageWidth);
	GPUGlobalWorkSize[1] = shrRoundUp((int)GPULocalWorkSize[1], (int)imageHeight);
	
	int iLocalPixPitch = iBlockDimX + 2;
	GPUError = clSetKernelArg(GPUKernel, 0, sizeof(cl_mem), (void*)&GPU::getInstance().buffersListIn[0]);
	GPUError = clSetKernelArg(GPUKernel, 1, sizeof(cl_mem), (void*)&GPU::getInstance().buffersListIn[1]);
	GPUError = clSetKernelArg(GPUKernel, 2, sizeof(cl_mem), (void*)&GPU::getInstance().buffersListOut[0]);
	GPUError |= clSetKernelArg(GPUKernel, 3, sizeof(cl_uint), (void*)&imageWidth);
	GPUError |= clSetKernelArg(GPUKernel, 4, sizeof(cl_uint), (void*)&imageHeight);
	if(GPUError) return false;

	if(clEnqueueNDRangeKernel( GPUCommandQueue, GPUKernel, 2, NULL, GPUGlobalWorkSize, GPULocalWorkSize, 0, NULL, NULL)) return false;
	return true;
}
Esempio n. 4
0
bool RGB2HSV::filter(cl_command_queue GPUCommandQueue)
{
	
    int iLocalPixPitch = iBlockDimX + 2;
    GPUError = clSetKernelArg(GPUFilter, 0, sizeof(cl_mem), (void*)&GPUTransfer->cmDevBuf);
    GPUError |= clSetKernelArg(GPUFilter, 1, sizeof(cl_uint), (void*)&GPUTransfer->ImageWidth);
    GPUError |= clSetKernelArg(GPUFilter, 2, sizeof(cl_uint), (void*)&GPUTransfer->ImageHeight);
    
	if( GPUError != 0 ) return false;

    size_t GPULocalWorkSize[2];    
    GPULocalWorkSize[0] = iBlockDimX;
    GPULocalWorkSize[1] = iBlockDimY;
    GPUGlobalWorkSize[0] = shrRoundUp((int)GPULocalWorkSize[0], GPUTransfer->ImageWidth); 

    GPUGlobalWorkSize[1] = shrRoundUp((int)GPULocalWorkSize[1], (int)GPUTransfer->ImageHeight);


	
    if( clEnqueueNDRangeKernel( GPUCommandQueue, GPUFilter, 2, NULL, GPUGlobalWorkSize, GPULocalWorkSize, 0, NULL, NULL) ) return false;
    return true;
}
Esempio n. 5
0
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C )
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];

    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Start the computation on each available GPU
    
    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = uiHA / ciDeviceCount;

    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for(unsigned int i=0; i < ciDeviceCount; ++i) 
    {
        // Input buffer
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]);        

        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL,NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA, 
                            0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL);        
        
        // create OpenCL buffer on device that will be initiatlize from the host memory on first use
        // on device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,  workSize[i] * uiWC * sizeof(float), NULL,NULL);
              
        // set the args values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA);
        clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB);

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }
    
    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])};
    
    // Launch kernels on devices
#ifdef GPU_PROFILING
	
	int nIter = 30;

    for (int j = -1; j < nIter; j++) 
    {
        // Sync all queues to host and start timer first time through loop
        if(j == 0){
            for(unsigned int i = 0; i < ciDeviceCount; i++) 
            {
                clFinish(commandQueue[i]);
            }

            shrDeltaT(0);
        }
#endif
        for(unsigned int i = 0; i < ciDeviceCount; i++) 
        {
			// Multiplication - non-blocking execution:  launch and push to device(s)
			globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
			clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize,
				                   0, NULL, &GPUExecution[i]);
            clFlush(commandQueue[i]);
		}

#ifdef GPU_PROFILING
    }
#endif

    // sync all queues to host
	for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
		clFinish(commandQueue[i]);
	}

#ifdef GPU_PROFILING

    // stop and log timer 
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
    double gflops = 1.0e-9 * dNumOps/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n", 
            gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]);

    // Print kernel timing per GPU
    shrLog("\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {    
        shrLog("  Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
    }
    shrLog("\n");
#endif

    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {    
        // Non-blocking copy of result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i], 
                            h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]);
    }

	// CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);


    // Release mem and event objects    
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
	    clReleaseEvent(GPUExecution[i]);
	    clReleaseEvent(GPUDone[i]);
    }
}
//-----------------------------------------------------------------------------
//! Run the CL part of the computation
//-----------------------------------------------------------------------------
void RunKernels()
{
    static float t = 0.0f;

	// ----------------------------------------------------------------
    // populate the 2d texture
    {
		// set global and local work item dimensions
		szLocalWorkSize[0] = 16;
		szLocalWorkSize[1] = 16;
		szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_2d.width);
		szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_2d.height);

		// set the args values
#ifdef USE_STAGING_BUFFER
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clMem), (void *) &(g_texture_2d.clMem));
#else
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture));
#endif
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 1, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture));
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 2, sizeof(g_texture_2d.width), &g_texture_2d.width);
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 3, sizeof(g_texture_2d.height), &g_texture_2d.height);
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 4, sizeof(g_texture_2d.pitch), &g_texture_2d.pitch);
		ciErrNum |= clSetKernelArg(ckKernel_tex2d, 5, sizeof(t), &t);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	    
		// launch computation kernel
		ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_tex2d, 2, NULL,
										  szGlobalWorkSize, szLocalWorkSize, 
										 0, NULL, NULL);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef USE_STAGING_BUFFER
		size_t dst[3] = { 0, 0, 0};
		size_t region[3] = { g_texture_2d.width, g_texture_2d.height, 1};
		ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                   g_texture_2d.clMem		/* src_buffer */,
                   g_texture_2d.clTexture	/* dst_image */, 
                   0						/* src_offset */,
                   dst						/* dst_origin[3] */,
                   region					/* region[3] */, 
                   0						/* num_events_in_wait_list */,
                   NULL						/* event_wait_list */,
                   NULL						/* event */);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
    }
	// ----------------------------------------------------------------
    // populate the volume texture
    {
		// set global and local work item dimensions
		szLocalWorkSize[0] = 16;
		szLocalWorkSize[1] = 16;
		szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_vol.width);
		szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_vol.height);

		// set the args values
		ciErrNum |= clSetKernelArg(ckKernel_texvolume, 0, sizeof(g_texture_vol.clMem), (void *) &(g_texture_vol.clMem));
		ciErrNum |= clSetKernelArg(ckKernel_texvolume, 1, sizeof(g_texture_vol.width), &g_texture_vol.width);
		ciErrNum |= clSetKernelArg(ckKernel_texvolume, 2, sizeof(g_texture_vol.height), &g_texture_vol.height);
		ciErrNum |= clSetKernelArg(ckKernel_texvolume, 3, sizeof(g_texture_vol.depth), &g_texture_vol.depth);
		ciErrNum |= clSetKernelArg(ckKernel_texvolume, 4, sizeof(g_texture_vol.pitch), &g_texture_vol.pitch);
		ciErrNum |= clSetKernelArg(ckKernel_texvolume, 5, sizeof(g_texture_vol.pitchslice), &g_texture_vol.pitchslice);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	    
		// launch computation kernel
		ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texvolume, 2, NULL,
										  szGlobalWorkSize, szLocalWorkSize, 
										 0, NULL, NULL);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

		// ONLY staging buffer works, for volume texture
		// do the copy here
		size_t dst[3] = { 0, 0, 0};
		size_t region[3] = { g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth};
		ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                   g_texture_vol.clMem		/* src_buffer */,
                   g_texture_vol.clTexture	/* dst_image */, 
                   0						/* src_offset */,
                   dst						/* dst_origin[3] */,
                   region					/* region[3] */, 
                   0						/* num_events_in_wait_list */,
                   NULL						/* event_wait_list */,
                   NULL						/* event */);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }

	// ----------------------------------------------------------------
    // populate the faces of the cube map
    for (int face = 0; face < 6; ++face)
    {
		// set global and local work item dimensions
		szLocalWorkSize[0] = 16;
		szLocalWorkSize[1] = 16;
		szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_cube.size);
		szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_cube.size);

		// set the args values
#ifdef USE_STAGING_BUFFER
		ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clMem[face]), (void *) &(g_texture_cube.clMem[face]));
#else
		ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clTexture[face]), (void *) &(g_texture_cube.clTexture[face]));
#endif
		ciErrNum |= clSetKernelArg(ckKernel_texcube, 1, sizeof(g_texture_cube.size), &g_texture_cube.size);
		ciErrNum |= clSetKernelArg(ckKernel_texcube, 2, sizeof(g_texture_cube.pitch), &g_texture_cube.pitch);
		ciErrNum |= clSetKernelArg(ckKernel_texcube, 3, sizeof(int), &face);
		ciErrNum |= clSetKernelArg(ckKernel_texcube, 4, sizeof(t), &t);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	    
		// launch computation kernel
		ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texcube, 2, NULL,
										  szGlobalWorkSize, szLocalWorkSize, 
										 0, NULL, NULL);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef USE_STAGING_BUFFER
		size_t dst[3] = { 0, 0, 0};
		size_t region[3] = { g_texture_cube.size, g_texture_cube.size, 1};
		ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue,
                   g_texture_cube.clMem[face]/* src_buffer */,
                   g_texture_cube.clTexture[face]/* dst_image */, 
                   0						/* src_offset */,
                   dst						/* dst_origin[3] */,
                   region					/* region[3] */, 
                   0						/* num_events_in_wait_list */,
                   NULL						/* event_wait_list */,
                   NULL						/* event */);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
    }
    t += 0.1f;
}
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
    gp_argc = &argc;
    gp_argv = &argv;

    shrQAStart(argc, argv);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    //Get all the devices
    cl_uint uiNumDevices = 0;           // Number of devices available
    cl_uint uiTargetDevice = 0;	        // Default Device to compute on
    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

    // start logs
    cExecutableName = argv[0];
    shrSetLogFileName ("oclDotProduct.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

    // Allocate and initialize host arrays
    shrLog( "Allocate and Init Host Mem...\n");
    srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, 4 * iNumElements);
    shrFillArray((float*)srcB, 4 * iNumElements);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get a GPU device
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevices[uiTargetDevice], NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program with 'mad' Optimization option
#ifdef MAC
    char* flags = "-cl-fast-relaxed-math -DMAC";
#else
    char* flags = "-cl-fast-relaxed-math";
#endif
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
        Cleanup(EXIT_FAILURE);
    }

    // Create the kernel
    shrLog("clCreateKernel (DotProduct)...\n");
    ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum);

    // Set the Argument values
    shrLog("clSetKernelArg 0 - 3...\n\n");
    ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Launch kernel
    shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read back results and check accumulated errors
    shrLog("clEnqueueReadBuffer (Dst)...\n\n");
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Compute and compare results for golden-host and report errors and pass/fail
    shrLog("Comparing against Host/C++ computation...\n\n");
    DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);

    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}
Esempio n. 8
0
// Main function 
// *********************************************************************
int ymain(int argc, char **argv)
{
    shrQAStart(argc, argv);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    
    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("oclVectorAdd2.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); 

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

    // Allocate and initialize host arrays 
    shrLog( "Allocate and Init Host Mem...\n"); 
    srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, iNumElements);
    shrFillArray((float*)srcB, iNumElements);

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

    shrLog("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    shrLog("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    shrLog("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
    shrLog("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    shrLog("clCreateBuffer...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }
    
    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    shrLog("Looking for: %s in Path: %s\n", cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    shrLog("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    #ifdef MAC
        char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
        char* flags = "-cl-fast-relaxed-math";
    #endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    shrLog("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
    shrLog("clCreateKernel (VectorAdd)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Set the Argument values
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
    shrLog("clSetKernelArg 0 - 3...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL);
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Synchronous/blocking read of results, and check accumulated errors
    ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    shrLog("clEnqueueReadBuffer (Dst)...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }
    //--------------------------------------------------------

    // Compute and compare results for golden-host and report errors and pass/fail
    shrLog("Comparing against Host/C++ computation...\n\n"); 
    VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);

    // Cleanup and leave
    Cleanup (argc, argv, (bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE);
}
Esempio n. 9
0
int main(int argc, char** argv)
{

    if (argc < 2) {
        printf("%s nrepeat\n", argv[0]);
        exit(1);
    }

    int nrepeat=atoi(argv[1]);

    // Storage for the arrays.
    static cl_mem output;
    // OpenCL state
    static cl_command_queue queue;
    //static cl_kernel kernel;

    static cl_device_id device_ids[NDEV];
    static cl_context context;

    static cl_platform_id platform_id;

    cl_int nrow=25;
    cl_int ncol=25;
    float cenrow0=12.;
    float cencol0=12.;
    float irr0=2.;
    float irc0=0.;
    float icc0=3.;

    int nelem=nrow*ncol;

    int nwalkers=20;
    int ntot=nrow*ncol*nwalkers;

    cl_uint numPlatforms;
    cl_int err = CL_SUCCESS;

    clock_t t0,t1;
    int nsteps=600;

    int device_type=0;
    if (1) {
        device_type=CL_DEVICE_TYPE_GPU;
    } else {
        device_type=CL_DEVICE_TYPE_CPU;
    }

    //SETUP PLATFORM
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not get platform\n");
        exit(EXIT_FAILURE);
    }
    if(numPlatforms > 0)
    {
        //we have at least one
        //cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        cl_platform_id* platforms = calloc(numPlatforms, sizeof(cl_platform_id));
        err = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not get platform id\n");
            exit(EXIT_FAILURE);
        }

        fprintf(stderr,"Found %d platforms\n", numPlatforms);
        platform_id = platforms[0];
        //delete[] platforms;
        free(platforms);
    }
    else
        exit(0);
    //END PLATFORM

    //SETUP CONTEXT
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform_id,
        0
    };

    context = clCreateContextFromType(
            cps,
            device_type,
            NULL,
            NULL,
            &err);
    //END CONTEXT

    int num_devices=0;
    err = clGetDeviceIDs(platform_id, device_type, NDEV, device_ids, &num_devices);
    fprintf(stderr,"found %d devices\n", num_devices);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not get device ids\n");
        exit(EXIT_FAILURE);
    }
    if (NDEV != num_devices) {
        printf("expected %d devices\n", NDEV);
        exit(1);
    }

    for (int i=0; i<num_devices; i++) {
        size_t len=0;
        cl_uint avail=0;
        cl_uint id=0;
        
        clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_uint), &avail, &len);
        clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &id, &len);
        printf("device #: %d id: %d avail: %d\n", i, id, avail);
    }
    int devnum=0;
    printf("choosing device %d\n", devnum);

    cl_program program = clCreateProgramWithSource(context, 1, &kernel_source , NULL, &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not create program\n");
        exit(EXIT_FAILURE);
    }




    size_t len;
    char buffer[2048];
    clGetProgramBuildInfo(program, device_ids[devnum], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_NAME, sizeof(buffer), buffer, &len);
    cl_ulong memsize;
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &memsize, &len);
    cl_uint nunits;
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &nunits, &len);
    cl_ulong max_work_group_size;
    clGetDeviceInfo(device_ids[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &max_work_group_size, &len);
    //cl_uint warp_size;
    //clGetDeviceInfo(device_ids[devnum], CL_NV_DEVICE_WARP_SIZE, sizeof(cl_uint), &warp_size, &len);
     
    printf("CL_DEVICE_NAME:                    '%s'\n", buffer);
    printf("CL_DEVICE_GLOBAL_MEM_SIZE:          %lu\n", memsize);
    // compute unit is a lump of hardware that executes 'work groups'
    printf("CL_DEVICE_MAX_COMPUTE_UNITS:        %u\n", nunits);
    // max number of items per work group
    printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %lu\n", max_work_group_size);
    //printf("CL_NV_DEVICE_WARP_SIZE:             %u\n", warp_size);

    size_t szLocalWorkSize = nrow;
    //size_t szLocalWorkSize = 512;
    // make sure multiple of 32
    szLocalWorkSize=shrRoundUp((int)256, (int)szLocalWorkSize);
    // rounded up to the nearest multiple of the LocalWorkSize
    size_t szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)ntot);

    printf("nrow: %d\n", nrow);
    printf("ncol %d\n", ncol);
    printf("setting nelem: %d\n", nelem);
    printf("setting ntot: %d\n", ntot);
    printf("setting local work size: %lu\n", szLocalWorkSize);
    printf("setting global work size: %lu\n", szGlobalWorkSize);



    //queue = clCreateCommandQueue(context, device_ids, 0, &err);
    queue = clCreateCommandQueue(context, 
                                 device_ids[devnum],
                                 //CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 
                                 0,
                                 &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not create command queue\n");
        exit(EXIT_FAILURE);
    }



    //OPTIMIZATION OPTIONS FOUND AT http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clBuildProgram.html

    err = clBuildProgram(program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not build program\n");
        exit(EXIT_FAILURE);
    }



    //SETUP KERNEL
    cl_kernel kernel = clCreateKernel(program, "gmix", &err);
    if (err != CL_SUCCESS) {
        fprintf(stderr,"could not create kernel\n");
        exit(EXIT_FAILURE);
    }


    clReleaseProgram(program); // no longer needed


    printf("processing %dx%d image %d walkers %d steps nrepeat %d\n",
           nrow,ncol,nwalkers,nsteps,nrepeat);
    double tstandard=0;
    double topencl=0;



    cl_float *image=NULL;

    srand48(10);
    t0=clock();
    for (int rep=0; rep<nrepeat; rep++) {

        // we can probably instead re-use rows so this
        // is overkill simulating overhead
        image=get_new_image(nrow,ncol);

        cl_float *data_from_gpu = calloc(szGlobalWorkSize, sizeof(cl_float));
        cl_float *rows=calloc(szGlobalWorkSize,sizeof(cl_float));
        cl_float *cols=calloc(szGlobalWorkSize,sizeof(cl_float));

        fill_rows_cols(nwalkers, nrow, ncol, rows, cols);

        err=0;
        cl_mem image_in = clCreateBuffer(context,  
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  sizeof(cl_float)*nrow*ncol, image, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create image buffer\n");
            exit(EXIT_FAILURE);
        }

        cl_mem rows_in = clCreateBuffer(context,  
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  sizeof(cl_float)*szGlobalWorkSize, rows, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create rows buffer\n");
            exit(EXIT_FAILURE);
        }
        cl_mem cols_in = clCreateBuffer(context,  
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  sizeof(cl_float)*szGlobalWorkSize, cols, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create cols buffer\n");
            exit(EXIT_FAILURE);
        }

        output = clCreateBuffer(context,  
                CL_MEM_READ_WRITE,  sizeof(cl_float)*szGlobalWorkSize, NULL, &err);
        if (err != CL_SUCCESS) {
            fprintf(stderr,"could not create buffer\n");
            exit(EXIT_FAILURE);
        }

        err =  clSetKernelArg(kernel, 0, sizeof(cl_int), &ntot);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ncol);
        err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &image_in);
        err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &rows_in);
        err |= clSetKernelArg(kernel, 10, sizeof(cl_mem), &cols_in);
        err |=  clSetKernelArg(kernel, 11, sizeof(cl_mem), &output);


        for (int step=0; step<nsteps; step++) {

            float cenrow = cenrow0 + 0.01*(drand48()-0.5);
            float cencol = cencol0 + 0.01*(drand48()-0.5);
            float irr = irr0+0.01*(drand48()-0.5);
            float irc = irc0+0.01*(drand48()-0.5);
            float icc = icc0+0.01*(drand48()-0.5);
            float det = irr*icc - irc*irc;
            float idet = 1./det;

            // a copy of the kernel is made each time, so we can add new arguments
            err |=  clSetKernelArg(kernel, 2, sizeof(cl_float), (void*)&cenrow);
            err |=  clSetKernelArg(kernel, 3, sizeof(cl_float), (void*)&cencol);
            err |=  clSetKernelArg(kernel, 4, sizeof(cl_float), (void*)&idet);
            err |=  clSetKernelArg(kernel, 5, sizeof(cl_float), (void*)&irr);
            err |=  clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&irc);
            err |=  clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&icc);

            if (err != CL_SUCCESS) {
                fprintf(stderr,"could not set step kernel args\n");
                exit(EXIT_FAILURE);
            }

            err = clEnqueueNDRangeKernel(queue, 
                    kernel, 
                    1, 
                    NULL, 
                    &szGlobalWorkSize, 
                    &szLocalWorkSize, 
                    0, 
                    NULL, 
                    NULL);

            if (err != CL_SUCCESS) {
                fprintf(stderr,"error executing kernel\n");
                exit(EXIT_FAILURE);
            }

        }
        clReleaseMemObject(image_in);
        clReleaseMemObject(rows_in);
        clReleaseMemObject(cols_in);
        clReleaseMemObject(output);

        free(image);
        free(rows);
        free(cols);
        free(data_from_gpu);
    }
    t1=clock();
    topencl = ((double)(t1-t0))/CLOCKS_PER_SEC;


    printf("time for GPU: %lf\n", topencl);
    printf("time per repeat: %lf\n", topencl/nrepeat);

    /*
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    */

    return 0;
}
// Main function 
// *********************************************************************
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);
    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");

    // start logs
	cExecutableName = argv[0];
    shrSetLogFileName ("oclMatVecMul.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // calculate matrix height given GPU memory
    shrLog("Determining Matrix height from available GPU mem...\n");
    memsize_t memsize;
    getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv);
    height = memsize/width/16;
    if (height > MAX_HEIGHT)
        height = MAX_HEIGHT;
    shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height); 

    // Allocate and initialize host arrays
    shrLog("Allocate and Init Host Mem...\n\n");
    unsigned int size = width * height;
    unsigned int mem_size_M = size * sizeof(float);
    M = (float*)malloc(mem_size_M);
    unsigned int mem_size_V = width * sizeof(float);
    V = (float*)malloc(mem_size_V);
    unsigned int mem_size_W = height * sizeof(float);
    W = (float*)malloc(mem_size_W);
    shrFillArray(M, size);
    shrFillArray(V, width);
    Golden = (float*)malloc(mem_size_W);
    MatVecMulHost(M, V, width, height, Golden);

    //Get the NVIDIA platform
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    //Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on targetDevice
    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
    if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE) 
    {
        targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", targetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[targetDevice]);  
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    shrLog("\n # of Compute Units = %u\n\n", num_compute_units); 

    //Create the context
    shrLog("clCreateContext...\n"); 
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n"); 
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M); 
    cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n"); 
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program
    shrLog("clBuildProgram...\n"); 
    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx");
        shrQAFinish(argc, (const char **)argv, QA_FAILED);
        Cleanup(EXIT_FAILURE); 
    }

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (M and V)...\n\n"); 
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Kernels
    const char* kernels[] = {
        "MatVecMulUncoalesced0",
        "MatVecMulUncoalesced1",
        "MatVecMulCoalesced0",
        "MatVecMulCoalesced1",
        "MatVecMulCoalesced2",
        "MatVecMulCoalesced3" };

    for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) {
        shrLog("Running with Kernel %s...\n\n", kernels[k]); 

        // Clear result
        shrLog("  Clear result with clEnqueueWriteBuffer (W)...\n"); 
        memset(W, 0, mem_size_W);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Create the kernel
        shrLog("  clCreateKernel...\n"); 
        if (ckKernel) {
            clReleaseKernel(ckKernel);
            ckKernel = 0;
        }
        ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Set and log Global and Local work size dimensions
        szLocalWorkSize = 256;
        if (k == 0)
            szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height);  // rounded up to the nearest multiple of the LocalWorkSize
        else
            // Some experiments should be done here for determining the best global work size for a given device
            // We will assume here that we can run 2 work-groups per compute unit
            szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize;
        shrLog("  Global Work Size \t\t= %u\n  Local Work Size \t\t= %u\n  # of Work Groups \t\t= %u\n", 
               szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

        // Set the Argument values
        shrLog("  clSetKernelArg...\n\n");
        int n = 0;
        ciErrNum = clSetKernelArg(ckKernel,  n++, sizeof(cl_mem), (void*)&cmM);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW);
        if (k > 1)
            ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0);    
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Launch kernel
        shrLog("  clEnqueueNDRangeKernel (%s)...\n", kernels[k]); 
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Read back results and check accumulated errors
        shrLog("  clEnqueueReadBuffer (W)...\n"); 
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    #ifdef GPU_PROFILING
        // Execution time
        ciErrNum = clWaitForEvents(1, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cl_ulong start, end;
        ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        double dSeconds = 1.0e-9 * (double)(end - start);
        shrLog("  Kernel execution time: %.5f s\n\n", dSeconds);
    #endif

        // Compare results for golden-host and report errors and pass/fail
        shrLog("  Comparing against Host/C++ computation...\n\n"); 
        shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f);
        shrLog("    GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
        bPassFlag &= (res == shrTRUE); 

        // Release event
        ciErrNum = clReleaseEvent(ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ceEvent = 0;
    }

    // Master status Pass/Fail (all tests)
    shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) );

    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}
Esempio n. 11
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
    //shrQAStart(argc, argv);


	// get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    
    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("Barrier.txt");
    printf("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); 

    // set and log Global and Local work size dimensions
    szLocalWorkSize = NUM_THREADS ;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    printf("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

    

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

    printf("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    printf("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    printf("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1);
    printf("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


	

    // Read the OpenCL kernel in from source file
    printf("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    printf("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    #ifdef MAC
        char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
        char* flags = "-cl-fast-relaxed-math";
    #endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    printf("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1);
    printf("clCreateKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }



	 // Allocate and initialize host arrays 
    printf( "Allocate and Init Host Mem...\n"); 
    input = (int *)malloc(sizeof(int) * NUM_BLOCKS);

	for(int i =0; i<=NUM_BLOCKS; i++)
	{
		input[i]=0;

	}

	// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
    array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
	
	if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Set the Argument values
    
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val);
	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in);
	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out);

   // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements);
    printf("clSetKernelArg 0 - 2...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }






    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back



	ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL);
    
    printf("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
    printf("clEnqueueNDRangeKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

   /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL);
    printf("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }*/


	//GPU_PROFILING
    ciErr1=clWaitForEvents(1, &ceEvent);
	if (ciErr1 != CL_SUCCESS)
    {
        printf("Error 1 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
       
        cl_ulong start, end;
     ciErr1 =   clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
      ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	if (ciErr1 != CL_SUCCESS)
    {
        printf("Error 2 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
        double dSeconds = 1.0e-9 * (double)(end - start);
		printf("Done! time taken %llu \n",end - start );
      // printf("Done! Kernel execution time: %.5f s\n\n", dSeconds);


		// Release event
       clReleaseEvent(ceEvent);
       ceEvent = 0;

    Cleanup (argc, argv,  EXIT_SUCCESS);
}
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	//////////////////////////////////////////////////////////////////////////
	unsigned int count = iNumElements;
	int k = 8;
	unsigned int random_seed, random_seed2;
	srand( (unsigned)time( NULL ) );
	random_seed = rand();
	random_seed2 = rand();
	//////////////////////////////////////////////////////////////////////////

	// get command line arg for quick test, if provided
	bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

	// start logs 
	shrSetLogFileName ("oclVectorAdd.txt");
	shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); 

	// set and log Global and Local work size dimensions
	szLocalWorkSize = 256;
	szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
	shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
		szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

	// Allocate and initialize host arrays 
	shrLog( "Allocate and Init Host Mem...\n"); 
	srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
	srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
	dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
	Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
	shrFillArray((float*)srcA, iNumElements);
	shrFillArray((float*)srcB, iNumElements);
	//////////////////////////////////////////////////////////////////////////
	float *scalar_value = new float[count];
	float *gradient_magnitude = new float[count];
	float *second_derivative_magnitude = new float[count];
	unsigned char *label_ptr = new unsigned char[count];
	shrFillArray(scalar_value, count);
	shrFillArray(gradient_magnitude, count);
	shrFillArray(second_derivative_magnitude, count);
	//////////////////////////////////////////////////////////////////////////

	//Get an OpenCL platform
	ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

	shrLog("clGetPlatformID...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	//Get the devices
	ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
	shrLog("clGetDeviceIDs...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	//Create the context
	cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
	shrLog("clCreateContext...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Create a command-queue
	cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
	shrLog("clCreateCommandQueue...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
	cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
	cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	//////////////////////////////////////////////////////////////////////////
	cmDevSrc_scalar_value = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
	cmDevSrc_gradient_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	cmDevSrc_second_derivative_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	cmDevDst_label_ptr = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	//////////////////////////////////////////////////////////////////////////
	shrLog("clCreateBuffer...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Read the OpenCL kernel in from source file
	shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
	cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
	cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
	printf("%s\n%s\n", cSourceFile, cPathAndName);

	// Create the program
	cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
	shrLog("clCreateProgramWithSource...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Build the program with 'mad' Optimization option
#ifdef MAC
	char* flags = "-cl-fast-relaxed-math -DMAC";
#else
	char* flags = "-cl-fast-relaxed-math";
#endif
	ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
	shrLog("clBuildProgram...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Create the kernel
	ckKernel = clCreateKernel(cpProgram, "k_means", &ciErr1);
	shrLog("clCreateKernel (VectorAdd)...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Set the Argument values
	//ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
	//ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
	//ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
	//ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
	//////////////////////////////////////////////////////////////////////////
	// __global const float *scalar_value, __global const float *gradient_magnitude, __global const float *second_derivative_magnitude, __global unsigned char *label_ptr, __global const unsigned int count, __global const int k, __global const unsigned int random_seed, __global const unsigned int random_seed2
	ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc_scalar_value);
	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrc_gradient_magnitude);
	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevSrc_second_derivative_magnitude);
	ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmDevDst_label_ptr);
	ciErr1 |= clSetKernelArg(ckKernel, 4, sizeof(cl_uint), (void*)&count);
	ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_uint), (void*)&k);
	ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_uint), (void*)&random_seed);
	ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_uint), (void*)&random_seed2);
	//////////////////////////////////////////////////////////////////////////
	shrLog("clSetKernelArg 0 - 3...\n\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// --------------------------------------------------------
	// Start Core sequence... copy input data to GPU, compute, copy results back

	// Asynchronous write of data to GPU device
	//ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL);
	//ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
	ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_scalar_value, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, scalar_value, 0, NULL, NULL);
	ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_gradient_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, gradient_magnitude, 0, NULL, NULL);
	ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_second_derivative_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, second_derivative_magnitude, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
	shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Launch kernel
	ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
	shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Synchronous/blocking read of results, and check accumulated errors
	//ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
		ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst_label_ptr, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, label_ptr, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
	shrLog("clEnqueueReadBuffer (Dst)...\n\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}
	//--------------------------------------------------------

	// Compute and compare results for golden-host and report errors and pass/fail
	shrLog("Comparing against Host/C++ computation...\n\n"); 
	VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
	shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
	shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED");

	//////////////////////////////////////////////////////////////////////////
	//float *a = (float *)srcA;
	//float *b = (float *)srcB;
	//float *c = (float *)dst;
	//float *d = (float *)Golden;
	//for (int i=0; i<iNumElements; i++)
	//{
	//	printf("%f+%f=%f=%f\t", a[i], b[i], c[i], a[i]+b[i]);
	//	printf("%s\n", (a[i]+b[i]==c[i]?"equal":"not equal"));
	//}

	//for (int i=0; i<iNumElements; i++)
	//{
	//	printf("%f\n", ((float *)dst)[i]);
	//}
	//////////////////////////////////////////////////////////////////////////

	// Cleanup and leave
	Cleanup (EXIT_SUCCESS);

	//////////////////////////////////////////////////////////////////////////
	delete [] scalar_value;
	delete [] gradient_magnitude;
	delete [] second_derivative_magnitude;
	delete [] label_ptr;
	//////////////////////////////////////////////////////////////////////////
}
// OpenCL computation function for 1 or more GPUs  
// Copies input data from pinned host buf to the device, runs kernel, copies output data back to pinned output host buf
//*****************************************************************************
double SobelFilterGPU(cl_uint* uiInputImage, cl_uint* uiOutputImage)
{
    // If this is a video application, fresh data in pinned host buffer is needed beyond here 
    //      This line could be a sync point assuring that an asynchronous acqusition is complete.
    //      That ascynchronous acquisition would do a map, update and unmap for the pinned input buffer
    //
    //      Otherwise a synchronous acquisition call ('get next frame') could be placed here, but that would be less optimal.

    // For each device: copy fresh input H2D 
    ciErrNum = CL_SUCCESS;
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Nonblocking Write of input image data from host to device
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[i], cmDevBufIn[i], CL_FALSE, 0, szAllocDevBytes[i], 
                                        (void*)&uiInputImage[uiInHostPixOffsets[i]], 0, NULL, NULL);
    }

    // Sync all queues to host and start computation timer on host to get computation elapsed wall clock  time
    // (Only for timing... can be omitted in a production app)
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // For each device: Process
    shrDeltaT(0);
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Determine configuration bytes, offsets and launch config, based on position of device region vertically in image
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset tricks needed
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else if (i == 0)
        {
            // Multiple devices, top boundary tile:  
            // Process whole device allocation, including extra row 
            // No offset, but don't return the last row (dark/garbage row) D2H 
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle tile:  
            // Process whole device allocation, including extra 2 rows 
            // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H 
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else 
        {   
            // Multiple devices, last boundary tile:  
            // Process whole device allocation, including extra row 
            // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H 
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }

        // Pass in dev image height (# of rows worked on) for this device
        ciErrNum |= clSetKernelArg(ckSobel[i], 5, sizeof(cl_uint), (void*)&uiDevImageHeight[i]);

        // Launch Sobel kernel(s) into queue(s) and push to device(s)
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue[i], ckSobel[i], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

        // Push to device(s) so subsequent clFinish in queue 0 doesn't block driver from issuing enqueue command for higher queues
        ciErrNum |= clFlush(cqCommandQueue[i]);
    }

    // Sync all queues to host and get elapsed wall clock time for computation in all queues
    // (Only for timing... can be omitted in a production app)
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    double dKernelTime = shrDeltaT(0); // Time from launch of first compute kernel to end of all compute kernels 
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // For each device: copy fresh output D2H
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Determine configuration bytes and offsets based on position of device region vertically in image
        size_t szReturnBytes;
        cl_uint uiOutDevByteOffset;        
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset tricks needed
            szReturnBytes = szBuffBytes;
            uiOutDevByteOffset = 0;
        } 
        else if (i == 0)
        {
            // Multiple devices, top boundary tile:  
            // Process whole device allocation, including extra row 
            // No offset, but don't return the last row (dark/garbage row) D2H 
            szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint));
            uiOutDevByteOffset = 0;
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle tile:  
            // Process whole device allocation, including extra 2 rows 
            // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H 
            szReturnBytes = szAllocDevBytes[i] - ((uiImageWidth * sizeof(cl_uint)) * 2);
            uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint);
        }        
        else 
        {   
            // Multiple devices, last boundary tile:  
            // Process whole device allocation, including extra row 
            // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H 
            szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint));
            uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint);
        }        
        
        // Non Blocking Read of output image data from device to host 
        ciErrNum |= clEnqueueReadBuffer(cqCommandQueue[i], cmDevBufOut[i], CL_FALSE, uiOutDevByteOffset, szReturnBytes, 
                                       (void*)&uiOutputImage[uiOutHostPixOffsets[i]], 0, NULL, NULL);
    }

    // Finish all queues and check for errors before returning 
    // The block here assures valid output data for subsequent host processing
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    return dKernelTime;
}
// Main program
//*****************************************************************************
int main(int argc, char** argv)
{
	pArgc = &argc;
	pArgv = argv;

	shrQAStart(argc, argv);

    // Start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("oclSobelFilter.txt");
    shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile); 

    // Get command line args for quick test or QA test, if provided
    bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    bQATest   = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest");

    // Menu items
    if (!(bQATest))
    {
        ShowMenuItems();
    }

    // Find the path from the exe to the image file 
    cPathAndName = shrFindFilePath(cImageFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n", cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3);

    // Initialize OpenGL items (if not No-GL QA test)
    shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling "); 
    if (!(bQATest))
    {
        InitGL(&argc, argv);
    }

    //Get the NVIDIA platform if available, otherwise use default
    char cBuffer[1024];
    bool bNV = false;
    shrLog("Get Platform ID... ");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("%s\n\n", cBuffer);
    bNV = (strstr(cBuffer, "NVIDIA") != NULL);

    //Get the devices
    shrLog("Get Device Info...\n");
    cl_uint uiNumAllDevs = 0;
    GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup);

    // Get selected device if specified, otherwise examine avaiable ones and choose by perf
    cl_int iSelectedDevice = 0;
    if((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1)) 
    {
        // Use 1 selected device
        GpuDevMngr->uiUsefulDevCt = 1;  
        iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1));
        GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice;
        GpuDevMngr->fLoadProportions[0] = 1.0f;
        shrLog("  Using 1 Selected Device for Sobel Filter Computation...\n"); 
 
    } 
    else 
    {
        // Use available useful devices and Compute the device load proportions
        ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            iSelectedDevice = GpuDevMngr->uiUsefulDevs[0];
        }
        shrLog("    Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt); 
    }

    //Create the context
    shrLog("\nclCreateContext...\n\n");
    cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate per-device OpenCL objects for useful devices
    cqCommandQueue = new cl_command_queue[GpuDevMngr->uiUsefulDevCt];
    ckSobel = new cl_kernel[GpuDevMngr->uiUsefulDevCt];
    cmDevBufIn = new cl_mem[GpuDevMngr->uiUsefulDevCt];
    cmDevBufOut = new cl_mem[GpuDevMngr->uiUsefulDevCt];
    szAllocDevBytes = new size_t[GpuDevMngr->uiUsefulDevCt];
    uiInHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt];
    uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt];
    uiDevImageHeight = new cl_uint[GpuDevMngr->uiUsefulDevCt];

    // Create command queue(s) for device(s)     
    shrLog("clCreateCommandQueue...\n");
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) 
    {
        cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("  CommandQueue %u, Device %u, Device Load Proportion = %.2f, ", i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]); 
        oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]);  
        shrLog("\n");
    }

    // Allocate pinned input and output host image buffers:  mem copy operations to/from pinned memory is much faster than paged memory
    szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int);
    cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); 

    // Get mapped pointers for writing to pinned input and output host image pointers 
    uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); 

    // Load image data from file to pinned input host buffer
    ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char **)&uiInput, &uiImageWidth, &uiImageHeight);
    oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup);
    shrLog("Load Input Image to Input pinned host buffer...\n"); 

    // Read the kernel in from file
    free(cPathAndName);
    cPathAndName = shrFindFilePath(clSourcefile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
    shrLog("Load OpenCL Prog Source from File...\n"); 

    // Create the program object
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateProgramWithSource...\n"); 

    // Build the program with 'mad' Optimization option
#ifdef MAC
    char *flags = "-cl-fast-relaxed-math -DMAC";
#else
    char *flags = "-cl-fast-relaxed-math";
#endif

    ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // On error: write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx");
        Cleanup(EXIT_FAILURE);
    }
    shrLog("clBuildProgram...\n\n"); 

    // Determine, the size/shape of the image portions for each dev and create the device buffers
    unsigned uiSumHeight = 0;
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Create kernel instance
        ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clCreateKernel (ckSobel), Device %u...\n", i); 

        // Allocations and offsets for the portion of the image worked on by each device
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset 
            uiDevImageHeight[i] = uiImageHeight; 
            uiInHostPixOffsets[i] = 0;
            uiOutHostPixOffsets[i] = 0;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else if (i == 0)
        {
            // Multiple devices, top stripe zone including topmost row of image:  
            // Over-allocate on device by 1 row 
            // Set offset and size to copy extra 1 padding row H2D (below bottom of stripe)
            // Won't return the last row (dark/garbage row) D2H
            uiInHostPixOffsets[i] = 0;
            uiOutHostPixOffsets[i] = 0;
            uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight);     // height is proportional to dev perf 
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 1;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle stripe zone:  
            // Over-allocate on device by 2 rows 
            // Set offset and size to copy extra 2 padding rows H2D (above top and below bottom of stripe)
            // Won't return the first and last rows (dark/garbage rows) D2H
            uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth;
            uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth;
            uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight);     // height is proportional to dev perf 
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 2;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else 
        {
            // Multiple devices, last boundary tile:  
            // Over-allocate on device by 1 row 
            // Set offset and size to copy extra 1 padding row H2D (above top of stripe)
            // Won't return the first row (dark/garbage rows D2H 
            uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth;
            uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth;
            uiDevImageHeight[i] = uiImageHeight - uiSumHeight;                              // "leftover" rows 
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 1;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]); 

        // Create the device buffers in GMEM on each device
        cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i); 

        // Set the common argument values for the Median kernel instance for each device
        int iLocalPixPitch = iBlockDimX + 2;
        ciErrNum = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]);
        ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]);
        ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL);
        ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch);
        ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth);
        ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh);        
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clSetKernelArg (0-4), Device %u...\n\n", i); 
    }

    // Set common global and local work sizes for Median kernel
    szLocalWorkSize[0] = iBlockDimX;
    szLocalWorkSize[1] = iBlockDimY;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth); 

    // init running timers
    shrDeltaT(0);   // timer 0 used for computation timing 
    shrDeltaT(1);   // timer 1 used for fps computation

    // Start main GLUT rendering loop for processing and rendering, 
    // or otherwise run No-GL Q/A test sequence
    if (!(bQATest))
    {
        glutMainLoop();
    }
    else 
    {
        TestNoGL();
    }

    Cleanup(EXIT_SUCCESS);
}
Esempio n. 15
0
int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buff_A, buff_B, buff_C;

    int mult = 1;
    uint32_t uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
    uiWA = WA * mult;
    uiHA = HA * mult;
    uiWB = WB * mult;
    uiHB = HB * mult;
    uiWC = WC * mult;
    uiHC = HC * mult;

    printf("sizes WA %u HA %u WB %u HB %u WC %u HC %u\n",
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); 

    uint32_t size_A = uiWA * uiHA;
    uint32_t size_B = uiWB * uiHB;
    uint32_t size_C = uiWC * uiHC;

    size_t mem_size_A = size_A * sizeof(float); 
    size_t mem_size_B = size_B * sizeof(float); 
    size_t mem_size_C = size_C * sizeof(float); 

    float *data_A = (float *)malloc(mem_size_A);
    float *data_B = (float *)malloc(mem_size_B);
    float *data_C = (float *)malloc(mem_size_C);

    srand(2012);
    shrFillArray(data_A, size_A);
    shrFillArray(data_B, size_B);

    size_t global_work_size[2];
    size_t local_work_size[] = { BLOCK_SIZE, BLOCK_SIZE };

    global_work_size[0] = shrRoundUp(BLOCK_SIZE, uiWC);
    global_work_size[1] = shrRoundUp(BLOCK_SIZE, uiHA);

    const char *source = load_program_source("MatrixMul.cl");
    size_t source_len = strlen(source);;
    cl_uint err = 0;

    char *flags = "-cl-fast-relaxed-math";

    clGetPlatformIDs(1, &platform, NULL);
    printf("platform %p err %d\n", platform, err);

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err);
    printf("device %p err %d\n", device, err);

    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    printf("context %p err %d\n", context, err);

    queue = clCreateCommandQueue(context, device, 0, &err);
    printf("queue %p err %d\n", queue, err);

    program = clCreateProgramWithSource(context, 1, &source, &source_len, &err);
    printf("program %p err %d\n", program, err);

    err = clBuildProgram(program, 0, NULL, flags, NULL, NULL);
    printf("err %d\n", err);

    kernel = clCreateKernel(program, "matrixMul", &err);
    printf("kernel %p err %d\n", kernel, err);

    buff_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
        mem_size_A, data_A, NULL);
    printf("buff_A %p\n", buff_A);

    buff_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
        mem_size_B, data_B, NULL);
    printf("buff_B %p\n", buff_B);

    buff_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, NULL);
    printf("buff_C %p\n", buff_C);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buff_C);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buff_A);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buff_B);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&uiWA);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&uiWB);
    printf("err %d\n", err);

    err = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&uiHA);
    printf("err %d\n", err);

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, 
        local_work_size, 0, NULL, NULL);
    printf("err %d\n", err);

    err = clFlush(queue);
    printf("err %d\n", err);

    err = clFinish(queue);
    printf("err %d\n", err);

    err = clEnqueueReadBuffer(queue, buff_C, CL_TRUE, 0, mem_size_C, data_C, 0,
          NULL, NULL);
    printf("err %d\n", err);

    int i;
    for (i = 0; i < size_C; i++) {
        printf("%d %f\n", i, data_C[i]);
    }

    clReleaseMemObject(buff_A);
    clReleaseMemObject(buff_B);
    clReleaseMemObject(buff_C);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
}
Esempio n. 16
0
void
bluesteinsFFTGpu(const char* const argv[],const unsigned n, 
		 const unsigned orign,const unsigned size)
{
  const unsigned powM = (unsigned) log2(n);
  printf("Compiling Bluesteins Program..\n");

  compileProgram(argv, "fft.h", "kernels/bluesteins.cl");

    printf("Creating Kernel\n");
    for (unsigned i = 0; i < deviceCount; ++i) {
        createKernel(i, "bluesteins");
    }

    const unsigned sizePerGPU = size / deviceCount;
    for (unsigned i = 0; i < deviceCount; ++i) {
        workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU 
                                               : (size - workOffset[i]);       
        
        allocateDeviceMemoryBS(i , workSize[i], workOffset[i]);
        
        clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]);
        clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]);
	clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]);
        clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]);
	clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]);
        clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]);
	clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n);
	clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign);
	clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM);
	clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize);
       

        if ((i + 1) < deviceCount) {
            workOffset[i + 1] = workOffset[i] + workSize[i];
        } 

    }

    size_t localWorkSize[] = {blockSize};
    for (unsigned i = 0; i < deviceCount; ++i) {
        size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; 
        // kernel non blocking execution 
        runKernel(i, localWorkSize, globalWorkSize);
    }

    h_Rreal = h_Hreal;
    h_Rimag = h_Himag;
    
    for (unsigned i = 0; i < deviceCount; ++i) {
        copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i],
                                                workSize[i]); 
        copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i],
                                                 workSize[i]);
    }

    // wait for copy event
    const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone);
    checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents");
    printGpuTime();
}