Ejemplo n.º 1
0
Filter::Filter(char* source, cl_context GPUContext ,GPUTransferManager* transfer,char* KernelName)
{
    GPUTransfer = transfer;

    iBlockDimX = 16;
    iBlockDimY = 16;
    size_t szKernelLength;	
	size_t szKernelLengthFilter;
	size_t szKernelLengthSum;

    // Load OpenCL kernel
	SourceOpenCL = oclLoadProgSource("./OpenCL/GPUCode.cl", "// My comment\n", &szKernelLength);
    SourceOpenCLFilter = oclLoadProgSource(source, "// My comment\n", &szKernelLengthFilter);
	//strncat (SourceOpenCL, SourceOpenCLFilter,szKernelLengthFilter );
	szKernelLengthSum = szKernelLength + szKernelLengthFilter;
	char* sourceCL = new char[szKernelLengthSum];
	strcpy(sourceCL,SourceOpenCL);
	strcat (sourceCL, SourceOpenCLFilter);

    // creates a program object for a context, and loads the source code specified by the text strings in
    //the strings array into the program object. The devices associated with the program object are the
    //devices associated with context.
    GPUProgram = clCreateProgramWithSource( GPUContext , 1, (const char **)&sourceCL, &szKernelLengthSum, &GPUError);
    CheckError(GPUError);

    // Build the program with 'mad' Optimization option
    char *flags = "-cl-mad-enable";

    GPUError = clBuildProgram(GPUProgram, 0, NULL, flags, NULL, NULL);
    CheckError(GPUError);

    GPUFilter = clCreateKernel(GPUProgram, KernelName, &GPUError);
}
extern "C" void initHistogram64(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("...loading Histogram64.cl from file\n");
        char *cHistogram64 = oclLoadProgSource(shrFindFilePath("Histogram64.cl", argv[0]), "// My comment\n", &kernelLength);
        shrCheckError(cHistogram64 != NULL, shrTRUE);

    shrLog("...creating histogram64 program\n");
         cpHistogram64 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cHistogram64, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building histogram64 program\n");
        ciErrNum = clBuildProgram(cpHistogram64, 0, NULL, compileOptions, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...creating histogram64 kernels\n");
        ckHistogram64 = clCreateKernel(cpHistogram64, "histogram64", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);
        ckMergeHistogram64 = clCreateKernel(cpHistogram64, "mergeHistogram64", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...allocating internal histogram64 buffer\n");
        d_PartialHistograms = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint), NULL, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    //Save default command queue
    cqDefaultCommandQue = cqParamCommandQue;

    //Discard temp storage
    free(cHistogram64);

    //Save ptx code to separate file
    oclLogPtx(cpHistogram64, oclGetFirstDev(cxGPUContext), "Histogram64.ptx");
}
Ejemplo n.º 3
0
GPUBase::GPUBase(char* source, char* KernelName)
{
	
	printf("\n ----------- GPUBase START --------------- \n");
	kernelFuncName = KernelName;
	size_t szKernelLength = 0;
	size_t szKernelLengthFilter = 0;
	size_t szKernelLengthSum = 0;
	char* SourceOpenCLShared;
	char* SourceOpenCL;
	iBlockDimX = 16;
	iBlockDimY = 16;

	
	GPUContext = GPU::getInstance().GPUContext;
	GPUCommandQueue = GPU::getInstance().GPUCommandQueue;
	
    	// Load OpenCL kernel
	SourceOpenCLShared = oclLoadProgSource("/home/mati/Dropbox/MGR/DisCODe/DCL_SIFTOpenCL/src/Components/SIFTOpenCL/OpenCL/GPUCode.cl", "// My comment\n", &szKernelLength);

	SourceOpenCL = oclLoadProgSource(source, "// My comment\n", &szKernelLengthFilter);
	
	szKernelLengthSum = szKernelLength + szKernelLengthFilter + 100;
	char* sourceCL = new char[szKernelLengthSum];
	
	
	strcpy(sourceCL,SourceOpenCLShared);
	strcat (sourceCL, SourceOpenCL);
	
	
	GPUProgram = clCreateProgramWithSource( GPUContext , 1, (const char **)&sourceCL, &szKernelLengthSum, &GPUError);
	CheckError(GPUError);

	// Build the program with 'mad' Optimization option
	char *flags = "-cl-unsafe-math-optimizations";

	GPUError = clBuildProgram(GPUProgram, 0, NULL, flags, NULL, NULL);
	CheckError(GPUError);
	
	GPUKernel = clCreateKernel(GPUProgram, kernelFuncName, &GPUError);
	CheckError(GPUError);

	printf("\n ----------- GPUBase END --------------- \n");

}
extern "C" void initConvolutionSeparable(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("Loading ConvolutionSeparable.cl...\n");
        char *cPathAndName = shrFindFilePath("ConvolutionSeparable.cl", argv[0]);
        oclCheckError(cPathAndName != NULL, shrTRUE);
        char *cConvolutionSeparable = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
        oclCheckError(cConvolutionSeparable != NULL, shrTRUE);

    shrLog("Creating convolutionSeparable program...\n");
        cpConvolutionSeparable = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cConvolutionSeparable, &kernelLength, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Building convolutionSeparable program...\n");
        char compileOptions[2048];
        #ifdef _WIN32
            sprintf_s(compileOptions, 2048, "\
                -cl-fast-relaxed-math                                  \
                -D KERNEL_RADIUS=%u\
                -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
                -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
                -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
                -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
                ",
                KERNEL_RADIUS,
                ROWS_BLOCKDIM_X,   COLUMNS_BLOCKDIM_X,
                ROWS_BLOCKDIM_Y,   COLUMNS_BLOCKDIM_Y,
                ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS,
                ROWS_HALO_STEPS,   COLUMNS_HALO_STEPS
            );
        #else
            sprintf(compileOptions, "\
                -cl-fast-relaxed-math                                  \
                -D KERNEL_RADIUS=%u\
                -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
                -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
                -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
                -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
                ",
                KERNEL_RADIUS,
                ROWS_BLOCKDIM_X,   COLUMNS_BLOCKDIM_X,
                ROWS_BLOCKDIM_Y,   COLUMNS_BLOCKDIM_Y,
                ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS,
                ROWS_HALO_STEPS,   COLUMNS_HALO_STEPS
            );
        #endif
        ciErrNum = clBuildProgram(cpConvolutionSeparable, 0, NULL, compileOptions, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ckConvolutionRows = clCreateKernel(cpConvolutionSeparable, "convolutionRows", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ckConvolutionColumns = clCreateKernel(cpConvolutionSeparable, "convolutionColumns", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cConvolutionSeparable);
}
Ejemplo n.º 5
0
void build_cl_radix_sort(cl_context ctx,
		cl_device_id* devices){

	context = ctx;
	num_device= devices[0];

	cl_int status;

	command_que = clCreateCommandQueue(
				context,
				num_device,
				CL_QUEUE_PROFILING_ENABLE,
				&status);
		assert (status == CL_SUCCESS);


	cl_int err;
	size_t szKernelLength;
	char* prog = NULL;
	const char* cSourceFile = "./cl_radix_sort2.cl";//커널파일 이름
	printf("oclLoadProgSource (%s)...\n", cSourceFile);
	prog = oclLoadProgSource(cSourceFile, "", &szKernelLength);



	program = clCreateProgramWithSource(context, 1, (const char **)&prog, NULL, &err);
	if (!program) {
		printf("Error: Failed to create compute program!\n");
	}
	assert(err == CL_SUCCESS);


	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {
		size_t len;
		char buffer[2048];
		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, num_device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		assert( err == CL_SUCCESS);
	}


	ckHistogram = clCreateKernel(program, "histogram", &err);
	assert(err == CL_SUCCESS);
	ckScanHistogram = clCreateKernel(program, "scanhistograms", &err);
	assert(err == CL_SUCCESS);
	ckPasteHistogram = clCreateKernel(program, "pastehistograms", &err);
	assert(err == CL_SUCCESS);
	ckReorder = clCreateKernel(program, "reorder", &err);
	assert(err == CL_SUCCESS);
	ckTranspose = clCreateKernel(program, "transpose", &err);
	assert(err == CL_SUCCESS);

	printf("Create Kernel finished !!\n");

}
extern "C" void initScan(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv) {
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog(" ...loading Scan.cl\n");
    char *cScan = oclLoadProgSource(shrFindFilePath("Scan.cl", argv[0]), "// My comment\n", &kernelLength);
    oclCheckError(cScan != NULL, shrTRUE);

    shrLog(" ...creating scan program\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cScan, &kernelLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog(" ...building scan program\n");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, compileOptions, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclScan.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    shrLog(" ...creating scan kernels\n");
    ckScanExclusiveLocal1 = clCreateKernel(cpProgram, "scanExclusiveLocal1", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckScanExclusiveLocal2 = clCreateKernel(cpProgram, "scanExclusiveLocal2", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckUniformUpdate = clCreateKernel(cpProgram, "uniformUpdate", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog( " ...checking minimum supported workgroup size\n");
    //Check for work group size
    cl_device_id device;
    size_t szScanExclusiveLocal1, szScanExclusiveLocal2, szUniformUpdate;

    ciErrNum  = clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal1,  device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal1, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal2, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal2, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckUniformUpdate, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szUniformUpdate, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    if( (szScanExclusiveLocal1 < WORKGROUP_SIZE) || (szScanExclusiveLocal2 < WORKGROUP_SIZE) || (szUniformUpdate < WORKGROUP_SIZE) ) {
        shrLog("ERROR: Minimum work-group size %u required by this application is not supported on this device.\n", WORKGROUP_SIZE);
        exit(0);
    }

    shrLog(" ...allocating internal buffers\n");
    d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (MAX_BATCH_ELEMENTS / (4 * WORKGROUP_SIZE)) * sizeof(uint), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Discard temp storage
    free(cScan);
}
Ejemplo n.º 7
0
	int COpenCL::LoadKernel3(const char* cSourceFile)
	{
		cSourceCL = NULL;

		// Read the OpenCL kernel in from source file
		std::string depth = std::string("#define STACK_SIZE  ") + std::to_string(Nastavenia->OCTREE_Depth - 2);
		cSourceCL = oclLoadProgSource(cSourceFile, depth.c_str(), &szKernelLength);

		if(cSourceCL != NULL)
			return EXIT_SUCCESS;

		return EXIT_FAIL_LOAD;
	}
Ejemplo n.º 8
0
//-----------------------------------------------------------------------------
// Name: CreateKernelProgram()
// Desc: Creates OpenCL program and kernel instances
//-----------------------------------------------------------------------------
HRESULT CreateKernelProgram(
	const char *exepath, const char *clName, const char *clPtx, const char *kernelEntryPoint,
	cl_program			&cpProgram,
	cl_kernel			&ckKernel )
{
    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath(clName, exepath);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    oclCheckErrorEX(source != NULL, shrTRUE, pCleanup);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    free(source);

    // build the program
#ifdef USE_STAGING_BUFFER
	static char *opts = "-cl-fast-relaxed-math -DUSE_STAGING_BUFFER";
#else
	static char *opts = "-cl-fast-relaxed-math";
#endif
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, opts, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), clPtx);
        Cleanup(EXIT_FAILURE); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, kernelEntryPoint, &ciErrNum);
    if (!ckKernel)
    {
        Cleanup(EXIT_FAILURE); 
    }

    // set the args values
	return ciErrNum ? E_FAIL : S_OK;
}
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog(LOGBOTH, 0, "...loading BlackScholes.cl\n");
        char *cBlackScholes = oclLoadProgSource(shrFindFilePath("BlackScholes.cl", argv[0]), "// My comment\n", &kernelLength);
        shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog(LOGBOTH, 0, "...creating BlackScholes program\n");
        cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "...building BlackScholes program\n");
        ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, NULL, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "...creating BlackScholes kernels\n");
        ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
}
Ejemplo n.º 10
0
bool fdtdGPU(float *output, const float *input, const float *coeff, const int dimx, const int dimy, const int dimz, const int radius, const int timesteps, const int argc, const char **argv)
{
    bool ok = true;
    const int         outerDimx  = dimx + 2 * radius;
    const int         outerDimy  = dimy + 2 * radius;
    const int         outerDimz  = dimz + 2 * radius;
    const size_t      volumeSize = outerDimx * outerDimy * outerDimz;
    cl_context        context      = 0;
    cl_platform_id    platform     = 0;
    cl_device_id     *devices      = 0;
    cl_command_queue  commandQueue = 0;
    cl_mem            bufferOut    = 0;
    cl_mem            bufferIn     = 0;
    cl_mem            bufferCoeff  = 0;
    cl_program        program      = 0;
    cl_kernel         kernel       = 0;
    cl_event         *kernelEvents = 0;
#ifdef GPU_PROFILING
    cl_ulong          kernelEventStart;
    cl_ulong          kernelEventEnd;
#endif
    double            hostElapsedTimeS;
    char             *cPathAndName = 0;
    char             *cSourceCL = 0;
    size_t            szKernelLength;
    size_t            globalWorkSize[2];
    size_t            localWorkSize[2];
    cl_uint           deviceCount  = 0;
    cl_uint           targetDevice = 0;
    cl_int            errnum       = 0;
    char              buildOptions[128];

    // Ensure that the inner data starts on a 128B boundary
    const int padding = (128 / sizeof(float)) - radius;
    const size_t paddedVolumeSize = volumeSize + padding;

#ifdef GPU_PROFILING
    const int profileTimesteps = timesteps - 1;
    if (ok)
    {
        if (profileTimesteps < 1)
        {
            shrLog(" cannot profile with fewer than two timesteps (timesteps=%d), profiling is disabled.\n", timesteps);
        }
    }
#endif

    // Get the NVIDIA platform
    if (ok)
    {
        shrLog(" oclGetPlatformID...\n");
        errnum = oclGetPlatformID(&platform);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("oclGetPlatformID (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Get the list of GPU devices associated with the platform
    if (ok)
    {
        shrLog(" clGetDeviceIDs");
        errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount);
        devices = (cl_device_id *)malloc(deviceCount * sizeof(cl_device_id) );
        errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, deviceCount, devices, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clGetDeviceIDs (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Create the OpenCL context
    if (ok)
    {
        shrLog(" clCreateContext...\n");
        context = clCreateContext(0, deviceCount, devices, NULL, NULL, &errnum);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateContext (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Select target device (device 0 by default)
    if (ok)
    {
        char *device = 0;
        if (shrGetCmdLineArgumentstr(argc, argv, "device", &device))
        {
            targetDevice = (cl_uint)atoi(device);
            if (targetDevice >= deviceCount)
            {
                shrLogEx(LOGBOTH | ERRORMSG, -2001, STDERROR);
                shrLog("invalid target device specified on command line (device %d does not exist).\n", targetDevice);
                ok = false;
            }
        }
        else
        {
            targetDevice = 0;
        }
        if (device)
        {
            free(device);
        }
    }

    // Create a command-queue
    if (ok)
    {
        shrLog(" clCreateCommandQueue\n"); 
        commandQueue = clCreateCommandQueue(context, devices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &errnum);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateCommandQueue (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Create memory buffer objects
    if (ok)
    {
        shrLog(" clCreateBuffer bufferOut\n"); 
        bufferOut = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateBuffer (returned %d).\n", errnum);
            ok = false;
        }
    }
    if (ok)
    {
        shrLog(" clCreateBuffer bufferIn\n"); 
        bufferIn = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateBuffer (returned %d).\n", errnum);
            ok = false;
        }
    }
    if (ok)
    {
        shrLog(" clCreateBuffer bufferCoeff\n"); 
        bufferCoeff = clCreateBuffer(context, CL_MEM_READ_ONLY, (radius + 1) * sizeof(float), NULL, &errnum);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateBuffer (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Load the kernel from file
    if (ok)
    {
        shrLog(" shrFindFilePath\n"); 
        cPathAndName = shrFindFilePath(clSourceFile, argv[0]);
        if (cPathAndName == NULL)
        {
            shrLogEx(LOGBOTH | ERRORMSG, -2002, STDERROR);
            shrLog("shrFindFilePath returned null.\n");
            ok = false;
        }
    }
    if (ok)
    {
        shrLog(" oclLoadProgSource\n"); 
        cSourceCL = oclLoadProgSource(cPathAndName, "// Preamble\n", &szKernelLength);
        if (cSourceCL == NULL)
        {
            shrLogEx(LOGBOTH | ERRORMSG, -2003, STDERROR);
            shrLog("oclLoadProgSource returned null.\n");
            ok = false;
        }
    }

    // Create the program
    if (ok)
    {
        shrLog(" clCreateProgramWithSource\n");
        program = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, &szKernelLength, &errnum);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateProgramWithSource (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Check for a command-line specified work group size
    size_t userWorkSize;
    int    localWorkMaxY;
	if (ok)
    {
        int userWorkSizeInt;
        if (shrGetCmdLineArgumenti(argc, argv, "work-group-size", &userWorkSizeInt))
        {
            // We can't clamp to CL_KERNEL_WORK_GROUP_SIZE yet since that is
            // dependent on the build.
            if (userWorkSizeInt < k_localWorkMin || userWorkSizeInt > k_localWorkMax)
            {
                shrLogEx(LOGBOTH | ERRORMSG, -2004, STDERROR);
                shrLog("invalid work group size specified on command line (must be between %d and %d).\n", k_localWorkMin, k_localWorkMax);
                ok = false;
            }
            // Constrain to a multiple of k_localWorkX
            userWorkSize = (userWorkSizeInt / k_localWorkX * k_localWorkX);
        }
        else
        {
            userWorkSize = k_localWorkY * k_localWorkX;
        }
        
        // Divide by k_localWorkX (integer division to clamp)
        localWorkMaxY = userWorkSize / k_localWorkX;
    }

    // Build the program
    if (ok)
    {
#ifdef WIN32
        if (sprintf_s(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0)
        {
            shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR);
            shrLog("sprintf_s (failed).\n");
            ok = false;
        }
#else
        if (snprintf(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0)
        {
            shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR);
            shrLog("snprintf (failed).\n");
            ok = false;
        }
#endif
    }
    if (ok)
    {
        shrLog(" clBuildProgram (%s)\n", buildOptions);
        errnum = clBuildProgram(program, 0, NULL, buildOptions, NULL, NULL);
        if (errnum != CL_SUCCESS)
        {
            char buildLog[10240];
            clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clBuildProgram (returned %d).\n", errnum);
            shrLog("Log:\n%s\n", buildLog);
            ok = false;
        }
    }

    // Create the kernel
    if (ok)
    {
        shrLog(" clCreateKernel\n");
        kernel = clCreateKernel(program, "FiniteDifferences", &errnum);
        if (kernel == (cl_kernel)NULL || errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clCreateKernel (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Get the maximum work group size
    size_t maxWorkSize;
    if (ok)
    {
        shrLog(" clGetKernelWorkGroupInfo\n");
        errnum = clGetKernelWorkGroupInfo(kernel, devices[targetDevice], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkSize, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clGetKernelWorkGroupInfo (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Set the work group size
    if (ok)
    {
		userWorkSize = CLAMP(userWorkSize, k_localWorkMin, maxWorkSize);
        localWorkSize[0] = k_localWorkX;
        localWorkSize[1] = userWorkSize / k_localWorkX;
        globalWorkSize[0] = localWorkSize[0] * (unsigned int)ceil((float)dimx / localWorkSize[0]);
        globalWorkSize[1] = localWorkSize[1] * (unsigned int)ceil((float)dimy / localWorkSize[1]);
        shrLog(" set local work group size to %dx%d\n", localWorkSize[0], localWorkSize[1]);
        shrLog(" set total work size to %dx%d\n", globalWorkSize[0], globalWorkSize[1]);
    }

    // Copy the input to the device input buffer
    if (ok)
    {
        shrLog(" clEnqueueWriteBuffer bufferIn\n");
        errnum = clEnqueueWriteBuffer(commandQueue, bufferIn, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clEnqueueWriteBuffer bufferIn (returned %d).\n", errnum);
            ok = false;
        }
    }
    // Copy the input to the device output buffer (actually only need the halo)
    if (ok)
    {
        shrLog(" clEnqueueWriteBuffer bufferOut\n");
        errnum = clEnqueueWriteBuffer(commandQueue, bufferOut, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clEnqueueWriteBuffer bufferOut (returned %d).\n", errnum);
            ok = false;
        }
    }
    // Copy the coefficients to the device coefficient buffer
    if (ok)
    {
        shrLog(" clEnqueueWriteBuffer bufferCoeff\n");
        errnum = clEnqueueWriteBuffer(commandQueue, bufferCoeff, CL_TRUE, 0, (radius + 1) * sizeof(float), coeff, 0, NULL, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clEnqueueWriteBuffer bufferCoeff (returned %d).\n", errnum);
            ok = false;
        }
    }

    // Allocate the events
    if (ok)
    {
        shrLog(" calloc events\n");
        if ((kernelEvents = (cl_event *)calloc(timesteps, sizeof(cl_event))) == NULL)
        {
            shrLogEx(LOGBOTH | ERRORMSG, -2006, STDERROR);
            shrLog("Insufficient memory for events calloc, please try a smaller volume (use --help for syntax).\n");
            ok = false;        
        }
    }

    // Start the clock
    shrDeltaT(0);

    // Set the constant arguments
    if (ok)
    {
        shrLog(" clSetKernelArg 2-6\n");
        errnum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&bufferCoeff);
        errnum |= clSetKernelArg(kernel, 3, sizeof(int), &dimx);
        errnum |= clSetKernelArg(kernel, 4, sizeof(int), &dimy);
        errnum |= clSetKernelArg(kernel, 5, sizeof(int), &dimz);
        errnum |= clSetKernelArg(kernel, 6, sizeof(int), &padding);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clSetKernelArg 2-6 (returned %d).\n", errnum);
            ok = false;     
        }
    }

    // Execute the FDTD
    cl_mem bufferSrc = bufferIn;
    cl_mem bufferDst = bufferOut;
    if (ok)
    {
        shrLog(" GPU FDTD loop\n");
    }
    for (int it = 0 ; ok && it < timesteps ; it++)
    {
        shrLog("\tt = %d ", it);

        // Set the dynamic arguments
        if (ok)
        {
            shrLog(" clSetKernelArg 0-1,");
            errnum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&bufferDst);
            errnum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bufferSrc);
            if (errnum != CL_SUCCESS)
            {
                shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
                shrLog("clSetKernelArg 0-1 (returned %d).\n", errnum);
                ok = false;               
            }
        }

        // Launch the kernel
        if (ok)
        {
            shrLog(" clEnqueueNDRangeKernel\n");
            errnum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernelEvents[it]);
            if (errnum != CL_SUCCESS)
            {
                shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
                shrLog("clEnqueueNDRangeKernel (returned %d).\n", errnum);
                ok = false;   
            }
        }
        // Toggle the buffers
        cl_mem tmp = bufferSrc;
        bufferSrc = bufferDst;
        bufferDst = tmp;
    }
    if (ok)
        shrLog("\n");

    // Wait for the kernel to complete
    if (ok)
    {
        shrLog(" clWaitForEvents\n");
        errnum = clWaitForEvents(1, &kernelEvents[timesteps-1]);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clWaitForEvents (returned %d).\n", errnum);
            ok = false;  
        }
    }

    // Stop the clock
    hostElapsedTimeS = shrDeltaT(0);

    // Read the result back, result is in bufferSrc (after final toggle)
    if (ok)
    {
        shrLog(" clEnqueueReadBuffer\n");
        errnum = clEnqueueReadBuffer(commandQueue, bufferSrc, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), output, 0, NULL, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clEnqueueReadBuffer bufferSrc (returned %d).\n", errnum);
            ok = false;  
        }
    }

    // Report time
#ifdef GPU_PROFILING
    double elapsedTime = 0.0;
    if (ok && profileTimesteps > 0)
        shrLog(" Collect profile information\n");
    for (int it = 1 ; ok && it <= profileTimesteps ; it++)
    {
        shrLog("\tt = %d ", it);
        shrLog(" clGetEventProfilingInfo,", it);
        errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelEventStart, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clGetEventProfilingInfo (returned %d).\n", errnum);
            ok = false;  
        }
        shrLog(" clGetEventProfilingInfo\n", it);
        errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEventEnd, NULL);
        if (errnum != CL_SUCCESS)
        {
            shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR);
            shrLog("clGetEventProfilingInfo (returned %d).\n", errnum);
            ok = false;  
        }
        elapsedTime += (double)kernelEventEnd - (double)kernelEventStart;
    }
    if (ok && profileTimesteps > 0)
    {
        shrLog("\n");
        // Convert nanoseconds to seconds
        elapsedTime *= 1.0e-9;
        double avgElapsedTime = elapsedTime / (double)profileTimesteps;
        // Determine number of computations per timestep
        size_t pointsComputed = dimx * dimy * dimz;
        // Determine throughput
        double throughputM    = 1.0e-6 * (double)pointsComputed / avgElapsedTime;
        shrLogEx(LOGBOTH | MASTER, 0, "oclFDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, NumDevsUsed = %i, Workgroup = %u\n", 
            throughputM, avgElapsedTime, pointsComputed, 1, localWorkSize[0] * localWorkSize[1]); 
    }
#endif
    
    // Cleanup
    if (kernelEvents)
    {
        for (int it = 0 ; it < timesteps ; it++)
        {
            if (kernelEvents[it])
                clReleaseEvent(kernelEvents[it]);
        }
        free(kernelEvents);
    }
    if (kernel)
        clReleaseKernel(kernel);
    if (program)
        clReleaseProgram(program);
    if (cSourceCL)
        free(cSourceCL);
    if (cPathAndName)
        free(cPathAndName);
    if (bufferCoeff)
        clReleaseMemObject(bufferCoeff);
    if (bufferIn)
        clReleaseMemObject(bufferIn);
    if (bufferOut)
        clReleaseMemObject(bufferOut);
    if (commandQueue)
        clReleaseCommandQueue(commandQueue);
    if (devices)
        free(devices);
    if (context)
        clReleaseContext(context);
    return ok;
}
Ejemplo n.º 11
0
void setup_cl(BS_test_t *t)
{
	cl_int errcode_ret;

	// Get OpenCL platform count
	cl_uint NumPlatforms;
	clGetPlatformIDs (0, NULL, &NumPlatforms);

	// Get all OpenCL platform IDs
	cl_platform_id* PlatformIDs;
	PlatformIDs = new cl_platform_id[NumPlatforms];
	clGetPlatformIDs (NumPlatforms, PlatformIDs, NULL);

	// find NVIDIA & AMD platforms
	char cBuffer[1024];
	cl_int NvPlatform = -1;
	cl_int AMDPlatform = -1;
	for(cl_uint i = 0; i < NumPlatforms; ++i)
	{
		clGetPlatformInfo (PlatformIDs[i], CL_PLATFORM_NAME, 1024, cBuffer, NULL);
		printf("%s\n", cBuffer);
		if(strstr(cBuffer, "NVIDIA") != NULL) {
			NvPlatform = i;
		}
		else if (strstr(cBuffer, "AMD") != NULL) {
			AMDPlatform = i;
		}
	}

	// check for AMD and NVIDIA GPU devices
	cl_device_id cdDevice;
	cl_uint NvNumDevices = 0;
	cl_uint AMDNumDevices = 0;
	if (AMDPlatform != -1)
		clGetDeviceIDs(PlatformIDs[AMDPlatform], CL_DEVICE_TYPE_GPU, 0, NULL, &AMDNumDevices);
	if (NvPlatform != -1)
		clGetDeviceIDs(PlatformIDs[NvPlatform], CL_DEVICE_TYPE_GPU, 0, NULL, &NvNumDevices);

	// if there is an AMD GPU, take it, or take an NVIDIA GPU if it is there
	if (AMDNumDevices > 0)
		clGetDeviceIDs(PlatformIDs[AMDPlatform], CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
	else if (NvNumDevices > 0)
		clGetDeviceIDs(PlatformIDs[NvPlatform], CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
	else {
		fprintf(stderr, "could not find any GPU devices, exiting\n");
		delete [] PlatformIDs;
		exit(-1);
	}

	delete [] PlatformIDs;

	// get max work group size, just in case
	clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &(t->maxBlockSize), NULL);
	
	//Create a context
	printf("Creating context for %s GPU...\n", (AMDNumDevices > 0) ? "AMD" : "NVIDIA");
	t->hContext = clCreateContext(NULL, 1, &cdDevice, NULL, NULL, &errcode_ret);
	ERR_CHECK(errcode_ret, "clCreateContext");

	size_t nContextDescriptorSize; 
	clGetContextInfo(t->hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize); 
	cl_device_id * aDevices = (cl_device_id *) malloc(nContextDescriptorSize); 
	clGetContextInfo(t->hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);

	// create a command queue for first
	// device the context reported  
	t->hCmdQueue = clCreateCommandQueue(t->hContext, aDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode_ret);
	ERR_CHECK(errcode_ret, "clCreateCommandQueue");
	free(aDevices);
	
	// create & compile Black-Scholes program
	cl_program hBSProg; 
	char * BSCode;
	size_t BSLen;
	
	printf("Compiling Black-Scholes program...\n");
	BSCode = oclLoadProgSource("bs_kernel.cl", "", &BSLen);
	hBSProg = clCreateProgramWithSource(t->hContext,1, (const char **)&BSCode, &BSLen, &errcode_ret); 
	ERR_CHECK(errcode_ret, "clCreateProgramWithSource BS");
	errcode_ret = clBuildProgram(hBSProg, 0, NULL, NULL, NULL, NULL);
	ERR_CHECK(errcode_ret, "clBuildProgram BS");
	free(BSCode);

#ifdef PTX_OUTPUT
	size_t progSize;
	errcode_ret = clGetProgramInfo(hBSProg, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &progSize, NULL);
	ERR_CHECK(errcode_ret, "clGetProgramInfo");
	char ** prog = (char **) malloc (sizeof(char **));
	prog[0] = (char *) malloc(progSize * sizeof(char *));
	errcode_ret = clGetProgramInfo(hBSProg, CL_PROGRAM_BINARIES, sizeof(char **) * progSize, prog, NULL);
	ERR_CHECK(errcode_ret, "clGetProgramInfo");
	FILE * f = fopen("bs_cl.ptx", "w");
	fprintf(f, "%s\n", prog[0]);
	fclose(f);
#endif

	// create BS kernel 
	printf("Creating Black-Scholes kernel...\n");
	t->hBSKernel = clCreateKernel(hBSProg, "BlackScholes", &errcode_ret);
	ERR_CHECK(errcode_ret, "clCreateKernel BlackScholes");
} 
Ejemplo n.º 12
0
// Main function 
// *********************************************************************
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);
    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");

    // start logs
	cExecutableName = argv[0];
    shrSetLogFileName ("oclMatVecMul.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // calculate matrix height given GPU memory
    shrLog("Determining Matrix height from available GPU mem...\n");
    memsize_t memsize;
    getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv);
    height = memsize/width/16;
    if (height > MAX_HEIGHT)
        height = MAX_HEIGHT;
    shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height); 

    // Allocate and initialize host arrays
    shrLog("Allocate and Init Host Mem...\n\n");
    unsigned int size = width * height;
    unsigned int mem_size_M = size * sizeof(float);
    M = (float*)malloc(mem_size_M);
    unsigned int mem_size_V = width * sizeof(float);
    V = (float*)malloc(mem_size_V);
    unsigned int mem_size_W = height * sizeof(float);
    W = (float*)malloc(mem_size_W);
    shrFillArray(M, size);
    shrFillArray(V, width);
    Golden = (float*)malloc(mem_size_W);
    MatVecMulHost(M, V, width, height, Golden);

    //Get the NVIDIA platform
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    //Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on targetDevice
    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
    if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE) 
    {
        targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", targetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[targetDevice]);  
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    shrLog("\n # of Compute Units = %u\n\n", num_compute_units); 

    //Create the context
    shrLog("clCreateContext...\n"); 
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n"); 
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M); 
    cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n"); 
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program
    shrLog("clBuildProgram...\n"); 
    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx");
        shrQAFinish(argc, (const char **)argv, QA_FAILED);
        Cleanup(EXIT_FAILURE); 
    }

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (M and V)...\n\n"); 
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Kernels
    const char* kernels[] = {
        "MatVecMulUncoalesced0",
        "MatVecMulUncoalesced1",
        "MatVecMulCoalesced0",
        "MatVecMulCoalesced1",
        "MatVecMulCoalesced2",
        "MatVecMulCoalesced3" };

    for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) {
        shrLog("Running with Kernel %s...\n\n", kernels[k]); 

        // Clear result
        shrLog("  Clear result with clEnqueueWriteBuffer (W)...\n"); 
        memset(W, 0, mem_size_W);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Create the kernel
        shrLog("  clCreateKernel...\n"); 
        if (ckKernel) {
            clReleaseKernel(ckKernel);
            ckKernel = 0;
        }
        ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Set and log Global and Local work size dimensions
        szLocalWorkSize = 256;
        if (k == 0)
            szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height);  // rounded up to the nearest multiple of the LocalWorkSize
        else
            // Some experiments should be done here for determining the best global work size for a given device
            // We will assume here that we can run 2 work-groups per compute unit
            szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize;
        shrLog("  Global Work Size \t\t= %u\n  Local Work Size \t\t= %u\n  # of Work Groups \t\t= %u\n", 
               szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

        // Set the Argument values
        shrLog("  clSetKernelArg...\n\n");
        int n = 0;
        ciErrNum = clSetKernelArg(ckKernel,  n++, sizeof(cl_mem), (void*)&cmM);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW);
        if (k > 1)
            ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0);    
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Launch kernel
        shrLog("  clEnqueueNDRangeKernel (%s)...\n", kernels[k]); 
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Read back results and check accumulated errors
        shrLog("  clEnqueueReadBuffer (W)...\n"); 
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    #ifdef GPU_PROFILING
        // Execution time
        ciErrNum = clWaitForEvents(1, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cl_ulong start, end;
        ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        double dSeconds = 1.0e-9 * (double)(end - start);
        shrLog("  Kernel execution time: %.5f s\n\n", dSeconds);
    #endif

        // Compare results for golden-host and report errors and pass/fail
        shrLog("  Comparing against Host/C++ computation...\n\n"); 
        shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f);
        shrLog("    GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
        bPassFlag &= (res == shrTRUE); 

        // Release event
        ciErrNum = clReleaseEvent(ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ceEvent = 0;
    }

    // Master status Pass/Fail (all tests)
    shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) );

    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}
Ejemplo n.º 13
0
RadixSort::RadixSort(cl_context GPUContext,
					 cl_command_queue CommandQue,
					 unsigned int maxElements, 
					 const char* path, 
					 const int ctaSize,
					 bool keysOnly = true) :
					 mNumElements(0),
					 mTempValues(0),
					 mCounters(0),
					 mCountersSum(0),
					 mBlockOffsets(0),
					 cxGPUContext(GPUContext),
					 cqCommandQueue(CommandQue),
					 CTA_SIZE(ctaSize),
					 scan(GPUContext, CommandQue, maxElements/2/CTA_SIZE*16, path)
{

	unsigned int numBlocks = ((maxElements % (CTA_SIZE * 4)) == 0) ? 
            (maxElements / (CTA_SIZE * 4)) : (maxElements / (CTA_SIZE * 4) + 1);
    unsigned int numBlocks2 = ((maxElements % (CTA_SIZE * 2)) == 0) ?
            (maxElements / (CTA_SIZE * 2)) : (maxElements / (CTA_SIZE * 2) + 1);

	cl_int ciErrNum;
	d_tempKeys = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(unsigned int) * maxElements, NULL, &ciErrNum);
	mCounters = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, WARP_SIZE * numBlocks * sizeof(unsigned int), NULL, &ciErrNum);
	mCountersSum = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, WARP_SIZE * numBlocks * sizeof(unsigned int), NULL, &ciErrNum);
	mBlockOffsets = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, WARP_SIZE * numBlocks * sizeof(unsigned int), NULL, &ciErrNum); 

	size_t szKernelLength; // Byte size of kernel code
    char *cSourcePath = "./RadixSort.cl";//shrFindFilePath("RadixSort.cl", path);
    //printf("%s\n",cSourcePath);
    //shrCheckError(cSourcePath != NULL, shrTRUE);
    char *cRadixSort = oclLoadProgSource(cSourcePath, "// My comment\n", &szKernelLength);
    ////oclCheckError(cRadixSort != NULL, shrTRUE);
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cRadixSort, &szKernelLength, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);
#ifdef MAC
    char *flags = "-DMAC -cl-fast-relaxed-math";
#else
    char *flags = "-cl-fast-relaxed-math";
#endif
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
    //if (ciErrNum != CL_SUCCESS)
    //{
    //    // write out standard ciErrNumor, Build Log and PTX, then cleanup and exit
    //    //printf(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
    //    oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
    //    oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "RadixSort.ptx");
    //    oclCheckError(ciErrNum, CL_SUCCESS); 
    //}

	ckRadixSortBlocksKeysOnly = clCreateKernel(cpProgram, "radixSortBlocksKeysOnly", &ciErrNum);
	//oclCheckError(ciErrNum, CL_SUCCESS);
	ckFindRadixOffsets        = clCreateKernel(cpProgram, "findRadixOffsets",        &ciErrNum);
	//oclCheckError(ciErrNum, CL_SUCCESS);
	ckScanNaive               = clCreateKernel(cpProgram, "scanNaive",               &ciErrNum);
	//oclCheckError(ciErrNum, CL_SUCCESS);
	ckReorderDataKeysOnly     = clCreateKernel(cpProgram, "reorderDataKeysOnly",     &ciErrNum);
	//oclCheckError(ciErrNum, CL_SUCCESS);
	free(cRadixSort);
    //free(cSourcePath);
}
// Main function
// *********************************************************************
int main(int argc, char** argv) 
{
    shrQAStart(argc, argv);
    
    int use_gpu = 0;
    for(int i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
          
        if(strstr(argv[i], "cpu"))
            use_gpu = 0;        

        else if(strstr(argv[i], "gpu"))
            use_gpu = 1;
    }

    // start logs
    shrSetLogFileName ("oclDXTCompression.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    cl_platform_id cpPlatform = NULL;
    cl_uint uiNumDevices = 0;
    cl_device_id *cdDevices = NULL;
    cl_context cxGPUContext;
    cl_command_queue cqCommandQueue;
    cl_program cpProgram;
    cl_kernel ckKernel;
    cl_mem cmMemObjs[3];
    cl_mem cmAlphaTable4, cmProds4;
    cl_mem cmAlphaTable3, cmProds3;
    size_t szGlobalWorkSize[1];
    size_t szLocalWorkSize[1];
    cl_int ciErrNum;

    // Get the path of the filename
    char *filename;
    if (shrGetCmdLineArgumentstr(argc, (const char **)argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    const char* image_path = shrFindFilePath(image_filename, argv[0]);
    oclCheckError(image_path != NULL, shrTRUE);
    shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);
    oclCheckError(h_img != NULL, shrTRUE);
    shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height);

    // Convert linear image to block linear. 
    const uint memSize = width * height * sizeof(cl_uint);
    uint* block_image = (uint*)malloc(memSize);

    // Convert linear image to block linear. 
    for(uint by = 0; by < height/4; by++) {
        for(uint bx = 0; bx < width/4; bx++) {
            for (int i = 0; i < 16; i++) {
                const int x = i & 3;
                const int y = i / 4;
                block_image[(by * width/4 + bx) * 16 + i] = 
                    ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x];
            }
        }
    }

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Get the platform's GPU devices
    ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 0, NULL, &uiNumDevices);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, uiNumDevices, cdDevices, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Create the context
    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // get and log device
    cl_device_id device;
    if( shrCheckCmdLineFlag(argc, (const char **)argv, "device") ) {
      int device_nr = 0;
      shrGetCmdLineArgumenti(argc, (const char **)argv, "device", &device_nr);
      device = oclGetDev(cxGPUContext, device_nr);
      if( device == (cl_device_id)-1 ) {
          shrLog(" Invalid GPU Device: devID=%d.  %d valid GPU devices detected\n\n", device_nr, uiNumDevices);
		  shrLog(" exiting...\n");
          return -1;
      }
    } else {
      device = oclGetMaxFlopsDev(cxGPUContext);
    }

    oclPrintDevName(LOGBOTH, device);
    shrLog("\n");

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Memory Setup

    // Constants
    cmAlphaTable4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable4[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cmProds4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods4[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cmAlphaTable3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable3[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cmProds3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods3[0], &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Compute permutations.
    cl_uint permutations[1024];
    computePermutations(permutations);

    // Upload permutations.
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  sizeof(cl_uint) * 1024, permutations, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Image
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    
    // Result
    const uint compressedSize = (width / 4) * (height / 4) * 8;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    
    unsigned int * h_result = (uint*)malloc(compressedSize);

    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    oclCheckError(source != NULL, shrTRUE);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
        (const char **) &source, &program_length, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // set the args values
    ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmAlphaTable4);
    ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&cmProds4);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&cmAlphaTable3);
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&cmProds3);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Copy input data host to device
    clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0);

    // Determine launch configuration and run timed computation numIterations times
	int blocks = ((width + 3) / 4) * ((height + 3) / 4); // rounds up by 1 block in each dim if %4 != 0

	// Restrict the numbers of blocks to launch on low end GPUs to avoid kernel timeout
	cl_uint compute_units;
    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
	int blocksPerLaunch = MIN(blocks, 768 * (int)compute_units);

    // set work-item dimensions
    szGlobalWorkSize[0] = blocksPerLaunch * NUM_THREADS;
    szLocalWorkSize[0]= NUM_THREADS;

#ifdef GPU_PROFILING
    shrLog("\nRunning DXT Compression on %u x %u image...\n", width, height);
    shrLog("\n%u Workgroups, %u Work Items per Workgroup, %u Work Items in NDRange...\n\n", 
           blocks, NUM_THREADS, blocks * NUM_THREADS);

    int numIterations = 50;
    for (int i = -1; i < numIterations; ++i) {
        if (i == 0) { // start timing only after the first warmup iteration
            clFinish(cqCommandQueue); // flush command queue
            shrDeltaT(0); // start timer
        }
#endif
        // execute kernel
		for( int j=0; j<blocks; j+= blocksPerLaunch ) {
			clSetKernelArg(ckKernel, 7, sizeof(int), &j);
			szGlobalWorkSize[0] = MIN( blocksPerLaunch, blocks-j ) * NUM_THREADS;
			ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL,
				                              szGlobalWorkSize, szLocalWorkSize, 
					                          0, NULL, NULL);
			oclCheckError(ciErrNum, CL_SUCCESS);
		}

#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dAvgTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %d\n", 
           (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1, szLocalWorkSize[0]); 
#endif

    // blocking read output
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0,
                                   compressedSize, h_result, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Write DDS file.
    FILE* fp = NULL;
    char output_filename[1024];
    #ifdef WIN32
        strcpy_s(output_filename, 1024, image_path);
        strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds");
        fopen_s(&fp, output_filename, "wb");
    #else
        strcpy(output_filename, image_path);
        strcpy(output_filename + strlen(image_path) - 3, "dds");
        fp = fopen(output_filename, "wb");
    #endif
    oclCheckError(fp != NULL, shrTRUE);

    DDSHeader header;
    header.fourcc = FOURCC_DDS;
    header.size = 124;
    header.flags  = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE);
    header.height = height;
    header.width = width;
    header.pitch = compressedSize;
    header.depth = 0;
    header.mipmapcount = 0;
    memset(header.reserved, 0, sizeof(header.reserved));
    header.pf.size = 32;
    header.pf.flags = DDPF_FOURCC;
    header.pf.fourcc = FOURCC_DXT1;
    header.pf.bitcount = 0;
    header.pf.rmask = 0;
    header.pf.gmask = 0;
    header.pf.bmask = 0;
    header.pf.amask = 0;
    header.caps.caps1 = DDSCAPS_TEXTURE;
    header.caps.caps2 = 0;
    header.caps.caps3 = 0;
    header.caps.caps4 = 0;
    header.notused = 0;

    fwrite(&header, sizeof(DDSHeader), 1, fp);
    fwrite(h_result, compressedSize, 1, fp);

    fclose(fp);

    // Make sure the generated image matches the reference image (regression check)
    shrLog("\nComparing against Host/C++ computation...\n");     
    const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]);
    oclCheckError(reference_image_path != NULL, shrTRUE);

    // read in the reference image from file
    #ifdef WIN32
        fopen_s(&fp, reference_image_path, "rb");
    #else
        fp = fopen(reference_image_path, "rb");
    #endif
    oclCheckError(fp != NULL, shrTRUE);
    fseek(fp, sizeof(DDSHeader), SEEK_SET);
    uint referenceSize = (width / 4) * (height / 4) * 8;
    uint * reference = (uint *)malloc(referenceSize);
    fread(reference, referenceSize, 1, fp);
    fclose(fp);

    // compare the reference image data to the sample/generated image
    float rms = 0;
    for (uint y = 0; y < height; y += 4)
    {
        for (uint x = 0; x < width; x += 4)
        {
            // binary comparison of data
            uint referenceBlockIdx = ((y/4) * (width/4) + (x/4));
            uint resultBlockIdx = ((y/4) * (width/4) + (x/4));
            int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);

            // log deviations, if any
            if (cmp != 0.0f) 
            {
                compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);
                shrLog("Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3);
            }
            rms += cmp;
        }
    }
    rms /= width * height * 3;
    shrLog("RMS(reference, result) = %f\n\n", rms);

    // Free OpenCL resources
    oclDeleteMemObjs(cmMemObjs, 3);
    clReleaseMemObject(cmAlphaTable4);
    clReleaseMemObject(cmProds4);
    clReleaseMemObject(cmAlphaTable3);
    clReleaseMemObject(cmProds3);
    clReleaseKernel(ckKernel);
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(cxGPUContext);

    // Free host memory
    free(source);
    free(h_img);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (rms <= ERROR_THRESHOLD) ? QA_PASSED : QA_FAILED);
}
Ejemplo n.º 15
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
    //shrQAStart(argc, argv);


	// get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    
    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("Barrier.txt");
    printf("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); 

    // set and log Global and Local work size dimensions
    szLocalWorkSize = NUM_THREADS ;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    printf("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

    

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

    printf("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    printf("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    printf("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1);
    printf("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


	

    // Read the OpenCL kernel in from source file
    printf("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    printf("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    #ifdef MAC
        char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
        char* flags = "-cl-fast-relaxed-math";
    #endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    printf("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1);
    printf("clCreateKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }



	 // Allocate and initialize host arrays 
    printf( "Allocate and Init Host Mem...\n"); 
    input = (int *)malloc(sizeof(int) * NUM_BLOCKS);

	for(int i =0; i<=NUM_BLOCKS; i++)
	{
		input[i]=0;

	}

	// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
    array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1);
	
	if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Set the Argument values
    
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val);
	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in);
	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out);

   // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements);
    printf("clSetKernelArg 0 - 2...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }






    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back



	ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL);
    
    printf("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }


    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
    printf("clEnqueueNDRangeKernel (Barrier)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

   /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL);
    printf("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }*/


	//GPU_PROFILING
    ciErr1=clWaitForEvents(1, &ceEvent);
	if (ciErr1 != CL_SUCCESS)
    {
        printf("Error 1 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
       
        cl_ulong start, end;
     ciErr1 =   clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
      ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	if (ciErr1 != CL_SUCCESS)
    {
        printf("Error 2 !\n\n");
        Cleanup(argc, argv, EXIT_FAILURE);
    }
        double dSeconds = 1.0e-9 * (double)(end - start);
		printf("Done! time taken %llu \n",end - start );
      // printf("Done! Kernel execution time: %.5f s\n\n", dSeconds);


		// Release event
       clReleaseEvent(ceEvent);
       ceEvent = 0;

    Cleanup (argc, argv,  EXIT_SUCCESS);
}
// Main program
//*****************************************************************************
int main(int argc, char** argv)
{
	pArgc = &argc;
	pArgv = argv;

	shrQAStart(argc, argv);

    // Start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("oclSobelFilter.txt");
    shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile); 

    // Get command line args for quick test or QA test, if provided
    bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    bQATest   = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest");

    // Menu items
    if (!(bQATest))
    {
        ShowMenuItems();
    }

    // Find the path from the exe to the image file 
    cPathAndName = shrFindFilePath(cImageFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n", cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3);

    // Initialize OpenGL items (if not No-GL QA test)
    shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling "); 
    if (!(bQATest))
    {
        InitGL(&argc, argv);
    }

    //Get the NVIDIA platform if available, otherwise use default
    char cBuffer[1024];
    bool bNV = false;
    shrLog("Get Platform ID... ");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("%s\n\n", cBuffer);
    bNV = (strstr(cBuffer, "NVIDIA") != NULL);

    //Get the devices
    shrLog("Get Device Info...\n");
    cl_uint uiNumAllDevs = 0;
    GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup);

    // Get selected device if specified, otherwise examine avaiable ones and choose by perf
    cl_int iSelectedDevice = 0;
    if((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1)) 
    {
        // Use 1 selected device
        GpuDevMngr->uiUsefulDevCt = 1;  
        iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1));
        GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice;
        GpuDevMngr->fLoadProportions[0] = 1.0f;
        shrLog("  Using 1 Selected Device for Sobel Filter Computation...\n"); 
 
    } 
    else 
    {
        // Use available useful devices and Compute the device load proportions
        ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            iSelectedDevice = GpuDevMngr->uiUsefulDevs[0];
        }
        shrLog("    Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt); 
    }

    //Create the context
    shrLog("\nclCreateContext...\n\n");
    cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate per-device OpenCL objects for useful devices
    cqCommandQueue = new cl_command_queue[GpuDevMngr->uiUsefulDevCt];
    ckSobel = new cl_kernel[GpuDevMngr->uiUsefulDevCt];
    cmDevBufIn = new cl_mem[GpuDevMngr->uiUsefulDevCt];
    cmDevBufOut = new cl_mem[GpuDevMngr->uiUsefulDevCt];
    szAllocDevBytes = new size_t[GpuDevMngr->uiUsefulDevCt];
    uiInHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt];
    uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt];
    uiDevImageHeight = new cl_uint[GpuDevMngr->uiUsefulDevCt];

    // Create command queue(s) for device(s)     
    shrLog("clCreateCommandQueue...\n");
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) 
    {
        cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("  CommandQueue %u, Device %u, Device Load Proportion = %.2f, ", i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]); 
        oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]);  
        shrLog("\n");
    }

    // Allocate pinned input and output host image buffers:  mem copy operations to/from pinned memory is much faster than paged memory
    szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int);
    cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); 

    // Get mapped pointers for writing to pinned input and output host image pointers 
    uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); 

    // Load image data from file to pinned input host buffer
    ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char **)&uiInput, &uiImageWidth, &uiImageHeight);
    oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup);
    shrLog("Load Input Image to Input pinned host buffer...\n"); 

    // Read the kernel in from file
    free(cPathAndName);
    cPathAndName = shrFindFilePath(clSourcefile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
    shrLog("Load OpenCL Prog Source from File...\n"); 

    // Create the program object
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateProgramWithSource...\n"); 

    // Build the program with 'mad' Optimization option
#ifdef MAC
    char *flags = "-cl-fast-relaxed-math -DMAC";
#else
    char *flags = "-cl-fast-relaxed-math";
#endif

    ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // On error: write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx");
        Cleanup(EXIT_FAILURE);
    }
    shrLog("clBuildProgram...\n\n"); 

    // Determine, the size/shape of the image portions for each dev and create the device buffers
    unsigned uiSumHeight = 0;
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Create kernel instance
        ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clCreateKernel (ckSobel), Device %u...\n", i); 

        // Allocations and offsets for the portion of the image worked on by each device
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset 
            uiDevImageHeight[i] = uiImageHeight; 
            uiInHostPixOffsets[i] = 0;
            uiOutHostPixOffsets[i] = 0;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else if (i == 0)
        {
            // Multiple devices, top stripe zone including topmost row of image:  
            // Over-allocate on device by 1 row 
            // Set offset and size to copy extra 1 padding row H2D (below bottom of stripe)
            // Won't return the last row (dark/garbage row) D2H
            uiInHostPixOffsets[i] = 0;
            uiOutHostPixOffsets[i] = 0;
            uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight);     // height is proportional to dev perf 
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 1;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle stripe zone:  
            // Over-allocate on device by 2 rows 
            // Set offset and size to copy extra 2 padding rows H2D (above top and below bottom of stripe)
            // Won't return the first and last rows (dark/garbage rows) D2H
            uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth;
            uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth;
            uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight);     // height is proportional to dev perf 
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 2;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else 
        {
            // Multiple devices, last boundary tile:  
            // Over-allocate on device by 1 row 
            // Set offset and size to copy extra 1 padding row H2D (above top of stripe)
            // Won't return the first row (dark/garbage rows D2H 
            uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth;
            uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth;
            uiDevImageHeight[i] = uiImageHeight - uiSumHeight;                              // "leftover" rows 
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 1;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]); 

        // Create the device buffers in GMEM on each device
        cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i); 

        // Set the common argument values for the Median kernel instance for each device
        int iLocalPixPitch = iBlockDimX + 2;
        ciErrNum = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]);
        ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]);
        ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL);
        ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch);
        ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth);
        ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh);        
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clSetKernelArg (0-4), Device %u...\n\n", i); 
    }

    // Set common global and local work sizes for Median kernel
    szLocalWorkSize[0] = iBlockDimX;
    szLocalWorkSize[1] = iBlockDimY;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth); 

    // init running timers
    shrDeltaT(0);   // timer 0 used for computation timing 
    shrDeltaT(1);   // timer 1 used for fps computation

    // Start main GLUT rendering loop for processing and rendering, 
    // or otherwise run No-GL Q/A test sequence
    if (!(bQATest))
    {
        glutMainLoop();
    }
    else 
    {
        TestNoGL();
    }

    Cleanup(EXIT_SUCCESS);
}
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	//////////////////////////////////////////////////////////////////////////
	unsigned int count = iNumElements;
	int k = 8;
	unsigned int random_seed, random_seed2;
	srand( (unsigned)time( NULL ) );
	random_seed = rand();
	random_seed2 = rand();
	//////////////////////////////////////////////////////////////////////////

	// get command line arg for quick test, if provided
	bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

	// start logs 
	shrSetLogFileName ("oclVectorAdd.txt");
	shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); 

	// set and log Global and Local work size dimensions
	szLocalWorkSize = 256;
	szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
	shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
		szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

	// Allocate and initialize host arrays 
	shrLog( "Allocate and Init Host Mem...\n"); 
	srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
	srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
	dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
	Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
	shrFillArray((float*)srcA, iNumElements);
	shrFillArray((float*)srcB, iNumElements);
	//////////////////////////////////////////////////////////////////////////
	float *scalar_value = new float[count];
	float *gradient_magnitude = new float[count];
	float *second_derivative_magnitude = new float[count];
	unsigned char *label_ptr = new unsigned char[count];
	shrFillArray(scalar_value, count);
	shrFillArray(gradient_magnitude, count);
	shrFillArray(second_derivative_magnitude, count);
	//////////////////////////////////////////////////////////////////////////

	//Get an OpenCL platform
	ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

	shrLog("clGetPlatformID...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	//Get the devices
	ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
	shrLog("clGetDeviceIDs...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	//Create the context
	cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
	shrLog("clCreateContext...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Create a command-queue
	cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
	shrLog("clCreateCommandQueue...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
	cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
	cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	//////////////////////////////////////////////////////////////////////////
	cmDevSrc_scalar_value = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
	cmDevSrc_gradient_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	cmDevSrc_second_derivative_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	cmDevDst_label_ptr = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
	ciErr1 |= ciErr2;
	//////////////////////////////////////////////////////////////////////////
	shrLog("clCreateBuffer...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Read the OpenCL kernel in from source file
	shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
	cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
	cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
	printf("%s\n%s\n", cSourceFile, cPathAndName);

	// Create the program
	cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
	shrLog("clCreateProgramWithSource...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Build the program with 'mad' Optimization option
#ifdef MAC
	char* flags = "-cl-fast-relaxed-math -DMAC";
#else
	char* flags = "-cl-fast-relaxed-math";
#endif
	ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
	shrLog("clBuildProgram...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Create the kernel
	ckKernel = clCreateKernel(cpProgram, "k_means", &ciErr1);
	shrLog("clCreateKernel (VectorAdd)...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Set the Argument values
	//ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
	//ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
	//ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
	//ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
	//////////////////////////////////////////////////////////////////////////
	// __global const float *scalar_value, __global const float *gradient_magnitude, __global const float *second_derivative_magnitude, __global unsigned char *label_ptr, __global const unsigned int count, __global const int k, __global const unsigned int random_seed, __global const unsigned int random_seed2
	ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc_scalar_value);
	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrc_gradient_magnitude);
	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevSrc_second_derivative_magnitude);
	ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmDevDst_label_ptr);
	ciErr1 |= clSetKernelArg(ckKernel, 4, sizeof(cl_uint), (void*)&count);
	ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_uint), (void*)&k);
	ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_uint), (void*)&random_seed);
	ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_uint), (void*)&random_seed2);
	//////////////////////////////////////////////////////////////////////////
	shrLog("clSetKernelArg 0 - 3...\n\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// --------------------------------------------------------
	// Start Core sequence... copy input data to GPU, compute, copy results back

	// Asynchronous write of data to GPU device
	//ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL);
	//ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
	ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_scalar_value, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, scalar_value, 0, NULL, NULL);
	ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_gradient_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, gradient_magnitude, 0, NULL, NULL);
	ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_second_derivative_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, second_derivative_magnitude, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
	shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Launch kernel
	ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
	shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}

	// Synchronous/blocking read of results, and check accumulated errors
	//ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
		ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst_label_ptr, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, label_ptr, 0, NULL, NULL);
	//////////////////////////////////////////////////////////////////////////
	shrLog("clEnqueueReadBuffer (Dst)...\n\n"); 
	if (ciErr1 != CL_SUCCESS)
	{
		shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		Cleanup(EXIT_FAILURE);
	}
	//--------------------------------------------------------

	// Compute and compare results for golden-host and report errors and pass/fail
	shrLog("Comparing against Host/C++ computation...\n\n"); 
	VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
	shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
	shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED");

	//////////////////////////////////////////////////////////////////////////
	//float *a = (float *)srcA;
	//float *b = (float *)srcB;
	//float *c = (float *)dst;
	//float *d = (float *)Golden;
	//for (int i=0; i<iNumElements; i++)
	//{
	//	printf("%f+%f=%f=%f\t", a[i], b[i], c[i], a[i]+b[i]);
	//	printf("%s\n", (a[i]+b[i]==c[i]?"equal":"not equal"));
	//}

	//for (int i=0; i<iNumElements; i++)
	//{
	//	printf("%f\n", ((float *)dst)[i]);
	//}
	//////////////////////////////////////////////////////////////////////////

	// Cleanup and leave
	Cleanup (EXIT_SUCCESS);

	//////////////////////////////////////////////////////////////////////////
	delete [] scalar_value;
	delete [] gradient_magnitude;
	delete [] second_derivative_magnitude;
	delete [] label_ptr;
	//////////////////////////////////////////////////////////////////////////
}
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("...loading BlackScholes.cl\n");
        char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
        shrCheckError(cPathAndName != NULL, shrTRUE);
        char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
        shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
        cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
        ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);

        if(ciErrNum != CL_BUILD_SUCCESS){
            shrLog("*** Compilation failure ***\n");

            size_t deviceNum;
            cl_device_id *cdDevices;
            ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &deviceNum);
            shrCheckError(ciErrNum, CL_SUCCESS);

            cdDevices = (cl_device_id *)malloc(deviceNum * sizeof(cl_device_id));
            shrCheckError(cdDevices != NULL, shrTRUE);

            ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, deviceNum * sizeof(cl_device_id), cdDevices, NULL);
            shrCheckError(ciErrNum, CL_SUCCESS);

            size_t logSize;
            char *logTxt;

            ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
            shrCheckError(ciErrNum, CL_SUCCESS);

            logTxt = (char *)malloc(logSize);
            shrCheckError(logTxt != NULL, shrTRUE);

            ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
            shrCheckError(ciErrNum, CL_SUCCESS);

            shrLog("%s\n", logTxt);
            shrLog("*** Exiting ***\n");
            free(logTxt);
            free(cdDevices);
            exit(666);
        }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, oclGetFirstDev(cxGPUContext), "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
        ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
    free(cPathAndName);
}
Ejemplo n.º 19
0
// Solve the problem using a parallel approach with the help of OpenCL
void parallelSolve()
{
	std::chrono::system_clock::time_point timeStart = std::chrono::high_resolution_clock::now();
	float* solutionMatrix = getInitialMatrixAsArray();
	cl_platform_id platformId = nullptr;
	cl_device_id deviceId = nullptr;

	cl_context context = nullptr;
	cl_command_queue commandQueue = nullptr;
	cl_mem bufferPM = nullptr;
	cl_mem bufferCM = nullptr;
	cl_program program = nullptr;

	cl_int status = 0;
	cl_int bufferSize = rowCount * columnCount * sizeof(float);
	cl_uint numPlatforms;
	cl_uint numDevices;

	// Setup the context, command queue, buffer, and program.
	status = clGetPlatformIDs(1, &platformId, &numPlatforms);
	checkForError(status, "getting platforms");
	printf("Num platforms : %d\nPlatform ID : %d\n", numPlatforms, platformId);

	status = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_GPU, 1, &deviceId, &numDevices);
	checkForError(status, "getting devices");
	printf("Num devices : %d\nDevice ID : %d\n", numDevices, deviceId);

	context = clCreateContext(0, 1, &deviceId, nullptr, nullptr, &status);
	checkForError(status, "creating context");

	commandQueue = clCreateCommandQueue(context, deviceId, 0, &status);
	checkForError(status, "creating commandQueue");

	bufferPM = clCreateBuffer(context, CL_MEM_READ_WRITE, bufferSize, 0, &status);
	checkForError(status, "creating bufferPM");

	bufferCM = clCreateBuffer(context, CL_MEM_READ_WRITE, bufferSize, 0, &status);
	checkForError(status, "creating bufferCM");

	size_t programLength = 0;
	char* programSource = oclLoadProgSource("TP4.cl", "", &programLength);

	program = clCreateProgramWithSource(context, 1, (const char **)&programSource, &programLength, &status);
	checkForError(status, "creating program from source");

	status = clBuildProgram(program, 0, nullptr, "", nullptr, nullptr);
	checkForError(status, "building program");

	float* pM = getInitialMatrixAsArray(); // Previous Matrix (shortened for equation brevity
	float* cM = getInitialMatrixAsArray(); // Current Matrix (shortened for equation brevity)

	status = clEnqueueWriteBuffer(commandQueue, bufferPM, true, 0, bufferSize, pM, 0, nullptr, nullptr);
	checkForError(status, "Copying the initial matrix (odd)");

	status = clEnqueueWriteBuffer(commandQueue, bufferCM, true, 0, bufferSize, cM, 0, nullptr, nullptr);
	checkForError(status, "Copying the initial matrix (even)");

	// The work size corresponds to the size of the matrix excluding borders
	size_t globalWorkSize[] = { (rowCount - 2) * (columnCount - 2)};

	// The two kernels use the same function but have opposite read/write matrices
	cl_kernel kernelOdd;
	cl_kernel kernelEven;

	kernelOdd = clCreateKernel(program, "HeatTransfer", &status);
	status = clSetKernelArg(kernelOdd, 0, sizeof(bufferCM), &bufferCM);
	status = clSetKernelArg(kernelOdd, 1, sizeof(bufferPM), &bufferPM);
	status = clSetKernelArg(kernelOdd, 2, sizeof(rowCount), &rowCount);
	status = clSetKernelArg(kernelOdd, 3, sizeof(columnCount), &columnCount);
	status = clSetKernelArg(kernelOdd, 4, sizeof(td), &td);
	status = clSetKernelArg(kernelOdd, 5, sizeof(h), &h);

	kernelEven = clCreateKernel(program, "HeatTransfer", &status);
	status = clSetKernelArg(kernelEven, 0, sizeof(bufferPM), &bufferPM);
	status = clSetKernelArg(kernelEven, 1, sizeof(bufferCM), &bufferCM);
	status = clSetKernelArg(kernelEven, 2, sizeof(rowCount), &rowCount);
	status = clSetKernelArg(kernelEven, 3, sizeof(columnCount), &columnCount);
	status = clSetKernelArg(kernelEven, 4, sizeof(td), &td);
	status = clSetKernelArg(kernelEven, 5, sizeof(h), &h);

	// Perform actual calculations
	int k;
	for (k = 0; k < timeSteps; k++)
	{
		cl_event taskComplete;
		if (k % 2 == 0)
			status = clEnqueueNDRangeKernel(commandQueue, kernelEven, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &taskComplete);
		else
			status = clEnqueueNDRangeKernel(commandQueue, kernelOdd, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &taskComplete);
		clWaitForEvents(1, &taskComplete);
		clReleaseEvent(taskComplete);
	}

	// Read appropriate buffer and display the result
	if (k % 2 == 0)
		status = clEnqueueReadBuffer(commandQueue, bufferCM, true, 0, bufferSize, solutionMatrix, 0, nullptr, nullptr);
	else
		status = clEnqueueReadBuffer(commandQueue, bufferPM, true, 0, bufferSize, solutionMatrix, 0, nullptr, nullptr);

	checkForError(status, "reading solution matrix");

	std::chrono::system_clock::time_point timerStop = std::chrono::high_resolution_clock::now();
	parSolveTime = (double)std::chrono::duration_cast<std::chrono::nanoseconds>(timerStop - timeStart).count() / 1000000;
	printMatrixAsArray(rowCount, columnCount, solutionMatrix);
	printf("Parallel solve duration : %.2f ms\n", parSolveTime);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    shrQAStart(argc, argv);
    // start logs 
    shrSetLogFileName ("oclSimpleMultiGPU.txt");
    shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); 

    // OpenCL
    cl_platform_id cpPlatform;
    cl_uint ciDeviceCount;
    cl_device_id* cdDevices;
    cl_context cxGPUContext;
    cl_device_id cdDevice;                          // GPU device
    int deviceNr[MAX_GPU_COUNT];
    cl_command_queue commandQueue[MAX_GPU_COUNT];
    cl_mem d_Data[MAX_GPU_COUNT];
    cl_mem d_Result[MAX_GPU_COUNT];
    cl_program cpProgram; 
    cl_kernel reduceKernel[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];
    size_t programLength;
    cl_int ciErrNum;			               
    char cDeviceName [256];
    cl_mem h_DataBuffer;

    // Vars for reduction results
    float h_SumGPU[MAX_GPU_COUNT * ACCUM_N];   
    float *h_Data;
    double sumGPU;
    double sumCPU, dRelError;

    // allocate and init host buffer with with some random generated input data
    h_Data = (float *)malloc(DATA_N * sizeof(float));
    shrFillArray(h_Data, DATA_N);

    // start timer & logs 
    shrLog("Setting up OpenCL on the Host...\n\n"); 
    shrDeltaT(1);

    // Annotate profiling state
    #ifdef GPU_PROFILING
        shrLog("OpenCL Profiling is enabled...\n\n"); 
    #endif

     //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetPlatformID...\n"); 

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clGetDeviceIDs...\n"); 

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateContext...\n");

    // Set up command queue(s) for GPU's specified on the command line or all GPU's
    if(shrCheckCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // User specified GPUs
        int ciMaxDeviceID = ciDeviceCount-1;

        ciDeviceCount = 0;
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   

        // Create command queues for all Requested GPU's
        while(deviceStr != NULL) 
        {
            // get & log device index # and name
            deviceNr[ciDeviceCount] = atoi(deviceStr);
            if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) {
                shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]);
                return 1;
            }

            cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName);

            // create a command que
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n"); 

            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
        size_t nDeviceBytes;
        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        oclCheckError(ciErrNum, CL_SUCCESS);
        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
        {
            // get & log device index # and name
            deviceNr[i] = i;
            cdDevice = oclGetDev(cxGPUContext, i);
            ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog(" Device %i: %s\n", i, cDeviceName);

            // create a command que
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckError(ciErrNum, CL_SUCCESS);
            shrLog("clCreateCommandQueue\n\n"); 
        }
    }

    // Load the OpenCL source code from the .cl file 
    const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]);
    char *source = oclLoadProgSource(source_path, "", &programLength);
    oclCheckError(source != NULL, shrTRUE);
    shrLog("oclLoadProgSource\n"); 

    // Create the program for all GPUs in the context
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateProgramWithSource\n"); 
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS); 
    }
    shrLog("clBuildProgram\n"); 

    // Create host buffer with page-locked memory
    h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR,
                                  DATA_N * sizeof(float), h_Data, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    shrLog("clCreateBuffer (Page-locked Host)\n\n"); 

    // Create buffers for each GPU, with data divided evenly among GPU's
    int sizePerGPU = DATA_N / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];
    workOffset[0] = 0;
    for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
    {
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]);        

        // Input buffer
        d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); 

        // Copy data from host to device
        ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 
                                      0, workSize[i] * sizeof(float), 0, NULL, NULL);        
        shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i);

        // Output buffer
        d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateBuffer (Output)\t\tDev %i\n", i);
        
        // Create kernel
        reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateKernel\t\t\tDev %i\n", i); 
        
        // Set the args values and check for errors
        ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]);
        ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clSetKernelArg\t\t\tDev %i\n\n", i);

        workOffset[i + 1] = workOffset[i] + workSize[i];
    }

    // Set # of work items in work group and total in 1 dimensional range
    size_t localWorkSize[] = {THREAD_N};        
    size_t globalWorkSize[] = {ACCUM_N};        

    // Start timer and launch reduction kernel on each GPU, with data split between them 
    shrLog("Launching Kernels on GPU(s)...\n\n");
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {        
        ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize,
                                         0, NULL, &GPUExecution[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }
    
    // Copy result from device to host for each device
    for(unsigned int i = 0; i < ciDeviceCount; i++) 
    {
        ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), 
                            h_SumGPU + i *  ACCUM_N, 0, NULL, &GPUDone[i]);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Synchronize with the GPUs and do accumulated error check
    clWaitForEvents(ciDeviceCount, GPUDone);
    shrLog("clWaitForEvents complete...\n\n"); 

    // Aggregate results for multiple GPU's and stop/log processing time
    sumGPU = 0;
    for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++)
    {
         sumGPU += h_SumGPU[i];
    }

    // Print Execution Times for each GPU
    #ifdef GPU_PROFILING
        shrLog("Profiling Information for GPU Processing:\n\n");
        for(unsigned int i = 0; i < ciDeviceCount; i++) 
        {
            cdDevice = oclGetDev(cxGPUContext, deviceNr[i]);
            clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL);
            shrLog("Device %i : %s\n", deviceNr[i], cDeviceName);
            shrLog("  Reduce Kernel     : %.5f s\n", executionTime(GPUExecution[i]));
            shrLog("  Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i]));
        }
    #endif

    // Run the computation on the Host CPU and log processing time 
    shrLog("Launching Host/CPU C++ Computation...\n\n");
    sumCPU = 0;
    for(unsigned int i = 0; i < DATA_N; i++)
    {
        sumCPU += h_Data[i];
    }

    // Check GPU result against CPU result 
    dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU);
    shrLog("Comparing against Host/C++ computation...\n"); 
    shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError);

    // cleanup 
    free(source);
    free(h_Data);
    for(unsigned int i = 0; i < ciDeviceCount; ++i ) 
    {
        clReleaseKernel(reduceKernel[i]);
        clReleaseCommandQueue(commandQueue[i]);
    }
    clReleaseProgram(cpProgram);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED);
  }
Ejemplo n.º 21
0
int main(int argc, char* argv[]) {
  struct pb_TimerSet timers;
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }
  
  char oclOverhead[] = "OCL Overhead";
  char prescans[] = "PreScanKernel";
  char postpremems[] = "PostPreMems";
  char intermediates[] = "IntermediatesKernel";
  char mains[] = "MainKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, prescans, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, postpremems, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, mains, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;
  unsigned int lmemKB;
  unsigned int nThreads;
  unsigned int bins_per_block;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(parameters);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[4];
  
  cl_kernel histo_prescan_kernel;
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_main_kernel;
  cl_kernel histo_final_kernel;

  int even_width = ((img_width+1)/2)*2;

  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_histo;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);  
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  
  long unsigned int lmemSize = 0;
  OCL_ERRCK_RETVAL ( clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &lmemSize, NULL) );
  
  // lmemKB = lmemSize / 1024; // Should be valid, but not taken into consideration for initial programming
  
  if (lmemSize >= 48*1024) {
    lmemKB = 48;
  } else if (lmemSize >= 24*1024) {
    lmemKB = 24;
  } else {
    lmemKB = 8;
  }
  
  lmemKB = 24;
  
  bins_per_block = lmemKB * 1024;
  
  switch (lmemKB) {
    case 48: nThreads = 1024; break;
    case 24: nThreads = 768; break;
    default: nThreads = 512; break;
  }
  
  
  
  size_t program_length[4];
  const char *source_path[4] = { "src/opencl_nvidia/histo_prescan.cl",
    "src/opencl_nvidia/histo_intermediates.cl", "src/opencl_nvidia/histo_main.cl","src/opencl_nvidia/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 4; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  char compileOptions[1024];
  //                -cl-nv-verbose // Provides register info for NVIDIA devices
  // Set all Macros referenced by kernels
  sprintf(compileOptions, "\
                -D PRESCAN_THREADS=%u\
                -D KB=%u -D UNROLL=%u\
                -D BINS_PER_BLOCK=%u -D BLOCK_X=%u",
                
                PRESCAN_THREADS,
                lmemKB, UNROLL,
                bins_per_block, BLOCK_X
            ); 
  
  for (int i = 0; i < 4; ++i) {
//fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, compileOptions, NULL, NULL) );
       
          /*
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
       */
  }
  	
  histo_prescan_kernel = clCreateKernel(clProgram[0], "histo_prescan_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_intermediates_kernel = clCreateKernel(clProgram[1], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_main_kernel = clCreateKernel(clProgram[2], "histo_main_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[3], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);  	

  	
  pb_SwitchToTimer(&timers, pb_TimerID_IO);  

  input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      even_width*(((img_height+UNROLL)/UNROLL)*UNROLL)*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      2*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  global_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned short), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) malloc(sizeof(unsigned int) *img_width*histo_height);
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
  memset(zeroData, 0, img_width*histo_height*sizeof(unsigned int));
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_FALSE, 
                          y*even_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 1, sizeof(unsigned int), &img_dim) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 2, sizeof(cl_mem), (void *)&ranges) );

  unsigned int half_width = (img_width+1)/2;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 3, sizeof(unsigned int), &half_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 4, sizeof(cl_mem), (void *)&sm_mappings) );

  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 0, sizeof(cl_mem), (void *)&sm_mappings) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 1, sizeof(unsigned int), &img_dim) );

  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 4, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 5, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 6, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 7, sizeof(cl_mem), (void *)&global_histo) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 8, sizeof(cl_mem), (void *)&global_overflow) );
  

  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 4, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 5, sizeof(cl_mem), (void *)&global_histo) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 6, sizeof(cl_mem), (void *)&global_overflow) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 7, sizeof(cl_mem), (void *)&final_histo) );

  size_t prescan_localWS[1] = {PRESCAN_THREADS};
  size_t prescan_globalWS[1] = {PRESCAN_BLOCKS_X*prescan_localWS[0]};
  size_t inter_localWS[1] = {(img_width+1)/2};
  size_t inter_globalWS[1] = {((img_height + UNROLL-1)/UNROLL) * inter_localWS[0]};
  size_t main_localWS[2] = {nThreads, 1};
  size_t main_globalWS[2];  main_globalWS[0] = BLOCK_X * main_localWS[0];
  size_t final_localWS[1] = {512};
  size_t final_globalWS[1] = {BLOCK_X*3 * final_localWS[0]};
    

  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX/2, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?

  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                                                    
  pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL);
                         
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_prescan_kernel, 1, 0,
                            prescan_globalWS, prescan_localWS, 0, 0, 0) );

  pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL);
    
  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to read
                          ranges_h, // Host Source
                          0, NULL, NULL) );

  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          img_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );

  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
                     
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );                          

  main_globalWS[1] = ranges_h[1]-ranges_h[0]+1;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 2, sizeof(unsigned int), &ranges_h[0]) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 3, sizeof(unsigned int), &ranges_h[1]) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &ranges_h[0]) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &ranges_h[1]) );

  pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL);


  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_main_kernel, 2, 0,
                            main_globalWS, main_localWS, 0, 0, 0) );

  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );
  
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);


  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_prescan_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_main_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[2]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[3]) );

  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_histo) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );
  


  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }


  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);


  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );
  
  pb_DestroyTimerSet(&timers);

  sleep(1);

  return 0;
}
// Init OpenCL
//*****************************************************************************
int initCL(int argc, const char** argv)
{
    cl_platform_id cpPlatform;
    cl_uint uiDevCount;
    cl_device_id *cdDevices;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get the number of GPU devices available to the platform
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create the device list
    cdDevices = new cl_device_id [uiDevCount];
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get device requested on command line, if any
    unsigned int uiDeviceUsed = 0;
    unsigned int uiEndDev = uiDevCount - 1;
    if(shrGetCmdLineArgumentu(argc, argv, "device", &uiDeviceUsed))
    {
      uiDeviceUsed = CLAMP(uiDeviceUsed, 0, uiEndDev);
      uiEndDev = uiDeviceUsed; 
    } 

    // Check if the requested device (or any of the devices if none requested) supports context sharing with OpenGL   
    if(bGLinterop && !bQATest)
    {
        bool bSharingSupported = false;
        for(unsigned int i = uiDeviceUsed; (!bSharingSupported && (i <= uiEndDev)); ++i) 
        {
            size_t extensionSize;
            ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize );
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
            if(extensionSize > 0) 
            {
                char* extensions = (char*)malloc(extensionSize);
                ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize);
                oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
                std::string stdDevString(extensions);
                free(extensions);

                size_t szOldPos = 0;
                size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
                while (szSpacePos != stdDevString.npos)
                {
                    if( strcmp(GL_SHARING_EXTENSION, stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 ) 
                    {
                        // Device supports context sharing with OpenGL
                        uiDeviceUsed = i;
                        bSharingSupported = true;
                        break;
                    }
                    do 
                    {
                        szOldPos = szSpacePos + 1;
                        szSpacePos = stdDevString.find(' ', szOldPos);
                    } 
                    while (szSpacePos == szOldPos);
                }
            }
        }
       
        shrLog("%s...\n\n", bSharingSupported ? "Using CL-GL Interop" : "No device found that supports CL/GL context sharing");  
        oclCheckErrorEX(bSharingSupported, true, pCleanup);

        // Define OS-specific context properties and create the OpenCL context
        #if defined (__APPLE__) || defined (MACOSX)
            CGLContextObj kCGLContext = CGLGetCurrentContext();
            CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
            cl_context_properties props[] = 
            {
                CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 
                0 
            };
            cxGPUContext = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum);
        #else
            #ifdef UNIX
                cl_context_properties props[] = 
                {
                    CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), 
                    CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), 
                    CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 
                    0
                };
                cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
            #else // Win32
                cl_context_properties props[] = 
                {
                    CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), 
                    CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 
                    CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 
                    0
                };
                cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
            #endif
        #endif
    }
    else 
    {
		// No GL interop
        cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0};
        cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);

		bGLinterop = shrFALSE;
    }

    shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Log device used 
    shrLog("Device # %u, ", uiDeviceUsed);
    oclPrintDevName(LOGBOTH, cdDevices[uiDeviceUsed]);
    shrLog("\n");

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Memory Setup
	if( bGLinterop ) {
        cl_pbos[0] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_READ_ONLY, pbo_source, &ciErrNum);
        cl_pbos[1] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, pbo_dest, &ciErrNum);
	} else {
        cl_pbos[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * image_width * image_height, NULL, &ciErrNum);
        cl_pbos[1] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 4 * image_width * image_height, NULL, &ciErrNum);
	}
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath(clSourcefile, argv[0]);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    oclCheckErrorEX(source != NULL, shrTRUE, pCleanup);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclPostProcessGL.ptx");
        Cleanup(EXIT_FAILURE); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, "postprocess", &ciErrNum);

    // set the args values
    ciErrNum |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &(cl_pbos[0]));
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &(cl_pbos[1]));
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(image_width), &image_width);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(image_width), &image_height);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    
    return 0;
}
Ejemplo n.º 23
0
int InitOpenCLContext() 
{
	// start logs
	shrSetLogFileName ("oclVolumeRender.txt");

	// get command line arg for quick test, if provided
	// process command line arguments

	// First initialize OpenGL context, so we can properly setup the OpenGL / OpenCL interop.

// 	glewInit();
// 	GLboolean bGLEW = glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object"); 
// 	oclCheckErrorEX(bGLEW, shrTRUE, pCleanup);
	g_glInterop = true;


	// Create OpenCL context, get device info, select device, select options for image/texture and CL-GL interop
	createCLContext();

	// Print device info
	clGetDeviceInfo(cdDevices[uiDeviceUsed], CL_DEVICE_IMAGE_SUPPORT, sizeof(g_bImageSupport), &g_bImageSupport, NULL);
	//shrLog("%s...\n\n", g_bImageSupport ? "Using Image (Texture)" : "No Image (Texuture) Support");      
//	shrLog("Detailed Device info:\n\n");
	oclPrintDevInfo(LOGBOTH, cdDevices[uiDeviceUsed]);

	// create a command-queue
	cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	// Program Setup
	size_t program_length;
	cPathAndName = shrFindFilePath("Transform.cl", ".");
	oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
	cSourceCL = oclLoadProgSource(cPathAndName, "", &program_length);
	oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

	// create the program
	cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
		(const char **)&cSourceCL, &program_length, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	// build the program
	std::string buildOpts = "-cl-single-precision_constant";
//	buildOpts += g_bImageSupport ? " -DIMAGE_SUPPORT" : "";
//	ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],"-cl-fast-relaxed-math", NULL, NULL);
	ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],NULL, NULL, NULL);
	if (ciErrNum != CL_SUCCESS)
	{
		// write out standard error, Build Log and PTX, then cleanup and return error
		shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclVolumeRender.ptx");
		Cleanup(EXIT_FAILURE); 
	}

	// create the kernel
	ScalseKernel = clCreateKernel(cpProgram, "d_render", &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	TransformKernel = clCreateKernel(cpProgram, "angle", &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

	LongToShortKernel = clCreateKernel(cpProgram, "transfer", &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	return TRUE;
}
Ejemplo n.º 24
0
int
main(int argc, char **argv)
{
  struct image_i16 *ref_image;
  struct image_i16 *cur_image;
  unsigned short *sads_computed; /* SADs generated by the program */

  int image_size_bytes;
  int image_width_macroblocks, image_height_macroblocks;
  int image_size_macroblocks;

  struct pb_TimerSet timers;
  struct pb_Parameters *params;

  char oclOverhead[]= "OpenCL Overhead";

  pb_InitializeTimerSet(&timers);
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  params = pb_ReadParameters(&argc, argv);

  if (pb_Parameters_CountInputs(params) != 2)
    {
      fprintf(stderr, "Expecting two input filenames\n");
      exit(-1);
    }

  /* Read input files */
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  ref_image = load_image(params->inpFiles[0]);
  cur_image = load_image(params->inpFiles[1]);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  if ((ref_image->width != cur_image->width) ||
      (ref_image->height != cur_image->height))
    {
      fprintf(stderr, "Input images must be the same size\n");
      exit(-1);
    }
  if ((ref_image->width % 16) || (ref_image->height % 16))
    {
      fprintf(stderr, "Input image size must be an integral multiple of 16\n");
      exit(-1);
    }

  /* Compute parameters, allocate memory */
  image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
  image_width_macroblocks = ref_image->width >> 4;
  image_height_macroblocks = ref_image->height >> 4;
  image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;

  sads_computed = (unsigned short *)
    malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));

  // Run the kernel code
  // ************************************************************************

  	cl_int ciErrNum;
	cl_command_queue clCommandQueue;

	cl_kernel mb_sad_calc;
	cl_kernel larger_sad_calc_8;
	cl_kernel larger_sad_calc_16;

	cl_mem imgRef;		/* Reference image on the device */
	cl_mem d_cur_image;	/* Current image on the device */
	cl_mem d_sads;		/* SADs on the device */

    // x : image_width_macroblocks
    // y : image_height_macroblocks

  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(params);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found.");
    return -1;
  }

    cl_int clStatus;
    cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
    cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
    cl_context clContext = (cl_context) pb_context->clContext;

  	clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);

    pb_SetOpenCL(&clContext, &clCommandQueue);
    pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  	// Read Source Code File
  	size_t program_length;
    const char* source_path = "src/opencl_base/kernel.cl";
    char* source = oclLoadProgSource(source_path, "", &program_length);
    if(!source) {
        fprintf(stderr, "Could not load program source\n"); exit(1);
    }

  	cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);

  	free(source);

    // JIT Compilation Options
    char compileOptions[1024];
    //                -cl-nv-verbose
    sprintf(compileOptions, "\
                -D MAX_POS=%u -D CEIL_POS=%u\
                -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\
                -D THREADS_W=%u -D THREADS_H=%u\
                -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\
                \0",
                MAX_POS, CEIL(MAX_POS, POS_PER_THREAD),
                POS_PER_THREAD,   MAX_POS_PADDED,
                THREADS_W,   THREADS_H,
                SEARCH_RANGE, SEARCH_DIMENSION
            );
    printf ("options = %s\n", compileOptions);

    OCL_ERRCK_RETVAL( clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL) );

   /*
   char *build_log;
       size_t ret_val_size;
       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
       build_log = (char *)malloc(ret_val_size+1);
       OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );

       // Null terminate (original writer wasn't sure)
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
   */

    mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum);
   	OCL_ERRCK_VAR(ciErrNum);
   	larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum);
   	OCL_ERRCK_VAR(ciErrNum);
   	larger_sad_calc_16 = clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum);
   	OCL_ERRCK_VAR(ciErrNum);

    size_t wgSize;
    size_t comp_wgSize[3];
    cl_ulong localMemSize;
    size_t prefwgSizeMult;
    cl_ulong privateMemSize;

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

#if 0
    cl_image_format img_format;
    img_format.image_channel_order = CL_R;
    img_format.image_channel_data_type = CL_UNSIGNED_INT16;

    /* Transfer reference image to device */
	imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format,
                                      ref_image->width /** sizeof(unsigned short)*/, // width
                                      ref_image->height, // height
                                      ref_image->width * sizeof(unsigned short), // row_pitch
                                      ref_image->data, &ciErrNum);
#endif

#if 1
    imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
                            ref_image->width * ref_image->height * sizeof(unsigned short),
                            NULL, &ciErrNum);
    OCL_ERRCK_VAR(ciErrNum);
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, imgRef, CL_TRUE,
        0,
        ref_image->width * ref_image->height * sizeof(unsigned short),
        ref_image->data, 0, NULL, NULL) );
#else
    imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            ref_image->width * ref_image->height * sizeof(unsigned short),
                            ref_image->data, &ciErrNum);
    printf ("Allocating %d bytes\n", ref_image->width * ref_image->height * sizeof(unsigned short));

#endif
    OCL_ERRCK_VAR(ciErrNum);

    /* Allocate SAD data on the device */

    unsigned short *tmpZero = (unsigned short *)calloc(41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short));

/*
    size_t max_alloc_size = 0;
    clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(max_alloc_size), &max_alloc_size, NULL);
    if (max_alloc_size < (41 * MAX_POS_PADDED *
            image_size_macroblocks * sizeof(unsigned short))) {
      fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n",
              (int) (max_alloc_size >> 20));
      exit(-1);
    }
*/

    d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), tmpZero, &ciErrNum);
    OCL_ERRCK_VAR(ciErrNum);
    free(tmpZero);

    d_cur_image = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_size_bytes, cur_image->data, &ciErrNum);
    OCL_ERRCK_VAR(ciErrNum);

	/* Set Kernel Parameters */

	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef) );

	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), &image_width_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), &image_height_macroblocks) );

	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), &image_width_macroblocks) );
	OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), &image_height_macroblocks) );

	size_t mb_sad_calc_localWorkSize[2] = {
	    CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H,
	    1 };
	size_t mb_sad_calc_globalWorkSize[2] = {
        mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W),
	    mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H) };

	size_t larger_sad_calc_8_localWorkSize[2] = {32,4};
	size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32,
	  image_height_macroblocks * 4};

	size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
	size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32,
	  image_height_macroblocks * 1};

    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

    /* Run the 4x4 kernel */
    printf ("DBlock = %dx%d\n", mb_sad_calc_localWorkSize[1], mb_sad_calc_localWorkSize[0]);
	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, mb_sad_calc_globalWorkSize, mb_sad_calc_localWorkSize, 0, 0, 0) );

	/* Run the larger-blocks kernels */
	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, larger_sad_calc_8_localWorkSize, 0, 0, 0) );

	OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, 0, larger_sad_calc_16_globalWorkSize, larger_sad_calc_16_localWorkSize, 0, 0, 0) );

    OCL_ERRCK_RETVAL( clFinish(clCommandQueue) );
    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    /* Transfer SAD data to the host */
    OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE,
        0,
        41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short),
        sads_computed, 0, NULL, NULL) );

    /* Free GPU memory */
    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_8) );
    OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_16) );
    OCL_ERRCK_RETVAL( clReleaseProgram(clProgram) );

    OCL_ERRCK_RETVAL( clReleaseMemObject(d_sads) );
    OCL_ERRCK_RETVAL( clReleaseMemObject(imgRef) );
    OCL_ERRCK_RETVAL( clReleaseMemObject(d_cur_image) );

    OCL_ERRCK_RETVAL( clFinish(clCommandQueue) );
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  // ************************************************************************
  // End GPU Code

  /* Print output */
  if (params->outFile)
    {
      pb_SwitchToTimer(&timers, pb_TimerID_IO);
      write_sads(params->outFile,
		 image_width_macroblocks,
		 image_height_macroblocks,
		 sads_computed);
      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
    }

#if 0  /* Debugging */
  print_test_sads(sads_computed, image_size_macroblocks);
  write_sads_directly("sad-debug.bin",
		      ref_image->width / 16, ref_image->height / 16,
		      sads_computed);
#endif

  /* Free memory */
  free(sads_computed);
  free_image(ref_image);
  free_image(cur_image);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(params);

  OCL_ERRCK_RETVAL( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL( clReleaseContext(clContext) );

  pb_DestroyTimerSet(&timers);

  return 0;
}
Ejemplo n.º 25
0
// Main function 
// *********************************************************************
int ymain(int argc, char **argv)
{
    shrQAStart(argc, argv);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    
    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("oclVectorAdd2.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); 

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 

    // Allocate and initialize host arrays 
    shrLog( "Allocate and Init Host Mem...\n"); 
    srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, iNumElements);
    shrFillArray((float*)srcB, iNumElements);

    //Get an OpenCL platform
    ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);

    shrLog("clGetPlatformID...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Get the devices
    ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    shrLog("clGetDeviceIDs...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
    shrLog("clCreateContext...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
    shrLog("clCreateCommandQueue...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2);
    ciErr1 |= ciErr2;
    shrLog("clCreateBuffer...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }
    
    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    shrLog("Looking for: %s in Path: %s\n", cSourceFile, argv[0]);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    shrLog("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Build the program with 'mad' Optimization option
    #ifdef MAC
        char* flags = "-cl-fast-relaxed-math -DMAC";
    #else
        char* flags = "-cl-fast-relaxed-math";
    #endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    shrLog("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
    shrLog("clCreateKernel (VectorAdd)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Set the Argument values
    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
    shrLog("clSetKernelArg 0 - 3...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // --------------------------------------------------------
    // Start Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL);
    ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL);
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Launch kernel
    ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }

    // Synchronous/blocking read of results, and check accumulated errors
    ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    shrLog("clEnqueueReadBuffer (Dst)...\n\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        Cleanup(argc, argv, EXIT_FAILURE);
    }
    //--------------------------------------------------------

    // Compute and compare results for golden-host and report errors and pass/fail
    shrLog("Comparing against Host/C++ computation...\n\n"); 
    VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);

    // Cleanup and leave
    Cleanup (argc, argv, (bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE);
}
Ejemplo n.º 26
0
int main(int argc, char* argv[]) {
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }

  
  struct pb_TimerSet timers;
  
  char oclOverhead[] = "OCL Overhead";
  char intermediates[] = "IntermediatesKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
    
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  
  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext();
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[2];
  
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_final_kernel;
  
  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  cl_uint workItemDimensions;
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
  
  size_t workItemSizes[workItemDimensions];
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
  
  size_t program_length[2];
  const char *source_path[2] = { 
    "src/opencl_naive/histo_intermediates.cl", 
   "src/opencl_naive/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 2; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  for (int i = 0; i < 2; ++i) {
    //fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) );
       
    #if 1
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
     #endif
  }
  	
  histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);  

  input =           clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  ranges =          clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  final_histo =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, 
                          y*img_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) );

  size_t inter_localWS[1] = { workItemSizes[0] };
  size_t inter_globalWS[1] = { img_height * inter_localWS[0] };
  
  size_t final_localWS[1] = { workItemSizes[0] };
  size_t final_globalWS[1] = {((histo_height*histo_width+(final_localWS[0]-1)) /
                                          final_localWS[0])*final_localWS[0] };
  
  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?


  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                          
  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          histo_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );
                          
  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );              
  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);                            
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );                           
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );                         

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  
  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );

  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  pb_DestroyTimerSet(&timers);

  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );

  return 0;
}
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
    gp_argc = &argc;
    gp_argv = &argv;

    shrQAStart(argc, argv);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    //Get all the devices
    cl_uint uiNumDevices = 0;           // Number of devices available
    cl_uint uiTargetDevice = 0;	        // Default Device to compute on
    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

    // start logs
    cExecutableName = argv[0];
    shrSetLogFileName ("oclDotProduct.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

    // Allocate and initialize host arrays
    shrLog( "Allocate and Init Host Mem...\n");
    srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, 4 * iNumElements);
    shrFillArray((float*)srcB, 4 * iNumElements);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get a GPU device
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevices[uiTargetDevice], NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program with 'mad' Optimization option
#ifdef MAC
    char* flags = "-cl-fast-relaxed-math -DMAC";
#else
    char* flags = "-cl-fast-relaxed-math";
#endif
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
        Cleanup(EXIT_FAILURE);
    }

    // Create the kernel
    shrLog("clCreateKernel (DotProduct)...\n");
    ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum);

    // Set the Argument values
    shrLog("clSetKernelArg 0 - 3...\n\n");
    ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Launch kernel
    shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read back results and check accumulated errors
    shrLog("clEnqueueReadBuffer (Dst)...\n\n");
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Compute and compare results for golden-host and report errors and pass/fail
    shrLog("Comparing against Host/C++ computation...\n\n");
    DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);

    // Cleanup and leave
    Cleanup (EXIT_SUCCESS);
}
Ejemplo n.º 28
0
void sort (int numElems, unsigned int max_value, cl_mem* &dkeysPtr, cl_mem* &dvaluesPtr, cl_mem* &dkeys_oPtr, cl_mem* &dvalues_oPtr, cl_context *clContextPtr, cl_command_queue clCommandQueue, const cl_device_id clDevice, size_t *workItemSizes){
  
  size_t block[1] = { SORT_BS };
  size_t grid[1] = { ((numElems+4*SORT_BS-1)/(4*SORT_BS)) * block[0] };

  unsigned int iterations = 0;
  while(max_value > 0){
    max_value >>= BITS;
    iterations++;
  }

  cl_int ciErrNum;
  
  cl_context clContext = *clContextPtr;
  cl_program sort_program;
  cl_kernel splitSort;
  cl_kernel splitRearrange;
  
  cl_mem dhisto;
  cl_mem* original = dkeysPtr;

  unsigned int *zeroData;
  zeroData = (unsigned int *) calloc( (1<<BITS)*grid[0], sizeof(unsigned int) );
  if (zeroData == NULL) { fprintf(stderr, "Could not allocate host memory! (%s: %d)\n", __FILE__, __LINE__); exit(1); }

  dhisto = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, (1<<BITS)*((numElems+4*SORT_BS-1)/(4*SORT_BS))*sizeof(unsigned int), zeroData, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  
  free(zeroData);
  
  //char compileOptions[256];
  //                -cl-nv-verbose // Provides register info for NVIDIA devices
  // Set all Macros referenced by kernels
  /*  sprintf(compileOptions, "\
                -D CUTOFF2_VAL=%f -D CUTOFF_VAL=%f\
                -D GRIDSIZE_VAL1=%d -D GRIDSIZE_VAL2=%d -D GRIDSIZE_VAL3=%d\
                -D SIZE_XY_VAL=%d -D ONE_OVER_CUTOFF2_VAL=%f",
                cutoff2, cutoff,
                params.gridSize[0], params.gridSize[1], params.gridSize[2],
                size_xy, _1overCutoff2
            );*/ 
  
  size_t program_length;
  const char *source_path = "src/opencl_base/sort.cl";
  char *source;

  // Dynamically allocate buffer for source
  source = oclLoadProgSource(source_path, "", &program_length);
  if(!source) {
    fprintf(stderr, "Could not load program source (%s)\n", __FILE__); exit(1);
  }
  	
  sort_program = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  	  	
  free(source);
  
  OCL_ERRCK_RETVAL ( clBuildProgram(sort_program, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) );  
  
  
  // Uncomment to get build log from compiler for debugging
  char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(sort_program, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
  
  
  splitSort = clCreateKernel(sort_program, "splitSort", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  splitRearrange = clCreateKernel(sort_program, "splitRearrange", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);      
  
  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 0, sizeof(int), &numElems) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 4, sizeof(cl_mem), (void *)&dhisto) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 0, sizeof(int), &numElems) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );
  OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 6, sizeof(cl_mem), (void *)&dhisto) );

  for (int i=0; i<iterations; i++){
  
    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 1, sizeof(int), &i) );
    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 2, sizeof(cl_mem), (void *)dkeysPtr) );
    OCL_ERRCK_RETVAL( clSetKernelArg(splitSort, 3, sizeof(cl_mem), (void *)dvaluesPtr) );    
    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitSort, 1, 0,
                            grid, block, 0, 0, 0) );
    
    scanLargeArray(((numElems+4*SORT_BS-1)/(4*SORT_BS))*(1<<BITS), dhisto, clContext, clCommandQueue, clDevice, workItemSizes);

    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 1, sizeof(int), &i ) );
    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 2, sizeof(cl_mem), (void *)dkeysPtr) );
    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 3, sizeof(cl_mem), (void *)dkeys_oPtr) );
    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 4, sizeof(cl_mem), (void *)dvaluesPtr) );
    OCL_ERRCK_RETVAL( clSetKernelArg(splitRearrange, 5, sizeof(cl_mem), (void *)dvalues_oPtr) );

    OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, splitRearrange, 1, 0,
                            grid, block, 0, 0, 0) );

    cl_mem* temp = dkeysPtr;
    dkeysPtr = dkeys_oPtr;
    dkeys_oPtr = temp;

    temp = dvaluesPtr;
    dvaluesPtr = dvalues_oPtr;
    dvalues_oPtr = temp;
  }
  
  OCL_ERRCK_RETVAL ( clReleaseKernel(splitSort) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(splitRearrange) );
  
  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dkeys_oPtr) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(*dvalues_oPtr) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(dhisto) );
  
  OCL_ERRCK_RETVAL ( clReleaseProgram(sort_program) );

}
Ejemplo n.º 29
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for 
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id *cdDevices = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        // User specified GPUs
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   
        ciDeviceCount = 0;
        while(deviceStr != NULL) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
			if( device == (cl_device_id) -1  ) {
				shrLog(" Device %s does not exist!\n", deviceStr);
				return -1;
			}
			
			shrLog("Device %s: ", deviceStr);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");
           
            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
                
            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
	    size_t nDeviceBytes;
	    ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
	    ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        } 

        // create command-queues
        for(unsigned int i = 0; i < ciDeviceCount; ++i) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");

            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }

    // Optional Command-line multiplier for matrix sizes
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); 
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
    uiWA = WA * iSizeMultiple;
    uiHA = HA * iSizeMultiple;
    uiWB = WB * iSizeMultiple;
    uiHB = HB * iSizeMultiple;
    uiWC = WC * iSizeMultiple;
    uiHC = HC * iSizeMultiple;
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", 
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC);

    // allocate host memory for matrices A and B
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A_data = (float*)malloc(mem_size_A);
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B_data = (float*)malloc(mem_size_B);

    // initialize host memory
    srand(2006);
    shrFillArray(h_A_data, size_A);
    shrFillArray(h_B_data, size_B);

    // allocate host memory for result
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // create OpenCL buffer pointing to the host memory
    cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
				    mem_size_A, h_A_data, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: clCreateBuffer\n");
        return ciErrNum;
    }

    // Program Setup
    size_t program_length;
    const char* header_path = shrFindFilePath("matrixMul.h", argv[0]);
    oclCheckError(header_path != NULL, shrTRUE);
    char* header = oclLoadProgSource(header_path, "", &program_length);
    if(!header)
    {
        shrLog("Error: Failed to load the header %s!\n", header_path);
        return -1000;
    }
    const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, header, &program_length);
    if(!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", source_path);
        return -2000;
    }

    // create the program
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, 
                                                    &program_length, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        return ciErrNum;
    }
    free(header);
    free(source);
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
        return ciErrNum;
    }

    // write out PTX if requested on the command line
    if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") )
    {
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
    }

    // Create Kernel
    for(unsigned int i = 0; i < ciDeviceCount; ++i) {
        multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog("Error: Failed to create kernel\n");
            return ciErrNum;
        }
    }
        
    // Run multiplication on 1..deviceCount GPUs to compare improvement
    shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount);
    for(unsigned int k = 1; k <= ciDeviceCount; ++k) 
    {
        matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C);
    }

    // compute reference solution
    shrLog("Comparing results with CPU computation... \n\n");
    float* reference = (float*) malloc(mem_size_C);
    computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB);

    // check result
    shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (res != shrTRUE) 
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }

    // clean up OCL resources
    ciErrNum = clReleaseMemObject(h_A);
    for(unsigned int k = 0; k < ciDeviceCount; ++k) 
    {
        ciErrNum |= clReleaseKernel( multiplicationKernel[k] );
        ciErrNum |= clReleaseCommandQueue( commandQueue[k] );
    }
    ciErrNum |= clReleaseProgram(cpProgram);
    ciErrNum |= clReleaseContext(cxGPUContext);
    if(ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum);
        return ciErrNum;
    }

    // clean up memory
    free(h_A_data);
    free(h_B_data);
    free(h_C);
    free(reference);
    
    return ((shrTRUE == res) ? CL_SUCCESS : -3000);
}
    // Function to read in kernel from uncompiled source, create the OCL program and build the OCL program 
    // **************************************************************************************************
    int CreateProgramAndKernel(cl_context cxGPUContext, cl_device_id* cdDevices, const char *kernel_name, cl_kernel *kernel, bool bDouble)
    {
        cl_program cpProgram;
        size_t szSourceLen;
        cl_int ciErrNum = CL_SUCCESS; 

        // Read the kernel in from file
        shrLog("\nLoading Uncompiled kernel from .cl file, using %s\n", clSourcefile);
        char* cPathAndFile = shrFindFilePath(clSourcefile, cExecutablePath);
        oclCheckError(cPathAndFile != NULL, shrTRUE);
        char* pcSource = oclLoadProgSource(cPathAndFile, "", &szSourceLen);
        oclCheckError(pcSource != NULL, shrTRUE);

	// Check OpenCL version -> vec3 types are supported only from version 1.1 and above
	char cOCLVersion[32];
	clGetDeviceInfo(cdDevices[0], CL_DEVICE_VERSION, sizeof(cOCLVersion), &cOCLVersion, 0);

	int iVec3Length = 3;
	if( strncmp("OpenCL 1.0", cOCLVersion, 10) == 0 ) {
		iVec3Length = 4;
	}


		//for double precision
		char *pcSourceForDouble;
		std::stringstream header;
		if (bDouble)
		{
			header << "#define REAL double";
			header << std::endl;
			header << "#define REAL4 double4";
			header << std::endl;
			header << "#define REAL3 double" << iVec3Length;
			header << std::endl;
			header << "#define ZERO3 {0.0, 0.0, 0.0" << ((iVec3Length == 4) ? ", 0.0}" : "}");
			header << std::endl;
		}
		else
		{
			header << "#define REAL float";
			header << std::endl;
			header << "#define REAL4 float4";
			header << std::endl;
			header << "#define REAL3 float" << iVec3Length;
			header << std::endl;
			header << "#define ZERO3 {0.0f, 0.0f, 0.0f" << ((iVec3Length == 4) ? ", 0.0f}" : "}");
			header << std::endl;
		}
		
		header << pcSource;
		pcSourceForDouble = (char *)malloc(header.str().size() + 1);
		szSourceLen = header.str().size();
#ifdef WIN32
        strcpy_s(pcSourceForDouble, szSourceLen + 1, header.str().c_str());
#else
        strcpy(pcSourceForDouble, header.str().c_str());
#endif

        // create the program 
        cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&pcSourceForDouble, &szSourceLen, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateProgramWithSource\n"); 

        // Build the program with 'mad' Optimization option
#ifdef MAC
	char *flags = "-cl-fast-relaxed-math -DMAC";
#else
	char *flags = "-cl-fast-relaxed-math";
#endif
        ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
            // write out standard error, Build Log and PTX, then cleanup and exit
            shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
            oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
            oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclNbody.ptx");
            oclCheckError(ciErrNum, CL_SUCCESS); 
        }
        shrLog("clBuildProgram\n"); 

        // create the kernel
        *kernel = clCreateKernel(cpProgram, kernel_name, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS); 
        shrLog("clCreateKernel\n"); 

		size_t wgSize;
		ciErrNum = clGetKernelWorkGroupInfo(*kernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
		if (wgSize == 64) {
		  shrLog(
			 "ERROR: Minimum work-group size 256 required by this application is not supported on this device.\n");
		  exit(0);
		}
	
		free(pcSourceForDouble);

        return 0;
    }