void b2CLCommonData::initReadLastJointImpulses() { if (b2clGlobal_OpenCLSupported) { printf("Initializing b2CLCommonData...\n"); int err; //load opencl programs from files char* commonDataSource = 0; size_t commonDataSourceLen = 0; shrLog("...loading b2CLCommonData.cl\n"); #ifdef linux commonDataSource = b2clLoadProgSource(shrFindFilePath("/opt/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #elif defined (_WIN32) commonDataSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #elif defined (__EMSCRIPTEN__) commonDataSource = b2clLoadProgSource(shrFindFilePath("./Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #else commonDataSource = b2clLoadProgSource(shrFindFilePath("../../../Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #endif if(commonDataSource == NULL) { b2Log("Could not load program source, is path 'b2CLCommonData.cl' correct?"); } //create the compute program from source kernel code commonDataProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**) &commonDataSource, NULL, &err); if (!commonDataProgram) { printf("Error: Failed to create compute program!\n"); exit(1); } //build the program err = clBuildProgram(commonDataProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[20480]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(commonDataProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } //create the compute kernel readLastJointImpulseKernel = clCreateKernel(commonDataProgram, "ReadLastJointImpulses", &err); if (!readLastJointImpulseKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(readLastJointImpulseKernel, maxWorkGroupSizeForreadLastJointImpulse); } }
extern "C" void initHistogram64(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){ cl_int ciErrNum; size_t kernelLength; shrLog("...loading Histogram64.cl from file\n"); char *cHistogram64 = oclLoadProgSource(shrFindFilePath("Histogram64.cl", argv[0]), "// My comment\n", &kernelLength); shrCheckError(cHistogram64 != NULL, shrTRUE); shrLog("...creating histogram64 program\n"); cpHistogram64 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cHistogram64, &kernelLength, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...building histogram64 program\n"); ciErrNum = clBuildProgram(cpHistogram64, 0, NULL, compileOptions, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...creating histogram64 kernels\n"); ckHistogram64 = clCreateKernel(cpHistogram64, "histogram64", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); ckMergeHistogram64 = clCreateKernel(cpHistogram64, "mergeHistogram64", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...allocating internal histogram64 buffer\n"); d_PartialHistograms = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint), NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); //Save default command queue cqDefaultCommandQue = cqParamCommandQue; //Discard temp storage free(cHistogram64); //Save ptx code to separate file oclLogPtx(cpHistogram64, oclGetFirstDev(cxGPUContext), "Histogram64.ptx"); }
extern "C" void initConvolutionSeparable(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){ cl_int ciErrNum; size_t kernelLength; shrLog("Loading ConvolutionSeparable.cl...\n"); char *cPathAndName = shrFindFilePath("ConvolutionSeparable.cl", argv[0]); oclCheckError(cPathAndName != NULL, shrTRUE); char *cConvolutionSeparable = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength); oclCheckError(cConvolutionSeparable != NULL, shrTRUE); shrLog("Creating convolutionSeparable program...\n"); cpConvolutionSeparable = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cConvolutionSeparable, &kernelLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Building convolutionSeparable program...\n"); char compileOptions[2048]; #ifdef _WIN32 sprintf_s(compileOptions, 2048, "\ -cl-fast-relaxed-math \ -D KERNEL_RADIUS=%u\ -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\ -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\ -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\ -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\ ", KERNEL_RADIUS, ROWS_BLOCKDIM_X, COLUMNS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_Y, ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS, ROWS_HALO_STEPS, COLUMNS_HALO_STEPS ); #else sprintf(compileOptions, "\ -cl-fast-relaxed-math \ -D KERNEL_RADIUS=%u\ -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\ -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\ -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\ -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\ ", KERNEL_RADIUS, ROWS_BLOCKDIM_X, COLUMNS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_Y, ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS, ROWS_HALO_STEPS, COLUMNS_HALO_STEPS ); #endif ciErrNum = clBuildProgram(cpConvolutionSeparable, 0, NULL, compileOptions, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); ckConvolutionRows = clCreateKernel(cpConvolutionSeparable, "convolutionRows", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckConvolutionColumns = clCreateKernel(cpConvolutionSeparable, "convolutionColumns", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cqDefaultCommandQueue = cqParamCommandQueue; free(cConvolutionSeparable); }
extern "C" void initScan(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv) { cl_int ciErrNum; size_t kernelLength; shrLog(" ...loading Scan.cl\n"); char *cScan = oclLoadProgSource(shrFindFilePath("Scan.cl", argv[0]), "// My comment\n", &kernelLength); oclCheckError(cScan != NULL, shrTRUE); shrLog(" ...creating scan program\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cScan, &kernelLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" ...building scan program\n"); ciErrNum = clBuildProgram(cpProgram, 0, NULL, compileOptions, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclScan.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog(" ...creating scan kernels\n"); ckScanExclusiveLocal1 = clCreateKernel(cpProgram, "scanExclusiveLocal1", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckScanExclusiveLocal2 = clCreateKernel(cpProgram, "scanExclusiveLocal2", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckUniformUpdate = clCreateKernel(cpProgram, "uniformUpdate", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog( " ...checking minimum supported workgroup size\n"); //Check for work group size cl_device_id device; size_t szScanExclusiveLocal1, szScanExclusiveLocal2, szUniformUpdate; ciErrNum = clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL); ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal1, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal1, NULL); ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal2, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal2, NULL); ciErrNum |= clGetKernelWorkGroupInfo(ckUniformUpdate, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szUniformUpdate, NULL); oclCheckError(ciErrNum, CL_SUCCESS); if( (szScanExclusiveLocal1 < WORKGROUP_SIZE) || (szScanExclusiveLocal2 < WORKGROUP_SIZE) || (szUniformUpdate < WORKGROUP_SIZE) ) { shrLog("ERROR: Minimum work-group size %u required by this application is not supported on this device.\n", WORKGROUP_SIZE); exit(0); } shrLog(" ...allocating internal buffers\n"); d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (MAX_BATCH_ELEMENTS / (4 * WORKGROUP_SIZE)) * sizeof(uint), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Discard temp storage free(cScan); }
bool CheckRender::PPMvsPPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold ) { char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { shrLog("\nCheckRender::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath); return false; } if (src_file == NULL || ref_file == NULL) { shrLog("\nCheckRender::PPMvsPPM: Aborting comparison\n"); return false; } return (shrComparePPM(src_file, ref_file_path, epsilon, threshold) == shrTRUE ? true : false); }
bool CheckRender::PGMvsPGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold ) { unsigned char *src_data = NULL, *ref_data = NULL; unsigned long error_count = 0; unsigned int width, height; char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { printf("CheckRender::PGMvsPGM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf("Aborting comparison!\n"); printf(" FAILED!\n"); error_count++; } else { if (src_file == NULL || ref_file_path == NULL) { printf("PGMvsPGM: Aborting comparison\n"); return false; } printf(" src_file <%s>\n", src_file); printf(" ref_file <%s>\n", ref_file_path); if (shrLoadPGMub(ref_file_path, &ref_data, &width, &height) != shrTRUE) { printf("PGMvsPGM: unable to load ref image file: %s\n", ref_file_path); return false; } if (shrLoadPGMub(src_file, &src_data, &width, &height) != shrTRUE) { printf("PGMvsPGM: unable to load src image file: %s\n", src_file); return false; } printf("PGMvsPGM: comparing images size (%d,%d) epsilon(%2.4f), threshold(%4.2f%%)\n", m_Height, m_Width, epsilon, threshold*100); if (shrCompareubt( ref_data, src_data, m_Height*m_Width, epsilon, threshold ) == shrFALSE) { error_count = 1; } } if (error_count == 0) { printf(" PASSED!\n"); } else { printf(" FAILED! %d errors...\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
void loadImageData(int argc, char **argv) { // load image (needed so we can get the width and height before we create the window char* image_path = NULL; if (argc >= 1) image_path = shrFindFilePath(image_filename, argv[0]); if (image_path == 0) { shrLog("Error finding image file '%s'\n", image_filename); exit(EXIT_FAILURE); } cutilCheckError(cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height)); if (!h_img) { shrLog("Error opening file '%s'\n", image_path); exit(-1); } shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); }
//----------------------------------------------------------------------------- // Name: CreateKernelProgram() // Desc: Creates OpenCL program and kernel instances //----------------------------------------------------------------------------- HRESULT CreateKernelProgram( const char *exepath, const char *clName, const char *clPtx, const char *kernelEntryPoint, cl_program &cpProgram, cl_kernel &ckKernel ) { // Program Setup size_t program_length; const char* source_path = shrFindFilePath(clName, exepath); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckErrorEX(source != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); free(source); // build the program #ifdef USE_STAGING_BUFFER static char *opts = "-cl-fast-relaxed-math -DUSE_STAGING_BUFFER"; #else static char *opts = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, opts, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), clPtx); Cleanup(EXIT_FAILURE); } // create the kernel ckKernel = clCreateKernel(cpProgram, kernelEntryPoint, &ciErrNum); if (!ckKernel) { Cleanup(EXIT_FAILURE); } // set the args values return ciErrNum ? E_FAIL : S_OK; }
bool CheckRender::PPMvsPPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold ) { unsigned long error_count = 0; char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { printf("CheckRender::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf("Aborting comparison!\n"); printf(" FAILED!\n"); error_count++; } if (src_file == NULL || ref_file_path == NULL) { printf("PPMvsPPM: Aborting comparison\n"); return false; } printf(" src_file <%s>\n", src_file); printf(" ref_file <%s>\n", ref_file_path); return (shrComparePPM( src_file, ref_file_path, epsilon, threshold, true ) == shrTRUE ? true : false); }
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){ cl_int ciErrNum; size_t kernelLength; shrLog(LOGBOTH, 0, "...loading BlackScholes.cl\n"); char *cBlackScholes = oclLoadProgSource(shrFindFilePath("BlackScholes.cl", argv[0]), "// My comment\n", &kernelLength); shrCheckError(cBlackScholes != NULL, shrTRUE); shrLog(LOGBOTH, 0, "...creating BlackScholes program\n"); cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); shrLog(LOGBOTH, 0, "...building BlackScholes program\n"); ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, NULL, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog(LOGBOTH, 0, "...creating BlackScholes kernels\n"); ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); cqDefaultCommandQueue = cqParamCommandQueue; free(cBlackScholes); }
// Main function // ********************************************************************* int ymain(int argc, char **argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclVectorAdd2.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); shrLog("Looking for: %s in Path: %s\n", cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (argc, argv, (bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE); }
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){ cl_int ciErrNum; size_t kernelLength; shrLog("...loading BlackScholes.cl\n"); char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]); shrCheckError(cPathAndName != NULL, shrTRUE); char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength); shrCheckError(cBlackScholes != NULL, shrTRUE); shrLog("...creating BlackScholes program\n"); cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...building BlackScholes program\n"); ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL); if(ciErrNum != CL_BUILD_SUCCESS){ shrLog("*** Compilation failure ***\n"); size_t deviceNum; cl_device_id *cdDevices; ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &deviceNum); shrCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(deviceNum * sizeof(cl_device_id)); shrCheckError(cdDevices != NULL, shrTRUE); ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, deviceNum * sizeof(cl_device_id), cdDevices, NULL); shrCheckError(ciErrNum, CL_SUCCESS); size_t logSize; char *logTxt; ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); shrCheckError(ciErrNum, CL_SUCCESS); logTxt = (char *)malloc(logSize); shrCheckError(logTxt != NULL, shrTRUE); ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("%s\n", logTxt); shrLog("*** Exiting ***\n"); free(logTxt); free(cdDevices); exit(666); } //Save ptx code to separate file oclLogPtx(cpBlackScholes, oclGetFirstDev(cxGPUContext), "BlackScholes.ptx"); shrLog("...creating BlackScholes kernels\n"); ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); cqDefaultCommandQueue = cqParamCommandQueue; free(cBlackScholes); free(cPathAndName); }
bool fdtdGPU(float *output, const float *input, const float *coeff, const int dimx, const int dimy, const int dimz, const int radius, const int timesteps, const int argc, const char **argv) { bool ok = true; const int outerDimx = dimx + 2 * radius; const int outerDimy = dimy + 2 * radius; const int outerDimz = dimz + 2 * radius; const size_t volumeSize = outerDimx * outerDimy * outerDimz; cl_context context = 0; cl_platform_id platform = 0; cl_device_id *devices = 0; cl_command_queue commandQueue = 0; cl_mem bufferOut = 0; cl_mem bufferIn = 0; cl_mem bufferCoeff = 0; cl_program program = 0; cl_kernel kernel = 0; cl_event *kernelEvents = 0; #ifdef GPU_PROFILING cl_ulong kernelEventStart; cl_ulong kernelEventEnd; #endif double hostElapsedTimeS; char *cPathAndName = 0; char *cSourceCL = 0; size_t szKernelLength; size_t globalWorkSize[2]; size_t localWorkSize[2]; cl_uint deviceCount = 0; cl_uint targetDevice = 0; cl_int errnum = 0; char buildOptions[128]; // Ensure that the inner data starts on a 128B boundary const int padding = (128 / sizeof(float)) - radius; const size_t paddedVolumeSize = volumeSize + padding; #ifdef GPU_PROFILING const int profileTimesteps = timesteps - 1; if (ok) { if (profileTimesteps < 1) { shrLog(" cannot profile with fewer than two timesteps (timesteps=%d), profiling is disabled.\n", timesteps); } } #endif // Get the NVIDIA platform if (ok) { shrLog(" oclGetPlatformID...\n"); errnum = oclGetPlatformID(&platform); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("oclGetPlatformID (returned %d).\n", errnum); ok = false; } } // Get the list of GPU devices associated with the platform if (ok) { shrLog(" clGetDeviceIDs"); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount); devices = (cl_device_id *)malloc(deviceCount * sizeof(cl_device_id) ); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, deviceCount, devices, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetDeviceIDs (returned %d).\n", errnum); ok = false; } } // Create the OpenCL context if (ok) { shrLog(" clCreateContext...\n"); context = clCreateContext(0, deviceCount, devices, NULL, NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateContext (returned %d).\n", errnum); ok = false; } } // Select target device (device 0 by default) if (ok) { char *device = 0; if (shrGetCmdLineArgumentstr(argc, argv, "device", &device)) { targetDevice = (cl_uint)atoi(device); if (targetDevice >= deviceCount) { shrLogEx(LOGBOTH | ERRORMSG, -2001, STDERROR); shrLog("invalid target device specified on command line (device %d does not exist).\n", targetDevice); ok = false; } } else { targetDevice = 0; } if (device) { free(device); } } // Create a command-queue if (ok) { shrLog(" clCreateCommandQueue\n"); commandQueue = clCreateCommandQueue(context, devices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateCommandQueue (returned %d).\n", errnum); ok = false; } } // Create memory buffer objects if (ok) { shrLog(" clCreateBuffer bufferOut\n"); bufferOut = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } if (ok) { shrLog(" clCreateBuffer bufferIn\n"); bufferIn = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } if (ok) { shrLog(" clCreateBuffer bufferCoeff\n"); bufferCoeff = clCreateBuffer(context, CL_MEM_READ_ONLY, (radius + 1) * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } // Load the kernel from file if (ok) { shrLog(" shrFindFilePath\n"); cPathAndName = shrFindFilePath(clSourceFile, argv[0]); if (cPathAndName == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2002, STDERROR); shrLog("shrFindFilePath returned null.\n"); ok = false; } } if (ok) { shrLog(" oclLoadProgSource\n"); cSourceCL = oclLoadProgSource(cPathAndName, "// Preamble\n", &szKernelLength); if (cSourceCL == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2003, STDERROR); shrLog("oclLoadProgSource returned null.\n"); ok = false; } } // Create the program if (ok) { shrLog(" clCreateProgramWithSource\n"); program = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, &szKernelLength, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateProgramWithSource (returned %d).\n", errnum); ok = false; } } // Check for a command-line specified work group size size_t userWorkSize; int localWorkMaxY; if (ok) { int userWorkSizeInt; if (shrGetCmdLineArgumenti(argc, argv, "work-group-size", &userWorkSizeInt)) { // We can't clamp to CL_KERNEL_WORK_GROUP_SIZE yet since that is // dependent on the build. if (userWorkSizeInt < k_localWorkMin || userWorkSizeInt > k_localWorkMax) { shrLogEx(LOGBOTH | ERRORMSG, -2004, STDERROR); shrLog("invalid work group size specified on command line (must be between %d and %d).\n", k_localWorkMin, k_localWorkMax); ok = false; } // Constrain to a multiple of k_localWorkX userWorkSize = (userWorkSizeInt / k_localWorkX * k_localWorkX); } else { userWorkSize = k_localWorkY * k_localWorkX; } // Divide by k_localWorkX (integer division to clamp) localWorkMaxY = userWorkSize / k_localWorkX; } // Build the program if (ok) { #ifdef WIN32 if (sprintf_s(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0) { shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR); shrLog("sprintf_s (failed).\n"); ok = false; } #else if (snprintf(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0) { shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR); shrLog("snprintf (failed).\n"); ok = false; } #endif } if (ok) { shrLog(" clBuildProgram (%s)\n", buildOptions); errnum = clBuildProgram(program, 0, NULL, buildOptions, NULL, NULL); if (errnum != CL_SUCCESS) { char buildLog[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clBuildProgram (returned %d).\n", errnum); shrLog("Log:\n%s\n", buildLog); ok = false; } } // Create the kernel if (ok) { shrLog(" clCreateKernel\n"); kernel = clCreateKernel(program, "FiniteDifferences", &errnum); if (kernel == (cl_kernel)NULL || errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateKernel (returned %d).\n", errnum); ok = false; } } // Get the maximum work group size size_t maxWorkSize; if (ok) { shrLog(" clGetKernelWorkGroupInfo\n"); errnum = clGetKernelWorkGroupInfo(kernel, devices[targetDevice], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkSize, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetKernelWorkGroupInfo (returned %d).\n", errnum); ok = false; } } // Set the work group size if (ok) { userWorkSize = CLAMP(userWorkSize, k_localWorkMin, maxWorkSize); localWorkSize[0] = k_localWorkX; localWorkSize[1] = userWorkSize / k_localWorkX; globalWorkSize[0] = localWorkSize[0] * (unsigned int)ceil((float)dimx / localWorkSize[0]); globalWorkSize[1] = localWorkSize[1] * (unsigned int)ceil((float)dimy / localWorkSize[1]); shrLog(" set local work group size to %dx%d\n", localWorkSize[0], localWorkSize[1]); shrLog(" set total work size to %dx%d\n", globalWorkSize[0], globalWorkSize[1]); } // Copy the input to the device input buffer if (ok) { shrLog(" clEnqueueWriteBuffer bufferIn\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferIn, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferIn (returned %d).\n", errnum); ok = false; } } // Copy the input to the device output buffer (actually only need the halo) if (ok) { shrLog(" clEnqueueWriteBuffer bufferOut\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferOut, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferOut (returned %d).\n", errnum); ok = false; } } // Copy the coefficients to the device coefficient buffer if (ok) { shrLog(" clEnqueueWriteBuffer bufferCoeff\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferCoeff, CL_TRUE, 0, (radius + 1) * sizeof(float), coeff, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferCoeff (returned %d).\n", errnum); ok = false; } } // Allocate the events if (ok) { shrLog(" calloc events\n"); if ((kernelEvents = (cl_event *)calloc(timesteps, sizeof(cl_event))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2006, STDERROR); shrLog("Insufficient memory for events calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Start the clock shrDeltaT(0); // Set the constant arguments if (ok) { shrLog(" clSetKernelArg 2-6\n"); errnum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&bufferCoeff); errnum |= clSetKernelArg(kernel, 3, sizeof(int), &dimx); errnum |= clSetKernelArg(kernel, 4, sizeof(int), &dimy); errnum |= clSetKernelArg(kernel, 5, sizeof(int), &dimz); errnum |= clSetKernelArg(kernel, 6, sizeof(int), &padding); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clSetKernelArg 2-6 (returned %d).\n", errnum); ok = false; } } // Execute the FDTD cl_mem bufferSrc = bufferIn; cl_mem bufferDst = bufferOut; if (ok) { shrLog(" GPU FDTD loop\n"); } for (int it = 0 ; ok && it < timesteps ; it++) { shrLog("\tt = %d ", it); // Set the dynamic arguments if (ok) { shrLog(" clSetKernelArg 0-1,"); errnum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&bufferDst); errnum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bufferSrc); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clSetKernelArg 0-1 (returned %d).\n", errnum); ok = false; } } // Launch the kernel if (ok) { shrLog(" clEnqueueNDRangeKernel\n"); errnum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernelEvents[it]); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueNDRangeKernel (returned %d).\n", errnum); ok = false; } } // Toggle the buffers cl_mem tmp = bufferSrc; bufferSrc = bufferDst; bufferDst = tmp; } if (ok) shrLog("\n"); // Wait for the kernel to complete if (ok) { shrLog(" clWaitForEvents\n"); errnum = clWaitForEvents(1, &kernelEvents[timesteps-1]); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clWaitForEvents (returned %d).\n", errnum); ok = false; } } // Stop the clock hostElapsedTimeS = shrDeltaT(0); // Read the result back, result is in bufferSrc (after final toggle) if (ok) { shrLog(" clEnqueueReadBuffer\n"); errnum = clEnqueueReadBuffer(commandQueue, bufferSrc, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), output, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueReadBuffer bufferSrc (returned %d).\n", errnum); ok = false; } } // Report time #ifdef GPU_PROFILING double elapsedTime = 0.0; if (ok && profileTimesteps > 0) shrLog(" Collect profile information\n"); for (int it = 1 ; ok && it <= profileTimesteps ; it++) { shrLog("\tt = %d ", it); shrLog(" clGetEventProfilingInfo,", it); errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelEventStart, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetEventProfilingInfo (returned %d).\n", errnum); ok = false; } shrLog(" clGetEventProfilingInfo\n", it); errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEventEnd, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetEventProfilingInfo (returned %d).\n", errnum); ok = false; } elapsedTime += (double)kernelEventEnd - (double)kernelEventStart; } if (ok && profileTimesteps > 0) { shrLog("\n"); // Convert nanoseconds to seconds elapsedTime *= 1.0e-9; double avgElapsedTime = elapsedTime / (double)profileTimesteps; // Determine number of computations per timestep size_t pointsComputed = dimx * dimy * dimz; // Determine throughput double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime; shrLogEx(LOGBOTH | MASTER, 0, "oclFDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, NumDevsUsed = %i, Workgroup = %u\n", throughputM, avgElapsedTime, pointsComputed, 1, localWorkSize[0] * localWorkSize[1]); } #endif // Cleanup if (kernelEvents) { for (int it = 0 ; it < timesteps ; it++) { if (kernelEvents[it]) clReleaseEvent(kernelEvents[it]); } free(kernelEvents); } if (kernel) clReleaseKernel(kernel); if (program) clReleaseProgram(program); if (cSourceCL) free(cSourceCL); if (cPathAndName) free(cPathAndName); if (bufferCoeff) clReleaseMemObject(bufferCoeff); if (bufferIn) clReleaseMemObject(bufferIn); if (bufferOut) clReleaseMemObject(bufferOut); if (commandQueue) clReleaseCommandQueue(commandQueue); if (devices) free(devices); if (context) clReleaseContext(context); return ok; }
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclMatVecMul.txt"); shrLog("%s Starting...\n\n", argv[0]); // calculate matrix height given GPU memory shrLog("Determining Matrix height from available GPU mem...\n"); memsize_t memsize; getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv); height = memsize/width/16; if (height > MAX_HEIGHT) height = MAX_HEIGHT; shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height); // Allocate and initialize host arrays shrLog("Allocate and Init Host Mem...\n\n"); unsigned int size = width * height; unsigned int mem_size_M = size * sizeof(float); M = (float*)malloc(mem_size_M); unsigned int mem_size_V = width * sizeof(float); V = (float*)malloc(mem_size_V); unsigned int mem_size_W = height * sizeof(float); W = (float*)malloc(mem_size_W); shrFillArray(M, size); shrFillArray(V, width); Golden = (float*)malloc(mem_size_W); MatVecMulHost(M, V, width, height, Golden); //Get the NVIDIA platform shrLog("Get the Platform ID...\n\n"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); //Get all the devices shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set target device and Query number of compute units on targetDevice shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice)== shrTRUE) { targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", targetDevice); oclPrintDevName(LOGBOTH, cdDevices[targetDevice]); cl_uint num_compute_units; clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL); shrLog("\n # of Compute Units = %u\n\n", num_compute_units); //Create the context shrLog("clCreateContext...\n"); cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_m = %u)...\n", mem_size_M); cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx"); shrQAFinish(argc, (const char **)argv, QA_FAILED); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (M and V)...\n\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Kernels const char* kernels[] = { "MatVecMulUncoalesced0", "MatVecMulUncoalesced1", "MatVecMulCoalesced0", "MatVecMulCoalesced1", "MatVecMulCoalesced2", "MatVecMulCoalesced3" }; for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k) { shrLog("Running with Kernel %s...\n\n", kernels[k]); // Clear result shrLog(" Clear result with clEnqueueWriteBuffer (W)...\n"); memset(W, 0, mem_size_W); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the kernel shrLog(" clCreateKernel...\n"); if (ckKernel) { clReleaseKernel(ckKernel); ckKernel = 0; } ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set and log Global and Local work size dimensions szLocalWorkSize = 256; if (k == 0) szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height); // rounded up to the nearest multiple of the LocalWorkSize else // Some experiments should be done here for determining the best global work size for a given device // We will assume here that we can run 2 work-groups per compute unit szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize; shrLog(" Global Work Size \t\t= %u\n Local Work Size \t\t= %u\n # of Work Groups \t\t= %u\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Set the Argument values shrLog(" clSetKernelArg...\n\n"); int n = 0; ciErrNum = clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmM); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height); ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW); if (k > 1) ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog(" clEnqueueNDRangeKernel (%s)...\n", kernels[k]); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog(" clEnqueueReadBuffer (W)...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #ifdef GPU_PROFILING // Execution time ciErrNum = clWaitForEvents(1, &ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cl_ulong start, end; ciErrNum = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); double dSeconds = 1.0e-9 * (double)(end - start); shrLog(" Kernel execution time: %.5f s\n\n", dSeconds); #endif // Compare results for golden-host and report errors and pass/fail shrLog(" Comparing against Host/C++ computation...\n\n"); shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f); shrLog(" GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); bPassFlag &= (res == shrTRUE); // Release event ciErrNum = clReleaseEvent(ceEvent); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ceEvent = 0; } // Master status Pass/Fail (all tests) shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED) ); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
// Main function // ********************************************************************* int main(int argc, char **argv) { //shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("Barrier.txt"); printf("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = NUM_THREADS ; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize printf("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); printf("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); printf("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); printf("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1); printf("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file printf("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); printf("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); printf("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1); printf("clCreateKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate and initialize host arrays printf( "Allocate and Init Host Mem...\n"); input = (int *)malloc(sizeof(int) * NUM_BLOCKS); for(int i =0; i<=NUM_BLOCKS; i++) { input[i]=0; } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out); // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements); printf("clSetKernelArg 0 - 2...\n\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL); printf("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); printf("clEnqueueNDRangeKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL); printf("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); if (ciErr1 != CL_SUCCESS) { printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); }*/ //GPU_PROFILING ciErr1=clWaitForEvents(1, &ceEvent); if (ciErr1 != CL_SUCCESS) { printf("Error 1 !\n\n"); Cleanup(argc, argv, EXIT_FAILURE); } cl_ulong start, end; ciErr1 = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (ciErr1 != CL_SUCCESS) { printf("Error 2 !\n\n"); Cleanup(argc, argv, EXIT_FAILURE); } double dSeconds = 1.0e-9 * (double)(end - start); printf("Done! time taken %llu \n",end - start ); // printf("Done! Kernel execution time: %.5f s\n\n", dSeconds); // Release event clReleaseEvent(ceEvent); ceEvent = 0; Cleanup (argc, argv, EXIT_SUCCESS); }
// Function to read in kernel from uncompiled source, create the OCL program and build the OCL program // ************************************************************************************************** int CreateProgramAndKernel(cl_context cxGPUContext, cl_device_id* cdDevices, const char *kernel_name, cl_kernel *kernel, bool bDouble) { cl_program cpProgram; size_t szSourceLen; cl_int ciErrNum = CL_SUCCESS; // Read the kernel in from file shrLog("\nLoading Uncompiled kernel from .cl file, using %s\n", clSourcefile); char* cPathAndFile = shrFindFilePath(clSourcefile, cExecutablePath); oclCheckError(cPathAndFile != NULL, shrTRUE); char* pcSource = oclLoadProgSource(cPathAndFile, "", &szSourceLen); oclCheckError(pcSource != NULL, shrTRUE); // Check OpenCL version -> vec3 types are supported only from version 1.1 and above char cOCLVersion[32]; clGetDeviceInfo(cdDevices[0], CL_DEVICE_VERSION, sizeof(cOCLVersion), &cOCLVersion, 0); int iVec3Length = 3; if( strncmp("OpenCL 1.0", cOCLVersion, 10) == 0 ) { iVec3Length = 4; } //for double precision char *pcSourceForDouble; std::stringstream header; if (bDouble) { header << "#define REAL double"; header << std::endl; header << "#define REAL4 double4"; header << std::endl; header << "#define REAL3 double" << iVec3Length; header << std::endl; header << "#define ZERO3 {0.0, 0.0, 0.0" << ((iVec3Length == 4) ? ", 0.0}" : "}"); header << std::endl; } else { header << "#define REAL float"; header << std::endl; header << "#define REAL4 float4"; header << std::endl; header << "#define REAL3 float" << iVec3Length; header << std::endl; header << "#define ZERO3 {0.0f, 0.0f, 0.0f" << ((iVec3Length == 4) ? ", 0.0f}" : "}"); header << std::endl; } header << pcSource; pcSourceForDouble = (char *)malloc(header.str().size() + 1); szSourceLen = header.str().size(); #ifdef WIN32 strcpy_s(pcSourceForDouble, szSourceLen + 1, header.str().c_str()); #else strcpy(pcSourceForDouble, header.str().c_str()); #endif // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&pcSourceForDouble, &szSourceLen, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclNbody.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // create the kernel *kernel = clCreateKernel(cpProgram, kernel_name, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\n"); size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(*kernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (wgSize == 64) { shrLog( "ERROR: Minimum work-group size 256 required by this application is not supported on this device.\n"); exit(0); } free(pcSourceForDouble); return 0; }
b2CLSolveTOI::b2CLSolveTOI() { printf("Initializing b2CLSolveTOI...\n"); int err; //load opencl programs from files char* TOIKernelSource = 0; size_t TOIKernelSourceLen = 0; shrLog("...loading b2CLSolveTOI.cl\n"); #ifdef linux TOIKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLSolveTOI.cl", NULL), "// My comment\n", &TOIKernelSourceLen); #elif defined (_WIN32) TOIKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLSolveTOI.cl", NULL), "// My comment\n", &TOIKernelSourceLen); #elif defined (__EMSCRIPTEN__) TOIKernelSource = b2clLoadProgSource(shrFindFilePath("./Common/OpenCL/b2CLSolveTOI.cl", NULL), "// My comment\n", &TOIKernelSourceLen); #else TOIKernelSource = b2clLoadProgSource(shrFindFilePath("../../../Box2D/Common/OpenCL/b2CLSolveTOI.cl", NULL), "// My comment\n", &TOIKernelSourceLen); #endif if(TOIKernelSource == NULL) { b2Log("Could not load program source, is path 'b2CLSolveTOI.cl' correct?"); } //create the compute program from source kernel code SolveTOIProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**)&TOIKernelSource, NULL, &err); if (!SolveTOIProgram) { printf("Error: Failed to create compute program!\n"); exit(1); } //build the program err = clBuildProgram(SolveTOIProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[204800]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(SolveTOIProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } #if 0 //create the compute kernel ComputeTOIKernel = clCreateKernel(SolveTOIProgram, "b2clComputeTOI", &err); if (!ComputeTOIKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(ComputeTOIKernel, ComputeTOIKernelWorkGroupSize); lastNumContacts = 0; sortCount = 0; ContactTOIsBuffer = NULL; ContactIndicesBuffer = NULL; //create the AdvanceBodiesKernel AdvanceBodiesKernel = clCreateKernel (SolveTOIProgram, "b2clAdvanceBodiesKernel", &err); if (!AdvanceBodiesKernel || err != CL_SUCCESS) { printf("Error: Failed to create AdvanceBodiesKernel kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(AdvanceBodiesKernel, ComputeTOIKernelWorkGroupSize); this->MarkConnectedContactsKernel = clCreateKernel (SolveTOIProgram, "b2clMarkConnectedContactsKernel", &err); if (!MarkConnectedContactsKernel || err != CL_SUCCESS) { printf("Error: Failed to create MarkConnectedContactsKernel kernel!\n"); exit(1); } #endif syncMovedBodyKernel = clCreateKernel (SolveTOIProgram, "b2clsyncMovedBodyKernel", &err); if (!syncMovedBodyKernel || err != CL_SUCCESS) { printf("Error: Failed to create syncMovedBodyKernel kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(syncMovedBodyKernel, ComputeTOIKernelWorkGroupSize); movedBodyList = NULL ; bodyListSize = 0 ; this->copyAllPositionList = NULL ; this->copyAllVelocityList = NULL; this->copyAllXFList = NULL ; copyBodyNum = 0 ; }
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog("%s Starting...\n\n", argv[0]); cl_platform_id cpPlatform = NULL; cl_uint uiNumDevices = 0; cl_device_id *cdDevices = NULL; cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; cl_mem cmAlphaTable4, cmProds4; cl_mem cmAlphaTable3, cmProds3; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, (const char **)argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); oclCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); oclCheckError(h_img != NULL, shrTRUE); shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); // Convert linear image to block linear. const uint memSize = width * height * sizeof(cl_uint); uint* block_image = (uint*)malloc(memSize); // Convert linear image to block linear. for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); // Get the platform's GPU devices ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 0, NULL, &uiNumDevices); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, uiNumDevices, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Create the context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, (const char **)argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, (const char **)argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); if( device == (cl_device_id)-1 ) { shrLog(" Invalid GPU Device: devID=%d. %d valid GPU devices detected\n\n", device_nr, uiNumDevices); shrLog(" exiting...\n"); return -1; } } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Constants cmAlphaTable4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmAlphaTable3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint*)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmAlphaTable4); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&cmProds4); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&cmAlphaTable3); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&cmProds3); oclCheckError(ciErrNum, CL_SUCCESS); // Copy input data host to device clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // Determine launch configuration and run timed computation numIterations times int blocks = ((width + 3) / 4) * ((height + 3) / 4); // rounds up by 1 block in each dim if %4 != 0 // Restrict the numbers of blocks to launch on low end GPUs to avoid kernel timeout cl_uint compute_units; clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); int blocksPerLaunch = MIN(blocks, 768 * (int)compute_units); // set work-item dimensions szGlobalWorkSize[0] = blocksPerLaunch * NUM_THREADS; szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING shrLog("\nRunning DXT Compression on %u x %u image...\n", width, height); shrLog("\n%u Workgroups, %u Work Items per Workgroup, %u Work Items in NDRange...\n\n", blocks, NUM_THREADS, blocks * NUM_THREADS); int numIterations = 50; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel for( int j=0; j<blocks; j+= blocksPerLaunch ) { clSetKernelArg(ckKernel, 7, sizeof(int), &j); szGlobalWorkSize[0] = MIN( blocksPerLaunch, blocks-j ) * NUM_THREADS; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %d\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1, szLocalWorkSize[0]); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif oclCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog("\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); oclCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif oclCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog("Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog("RMS(reference, result) = %f\n\n", rms); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseMemObject(cmAlphaTable4); clReleaseMemObject(cmProds4); clReleaseMemObject(cmAlphaTable3); clReleaseMemObject(cmProds3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrQAFinishExit(argc, (const char **)argv, (rms <= ERROR_THRESHOLD) ? QA_PASSED : QA_FAILED); }
// Main function // ********************************************************************* int main(const int argc, const char** argv) { // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); shrCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); shrCheckError(h_img != NULL, shrTRUE); shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height); // Convert linear image to block linear. uint * block_image = (uint *) malloc(width * height * 4); // Convert linear image to block linear. for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // create the OpenCL context on a GPU device cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevInfo(LOGBOTH, device); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY , sizeof(cl_uint) * width * height, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint *)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); shrCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); shrCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); shrCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL); ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL); ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height); // Upload the image clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // set work-item dimensions szGlobalWorkSize[0] = width * height * (NUM_THREADS/16); szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING int numIterations = 100; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif shrCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); shrCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif shrCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms); shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!"); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrEXIT(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
int InitOpenCLContext() { // start logs shrSetLogFileName ("oclVolumeRender.txt"); // get command line arg for quick test, if provided // process command line arguments // First initialize OpenGL context, so we can properly setup the OpenGL / OpenCL interop. // glewInit(); // GLboolean bGLEW = glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object"); // oclCheckErrorEX(bGLEW, shrTRUE, pCleanup); g_glInterop = true; // Create OpenCL context, get device info, select device, select options for image/texture and CL-GL interop createCLContext(); // Print device info clGetDeviceInfo(cdDevices[uiDeviceUsed], CL_DEVICE_IMAGE_SUPPORT, sizeof(g_bImageSupport), &g_bImageSupport, NULL); //shrLog("%s...\n\n", g_bImageSupport ? "Using Image (Texture)" : "No Image (Texuture) Support"); // shrLog("Detailed Device info:\n\n"); oclPrintDevInfo(LOGBOTH, cdDevices[uiDeviceUsed]); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Program Setup size_t program_length; cPathAndName = shrFindFilePath("Transform.cl", "."); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &program_length); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // build the program std::string buildOpts = "-cl-single-precision_constant"; // buildOpts += g_bImageSupport ? " -DIMAGE_SUPPORT" : ""; // ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],"-cl-fast-relaxed-math", NULL, NULL); ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and return error shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclVolumeRender.ptx"); Cleanup(EXIT_FAILURE); } // create the kernel ScalseKernel = clCreateKernel(cpProgram, "d_render", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); TransformKernel = clCreateKernel(cpProgram, "angle", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); LongToShortKernel = clCreateKernel(cpProgram, "transfer", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return TRUE; }
// Main function // ********************************************************************* int main(int argc, char **argv) { ////////////////////////////////////////////////////////////////////////// unsigned int count = iNumElements; int k = 8; unsigned int random_seed, random_seed2; srand( (unsigned)time( NULL ) ); random_seed = rand(); random_seed2 = rand(); ////////////////////////////////////////////////////////////////////////// // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs shrSetLogFileName ("oclVectorAdd.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); ////////////////////////////////////////////////////////////////////////// float *scalar_value = new float[count]; float *gradient_magnitude = new float[count]; float *second_derivative_magnitude = new float[count]; unsigned char *label_ptr = new unsigned char[count]; shrFillArray(scalar_value, count); shrFillArray(gradient_magnitude, count); shrFillArray(second_derivative_magnitude, count); ////////////////////////////////////////////////////////////////////////// //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; ////////////////////////////////////////////////////////////////////////// cmDevSrc_scalar_value = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrc_gradient_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevSrc_second_derivative_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst_label_ptr = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; ////////////////////////////////////////////////////////////////////////// shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); printf("%s\n%s\n", cSourceFile, cPathAndName); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "k_means", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Set the Argument values //ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); //ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); //ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); //ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); ////////////////////////////////////////////////////////////////////////// // __global const float *scalar_value, __global const float *gradient_magnitude, __global const float *second_derivative_magnitude, __global unsigned char *label_ptr, __global const unsigned int count, __global const int k, __global const unsigned int random_seed, __global const unsigned int random_seed2 ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc_scalar_value); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrc_gradient_magnitude); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevSrc_second_derivative_magnitude); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmDevDst_label_ptr); ciErr1 |= clSetKernelArg(ckKernel, 4, sizeof(cl_uint), (void*)&count); ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_uint), (void*)&k); ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_uint), (void*)&random_seed); ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_uint), (void*)&random_seed2); ////////////////////////////////////////////////////////////////////////// shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device //ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); //ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_scalar_value, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, scalar_value, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_gradient_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, gradient_magnitude, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_second_derivative_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, second_derivative_magnitude, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst_label_ptr, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, label_ptr, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// shrLog("clEnqueueReadBuffer (Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED"); ////////////////////////////////////////////////////////////////////////// //float *a = (float *)srcA; //float *b = (float *)srcB; //float *c = (float *)dst; //float *d = (float *)Golden; //for (int i=0; i<iNumElements; i++) //{ // printf("%f+%f=%f=%f\t", a[i], b[i], c[i], a[i]+b[i]); // printf("%s\n", (a[i]+b[i]==c[i]?"equal":"not equal")); //} //for (int i=0; i<iNumElements; i++) //{ // printf("%f\n", ((float *)dst)[i]); //} ////////////////////////////////////////////////////////////////////////// // Cleanup and leave Cleanup (EXIT_SUCCESS); ////////////////////////////////////////////////////////////////////////// delete [] scalar_value; delete [] gradient_magnitude; delete [] second_derivative_magnitude; delete [] label_ptr; ////////////////////////////////////////////////////////////////////////// }
// Main program //***************************************************************************** int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // Start logs cExecutableName = argv[0]; shrSetLogFileName ("oclSobelFilter.txt"); shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile); // Get command line args for quick test or QA test, if provided bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); bQATest = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); // Menu items if (!(bQATest)) { ShowMenuItems(); } // Find the path from the exe to the image file cPathAndName = shrFindFilePath(cImageFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n", cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3); // Initialize OpenGL items (if not No-GL QA test) shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling "); if (!(bQATest)) { InitGL(&argc, argv); } //Get the NVIDIA platform if available, otherwise use default char cBuffer[1024]; bool bNV = false; shrLog("Get Platform ID... "); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("%s\n\n", cBuffer); bNV = (strstr(cBuffer, "NVIDIA") != NULL); //Get the devices shrLog("Get Device Info...\n"); cl_uint uiNumAllDevs = 0; GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup); // Get selected device if specified, otherwise examine avaiable ones and choose by perf cl_int iSelectedDevice = 0; if((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1)) { // Use 1 selected device GpuDevMngr->uiUsefulDevCt = 1; iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1)); GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice; GpuDevMngr->fLoadProportions[0] = 1.0f; shrLog(" Using 1 Selected Device for Sobel Filter Computation...\n"); } else { // Use available useful devices and Compute the device load proportions ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (GpuDevMngr->uiUsefulDevCt == 1) { iSelectedDevice = GpuDevMngr->uiUsefulDevs[0]; } shrLog(" Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt); } //Create the context shrLog("\nclCreateContext...\n\n"); cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate per-device OpenCL objects for useful devices cqCommandQueue = new cl_command_queue[GpuDevMngr->uiUsefulDevCt]; ckSobel = new cl_kernel[GpuDevMngr->uiUsefulDevCt]; cmDevBufIn = new cl_mem[GpuDevMngr->uiUsefulDevCt]; cmDevBufOut = new cl_mem[GpuDevMngr->uiUsefulDevCt]; szAllocDevBytes = new size_t[GpuDevMngr->uiUsefulDevCt]; uiInHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiDevImageHeight = new cl_uint[GpuDevMngr->uiUsefulDevCt]; // Create command queue(s) for device(s) shrLog("clCreateCommandQueue...\n"); for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog(" CommandQueue %u, Device %u, Device Load Proportion = %.2f, ", i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]); oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]); shrLog("\n"); } // Allocate pinned input and output host image buffers: mem copy operations to/from pinned memory is much faster than paged memory szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int); cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); // Get mapped pointers for writing to pinned input and output host image pointers uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); // Load image data from file to pinned input host buffer ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char **)&uiInput, &uiImageWidth, &uiImageHeight); oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup); shrLog("Load Input Image to Input pinned host buffer...\n"); // Read the kernel in from file free(cPathAndName); cPathAndName = shrFindFilePath(clSourcefile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); shrLog("Load OpenCL Prog Source from File...\n"); // Create the program object cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateProgramWithSource...\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // On error: write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx"); Cleanup(EXIT_FAILURE); } shrLog("clBuildProgram...\n\n"); // Determine, the size/shape of the image portions for each dev and create the device buffers unsigned uiSumHeight = 0; for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Create kernel instance ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateKernel (ckSobel), Device %u...\n", i); // Allocations and offsets for the portion of the image worked on by each device if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset uiDevImageHeight[i] = uiImageHeight; uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i == 0) { // Multiple devices, top stripe zone including topmost row of image: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (below bottom of stripe) // Won't return the last row (dark/garbage row) D2H uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle stripe zone: // Over-allocate on device by 2 rows // Set offset and size to copy extra 2 padding rows H2D (above top and below bottom of stripe) // Won't return the first and last rows (dark/garbage rows) D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 2; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else { // Multiple devices, last boundary tile: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (above top of stripe) // Won't return the first row (dark/garbage rows D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = uiImageHeight - uiSumHeight; // "leftover" rows uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]); // Create the device buffers in GMEM on each device cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i); // Set the common argument values for the Median kernel instance for each device int iLocalPixPitch = iBlockDimX + 2; ciErrNum = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL); ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch); ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth); ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clSetKernelArg (0-4), Device %u...\n\n", i); } // Set common global and local work sizes for Median kernel szLocalWorkSize[0] = iBlockDimX; szLocalWorkSize[1] = iBlockDimY; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth); // init running timers shrDeltaT(0); // timer 0 used for computation timing shrDeltaT(1); // timer 1 used for fps computation // Start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence if (!(bQATest)) { glutMainLoop(); } else { TestNoGL(); } Cleanup(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("oclSimpleMultiGPU.txt"); shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); // OpenCL cl_platform_id cpPlatform; cl_uint ciDeviceCount; cl_device_id* cdDevices; cl_context cxGPUContext; cl_device_id cdDevice; // GPU device int deviceNr[MAX_GPU_COUNT]; cl_command_queue commandQueue[MAX_GPU_COUNT]; cl_mem d_Data[MAX_GPU_COUNT]; cl_mem d_Result[MAX_GPU_COUNT]; cl_program cpProgram; cl_kernel reduceKernel[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; size_t programLength; cl_int ciErrNum; char cDeviceName [256]; cl_mem h_DataBuffer; // Vars for reduction results float h_SumGPU[MAX_GPU_COUNT * ACCUM_N]; float *h_Data; double sumGPU; double sumCPU, dRelError; // allocate and init host buffer with with some random generated input data h_Data = (float *)malloc(DATA_N * sizeof(float)); shrFillArray(h_Data, DATA_N); // start timer & logs shrLog("Setting up OpenCL on the Host...\n\n"); shrDeltaT(1); // Annotate profiling state #ifdef GPU_PROFILING shrLog("OpenCL Profiling is enabled...\n\n"); #endif //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetPlatformID...\n"); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetDeviceIDs...\n"); //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateContext...\n"); // Set up command queue(s) for GPU's specified on the command line or all GPU's if(shrCheckCmdLineFlag(argc, (const char **)argv, "device")) { // User specified GPUs int ciMaxDeviceID = ciDeviceCount-1; ciDeviceCount = 0; char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif // Create command queues for all Requested GPU's while(deviceStr != NULL) { // get & log device index # and name deviceNr[ciDeviceCount] = atoi(deviceStr); if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) { shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]); return 1; } cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName); // create a command que commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n"); ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); oclCheckError(ciErrNum, CL_SUCCESS); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { // get & log device index # and name deviceNr[i] = i; cdDevice = oclGetDev(cxGPUContext, i); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n", i, cDeviceName); // create a command que commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n\n"); } } // Load the OpenCL source code from the .cl file const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]); char *source = oclLoadProgSource(source_path, "", &programLength); oclCheckError(source != NULL, shrTRUE); shrLog("oclLoadProgSource\n"); // Create the program for all GPUs in the context cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // Create host buffer with page-locked memory h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, DATA_N * sizeof(float), h_Data, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Page-locked Host)\n\n"); // Create buffers for each GPU, with data divided evenly among GPU's int sizePerGPU = DATA_N / ciDeviceCount; int workOffset[MAX_GPU_COUNT]; int workSize[MAX_GPU_COUNT]; workOffset[0] = 0; for(unsigned int i = 0; i < ciDeviceCount; ++i ) { workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]); // Input buffer d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); // Copy data from host to device ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 0, workSize[i] * sizeof(float), 0, NULL, NULL); shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i); // Output buffer d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Output)\t\tDev %i\n", i); // Create kernel reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\t\t\tDev %i\n", i); // Set the args values and check for errors ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clSetKernelArg\t\t\tDev %i\n\n", i); workOffset[i + 1] = workOffset[i] + workSize[i]; } // Set # of work items in work group and total in 1 dimensional range size_t localWorkSize[] = {THREAD_N}; size_t globalWorkSize[] = {ACCUM_N}; // Start timer and launch reduction kernel on each GPU, with data split between them shrLog("Launching Kernels on GPU(s)...\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Copy result from device to host for each device for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), h_SumGPU + i * ACCUM_N, 0, NULL, &GPUDone[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Synchronize with the GPUs and do accumulated error check clWaitForEvents(ciDeviceCount, GPUDone); shrLog("clWaitForEvents complete...\n\n"); // Aggregate results for multiple GPU's and stop/log processing time sumGPU = 0; for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++) { sumGPU += h_SumGPU[i]; } // Print Execution Times for each GPU #ifdef GPU_PROFILING shrLog("Profiling Information for GPU Processing:\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { cdDevice = oclGetDev(cxGPUContext, deviceNr[i]); clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); shrLog("Device %i : %s\n", deviceNr[i], cDeviceName); shrLog(" Reduce Kernel : %.5f s\n", executionTime(GPUExecution[i])); shrLog(" Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i])); } #endif // Run the computation on the Host CPU and log processing time shrLog("Launching Host/CPU C++ Computation...\n\n"); sumCPU = 0; for(unsigned int i = 0; i < DATA_N; i++) { sumCPU += h_Data[i]; } // Check GPU result against CPU result dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU); shrLog("Comparing against Host/C++ computation...\n"); shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError); // cleanup free(source); free(h_Data); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { clReleaseKernel(reduceKernel[i]); clReleaseCommandQueue(commandQueue[i]); } clReleaseProgram(cpProgram); clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED); }
// Main function // ********************************************************************* int main(int argc, char **argv) { gp_argc = &argc; gp_argv = &argv; shrQAStart(argc, argv); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); //Get all the devices cl_uint uiNumDevices = 0; // Number of devices available cl_uint uiTargetDevice = 0; // Default Device to compute on cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU) shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); // Get command line device options and config accordingly shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclDotProduct.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, 4 * iNumElements); shrFillArray((float*)srcB, 4 * iNumElements); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevices[uiTargetDevice], NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the context cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n"); cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx"); Cleanup(EXIT_FAILURE); } // Create the kernel shrLog("clCreateKernel (DotProduct)...\n"); ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum); // Set the Argument values shrLog("clSetKernelArg 0 - 3...\n\n"); ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // -------------------------------------------------------- // Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog("clEnqueueNDRangeKernel (DotProduct)...\n"); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog("clEnqueueReadBuffer (Dst)...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
bool CheckRender::compareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold) { float *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; unsigned long error_count = 0; #ifdef WIN32 fopen_s(&src_fp, src_file, "rb"); #else src_fp = fopen(src_file, "rb"); #endif if (src_fp == NULL) { shrLog("compareBin2Bin <float> unable to open src_file: %s\n", src_file); error_count = 1; } char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { shrLog("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file, m_ExecPath); shrLog(">>> Check info.xml and [project//data] folder <%s> <<<\n", m_ExecPath); shrLog("Aborting comparison!\n"); error_count++; if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } else { #ifdef WIN32 fopen_s(&ref_fp, ref_file_path, "rb"); #else ref_fp = fopen(ref_file_path, "rb"); #endif if (ref_fp == NULL) { shrLog("compareBin2Bin <float> unable to open ref_file: %s\n", ref_file_path); error_count = 1; } if (src_fp && ref_fp) { src_buffer = (float *)malloc(nelements*sizeof(float)); ref_buffer = (float *)malloc(nelements*sizeof(float)); fread(src_buffer, nelements, sizeof(float), src_fp); fread(ref_buffer, nelements, sizeof(float), ref_fp); shrLog("> compareBin2Bin <float> nelements=%d, epsilon=%.2f, threshold=%.2f\n", nelements, epsilon, threshold); shrLog(" src_file <%s>\n", src_file); shrLog(" ref_file <%s>\n", ref_file_path); if (!shrComparefet( ref_buffer, src_buffer, nelements, epsilon, threshold)) { error_count++; } fclose(src_fp); fclose(ref_fp); free(src_buffer); free(ref_buffer); } else { if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } } if (error_count == 0) { shrLog(" Data Matches\n"); } else { shrLog(" Data Doesn't Match!!! %d errors...\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
bool CheckRender::compareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold) { unsigned int *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; unsigned long error_count = 0; #ifdef WIN32 errno_t err; if ((err = fopen_s(&src_fp, src_file, "rb")) != 0) #else if ((src_fp = fopen(src_file, "rb")) == NULL) #endif { printf(" compareBin2Bin <unsigned int> unable to open src_file: %s\n", src_file); error_count++; } char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { printf(" compareBin2Bin <unsigned int> unable to find <%s> in <%s>\n", ref_file, m_ExecPath); printf(" >>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf(" Aborting comparison!\n"); printf(" FAILED!\n"); error_count++; if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } else { #ifdef WIN32 errno_t err; if ((err = fopen_s(&ref_fp, ref_file_path, "rb")) != 0) #else if ((ref_fp = fopen(ref_file_path, "rb")) == NULL) #endif { printf("compareBin2Bin <unsigned int> unable to open ref_file: %s\n", ref_file_path); error_count++; } if (src_fp && ref_fp) { src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); if (fread(src_buffer, sizeof(unsigned int), nelements, src_fp) != nelements) { printf("compareBin2Bin <unsigned int> unable to read from src_file: %s\n", src_file); error_count++; } else if (fread(ref_buffer, sizeof(unsigned int), nelements, ref_fp) != nelements) { printf("compareBin2Bin <unsigned int> unable to read from ref_file: %s\n", ref_file_path); error_count++; } else { printf(" > compareBin2Bin <unsigned int> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); printf(" src_file <%s>\n", src_file); printf(" ref_file <%s>\n", ref_file_path); if (!shrCompareuit( ref_buffer, src_buffer, nelements, epsilon, threshold)) error_count++; } fclose(src_fp); fclose(ref_fp); free(src_buffer); free(ref_buffer); } else { if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } } if (error_count == 0) { printf("TEST PASSED\n\n"); } else { printf("TEST FAILED !!! %d errors...\n\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
// Init OpenCL //***************************************************************************** int initCL(int argc, const char** argv) { cl_platform_id cpPlatform; cl_uint uiDevCount; cl_device_id *cdDevices; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get the number of GPU devices available to the platform ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the device list cdDevices = new cl_device_id [uiDevCount]; ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get device requested on command line, if any unsigned int uiDeviceUsed = 0; unsigned int uiEndDev = uiDevCount - 1; if(shrGetCmdLineArgumentu(argc, argv, "device", &uiDeviceUsed)) { uiDeviceUsed = CLAMP(uiDeviceUsed, 0, uiEndDev); uiEndDev = uiDeviceUsed; } // Check if the requested device (or any of the devices if none requested) supports context sharing with OpenGL if(bGLinterop && !bQATest) { bool bSharingSupported = false; for(unsigned int i = uiDeviceUsed; (!bSharingSupported && (i <= uiEndDev)); ++i) { size_t extensionSize; ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize ); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if(extensionSize > 0) { char* extensions = (char*)malloc(extensionSize); ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); std::string stdDevString(extensions); free(extensions); size_t szOldPos = 0; size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited while (szSpacePos != stdDevString.npos) { if( strcmp(GL_SHARING_EXTENSION, stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 ) { // Device supports context sharing with OpenGL uiDeviceUsed = i; bSharingSupported = true; break; } do { szOldPos = szSpacePos + 1; szSpacePos = stdDevString.find(' ', szOldPos); } while (szSpacePos == szOldPos); } } } shrLog("%s...\n\n", bSharingSupported ? "Using CL-GL Interop" : "No device found that supports CL/GL context sharing"); oclCheckErrorEX(bSharingSupported, true, pCleanup); // Define OS-specific context properties and create the OpenCL context #if defined (__APPLE__) || defined (MACOSX) CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); cl_context_properties props[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 0 }; cxGPUContext = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum); #else #ifdef UNIX cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0 }; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); #else // Win32 cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0 }; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); #endif #endif } else { // No GL interop cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0}; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); bGLinterop = shrFALSE; } shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Log device used shrLog("Device # %u, ", uiDeviceUsed); oclPrintDevName(LOGBOTH, cdDevices[uiDeviceUsed]); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Memory Setup if( bGLinterop ) { cl_pbos[0] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_READ_ONLY, pbo_source, &ciErrNum); cl_pbos[1] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, pbo_dest, &ciErrNum); } else { cl_pbos[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); cl_pbos[1] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Program Setup size_t program_length; const char* source_path = shrFindFilePath(clSourcefile, argv[0]); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckErrorEX(source != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclPostProcessGL.ptx"); Cleanup(EXIT_FAILURE); } // create the kernel ckKernel = clCreateKernel(cpProgram, "postprocess", &ciErrNum); // set the args values ciErrNum |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &(cl_pbos[0])); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &(cl_pbos[1])); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(image_width), &image_width); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(image_width), &image_height); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return 0; }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for //////////////////////////////////////////////////////////////////////////////// int runTest(int argc, const char** argv) { cl_platform_id cpPlatform = NULL; cl_uint ciDeviceCount = 0; cl_device_id *cdDevices = NULL; cl_int ciErrNum = CL_SUCCESS; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } if(shrCheckCmdLineFlag(argc, (const char**)argv, "device")) { // User specified GPUs char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif ciDeviceCount = 0; while(deviceStr != NULL) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr)); if( device == (cl_device_id) -1 ) { shrLog(" Device %s does not exist!\n", deviceStr); return -1; } shrLog("Device %s: ", deviceStr); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); return ciErrNum; } else if (ciDeviceCount == 0) { shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum); return -1; } // create command-queues for(unsigned int i = 0; i < ciDeviceCount; ++i) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, i); shrLog("Device %d: ", i); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } } } // Optional Command-line multiplier for matrix sizes shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); iSizeMultiple = CLAMP(iSizeMultiple, 1, 10); uiWA = WA * iSizeMultiple; uiHA = HA * iSizeMultiple; uiWB = WB * iSizeMultiple; uiHB = HB * iSizeMultiple; uiWC = WC * iSizeMultiple; uiHC = HC * iSizeMultiple; shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); // allocate host memory for matrices A and B unsigned int size_A = uiWA * uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A_data = (float*)malloc(mem_size_A); unsigned int size_B = uiWB * uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B_data = (float*)malloc(mem_size_B); // initialize host memory srand(2006); shrFillArray(h_A_data, size_A); shrFillArray(h_B_data, size_B); // allocate host memory for result unsigned int size_C = uiWC * uiHC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // create OpenCL buffer pointing to the host memory cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, mem_size_A, h_A_data, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: clCreateBuffer\n"); return ciErrNum; } // Program Setup size_t program_length; const char* header_path = shrFindFilePath("matrixMul.h", argv[0]); oclCheckError(header_path != NULL, shrTRUE); char* header = oclLoadProgSource(header_path, "", &program_length); if(!header) { shrLog("Error: Failed to load the header %s!\n", header_path); return -1000; } const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, header, &program_length); if(!source) { shrLog("Error: Failed to load compute program %s!\n", source_path); return -2000; } // create the program cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &program_length, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create program\n"); return ciErrNum; } free(header); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then return error shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); return ciErrNum; } // write out PTX if requested on the command line if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") ) { oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); } // Create Kernel for(unsigned int i = 0; i < ciDeviceCount; ++i) { multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create kernel\n"); return ciErrNum; } } // Run multiplication on 1..deviceCount GPUs to compare improvement shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount); for(unsigned int k = 1; k <= ciDeviceCount; ++k) { matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C); } // compute reference solution shrLog("Comparing results with CPU computation... \n\n"); float* reference = (float*) malloc(mem_size_C); computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB); // check result shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f); if (res != shrTRUE) { printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f); } // clean up OCL resources ciErrNum = clReleaseMemObject(h_A); for(unsigned int k = 0; k < ciDeviceCount; ++k) { ciErrNum |= clReleaseKernel( multiplicationKernel[k] ); ciErrNum |= clReleaseCommandQueue( commandQueue[k] ); } ciErrNum |= clReleaseProgram(cpProgram); ciErrNum |= clReleaseContext(cxGPUContext); if(ciErrNum != CL_SUCCESS) { shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum); return ciErrNum; } // clean up memory free(h_A_data); free(h_B_data); free(h_C); free(reference); return ((shrTRUE == res) ? CL_SUCCESS : -3000); }
b2CLNarrowPhase::b2CLNarrowPhase() { #if defined(NARROWPHASE_OPENCL) if (b2clGlobal_OpenCLSupported) { printf("Initializing b2CLNarrowPhase...\n"); int err; //load opencl programs from files char* narrowPhaseKernelSource = 0; size_t narrowPhaseKernelSourceLen = 0; shrLog("...loading b2CLNarrowPhase.cl\n"); #if defined(SCAN_OPENCL) #ifdef linux narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/usr/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #elif defined (_WIN32) narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #else narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/usr/local/include/Box2D/Common/OpenCL/b2CLNarrowPhase.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #endif #else #ifdef linux narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/opt/usr/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #elif defined (_WIN32) narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #else narrowPhaseKernelSource = b2clLoadProgSource(shrFindFilePath("/usr/local/include/Box2D/Common/OpenCL/b2CLNarrowPhase_Alone.cl", NULL), "// My comment\n", &narrowPhaseKernelSourceLen); #endif #endif if(narrowPhaseKernelSource == NULL) { b2Log("Could not load program source, is path 'b2CLNarrowPhase.cl' correct?"); } //create the compute program from source kernel code narrowPhaseProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**)&narrowPhaseKernelSource, NULL, &err); if (!narrowPhaseProgram) { printf("Error: Failed to create compute program!\n"); exit(1); } //build the program err = clBuildProgram(narrowPhaseProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[204800]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(narrowPhaseProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } //create the compute kernel collidePolygonsKernel = clCreateKernel(narrowPhaseProgram, "b2clCollidePolygons", &err); if (!collidePolygonsKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collideCirclesKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideCircles", &err); if (!collideCirclesKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collidePolygonAndCircleKernel = clCreateKernel(narrowPhaseProgram, "b2clCollidePolygonAndCircle", &err); if (!collidePolygonAndCircleKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collideEdgeAndCircleKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideEdgeAndCircle", &err); if (!collideEdgeAndCircleKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } collideEdgeAndPolygonKernel = clCreateKernel(narrowPhaseProgram, "b2clCollideEdgeAndPolygon", &err); if (!collideEdgeAndPolygonKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(collidePolygonsKernel, kernel_work_group_size); //clGetKernelWorkGroupInfo(collidePolygonsKernel, b2CLDevice::instance().GetCurrentDevice(), CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &kernel_preferred_work_group_size_multiple, NULL); //create the compute kernel compactForOneContactKernel = clCreateKernel(narrowPhaseProgram, "b2clCompactForOneContact", &err); if (!compactForOneContactKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } /*old_xf_num = */old_contact_num = 0; xfListData = NULL; b2CLCommonData::instance().manifoldListData = NULL; globalIndicesData = pairIndicesData = NULL; #if !defined(BROADPHASE_OPENCL) b2CLCommonData::instance().manifoldListBuffers[0] = b2CLCommonData::instance().manifoldListBuffers[1] = b2CLCommonData::instance().pairIndicesBuffer = #endif #if defined(SCAN_OPENCL) b2CLCommonData::instance().manifoldBinaryBitListBuffer = b2CLCommonData::instance().validContactIndicesBuffer = #endif NULL; b2CLScan::instance(); // initializing b2CLScan } #endif b2CLCommonData::instance().currentManifoldBuffer = 0; manifold_nums[0] = 0; manifold_nums[1] = 0; }