// Main function // ********************************************************************* int main(int argc, char **argv) { gp_argc = &argc; gp_argv = &argv; shrQAStart(argc, argv); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); //Get all the devices cl_uint uiNumDevices = 0; // Number of devices available cl_uint uiTargetDevice = 0; // Default Device to compute on cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU) shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); // Get command line device options and config accordingly shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclDotProduct.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, 4 * iNumElements); shrFillArray((float*)srcB, 4 * iNumElements); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevices[uiTargetDevice], NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the context cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n"); cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate the OpenCL buffer memory objects for source and result on the device GMEM shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n"); cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx"); Cleanup(EXIT_FAILURE); } // Create the kernel shrLog("clCreateKernel (DotProduct)...\n"); ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum); // Set the Argument values shrLog("clSetKernelArg 0 - 3...\n\n"); ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // -------------------------------------------------------- // Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog("clEnqueueNDRangeKernel (DotProduct)...\n"); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog("clEnqueueReadBuffer (Dst)...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
bool CheckRender::compareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold) { float *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; unsigned long error_count = 0; #ifdef WIN32 fopen_s(&src_fp, src_file, "rb"); #else src_fp = fopen(src_file, "rb"); #endif if (src_fp == NULL) { shrLog("compareBin2Bin <float> unable to open src_file: %s\n", src_file); error_count = 1; } char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { shrLog("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file, m_ExecPath); shrLog(">>> Check info.xml and [project//data] folder <%s> <<<\n", m_ExecPath); shrLog("Aborting comparison!\n"); error_count++; if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } else { #ifdef WIN32 fopen_s(&ref_fp, ref_file_path, "rb"); #else ref_fp = fopen(ref_file_path, "rb"); #endif if (ref_fp == NULL) { shrLog("compareBin2Bin <float> unable to open ref_file: %s\n", ref_file_path); error_count = 1; } if (src_fp && ref_fp) { src_buffer = (float *)malloc(nelements*sizeof(float)); ref_buffer = (float *)malloc(nelements*sizeof(float)); fread(src_buffer, nelements, sizeof(float), src_fp); fread(ref_buffer, nelements, sizeof(float), ref_fp); shrLog("> compareBin2Bin <float> nelements=%d, epsilon=%.2f, threshold=%.2f\n", nelements, epsilon, threshold); shrLog(" src_file <%s>\n", src_file); shrLog(" ref_file <%s>\n", ref_file_path); if (!shrComparefet( ref_buffer, src_buffer, nelements, epsilon, threshold)) { error_count++; } fclose(src_fp); fclose(ref_fp); free(src_buffer); free(ref_buffer); } else { if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } } if (error_count == 0) { shrLog(" Data Matches\n"); } else { shrLog(" Data Doesn't Match!!! %d errors...\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
// Main function // ********************************************************************* int ymain(int argc, char **argv) { shrQAStart(argc, argv); // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclVectorAdd2.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); shrLog("Looking for: %s in Path: %s\n", cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (argc, argv, (bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE); }
// Main function // ********************************************************************* int main(int argc, char **argv) { ////////////////////////////////////////////////////////////////////////// unsigned int count = iNumElements; int k = 8; unsigned int random_seed, random_seed2; srand( (unsigned)time( NULL ) ); random_seed = rand(); random_seed2 = rand(); ////////////////////////////////////////////////////////////////////////// // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs shrSetLogFileName ("oclVectorAdd.txt"); shrLog("%s Starting...\n\n# of float elements per Array \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = 256; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); srcA = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); srcB = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize); Golden = (void *)malloc(sizeof(cl_float) * iNumElements); shrFillArray((float*)srcA, iNumElements); shrFillArray((float*)srcB, iNumElements); ////////////////////////////////////////////////////////////////////////// float *scalar_value = new float[count]; float *gradient_magnitude = new float[count]; float *second_derivative_magnitude = new float[count]; unsigned char *label_ptr = new unsigned char[count]; shrFillArray(scalar_value, count); shrFillArray(gradient_magnitude, count); shrFillArray(second_derivative_magnitude, count); ////////////////////////////////////////////////////////////////////////// //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; ////////////////////////////////////////////////////////////////////////// cmDevSrc_scalar_value = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr1); cmDevSrc_gradient_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevSrc_second_derivative_magnitude = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; cmDevDst_label_ptr = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErr2); ciErr1 |= ciErr2; ////////////////////////////////////////////////////////////////////////// shrLog("clCreateBuffer...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); printf("%s\n%s\n", cSourceFile, cPathAndName); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "k_means", &ciErr1); shrLog("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Set the Argument values //ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); //ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); //ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); //ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); ////////////////////////////////////////////////////////////////////////// // __global const float *scalar_value, __global const float *gradient_magnitude, __global const float *second_derivative_magnitude, __global unsigned char *label_ptr, __global const unsigned int count, __global const int k, __global const unsigned int random_seed, __global const unsigned int random_seed2 ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc_scalar_value); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrc_gradient_magnitude); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevSrc_second_derivative_magnitude); ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmDevDst_label_ptr); ciErr1 |= clSetKernelArg(ckKernel, 4, sizeof(cl_uint), (void*)&count); ciErr1 |= clSetKernelArg(ckKernel, 5, sizeof(cl_uint), (void*)&k); ciErr1 |= clSetKernelArg(ckKernel, 6, sizeof(cl_uint), (void*)&random_seed); ciErr1 |= clSetKernelArg(ckKernel, 7, sizeof(cl_uint), (void*)&random_seed2); ////////////////////////////////////////////////////////////////////////// shrLog("clSetKernelArg 0 - 3...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device //ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); //ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_scalar_value, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, scalar_value, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_gradient_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, gradient_magnitude, 0, NULL, NULL); ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrc_second_derivative_magnitude, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, second_derivative_magnitude, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); shrLog("clEnqueueNDRangeKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } // Synchronous/blocking read of results, and check accumulated errors //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevDst_label_ptr, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, label_ptr, 0, NULL, NULL); ////////////////////////////////////////////////////////////////////////// shrLog("clEnqueueReadBuffer (Dst)...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(EXIT_FAILURE); } //-------------------------------------------------------- // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); VectorAddHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); shrLog("%s\n\n", (bMatch == shrTRUE) ? "PASSED" : "FAILED"); ////////////////////////////////////////////////////////////////////////// //float *a = (float *)srcA; //float *b = (float *)srcB; //float *c = (float *)dst; //float *d = (float *)Golden; //for (int i=0; i<iNumElements; i++) //{ // printf("%f+%f=%f=%f\t", a[i], b[i], c[i], a[i]+b[i]); // printf("%s\n", (a[i]+b[i]==c[i]?"equal":"not equal")); //} //for (int i=0; i<iNumElements; i++) //{ // printf("%f\n", ((float *)dst)[i]); //} ////////////////////////////////////////////////////////////////////////// // Cleanup and leave Cleanup (EXIT_SUCCESS); ////////////////////////////////////////////////////////////////////////// delete [] scalar_value; delete [] gradient_magnitude; delete [] second_derivative_magnitude; delete [] label_ptr; ////////////////////////////////////////////////////////////////////////// }