int ConstantBandwidth::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("ConstantBandwidth_Kernels.cl"); // Pass vectorSize as DATATYPE to kernel cl_uint size = (WAVEFRONT + NUM_READS) * vectorSize * sizeof(cl_float); char buildOption[64]; if(vectorSize == 1) { sprintf(buildOption, "-D DATATYPE=float -D DATATYPE2=float4 -D SIZE=%d ", size); } else { sprintf(buildOption, "-D DATATYPE=float%d -D DATATYPE2=float%d -D SIZE=%d ", vec3 == true ? 3 : vectorSize, vec3 == true ? 3 : vectorSize,size); } binaryData.flagsStr = std::string(buildOption); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); CHECK_ERROR(status, SDK_SUCCESS, "OpenCL Generate Binary Image Failed"); return SDK_SUCCESS; }
int PrefixSum::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("PrefixSum_Kernels.cl"); binaryData.flagsStr = std::string(""); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); return status; }
int ScanLargeArrays::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("ScanLargeArrays_Kernels.cl"); binaryData.flagsStr = std::string(""); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); CHECK_ERROR(status, SDK_SUCCESS, "OpenCL Generate Binary Image Failed"); return SDK_SUCCESS; }
int GaussianNoise::genBinaryImage() { /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ streamsdk::bifData binaryData; binaryData.kernelName = std::string("GaussianNoise_Kernels.cl"); binaryData.flagsStr = std::string(""); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); return status; }
int GlobalMemoryBandwidth::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("GlobalMemoryBandwidth_Kernels.cl"); // Pass vectorSize as DATATYPE to kernel char buildOption[128]; if(vectorSize == 1) sprintf(buildOption, "-D DATATYPE=float -D OFFSET=%d ", OFFSET); else sprintf(buildOption, "-D DATATYPE=float%d -D OFFSET=%d ", (vec3 == true) ? 3 : vectorSize, OFFSET); binaryData.flagsStr = std::string(buildOption); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); CHECK_ERROR(status, SDK_SUCCESS, "OpenCL Generate Binary Image Failed"); return status; }
int FluidSimulation2D::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); // If we could find our platform, use it. Otherwise use just available platform. cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed."); } //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed"); std::string buildOptions = std::string(""); // Check if cl_khr_fp64 extension is supported if(strstr(deviceInfo.extensions, "cl_khr_fp64")) { buildOptions.append("-D KHR_DP_EXTENSION"); } else { // Check if cl_amd_fp64 extension is supported if(!strstr(deviceInfo.extensions, "cl_amd_fp64")) { reqdExtSupport = false; OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!"); } } /* * Create and initialize memory objects */ size_t temp = dims[0] * dims[1]; d_if0 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if0)"); status = clEnqueueWriteBuffer(commandQueue, d_if0, 1, 0, sizeof(cl_double) * temp, h_if0, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if0)"); d_if1234 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double4) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if1234)"); status = clEnqueueWriteBuffer(commandQueue, d_if1234, 1, 0, sizeof(cl_double4) * temp, h_if1234, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if1234)"); d_if5678 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double4) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_if5678)"); status = clEnqueueWriteBuffer(commandQueue, d_if5678, 1, 0, sizeof(cl_double4) * temp, h_if5678, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (d_if5678)"); d_of0 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of0)"); d_of1234 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double4) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of1234)"); d_of5678 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double4) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (d_of5678)"); status = clEnqueueCopyBuffer(commandQueue, d_if0, d_of0, 0, 0, sizeof(cl_double) * temp, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if0->d_of0)"); status = clEnqueueCopyBuffer(commandQueue, d_if1234, d_of1234, 0, 0, sizeof(cl_double4) * temp, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if1234->d_of1234)"); status = clEnqueueCopyBuffer(commandQueue, d_if5678, d_of5678, 0, 0, sizeof(cl_double4) * temp, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueCopyBuffer failed. (d_if5678->d_of5678)"); status = clFinish(commandQueue); CHECK_OPENCL_ERROR(status, "clFinish failed."); // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; //Constant arrays type = clCreateBuffer(context, inMemFlags, sizeof(cl_bool) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (type)"); weight = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_double) * 9, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (weight)"); status = clEnqueueWriteBuffer(commandQueue, weight, 1, 0, sizeof(cl_double) * 9, w, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (weight)"); velocity = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_double2) * temp, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (velocity)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("FluidSimulation2D_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel( program, "lbm", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); return SDK_SUCCESS; }
int FloydWarshall::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Fall back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } pathDistanceBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint) * numNodes * numNodes, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (pathDistanceBuffer)"); pathBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * numNodes * numNodes, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (pathBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("FloydWarshall_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "floydWarshallPass", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); return SDK_SUCCESS; }
int NBody::genBinaryImage() { cl_int status = CL_SUCCESS; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } char platformName[100]; for (unsigned i = 0; i < numPlatforms; ++i) { status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(platformName), platformName, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) { break; } } std::cout << "Platform found : " << platformName << "\n"; delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_OFFLINE_DEVICES_AMD, (cl_context_properties)1, 0 }; context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); kernelPath.append("NBody_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) { return SDK_FAILURE; } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 0, NULL, flagsStr.c_str(), NULL, NULL); sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed."); size_t numDevices; status = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL ); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_NUM_DEVICES) failed.")) { return SDK_FAILURE; } std::cout << "Number of devices found : " << numDevices << "\n\n"; devices = (cl_device_id *)malloc( sizeof(cl_device_id) * numDevices ); if(devices == NULL) { sampleCommon->error("Failed to allocate host memory.(devices)"); return SDK_FAILURE; } /* grab the handles to all of the devices in the program. */ status = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices, devices, NULL ); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_DEVICES) failed.")) { return SDK_FAILURE; } /* figure out the sizes of each of the binaries. */ size_t *binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices ); if(devices == NULL) { sampleCommon->error("Failed to allocate host memory.(binarySizes)"); return SDK_FAILURE; } status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices, binarySizes, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_BINARY_SIZES) failed.")) { return SDK_FAILURE; } size_t i = 0; /* copy over all of the generated binaries. */ char **binaries = (char **)malloc( sizeof(char *) * numDevices ); if(binaries == NULL) { sampleCommon->error("Failed to allocate host memory.(binaries)"); return SDK_FAILURE; } for(i = 0; i < numDevices; i++) { if(binarySizes[i] != 0) { binaries[i] = (char *)malloc( sizeof(char) * binarySizes[i]); if(binaries[i] == NULL) { sampleCommon->error("Failed to allocate host memory.(binaries[i])"); return SDK_FAILURE; } } else { binaries[i] = NULL; } } status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices, binaries, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetProgramInfo(CL_PROGRAM_BINARIES) failed.")) { return SDK_FAILURE; } /* dump out each binary into its own separate file. */ for(i = 0; i < numDevices; i++) { char fileName[100]; sprintf(fileName, "%s.%d", dumpBinary.c_str(), (int)i); if(binarySizes[i] != 0) { char deviceName[1024]; status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetDeviceInfo(CL_DEVICE_NAME) failed.")) { return SDK_FAILURE; } printf( "%s binary kernel: %s\n", deviceName, fileName); streamsdk::SDKFile BinaryFile; if(!BinaryFile.writeBinaryToFile(fileName, binaries[i], binarySizes[i])) { std::cout << "Failed to load kernel file : " << fileName << std::endl; return SDK_FAILURE; } } else { printf("Skipping %s since there is no binary data to write!\n", fileName); } } // Release all resouces and memory for(i = 0; i < numDevices; i++) { if(binaries[i] != NULL) { free(binaries[i]); binaries[i] = NULL; } } if(binaries != NULL) { free(binaries); binaries = NULL; } if(binarySizes != NULL) { free(binarySizes); binarySizes = NULL; } if(devices != NULL) { free(devices); devices = NULL; } status = clReleaseProgram(program); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) { return SDK_FAILURE; } status = clReleaseContext(context); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clReleaseContext failed.")) { return SDK_FAILURE; } return SDK_SUCCESS; }
int MatrixMulImage::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, 0, "sampleCommon::getDevices() failed"); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "deviceInfo.setDeviceInfo. failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_ERROR(retValue, SDK_SUCCESS, "clCreateCommandQueue. failed"); } cl_image_format imageFormat; imageFormat.image_channel_data_type = CL_FLOAT; imageFormat.image_channel_order = CL_RGBA; if(!deviceInfo.imageSupport) { std::cout << "Expected Error: Image is not supported on the Device" << std::endl; return SDK_EXPECTED_FAILURE; } cl_image_desc imageDesc; memset(&imageDesc, '\0', sizeof(cl_image_desc)); imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D; // Create image for matrix A imageDesc.image_width = width0 / 4; imageDesc.image_height = height0; inputBuffer0 = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, &imageDesc, input0, &status); CHECK_OPENCL_ERROR(status, "clCreateImage failed. (inputBuffer0)"); // Create image for matrix B imageDesc.image_width = width1 / 4; imageDesc.image_height = height1; inputBuffer1 = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, &imageDesc, input1, &status); CHECK_OPENCL_ERROR(status, "clCreateImage failed. (inputBuffer1)"); // Create image for matrix C imageDesc.image_width = width1 / 4; imageDesc.image_height = height0; outputBuffer = clCreateImage(context, CL_MEM_WRITE_ONLY, &imageFormat, &imageDesc, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateImage failed. (outputBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("MatrixMulImage_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); kernel = clCreateKernel(program, "mmmKernel3", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel)"); return SDK_SUCCESS; }
int ImageOverlap::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); // If we could find our platform, use it. Otherwise use just available platform. cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); status = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed"); if(!deviceInfo.imageSupport) { OPENCL_EXPECTED_ERROR(" Expected Error: Device does not support Images"); } blockSizeX = deviceInfo.maxWorkGroupSize<GROUP_SIZE?deviceInfo.maxWorkGroupSize:GROUP_SIZE; // Create command queue cl_command_queue_properties prop = 0; for(int i=0;i<3;i++) { commandQueue[i] = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status,"clCreateCommandQueuefailed."); } // Create and initialize image objects // Create map image mapImage = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, &image_desc, mapImageData, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (mapImage)"); int color[4] = {0,0,80,255}; size_t origin[3] = {300,300,0}; size_t region[3] = {100,100,1}; status = clEnqueueFillImage(commandQueue[0], mapImage, color, origin, region, NULL, NULL, &eventlist[0]); // Create fill image fillImage = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, &image_desc, fillImageData, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (fillImage)"); color[0] = 80; color[1] = 0; color[2] = 0; color[3] = 0; origin[0] = 50; origin[1] = 50; status = clEnqueueFillImage(commandQueue[1], fillImage, color, origin, region, NULL, NULL, &eventlist[1]); //Create output image outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, &imageFormat, &image_desc, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed. (outputImage)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("ImageOverlap_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernelOverLap = clCreateKernel(program, "OverLap", &status); CHECK_OPENCL_ERROR(status,"clCreateKernel failed.(OverLap)"); return SDK_SUCCESS; }
int MatrixMulDouble::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) dType = CL_DEVICE_TYPE_CPU; else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ status = cl::Platform::get(&platforms); CHECK_OPENCL_ERROR(status, "Platform::get() failed."); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { if(isPlatformEnabled()) { i = platforms.begin() + platformId; } else { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; if(NULL == (*i)()) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } context = cl::Context(dType, cps, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "Context::Context() failed."); devices = context.getInfo<CL_CONTEXT_DEVICES>(); CHECK_OPENCL_ERROR(status, "Context::getInfo() failed."); std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; int deviceCount = (int)devices.size(); int j = 0; for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j) { std::cout << "Device " << j << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } std::cout << "\n"; if (deviceCount == 0) { std::cerr << "No device available\n"; return SDK_FAILURE; } if(sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } std::string extensions = devices[deviceId].getInfo<CL_DEVICE_EXTENSIONS>(); std::string buildOptions = std::string(""); // Check if cl_khr_fp64 extension is supported if(strstr(extensions.c_str(), "cl_khr_fp64")) { buildOptions.append("-D KHR_DP_EXTENSION"); } else { // Check if cl_amd_fp64 extension is supported if(!strstr(extensions.c_str(), "cl_amd_fp64")) { OPENCL_EXPECTED_ERROR("Device does not support cl_amd_fp64 extension!"); } } cl_uint localMemType; // Get device specific information status = devices[deviceId].getInfo<cl_uint>( CL_DEVICE_LOCAL_MEM_TYPE, &localMemType); CHECK_OPENCL_ERROR(status, "Device::getInfo CL_DEVICE_LOCAL_MEM_TYPE) failed."); // If scratchpad is available then update the flag if(localMemType == CL_LOCAL) lds = true; // Get Device specific Information status = devices[deviceId].getInfo<size_t>( CL_DEVICE_MAX_WORK_GROUP_SIZE, &maxWorkGroupSize); CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed."); status = devices[deviceId].getInfo<cl_uint>( CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &maxDimensions); CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed."); maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); std::vector<size_t> workItems = devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(); for(cl_uint i = 0; i < maxDimensions; ++i) maxWorkItemSizes[i] = workItems[i]; status = devices[deviceId].getInfo<cl_ulong>( CL_DEVICE_LOCAL_MEM_SIZE, &totalLocalMemory); CHECK_OPENCL_ERROR(status, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed."); // Set command queue properties cl_command_queue_properties prop = 0; if(!eAppGFLOPS) prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = cl::CommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "CommandQueue::CommandQueue() failed."); // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create buffer for matrix A inputBufA = cl::Buffer( context, inMemFlags, sizeof(cl_double) * widthA * heightA, NULL, &status); CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufA)"); // Create buffer for matrix B inputBufB = cl::Buffer( context, inMemFlags, sizeof(cl_double) * widthB * heightB, NULL, &status); CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (inputBufB)"); outputBuf = cl::Buffer( context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_double) * heightA * widthB, NULL, &status); CHECK_OPENCL_ERROR(status, "cl::Buffer failed. (outputBuf)"); device.push_back(devices[deviceId]); // create a CL program using the kernel source streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Binaries programBinary(1,std::make_pair( (const void*)kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, device, programBinary, NULL, &status); CHECK_OPENCL_ERROR(status, "Program::Program(Binary) failed."); } else { kernelPath.append("MatrixMulDouble_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Sources programSource( 1, std::make_pair(kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, programSource, &status); CHECK_OPENCL_ERROR(status, "Program::Program(Source) failed."); } std::string flagsStr = std::string(""); // Get build options if any flagsStr.append(buildOptions.c_str()); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; status = program.build(device, flagsStr.c_str()); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } } CHECK_OPENCL_ERROR(status, "Program::build() failed."); // Create kernel // If local memory is present then use the specific kernel if(lds) kernel = cl::Kernel(program, "mmmKernel_local", &status); else kernel = cl::Kernel(program, "mmmKernel", &status); CHECK_OPENCL_ERROR(status, "cl::Kernel failed."); status = kernel.getWorkGroupInfo<cl_ulong>( devices[deviceId], CL_KERNEL_LOCAL_MEM_SIZE, &usedLocalMemory); CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed" ".(usedLocalMemory)"); availableLocalMemory = totalLocalMemory - usedLocalMemory; if(lds) neededLocalMemory = (blockSize * 4) * (blockSize * 4) * sizeof(cl_double); else neededLocalMemory = 0; if(neededLocalMemory > availableLocalMemory) { std::cout << "Unsupported: Insufficient local memory on device." << std::endl; return SDK_FAILURE; } // Check group size against group size returned by kernel kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &status); CHECK_OPENCL_ERROR(status, "Kernel::getWorkGroupInfo() failed."); if((cl_uint)(blockSize * blockSize) > kernelWorkGroupSize) { if(kernelWorkGroupSize >= 64) blockSize = 8; else if(kernelWorkGroupSize >= 32) blockSize = 4; else { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSize * blockSize << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize<<std::endl; return SDK_FAILURE; } } if(blockSize > maxWorkItemSizes[0] || blockSize > maxWorkItemSizes[1] || blockSize * blockSize > maxWorkGroupSize) { sampleCommon->error("Unsupported: Device does not support requested number of work items."); return SDK_FAILURE; } return SDK_SUCCESS; }
int BinomialOption::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); status = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } // Create and initialize memory objects // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; // if(isAmdPlatform()) // inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for stock price randBuffer = clCreateBuffer(context, inMemFlags, numSamples * sizeof(cl_float4), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffer)"); // Create memory object for output array outBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, numSamples * sizeof(cl_float4), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("BinomialOption_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "binomial_options", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); // If group-size is gerater than maximum supported on kernel if((size_t)(numSteps + 1) > kernelInfo.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << (numSteps + 1) << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; std::cout << "Using appropiate group-size." << std::endl; std::cout << "-------------------------------------------" << std::endl; } numSteps = (cl_int)kernelInfo.kernelWorkGroupSize - 2; } return SDK_SUCCESS; }
int GaussianNoise::setupCL() { cl_int err = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ err = cl::Platform::get(&platforms); CHECK_OPENCL_ERROR(err, "Platform::get() failed."); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { if(isPlatformEnabled()) { i = platforms.begin() + platformId; } else { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context(dType, cps, NULL, NULL, &err); CHECK_OPENCL_ERROR(err, "Context::Context() failed."); devices = context.getInfo<CL_CONTEXT_DEVICES>(); CHECK_OPENCL_ERROR(err, "Context::getInfo() failed."); std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; int deviceCount = (int)devices.size(); int j = 0; for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j) { std::cout << "Device " << j << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } std::cout << "\n"; if (deviceCount == 0) { std::cerr << "No device available\n"; return SDK_FAILURE; } if(sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err); CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed."); /* * Create and initialize memory objects */ // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for input Image inputImageBuffer = cl::Buffer(context, inMemFlags, width * height * pixelSize, 0, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)"); // Create memory object for output Image outputImageBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY, width * height * pixelSize, NULL, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)"); device.push_back(devices[deviceId]); // create a CL program using the kernel source streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Binaries programBinary(1,std::make_pair( (const void*)kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, device, programBinary, NULL, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Binary) failed."); } else { kernelPath.append("GaussianNoise_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Sources programSource(1, std::make_pair(kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, programSource, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Source) failed."); } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; err = program.build(device, flagsStr.c_str()); if(err != CL_SUCCESS) { if(err == CL_BUILD_PROGRAM_FAILURE) { std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } } CHECK_OPENCL_ERROR(err, "Program::build() failed."); // Create kernel kernel = cl::Kernel(program, "gaussian_transform", &err); CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed."); // Check group size against group size returned by kernel kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &err); CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo() failed."); if((blockSizeX * blockSizeY) > kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelWorkGroupSize << std::endl; } if(blockSizeX > kernelWorkGroupSize) { blockSizeX = kernelWorkGroupSize; blockSizeY = 1; } } return SDK_SUCCESS; }
int ScanLargeArrays::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateContextFromType failed."); status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); if(sampleCommon->checkVal( status, 0, "clCreateCommandQueue failed.")) return SDK_FAILURE; } // Get Device specific Information //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("ScanLargeArrays_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name bScanKernel = clCreateKernel(program, "ScanLargeArrays", &status); CHECK_OPENCL_ERROR(status,"clCreateKernel failed.(bScanKernel)"); bAddKernel = clCreateKernel(program, "blockAddition", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(bAddKernel)"); // get a kernel object handle for a kernel with the given name pScanKernel = clCreateKernel(program, "prefixSum", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(pScanKernel)"); status = kernelInfoBScan.setKernelWorkGroupInfo(bScanKernel,devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed"); status = kernelInfoBAdd.setKernelWorkGroupInfo(pScanKernel,devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed"); status = kernelInfoPScan.setKernelWorkGroupInfo(bAddKernel,devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed"); // Find munimum of all kernel's group-sizes size_t temp = min(kernelInfoBScan.kernelWorkGroupSize, kernelInfoPScan.kernelWorkGroupSize); temp = (temp > kernelInfoBAdd.kernelWorkGroupSize) ? kernelInfoBAdd.kernelWorkGroupSize : temp; if(blockSize > (cl_uint)temp) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSize << std::endl; std::cout << "Max Group Size supported on the kernel : " << temp << std::endl; std::cout << "Falling back to " << temp << std::endl; } blockSize = (cl_uint)temp; } blockSize = min(blockSize,length/2); // Calculate number of passes required float t = log((float)length) / log((float)blockSize); pass = (cl_uint)t; // If t is equal to pass if(fabs(t - (float)pass) < 1e-7) { pass--; } // Create input buffer on device inputBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, sizeof(cl_float) * length, 0, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(inputBuffer)"); // Allocate output buffers outputBuffer = (cl_mem*)malloc(pass * sizeof(cl_mem)); for(int i = 0; i < (int)pass; i++) { int size = (int)(length / pow((float)blockSize,(float)i)); outputBuffer[i] = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_float) * size, 0, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(outputBuffer)"); } // Allocate blockSumBuffers blockSumBuffer = (cl_mem*)malloc(pass * sizeof(cl_mem)); for(int i = 0; i < (int)pass; i++) { int size = (int)(length / pow((float)blockSize,(float)(i + 1))); blockSumBuffer[i] = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_float) * size, 0, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(blockSumBuffer)"); } // Create a tempBuffer on device int tempLength = (int)(length / pow((float)blockSize, (float)pass)); tempBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * tempLength, 0, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(tempBuffer)"); return SDK_SUCCESS; }
int BoxFilterSeparable::setupCL() { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); // If we could find our platform, use it. Otherwise use just available platform. cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed."); } //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed"); // Create and initialize memory objects // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for input Image inputImageBuffer = clCreateBuffer( context, inMemFlags, width * height * pixelSize, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputImageBuffer)"); // Create memory object for temp Image tempImageBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, width * height * pixelSize, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (tempImageBuffer)"); // Create memory objects for output Image outputImageBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * height * pixelSize, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputImageBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("BoxFilter_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name verticalKernel = clCreateKernel(program, "box_filter_vertical", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed. (vertical)"); #ifdef USE_LDS horizontalKernel = clCreateKernel(program, "box_filter_horizontal_local", &status); #else horizontalKernel = clCreateKernel(program, "box_filter_horizontal", &status); #endif CHECK_OPENCL_ERROR(status, "clCreateKernel failed. (horizontal)"); status = kernelInfoH.setKernelWorkGroupInfo(horizontalKernel, devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, "setKErnelWorkGroupInfo() failed"); status = kernelInfoV.setKernelWorkGroupInfo(verticalKernel, devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, "setKErnelWorkGroupInfo() failed"); if((blockSizeX * blockSizeY) > kernelInfoV.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfoV.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfoV.kernelWorkGroupSize << std::endl; } // Three possible cases if(blockSizeX > kernelInfoV.kernelWorkGroupSize) { blockSizeX = kernelInfoV.kernelWorkGroupSize; blockSizeY = 1; } } return SDK_SUCCESS; }
int FastWalshTransform::setupCL(void) { cl_int status = 0; size_t deviceListSize; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if(isPlatformEnabled()) { platform = platforms[platformId]; } else { for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) return SDK_FAILURE; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id *)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetGetContextInfo failed.")) return SDK_FAILURE; { /* The block is to move the declaration of prop closer to its use */ cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); if(!sampleCommon->checkVal( status, 0, "clCreateCommandQueue failed.")) return SDK_FAILURE; } inputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_float) * length, 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputBuffer)")) return SDK_FAILURE; /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "(3) Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) return SDK_FAILURE; } else { // special case for packetized OpenCL (can not yet compile .cl directly) char vName[100]; status = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(vName), vName, NULL); const bool platformIsPacketizedOpenCL = !strcmp(vName, "Ralf Karrenberg, Saarland University"); if (!strcmp(vName, "Intel(R) Corporation")) { vendorName = "intel"; } else if (!strcmp(vName, "Advanced Micro Devices, Inc.")) { vendorName = "amd"; } else if (platformIsPacketizedOpenCL) { vendorName = "pkt"; } else { printf("ERROR: vendor not recognized: %s\n", vName); } kernelPath.append("FastWalshTransform_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "(4) Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, &devices[deviceId], flagsStr.c_str(), NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel(program, "fastWalshTransform", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; }
int DeviceFission::setupCLRuntime() { cl_int status = CL_SUCCESS; // Create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("DeviceFission_Kernels.cl"); buildData.devices = Devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); // Get allocate memory for subCmdQueue subCmdQueue = (cl_command_queue*)malloc(numSubDevices * sizeof(cl_command_queue)); CHECK_ALLOCATION(subCmdQueue,"Failed to allocate memory. (subCmdQueue)"); // Create command queue subCmdQueue for(cl_uint i = 0; i < numSubDevices; i++) { // Create command queue subCmdQueue[i] = clCreateCommandQueue(rContext, subDevices[i], 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (subCmdQueue)"); } // Create command queue gpuCmdQueue gpuCmdQueue = clCreateCommandQueue(rContext, gpuDevice, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (gpuCmdQueue)"); // Create memory objects for input InBuf = clCreateBuffer(rContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, length * sizeof(cl_int), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (InBuf)"); // Get allocate memory for sub devices output subOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem)); for(cl_uint i = 0; i < numSubDevices; i++) { // Create memory objects for sub devices output subOutBuf[i] = clCreateBuffer(rContext, CL_MEM_WRITE_ONLY, half_length * sizeof(cl_int) , NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (subOutBuf)"); } // Get allocate memory for GPU device output gpuOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem)); for(cl_uint i = 0; i < numSubDevices; i++) { // Create memory objects for GPU device output gpuOutBuf[i] = clCreateBuffer(rContext, CL_MEM_WRITE_ONLY, half_length * sizeof(cl_int) , NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (gpuOutBuf)"); } streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); char * source = NULL; size_t sourceSize[] = {0}; char * binary = NULL; size_t binarySize = 0; if(isLoadBinaryEnabled()) { kernelPath += loadBinary; if(kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } // Get binaries and binary sizes for CPU devices char** subBinaries = (char**)malloc(numSubDevices * sizeof(char*)); if(subBinaries == NULL) { sampleCommon->error("Failed to allocate memory(subBinaries)"); return SDK_FAILURE; } size_t* subBinariesSize = (size_t*)malloc(numSubDevices * sizeof(size_t*)); if(subBinariesSize == NULL) { sampleCommon->error("Failed to allocate memory(subBinariesSize)"); return SDK_FAILURE; } for(cl_uint i = 0; i < numSubDevices; ++i) { subBinaries[i] = (char*)kernelFile.source().c_str(); subBinariesSize[i] = kernelFile.source().size(); } subProgram = clCreateProgramWithBinary(rContext, numSubDevices, subDevices, (const size_t *)subBinariesSize, (const unsigned char**)subBinaries, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(subProgram)"); streamsdk::SDKFile kernelFileGPU; std::string kernelPathGPU = sampleCommon->getPath(); if(!gpuAvailable) { loadBinaryGPU = loadBinary; } kernelPathGPU += loadBinaryGPU; if(loadBinaryGPU.length() == 0) { std::cout << "Failed to load GPU kernel file, please assign it by '--loadgpu'. "<< std::endl; return SDK_FAILURE; } if(kernelFileGPU.readBinaryFromFile(kernelPathGPU.c_str())) { std::cout << "Failed to load GPU kernel file : " << kernelPathGPU << std::endl; return SDK_FAILURE; } // Get binaries and binary sizes for GPU device char* subBinariesGPU; size_t subBinariesSizeGPU;; subBinariesGPU = (char*)kernelFileGPU.source().c_str(); subBinariesSizeGPU = kernelFileGPU.source().size(); gpuProgram = clCreateProgramWithBinary(rContext, 1, &gpuDevice, &subBinariesSizeGPU, (const unsigned char **)&subBinariesGPU, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(gpuProgram)"); free(subBinaries); free(subBinariesSize); subBinariesSize = NULL; subBinaries = NULL; } else { kernelPath.append("DeviceFission_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str()))//bool { std::cout << "Failed to load kernel file: " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; // Create a CL program for sub-devices using the kernel source subProgram = clCreateProgramWithSource(rContext, 1, (const char**)&source, sourceSize, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(subProgram)"); // Create a CL program for GPU device using the kernel source gpuProgram = clCreateProgramWithSource(rContext, 1, (const char**)&source, sourceSize, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(gpuProgram)"); } // Get build options const char *flags; streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); if(buildData.flagsFileName.size() != 0) { flagsPath.append(buildData.flagsFileName.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); flags = flagsFile.source().c_str(); if(strlen(flags) != 0) std::cout << "Build Options are : " << flags << std::endl; } else { flags = NULL; } // Create a cl program executable for all sub-devices status = clBuildProgram(subProgram, numSubDevices, subDevices, flags, NULL, NULL); CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(subProgram)"); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(subProgram, subDevices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(NULL == buildLog) { sampleCommon->error("Failed to allocate host memory.(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(subProgram, subDevices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG(SUB-DEVICES)\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed. (SUB-DEVICES)")) return SDK_FAILURE; } // Create a cl program executable for GPU device status = clBuildProgram(gpuProgram, 1, &gpuDevice, flags, NULL, NULL); CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(gpuProgram)"); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(gpuProgram, gpuDevice, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(NULL == buildLog) { sampleCommon->error("Failed to allocate host memory.(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(gpuProgram, gpuDevice, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG(GPU-DEVICE)\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed. (GPU-DEVICE)")) return SDK_FAILURE; } // Get a kernel object handle for a kernel with the given name subKernel[0] = clCreateKernel(subProgram, "Add", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[0])"); // Get a kernel object handle for a kernel with the given name subKernel[1] = clCreateKernel(subProgram, "Sub", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[1])"); // Get a kernel object handle for a kernel with the given name gpuKernel[0] = clCreateKernel(gpuProgram, "Add", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(gpuKernel[0])"); // Get a kernel object handle for a kernel with the given name gpuKernel[1] = clCreateKernel(gpuProgram, "Sub", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(gpuKernel[1])"); return SDK_SUCCESS; }
int PrefixSum::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon->getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); //Set device info of given cl_device_id status = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; inputBuffer = clCreateBuffer( context, inMemFlags, sizeof(cl_float) * length, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)"); outputBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float) * length, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("PrefixSum_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "prefixSum", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); return SDK_SUCCESS; }
int MatrixTranspose::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_ACCELERATOR; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); // Get Device specific Information, Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_ERROR(status, 0, "clCreateCommandQueue failed."); } // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; /* if(isAmdPlatform()) // To achieve best performance, use persistent memory together with // clEnqueueMapBuffer (instead of clEnqeueRead/Write). // At the same time, in general, the best performance is the function // of access pattern and size of the buffer. inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;*/ inputBuffer = clCreateBuffer( context, inMemFlags, sizeof(cl_float) * width * height, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)"); outputBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * width * height, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("MatrixTranspose_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "matrixTranspose", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, "setKErnelWorkGroupInfo() failed"); availableLocalMemory = deviceInfo.localMemSize - kernelInfo.localMemoryUsed; // each work item is going to work on [elemsPerThread1Dim x elemsPerThread1Dim] matrix elements, // therefore the total size of needed local memory is calculated as // # of WIs in a group multiplied by # of matrix elements per a WI neededLocalMemory = blockSize * blockSize * elemsPerThread1Dim * elemsPerThread1Dim * sizeof(cl_float); if(neededLocalMemory > availableLocalMemory) { std::cout << "Unsupported: Insufficient local memory on device." << std::endl; return SDK_FAILURE; } if((cl_uint)(blockSize * blockSize) > kernelInfo.kernelWorkGroupSize) { if(kernelInfo.kernelWorkGroupSize >= 64) blockSize = 8; else if(kernelInfo.kernelWorkGroupSize >= 32) blockSize = 4; else { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSize * blockSize << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; return SDK_FAILURE; } } if(blockSize > deviceInfo.maxWorkItemSizes[0] || blockSize > deviceInfo.maxWorkItemSizes[1] || (size_t)blockSize * blockSize > deviceInfo.maxWorkGroupSize) { std::cout << "Unsupported: Device does not support requested number of work items." << std::endl; return SDK_FAILURE; } return SDK_SUCCESS; }
int GlobalMemoryBandwidth::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); std::string deviceStr(deviceInfo.deviceVersion); size_t vStart = deviceStr.find(" ", 0); size_t vEnd = deviceStr.find(" ", vStart + 1); std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1); #ifdef CL_VERSION_1_1 if(vStrVal.compare("1.0") > 0) { char openclVersion[1024]; status = clGetDeviceInfo(devices[deviceId], CL_DEVICE_OPENCL_C_VERSION, sizeof(openclVersion), openclVersion, 0); CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed."); std::string tempStr(openclVersion); size_t dotPos = tempStr.find_first_of("."); size_t spacePos = tempStr.find_last_of(" "); tempStr = tempStr.substr(dotPos + 1, spacePos - dotPos); int minorVersion = atoi(tempStr.c_str()); // OpenCL 1.1 has inbuilt support for vec3 data types if(minorVersion < 1 && vec3 == true) { OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!"); } } else { // OpenCL 1.1 has inbuilt support for vec3 data types if(vec3 == true) { OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!"); } } #else // OpenCL 1.1 has inbuilt support for vec3 data types if(vec3 == true) { OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!"); } #endif { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } cl_uint sizeElement = vectorSize * sizeof(cl_float); cl_uint readLength = length + (NUM_READS * 1024 / sizeElement) + EXTRA_BYTES; cl_uint size = readLength * vectorSize * sizeof(cl_float); // Create input buffer inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, size, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inputBuffer)"); // Write data to buffer status = clEnqueueWriteBuffer(commandQueue, inputBuffer, 1, 0, size, input, 0, 0, 0); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (inputBuffer)"); outputBufferReadSingle = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * vectorSize * length, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadSingle)"); // Write data to buffer status = clEnqueueWriteBuffer(commandQueue, outputBufferReadSingle, CL_TRUE, 0, sizeof(cl_float) * vectorSize * length, outputReadSingle, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadSingle)"); outputBufferReadLinear = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * vectorSize * length, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadLinear)"); // Write data to buffer status = clEnqueueWriteBuffer(commandQueue, outputBufferReadLinear, CL_TRUE, 0, sizeof(cl_float) * vectorSize * length, outputReadLinear, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadLinear)"); outputBufferReadLU = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * vectorSize * length, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferReadLU)"); // Write data to buffer status = clEnqueueWriteBuffer(commandQueue, outputBufferReadLU, CL_TRUE, 0, sizeof(cl_float) * vectorSize * length, outputReadLU, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferReadLU)"); outputBufferWriteLinear = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBufferWriteLinear)"); // Write data to buffer status = clEnqueueWriteBuffer(commandQueue, outputBufferWriteLinear, CL_TRUE, 0, size, outputWriteLinear, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed. (outputBufferWriteLinear)"); // create a CL program using the kernel source char buildOption[128]; if(vectorSize == 1) sprintf(buildOption, "-D DATATYPE=float -D OFFSET=%d ", OFFSET); else sprintf(buildOption, "-D DATATYPE=float%d -D OFFSET=%d ", (vec3 == true) ? 3 : vectorSize, OFFSET); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("GlobalMemoryBandwidth_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(buildOption); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // Global memory bandwidth from read-single access kernel[0] = clCreateKernel(program, "read_single", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_single)"); // Global memory bandwidth from read-linear access kernel[1] = clCreateKernel(program, "read_linear", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_linear)"); // Global memory bandwidth from read-linear access kernel[2] = clCreateKernel(program, "read_linear_uncached", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(read_linear_uncached)"); // Global memory bandwidth from write-linear access kernel[3] = clCreateKernel(program, "write_linear", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(GlobalBandwidth_write_linear)"); return SDK_SUCCESS; }
int EigenValue::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); // If we could find our platform, use it. Otherwise use just available platform. cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, 0, "sampleCommon::getDevices() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // cl mem to store the diagonal elements of the matrix diagonalBuffer = clCreateBuffer( context, inMemFlags, sizeof(cl_float) * length, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (diagonalBuffer)"); // cl mem to store the number of eigenvalues in each interval numEigenValuesIntervalBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * length, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (diagonalBuffer)"); // cl mem to store the offDiagonal elements of the matrix offDiagonalBuffer = clCreateBuffer( context, inMemFlags, sizeof(cl_float) * (length-1), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (offDiagonalBuffer)"); // cl mem to store the eigenvalue intervals for(int i = 0 ; i < 2 ; ++ i) { eigenIntervalBuffer[i] = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * length * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (eigenIntervalBuffer)"); } // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("EigenValue_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string("-x clc++"); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel[0] = clCreateKernel(program, "calNumEigenValueInterval", &status); if(sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; // get a kernel object handle for a kernel with the given name kernel[1] = clCreateKernel(program, "recalculateEigenIntervals", &status); if(sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; }
int DwtHaar1D::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); // If we could find our platform, use it. Otherwise use just available platform. cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed"); // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; inDataBuf = clCreateBuffer(context, inMemFlags, sizeof(cl_float) * signalLength, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (inDataBuf)"); dOutDataBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, signalLength * sizeof(cl_float), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (dOutDataBuf)"); dPartialOutDataBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, signalLength * sizeof(cl_float), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (dPartialOutDataBuf)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("DwtHaar1DCPPKernel_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string("-x clc++ "); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, 0, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "dwtHaar1D", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); status = kernelInfo.setKernelWorkGroupInfo(kernel,devices[deviceId]); CHECK_ERROR(status, SDK_SUCCESS, " setKernelWorkGroupInfo() failed"); return SDK_SUCCESS; }
int MathBenchmark::setupCL(void) { cl_int status = 0; cl_device_type dType; if (deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if (isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); maxWorkGroup = deviceInfo.maxWorkGroupSize; max_mem_alloc_size = deviceInfo.maxMemAllocSize; while (maxMemSize <= (unsigned int) (max_mem_alloc_size)) { maxMemSize *= 2; } maxMemSize /= 2; if (maxMemSize > 134217728 && dType == CL_DEVICE_TYPE_CPU) { maxMemSize = 134217728; } std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE:\t" << maxWorkGroup << std::endl; std::cout << "MaxMemSize:\t" << maxMemSize / (1024 * 1024) << "MB" << std::endl; { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("mathoper.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if (isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if (isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); std::string s; std::stringstream ss(s); ss << "kernel_asinh_withDD"; ss << vectorSize; // Create the cKermel_kernel_asinh_withDD kernel[0] = clCreateKernel(program, ss.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinh_withDD)"); std::stringstream asinh_withoutDD(s); asinh_withoutDD << "kernel_asinh_withoutDD"; asinh_withoutDD << vectorSize; //dumpPTXCode(context,program,asinh_withoutDD.str().c_str()); // Create the cKermel_kernel_asinh_withoutDD kernel[1] = clCreateKernel(program, asinh_withoutDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinh_withoutDD)"); std::stringstream acosh_withDD(s); acosh_withDD << "kernel_acosh_withDD"; acosh_withDD << vectorSize; //dumpPTXCode(context,program,acosh_withDD.str().c_str()); // Create the cKermel_kernel_acosh_withDD kernel[2] = clCreateKernel(program, acosh_withDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_acosh_withDD)"); std::stringstream acosh_withoutDD(s); acosh_withoutDD << "kernel_acosh_withoutDD"; acosh_withoutDD << vectorSize; //dumpPTXCode(context,program,acosh_withoutDD.str().c_str()); // Create the cKermel_kernel_acosh_withoutDD kernel[3] = clCreateKernel(program, acosh_withoutDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_acosh_withoutDD)"); std::stringstream atanh_withDD(s); atanh_withDD << "kernel_atanh_withDD"; atanh_withDD << vectorSize; //dumpPTXCode(context,program,atanh_withDD.str().c_str()); // Create the cKermel_kernel_atanh_withDD kernel[4] = clCreateKernel(program, atanh_withDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_atanh_withDD)"); std::stringstream atanh_withoutDD(s); atanh_withoutDD << "kernel_atanh_withoutDD"; atanh_withoutDD << vectorSize; //dumpPTXCode(context,program,atanh_withoutDD.str().c_str()); // Create the cKermel_kernel_atanh_withoutDD kernel[5] = clCreateKernel(program, atanh_withoutDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_atanh_withoutDD)"); std::stringstream asinpi_withDD(s); asinpi_withDD << "kernel_asinpi_withDD"; asinpi_withDD << vectorSize; //dumpPTXCode(context,program,asinpi_withDD.str().c_str()); // Create the cKermel_kernel_asinpi_withDD kernel[6] = clCreateKernel(program, asinpi_withDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinpi_withDD)"); std::stringstream asinpi_withoutDD(s); asinpi_withoutDD << "kernel_asinpi_withoutDD"; asinpi_withoutDD << vectorSize; //dumpPTXCode(context,program,asinpi_withoutDD.str().c_str()); // Create the cKermel_kernel_asinpi_withoutDD kernel[7] = clCreateKernel(program, asinpi_withoutDD.str().c_str(), &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel_asinpi_withoutDD)"); return SDK_SUCCESS; }
int NBody::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } //Exit if deviceId option is used if(isDeviceIdEnabled()) { sampleCommon->expectedError("-d(--deviceId) is not a supported"); return SDK_EXPECTED_FAILURE; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if(isPlatformEnabled()) { platform = platforms[platformId]; } else { for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } size_t deviceListSize; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Create command queue */ commandQueue = clCreateCommandQueue( context, devices[deviceId], 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } /* Get Device specific Information */ status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return SDK_FAILURE; status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return SDK_FAILURE; maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return SDK_FAILURE; status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void *)&totalLocalMemory, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed.")) return SDK_FAILURE; /* * Create and initialize memory objects */ /* Create memory objects for position */ currPos = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (oldPos)")) { return SDK_FAILURE; } /* Initialize position buffer */ status = clEnqueueWriteBuffer(commandQueue, currPos, 1, 0, numBodies * sizeof(cl_float4), pos, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (oldPos)")) { return SDK_FAILURE; } /* Create memory objects for position */ newPos = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (newPos)")) { return SDK_FAILURE; } /* Create memory objects for velocity */ currVel = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (oldVel)")) { return SDK_FAILURE; } /* Initialize velocity buffer */ status = clEnqueueWriteBuffer(commandQueue, currVel, 1, 0, numBodies * sizeof(cl_float4), vel, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (oldVel)")) { return SDK_FAILURE; } /* Create memory objects for velocity */ newVel = clCreateBuffer( context, CL_MEM_READ_ONLY, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (newVel)")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { // special case for packetized OpenCL (can not yet compile .cl directly) char vName[100]; status = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(vName), vName, NULL); const bool platformIsPacketizedOpenCL = !strcmp(vName, "Ralf Karrenberg, Saarland University"); if (!strcmp(vName, "Intel(R) Corporation")) { vendorName = "intel"; } else if (!strcmp(vName, "Advanced Micro Devices, Inc.")) { vendorName = "amd"; } else if (platformIsPacketizedOpenCL) { vendorName = "pkt"; } else { printf("ERROR: vendor not recognized: %s\n", vName); } kernelPath.append("NBody_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, &devices[deviceId], flagsStr.c_str(), NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel( program, "nbody_sim", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) { return SDK_FAILURE; } return SDK_SUCCESS; }
int HDRToneMapping::setupCL() { cl_int err = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ err = cl::Platform::get(&platforms); CHECK_OPENCL_ERROR(err, "Platform::get() failed."); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { if(isPlatformEnabled()) { i = platforms.begin() + platformId; } else { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context(dType, cps, NULL, NULL, &err); CHECK_OPENCL_ERROR(err, "Context::Context() failed."); devices = context.getInfo<CL_CONTEXT_DEVICES>(); CHECK_OPENCL_ERROR(err, "Context::getInfo() failed."); std::cout << "Platform :" << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << "\n"; int deviceCount = (int)devices.size(); int j = 0; for (std::vector<cl::Device>::iterator i = devices.begin(); i != devices.end(); ++i, ++j) { std::cout << "Device " << j << " : "; std::string deviceName = (*i).getInfo<CL_DEVICE_NAME>(); std::cout << deviceName.c_str() << "\n"; } std::cout << "\n"; if (deviceCount == 0) { std::cout << "No device available\n"; return SDK_FAILURE; } if(sampleCommon->validateDeviceId(deviceId, deviceCount) != SDK_SUCCESS) { std::cout << "sampleCommon::validateDeviceId() failed"; return SDK_FAILURE; } // Get Device specific Information err = devices[deviceId].getInfo<size_t>( CL_DEVICE_MAX_WORK_GROUP_SIZE, &maxWorkGroupSize); CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed."); err = devices[deviceId].getInfo<cl_uint>( CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &maxDimensions); CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) failed."); maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); std::vector<size_t> workItems = devices[deviceId].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(); for(cl_uint i = 0; i < maxDimensions; ++i) maxWorkItemSizes[i] = workItems[i]; err = devices[deviceId].getInfo<cl_ulong>( CL_DEVICE_LOCAL_MEM_SIZE, &totalLocalMemory); CHECK_OPENCL_ERROR(err, "Device::getInfo(CL_DEVICE_LOCAL_MEM_SIZES) failed."); commandQueue = cl::CommandQueue(context, devices[deviceId], 0, &err); CHECK_OPENCL_ERROR(err, "CommandQueue::CommandQueue() failed."); /* * Create and initialize memory objects */ // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for input Image /** * We use CL_MEM_USE_HOST_PTR for CPU as the CPU device is running the kernel * on the actual buffer provided by the application */ if (dType == CL_DEVICE_TYPE_CPU) { inputImageBuffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, width * height * numChannels * sizeof(cl_float), input, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)"); // Create memory object for output Image outputImageBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, width * height * numChannels * sizeof(cl_float), output, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)"); } else if (dType == CL_DEVICE_TYPE_GPU) { inputImageBuffer = cl::Buffer(context, inMemFlags, width * height * numChannels * sizeof(cl_float), 0, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (inputImageBuffer)"); // Create memory object for output Image outputImageBuffer = cl::Buffer(context, CL_MEM_WRITE_ONLY, width * height * numChannels * sizeof(cl_float), NULL, &err); CHECK_OPENCL_ERROR(err, "Buffer::Buffer() failed. (outputImageBuffer)"); } device.push_back(devices[deviceId]); // create a CL program using the kernel source streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Binaries programBinary(1,std::make_pair( (const void*)kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, device, programBinary, NULL, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Binary) failed."); } else { kernelPath.append("HDRToneMapping_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } cl::Program::Sources programSource(1, std::make_pair(kernelFile.source().data(), kernelFile.source().size())); program = cl::Program(context, programSource, &err); CHECK_OPENCL_ERROR(err, "Program::Program(Source) failed."); } std::string flagsStr = std::string(""); // Get additional options if(isComplierFlagsSpecified()) { streamsdk::SDKFile flagsFile; std::string flagsPath = sampleCommon->getPath(); flagsPath.append(flags.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); const char * flags = flagsFile.source().c_str(); flagsStr.append(flags); } if(flagsStr.size() != 0) std::cout << "Build Options are : " << flagsStr.c_str() << std::endl; err = program.build(device, flagsStr.c_str()); if(err != CL_SUCCESS) { if(err == CL_BUILD_PROGRAM_FAILURE) { std::string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[deviceId]); std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << str << std::endl; std::cout << " ************************************************\n"; } } CHECK_OPENCL_ERROR(err, "Program::build() failed."); // Create kernel kernel = cl::Kernel(program, "toneMappingPattanaik", &err); CHECK_OPENCL_ERROR(err, "Kernel::Kernel() failed."); // Check group size against group size returned by kernel kernelWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[deviceId], &err); CHECK_OPENCL_ERROR(err, "Kernel::getWorkGroupInfo() failed."); /** * For CPU device the kernel work group size is 1024. * Workgroup creation/replacement is an overhead - * avoid workgroups with small number of workitems (we pay more for replacing a WG than running more WI in a for loop). */ if (kernelWorkGroupSize >= 1024) { blockSizeX = 32; blockSizeY = 32; } if((cl_uint)(blockSizeX * blockSizeY) > kernelWorkGroupSize) { if(kernelWorkGroupSize >= 64) { blockSizeX = 8; blockSizeY = 8; } else if(kernelWorkGroupSize >= 32) { blockSizeX = 4; blockSizeY = 4; } else { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize<<std::endl; return SDK_FAILURE; } } if(blockSizeX > maxWorkItemSizes[0] || blockSizeY > maxWorkItemSizes[1] || blockSizeX * blockSizeY > maxWorkGroupSize) { std::cout << "Unsupported: Device does not support requested number of work items." << std::endl; return SDK_FAILURE; } return SDK_SUCCESS; }