bool Assignment::InitCLResources() { std::cout << "InitCLResources(): Initialize the opencl buffers on the device" << std::endl; //clCreateBuffer: context, flags, size, *host_ptr, *error cl_int clError; //training data this->d_trainingInputBuffer = clCreateBuffer( this->h_CLContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * this->trainingData->numberOfSamples * this->trainingData->numberOfInputs, this->trainingInputBuffer, &clError ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_trainingInputBuffer"); this->d_trainingLabelBuffer = clCreateBuffer( this->h_CLContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * this->trainingData->numberOfSamples * this->trainingData->numberOfOutputs, this->trainingLabelBuffer, &clError ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_trainingLabelBuffer"); //weight buffers and delta update buffers for (unsigned int i = 0; i < this->sizeOfWeightBuffer.size(); i++) { this->d_weightBuffers.push_back( clCreateBuffer( this->h_CLContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * this->sizeOfWeightBuffer[i], this->h_weightBuffers[i], &clError ) ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_weightBuffers[]"); this->d_deltaUpdates.push_back( clCreateBuffer( this->h_CLContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * this->sizeOfWeightBuffer[i], this->h_weightBuffers[i], &clError ) ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_deltaUpdates[]"); } //partial result buffers and delta buffers for (unsigned int i = 0; i < this->hiddenLayers.size(); i++) { //weight buffer this->d_partialResults.push_back( clCreateBuffer( this->h_CLContext, CL_MEM_READ_WRITE, sizeof(float) * this->hiddenLayers[i] * this->parallelBackpropagationSize, NULL, &clError ) ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_partialResults[]"); //delta buffer this->d_deltaBuffer.push_back( clCreateBuffer( this->h_CLContext, CL_MEM_READ_WRITE, sizeof(float) * this->hiddenLayers[i] * this->parallelBackpropagationSize, NULL, &clError ) ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_deltaBuffer[]"); } //output layer partial results buffer this->d_partialResults.push_back( clCreateBuffer( this->h_CLContext, CL_MEM_READ_WRITE, sizeof(float) * this->trainingData->numberOfOutputs * this->parallelBackpropagationSize, NULL, &clError ) ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_partialResults[]"); //output layer partial results buffer this->d_deltaBuffer.push_back( clCreateBuffer( this->h_CLContext, CL_MEM_READ_WRITE, sizeof(float) * this->trainingData->numberOfOutputs * this->parallelBackpropagationSize, NULL, &clError ) ); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_deltaBuffer[]"); //crossEntropy buffer this->d_crossEntropy = clCreateBuffer(this->h_CLContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &clError); V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_crossEntropy"); //load and compile kernels std::string programCode; //size_t programSize = 0; CLUtil::LoadProgramSourceToMemory("neuronalNet.cl", programCode); this->h_Program = CLUtil::BuildCLProgramFromMemory(this->h_CLDevice, this->h_CLContext, programCode); if(this->h_Program == nullptr) return false; //create kernels h_feedForwardKernel = clCreateKernel(this->h_Program, "feedForward", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: feedForward."); h_softMaxKernel = clCreateKernel(this->h_Program, "softMax", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: softMax."); h_zeroBufferKernel = clCreateKernel(this->h_Program, "zeroBuffer", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: zeroBuffer."); h_gradientDescentOutputLayerKernel = clCreateKernel(this->h_Program, "gradientDescentOutputLayer", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: gradientDescentOutputLayer."); h_gradientDescentHiddenLayerKernel = clCreateKernel(this->h_Program, "gradientDescentHiddenLayer", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: gradientDescentHiddenLayer."); h_updateWeightsGPUKernel = clCreateKernel(this->h_Program, "updateWeights", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: updateWeights."); h_calculateCrossEntropyKernel = clCreateKernel(this->h_Program, "calculateCrossEntropy", &clError); V_RETURN_FALSE_CL(clError, "Failed to create kernel: calculateCrossEntropy."); //set kernel arguments: cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value return true; }
bool Assignment::InitCLContext() { std::cout << std::endl << "InitCLContext():" << std::endl; // 1. get all platform IDs std::vector<cl_platform_id> platformIds; const cl_uint c_MaxPlatforms = 16; platformIds.resize(c_MaxPlatforms); cl_uint countPlatforms; V_RETURN_FALSE_CL(clGetPlatformIDs(c_MaxPlatforms, &platformIds[0], &countPlatforms), "Failed to get CL platform ID"); platformIds.resize(countPlatforms); // 2. find all available GPU devices std::vector<cl_device_id> deviceIds; const int maxDevices = 16; deviceIds.resize(maxDevices); int countAllDevices = 0; //look for gpus only cl_device_type deviceType = CL_DEVICE_TYPE_GPU; for (size_t i = 0; i < platformIds.size(); i++) { // Getting the available devices. cl_uint countDevices; clGetDeviceIDs(platformIds[i], deviceType, 1, &deviceIds[countAllDevices], &countDevices); countAllDevices += countDevices; } deviceIds.resize(countAllDevices); if (countAllDevices == 0) { std::cout << "No device of the selected type with OpenCL support was found."; return false; } // Choosing the first available device. this->h_CLDevice = deviceIds[0]; clGetDeviceInfo(this->h_CLDevice, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &this->h_CLPlatform, NULL); // Printing platform and device data. const int maxBufferSize = 1024; char buffer[maxBufferSize]; size_t bufferSize; std::cout << "OpenCL platform:" << std::endl << std::endl; PRINT_INFO( "Name", buffer, bufferSize, maxBufferSize, clGetPlatformInfo( this->h_CLPlatform, CL_PLATFORM_NAME, maxBufferSize, (void*)buffer, &bufferSize ) ); PRINT_INFO( "Vendor", buffer, bufferSize, maxBufferSize, clGetPlatformInfo( this->h_CLPlatform, CL_PLATFORM_VENDOR, maxBufferSize, (void*)buffer, &bufferSize ) ); PRINT_INFO( "Version", buffer, bufferSize, maxBufferSize, clGetPlatformInfo( this->h_CLPlatform, CL_PLATFORM_VERSION, maxBufferSize, (void*)buffer, &bufferSize ) ); PRINT_INFO( "Profile", buffer, bufferSize, maxBufferSize, clGetPlatformInfo( this->h_CLPlatform, CL_PLATFORM_PROFILE, maxBufferSize, (void*)buffer, &bufferSize ) ); std::cout << std::endl << "Device:" << std::endl << std::endl; PRINT_INFO( "Name", buffer, bufferSize, maxBufferSize, clGetDeviceInfo( this->h_CLDevice, CL_DEVICE_NAME, maxBufferSize, (void*)buffer, &bufferSize ) ); PRINT_INFO( "Vendor", buffer, bufferSize, maxBufferSize, clGetDeviceInfo( this->h_CLDevice, CL_DEVICE_VENDOR, maxBufferSize, (void*)buffer, &bufferSize ) ); PRINT_INFO( "Driver version", buffer, bufferSize, maxBufferSize, clGetDeviceInfo( this->h_CLDevice, CL_DRIVER_VERSION, maxBufferSize, (void*)buffer, &bufferSize ) ); cl_ulong localMemorySize; clGetDeviceInfo( this->h_CLDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemorySize, &bufferSize ); std::cout << "Local memory size: " << localMemorySize << " Byte" << std::endl; std::cout << std::endl << "******************************" << std::endl << std::endl; cl_int clError; this->h_CLContext = clCreateContext(NULL, 1, &this->h_CLDevice, NULL, NULL, &clError); V_RETURN_FALSE_CL(clError, "Failed to create OpenCL context."); // Finally, create a command queue. All the asynchronous commands to the device will be issued // from the CPU into this queue. This way the host program can continue the execution // until some results from that device are needed. this->h_CLCommandQueue = clCreateCommandQueue(this->h_CLContext, this->h_CLDevice, 0, &clError); V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context"); return true; }
bool Reduction::initContextResources() { //error code cl_int clError; //get platform ID V_RETURN_FALSE_CL(clGetPlatformIDs(1, &clPlatform, NULL), "Failed to get CL platform ID"); cl_uint numberDevices = 0; //get a reference to the first available GPU device V_RETURN_FALSE_CL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 0, 0, &numberDevices), "No GPU device found."); cout << "Found " << numberDevices << " devices" << endl; std::vector<cl_device_id> devicesIds(numberDevices); V_RETURN_FALSE_CL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, numberDevices, devicesIds.data(), NULL), "No GPU device found."); //Additional attributes to OpenCL context creation //which associate an OpenGL context with the OpenCL context cl_context_properties props[] = { //OpenCL platform CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, //OpenGL context CL_GL_CONTEXT_KHR, (cl_context_properties) glXGetCurrentContext(), CL_GLX_DISPLAY_KHR , (cl_context_properties) glXGetCurrentDisplay() , 0 }; for(auto dev : devicesIds) { cl_device_id deviceToTry = dev; cl_context contextToTry = 0; contextToTry = clCreateContext( props, 1, &deviceToTry, 0, 0, &clError); if(clError == CL_SUCCESS) { clDevice = deviceToTry; clContext = contextToTry; break; } } char deviceName[1024]; V_RETURN_FALSE_CL(clGetDeviceInfo(clDevice, CL_DEVICE_NAME, 256, &deviceName, NULL), "Unable to query device name."); cout << "Device: " << deviceName << endl; //Finally, create the command queue. All the asynchronous commands to the device will be issued //from the CPU into this queue. This way the host program can continue the execution until some results //from that device are needed. clCommandQueue = clCreateCommandQueue(clContext, clDevice, 0, &clError); V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context"); //Now create and compile the programs size_t programSize = 0; QFile f(":/shaders/Reduce.cl"); if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) return false; std::string programCodeStr = std::string(f.readAll().data()); const char *programCode = programCodeStr.c_str(); programSize = f.size(); clProgram = clCreateProgramWithSource(clContext, 1, (const char**) &programCode, &programSize, &clError); V_RETURN_FALSE_CL(clError, "Failed to create program file"); clError = clBuildProgram(clProgram, 1, &clDevice, NULL, NULL, NULL); if(clError != CL_SUCCESS) { PrintBuildLog(clProgram, clDevice); return false; } reduceHorizontalTransposeKernel = clCreateKernel(clProgram, "ReduceHorizontal", &clError); V_RETURN_FALSE_CL(clError, "Failed to compile kernel: ReduceHorizontal"); reduceVerticalKernel = clCreateKernel(clProgram, "ReduceVertical", &clError); V_RETURN_FALSE_CL(clError, "Failed to compile kernel: ReduceVertical"); return true; }