bool Assignment::InitCLResources() {
	
	std::cout << "InitCLResources(): Initialize the opencl buffers on the device" << std::endl; 

	//clCreateBuffer(context, flags, size, host_ptr, errcode_ret)

	cl_int clError;

	//training data
	this->d_trainingInputBuffer = clCreateBuffer(
		this->h_CLContext,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(float) * this->trainingData->numberOfSamples * this->trainingData->numberOfInputs,
		this->trainingInputBuffer,
		&clError
	);
	V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_trainingInputBuffer");

	this->d_trainingLabelBuffer = clCreateBuffer(
		this->h_CLContext,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(float) * this->trainingData->numberOfSamples * this->trainingData->numberOfOutputs,
		this->trainingLabelBuffer,
		&clError
	);
	V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_trainingLabelBuffer");

	//weight buffers and delta update buffers
	for (unsigned int i = 0; i < this->sizeOfWeightBuffer.size(); i++) {
		this->d_weightBuffers.push_back(
			clCreateBuffer(
				this->h_CLContext,
				CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
				sizeof(float) * this->sizeOfWeightBuffer[i],
				this->h_weightBuffers[i],
				&clError
			)
		);
		V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_weightBuffers[]"); 
		this->d_deltaUpdates.push_back(
			clCreateBuffer(
				this->h_CLContext,
				CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
				sizeof(float) * this->sizeOfWeightBuffer[i],
				this->h_weightBuffers[i],
				&clError
			)
		);
		V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_deltaUpdates[]"); 	
	}

	//partial result buffers and delta buffers
	for (unsigned int i = 0; i < this->hiddenLayers.size(); i++) {
		//partial result buffer
		this->d_partialResults.push_back(
			clCreateBuffer(
				this->h_CLContext,
				CL_MEM_READ_WRITE,
				sizeof(float) * this->hiddenLayers[i] * this->parallelBackpropagationSize,
				NULL,
				&clError
			)
		);
		V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_partialResults[]");

		//delta buffer
		this->d_deltaBuffer.push_back(
			clCreateBuffer(
				this->h_CLContext,
				CL_MEM_READ_WRITE,
				sizeof(float) * this->hiddenLayers[i] * this->parallelBackpropagationSize,
				NULL,
				&clError
			)
		);
		V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_deltaBuffer[]");
	}
	//output layer partial results buffer
	this->d_partialResults.push_back(
		clCreateBuffer(
			this->h_CLContext,
			CL_MEM_READ_WRITE,
			sizeof(float) * this->trainingData->numberOfOutputs * this->parallelBackpropagationSize,
			NULL,
			&clError
		)
	);
	V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_partialResults[]");

	//output layer delta buffer
	this->d_deltaBuffer.push_back(
		clCreateBuffer(
			this->h_CLContext,
			CL_MEM_READ_WRITE,
			sizeof(float) * this->trainingData->numberOfOutputs * this->parallelBackpropagationSize,
			NULL,
			&clError
		)
	);
	V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_deltaBuffer[]");

	//crossEntropy buffer
	this->d_crossEntropy = clCreateBuffer(this->h_CLContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &clError);
	V_RETURN_FALSE_CL(clError, "Error allocating device buffer d_crossEntropy");

	//load and compile kernels
	std::string programCode;

	CLUtil::LoadProgramSourceToMemory("neuronalNet.cl", programCode);
	this->h_Program = CLUtil::BuildCLProgramFromMemory(this->h_CLDevice, this->h_CLContext, programCode);
	if(this->h_Program == nullptr) return false;

	//create kernels
	h_feedForwardKernel = clCreateKernel(this->h_Program, "feedForward", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: feedForward.");

	h_softMaxKernel = clCreateKernel(this->h_Program, "softMax", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: softMax.");

	h_zeroBufferKernel = clCreateKernel(this->h_Program, "zeroBuffer", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: zeroBuffer.");

	h_gradientDescentOutputLayerKernel = clCreateKernel(this->h_Program, "gradientDescentOutputLayer", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: gradientDescentOutputLayer.");

	h_gradientDescentHiddenLayerKernel = clCreateKernel(this->h_Program, "gradientDescentHiddenLayer", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: gradientDescentHiddenLayer.");

	h_updateWeightsGPUKernel = clCreateKernel(this->h_Program, "updateWeights", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: updateWeights.");

	h_calculateCrossEntropyKernel = clCreateKernel(this->h_Program, "calculateCrossEntropy", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: calculateCrossEntropy.");

	//kernel arguments are not bound here; presumably they are set right before each
	//launch via clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
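	//A minimal sketch of what one such binding could look like (illustrative only; the
	//actual argument indices depend on the kernel signatures in neuronalNet.cl):
	//
	//  cl_int argErr = clSetKernelArg(h_feedForwardKernel, 0, sizeof(cl_mem), (void*)&this->d_trainingInputBuffer);
	//  argErr |= clSetKernelArg(h_feedForwardKernel, 1, sizeof(cl_mem), (void*)&this->d_weightBuffers[0]);
	//  V_RETURN_FALSE_CL(argErr, "Failed to set kernel arguments: feedForward.");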

	return true;
}
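
// A minimal usage sketch of the assumed call order (the calling code is not part of
// this file; ReleaseCLResources is a hypothetical cleanup counterpart):
//
//   Assignment assignment;
//   if (!assignment.InitCLContext())   return 1; // platform, device, context, queue
//   if (!assignment.InitCLResources()) return 1; // buffers, program, kernels (needs the context)
//   //...training...
//   assignment.ReleaseCLResources();
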
bool Assignment::InitCLContext() {

	std::cout << std::endl << "InitCLContext():" << std::endl;
	// 1. get all platform IDs
	std::vector<cl_platform_id> platformIds;
	const cl_uint c_MaxPlatforms = 16;
	platformIds.resize(c_MaxPlatforms);
	
	cl_uint countPlatforms;
	V_RETURN_FALSE_CL(clGetPlatformIDs(c_MaxPlatforms, &platformIds[0], &countPlatforms),
		"Failed to get CL platform ID");
	platformIds.resize(countPlatforms);

	// 2. find all available GPU devices
	std::vector<cl_device_id> deviceIds;
	const int maxDevices = 16;
	deviceIds.resize(maxDevices);
	int countAllDevices = 0;

	//look for GPUs only
	cl_device_type deviceType = CL_DEVICE_TYPE_GPU;

	for (size_t i = 0; i < platformIds.size() && countAllDevices < maxDevices; i++)
	{
		// Get the available devices of this platform (if any).
		cl_uint countDevices = 0;
		if (clGetDeviceIDs(platformIds[i], deviceType, maxDevices - countAllDevices,
				&deviceIds[countAllDevices], &countDevices) != CL_SUCCESS)
			continue;
		// countDevices reports all matching devices, which may exceed the remaining
		// capacity of deviceIds; only count the entries that were actually written.
		if ((int)countDevices > maxDevices - countAllDevices)
			countDevices = maxDevices - countAllDevices;
		countAllDevices += countDevices;
	}
	deviceIds.resize(countAllDevices);

	if (countAllDevices == 0)
	{
		std::cout << "No device of the selected type with OpenCL support was found.";
		return false;
	}
	// Choosing the first available device.
	this->h_CLDevice = deviceIds[0];
	clGetDeviceInfo(this->h_CLDevice, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &this->h_CLPlatform, NULL);

	// Printing platform and device data.
	const int maxBufferSize = 1024;
	char buffer[maxBufferSize];
	size_t bufferSize;

	std::cout << "OpenCL platform:" << std::endl << std::endl;
	PRINT_INFO(
		"Name",
		buffer,
		bufferSize,
		maxBufferSize,
		clGetPlatformInfo(
			this->h_CLPlatform,
			CL_PLATFORM_NAME,
			maxBufferSize,
			(void*)buffer,
			&bufferSize
		)
	);

	PRINT_INFO(
		"Vendor", 
		buffer, 
		bufferSize, 
		maxBufferSize, 
		clGetPlatformInfo(
			this->h_CLPlatform, 
			CL_PLATFORM_VENDOR, 
			maxBufferSize, 
			(void*)buffer, 
			&bufferSize
		)
	);

	PRINT_INFO(
		"Version",
		buffer, 
		bufferSize, 
		maxBufferSize, 
		clGetPlatformInfo(
			this->h_CLPlatform, 
			CL_PLATFORM_VERSION, 
			maxBufferSize, 
			(void*)buffer, 
			&bufferSize
		)
	);

	PRINT_INFO(
		"Profile", 
		buffer, 
		bufferSize, 
		maxBufferSize, 
		clGetPlatformInfo(
			this->h_CLPlatform, 
			CL_PLATFORM_PROFILE, 
			maxBufferSize, 
			(void*)buffer, 
			&bufferSize
		)
	);

	std::cout << std::endl << "Device:" << std::endl << std::endl;

	PRINT_INFO(
		"Name", 
		buffer, 
		bufferSize, 
		maxBufferSize, 
		clGetDeviceInfo(
			this->h_CLDevice, 
			CL_DEVICE_NAME, 
			maxBufferSize, 
			(void*)buffer, 
			&bufferSize
		)
	);

	PRINT_INFO(
		"Vendor", 
		buffer, 
		bufferSize, 
		maxBufferSize, 
		clGetDeviceInfo(
			this->h_CLDevice, 
			CL_DEVICE_VENDOR, 
			maxBufferSize, 
			(void*)buffer, 
			&bufferSize
		)
	);

	PRINT_INFO(
		"Driver version", 
		buffer, 
		bufferSize, 
		maxBufferSize, 
		clGetDeviceInfo(
			this->h_CLDevice, 
			CL_DRIVER_VERSION, 
			maxBufferSize, 
			(void*)buffer, 
			&bufferSize
		)
	);

	cl_ulong localMemorySize;
	clGetDeviceInfo(
		this->h_CLDevice, 
		CL_DEVICE_LOCAL_MEM_SIZE, 
		sizeof(cl_ulong), 
		&localMemorySize, 
		&bufferSize
	);

	std::cout << "Local memory size: " << localMemorySize << " Byte" << std::endl;
	std::cout << std::endl << "******************************" << std::endl << std::endl;
        
	cl_int clError;

	this->h_CLContext = clCreateContext(NULL, 1, &this->h_CLDevice, NULL, NULL, &clError);	
	V_RETURN_FALSE_CL(clError, "Failed to create OpenCL context.");

	// Finally, create a command queue. All the asynchronous commands to the device will be issued
	// from the CPU into this queue. This way the host program can continue the execution
	// until some results from that device are needed.

	this->h_CLCommandQueue = clCreateCommandQueue(this->h_CLContext, this->h_CLDevice, 0, &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context");

	return true;
}
bool Reduction::initContextResources() {
	//error code
	cl_int clError;

	//get platform ID
	V_RETURN_FALSE_CL(clGetPlatformIDs(1, &clPlatform, NULL), "Failed to get CL platform ID");

	cl_uint numberDevices = 0;
	//get a reference to the first available GPU device
	V_RETURN_FALSE_CL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 0, 0, &numberDevices), "No GPU device found.");
	cout << "Found " << numberDevices << " devices" << endl;
	std::vector<cl_device_id> devicesIds(numberDevices);
	V_RETURN_FALSE_CL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, numberDevices, devicesIds.data(), NULL), "No GPU device found.");

	//Additional attributes to OpenCL context creation
	//which associate an OpenGL context with the OpenCL context
	cl_context_properties props[] = {
		//OpenCL platform
		//OpenCL platform
		CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform,
		//OpenGL context
		CL_GL_CONTEXT_KHR,   (cl_context_properties) glXGetCurrentContext(),
		//X11 display of the OpenGL context
		CL_GLX_DISPLAY_KHR,  (cl_context_properties) glXGetCurrentDisplay(),
		0
	};

	//Try each GPU in turn: with CL/GL sharing enabled, context creation typically
	//succeeds only on the device that drives the current OpenGL context.
	for(auto dev : devicesIds) {
		cl_context contextToTry = clCreateContext(props, 1, &dev, 0, 0, &clError);
		if(clError == CL_SUCCESS) {
			clDevice = dev;
			clContext = contextToTry;
			break;
		}
	}
	//clError still holds the last error if every device was rejected
	V_RETURN_FALSE_CL(clError, "Failed to create an OpenCL context on any GPU device.");

	char deviceName[1024];
	V_RETURN_FALSE_CL(clGetDeviceInfo(clDevice, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL), "Unable to query device name.");
	cout << "Device: " << deviceName << endl;

	//Finally, create the command queue. All the asynchronous commands to the device will be issued
	//from the CPU into this queue. This way the host program can continue the execution until some results
	//from that device are needed.
	clCommandQueue = clCreateCommandQueue(clContext, clDevice, 0, &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context");


	//Now create and compile the program
	QFile f(":/shaders/Reduce.cl");
	if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) return false;

	std::string programCodeStr = std::string(f.readAll().data());
	const char *programCode = programCodeStr.c_str();
	//use the in-memory string length, not f.size(): opening in text mode may translate
	//line endings, so the on-disk size can differ from the loaded source length
	size_t programSize = programCodeStr.size();

	clProgram = clCreateProgramWithSource(clContext, 1, &programCode, &programSize, &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create program from source");

	clError = clBuildProgram(clProgram, 1, &clDevice, NULL, NULL, NULL);

	if(clError != CL_SUCCESS) {
		PrintBuildLog(clProgram, clDevice);
		return false;
	}

	reduceHorizontalTransposeKernel = clCreateKernel(clProgram, "ReduceHorizontal", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: ReduceHorizontal");
	reduceVerticalKernel = clCreateKernel(clProgram, "ReduceVertical", &clError);
	V_RETURN_FALSE_CL(clError, "Failed to create kernel: ReduceVertical");
	return true;
}
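
// A minimal usage sketch, assuming an OpenGL context is already current on the calling
// thread (otherwise glXGetCurrentContext() returns 0 and context creation fails):
//
//   Reduction reduction;
//   if (!reduction.initContextResources()) { /* report error and abort */ }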