void Assignment::StochasticBackPropagateTaskGPU(unsigned int numEpochs) {
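	// Stochastic gradient descent: the weights are updated after every single
	// training sample (feed forward, cross entropy, gradient, weight update),
	// unlike the batch variant below, which accumulates gradients over a whole
	// batch before updating.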
	InitCLContext();

	InitCLResources();

	std::cout << std::endl;
	std::cout << "You have selected the back propagation task on the GPU " << 
		"using stochastic gradient descent." << std::endl;
	std::cout << "The neuronal network will now train " << this->trainingData->numberOfSamples <<
		" samples for " << numEpochs << " epochs with a learning rate of " <<
		this->learningRate << " using a local group size of " << this->localGroupSize <<
		" and stop the time." << std::endl;
	
	CTimer timer;

	timer.Start();
	for (unsigned int j = 0; j < numEpochs; j++) {
		std::cout << "Starting with epoch " << j << std::endl;
		double crossEntropy = 0.0;
		
		zeroCrossEntropyGPU();
		for (unsigned int i = 0; i < this->trainingData->numberOfSamples; i++) {
			zeroDeltaBuffersGPU();
			feedForwardGPU(i, 1);
			calculateCrossEntropyGPU(i, 1);
			gradientDescentGPU(i, 1);
			updateWeightsGPU();
			// Periodically drain the device-side accumulator (presumably a
			// single-precision float) into the double to limit rounding error.
			if (i % 100 == 0) {
				crossEntropy += readCrossEntropyGPU();
				zeroCrossEntropyGPU();
			}
		}
		crossEntropy += readCrossEntropyGPU(); //pick up the remainder since the last read-back

		std::cout << "Done." << std::endl;
		std::cout << "Accumulated crossEntropy error for this epoch: " << crossEntropy << std::endl;
	}
	timer.Stop();

	copyWeightBuffersFromDevice();

	std::cout << "Done with back propagation (stochastic)." << std::endl;
	std::cout << "The task took " << timer.GetElapsedMilliseconds() <<
		" milliseconds to complete." << std::endl;

	ReleaseClResources();

	ReleaseCLContext();
}
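
// Mini-batch variant: the sample range is split into fullBatches batches of
// batchSize samples plus one trailing partial batch for the remainder (e.g.
// 1000 samples with a batch size of 64 give 15 full batches and a final batch
// of 40). calculateBackPropForBatch() processes one such batch and returns its
// accumulated cross-entropy error.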
void Assignment::batchBackPropagateTaskGPU(unsigned int numEpochs, unsigned int batchSize) {

	InitCLContext();

	InitCLResources();

	std::cout << std::endl;
	std::cout << "You have selected the back propagation task on the GPU " << 
		"using batch gradient descent with a batch size of " << batchSize << "." << std::endl;
	std::cout << "The gpu will use local group size of " << this->localGroupSize <<
		" and try to evaluate " << this->parallelBackpropagationSize << " inputs at once." << std::endl;
	std::cout << "The neuronal network will now train " << this->trainingData->numberOfSamples <<
		" samples for " << numEpochs << " epochs with a learning rate of " << this->learningRate <<
		" and stop the time." << std::endl;
	
	CTimer timer;
	timer.Start();
	int fullBatches = this->trainingData->numberOfSamples / batchSize;
	int lastBatchSize = this->trainingData->numberOfSamples % batchSize;
	//iterate over epochs
	for (unsigned int i = 0; i < numEpochs; i++) {
		std::cout << "Starting with epoch " << i << std::endl;
		double crossEntropy = 0.0;
		//iterate over batches
		for (int j = 0; j < fullBatches; j++) {
			crossEntropy += calculateBackPropForBatch(j * batchSize, batchSize);
		}
		//do the last small batch
		if (lastBatchSize > 0) {
			crossEntropy += calculateBackPropForBatch(fullBatches * batchSize, lastBatchSize);
		}

		std::cout << "Done." << std::endl;
		std::cout << "Accumulated crossEntropy error for this epoch: " << crossEntropy << std::endl;
	}
	timer.Stop();
	
	copyWeightBuffersFromDevice();

	std::cout << "Done with back propagation (batch)." << std::endl;
	std::cout << "The task took " << timer.GetElapsedMilliseconds() <<
		" milliseconds to complete." << std::endl;

	ReleaseClResources();

	ReleaseCLContext();
}
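
// Inference-only pass: feeds all samples forward in chunks of
// parallelBackpropagationSize and accumulates the cross-entropy error
// (presumably the usual -sum_i target_i * log(output_i) over the output layer)
// without updating any weights. Useful for timing the forward path and for
// checking the error after training.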
void Assignment::feedForwardTaskGPU() {

	InitCLContext();

	InitCLResources();

	std::cout << std::endl;
	std::cout << "You have selected the feed forward task on the GPU." << std::endl;
	std::cout << "Will now feed forward " << this->parallelBackpropagationSize <<
		" samples in parallel using a local group size of " << this->localGroupSize <<
		", stop the time and accumulate the crossEntropy error." << std::endl;
	
	CTimer timer;
	double crossEntropy = 0.0;

	timer.Start();
	int fullBatches = this->trainingData->numberOfSamples / this->parallelBackpropagationSize;
	int lastBatchSize = this->trainingData->numberOfSamples % this->parallelBackpropagationSize;
	for (int i = 0; i < fullBatches; i++) {
		zeroCrossEntropyGPU();
		feedForwardGPU(i * this->parallelBackpropagationSize, this->parallelBackpropagationSize);
		//printFeedForwardResultGPU(this->parallelBackpropagationSize);
		calculateCrossEntropyGPU(i * this->parallelBackpropagationSize, this->parallelBackpropagationSize);
		crossEntropy += readCrossEntropyGPU();
	}
	if (lastBatchSize > 0) {
		// Zero the accumulator first so the last full batch is not counted twice,
		// and read it back inside this branch for the same reason.
		zeroCrossEntropyGPU();
		feedForwardGPU(fullBatches * this->parallelBackpropagationSize, lastBatchSize);
		calculateCrossEntropyGPU(fullBatches * this->parallelBackpropagationSize, lastBatchSize);
		crossEntropy += readCrossEntropyGPU();
	}
	timer.Stop();

	std::cout << "done." << std::endl;
	std::cout << "The crossEntropy error is " << crossEntropy << "." << std::endl;
	std::cout << "The task took " << timer.GetElapsedMilliseconds() <<
		" milliseconds to complete." << std::endl;

	ReleaseClResources();

	ReleaseCLContext();
}
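
// CPU reference implementation of a separable 2D convolution. A separable
// kernel factors into the outer product of two 1D kernels,
//     K[j][i] = kernelV[j] * kernelH[i],
// so the 2D convolution can be computed as a horizontal 1D pass followed by a
// vertical 1D pass over an intermediate buffer, cutting the work per pixel
// from (2r+1)^2 to 2*(2r+1) multiply-adds for kernel radius r. Out-of-bounds
// taps are treated as zero (zero padding at the image borders), and the index
// m_KernelRadius - k flips the kernel, making this a true convolution rather
// than a correlation.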
double CConvolutionSeparableTask::ConvolutionChannelCPU(unsigned int Channel)
{
	CTimer timer;
	timer.Start();

	//horizontal pass
	for(int y = 0; y < (int)m_Height; y++)
		for(int x = 0; x < (int)m_Width; x++)
		{
			float value = 0;
			//apply horizontal kernel
			for(int k = -m_KernelRadius; k <= m_KernelRadius; k++)
			{
				int sx = x + k;
				if(sx >= 0 && sx < (int)m_Width)
					value += m_hSourceChannels[Channel][y * m_Pitch + sx] * m_hKernelHorizontal[m_KernelRadius - k];
			}
			m_hCPUWorkingBuffer[y * m_Pitch + x] = value;
			//m_hCPUResultChannels[Channel][y * m_Pitch + x] = value;
		}

	//vertical pass
	for(int x = 0; x < (int)m_Width; x++)
		for(int y = 0; y < (int)m_Height; y++)
		{
			float value = 0;
			//apply vertical kernel
			for(int k = -m_KernelRadius; k <= m_KernelRadius; k++)
			{
				int sy = y + k;
				if(sy >= 0 && sy < (int)m_Height)
					value += m_hCPUWorkingBuffer[sy * m_Pitch + x] * m_hKernelVertical[m_KernelRadius - k];
			}
			m_hCPUResultChannels[Channel][y * m_Pitch + x] = value;
		}

	timer.Stop();

	return timer.GetElapsedMilliseconds();
}
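
// Hedged reference sketch (not part of the task framework; all names below are
// hypothetical): a naive 2D convolution built from the outer product of the
// same two 1D kernels. Its output should match the two-pass separable result
// above up to floating-point rounding, which makes it handy for validating the
// fast path on small inputs.
static void Convolve2DNaiveReference(const float* src, float* dst,
	int width, int height, int pitch,
	const float* kernelH, const float* kernelV, int radius)
{
	for (int y = 0; y < height; y++)
		for (int x = 0; x < width; x++)
		{
			float value = 0.0f;
			for (int ky = -radius; ky <= radius; ky++)
				for (int kx = -radius; kx <= radius; kx++)
				{
					int sx = x + kx;
					int sy = y + ky;
					// Same zero-padding border rule as the passes above.
					if (sx >= 0 && sx < width && sy >= 0 && sy < height)
						value += src[sy * pitch + sx] *
							kernelV[radius - ky] * kernelH[radius - kx];
				}
			dst[y * pitch + x] = value;
		}
}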