/* Display the performance-assessment menu in a loop, read the user's
 * numeric selection and dispatch to the matching action.
 * Loops until the user selects 4 (Quit, which calls exit(0)) or stdin
 * reaches EOF (in which case we return instead of looping forever).
 */
void printMenu(){
	int choice;
	do{
		choice = 0;
		printf("Performance assessment:\n");
		printf("-----------------------\n");
		printf("1) Enter parameters\n");
		printf("2) Print table of parameters\n");
		printf("3) Print table of performance\n");
		printf("4) Quit\n");
		printf("\nEnter selection: ");
		if (scanf("%d", &choice) != 1) {
			/* scanf failed: non-numeric input or EOF.  Drain the rest of
			 * the line with a separate variable so 'choice' keeps meaning
			 * "menu selection" and is never clobbered by stream bytes. */
			int c;
			while ((c = getchar()) != '\n' && c != EOF);
			if (c == EOF)
				return; /* stdin closed: previously this spun in an infinite loop */
			printf("Invalid selection\n\n");
			continue;
		}
		switch(choice){
			case 1:
				enterParameters();
				break;
			case 2:
				printParameters();
				break;
			case 3:
				printPerformance();
				break;
			case 4:
				exit(0);
				break;
			default:
				printf("Invalid selection\n\n");
				/* Clear the input stream in case of erroneous inputs
				 * (e.g. trailing garbage after an out-of-range number). */
				{
					int c;
					while ((c = getchar()) != '\n' && c != EOF);
					if (c == EOF)
						return; /* stdin closed while draining */
				}
				break;
		}//Switch
	}while(choice != 4);
}//printMenu
// GLUT special-key callback: F1..F4 select rendering method 0..3.
// Before switching, the current performance statistics are printed.
// The cursor position (x, y) is part of the GLUT callback signature
// but is not used here.
void glut_keyboardSpecial(int key, int x, int y)
{
	int method = -1;
	if (key == GLUT_KEY_F1)
		method = 0;
	else if (key == GLUT_KEY_F2)
		method = 1;
	else if (key == GLUT_KEY_F3)
		method = 2;
	else if (key == GLUT_KEY_F4)
		method = 3;

	if (method >= 0) {
		printPerformance();
		renderingMethod = method;
	}
	// Any other key: do nothing (matches the original default case).
}
void printPerformance(const std::string& name, Core::TimeSpan timeCalc, Core::TimeSpan timeCpu) {
	printPerformance(name, timeCalc, Core::TimeSpan::fromSeconds(0), timeCpu, false);
}
//////////////////////////////////////////////////////////////////////////////
// Main function
//////////////////////////////////////////////////////////////////////////////
//
// Benchmark driver: multiplies A (countAY x countAX_BY) by B
// (countAX_BY x countBX) on the CPU, via cblas_sgemm (Atlas), and with
// four OpenCL kernel variants, comparing results and timings.
// Optional argv[1] selects the 1-based OpenCL device index (default 1).
// Returns 0 on success, 1 if any result mismatches the CPU reference.
int main(int argc, char** argv) {
	// Create a context
	cl::Context context(CL_DEVICE_TYPE_GPU);

	// Get a device of the context (deviceNr is 1-based; validated by the
	// ASSERTs below before being used as an index)
	int deviceNr = argc < 2 ? 1 : atoi(argv[1]);
	std::cout << "Using device " << deviceNr << " / " << context.getInfo<CL_CONTEXT_DEVICES>().size() << std::endl;
	ASSERT (deviceNr > 0);
	ASSERT ((size_t) deviceNr <= context.getInfo<CL_CONTEXT_DEVICES>().size());
	cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>()[deviceNr - 1];
	std::vector<cl::Device> devices;
	devices.push_back(device);
	OpenCL::printDeviceInfo(std::cout, device);

	// Create a command queue (profiling enabled so kernel/copy events can
	// be timed via OpenCL::getElapsedTime below)
	cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE);

	// Declare some values
	// Matrix dimensions: A is countAY rows x countAX_BY cols,
	// B is countAX_BY rows x countBX cols, hence C is countAY x countBX.
	// All dimensions are multiples of wgSize (the work-group edge length).
	std::size_t wgSize = 16;
	std::size_t countAX_BY = 512;
	std::size_t countAY = 1024;
	std::size_t countBX = 768;

	std::size_t countCX = countBX;
	std::size_t countCY = countAY;
	std::size_t countA = countAX_BY * countAY;
	std::size_t countB = countBX * countAX_BY;
	std::size_t countC = countCX * countCY;
	std::size_t sizeA = countA * sizeof (float);
	std::size_t sizeB = countB * sizeof (float);
	std::size_t sizeC = countC * sizeof (float);

	// Load the source code
	cl::Program program = OpenCL::loadProgramSource(context, "src/OpenCLExercise4_MatrixMultiplication.cl");
	// Compile the source code. This is similar to program.build(devices) but will print more detailed error messages
	// This will pass the value of wgSize as a preprocessor constant "WG_SIZE" to the OpenCL C compiler
	OpenCL::buildProgram(program, devices, "-DWG_SIZE=" + boost::lexical_cast<std::string>(wgSize));

	// Allocate space for output data from CPU and GPU on the host
	std::vector<float> h_inputA (countA);
	std::vector<float> h_inputB (countB);
	std::vector<float> h_outputCCpu (countC);
	std::vector<float> h_outputCAtlas (countC);
	std::vector<float> h_outputCGpu (countC);

	// Allocate space for input and output data on the device.
	// The Image2D objects are only used by kernel implementation 4 (see the
	// impl == 4 branches in the loop below).
	cl::Buffer d_inputA (context, CL_MEM_READ_WRITE, sizeA);
	cl::Buffer d_inputB (context, CL_MEM_READ_WRITE, sizeB);
	cl::Buffer d_outputC (context, CL_MEM_READ_WRITE, sizeC);
	cl::Image2D d_inputAImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countAX_BY, countAY);
	cl::Image2D d_inputBImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countBX, countAX_BY);

	// Initialize memory to 0xff (useful for debugging because otherwise GPU memory will contain information from last execution)
	memset(h_inputA.data(), 255, sizeA);
	memset(h_inputB.data(), 255, sizeB);
	memset(h_outputCCpu.data(), 255, sizeC);
	memset(h_outputCAtlas.data(), 255, sizeC);
	memset(h_outputCGpu.data(), 255, sizeC);
	//TODO: GPU
	// These uploads push the 0xff debug pattern to the device; the real
	// input data is written again further below (after generation), so
	// these writes only serve the debugging purpose stated above.
	queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data());
	queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data());
	queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data());

	//////// Generate input data ////////////////////////////////
	// Use random input data (values in [-10, 9.8] in steps of 0.2)
	for (std::size_t i = 0; i < countA; i++)
		h_inputA[i] = (rand() % 100) / 5.0f - 10.0f;
	for (std::size_t i = 0; i < countB; i++)
		h_inputB[i] = (rand() % 100) / 5.0f - 10.0f;
	// Use integer numbers as data
	/*
	for (std::size_t i = 0; i < countA; i++)
		h_inputA[i] = i;
	for (std::size_t i = 0; i < countB; i++)
		h_inputB[i] = (int)i - 5;
	*/

	// Do calculation on the host side (reference result for all comparisons)
	Core::TimeSpan cpuStart = Core::getCurrentTime();
	matrixMulHost(h_inputA, h_inputB, h_outputCCpu, countAX_BY, countAY, countBX);
	Core::TimeSpan cpuEnd = Core::getCurrentTime();

	// Do calculation on using libatlas
	// C = 1.0 * A * B + 0.0 * C, row-major, no transposition.
	Core::TimeSpan atlasStart = Core::getCurrentTime();
	cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, countAY, countBX, countAX_BY, 1.0, h_inputA.data(), countAX_BY, h_inputB.data(), countBX, 0.0, h_outputCAtlas.data(), countCX);
	Core::TimeSpan atlasEnd = Core::getCurrentTime();

	Core::TimeSpan cpuTime = cpuEnd - cpuStart;
	Core::TimeSpan atlasTime = atlasEnd - atlasStart;
	// All speedups below are reported relative to the Atlas time.
	printPerformanceHeader();
	printPerformance("CPU", cpuTime, atlasTime);
	printPerformance("Atlas", atlasTime, atlasTime);

	// Sanity check: CPU reference vs. Atlas must agree before timing kernels
	if (!compareMatrices(h_outputCCpu, "CPU", h_outputCAtlas, "Atlas", countCX, countCY))
		return 1;

	// Copy input data to device (blocking writes; the events record the
	// transfer times that go into copyTime for implementations 1-3)
	cl::Event copy1;
	cl::Event copy2;
	queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data(), NULL, &copy1);
	queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data(), NULL, &copy2);

	// Iterate over all implementations (task 1 - 2)
	for (int impl = 1; impl <= 4; impl++) {
		// Reinitialize output memory to 0xff
		memset(h_outputCGpu.data(), 255, sizeC);
		queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data());

		// Create a kernel object (kernels are named matrixMulKernel1..4 in
		// the .cl file)
		std::string kernelName = "matrixMulKernel" + boost::lexical_cast<std::string> (impl);
		cl::Kernel matrixMulKernel(program, kernelName.c_str ());

		// Implementation 4 reads its inputs from images, so upload A and B
		// as 2D images here. NOTE: this overwrites copy1/copy2, so for
		// impl 4 copyTime measures the image uploads instead of the
		// buffer uploads above.
		if (impl == 4) {
		cl::size_t<3> origin;
		origin[0] = origin[1] = origin[2] = 0;
		cl::size_t<3> region;
		region[0] = countAX_BY;
		region[1] = countAY;
		region[2] = 1;
		queue.enqueueWriteImage(d_inputAImg, true, origin, region, countAX_BY * sizeof (float), 0, h_inputA.data(), NULL, &copy1);
		region[0] = countBX;
		region[1] = countAX_BY;
		queue.enqueueWriteImage(d_inputBImg, true, origin, region, countBX * sizeof (float), 0, h_inputB.data(), NULL, &copy2);
		}

		// Launch kernel on the device
		// Args 0/1 are images for impl 4, plain buffers otherwise.
		cl::Event kernelExecution;
		if (impl == 4)
		matrixMulKernel.setArg<cl::Image2D>(0, d_inputAImg);
		else
		matrixMulKernel.setArg<cl::Buffer>(0, d_inputA);
		if (impl == 4)
		matrixMulKernel.setArg<cl::Image2D>(1, d_inputBImg);
		else
		matrixMulKernel.setArg<cl::Buffer>(1, d_inputB);
		matrixMulKernel.setArg<cl::Buffer>(2, d_outputC);
		matrixMulKernel.setArg<cl_uint>(3, countAX_BY);
		matrixMulKernel.setArg<cl_uint>(4, countAY);
		matrixMulKernel.setArg<cl_uint>(5, countBX);
		// Implementation 3 takes extra local memory: space for
		// 2 * wgSize * wgSize floats (presumably one wgSize x wgSize tile
		// each of A and B — confirm against the kernel source).
		if (impl == 3)
			matrixMulKernel.setArg(6, cl::Local(2 * wgSize * wgSize * sizeof(float)));
		// Global size = C's dimensions; local size = wgSize x wgSize
		// (dimensions above are chosen as multiples of wgSize).
		queue.enqueueNDRangeKernel(matrixMulKernel, cl::NullRange, cl::NDRange(countCX, countCY), cl::NDRange(wgSize, wgSize), NULL, &kernelExecution);

		// Copy output data back to host
		cl::Event copy3;
		queue.enqueueReadBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data(), NULL, &copy3);

		// Print performance data
		Core::TimeSpan gpuTime = OpenCL::getElapsedTime(kernelExecution);
		Core::TimeSpan copyTime = OpenCL::getElapsedTime(copy1) + OpenCL::getElapsedTime(copy2) + OpenCL::getElapsedTime(copy3);
		printPerformance(kernelName, gpuTime, copyTime, atlasTime);

		// Check whether results are correct
		if (!compareMatrices(h_outputCCpu, "CPU", h_outputCGpu, "GPU", countCX, countCY))
			return 1;
	}

	std::cout << "Success" << std::endl;

	//dumpMatrix ("A", h_inputA, countAX_BY, countAY);
	//dumpMatrix ("B", h_inputB, countBX, countAX_BY);
	//dumpMatrix ("C", h_outputCCpu, countCX, countCY);

	return 0;
}