// Display the interactive menu and dispatch the user's selection.
// Loops until the user chooses 4 (Quit, via exit(0)) or stdin is closed.
void printMenu(){
    int choice;
    do{
        choice = 0;
        printf("Performance assessment:\n");
        printf("-----------------------\n");
        printf("1) Enter parameters\n");
        printf("2) Print table of parameters\n");
        printf("3) Print table of performance\n");
        printf("4) Quit\n");
        printf("\nEnter selection: ");
        // scanf returns the number of items converted; on a non-numeric
        // entry it leaves the offending input in the stream, so force the
        // default branch below to report the error and drain the stream.
        if (scanf("%d", &choice) != 1)
            choice = 0;
        switch(choice){
            case 1: enterParameters(); break;
            case 2: printParameters(); break;
            case 3: printPerformance(); break;
            case 4: exit(0); break;
            default: {
                printf("Invalid selection\n\n");
                // Clear the input stream in case of erroneous inputs.
                // Use a separate variable instead of reusing `choice`.
                int c;
                while ((c = getchar()) != '\n' && c != EOF);
                // On EOF (stdin closed) no further input can ever arrive;
                // bail out instead of re-prompting forever.
                if (c == EOF)
                    return;
                break;
            }
        }//Switch
    }while(choice != 4);
}//printMenu
// GLUT special-key callback: F1..F4 select rendering methods 0..3.
// Before switching, the performance counters for the current method
// are printed; all other special keys are ignored.
void glut_keyboardSpecial(int key, int x, int y) {
    int method;
    switch (key) {
        case GLUT_KEY_F1: method = 0; break;
        case GLUT_KEY_F2: method = 1; break;
        case GLUT_KEY_F3: method = 2; break;
        case GLUT_KEY_F4: method = 3; break;
        default:          return;   // not an F1..F4 key — nothing to do
    }
    printPerformance();
    renderingMethod = method;
}
void printPerformance(const std::string& name, Core::TimeSpan timeCalc, Core::TimeSpan timeCpu) { printPerformance(name, timeCalc, Core::TimeSpan::fromSeconds(0), timeCpu, false); }
////////////////////////////////////////////////////////////////////////////// // Main function ////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { // Create a context cl::Context context(CL_DEVICE_TYPE_GPU); // Get a device of the context int deviceNr = argc < 2 ? 1 : atoi(argv[1]); std::cout << "Using device " << deviceNr << " / " << context.getInfo<CL_CONTEXT_DEVICES>().size() << std::endl; ASSERT (deviceNr > 0); ASSERT ((size_t) deviceNr <= context.getInfo<CL_CONTEXT_DEVICES>().size()); cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>()[deviceNr - 1]; std::vector<cl::Device> devices; devices.push_back(device); OpenCL::printDeviceInfo(std::cout, device); // Create a command queue cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE); // Declare some values std::size_t wgSize = 16; std::size_t countAX_BY = 512; std::size_t countAY = 1024; std::size_t countBX = 768; std::size_t countCX = countBX; std::size_t countCY = countAY; std::size_t countA = countAX_BY * countAY; std::size_t countB = countBX * countAX_BY; std::size_t countC = countCX * countCY; std::size_t sizeA = countA * sizeof (float); std::size_t sizeB = countB * sizeof (float); std::size_t sizeC = countC * sizeof (float); // Load the source code cl::Program program = OpenCL::loadProgramSource(context, "src/OpenCLExercise4_MatrixMultiplication.cl"); // Compile the source code. 
This is similar to program.build(devices) but will print more detailed error messages // This will pass the value of wgSize as a preprocessor constant "WG_SIZE" to the OpenCL C compiler OpenCL::buildProgram(program, devices, "-DWG_SIZE=" + boost::lexical_cast<std::string>(wgSize)); // Allocate space for output data from CPU and GPU on the host std::vector<float> h_inputA (countA); std::vector<float> h_inputB (countB); std::vector<float> h_outputCCpu (countC); std::vector<float> h_outputCAtlas (countC); std::vector<float> h_outputCGpu (countC); // Allocate space for input and output data on the device cl::Buffer d_inputA (context, CL_MEM_READ_WRITE, sizeA); cl::Buffer d_inputB (context, CL_MEM_READ_WRITE, sizeB); cl::Buffer d_outputC (context, CL_MEM_READ_WRITE, sizeC); cl::Image2D d_inputAImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countAX_BY, countAY); cl::Image2D d_inputBImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countBX, countAX_BY); // Initialize memory to 0xff (useful for debugging because otherwise GPU memory will contain information from last execution) memset(h_inputA.data(), 255, sizeA); memset(h_inputB.data(), 255, sizeB); memset(h_outputCCpu.data(), 255, sizeC); memset(h_outputCAtlas.data(), 255, sizeC); memset(h_outputCGpu.data(), 255, sizeC); //TODO: GPU queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data()); queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data()); queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data()); //////// Generate input data //////////////////////////////// // Use random input data for (std::size_t i = 0; i < countA; i++) h_inputA[i] = (rand() % 100) / 5.0f - 10.0f; for (std::size_t i = 0; i < countB; i++) h_inputB[i] = (rand() % 100) / 5.0f - 10.0f; // Use integer numbers as data /* for (std::size_t i = 0; i < countA; i++) h_inputA[i] = i; for (std::size_t i = 0; i < countB; i++) h_inputB[i] = (int)i - 5; */ // Do calculation on the host side 
Core::TimeSpan cpuStart = Core::getCurrentTime(); matrixMulHost(h_inputA, h_inputB, h_outputCCpu, countAX_BY, countAY, countBX); Core::TimeSpan cpuEnd = Core::getCurrentTime(); // Do calculation on using libatlas Core::TimeSpan atlasStart = Core::getCurrentTime(); cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, countAY, countBX, countAX_BY, 1.0, h_inputA.data(), countAX_BY, h_inputB.data(), countBX, 0.0, h_outputCAtlas.data(), countCX); Core::TimeSpan atlasEnd = Core::getCurrentTime(); Core::TimeSpan cpuTime = cpuEnd - cpuStart; Core::TimeSpan atlasTime = atlasEnd - atlasStart; printPerformanceHeader(); printPerformance("CPU", cpuTime, atlasTime); printPerformance("Atlas", atlasTime, atlasTime); if (!compareMatrices(h_outputCCpu, "CPU", h_outputCAtlas, "Atlas", countCX, countCY)) return 1; // Copy input data to device cl::Event copy1; cl::Event copy2; queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data(), NULL, ©1); queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data(), NULL, ©2); // Iterate over all implementations (task 1 - 2) for (int impl = 1; impl <= 4; impl++) { // Reinitialize output memory to 0xff memset(h_outputCGpu.data(), 255, sizeC); queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data()); // Create a kernel object std::string kernelName = "matrixMulKernel" + boost::lexical_cast<std::string> (impl); cl::Kernel matrixMulKernel(program, kernelName.c_str ()); if (impl == 4) { cl::size_t<3> origin; origin[0] = origin[1] = origin[2] = 0; cl::size_t<3> region; region[0] = countAX_BY; region[1] = countAY; region[2] = 1; queue.enqueueWriteImage(d_inputAImg, true, origin, region, countAX_BY * sizeof (float), 0, h_inputA.data(), NULL, ©1); region[0] = countBX; region[1] = countAX_BY; queue.enqueueWriteImage(d_inputBImg, true, origin, region, countBX * sizeof (float), 0, h_inputB.data(), NULL, ©2); } // Launch kernel on the device cl::Event kernelExecution; if (impl == 4) matrixMulKernel.setArg<cl::Image2D>(0, 
d_inputAImg); else matrixMulKernel.setArg<cl::Buffer>(0, d_inputA); if (impl == 4) matrixMulKernel.setArg<cl::Image2D>(1, d_inputBImg); else matrixMulKernel.setArg<cl::Buffer>(1, d_inputB); matrixMulKernel.setArg<cl::Buffer>(2, d_outputC); matrixMulKernel.setArg<cl_uint>(3, countAX_BY); matrixMulKernel.setArg<cl_uint>(4, countAY); matrixMulKernel.setArg<cl_uint>(5, countBX); if (impl == 3) matrixMulKernel.setArg(6, cl::Local(2 * wgSize * wgSize * sizeof(float))); queue.enqueueNDRangeKernel(matrixMulKernel, cl::NullRange, cl::NDRange(countCX, countCY), cl::NDRange(wgSize, wgSize), NULL, &kernelExecution); // Copy output data back to host cl::Event copy3; queue.enqueueReadBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data(), NULL, ©3); // Print performance data Core::TimeSpan gpuTime = OpenCL::getElapsedTime(kernelExecution); Core::TimeSpan copyTime = OpenCL::getElapsedTime(copy1) + OpenCL::getElapsedTime(copy2) + OpenCL::getElapsedTime(copy3); printPerformance(kernelName, gpuTime, copyTime, atlasTime); // Check whether results are correct if (!compareMatrices(h_outputCCpu, "CPU", h_outputCGpu, "GPU", countCX, countCY)) return 1; } std::cout << "Success" << std::endl; //dumpMatrix ("A", h_inputA, countAX_BY, countAY); //dumpMatrix ("B", h_inputB, countBX, countAX_BY); //dumpMatrix ("C", h_outputCCpu, countCX, countCY); return 0; }