void testMatrix::testMatrices() { Vector4 testTransV = Vector4(4,5,6,1); Matrix4x4 testTrans = Matrix4x4(1,0,0,-4,0,1,0,-5,0,0,1,-6,0,0,0,1); Matrix4x4 testRotZ90 = Matrix4x4(0.f,-1.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f); Matrix4x4 testRotY90 = Matrix4x4(0.f,0.f,1.f,0.f,0.f,1.f,0.f,0.f,-1.f,0.f,0.f,0.f,0.f,0.f,0.f,1.f); Matrix4x4 testRotX90 = Matrix4x4(1.f,0.f,0.f,0.f,0.f,0.f,-1.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,0.f,1.f); //y Matrix4x4 testRot1 = getRotMat(Vector4(0,0,0,1),Vector4(0,1,0,1),90); //x Matrix4x4 testRot2 = getRotMat(Vector4(0,0,0,1),Vector4(1,0,0,1), 90); //z Matrix4x4 testRot3 = getRotMat(Vector4(0,0,0,1),Vector4(0,0,1,1), 90); Matrix4x4 trz90 = getRotZMat(M_PI/2); Matrix4x4 try90 = getRotYMat(M_PI/2); Matrix4x4 trx90 = getRotXMat(M_PI/2); Matrix4x4 trans = getTransMat(testTransV); //Test rotation on z axis compareMatrices(&testRotZ90,&trz90); //Z axis as arbitrary rotation compareMatrices(&testRotZ90,&testRot3); //inverses are equivalent: compareMatrices(&getInvRotZMat(M_PI/2),&getInvRotMat(Vector4(0,0,0,1),Vector4(0,0,1,1),90)); //Test rotation on y axis compareMatrices(&testRotY90,&try90); //Y axis as arbitrary rotation compareMatrices(&testRotY90,&testRot1); //inverses are equivalent: compareMatrices(&getInvRotYMat(M_PI/2),&getInvRotMat(Vector4(0,0,0,1),Vector4(0,1,0,1),90)); //Test rotation on x axis compareMatrices(&testRotX90,&trx90); //X axis as arbitrary rotation compareMatrices(&testRotX90,&testRot2); //inverses are equivalent: compareMatrices(&getInvRotXMat(M_PI/2),&getInvRotMat(Vector4(0,0,0,1),Vector4(1,0,0,1),90)); //test translation compareMatrices(&testTrans,&trans); }
// Inverse-computation Function Test.
//
// Loads a reference matrix from "../storage.xml" (element "InvertibleMatrix"), then, for
// every registered velocity solver that has an initialiser XML, creates and initialises the
// solver, asks it for the inverse of the reference matrix and expects
// test_matrix * inverse to equal the identity within TOLERANCE_L.
//
// Problems with a single solver are reported through ADD_FAILURE() followed by `continue`,
// so one broken solver does not abort the checks for the remaining ones.
TEST_F(ExoticaVelSolverTest, IFT)
{
  // Side length used for the weighting matrices and expected for the inverse.
  // NOTE(review): assumes the matrix stored in storage.xml is 30x30 - confirm.
  const int kDim = 30;

  // Temporaries
  Eigen::MatrixXd test_matrix;
  tinyxml2::XMLDocument xml_document;
  boost::shared_ptr<tinyxml2::XMLHandle> xml_handle_ptr;

  // Load the matrix for inversion testing; without it nothing below is meaningful, hence ASSERT_*.
  ASSERT_EQ(tinyxml2::XML_NO_ERROR, xml_document.LoadFile((resource_path_ + std::string("../storage.xml")).c_str()));
  xml_handle_ptr.reset(new tinyxml2::XMLHandle(xml_document.RootElement()));
  ASSERT_NE(nullptr, xml_handle_ptr->FirstChildElement("InvertibleMatrix").ToElement());
  ASSERT_TRUE(exotica::getMatrix(*(xml_handle_ptr->FirstChildElement("InvertibleMatrix").ToElement()), test_matrix));

  // Now iterate through all the registered velocity solvers.
  // The index is unsigned to match the type returned by size().
  for (std::size_t i = 0; i < registered_types_.size(); ++i)
  {
    std::string xml_path;
    if (!exotica::TestRegistrar::Instance()->findXML(registered_types_[i], xml_path))
    {
      continue; // No initialiser XML found for this type: it is not tested.
    }

    // Attempt to load the initialiser file for this solver.
    if (xml_document.LoadFile((resource_path_ + xml_path).c_str()) != tinyxml2::XML_NO_ERROR)
    {
      ADD_FAILURE() << " : Could not Load initialiser for " << registered_types_[i] << " (file: " << resource_path_ + xml_path << ").";
      continue; // Go to next object
    }

    // Attempt to create the solver object.
    if (!(vel_solv_ptr_ = exotica::VelocitySolverCreator::Instance()->createObject(registered_types_[i], params_)))
    {
      ADD_FAILURE() << " : Could not create object of type " << registered_types_[i];
      continue; // Go to next object
    }

    xml_handle_ptr.reset(new tinyxml2::XMLHandle(xml_document.RootElement())); // Get handle to root element
    *xml_handle_ptr = xml_handle_ptr->FirstChildElement("VelocitySolver");     // Locate the child
    if (!vel_solv_ptr_->initBase(*xml_handle_ptr))
    {
      ADD_FAILURE() << " : Could not initialise " << registered_types_[i];
      continue;
    }

    // The reference matrix is known to be invertible, so a failure here is a solver defect.
    Eigen::MatrixXd temp_inverse;
    if (!vel_solv_ptr_->getInverse(test_matrix, Eigen::MatrixXd::Identity(kDim, kDim) * 0.0000001, Eigen::MatrixXd::Identity(kDim, kDim), temp_inverse))
    {
      ADD_FAILURE() << " : Could not compute inverse for " << registered_types_[i];
      continue;
    }

    if (temp_inverse.rows() == kDim && temp_inverse.cols() == kDim)
    {
      EXPECT_TRUE(compareMatrices(Eigen::MatrixXd::Identity(kDim, kDim), test_matrix * temp_inverse, TOLERANCE_L));
    }
    else
    {
      // The check is about the inverse, not a Jacobian, so the message names the inverse
      // and reports the offending dimensions.
      ADD_FAILURE() << " : Inverse computed for " << registered_types_[i] << " has wrong dimensions ("
                    << temp_inverse.rows() << "x" << temp_inverse.cols() << ", expected " << kDim << "x" << kDim << ")";
    }
  }
}
inline int compareAndSum(int val,const pair<MyMat,MyMat > &p) { return val+compareMatrices(p.first,p.second); }
////////////////////////////////////////////////////////////////////////////// // Main function ////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { // Create a context cl::Context context(CL_DEVICE_TYPE_GPU); // Get a device of the context int deviceNr = argc < 2 ? 1 : atoi(argv[1]); std::cout << "Using device " << deviceNr << " / " << context.getInfo<CL_CONTEXT_DEVICES>().size() << std::endl; ASSERT (deviceNr > 0); ASSERT ((size_t) deviceNr <= context.getInfo<CL_CONTEXT_DEVICES>().size()); cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>()[deviceNr - 1]; std::vector<cl::Device> devices; devices.push_back(device); OpenCL::printDeviceInfo(std::cout, device); // Create a command queue cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE); // Declare some values std::size_t wgSize = 16; std::size_t countAX_BY = 512; std::size_t countAY = 1024; std::size_t countBX = 768; std::size_t countCX = countBX; std::size_t countCY = countAY; std::size_t countA = countAX_BY * countAY; std::size_t countB = countBX * countAX_BY; std::size_t countC = countCX * countCY; std::size_t sizeA = countA * sizeof (float); std::size_t sizeB = countB * sizeof (float); std::size_t sizeC = countC * sizeof (float); // Load the source code cl::Program program = OpenCL::loadProgramSource(context, "src/OpenCLExercise4_MatrixMultiplication.cl"); // Compile the source code. 
This is similar to program.build(devices) but will print more detailed error messages // This will pass the value of wgSize as a preprocessor constant "WG_SIZE" to the OpenCL C compiler OpenCL::buildProgram(program, devices, "-DWG_SIZE=" + boost::lexical_cast<std::string>(wgSize)); // Allocate space for output data from CPU and GPU on the host std::vector<float> h_inputA (countA); std::vector<float> h_inputB (countB); std::vector<float> h_outputCCpu (countC); std::vector<float> h_outputCAtlas (countC); std::vector<float> h_outputCGpu (countC); // Allocate space for input and output data on the device cl::Buffer d_inputA (context, CL_MEM_READ_WRITE, sizeA); cl::Buffer d_inputB (context, CL_MEM_READ_WRITE, sizeB); cl::Buffer d_outputC (context, CL_MEM_READ_WRITE, sizeC); cl::Image2D d_inputAImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countAX_BY, countAY); cl::Image2D d_inputBImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countBX, countAX_BY); // Initialize memory to 0xff (useful for debugging because otherwise GPU memory will contain information from last execution) memset(h_inputA.data(), 255, sizeA); memset(h_inputB.data(), 255, sizeB); memset(h_outputCCpu.data(), 255, sizeC); memset(h_outputCAtlas.data(), 255, sizeC); memset(h_outputCGpu.data(), 255, sizeC); //TODO: GPU queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data()); queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data()); queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data()); //////// Generate input data //////////////////////////////// // Use random input data for (std::size_t i = 0; i < countA; i++) h_inputA[i] = (rand() % 100) / 5.0f - 10.0f; for (std::size_t i = 0; i < countB; i++) h_inputB[i] = (rand() % 100) / 5.0f - 10.0f; // Use integer numbers as data /* for (std::size_t i = 0; i < countA; i++) h_inputA[i] = i; for (std::size_t i = 0; i < countB; i++) h_inputB[i] = (int)i - 5; */ // Do calculation on the host side 
Core::TimeSpan cpuStart = Core::getCurrentTime(); matrixMulHost(h_inputA, h_inputB, h_outputCCpu, countAX_BY, countAY, countBX); Core::TimeSpan cpuEnd = Core::getCurrentTime(); // Do calculation on using libatlas Core::TimeSpan atlasStart = Core::getCurrentTime(); cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, countAY, countBX, countAX_BY, 1.0, h_inputA.data(), countAX_BY, h_inputB.data(), countBX, 0.0, h_outputCAtlas.data(), countCX); Core::TimeSpan atlasEnd = Core::getCurrentTime(); Core::TimeSpan cpuTime = cpuEnd - cpuStart; Core::TimeSpan atlasTime = atlasEnd - atlasStart; printPerformanceHeader(); printPerformance("CPU", cpuTime, atlasTime); printPerformance("Atlas", atlasTime, atlasTime); if (!compareMatrices(h_outputCCpu, "CPU", h_outputCAtlas, "Atlas", countCX, countCY)) return 1; // Copy input data to device cl::Event copy1; cl::Event copy2; queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data(), NULL, ©1); queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data(), NULL, ©2); // Iterate over all implementations (task 1 - 2) for (int impl = 1; impl <= 4; impl++) { // Reinitialize output memory to 0xff memset(h_outputCGpu.data(), 255, sizeC); queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data()); // Create a kernel object std::string kernelName = "matrixMulKernel" + boost::lexical_cast<std::string> (impl); cl::Kernel matrixMulKernel(program, kernelName.c_str ()); if (impl == 4) { cl::size_t<3> origin; origin[0] = origin[1] = origin[2] = 0; cl::size_t<3> region; region[0] = countAX_BY; region[1] = countAY; region[2] = 1; queue.enqueueWriteImage(d_inputAImg, true, origin, region, countAX_BY * sizeof (float), 0, h_inputA.data(), NULL, ©1); region[0] = countBX; region[1] = countAX_BY; queue.enqueueWriteImage(d_inputBImg, true, origin, region, countBX * sizeof (float), 0, h_inputB.data(), NULL, ©2); } // Launch kernel on the device cl::Event kernelExecution; if (impl == 4) matrixMulKernel.setArg<cl::Image2D>(0, 
d_inputAImg); else matrixMulKernel.setArg<cl::Buffer>(0, d_inputA); if (impl == 4) matrixMulKernel.setArg<cl::Image2D>(1, d_inputBImg); else matrixMulKernel.setArg<cl::Buffer>(1, d_inputB); matrixMulKernel.setArg<cl::Buffer>(2, d_outputC); matrixMulKernel.setArg<cl_uint>(3, countAX_BY); matrixMulKernel.setArg<cl_uint>(4, countAY); matrixMulKernel.setArg<cl_uint>(5, countBX); if (impl == 3) matrixMulKernel.setArg(6, cl::Local(2 * wgSize * wgSize * sizeof(float))); queue.enqueueNDRangeKernel(matrixMulKernel, cl::NullRange, cl::NDRange(countCX, countCY), cl::NDRange(wgSize, wgSize), NULL, &kernelExecution); // Copy output data back to host cl::Event copy3; queue.enqueueReadBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data(), NULL, ©3); // Print performance data Core::TimeSpan gpuTime = OpenCL::getElapsedTime(kernelExecution); Core::TimeSpan copyTime = OpenCL::getElapsedTime(copy1) + OpenCL::getElapsedTime(copy2) + OpenCL::getElapsedTime(copy3); printPerformance(kernelName, gpuTime, copyTime, atlasTime); // Check whether results are correct if (!compareMatrices(h_outputCCpu, "CPU", h_outputCGpu, "GPU", countCX, countCY)) return 1; } std::cout << "Success" << std::endl; //dumpMatrix ("A", h_inputA, countAX_BY, countAY); //dumpMatrix ("B", h_inputB, countBX, countAX_BY); //dumpMatrix ("C", h_outputCCpu, countCX, countCY); return 0; }
int main(int argc, char * argv[]) { //checking for desired number of threads if (argc != 1) { if (strcmp(argv[1],"-t") || (threads = atoi(argv[2])) < 2) { printf("Usage: %s [-t <NUMBER_OF_THREADS>]\n Default Number of Threads: 2\n" , argv[0]); exit(-1); } printf("Pthreads and OpenMP Calculations will be done with %i threads.\n", threads); } struct timeval start, end; matrix firsts[MAX_TEST_CASES]; matrix seconds[MAX_TEST_CASES]; matrix results[MAX_TEST_CASES]; //parses all Testmatrices and stores them in two arrays // int num; //the actual number of testcases printf("Reading matrices from InputFile...\n"); for (int i = 0; i < MAX_TEST_CASES; ++i) { int end = parseMatrices(PATH_TO_TESTS, i, &firsts[i], &seconds[i]); if (end == 0) { printf("Parsing went wrong\n"); exit(0); } } FILE* performance; //first, sequential computation printf("Sequential calculation...\n"); gettimeofday(&start, NULL); for (int i = 0; i < MAX_TEST_CASES; ++i) { sequential(&firsts[i], &seconds[i], &results[i]); } gettimeofday(&end, NULL); //prints the results of sequential computation to file printf("Printing results...\n"); for (int i = 0; i < MAX_TEST_CASES; ++i) { if(!printMatrix(&results[i], PATH_TO_RESULTS)) { printf("Theres a problem with the output stream.\n"); exit(0); } } printf("Sequential calculation complete. 
Check %s for results\n" , PATH_TO_RESULTS); performance = fopen(PATH_TO_TIMES, "w"); fprintf(performance, "Sequential Implementation took %.3lf seconds for all testcases.\n" , getDifference(start, end)); //openMP Implementation is up next matrix ompresults[MAX_TEST_CASES]; printf("OpenMP calculation...\n"); gettimeofday(&start, NULL); for (int i = 0; i < MAX_TEST_CASES; ++i) { openMP(&(firsts[i]), &(seconds[i]), &(ompresults[i]), threads); } gettimeofday(&end, NULL); for (int i = 0; i < MAX_TEST_CASES; ++i) { if(!compareMatrices(&results[i], &ompresults[i])) { printf("OMP-Implementation has faults!"); exit(0); } } fprintf(performance, "OpenMP-Implementation took %.3lf seconds for all testcases.\n" , getDifference(start, end)); //lastly, pthreads implementation matrix ptresults[MAX_TEST_CASES]; printf("Posix-Threads calculation...\n"); gettimeofday(&start, NULL); for (int i = 0; i < MAX_TEST_CASES; ++i) { multithreaded(&firsts[i], &seconds[i], &ptresults[i], threads); } gettimeofday(&end, NULL); for (int i = 0; i < MAX_TEST_CASES; ++i) { if(!compareMatrices(&results[i], &ptresults[i])) { printf("Pthread-Implementation has faults!"); exit(0); } } fprintf(performance, "Pthreads-Implementation took %.3lf seconds for all testcases.\n" , getDifference(start, end)); //cleaning up printf("Cleaning up...\n"); fclose(performance); for (int i = 0; i < MAX_TEST_CASES; ++i) { free(firsts[i].values); free(seconds[i].values); free(results[i].values); free(ompresults[i].values); free(ptresults[i].values); } printf("All done, see %s for a summarization of each implementation's performance.\n" , PATH_TO_TIMES); exit(0); }