int main(void) { //mkl_set_num_threads(1); mkl_domain_set_num_threads(1, MKL_BLAS); CU_pSuite pSuite = NULL; /* initialize the CUnit test registry */ if (CUE_SUCCESS != CU_initialize_registry()) { return CU_get_error(); } /* Create a test array */ CU_TestInfo test_array[] = { { "Parallel Gaussian" , test_pfunc }, CU_TEST_INFO_NULL, }; /* Create the test suite */ CU_SuiteInfo suites[] = { { "Parallel Evaluation of Expressions", init_suite, clean_suite, NULL, NULL, test_array }, CU_SUITE_INFO_NULL, }; /* Register test suites */ CU_ErrorCode CU_error = CU_register_suites(suites); if (CU_error != CUE_SUCCESS) { debug_body("%s", CU_get_error_msg()); CU_cleanup_registry(); return CU_get_error(); } /* Run all tests using the CUnit Basic interface */ CU_basic_set_mode(CU_BRM_VERBOSE); CU_basic_run_tests(); CU_cleanup_registry(); return CU_get_error(); }
int main(int argc, char *argv[]) { long matrixSize= 16384; int blockSize = 128; bool runSequential = false; bool validate = false; int numBlasThreads = 40; int numGausElimThreads = 2; int numFactorLowerThreads = 4; int numFactorUpperThreads = 4; int numMatrixMulThreads = 30; std::string runtimeFileStr("runtimes"); int numRetry = 1; if (argc > 1) { for (int arg = 1; arg < argc; arg++) { std::string argvs(argv[arg]); if (argvs == "--size") { arg++; matrixSize = atoi(argv[arg]); } if (argvs == "--num-threads-blas") { arg++; numBlasThreads = atoi(argv[arg]); } if (argvs == "num-threads-factor-l") { arg++; numFactorLowerThreads = atoi(argv[arg]); } if (argvs == "num-threads-factor-u") { arg++; numFactorUpperThreads = atoi(argv[arg]); } if (argvs == "num-threads-gaus") { arg++; numGausElimThreads = atoi(argv[arg]); } if (argvs == "num-threads-gemm") { arg++; numMatrixMulThreads = atoi(argv[arg]); } if (argvs == "--run-sequential") { runSequential = true; } if (argvs == "--num-retry" && arg + 1 < argc) { arg++; numRetry = atoi(argv[arg]); } if (argvs == "--block-size") { arg++; blockSize = atoi(argv[arg]); } if (argvs == "--runtime-file" && arg + 1 < argc) { runtimeFileStr = argv[arg + 1]; arg++; } if (argvs == "--validate-results") { validate = true; } if (argvs == "--help") { std::cout << argv[0] << " args: [--size <#>] [--block-size <#>] [--num-retry <#>] [--runtime-file <filename>] [--validate-results] [--run-sequential] [--num-threads-factor-l <#>] [--num-threads-factor-u <#>] [--num-threads-gaus <#>] [--num-threads-gemm <#>] [--num-threads-blas <#>] [--help]" << std::endl; exit(0); } } } std::ofstream runtimeFile(runtimeFileStr, std::ios::app); double *matrix = new double[matrixSize * matrixSize]; double *matrixTest = nullptr; // TODO: Ensure diagonally dominant initMatrixDiagDom(matrix, matrixSize, matrixSize, true); if (validate) { matrixTest = new double[matrixSize * matrixSize]; for (int i = 0; i < matrixSize * matrixSize; i++) matrixTest[i] = matrix[i]; } for (int numTry = 0; numTry < numRetry; numTry++) { SimpleClock clk; SimpleClock endToEnd; if (runSequential) { endToEnd.start(); mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL); // mkl_set_num_threads(40); clk.start(); runSequentialLU(matrix, matrixSize); // computeSequentialMatMul(matrixA, matrixB, matrixC, matrixAHeight, sharedDim, matrixBWidth); clk.stopAndIncrement(); endToEnd.stopAndIncrement(); } else { endToEnd.start(); mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL); int gridHeight = (int) matrixSize / blockSize; int gridWidth = (int) matrixSize / blockSize; // TODO: Build graph and runtime htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>> *matrixBlocks = new htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>>(gridHeight, gridWidth, nullptr); for (int r = 0; r < gridHeight; r++) { for (int c = 0; c < gridWidth; c++) { // Store pointer locations for all blocks double *ptr = &matrix[IDX2C(r * blockSize, c *blockSize, matrixSize)]; std::shared_ptr<MatrixRequestData> request(new MatrixRequestData(r, c, MatrixType::MatrixA)); std::shared_ptr<MatrixBlockData<double *>> data(new MatrixBlockData<double *>(request, ptr, blockSize, blockSize)); matrixBlocks->set(r, c, data); } } GausElimTask *gausElimTask = new GausElimTask(numGausElimThreads, matrixSize, matrixSize); auto gausElimBk = new htgs::Bookkeeper<MatrixBlockData<double *>>(); GausElimRuleUpper *gausElimRuleUpper = new GausElimRuleUpper(matrixBlocks, gridHeight, gridWidth); GausElimRuleLower *gausElimRuleLower = new GausElimRuleLower(matrixBlocks, gridHeight, gridWidth); FactorUpperTask *factorUpperTask = new FactorUpperTask(numFactorUpperThreads, matrixSize, matrixSize); FactorLowerTask *factorLowerTask = new FactorLowerTask(numFactorLowerThreads, matrixSize, matrixSize); auto matrixMulBk = new htgs::Bookkeeper<MatrixBlockData<double *>>(); MatrixMulRule *matrixMulRule = new MatrixMulRule(matrixBlocks, gridHeight, gridWidth); MatrixMulBlkTask *matrixMulTask = new MatrixMulBlkTask(numMatrixMulThreads, matrixSize, matrixSize, matrixSize, matrixSize, blockSize); auto matrixMulResultBk = new htgs::Bookkeeper<MatrixBlockData<double *>>(); int numDiagonals = gridWidth - 1; GausElimRule *gausElimRule = new GausElimRule(numDiagonals, gridHeight, gridWidth); // Number of updates excluding the diagonal and the top/left row/column int numUpdates = (1.0/6.0) * (double)gridWidth * (2.0 * ((double)gridWidth * (double)gridWidth) - 3.0 * (double)gridWidth + 1.0); UpdateRule *updateRule = new UpdateRule(numUpdates); UpdateRule *updateRule2 = new UpdateRule(numUpdates); auto taskGraph = new htgs::TaskGraph<MatrixBlockData<double *>, htgs::VoidData>(); taskGraph->addGraphInputConsumer(gausElimTask); taskGraph->addEdge(gausElimTask, gausElimBk); taskGraph->addRule(gausElimBk, factorUpperTask, gausElimRuleUpper); taskGraph->addRule(gausElimBk, factorLowerTask, gausElimRuleLower); taskGraph->addEdge(factorUpperTask, matrixMulBk); taskGraph->addEdge(factorLowerTask, matrixMulBk); taskGraph->addRule(matrixMulBk, matrixMulTask, matrixMulRule); taskGraph->addEdge(matrixMulTask, matrixMulResultBk); if (numDiagonals > 0) taskGraph->addRule(matrixMulResultBk, gausElimTask, gausElimRule); if (numUpdates > 0) taskGraph->addRule(matrixMulResultBk, matrixMulBk, updateRule); if (numUpdates > 0) taskGraph->addRule(matrixMulResultBk, gausElimBk, updateRule2); taskGraph->incrementGraphInputProducer(); taskGraph->writeDotToFile("lud-graph.dot"); htgs::Runtime *runtime = new htgs::Runtime(taskGraph); clk.start(); runtime->executeRuntime(); taskGraph->produceData(matrixBlocks->get(0, 0)); taskGraph->finishedProducingData(); runtime->waitForRuntime(); clk.stopAndIncrement(); delete runtime; endToEnd.stopAndIncrement(); } double operations = (2.0 * (matrixSize * matrixSize * matrixSize)) / 3.0; double flops = operations / clk.getAverageTime(TimeVal::SEC); double gflops = flops / 1073741824.0; std::cout << (runSequential ? "sequential" : "htgs") << ", matrix-size: " << matrixSize << ", " << "blockSize: " << (runSequential ? 0 : blockSize) << ", blasThreads: " << numBlasThreads << ", gausThreads: " << numGausElimThreads << ", factorUpperThreads: " << numFactorUpperThreads << ", factorLowerThreads: " << numFactorLowerThreads << ", gemmThreads: " << numMatrixMulThreads << ", time:" << clk.getAverageTime(TimeVal::MILLI) << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI) << ", gflops: " << gflops << std::endl; runtimeFile << (runSequential ? "sequential" : "htgs") << ", " << matrixSize << ", " << blockSize << ", " << numBlasThreads << ", " << numGausElimThreads << ", " << numFactorUpperThreads << ", " << numFactorLowerThreads << ", " << numMatrixMulThreads << ", " << clk.getAverageTime(TimeVal::MILLI) << ", " << endToEnd.getAverageTime(TimeVal::MILLI) << ", " << gflops << std::endl; if (validate) { int res = validateResults(matrix, matrixTest, matrixSize); std::cout << (res == 0 ? "PASSED" : "FAILED") << std::endl; } } delete[] matrix; delete[] matrixTest; }