TEST_F(MergeReceiveExecutorTest, singlePartitionLimitOffsetTest) {
    // A single partition containing the pre-sorted run {0, 1, 1, 2}
    std::vector<int> values;
    values.push_back(0);
    values.push_back(1);
    values.push_back(1);
    values.push_back(2);

    std::vector<TableTuple> tuples;
    std::vector<int64_t> partitionTupleCounts;
    boost::scoped_ptr<char> cleaner(
        addPartitionData(values, tuples, partitionTupleCounts));

    std::vector<SortDirectionType> dirs(1, SORT_DIRECTION_TYPE_ASC);
    AbstractExecutor::TupleComparer comp(getSortKeys(), dirs);

    // Merge with LIMIT 2 OFFSET 1; no inlined aggregation or progress monitor
    int limit = 2;
    int offset = 1;
    AggregateExecutorBase* agg_exec = NULL;
    ProgressMonitorProxy* pmp = NULL;
    MergeReceiveExecutor::merge_sort(tuples,
                                     partitionTupleCounts,
                                     comp,
                                     limit,
                                     offset,
                                     agg_exec,
                                     getDstTempTable(),
                                     pmp);
    validateResults(comp, tuples, limit, offset);
}
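The behavior exercised above, merging already-sorted partition runs and then applying an offset and limit, can be sketched in plain standard C++ without the VoltDB executor machinery. The helper below is a hypothetical illustration of that technique (a heap-based k-way merge), not the engine's implementation; all names are illustrative.

#include <cassert>
#include <queue>
#include <utility>
#include <vector>

// Sketch: k-way merge of pre-sorted runs with offset/limit semantics,
// analogous to what MergeReceiveExecutor::merge_sort is tested for above.
std::vector<int> mergeRuns(const std::vector<std::vector<int>>& runs,
                           int limit, int offset) {
    // Entry = (value, (run index, position within run))
    using Entry = std::pair<int, std::pair<size_t, size_t>>;
    auto cmp = [](const Entry& a, const Entry& b) { return a.first > b.first; };
    std::priority_queue<Entry, std::vector<Entry>, decltype(cmp)> heap(cmp);

    for (size_t r = 0; r < runs.size(); ++r)
        if (!runs[r].empty())
            heap.push({runs[r][0], {r, 0}});

    std::vector<int> out;
    int skipped = 0;
    while (!heap.empty()) {
        if (limit >= 0 && (int)out.size() >= limit)
            break;                    // limit reached (limit < 0 means "no limit")
        Entry e = heap.top();
        heap.pop();
        if (skipped < offset)
            ++skipped;                // consume but discard the first `offset` rows
        else
            out.push_back(e.first);
        size_t run = e.second.first, idx = e.second.second + 1;
        if (idx < runs[run].size())
            heap.push({runs[run][idx], {run, idx}});
    }
    return out;
}

// For the data in the test above (one run {0, 1, 1, 2}, limit 2, offset 1)
// this sketch yields {1, 1}.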
BOOL readTest(DWORD dwByteCount, char cResult)
{
    HANDLE hFile = NULL;
    DWORD dwBytesRead;
    BOOL bRc = FALSE;

    // open the test file
    hFile = CreateFile(szReadableFile,
                       GENERIC_READ,
                       FILE_SHARE_READ,
                       NULL,
                       OPEN_EXISTING,
                       FILE_ATTRIBUTE_NORMAL,
                       NULL);
    if (hFile == INVALID_HANDLE_VALUE)
    {
        Trace("ReadFile: ERROR -> Unable to open file \"%s\".\n", szReadableFile);
        return FALSE;
    }

    memset(readBuffer, 0, PAGESIZE);

    bRc = ReadFile(hFile, readBuffer, dwByteCount, &dwBytesRead, NULL);

    if (bRc == FALSE)
    {
        // if it failed, was it supposed to fail?
        if (cResult == '1')
        {
            Trace("\nbRc = %d\n", bRc);
            Trace("readBuffer = [%s] dwByteCount = %d dwBytesRead = %d\n",
                  readBuffer, dwByteCount, dwBytesRead);
            Trace("cresult = 1\n");
            Trace("getlasterror = %d\n", GetLastError());
            CloseHandle(hFile);
            return FALSE;
        }
    }
    else
    {
        CloseHandle(hFile);
        // if it passed, was it supposed to pass?
        if (cResult == '0')
        {
            Trace("cresult = 0\n");
            return FALSE;
        }
        else
        {
            return (validateResults(readBuffer, dwByteCount, dwBytesRead));
        }
    }

    CloseHandle(hFile);
    return TRUE;
}
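A typical way to drive readTest is a small table of byte counts paired with the expected-result flag it interprets ('1' means ReadFile is expected to succeed, '0' that it is expected to fail). The driver below is a hypothetical sketch of that pattern, not part of the PAL suite; it assumes the harness has already created szReadableFile and defined readBuffer and PAGESIZE.

// Hypothetical driver sketch for readTest (illustrative only).
struct ReadCase
{
    DWORD dwByteCount;
    char cExpected;
};

static BOOL runReadCases(void)
{
    struct ReadCase cases[] =
    {
        { 0,        '1' },  /* zero-byte reads succeed on an open handle */
        { 10,       '1' },  /* partial read within the file */
        { PAGESIZE, '1' }   /* read up to the buffer size; hitting EOF still succeeds */
    };
    int i;

    for (i = 0; i < (int)(sizeof(cases) / sizeof(cases[0])); i++)
    {
        if (!readTest(cases[i].dwByteCount, cases[i].cExpected))
        {
            Trace("readTest failed for byte count %d\n", cases[i].dwByteCount);
            return FALSE;
        }
    }
    return TRUE;
}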
TEST_F(MergeReceiveExecutorTest, multipleOverlapPartitionsTest) {
    std::vector<int> values1;
    values1.push_back(10);
    values1.push_back(11);
    values1.push_back(11);
    values1.push_back(12);

    std::vector<int> values2;
    values2.push_back(1);
    values2.push_back(3);
    values2.push_back(4);
    values2.push_back(10);
    values2.push_back(11);
    values2.push_back(15);
    values2.push_back(20);
    values2.push_back(21);
    values2.push_back(25);

    std::vector<int> values3;
    values3.push_back(2);
    values3.push_back(4);
    values3.push_back(10);
    values3.push_back(12);
    values3.push_back(13);
    values3.push_back(15);

    std::vector<TableTuple> tuples;
    std::vector<int64_t> partitionTupleCounts;
    boost::scoped_ptr<char> cleaner1(
        addPartitionData(values1, tuples, partitionTupleCounts));
    boost::scoped_ptr<char> cleaner2(
        addPartitionData(values2, tuples, partitionTupleCounts));
    boost::scoped_ptr<char> cleaner3(
        addPartitionData(values3, tuples, partitionTupleCounts));

    std::vector<SortDirectionType> dirs(1, SORT_DIRECTION_TYPE_ASC);
    AbstractExecutor::TupleComparer comp(getSortKeys(), dirs);

    int limit = -1;
    int offset = 0;
    AggregateExecutorBase* agg_exec = NULL;
    ProgressMonitorProxy* pmp = NULL;
    MergeReceiveExecutor::merge_sort(tuples,
                                     partitionTupleCounts,
                                     comp,
                                     limit,
                                     offset,
                                     agg_exec,
                                     getDstTempTable(),
                                     pmp);
    validateResults(comp, tuples);
}
BOOL writeTest(const char* szString)
{
    HANDLE hFile = NULL;
    DWORD dwBytesWritten;
    BOOL bRc = FALSE;
    BOOL bAllPassed = TRUE;
    int nStringLength = 0;
    char* szPtr = NULL;
    int i = 0;

    // create the test file
    hFile = CreateFile(szWritableFile,
                       GENERIC_WRITE,
                       FILE_SHARE_WRITE,
                       NULL,
                       CREATE_ALWAYS,
                       FILE_ATTRIBUTE_NORMAL,
                       NULL);
    if (hFile == INVALID_HANDLE_VALUE)
    {
        Trace("WriteFile: ERROR -> Unable to create file \"%s\".\n", szWritableFile);
        return FALSE;
    }

    nStringLength = strlen(szString);
    szPtr = (char*) szString;

    for (i = 0; i < nStringLength; i++)
    {
        bRc = WriteFile(hFile, szPtr++, 1, &dwBytesWritten, NULL);
        if ((bRc == FALSE) || (dwBytesWritten != 1))
        {
            bAllPassed = FALSE;
        }
    }
    CloseHandle(hFile);

    if (bAllPassed == FALSE)
    {
        Trace("WriteFile: ERROR: Failed to write data.\n");
        return FALSE;
    }
    else
    {
        return (validateResults(szString));
    }

    return TRUE;
}
int main(int argc, char *argv[]) {
  MatMulArgs matMulArgs;
  matMulArgs.processArgs(argc, argv);

  size_t matrixAHeight = matMulArgs.getMatrixAHeight();
  size_t matrixBWidth = matMulArgs.getMatrixBWidth();
  size_t sharedDim = matMulArgs.getSharedDim();

  size_t blockSize = matMulArgs.getBlockSize();
  size_t numReadThreads = matMulArgs.getNumReadThreads();
  size_t numProdThreads = matMulArgs.getNumMatMulThreads();
  size_t numAccumThreads = (size_t) ceil((double) numProdThreads / 2.0);
  std::string directory = matMulArgs.getDirectory();
  std::string outputDirectory = matMulArgs.getOutputDir();
  bool runSequential = matMulArgs.isRunSequential();
  bool validate = matMulArgs.isValidateResults();

  size_t numGPUs = matMulArgs.getNumGPUs();
  int gpuIds[numGPUs];
  matMulArgs.copyGpuIds(gpuIds);

//  CUcontext *contexts = initCuda(numGPUs, gpuIds);

  std::string runtimeFileStr("runtimes");
  int numRetry = 1;
  std::ofstream runtimeFile(runtimeFileStr, std::ios::app);

  double *matrixA = new double[matrixAHeight * sharedDim];
  double *matrixB = new double[matrixBWidth * sharedDim];
  double *matrixC = new double[matrixAHeight * matrixBWidth];

  initMatrix(matrixA, sharedDim, matrixAHeight, true);
  initMatrix(matrixB, matrixBWidth, sharedDim, true);

  for (int numTry = 0; numTry < numRetry; numTry++) {
    SimpleClock clk;
    SimpleClock endToEnd;

    if (runSequential) {
      endToEnd.start();
      initMatMul(numProdThreads);

      cublasXtHandle_t handle;
      cublasXtCreate(&handle);
      cublasXtDeviceSelect(handle, numGPUs, gpuIds);
      cublasXtSetBlockDim(handle, blockSize);

      clk.start();
      computeSequentialMatMul(matrixA, matrixB, matrixC,
                              (size_t) matrixAHeight, (size_t) sharedDim, (size_t) matrixBWidth,
                              handle);
      clk.stopAndIncrement();

      cublasXtDestroy(handle);
      endToEnd.stopAndIncrement();
    }
    else {
      endToEnd.start();
      initMatMul(1);

      // Tasks: load blocks of A and B, copy them to the GPU, multiply blocks,
      // accumulate partial products, and write the result matrix
      LoadMatrixTask *readAMatTask =
          new LoadMatrixTask(matrixA, numReadThreads, MatrixType::MatrixA, blockSize, sharedDim, matrixAHeight, true);
      LoadMatrixTask *readBMatTask =
          new LoadMatrixTask(matrixB, numReadThreads, MatrixType::MatrixB, blockSize, matrixBWidth, sharedDim, true);

      MatrixMulBlkCudaTask *mmulTask = new MatrixMulBlkCudaTask(gpuIds, numGPUs);
      MatMulAccumTask *accumTask = new MatMulAccumTask(numAccumThreads, true);
      MatMulOutputTask *outputTask = new MatMulOutputTask(matrixC, matrixAHeight, blockSize, true);

      size_t blkHeightMatB = readBMatTask->getNumBlocksRows();
      size_t blkWidthMatB = readBMatTask->getNumBlocksCols();
      size_t blkHeightMatA = readAMatTask->getNumBlocksRows();
      size_t blkWidthMatA = readAMatTask->getNumBlocksCols();

      CudaCopyInTask *cudaCopyInATask = new CudaCopyInTask(gpuIds, numGPUs, MatrixType::MatrixA, blkWidthMatB);
      CudaCopyInTask *cudaCopyInBTask = new CudaCopyInTask(gpuIds, numGPUs, MatrixType::MatrixB, blkHeightMatA);
      CudaCopyOutTask *cudaCopyOutCTask = new CudaCopyOutTask(gpuIds, numGPUs, MatrixType::MatrixC);

      MatMulDistributeRule *distributeRuleMatA = new MatMulDistributeRule(MatrixType::MatrixA);
      MatMulDistributeRule *distributeRuleMatB = new MatMulDistributeRule(MatrixType::MatrixB);

      MatMulLoadRule<htgs::m_data_t<double>> *loadRule =
          new MatMulLoadRule<htgs::m_data_t<double>>(blkWidthMatA, blkHeightMatA, blkWidthMatB, blkHeightMatB);
      MatMulAccumulateRule<double *> *accumulateRule =
          new MatMulAccumulateRule<double *>(blkWidthMatB, blkHeightMatA, blkWidthMatA);
      MatMulOutputRule *outputRule = new MatMulOutputRule(blkWidthMatB, blkHeightMatA, blkWidthMatA);

      auto distributeBk = new htgs::Bookkeeper<MatrixRequestData>();
      auto matMulBk = new htgs::Bookkeeper<MatrixBlockData<htgs::m_data_t<double>>>();
      auto matAccumBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      // Build the GPU task graph; it is replicated per GPU by the execution pipeline
      auto taskGraph = new htgs::TaskGraphConf<MatrixRequestData, MatrixBlockData<double *>>();

      taskGraph->setGraphConsumerTask(distributeBk);
      taskGraph->addRuleEdge(distributeBk, distributeRuleMatA, readAMatTask);
      taskGraph->addRuleEdge(distributeBk, distributeRuleMatB, readBMatTask);

      taskGraph->addEdge(readAMatTask, cudaCopyInATask);
      taskGraph->addEdge(readBMatTask, cudaCopyInBTask);

      taskGraph->addEdge(cudaCopyInATask, matMulBk);
      taskGraph->addEdge(cudaCopyInBTask, matMulBk);

      taskGraph->addRuleEdge(matMulBk, loadRule, mmulTask);
      taskGraph->addEdge(mmulTask, cudaCopyOutCTask);
      taskGraph->addGraphProducerTask(cudaCopyOutCTask);

      taskGraph->addCudaMemoryManagerEdge(matrixTypeToString(MatrixType::MatrixA) + "Copy",
                                          cudaCopyInATask,
                                          new CudaAllocator(blockSize, blockSize),
                                          blkWidthMatB + 1,
                                          htgs::MMType::Static,
                                          gpuIds);
      taskGraph->addCudaMemoryManagerEdge(matrixTypeToString(MatrixType::MatrixB) + "Copy",
                                          cudaCopyInBTask,
                                          new CudaAllocator(blockSize, blockSize),
                                          blkHeightMatA + 1,
                                          htgs::MMType::Static,
                                          gpuIds);
      taskGraph->addCudaMemoryManagerEdge(matrixTypeToString(MatrixType::MatrixC),
                                          mmulTask,
                                          new CudaAllocator(blockSize, blockSize),
                                          4,
                                          htgs::MMType::Static,
                                          gpuIds);

      auto mainTaskGraph = new htgs::TaskGraphConf<MatrixRequestData, MatrixRequestData>();

      auto execPipeline =
          new htgs::ExecutionPipeline<MatrixRequestData, MatrixBlockData<double *>>(numGPUs, taskGraph);
      auto decompositionRule = new MatrixDecompositionRule(numGPUs);
      execPipeline->addInputRule(decompositionRule);

      mainTaskGraph->setGraphConsumerTask(execPipeline);
      mainTaskGraph->addEdge(execPipeline, matAccumBk);
      mainTaskGraph->addRuleEdge(matAccumBk, outputRule, outputTask);
      mainTaskGraph->addRuleEdge(matAccumBk, accumulateRule, accumTask);
      mainTaskGraph->addEdge(accumTask, matAccumBk);
      mainTaskGraph->addGraphProducerTask(outputTask);

//      mainTaskGraph->writeDotToFile("pre-execution.dot");

      htgs::TaskGraphRuntime *runtime = new htgs::TaskGraphRuntime(mainTaskGraph);

      clk.start();
      runtime->executeRuntime();

      // Produce block requests for matrices A and B
      for (size_t col = 0; col < blkWidthMatA; col++) {
        for (size_t row = 0; row < blkHeightMatA; row++) {
          MatrixRequestData *matA = new MatrixRequestData(row, col, MatrixType::MatrixA);
          mainTaskGraph->produceData(matA);
        }
      }

      for (size_t row = 0; row < blkHeightMatB; row++) {
        for (size_t col = 0; col < blkWidthMatB; col++) {
          MatrixRequestData *matB = new MatrixRequestData(row, col, MatrixType::MatrixB);
          mainTaskGraph->produceData(matB);
        }
      }

      mainTaskGraph->finishedProducingData();

      while (!mainTaskGraph->isOutputTerminated()) {
        auto data = mainTaskGraph->consumeData();
        if (data != nullptr) {
//          std::cout << data->getRow() << ", " << data->getCol() << std::endl;
        }
      }

      runtime->waitForRuntime();

//      taskGraph->writeDotToFile("profile-graph.dot");
//      mainTaskGraph->writeDotToFile("profile-all-threads-graph.dot", DOTGEN_FLAG_SHOW_ALL_THREADING);
      mainTaskGraph->writeDotToFile("matrix-multiplication-cuda-multigpu.dot", DOTGEN_COLOR_COMP_TIME);

      clk.stopAndIncrement();

      delete runtime;
      endToEnd.stopAndIncrement();
    }

    if (validate) {
      double *matrixCTest = new double[matrixAHeight * matrixBWidth];

      initMatMul(numProdThreads);
      cublasXtHandle_t handle;
      cublasXtCreate(&handle);
      cublasXtDeviceSelect(handle, (int) numGPUs, gpuIds);
      cublasXtSetBlockDim(handle, (int) blockSize);

      computeSequentialMatMul(matrixA, matrixB, matrixCTest,
                              (size_t) matrixAHeight, (size_t) sharedDim, (size_t) matrixBWidth,
                              handle);
      cublasXtDestroy(handle);

      int res = validateResults(matrixC, matrixCTest, matrixAHeight, matrixBWidth);
      if (res != 0) {
        std::cout << "Error: validation failed!" << std::endl;
      }
      else {
        std::cout << "Test PASSED" << std::endl;
      }

      delete[] matrixCTest;
    }

    double numGflops = (2.0 * matrixAHeight * sharedDim * matrixBWidth) * 1.0e-9;
    double gflops = numGflops / clk.getAverageTime(TimeVal::SEC);

    std::cout << (runSequential ? "sequential" : "htgs") << ", " << numProdThreads
              << ", accum-threads: " << numAccumThreads
              << ", width-b: " << matrixBWidth
              << ", height-a: " << matrixAHeight
              << ", shared-dim: " << sharedDim
              << ", blockSize: " << blockSize
              << ", time:" << clk.getAverageTime(TimeVal::MILLI)
              << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI)
              << ", gflops: " << gflops
              << std::endl;

    runtimeFile << "MULTIGPU-MM" << (runSequential ? "sequential" : "htgs")
                << ", " << numProdThreads
                << ", " << numAccumThreads
                << ", " << matrixBWidth
                << ", " << matrixAHeight
                << ", " << sharedDim
                << ", " << blockSize
                << ", " << clk.getAverageTime(TimeVal::MILLI)
                << ", " << endToEnd.getAverageTime(TimeVal::MILLI)
                << std::endl;
  }

  delete[] matrixA;
  delete[] matrixB;
  delete[] matrixC;
}
int main(int argc, char *argv[]) {
  long matrixSize = 16384;
  int blockSize = 128;
  bool runSequential = false;
  bool validate = false;

  int numBlasThreads = 40;
  int numGausElimThreads = 2;
  int numFactorLowerThreads = 4;
  int numFactorUpperThreads = 4;
  int numMatrixMulThreads = 30;

  std::string runtimeFileStr("runtimes");
  int numRetry = 1;

  if (argc > 1) {
    for (int arg = 1; arg < argc; arg++) {
      std::string argvs(argv[arg]);

      if (argvs == "--size") {
        arg++;
        matrixSize = atoi(argv[arg]);
      }

      if (argvs == "--num-threads-blas") {
        arg++;
        numBlasThreads = atoi(argv[arg]);
      }

      if (argvs == "--num-threads-factor-l") {
        arg++;
        numFactorLowerThreads = atoi(argv[arg]);
      }

      if (argvs == "--num-threads-factor-u") {
        arg++;
        numFactorUpperThreads = atoi(argv[arg]);
      }

      if (argvs == "--num-threads-gaus") {
        arg++;
        numGausElimThreads = atoi(argv[arg]);
      }

      if (argvs == "--num-threads-gemm") {
        arg++;
        numMatrixMulThreads = atoi(argv[arg]);
      }

      if (argvs == "--run-sequential") {
        runSequential = true;
      }

      if (argvs == "--num-retry" && arg + 1 < argc) {
        arg++;
        numRetry = atoi(argv[arg]);
      }

      if (argvs == "--block-size") {
        arg++;
        blockSize = atoi(argv[arg]);
      }

      if (argvs == "--runtime-file" && arg + 1 < argc) {
        runtimeFileStr = argv[arg + 1];
        arg++;
      }

      if (argvs == "--validate-results") {
        validate = true;
      }

      if (argvs == "--help") {
        std::cout << argv[0]
                  << " args: [--size <#>] [--block-size <#>] [--num-retry <#>] [--runtime-file <filename>] [--validate-results] [--run-sequential] [--num-threads-factor-l <#>] [--num-threads-factor-u <#>] [--num-threads-gaus <#>] [--num-threads-gemm <#>] [--num-threads-blas <#>] [--help]"
                  << std::endl;
        exit(0);
      }
    }
  }

  std::ofstream runtimeFile(runtimeFileStr, std::ios::app);

  double *matrix = new double[matrixSize * matrixSize];
  double *matrixTest = nullptr;

  // TODO: Ensure diagonally dominant
  initMatrixDiagDom(matrix, matrixSize, matrixSize, true);

  if (validate) {
    matrixTest = new double[matrixSize * matrixSize];
    for (int i = 0; i < matrixSize * matrixSize; i++)
      matrixTest[i] = matrix[i];
  }

  for (int numTry = 0; numTry < numRetry; numTry++) {
    SimpleClock clk;
    SimpleClock endToEnd;

    if (runSequential) {
      endToEnd.start();
      mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL);
//      mkl_set_num_threads(40);
      clk.start();
      runSequentialLU(matrix, matrixSize);
//      computeSequentialMatMul(matrixA, matrixB, matrixC, matrixAHeight, sharedDim, matrixBWidth);
      clk.stopAndIncrement();
      endToEnd.stopAndIncrement();
    }
    else {
      endToEnd.start();
      mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL);

      int gridHeight = (int) matrixSize / blockSize;
      int gridWidth = (int) matrixSize / blockSize;

      // TODO: Build graph and runtime
      htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>> *matrixBlocks =
          new htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>>(gridHeight, gridWidth, nullptr);

      for (int r = 0; r < gridHeight; r++) {
        for (int c = 0; c < gridWidth; c++) {
          // Store pointer locations for all blocks
          double *ptr = &matrix[IDX2C(r * blockSize, c * blockSize, matrixSize)];

          std::shared_ptr<MatrixRequestData> request(new MatrixRequestData(r, c, MatrixType::MatrixA));
          std::shared_ptr<MatrixBlockData<double *>> data(
              new MatrixBlockData<double *>(request, ptr, blockSize, blockSize));

          matrixBlocks->set(r, c, data);
        }
      }

      GausElimTask *gausElimTask = new GausElimTask(numGausElimThreads, matrixSize, matrixSize);

      auto gausElimBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      GausElimRuleUpper *gausElimRuleUpper = new GausElimRuleUpper(matrixBlocks, gridHeight, gridWidth);
      GausElimRuleLower *gausElimRuleLower = new GausElimRuleLower(matrixBlocks, gridHeight, gridWidth);

      FactorUpperTask *factorUpperTask = new FactorUpperTask(numFactorUpperThreads, matrixSize, matrixSize);
      FactorLowerTask *factorLowerTask = new FactorLowerTask(numFactorLowerThreads, matrixSize, matrixSize);

      auto matrixMulBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();
      MatrixMulRule *matrixMulRule = new MatrixMulRule(matrixBlocks, gridHeight, gridWidth);

      MatrixMulBlkTask *matrixMulTask =
          new MatrixMulBlkTask(numMatrixMulThreads, matrixSize, matrixSize, matrixSize, matrixSize, blockSize);

      auto matrixMulResultBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      int numDiagonals = gridWidth - 1;
      GausElimRule *gausElimRule = new GausElimRule(numDiagonals, gridHeight, gridWidth);

      // Number of updates excluding the diagonal and the top/left row/column:
      // numUpdates = sum_{k=1}^{gridWidth-1} k^2 = gridWidth(gridWidth-1)(2*gridWidth-1)/6
      int numUpdates = (1.0 / 6.0) * (double) gridWidth
                       * (2.0 * ((double) gridWidth * (double) gridWidth) - 3.0 * (double) gridWidth + 1.0);

      UpdateRule *updateRule = new UpdateRule(numUpdates);
      UpdateRule *updateRule2 = new UpdateRule(numUpdates);

      auto taskGraph = new htgs::TaskGraph<MatrixBlockData<double *>, htgs::VoidData>();

      taskGraph->addGraphInputConsumer(gausElimTask);
      taskGraph->addEdge(gausElimTask, gausElimBk);

      taskGraph->addRule(gausElimBk, factorUpperTask, gausElimRuleUpper);
      taskGraph->addRule(gausElimBk, factorLowerTask, gausElimRuleLower);

      taskGraph->addEdge(factorUpperTask, matrixMulBk);
      taskGraph->addEdge(factorLowerTask, matrixMulBk);

      taskGraph->addRule(matrixMulBk, matrixMulTask, matrixMulRule);
      taskGraph->addEdge(matrixMulTask, matrixMulResultBk);

      if (numDiagonals > 0)
        taskGraph->addRule(matrixMulResultBk, gausElimTask, gausElimRule);

      if (numUpdates > 0)
        taskGraph->addRule(matrixMulResultBk, matrixMulBk, updateRule);

      if (numUpdates > 0)
        taskGraph->addRule(matrixMulResultBk, gausElimBk, updateRule2);

      taskGraph->incrementGraphInputProducer();

      taskGraph->writeDotToFile("lud-graph.dot");

      htgs::Runtime *runtime = new htgs::Runtime(taskGraph);

      clk.start();
      runtime->executeRuntime();

      taskGraph->produceData(matrixBlocks->get(0, 0));
      taskGraph->finishedProducingData();

      runtime->waitForRuntime();
      clk.stopAndIncrement();

      delete runtime;
      endToEnd.stopAndIncrement();
    }

    double operations = (2.0 * (matrixSize * matrixSize * matrixSize)) / 3.0;
    double flops = operations / clk.getAverageTime(TimeVal::SEC);
    double gflops = flops / 1073741824.0;

    std::cout << (runSequential ? "sequential" : "htgs")
              << ", matrix-size: " << matrixSize
              << ", blockSize: " << (runSequential ? 0 : blockSize)
              << ", blasThreads: " << numBlasThreads
              << ", gausThreads: " << numGausElimThreads
              << ", factorUpperThreads: " << numFactorUpperThreads
              << ", factorLowerThreads: " << numFactorLowerThreads
              << ", gemmThreads: " << numMatrixMulThreads
              << ", time:" << clk.getAverageTime(TimeVal::MILLI)
              << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI)
              << ", gflops: " << gflops
              << std::endl;

    runtimeFile << (runSequential ? "sequential" : "htgs")
                << ", " << matrixSize
                << ", " << blockSize
                << ", " << numBlasThreads
                << ", " << numGausElimThreads
                << ", " << numFactorUpperThreads
                << ", " << numFactorLowerThreads
                << ", " << numMatrixMulThreads
                << ", " << clk.getAverageTime(TimeVal::MILLI)
                << ", " << endToEnd.getAverageTime(TimeVal::MILLI)
                << ", " << gflops
                << std::endl;

    if (validate) {
      int res = validateResults(matrix, matrixTest, matrixSize);
      std::cout << (res == 0 ? "PASSED" : "FAILED") << std::endl;
    }
  }

  delete[] matrix;
  delete[] matrixTest;
}
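The operation count used above is the standard 2n³/3 estimate for LU factorization; note that the code divides by 1073741824 (2³⁰) rather than 10⁹, so the reported "gflops" figure is about 7% lower than a decimal-GFLOPS value. As a worked example with the default matrixSize n = 16384 and a hypothetical factorization time of 10 s (the time is illustrative, not from the code):

\[
\text{operations} = \tfrac{2}{3} n^{3} = \tfrac{2}{3} \cdot 16384^{3} \approx 2.93 \times 10^{12},
\qquad
\text{gflops} = \frac{2.93 \times 10^{12} / 10\ \text{s}}{2^{30}} \approx 273.
\]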