TEST_F(MergeReceiveExecutorTest, singlePartitionLimitOffsetTest)
{
    std::vector<int> values;
    values.push_back(0);
    values.push_back(1);
    values.push_back(1);
    values.push_back(2);

    std::vector<TableTuple> tuples;
    std::vector<int64_t> partitionTupleCounts;

    boost::scoped_ptr<char> cleaner(
        addPartitionData(values, tuples, partitionTupleCounts));

    std::vector<SortDirectionType> dirs(1, SORT_DIRECTION_TYPE_ASC);
    AbstractExecutor::TupleComparer comp(getSortKeys(), dirs);
    int limit = 2;
    int offset = 1;
    AggregateExecutorBase* agg_exec = NULL;
    ProgressMonitorProxy* pmp = NULL;
    MergeReceiveExecutor::merge_sort(tuples,
                               partitionTupleCounts,
                               comp,
                               limit,
                               offset,
                               agg_exec,
                               getDstTempTable(),
                               pmp);

    validateResults(comp, tuples, limit, offset);
}
Beispiel #2
0
BOOL readTest(DWORD dwByteCount, char cResult)
{
    HANDLE hFile = NULL;
    DWORD dwBytesRead;
    BOOL bRc = FALSE;
    
    // open the test file 
    hFile = CreateFile(szReadableFile, 
        GENERIC_READ,
        FILE_SHARE_READ,
        NULL,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        NULL);
    if(hFile == INVALID_HANDLE_VALUE)
    {
        Trace("ReadFile: ERROR -> Unable to open file \"%s\".\n", 
            szReadableFile);
        return FALSE;
    }

    memset(readBuffer, 0, PAGESIZE);

    bRc = ReadFile(hFile, readBuffer, dwByteCount, &dwBytesRead, NULL);

    if (bRc == FALSE)
    {
        // if it failed, was it supposed to fail?
        if (cResult == '1')
        {
            Trace("\nbRc = %d\n", bRc);
            Trace("readBuffer = [%s]  dwByteCount = %d  dwBytesRead = %d\n", readBuffer, dwByteCount, dwBytesRead);
            Trace("cresult = 1\n");
            Trace("getlasterror = %d\n", GetLastError()); 
            CloseHandle(hFile);
            return FALSE;
        }
    }
    else
    {
        CloseHandle(hFile);
        // if it passed, was it supposed to pass?
        if (cResult == '0')
        {
            Trace("cresult = 0\n");
            return FALSE;
        }
        else
        {
            return (validateResults(readBuffer, dwByteCount, dwBytesRead));
        }
    }

    CloseHandle(hFile);
    return TRUE;
}
TEST_F(MergeReceiveExecutorTest, multipleOverlapPartitionsTest)
{
    std::vector<int> values1;
    values1.push_back(10);
    values1.push_back(11);
    values1.push_back(11);
    values1.push_back(12);

    std::vector<int> values2;
    values2.push_back(1);
    values2.push_back(3);
    values2.push_back(4);
    values2.push_back(10);
    values2.push_back(11);
    values2.push_back(15);
    values2.push_back(20);
    values2.push_back(21);
    values2.push_back(25);

    std::vector<int> values3;
    values3.push_back(2);
    values3.push_back(4);
    values3.push_back(10);
    values3.push_back(12);
    values3.push_back(13);
    values3.push_back(15);

    std::vector<TableTuple> tuples;
    std::vector<int64_t> partitionTupleCounts;

    boost::scoped_ptr<char> cleaner1(
        addPartitionData(values1, tuples, partitionTupleCounts));
    boost::scoped_ptr<char> cleaner2(
        addPartitionData(values2, tuples, partitionTupleCounts));
    boost::scoped_ptr<char> cleaner3(
        addPartitionData(values3, tuples, partitionTupleCounts));

    std::vector<SortDirectionType> dirs(1, SORT_DIRECTION_TYPE_ASC);
    AbstractExecutor::TupleComparer comp(getSortKeys(), dirs);
    int limit = -1;
    int offset = 0;
    AggregateExecutorBase* agg_exec = NULL;
    ProgressMonitorProxy* pmp = NULL;
    MergeReceiveExecutor::merge_sort(tuples,
                               partitionTupleCounts,
                               comp,
                               limit,
                               offset,
                               agg_exec,
                               getDstTempTable(),
                               pmp);
    validateResults(comp, tuples);
}
Beispiel #4
0
BOOL writeTest(const char* szString)
{
    HANDLE hFile = NULL;
    DWORD dwBytesWritten;
    BOOL bRc = FALSE;
    BOOL bAllPassed = TRUE;
    int nStringLength = 0;
    char* szPtr = NULL;
    int i = 0;

    // create the test file 
    hFile = CreateFile(szWritableFile, 
        GENERIC_WRITE,
        FILE_SHARE_WRITE,
        NULL,
        CREATE_ALWAYS,
        FILE_ATTRIBUTE_NORMAL,
        NULL);

    if(hFile == INVALID_HANDLE_VALUE)
    {
        Trace("WriteFile: ERROR -> Unable to create file \"%s\".\n", 
            szWritableFile);
        return FALSE;
    }

    nStringLength = strlen(szString);
    szPtr = (char*) szString;

    for (i = 0; i < nStringLength; i++)
    {
        bRc = WriteFile(hFile, szPtr++, 1, &dwBytesWritten, NULL);
        if ((bRc == FALSE) || (dwBytesWritten != 1))
        {
            bAllPassed = FALSE;
        }
    }
    CloseHandle(hFile);

    if (bAllPassed == FALSE)
    {
        Trace ("WriteFile: ERROR: Failed to write data.\n"); 
        return FALSE;
    }
    else
    {
        return (validateResults(szString));
    }

    return TRUE;
}
int main(int argc, char *argv[])
{
  MatMulArgs matMulArgs;
  matMulArgs.processArgs(argc, argv);

  size_t matrixAHeight = matMulArgs.getMatrixAHeight();
  size_t matrixBWidth = matMulArgs.getMatrixBWidth();
  size_t sharedDim = matMulArgs.getSharedDim();

  size_t blockSize = matMulArgs.getBlockSize();
  size_t numReadThreads = matMulArgs.getNumReadThreads();
  size_t numProdThreads = matMulArgs.getNumMatMulThreads();
  size_t numAccumThreads = (size_t) ceil((double)numProdThreads / 2.0);
  std::string directory = matMulArgs.getDirectory();
  std::string outputDirectory = matMulArgs.getOutputDir();
  bool runSequential = matMulArgs.isRunSequential();
  bool validate = matMulArgs.isValidateResults();

  size_t numGPUs = matMulArgs.getNumGPUs();
  int gpuIds[numGPUs];

  matMulArgs.copyGpuIds(gpuIds);


//  CUcontext *contexts = initCuda(numGPUs, gpuIds);

  std::string runtimeFileStr("runtimes");

  int numRetry = 1;

  std::ofstream runtimeFile(runtimeFileStr, std::ios::app);
  double *matrixA = new double[matrixAHeight * sharedDim];
  double *matrixB = new double[matrixBWidth * sharedDim];
  double *matrixC = new double[matrixAHeight * matrixBWidth];

  initMatrix(matrixA, sharedDim, matrixAHeight, true);
  initMatrix(matrixB, matrixBWidth, sharedDim, true);

  for (int numTry = 0; numTry < numRetry; numTry++) {
    SimpleClock clk;
    SimpleClock endToEnd;

    if (runSequential) {
      endToEnd.start();
      initMatMul(numProdThreads);

      cublasXtHandle_t handle;

      cublasXtCreate(&handle);

      cublasXtDeviceSelect(handle, numGPUs, gpuIds);
      cublasXtSetBlockDim(handle, blockSize);

      clk.start();
      computeSequentialMatMul(matrixA, matrixB, matrixC, (size_t) matrixAHeight, (size_t) sharedDim,
                              (size_t) matrixBWidth, handle);
      clk.stopAndIncrement();

      cublasXtDestroy(handle);

      endToEnd.stopAndIncrement();
    }
    else {
      endToEnd.start();
      initMatMul(1);

      LoadMatrixTask *readAMatTask =
          new LoadMatrixTask(matrixA,
                             numReadThreads,
                             MatrixType::MatrixA,
                             blockSize,
                             sharedDim,
                             matrixAHeight,
                             true);

      LoadMatrixTask *readBMatTask =
          new LoadMatrixTask(matrixB,
                             numReadThreads,
                             MatrixType::MatrixB,
                             blockSize,
                             matrixBWidth,
                             sharedDim,
                             true);

      MatrixMulBlkCudaTask *mmulTask = new MatrixMulBlkCudaTask(gpuIds, numGPUs);
      MatMulAccumTask *accumTask = new MatMulAccumTask(numAccumThreads, true);

      MatMulOutputTask *outputTask = new MatMulOutputTask(matrixC, matrixAHeight, blockSize, true);

      size_t blkHeightMatB = readBMatTask->getNumBlocksRows();
      size_t blkWidthMatB = readBMatTask->getNumBlocksCols();

      size_t blkHeightMatA = readAMatTask->getNumBlocksRows();
      size_t blkWidthMatA = readAMatTask->getNumBlocksCols();

      CudaCopyInTask *cudaCopyInATask = new CudaCopyInTask(gpuIds, numGPUs, MatrixType::MatrixA, blkWidthMatB);
      CudaCopyInTask *cudaCopyInBTask = new CudaCopyInTask(gpuIds, numGPUs, MatrixType::MatrixB, blkHeightMatA);

      CudaCopyOutTask *cudaCopyOutCTask = new CudaCopyOutTask(gpuIds, numGPUs, MatrixType::MatrixC);

      MatMulDistributeRule *distributeRuleMatA = new MatMulDistributeRule(MatrixType::MatrixA);
      MatMulDistributeRule *distributeRuleMatB = new MatMulDistributeRule(MatrixType::MatrixB);

      MatMulLoadRule<htgs::m_data_t<double>> *loadRule = new MatMulLoadRule<htgs::m_data_t<double>>(blkWidthMatA, blkHeightMatA, blkWidthMatB, blkHeightMatB);
      MatMulAccumulateRule<double *> *accumulateRule = new MatMulAccumulateRule<double *>(blkWidthMatB, blkHeightMatA, blkWidthMatA);

      MatMulOutputRule *outputRule = new MatMulOutputRule(blkWidthMatB, blkHeightMatA, blkWidthMatA);

      auto distributeBk = new htgs::Bookkeeper<MatrixRequestData>();
      auto matMulBk = new htgs::Bookkeeper<MatrixBlockData<htgs::m_data_t<double>>>();
      auto matAccumBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      auto taskGraph = new htgs::TaskGraphConf<MatrixRequestData, MatrixBlockData<double *>>();

      taskGraph->setGraphConsumerTask(distributeBk);
      taskGraph->addRuleEdge(distributeBk, distributeRuleMatA, readAMatTask);
      taskGraph->addRuleEdge(distributeBk, distributeRuleMatB, readBMatTask);


      taskGraph->addEdge(readAMatTask, cudaCopyInATask);
      taskGraph->addEdge(readBMatTask, cudaCopyInBTask);

      taskGraph->addEdge(cudaCopyInATask, matMulBk);
      taskGraph->addEdge(cudaCopyInBTask, matMulBk);

      taskGraph->addRuleEdge(matMulBk, loadRule, mmulTask);

      taskGraph->addEdge(mmulTask, cudaCopyOutCTask);

      taskGraph->addGraphProducerTask(cudaCopyOutCTask);

      taskGraph->addCudaMemoryManagerEdge(matrixTypeToString(MatrixType::MatrixA) + "Copy",
                                          cudaCopyInATask,
                                          new CudaAllocator(blockSize, blockSize),
                                          blkWidthMatB+1,
                                          htgs::MMType::Static,
                                          gpuIds);
      taskGraph->addCudaMemoryManagerEdge(matrixTypeToString(MatrixType::MatrixB) + "Copy",
                                          cudaCopyInBTask,
                                          new CudaAllocator(blockSize, blockSize),
                                          blkHeightMatA+1,
                                          htgs::MMType::Static,
                                          gpuIds);

      taskGraph->addCudaMemoryManagerEdge(matrixTypeToString(MatrixType::MatrixC),
                                          mmulTask,
                                          new CudaAllocator(blockSize, blockSize),
                                          4,
                                          htgs::MMType::Static,
                                          gpuIds);


      auto mainTaskGraph = new htgs::TaskGraphConf<MatrixRequestData, MatrixRequestData>();


      auto execPipeline = new htgs::ExecutionPipeline<MatrixRequestData, MatrixBlockData<double *>>(numGPUs, taskGraph);
      auto decompositionRule = new MatrixDecompositionRule(numGPUs);

      execPipeline->addInputRule(decompositionRule);

      mainTaskGraph->setGraphConsumerTask(execPipeline);
      mainTaskGraph->addEdge(execPipeline, matAccumBk);


      mainTaskGraph->addRuleEdge(matAccumBk, outputRule, outputTask);
      mainTaskGraph->addRuleEdge(matAccumBk, accumulateRule, accumTask);

      mainTaskGraph->addEdge(accumTask, matAccumBk);

      mainTaskGraph->addGraphProducerTask(outputTask);

//      mainTaskGraph->writeDotToFile("pre-execution.dot");

      htgs::TaskGraphRuntime *runtime = new htgs::TaskGraphRuntime(mainTaskGraph);

      clk.start();

      runtime->executeRuntime();

      for (size_t col = 0; col < blkWidthMatA; col++) {
        for (size_t row = 0; row < blkHeightMatA; row++) {

          MatrixRequestData *matA = new MatrixRequestData(row, col, MatrixType::MatrixA);
          mainTaskGraph->produceData(matA);
        }
      }

      for (size_t row = 0; row < blkHeightMatB; row++) {
        for (size_t col = 0; col < blkWidthMatB; col++) {
          MatrixRequestData *matB = new MatrixRequestData(row, col, MatrixType::MatrixB);
          mainTaskGraph->produceData(matB);

        }
      }

      mainTaskGraph->finishedProducingData();

      while (!mainTaskGraph->isOutputTerminated()) {
        auto data = mainTaskGraph->consumeData();
        if (data != nullptr) {
//          std::cout << data->getRow() << ", " << data->getCol() << std::endl;
        }
      }

      runtime->waitForRuntime();


//      taskGraph->writeDotToFile("profile-graph.dot");
//      mainTaskGraph->writeDotToFile("profile-all-threads-graph.dot", DOTGEN_FLAG_SHOW_ALL_THREADING);
      mainTaskGraph->writeDotToFile("matrix-multiplication-cuda-multigpu.dot", DOTGEN_COLOR_COMP_TIME);

      clk.stopAndIncrement();

      delete runtime;
      endToEnd.stopAndIncrement();
    }

    if (validate) {
      double *matrixCTest = new double[matrixAHeight * matrixBWidth];
      initMatMul(numProdThreads);

      cublasXtHandle_t handle;

      cublasXtCreate(&handle);

      cublasXtDeviceSelect(handle, (int)numGPUs, gpuIds);
      cublasXtSetBlockDim(handle, (int)blockSize);

      computeSequentialMatMul(matrixA, matrixB, matrixCTest, (size_t) matrixAHeight, (size_t) sharedDim,
                              (size_t) matrixBWidth, handle);

      cublasXtDestroy(handle);


      int res = validateResults(matrixC, matrixCTest, matrixAHeight, matrixBWidth);
      if (res != 0) {
        std::cout << "Error validating test failed!" << std::endl;
      }
      else
      {
        std::cout << "Test PASSED" << std::endl;
      }

      delete []matrixCTest;

    }


    double numGflops = (2.0 * matrixAHeight *sharedDim * matrixBWidth) * 1.0e-9d;
    double gflops = numGflops / clk.getAverageTime(TimeVal::SEC);



    std::cout << (runSequential ? "sequential" : "htgs") << ", " << numProdThreads
              << ", accum-threads: " << numAccumThreads << ", width-b: " << matrixBWidth << ", height-a: " << matrixAHeight
              << ", shared-dim: " << sharedDim
              << ", blockSize: " << blockSize
              << ", time:" << clk.getAverageTime(TimeVal::MILLI)
              << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI)
              << ", gflops: " << gflops
              << std::endl;

    runtimeFile << "MULTIGPU-MM" << (runSequential ? "sequential" : "htgs") << ", " << numProdThreads
                << ", " << numAccumThreads << ", "
                << matrixBWidth << ", " << matrixAHeight
                << ", " << sharedDim << ", " << blockSize << ", " << clk.getAverageTime(TimeVal::MILLI)
                << ", " << endToEnd.getAverageTime(TimeVal::MILLI)
                << std::endl;

  }

  delete[] matrixA;
  delete[] matrixB;
  delete[] matrixC;
}
int main(int argc, char *argv[]) {
  long matrixSize= 16384;
  int blockSize = 128;
  bool runSequential = false;
  bool validate = false;

  int numBlasThreads = 40;

  int numGausElimThreads = 2;
  int numFactorLowerThreads = 4;
  int numFactorUpperThreads = 4;
  int numMatrixMulThreads = 30;

  std::string runtimeFileStr("runtimes");

  int numRetry = 1;

  if (argc > 1) {
    for (int arg = 1; arg < argc; arg++) {
      std::string argvs(argv[arg]);

      if (argvs == "--size") {
        arg++;
        matrixSize = atoi(argv[arg]);
      }

      if (argvs == "--num-threads-blas") {
        arg++;
        numBlasThreads = atoi(argv[arg]);
      }


      if (argvs == "num-threads-factor-l") {
        arg++;
        numFactorLowerThreads = atoi(argv[arg]);
      }

      if (argvs == "num-threads-factor-u") {
        arg++;
        numFactorUpperThreads = atoi(argv[arg]);
      }

      if (argvs == "num-threads-gaus") {
        arg++;
        numGausElimThreads = atoi(argv[arg]);
      }

      if (argvs == "num-threads-gemm") {
        arg++;
        numMatrixMulThreads = atoi(argv[arg]);
      }

      if (argvs == "--run-sequential") {
        runSequential = true;
      }

      if (argvs == "--num-retry" && arg + 1 < argc) {
        arg++;
        numRetry = atoi(argv[arg]);
      }

      if (argvs == "--block-size") {
        arg++;
        blockSize = atoi(argv[arg]);
      }


      if (argvs == "--runtime-file" && arg + 1 < argc) {
        runtimeFileStr = argv[arg + 1];
        arg++;
      }

      if (argvs == "--validate-results") {
        validate = true;
      }

      if (argvs == "--help") {
        std::cout << argv[0]
                  << " args: [--size <#>] [--block-size <#>] [--num-retry <#>] [--runtime-file <filename>] [--validate-results] [--run-sequential] [--num-threads-factor-l <#>] [--num-threads-factor-u <#>] [--num-threads-gaus <#>] [--num-threads-gemm <#>] [--num-threads-blas <#>] [--help]"
                  << std::endl;
        exit(0);

      }
    }
  }

  std::ofstream runtimeFile(runtimeFileStr, std::ios::app);
  double *matrix = new double[matrixSize * matrixSize];
  double *matrixTest = nullptr;

  // TODO: Ensure diagonally dominant
  initMatrixDiagDom(matrix, matrixSize, matrixSize, true);

  if (validate) {
    matrixTest = new double[matrixSize * matrixSize];
    for (int i = 0; i < matrixSize * matrixSize; i++)
      matrixTest[i] = matrix[i];
  }

  for (int numTry = 0; numTry < numRetry; numTry++) {
    SimpleClock clk;
    SimpleClock endToEnd;

    if (runSequential) {
      endToEnd.start();
      mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL);
//      mkl_set_num_threads(40);

      clk.start();
      runSequentialLU(matrix, matrixSize);
//      computeSequentialMatMul(matrixA, matrixB, matrixC, matrixAHeight, sharedDim, matrixBWidth);
      clk.stopAndIncrement();
      endToEnd.stopAndIncrement();
    }
    else {
      endToEnd.start();
      mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL);

      int gridHeight = (int) matrixSize / blockSize;
      int gridWidth = (int) matrixSize / blockSize;

      // TODO: Build graph and runtime
      htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>> *matrixBlocks = new htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>>(gridHeight, gridWidth, nullptr);

      for (int r = 0; r < gridHeight; r++)
      {
        for (int c = 0; c < gridWidth; c++)
        {
          // Store pointer locations for all blocks
          double *ptr = &matrix[IDX2C(r * blockSize, c *blockSize, matrixSize)];

          std::shared_ptr<MatrixRequestData> request(new MatrixRequestData(r, c, MatrixType::MatrixA));
          std::shared_ptr<MatrixBlockData<double *>> data(new MatrixBlockData<double *>(request, ptr, blockSize, blockSize));

          matrixBlocks->set(r, c, data);
        }
      }

      GausElimTask *gausElimTask = new GausElimTask(numGausElimThreads, matrixSize, matrixSize);

      auto gausElimBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      GausElimRuleUpper *gausElimRuleUpper = new GausElimRuleUpper(matrixBlocks, gridHeight, gridWidth);
      GausElimRuleLower *gausElimRuleLower = new GausElimRuleLower(matrixBlocks, gridHeight, gridWidth);

      FactorUpperTask *factorUpperTask = new FactorUpperTask(numFactorUpperThreads, matrixSize, matrixSize);
      FactorLowerTask *factorLowerTask = new FactorLowerTask(numFactorLowerThreads, matrixSize, matrixSize);

      auto matrixMulBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();
      MatrixMulRule *matrixMulRule = new MatrixMulRule(matrixBlocks, gridHeight, gridWidth);

      MatrixMulBlkTask *matrixMulTask = new MatrixMulBlkTask(numMatrixMulThreads, matrixSize, matrixSize, matrixSize, matrixSize, blockSize);


      auto matrixMulResultBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      int numDiagonals = gridWidth - 1;
      GausElimRule *gausElimRule = new GausElimRule(numDiagonals, gridHeight, gridWidth);

      // Number of updates excluding the diagonal and the top/left row/column
      int numUpdates = (1.0/6.0) * (double)gridWidth * (2.0 * ((double)gridWidth * (double)gridWidth) - 3.0 * (double)gridWidth + 1.0);

      UpdateRule *updateRule = new UpdateRule(numUpdates);
      UpdateRule *updateRule2 = new UpdateRule(numUpdates);

      auto taskGraph = new htgs::TaskGraph<MatrixBlockData<double *>, htgs::VoidData>();
      taskGraph->addGraphInputConsumer(gausElimTask);

      taskGraph->addEdge(gausElimTask, gausElimBk);
      taskGraph->addRule(gausElimBk, factorUpperTask, gausElimRuleUpper);
      taskGraph->addRule(gausElimBk, factorLowerTask, gausElimRuleLower);

      taskGraph->addEdge(factorUpperTask, matrixMulBk);
      taskGraph->addEdge(factorLowerTask, matrixMulBk);

      taskGraph->addRule(matrixMulBk, matrixMulTask, matrixMulRule);
      taskGraph->addEdge(matrixMulTask, matrixMulResultBk);

      if (numDiagonals > 0)
        taskGraph->addRule(matrixMulResultBk, gausElimTask, gausElimRule);

      if (numUpdates > 0)
        taskGraph->addRule(matrixMulResultBk, matrixMulBk, updateRule);

      if (numUpdates > 0)
        taskGraph->addRule(matrixMulResultBk, gausElimBk, updateRule2);

      taskGraph->incrementGraphInputProducer();

      taskGraph->writeDotToFile("lud-graph.dot");

      htgs::Runtime *runtime = new htgs::Runtime(taskGraph);

      clk.start();

      runtime->executeRuntime();

      taskGraph->produceData(matrixBlocks->get(0, 0));
      taskGraph->finishedProducingData();

      runtime->waitForRuntime();

      clk.stopAndIncrement();


      delete runtime;
      endToEnd.stopAndIncrement();
    }

    double operations = (2.0 * (matrixSize * matrixSize * matrixSize)) / 3.0;
    double flops = operations / clk.getAverageTime(TimeVal::SEC);
    double gflops = flops / 1073741824.0;

    std::cout << (runSequential ? "sequential" : "htgs")
              << ", matrix-size: " << matrixSize
              << ", " << "blockSize: " << (runSequential ? 0 : blockSize)
              << ", blasThreads: " << numBlasThreads
              << ", gausThreads: " << numGausElimThreads
              << ", factorUpperThreads: " << numFactorUpperThreads
              << ", factorLowerThreads: " << numFactorLowerThreads
              << ", gemmThreads: " << numMatrixMulThreads
              << ", time:" << clk.getAverageTime(TimeVal::MILLI)
              << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI)
              << ", gflops: " << gflops
        << std::endl;

    runtimeFile << (runSequential ? "sequential" : "htgs")
                << ", " << matrixSize
                << ", " << blockSize
                << ", " << numBlasThreads
                << ", " << numGausElimThreads
                << ", " << numFactorUpperThreads
                << ", " << numFactorLowerThreads
                << ", " << numMatrixMulThreads
                << ", " << clk.getAverageTime(TimeVal::MILLI)
                << ", " << endToEnd.getAverageTime(TimeVal::MILLI)
                << ", " << gflops
                << std::endl;



    if (validate)
    {
      int res = validateResults(matrix, matrixTest, matrixSize);
      std::cout << (res == 0 ? "PASSED" : "FAILED") << std::endl;
    }


  }

  delete[] matrix;
  delete[] matrixTest;

}