Beispiel #1
0
/// Fock-build benchmark driver.
///
/// Usage: fock_build matrix_size block_size df_size df_block_size [repetitions]
///
/// Creates dense 2-index arrays (D, DL, F, G, H) tiled by `block_size` and
/// 3-index arrays (TCInts, ExchTemp) whose third dimension is tiled by
/// `df_block_size`, then repeats the contraction sequence below `repetitions`
/// times (default 5) and reports the average wall time and GFLOPS on rank 0.
///
/// @param argc Number of command line arguments.
/// @param argv Command line arguments; argv[1]..argv[4] are required sizes.
/// @return 0 on success or after printing the usage text, 1 on invalid input.
int main(int argc, char** argv) {
  // Initialize runtime
  madness::World& world = madness::initialize(argc, argv);

  // Get command line arguments. argv[1]..argv[4] are all read below, so the
  // four size arguments are mandatory.
  if(argc < 5) {
    std::cout << "Usage: fock_build matrix_size block_size df_size df_block_size [repetitions]\n";
    madness::finalize();
    return 0;
  }
  const long matrix_size = atol(argv[1]);
  const long block_size = atol(argv[2]);
  const long df_size = atol(argv[3]);
  const long df_block_size = atol(argv[4]);
  const long repeat = (argc >= 6 ? atol(argv[5]) : 5);

  // Validate the arguments; only the first failing check is reported.
  const char* error = nullptr;
  if(matrix_size <= 0)
    error = "Error: matrix size must greater than zero.\n";
  else if(df_size <= 0)
    error = "Error: third rank size must greater than zero.\n";
  else if(block_size <= 0 || df_block_size <= 0)
    error = "Error: block size must greater than zero.\n";
  // Either dimension failing to tile evenly is an error (not only both).
  else if(matrix_size % block_size != 0 || df_size % df_block_size != 0)
    error = "Error: tensor size must be evenly divisible by block size.\n";
  else if(repeat <= 0)
    error = "Error: number of repititions must greater than zero.\n";

  if(error != nullptr) {
    std::cerr << error;
    // Shut the runtime down on the error path as well.
    madness::finalize();
    return 1;
  }

  const std::size_t num_blocks = matrix_size / block_size;
  const std::size_t df_num_blocks = df_size / df_block_size;
  const std::size_t block_count = num_blocks * num_blocks;
  const std::size_t df_block_count = df_num_blocks * num_blocks * num_blocks;

  // Byte counts are computed in floating point so very large sizes cannot
  // overflow an integer product before the conversion.
  const double matrix_bytes =
      double(matrix_size) * double(matrix_size) * double(sizeof(double));
  const double tensor_bytes = matrix_bytes * double(df_size);

  if(world.rank() == 0)
    std::cout << "TiledArray: Fock Build Test ...\n"
              << "Number of nodes     = " << world.size()
              << "\nMatrix size         = " << matrix_size << "x" << matrix_size
              << "\nTensor size         = " << matrix_size << "x" << matrix_size << "x" << df_size
              << "\nBlock size          = " << block_size << "x" << block_size << "x" << df_block_size
              << "\nMemory per matrix   = " << matrix_bytes / 1.0e9
              << " GB\nMemory per tensor   = " << tensor_bytes / 1.0e9
              << " GB\nNumber of matrix blocks    = " << block_count
              << "\nNumber of tensor blocks    = " << df_block_count
              << "\nAverage blocks/node matrix = " << double(block_count) / double(world.size())
              << "\nAverage blocks/node tensor = " << double(df_block_count) / double(world.size()) << "\n";

  // Construct TiledRange: tile boundaries 0, block_size, ..., matrix_size
  std::vector<unsigned int> blocking;
  blocking.reserve(num_blocks + 1);
  for(long i = 0l; i <= matrix_size; i += block_size)
    blocking.push_back(static_cast<unsigned int>(i));

  // Tile boundaries of the third dimension: 0, df_block_size, ..., df_size
  std::vector<unsigned int> df_blocking;
  df_blocking.reserve(df_num_blocks + 1);
  for(long i = 0l; i <= df_size; i += df_block_size)
    df_blocking.push_back(static_cast<unsigned int>(i));

  std::vector<TiledArray::TiledRange1> blocking2(2,
      TiledArray::TiledRange1(blocking.begin(), blocking.end()));

  std::vector<TiledArray::TiledRange1> blocking3 = {
      TiledArray::TiledRange1(blocking.begin(), blocking.end()),
      TiledArray::TiledRange1(blocking.begin(), blocking.end()),
      TiledArray::TiledRange1(df_blocking.begin(), df_blocking.end()) };

  TiledArray::TiledRange trange(blocking2.begin(), blocking2.end());
  TiledArray::TiledRange df_trange(blocking3.begin(), blocking3.end());

  // Construct and initialize arrays
  TiledArray::Array<double, 2> D(world, trange);
  TiledArray::Array<double, 2> DL(world, trange);
  TiledArray::Array<double, 2> F(world, trange);
  TiledArray::Array<double, 2> G(world, trange);
  TiledArray::Array<double, 2> H(world, trange);
  TiledArray::Array<double, 3> TCInts(world, df_trange);
  TiledArray::Array<double, 3> ExchTemp(world, df_trange);
  D.set_all_local(1.0);
  DL.set_all_local(1.0);
  H.set_all_local(2.0);
  TCInts.set_all_local(3.0);

  // Start clock
  world.gop.fence();
  const double wall_time_start = madness::wall_time();

  // Do fock build
  for(long i = 0; i < repeat; ++i) {
    // Assume we have the cholesky decompositon of the density matrix
    ExchTemp("s,j,P") = DL("s,n") * TCInts("n,j,P");
    // Compute coulomb and exchange
    G("i,j") = 2.0 * TCInts("i,j,P") * ( D("n,m") * TCInts("n,m,P") ) -
                     ExchTemp("s,i,P") * ExchTemp("s,j,P");
    F("i,j") = G("i,j") + H("i,j");
    world.gop.fence();
    if(world.rank() == 0)
      std::cout << "Iteration " << i + 1 << "\n";
  }

  // Stop clock
  const double wall_time_stop = madness::wall_time();
  const double elapsed = wall_time_stop - wall_time_start;

  if(world.rank() == 0){
    std::cout << "Average wall time   = " << elapsed / double(repeat)
        << " sec\nAverage GFLOPS      = " << double(repeat) *
        (double(4.0 * matrix_size * matrix_size * df_size) + // Coulomb flops
        double(4.0 * matrix_size * matrix_size * matrix_size * df_size)) // Exchange flops
        / elapsed / 1.0e9 << "\n";
  }

  madness::finalize();
  return 0;
}
Beispiel #2
0
/// Block-sparse matrix multiply benchmark with a forced result shape.
///
/// Usage: <program> matrix_size block_size [repetitions]
///
/// Sweeps the left operand's fill from 10% to 100% in steps of 10 and, for
/// each, the right operand's fill from 10% up to the left value. For every
/// pair, random sparse shapes are generated on rank 0, the product
/// c = a * b is evaluated `repetitions` times (default 4) and per-pair
/// timings are collected. Summary tables are printed by rank 0 at the end.
///
/// @param argc Number of command line arguments.
/// @param argv Command line arguments; argv[1] and argv[2] are required.
/// @return 0 on success or after printing the usage text, 1 on invalid input
///         or when an exception was caught.
int main(int argc, char** argv) {
  int rc = 0;

  try {
    // Initialize runtime
    TiledArray::World& world = TiledArray::initialize(argc, argv);

    // Get command line arguments. Both argv[1] and argv[2] are read below,
    // so two arguments are mandatory.
    if(argc < 3) {
      std::cout << "Usage: " << argv[0] << " matrix_size block_size [repetitions]\n";
      TiledArray::finalize();
      return 0;
    }
    const long matrix_size = atol(argv[1]);
    const long block_size = atol(argv[2]);
    const long repeat = (argc >= 4 ? atol(argv[3]) : 4);

    // Validate the arguments; only the first failing check is reported.
    const char* error = nullptr;
    if(matrix_size <= 0)
      error = "Error: matrix size must be greater than zero.\n";
    else if(block_size <= 0)
      error = "Error: block size must be greater than zero.\n";
    else if((matrix_size % block_size) != 0)
      error = "Error: matrix size must be evenly divisible by block size.\n";
    else if(repeat <= 0)
      error = "Error: number of repetitions must be greater than zero.\n";

    if(error != nullptr) {
      std::cerr << error;
      // Shut the runtime down on the error path as well.
      TiledArray::finalize();
      return 1;
    }

    // Print information about the test
    const std::size_t num_blocks = matrix_size / block_size;
    // "Apparent" flop count: cost of the equivalent dense multiply.
    const double app_flop = 2.0 * matrix_size * matrix_size * matrix_size;
    std::vector<std::vector<double> > gflops;
    std::vector<std::vector<double> > times;
    std::vector<std::vector<double> > app_gflops;

    if(world.rank() == 0)
      std::cout << "TiledArray: block-sparse matrix multiply test..."
                << "\nGit HASH: " << TILEDARRAY_REVISION
                << "\nNumber of nodes    = " << world.size()
                << "\nMatrix size        = " << matrix_size << "x" << matrix_size
                << "\nBlock size         = " << block_size << "x" << block_size;


    // Construct TiledRange: tile boundaries 0, block_size, ..., matrix_size
    std::vector<unsigned int> blocking;
    blocking.reserve(num_blocks + 1);
    for(long i = 0l; i <= matrix_size; i += block_size)
      blocking.push_back(i);

    std::vector<TiledArray::TiledRange1> blocking2(2,
        TiledArray::TiledRange1(blocking.begin(), blocking.end()));

    TiledArray::TiledRange
      trange(blocking2.begin(), blocking2.end());

    // Shape imposed on every product; captured from the first left shape
    // (left_sparsity == 10) and reused for all later pairs.
    TiledArray::SparseShape<float> forced_shape;
    for(unsigned int left_sparsity = 10; left_sparsity <= 100; left_sparsity += 10){
      std::vector<double> inner_gflops, inner_times, inner_app_gflops;
      for(unsigned int right_sparsity = 10; right_sparsity <= left_sparsity; right_sparsity += 10){
        // Number of non-zero tiles requested for each operand (truncated).
        const long l_block_count = (double(left_sparsity) / 100.0) * double(num_blocks * num_blocks);
        const long r_block_count = (double(right_sparsity) / 100.0) * double(num_blocks * num_blocks);
        if(world.rank() == 0)
                    std::cout << "\nMemory per left matrix  = " << double(l_block_count * block_size * block_size * sizeof(double)) / 1.0e9 << " GB"
                    << "\nMemory per right matrix  = " << double(r_block_count * block_size * block_size * sizeof(double)) / 1.0e9 << " GB"
                    << "\nNumber of left blocks   = " << l_block_count << "   " << 100.0 * double(l_block_count) / double(num_blocks * num_blocks) << "%"
                    << "\nNumber of right blocks   = " << r_block_count << "   " << 100.0 * double(r_block_count) / double(num_blocks * num_blocks) << "%"
                    << "\nAverage left blocks/node = " << double(l_block_count) / double(world.size())
                    << "\nAverage right blocks/node = " << double(r_block_count) / double(world.size()) << "\n";

        // Construct shape
        TiledArray::Tensor<float>
            a_tile_norms(trange.tiles_range(), 0.0),
            b_tile_norms(trange.tiles_range(), 0.0);
        // Only rank 0 picks the non-zero tiles; the other ranks pass all-zero
        // norm tensors to the shape constructor.
        if(world.rank() == 0) {
          world.srand(time(NULL));
          for(long count = 0l; count < l_block_count; ++count) {
            std::size_t index = world.rand() % trange.tiles_range().volume();

            // Avoid setting the same tile to non-zero.
            while(a_tile_norms[index] > TiledArray::SparseShape<float>::threshold())
              index = world.rand() % trange.tiles_range().volume();

            a_tile_norms[index] = std::sqrt(float(block_size * block_size));
          }
          for(long count = 0l; count < r_block_count; ++count) {
            std::size_t index = world.rand() % trange.tiles_range().volume();

            // Avoid setting the same tile to non-zero.
            while(b_tile_norms[index] > TiledArray::SparseShape<float>::threshold())
              index = world.rand() % trange.tiles_range().volume();

            b_tile_norms[index] = std::sqrt(float(block_size * block_size));
          }

        }
        TiledArray::SparseShape<float>
            a_shape(world, a_tile_norms, trange),
            b_shape(world, b_tile_norms, trange);

        if(left_sparsity == 10){
            forced_shape = a_shape;
        }


        // Construct and initialize arrays
        TiledArray::TSpArrayD a(world, trange, a_shape);
        TiledArray::TSpArrayD b(world, trange, b_shape);
        TiledArray::TSpArrayD c;
        a.fill(1.0);
        b.fill(1.0);

        // Start clock
        TiledArray::TSpArrayD::wait_for_lazy_cleanup(world);
        world.gop.fence();
        if(world.rank() == 0)
          std::cout << "Starting iterations:\n";

        double total_time = 0.0;
        double flop = 0.0;

        // Do matrix multiplication
        try {
          for(long i = 0; i < repeat; ++i) {
            const double start = madness::wall_time();
            c("m,n") = (a("m,k") * b("k,n")).set_shape(forced_shape);
            const double time = madness::wall_time() - start;
            total_time += time;
            // The flop estimate is derived once from the sum of c's elements
            // (a and b are filled with 1.0) and reused on later iterations.
            if(flop < 1.0)
              flop = 2.0 * c("m,n").sum();
            if(world.rank() == 0)
              std::cout << "Iteration " << i + 1 << "   time=" << time << "   GFLOPS="
                  << flop / time / 1.0e9 << "   apparent GFLOPS=" << app_flop / time / 1.0e9 << "\n";
            // NOTE(review): this line is printed by every rank, not only
            // rank 0 -- confirm that is intended.
            std::cout << "C sparsity = " << c.shape().sparsity() << "\n";

          }
        } catch(...) {
          // Dump the operand shapes to help diagnose the failure, then let
          // the outer handlers report the exception.
          if(world.rank() == 0) {
            std::stringstream ss;
            ss << "left shape  = " << a.shape().data() << "\n"
               << "right shape = " << b.shape().data() << "\n";
            std::cout << ss.str();
          }
          throw;
        }

        // Stop clock
        inner_gflops.push_back(double(repeat) * flop / total_time / 1.0e9);
        inner_times.push_back(total_time / repeat);
        inner_app_gflops.push_back(double(repeat) * app_flop / total_time / 1.0e9);

        // Print results
        if(world.rank() == 0) {
          std::cout << "Average wall time = " << total_time / double(repeat)
              << "\nAverage GFLOPS = " << double(repeat) * double(flop) / total_time / 1.0e9
              << "\nAverage apparent GFLOPS = " << double(repeat) * double(app_flop) / total_time / 1.0e9 << "\n";
        }
      }
      gflops.push_back(inner_gflops);
      times.push_back(inner_times);
      app_gflops.push_back(inner_app_gflops);
    }

    if(world.rank() == 0) {
      std::cout << "\n--------------------------------------------------------------------------------------------------------\nGFLOPS\n";
      print_results(world, gflops);
      std::cout << "\n--------------------------------------------------------------------------------------------------------\nAverage wall times\n";
      print_results(world, times);
      std::cout << "\n--------------------------------------------------------------------------------------------------------\nApparent GFLOPS\n";
      print_results(world, app_gflops);
    }


    TiledArray::finalize();

  } catch(TiledArray::Exception& e) {
    std::cerr << "!! TiledArray exception: " << e.what() << "\n";
    rc = 1;
  } catch(madness::MadnessException& e) {
    std::cerr << "!! MADNESS exception: " << e.what() << "\n";
    rc = 1;
  } catch(SafeMPI::Exception& e) {
    std::cerr << "!! SafeMPI exception: " << e.what() << "\n";
    rc = 1;
  } catch(std::exception& e) {
    std::cerr << "!! std exception: " << e.what() << "\n";
    rc = 1;
  } catch(...) {
    std::cerr << "!! exception: unknown exception\n";
    rc = 1;
  }

  return rc;
}
Beispiel #3
0
/// Block-sparse matrix multiply benchmark (ta_sparse).
///
/// Usage: ta_sparse matrix_size block_size [repetitions]
///
/// Sweeps the left operand's fill from 10% to 100% in steps of 10 and, for
/// each, the right operand's fill from 10% up to the left value. For every
/// pair, rank 0 marks the requested number of distinct random tiles as
/// non-zero, c = a * b is evaluated `repetitions` times (default 4), and the
/// average time and GFLOPS are recorded. Rank 0 prints both result tables at
/// the end.
///
/// @param argc Number of command line arguments.
/// @param argv Command line arguments; argv[1] and argv[2] are required.
/// @return 0 on success or after printing the usage text, 1 on invalid input
///         or when an exception was caught.
int main(int argc, char** argv) {
  int rc = 0;

  try {
    // Initialize runtime
    madness::World& world = madness::initialize(argc, argv);

    // Get command line arguments. Both argv[1] and argv[2] are read below,
    // so two arguments are mandatory.
    if(argc < 3) {
      std::cout << "Usage: ta_sparse matrix_size block_size [repetitions]\n";
      madness::finalize();
      return 0;
    }
    const long matrix_size = atol(argv[1]);
    const long block_size = atol(argv[2]);
    const long repeat = (argc >= 4 ? atol(argv[3]) : 4);

    // Validate the arguments; only the first failing check is reported.
    const char* error = nullptr;
    if(matrix_size <= 0)
      error = "Error: matrix size must greater than zero.\n";
    else if(block_size <= 0)
      error = "Error: block size must greater than zero.\n";
    else if((matrix_size % block_size) != 0)
      error = "Error: matrix size must be evenly divisible by block size.\n";
    else if(repeat <= 0)
      error = "Error: number of repetitions must greater than zero.\n";

    if(error != nullptr) {
      std::cerr << error;
      // Shut the runtime down on the error path as well.
      madness::finalize();
      return 1;
    }

    // Print information about the test
    const std::size_t num_blocks = matrix_size / block_size;
    std::vector<std::vector<double> > gflops;
    std::vector<std::vector<double> > times;

    // Only rank 0 prints the banner so it appears once per run.
    if(world.rank() == 0)
      std::cout << "TiledArray: block-sparse matrix multiply test...\n"
                << "Number of nodes    = " << world.size()
                << "\nMatrix size        = " << matrix_size << "x" << matrix_size
                << "\nBlock size         = " << block_size << "x" << block_size;

    // Construct TiledRange: tile boundaries 0, block_size, ..., matrix_size.
    // It does not depend on the sparsity pair, so it is built once.
    std::vector<unsigned int> blocking;
    blocking.reserve(num_blocks + 1);
    for(long i = 0l; i <= matrix_size; i += block_size)
      blocking.push_back(i);

    std::vector<TiledArray::TiledRange1> blocking2(2,
        TiledArray::TiledRange1(blocking.begin(), blocking.end()));

    TiledArray::TiledRange
      trange(blocking2.begin(), blocking2.end());

    for(unsigned int left_sparsity = 10; left_sparsity <= 100; left_sparsity += 10){
      std::vector<double> inner_gflops;
      std::vector<double> inner_times;
      for(unsigned int right_sparsity = 10; right_sparsity <= left_sparsity; right_sparsity += 10){
        // Number of non-zero tiles requested for each operand (truncated).
        const long l_block_count = (double(left_sparsity) / 100.0) * double(num_blocks * num_blocks);
        const long r_block_count = (double(right_sparsity) / 100.0) * double(num_blocks * num_blocks);
        if(world.rank() == 0)
                    std::cout << "\nMemory per left matrix  = " << double(l_block_count * block_size * block_size * sizeof(double)) / 1.0e9 << " GB"
                    << "\nMemory per right matrix  = " << double(r_block_count * block_size * block_size * sizeof(double)) / 1.0e9 << " GB"
                    << "\nNumber of left blocks   = " << l_block_count
                    << " " << left_sparsity << " percent"
                    << "\nNumber of right blocks   = " << r_block_count
                    << " " << right_sparsity << " percent"
                    << "\nAverage left blocks/node = " << double(l_block_count) / double(world.size())
                    << "\nAverage right blocks/node = " << double(r_block_count) / double(world.size()) << "\n";

        // Construct shape
        TiledArray::Tensor<float>
            a_shape_tensor(trange.tiles(), 0.0),
            b_shape_tensor(trange.tiles(), 0.0),
            c_shape_tensor(trange.tiles(), 0.0);
        // Only rank 0 picks tiles, so it must place the full requested count
        // (not a per-process share), and it re-draws on collisions so the
        // number of non-zero tiles matches what was reported above. The
        // requested count never exceeds the tile count, so a free tile
        // always exists.
        if(world.rank() == 0) {
          world.srand(time(NULL));
          for(long count = 0l; count < l_block_count; ++count) {
            std::size_t index = world.rand() % trange.tiles().volume();
            while(a_shape_tensor.data()[index] != 0.0f)
              index = world.rand() % trange.tiles().volume();
            a_shape_tensor.data()[index] = 1.0;
          }

          for(long count = 0l; count < r_block_count; ++count) {
            std::size_t index = world.rand() % trange.tiles().volume();
            while(b_shape_tensor.data()[index] != 0.0f)
              index = world.rand() % trange.tiles().volume();
            b_shape_tensor.data()[index] = 1.0;
          }

        }
        TiledArray::SparseShape<float>
            a_shape(world, a_shape_tensor, trange),
            b_shape(world, b_shape_tensor, trange),
            c_shape(world, c_shape_tensor, trange);


        typedef TiledArray::Array<double, 2, TiledArray::Tensor<double>,
            TiledArray::SparsePolicy > SpTArray2;

        // Construct and initialize arrays
        SpTArray2 a(world, trange, a_shape);
        SpTArray2 b(world, trange, b_shape);
        SpTArray2 c(world, trange, c_shape);
        a.set_all_local(1.0);
        b.set_all_local(1.0);

        // Start clock
        world.gop.fence();
        const double wall_time_start = madness::wall_time();

        // Do matrix multiplication
        for(long i = 0; i < repeat; ++i) {
          c("m,n") = a("m,k") * b("k,n");
          world.gop.fence();
          if(world.rank() == 0)
            std::cout << "Iteration " << i + 1 << "\n";
        }

        // Stop clock
        const double wall_time_stop = madness::wall_time();
        const double elapsed = wall_time_stop - wall_time_start;
        // The flop estimate is taken from the sum of c's elements (a and b
        // are set to 1.0). Kept as double so large counts are not truncated
        // through an integer type.
        const double flop = 2.0 * c("m,n").sum();
        inner_gflops.push_back(double(repeat) * flop / elapsed / 1.0e9);
        inner_times.push_back(elapsed / double(repeat));

        // Print results
        if(world.rank() == 0) {
          std::cout << "Average wall time = " << elapsed / double(repeat)
              << "\nAverage GFLOPS = " << double(repeat) * flop / elapsed / 1.0e9 << "\n";
        }
      }
      gflops.push_back(inner_gflops);
      times.push_back(inner_times);
      world.gop.fence();
    }

    // Prints one lower-triangular result table: a header row with the
    // right-operand percentages, then one row per left-operand percentage.
    const auto print_table = [](const std::vector<std::vector<double> >& table) {
      for(unsigned int i = 0; i < table.size(); ++i){
        if(i == 0){
          std::cout << std::defaultfloat;
          std::cout << "   ";
          for(unsigned int j = 10; j <= 100; j+=10){
            std::cout << "        " << j;
          }
          std::cout << std::endl;
        }
        for(unsigned int j = 0; j < table[i].size(); ++j){
          if(j == 0){
            // Row label: left-operand percentage, padded to three columns.
            std::cout << std::defaultfloat;
            int num = (i+1) * 10;
            if(num < 100){
              std::cout << num << " |";
            } else { std::cout << num << "|"; }
          }

          std::cout << std::setprecision(3) << std::scientific;
          std::cout << double(table[i][j]) << " ";
        }
        std::cout << std::endl;
      }
    };

    if(world.rank() == 0){
      print_table(gflops);
      print_table(times);
    }

    madness::finalize();

  } catch(TiledArray::Exception& e) {
    std::cerr << "!!ERROR TiledArray: " << e.what() << "\n";
    rc = 1;
  } catch(madness::MadnessException& e) {
    std::cerr << "!!ERROR MADNESS: " << e.what() << "\n";
    rc = 1;
  } catch(SafeMPI::Exception& e) {
    std::cerr << "!!ERROR SafeMPI: " << e.what() << "\n";
    rc = 1;
  } catch(std::exception& e) {
    std::cerr << "!!ERROR std: " << e.what() << "\n";
    rc = 1;
  } catch(...) {
    std::cerr << "!!ERROR: unknown exception\n";
    rc = 1;
  }


  return rc;
}
Beispiel #4
0
int main(int argc, char** argv) {
  // Initialize runtime
  madness::World& world = madness::initialize(argc, argv);
  elem::Grid grid(elem::DefaultGrid().Comm());

  // Get command line arguments
  if(argc < 2) {
    std::cout << "Usage: ta_dense matrix_size block_size [repetitions]\n";
    return 0;
  }
  const long matrix_size = atol(argv[1]);
  const long block_size = atol(argv[2]);
  if (matrix_size <= 0) {
    std::cerr << "Error: matrix size must greater than zero.\n";
    return 1;
  }
  if (block_size <= 0) {
    std::cerr << "Error: block size must greater than zero.\n";
    return 1;
  }
  if((matrix_size % block_size) != 0ul) {
    std::cerr << "Error: matrix size must be evenly divisible by block size.\n";
    return 1;
  }
  const long repeat = (argc >= 4 ? atol(argv[3]) : 5);
  if (repeat <= 0) {
    std::cerr << "Error: number of repetitions must greater than zero.\n";
    return 1;
  }

  const std::size_t num_blocks = matrix_size / block_size;
  const std::size_t block_count = num_blocks * num_blocks;

  if(world.rank() == 0)
    std::cout << "TiledArray: dense matrix multiply test...\n"
              << "Number of nodes     = " << world.size()
              << "\nMatrix size         = " << matrix_size << "x" << matrix_size
              << "\nBlock size          = " << block_size << "x" << block_size
              << "\nMemory per matrix   = " << double(matrix_size * matrix_size * sizeof(double)) / 1.0e9
              << " GB\nNumber of blocks    = " << block_count
              << "\nAverage blocks/node = " << double(block_count) / double(world.size()) << "\n";

  // Construct TiledRange
  std::vector<unsigned int> blocking;
  blocking.reserve(num_blocks + 1);
  for(std::size_t i = 0; i <= matrix_size; i += block_size)
    blocking.push_back(i);

  std::vector<TiledArray::TiledRange1> blocking2(2,
      TiledArray::TiledRange1(blocking.begin(), blocking.end()));

  TiledArray::TiledRange
    trange(blocking2.begin(), blocking2.end());

  // Construct and initialize arrays
  TiledArray::Array<double, 2> a = make_random_array(world, trange);
  TiledArray::Array<double, 2> b = make_random_array(world, trange);
  TiledArray::Array<double, 2> c(world, trange);
  if(world.rank() == 0 && matrix_size < 11){
    std::cout << "a = \n" << a << std::endl;
    std::cout << "b = \n" << b << std::endl;
  }

  // Start clock
  world.gop.fence();
  const double wall_time_start = madness::wall_time();

  // Do matrix multiplication
  for(int i = 0; i < repeat; ++i) {
    c("m,n") = a("m,k") * b("k,n");
    world.gop.fence();
    if(world.rank() == 0)
      std::cout << "Iteration " << i + 1 << "\n";
  }
  // Stop clock
  const double wall_time_stop = madness::wall_time();

  if(world.rank() == 0){
    std::cout << "Average wall time   = " << (wall_time_stop - wall_time_start) / double(repeat)
        << " sec\nAverage GFLOPS      = " << double(repeat) * 2.0 * double(matrix_size *
            matrix_size * matrix_size) / (wall_time_stop - wall_time_start) / 1.0e9 << "\n" << std::endl;
  }

  // Copying matrices to elemental
  elem::DistMatrix<double> a_elem = array_to_elem(a,grid);
  elem::DistMatrix<double> b_elem = array_to_elem(b,grid);
  elem::mpi::Barrier(grid.Comm());
  if(matrix_size < 11){
    Print(a_elem, "a from elem");
    Print(b_elem, "b from elem");
  }

  // Timed copy
  const double wall_time_copy0 = madness::wall_time();
  int j = 0;
  while(j++ < repeat){
    a_elem = array_to_elem(a,grid);
    b_elem = array_to_elem(b,grid);
    elem::mpi::Barrier(grid.Comm());
  }
  const double wall_time_copy1 = madness::wall_time();

  // How long the copy took
  if(world.rank() == 0){
    std::cout << "Spent " <<
      (wall_time_copy1 - wall_time_copy0)/(2.0 * double(repeat)) <<
      " s for an array copy to elemental on average.\n" << std::endl;
  }

  // Make the data output array
  elem::DistMatrix<double> c_elem(matrix_size, matrix_size, grid);
  elem::Zero(c_elem);
  elem::mpi::Barrier(grid.Comm());

  // Do the multiply
  const double wt_elem_start = madness::wall_time();
  for(std::size_t i = 0; i < repeat; ++i){
    elem::Gemm(elem::NORMAL, elem::NORMAL, 1., a_elem, b_elem, 0., c_elem);
    elem::mpi::Barrier(grid.Comm());
    if(grid.Rank() == 0){
      std::cout << "Elem Iteration " << i + 1 << "\n";
    }
  }
  const double wt_elem_end = madness::wall_time();

  // Time elemental
  if(world.rank() == 0){
    std::cout << "Average Elemental wall time   = " << (wt_elem_end - wt_elem_start) / double(repeat)
        << " sec\nAverage GFLOPS      = " << double(repeat) * 2.0 * double(matrix_size *
            matrix_size * matrix_size) / (wt_elem_end - wt_elem_start) / 1.0e9 << "\n";
  }

  // copy back to ta
  int i = 0;
  const double e_to_t_start = madness::wall_time();
  while(i++ < repeat){
    TiledArray::elem_to_array(c, c_elem);
    elem::mpi::Barrier(grid.Comm());
  }
  const double e_to_t_end = madness::wall_time();

  if(world.rank() == 0){
    std::cout << "Copying to TA from Elemental took " << (e_to_t_end - e_to_t_start)/(double(repeat)) << " s on average." << std::endl;
  }

  madness::finalize();
  return 0;
}