Ejemplo n.º 1
0
int main(int argc, char *argv[])
{
  bool success = true;
  bool verbose = false;
  try {

    const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
    const size_t num_cores_per_socket =
      Kokkos::hwloc::get_available_cores_per_numa();
    const size_t num_threads_per_core =
      Kokkos::hwloc::get_available_threads_per_core();

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This test performance of MP::Vector multiply routines.\n");
    int nGrid = 32;
    CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
    int nIter = 10;
    CLP.setOption("ni", &nIter, "Number of multiply iterations");
#ifdef KOKKOS_HAVE_PTHREAD
    bool threads = true;
    CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
    int num_cores = num_cores_per_socket * num_sockets;
    CLP.setOption("cores", &num_cores,
                  "Number of CPU cores to use (defaults to all)");
    int num_hyper_threads = num_threads_per_core;
    CLP.setOption("hyperthreads", &num_hyper_threads,
                  "Number of hyper threads per core to use (defaults to all)");
    int threads_per_vector = 1;
    CLP.setOption("threads_per_vector", &threads_per_vector,
                  "Number of threads to use within each vector");
#endif
#ifdef KOKKOS_HAVE_CUDA
    bool cuda = true;
    CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
    int cuda_threads_per_vector = 16;
    CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
                  "Number of Cuda threads to use within each vector");
    int cuda_block_size = 0;
    CLP.setOption("cuda_block_size", &cuda_block_size,
                  "Cuda block size (0 implies the default choice)");
    int num_cuda_blocks = 0;
    CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
                  "Number of Cuda blocks (0 implies the default choice)");
    int device_id = 0;
    CLP.setOption("device", &device_id, "CUDA device ID");
#endif
    CLP.parse( argc, argv );

    typedef int Ordinal;
    typedef double Scalar;

#ifdef KOKKOS_HAVE_PTHREAD
    if (threads) {
      typedef Kokkos::Threads Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Threads::initialize(num_cores*num_hyper_threads);

      std::cout << std::endl
                << "Threads performance with " << num_cores*num_hyper_threads
                << " threads:" << std::endl;

      Kokkos::DeviceConfig dev_config(num_cores,
                                       threads_per_vector,
                                       num_hyper_threads / threads_per_vector);

      mainHost<Storage>(nGrid, nIter, dev_config);

      Kokkos::Threads::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_CUDA
    if (cuda) {
      typedef Kokkos::Cuda Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Cuda::host_mirror_device_type::initialize();
      Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));

      cudaDeviceProp deviceProp;
      cudaGetDeviceProperties(&deviceProp, device_id);
      std::cout << std::endl
                << "CUDA performance for device " << device_id << " ("
                << deviceProp.name << "):"
                << std::endl;

      Kokkos::DeviceConfig dev_config(
        num_cuda_blocks,
        cuda_threads_per_vector,
        cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);

      mainCuda<Storage>(nGrid,nIter,dev_config);

      Kokkos::Cuda::host_mirror_device_type::finalize();
      Kokkos::Cuda::finalize();
    }
#endif

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}
Ejemplo n.º 2
0
int main(int argc, char *argv[])
{
  bool success = true;
  bool verbose = false;
  try {

    Teuchos::oblackholestream blackHole;
    Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);

    Teuchos::RCP<const Teuchos::Comm<int> > comm =
      Teuchos::DefaultComm<int>::getComm();

    const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
    const size_t num_cores_per_socket =
      Kokkos::hwloc::get_available_cores_per_numa();
    const size_t num_threads_per_core =
      Kokkos::hwloc::get_available_threads_per_core();

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This test performance of MP::Vector FEM assembly.\n");
    int nGrid = 32;
    CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
    int nIter = 10;
    CLP.setOption("ni", &nIter, "Number of assembly iterations");
    bool print = false;
    CLP.setOption("print", "no-print", &print, "Print debugging output");
    bool check = false;
    int num_cores = num_cores_per_socket * num_sockets;
    CLP.setOption("cores", &num_cores,
                  "Number of CPU cores to use (defaults to all)");
    int num_hyper_threads = num_threads_per_core;
    CLP.setOption("hyperthreads", &num_hyper_threads,
                  "Number of hyper threads per core to use (defaults to all)");
    int threads_per_vector = 1;
    CLP.setOption("threads_per_vector", &threads_per_vector,
                  "Number of threads to use within each vector");
    CLP.setOption("check", "no-check", &check, "Check correctness");
#ifdef KOKKOS_HAVE_SERIAL
    bool serial = true;
    CLP.setOption("serial", "no-serial", &serial, "Enable Serial device");
#endif
#ifdef KOKKOS_HAVE_PTHREAD
    bool threads = true;
    CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
#endif
#ifdef KOKKOS_HAVE_OPENMP
    bool openmp = true;
    CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device");
#endif
#ifdef KOKKOS_HAVE_CUDA
    bool cuda = true;
    CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
    int cuda_threads_per_vector = 16;
    CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
                  "Number of Cuda threads to use within each vector");
    int cuda_block_size = 256;
    CLP.setOption("cuda_block_size", &cuda_block_size,
                  "Cuda block size");
    int num_cuda_blocks = 0;
    CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
                  "Number of Cuda blocks (0 implies the default choice)");
    int device_id = -1;
    CLP.setOption("device", &device_id, "CUDA device ID.  Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
    int ngpus = 1;
    CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI");
#endif
    CLP.parse( argc, argv );

    int use_nodes[3];
    use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;

    typedef int Ordinal;
    typedef double Scalar;
    const Kokkos::Example::FENL::AssemblyMethod Method =
      Kokkos::Example::FENL::FadElementOptimized;
    // const Kokkos::Example::FENL::AssemblyMethod Method =
    //   Kokkos::Example::FENL::Analytic;

#ifdef KOKKOS_HAVE_SERIAL
    if (serial) {
      typedef Kokkos::Serial Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Serial::initialize();

      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "Serial performance with " << comm->getSize()
                  << " MPI ranks" << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1);

      mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::Serial::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_PTHREAD
    if (threads) {
      typedef Kokkos::Threads Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Threads::initialize(num_cores*num_hyper_threads);

      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "Threads performance with " << comm->getSize()
                  << " MPI ranks and " << num_cores*num_hyper_threads
                  << " threads per rank:" << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
                                       threads_per_vector,
                                       num_hyper_threads / threads_per_vector);

      mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::Threads::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_OPENMP
    if (openmp) {
      typedef Kokkos::OpenMP Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::OpenMP::initialize(num_cores*num_hyper_threads);

      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "OpenMP performance with " << comm->getSize()
                  << " MPI ranks and " << num_cores*num_hyper_threads
                  << " threads per rank:" << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
                                       threads_per_vector,
                                       num_hyper_threads / threads_per_vector);

      mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::OpenMP::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_CUDA
    if (cuda) {
      typedef Kokkos::Cuda Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      if (device_id == -1) {
        int local_rank = 0;
        char *str;
        if ((str = std::getenv("SLURM_LOCALID")))
          local_rank = std::atoi(str);
        else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
          local_rank = std::atoi(str);
        else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
          local_rank = std::atoi(str);
        device_id = local_rank % ngpus;

        // Check device is valid
        int num_device; cudaGetDeviceCount(&num_device);
        TEUCHOS_TEST_FOR_EXCEPTION(
          device_id >= num_device, std::logic_error,
          "Invalid device ID " << device_id << ".  You probably are trying" <<
          " to run with too many GPUs per node");
      }

      Kokkos::HostSpace::execution_space::initialize();
      Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));

      cudaDeviceProp deviceProp;
      cudaGetDeviceProperties(&deviceProp, device_id);
      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "CUDA performance performance with " << comm->getSize()
                  << " MPI ranks and device " << device_id << " ("
                  << deviceProp.name << "):"
                  << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(
        num_cuda_blocks,
        cuda_threads_per_vector,
        cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);

      mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::HostSpace::execution_space::finalize();
      Kokkos::Cuda::finalize();
    }
#endif

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}