/**
 * \brief NCCL implementation of \ref gpucomm_broadcast.
 */
static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                     int root, gpucomm *comm) {
  // dummy init so the compiler does not warn about uninitialized use
  ncclDataType_t datatype = ncclNumTypes;
  int rank = 0;
  cuda_context *ctx;

  ASSERT_BUF(array);
  ASSERT_COMM(comm);
  GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0, comm,
                              &datatype, NULL));
  GA_CHECK(get_rank(comm, &rank));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait for pending work on the buffer from concurrent kernels
  // (the root will read the buffer, the other ranks will overwrite it)
  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE));

  // issue the NCCL op on this context's stream (not the default stream) to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count,
                                    datatype, root, comm->c, ctx->s));

  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
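The cuda_wait/cuda_record pair above orders the NCCL launch against other streams that use the same buffer. A minimal sketch of the underlying idea with plain CUDA events (the function and stream names here are illustrative, not libgpuarray internals):

#include <cuda_runtime.h>

/* Make work queued later on `consumer` wait for everything already queued
 * on `producer`, without blocking the host. This is the event-based
 * record/wait pattern that per-buffer synchronization helpers build on. */
static cudaError_t wait_for_producer(cudaStream_t producer,
                                     cudaStream_t consumer) {
  cudaEvent_t ev;
  cudaError_t err;

  err = cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  if (err != cudaSuccess) return err;

  /* "record": mark the point in the producer stream that must complete */
  err = cudaEventRecord(ev, producer);
  if (err == cudaSuccess)
    /* "wait": work queued on the consumer stream afterwards only starts
     * once that point has been reached */
    err = cudaStreamWaitEvent(consumer, ev, 0);

  /* Destroying the event here is safe; it is released once the wait is done. */
  cudaEventDestroy(ev);
  return err;
}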
Example #2
template <typename Dtype>
void NCCL<Dtype>::Broadcast() {
  if (barrier_) {  // NULL in the multi-process case
    barrier_->wait();
  }
  NCCL_CHECK(ncclBcast(data_, static_cast<int>(size_),
                       nccl::dataType<Dtype>::type, 0,
                       comm_, cudaStreamDefault));
  if (barrier_) {
    barrier_->wait();
  }
}
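In this Caffe snippet, barrier_ keeps the per-GPU solver threads of a single process in lock-step around the collective: every thread enters the broadcast together, and none runs ahead before it has been issued. A minimal sketch of that synchronization pattern, using std::barrier from C++20 as a stand-in for the boost barrier Caffe uses (thread count and names are illustrative):

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const int nthreads = 4;                // e.g. one worker thread per GPU
  std::barrier<> sync_point(nthreads);

  auto worker = [&](int rank) {
    sync_point.arrive_and_wait();        // all workers reach the collective first
    std::printf("rank %d: the broadcast would be issued here\n", rank);
    sync_point.arrive_and_wait();        // no worker continues until all have issued it
  };

  std::vector<std::thread> threads;
  for (int r = 0; r < nthreads; ++r) threads.emplace_back(worker, r);
  for (auto &t : threads) t.join();
  return 0;
}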
Example #3
PyObject * THCPModule_nccl_broadcast(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs;
  int root;

  if (!PyArg_ParseTuple(args, "Oi", &_inputs, &root)) {
    THPUtils_invalidArguments(args, NULL, "nccl_broadcast", 1,
                              "(sequence[Tensor] inputs, int root)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);

  // safe to release the GIL from here on: no Python API is used below
  AutoNoGIL no_gil;
  _check_inputs(inputs, inputs, 1, 1);
  size_t len = inputs.size();

  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

  int64_t count = inputs[0].numel();
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    CHECK(ncclBcast(inputs[i].data_ptr(), count, data_type, root, comm[i], NULL));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
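All three examples ultimately funnel into the same ncclBcast call. For reference, a minimal self-contained sketch of that call outside any framework, assuming one process drives every visible GPU and NCCL 2 or later is available; the helper macros and variable names below are illustrative:

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

#define CHECK_CUDA(cmd) do { \
    cudaError_t e = (cmd); \
    if (e != cudaSuccess) { \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e)); \
      std::exit(1); \
    } \
  } while (0)

#define CHECK_NCCL(cmd) do { \
    ncclResult_t r = (cmd); \
    if (r != ncclSuccess) { \
      std::fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r)); \
      std::exit(1); \
    } \
  } while (0)

int main() {
  int ndev = 0;
  CHECK_CUDA(cudaGetDeviceCount(&ndev));

  const size_t count = 1024;   // elements per buffer
  const int root = 0;          // device whose buffer is broadcast

  std::vector<ncclComm_t> comms(ndev);
  std::vector<cudaStream_t> streams(ndev);
  std::vector<float*> buffs(ndev);

  // One communicator per visible device, all owned by this process.
  CHECK_NCCL(ncclCommInitAll(comms.data(), ndev, nullptr));

  for (int i = 0; i < ndev; ++i) {
    CHECK_CUDA(cudaSetDevice(i));
    CHECK_CUDA(cudaStreamCreate(&streams[i]));
    CHECK_CUDA(cudaMalloc((void**)&buffs[i], count * sizeof(float)));
  }

  // Group the per-device calls so NCCL launches them as one collective
  // (needed when a single thread drives several communicators, as in Example #3).
  CHECK_NCCL(ncclGroupStart());
  for (int i = 0; i < ndev; ++i) {
    CHECK_CUDA(cudaSetDevice(i));
    CHECK_NCCL(ncclBcast(buffs[i], count, ncclFloat, root,
                         comms[i], streams[i]));
  }
  CHECK_NCCL(ncclGroupEnd());

  // Wait until the broadcast has completed on every stream.
  for (int i = 0; i < ndev; ++i) {
    CHECK_CUDA(cudaSetDevice(i));
    CHECK_CUDA(cudaStreamSynchronize(streams[i]));
  }

  for (int i = 0; i < ndev; ++i) {
    CHECK_CUDA(cudaSetDevice(i));
    CHECK_CUDA(cudaFree(buffs[i]));
    CHECK_CUDA(cudaStreamDestroy(streams[i]));
    ncclCommDestroy(comms[i]);
  }
  return 0;
}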