/**
 * \brief NCCL implementation of \ref gpucomm_reduce_scatter.
 */
static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                          size_t offdest, size_t count, int typecode,
                          int opcode, gpucomm *comm) {
  // dummy initialization so the compiler does not warn about possibly
  // uninitialized use; check_restrictions() below fills in both values
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  int ndev = 0;
  size_t resc_size;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(get_count(comm, &ndev));
  // src must provide count elements for each of the ndev participants
  GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode,
                              opcode, comm, &datatype, &op));
  if (dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
  // check the offset before subtracting: offdest > dest->sz would make
  // dest->sz - offdest wrap around and slip past the size check below
  assert(!(offdest > dest->sz));
  resc_size = count * gpuarray_get_elsize(typecode);
  if ((dest->sz - offdest) < resc_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait until pending writes to src finish so it is safe to read
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait until pending reads/writes on dest finish so it is safe to write
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // run the nccl op on this context's stream so it can overlap other work
  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc),
                                            (void *)(dest->ptr + offdest), count,
                                            datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
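
For comparison, here is a minimal standalone sketch of the same collective written directly against the raw NCCL API, outside libgpuarray. The function name, buffer sizes, and the omission of error handling are illustrative assumptions; the calls themselves (ncclCommInitAll, ncclGroupStart/ncclGroupEnd, ncclReduceScatter) are the standard single-process, one-communicator-per-GPU pattern for NCCL >= 2.

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Hypothetical sketch (not libgpuarray code): sum-reduce-scatter a float
// vector across ndev GPUs in one process. Buffers are left uninitialized
// and most error handling is omitted for brevity.
int reduce_scatter_demo(int ndev, size_t count /* elements received per rank */) {
  std::vector<ncclComm_t> comms(ndev);
  std::vector<float *> send(ndev), recv(ndev);
  std::vector<cudaStream_t> streams(ndev);

  // one communicator per device in this process (devices 0..ndev-1)
  if (ncclCommInitAll(comms.data(), ndev, nullptr) != ncclSuccess)
    return -1;

  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    cudaMalloc(&send[i], count * ndev * sizeof(float)); // ndev chunks in
    cudaMalloc(&recv[i], count * sizeof(float));        // one reduced chunk out
    cudaStreamCreate(&streams[i]);
  }

  // group the per-device calls so NCCL launches them as a single collective
  ncclGroupStart();
  for (int i = 0; i < ndev; i++)
    ncclReduceScatter(send[i], recv[i], count, ncclFloat, ncclSum,
                      comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaFree(send[i]);
    cudaFree(recv[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}

As in the function above, every rank's send buffer holds count * ndev elements while only the reduced count-element chunk comes back, which is exactly the size relation the check_restrictions(..., count * ndev, ...) call verifies.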
Example #2
PyObject * THCPModule_nccl_reduce_scatter(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs;
  int op;

  if (!PyArg_ParseTuple(args, "OOi", &_inputs, &_outputs, &op)) {
    THPUtils_invalidArguments(args, NULL, "nccl_reduce_scatter", 1,
			      "(sequence[Tensor] inputs, sequence[Tensor] outputs, int op");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);

  // no Python C API is used past this line, so the GIL can be released safely
  AutoNoGIL no_gil;
  size_t len = inputs.size();
  _check_inputs(inputs, outputs, 1, len);

  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

  // each rank contributes the full vector and receives numel / len of it
  int64_t count = inputs[0].numel() / len;
  // serialize with the caching allocator's cudaFree to avoid deadlocking NCCL
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  // enqueue one reduce-scatter per participating GPU
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    CHECK(ncclReduceScatter(inputs[i].data_ptr(), outputs[i].data_ptr(),
			    count, data_type, (ncclRedOp_t) op, comm[i], NULL));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
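
To make the count = inputs[0].numel() / len arithmetic above concrete, here is a hedged host-side reference of what a sum reduce-scatter computes; the names are illustrative and this is not code from PyTorch or NCCL. With ndev ranks, rank r ends up with the element-wise sum of every rank's r-th chunk of count elements, which is why each output tensor needs only numel / len elements even though every input carries the full vector.

#include <cstddef>
#include <vector>

// Host-side reference for a sum reduce-scatter over ndev ranks.
// inputs[r] holds ndev * count elements; outputs[r] receives count elements:
//   outputs[r][j] = sum over ranks s of inputs[s][r * count + j]
void reduce_scatter_ref(const std::vector<std::vector<float>> &inputs,
                        std::vector<std::vector<float>> &outputs,
                        std::size_t count) {
  const std::size_t ndev = inputs.size();
  outputs.resize(ndev);
  for (std::size_t r = 0; r < ndev; r++) {
    outputs[r].assign(count, 0.0f);
    for (std::size_t s = 0; s < ndev; s++)
      for (std::size_t j = 0; j < count; j++)
        outputs[r][j] += inputs[s][r * count + j];
  }
}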