/**
 * \brief NCCL implementation of \ref gpucomm_broadcast.
 *
 * Broadcasts `count` elements of type `typecode` located at byte `offset`
 * inside `array` from rank `root` to all ranks in `comm`, on this
 * context's CUDA stream (`ctx->s`).
 *
 * Returns GA_NO_ERROR on success; the GA_CHECK / *_EXIT_ON_ERROR macros
 * return early (and, after cuda_enter(), leave the context) on failure.
 */
static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                     int root, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  // (check_restrictions() fills in the real NCCL dtype on success)
  ncclDataType_t datatype = ncclNumTypes;
  int rank = 0;
  cuda_context *ctx;

  ASSERT_BUF(array);
  ASSERT_COMM(comm);
  // Validates offset/count/typecode against the buffer and communicator,
  // and maps typecode -> NCCL datatype.
  GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0,
                              comm, &datatype, NULL));
  GA_CHECK(get_rank(comm, &rank));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  // NOTE(review): root waits with CUDA_WAIT_READ, non-roots with
  // CUDA_WAIT_WRITE — presumably because root only reads the buffer while
  // the other ranks have it overwritten; confirm against cuda_wait()'s
  // contract.
  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count,
                                    datatype, root, comm->c, ctx->s));

  // Record the same access type we waited on, so later operations on
  // `array` order correctly after this collective.
  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
// Broadcast this rank's parameter buffer from rank 0 to all ranks.
//
// Fix: this is an out-of-class definition of a class-template member
// (`NCCL<Dtype>::`), so it requires the `template <typename Dtype>`
// header, which was missing.
template <typename Dtype>
void NCCL<Dtype>::Broadcast() {
  // barrier_ is NULL in the multi-process case; when present, rendezvous
  // with the other per-GPU threads before entering the collective.
  if (barrier_) {  // NULL in multi process case
    barrier_->wait();
  }
  // NOTE(review): this NCCL API takes an `int` element count, so size_ is
  // truncated here — assumes size_ < 2^31 elements; confirm for large nets.
  // Runs on the legacy default stream (cudaStreamDefault), which
  // synchronizes with all other blocking streams on the device.
  NCCL_CHECK(ncclBcast(data_, static_cast<int>(size_),
                       nccl::dataType<Dtype>::type, 0, comm_,
                       cudaStreamDefault));
  // Second barrier: no thread proceeds until every rank has issued the
  // broadcast.
  if (barrier_) {
    barrier_->wait();
  }
}
// Python binding: broadcast a sequence of same-shaped tensors (one per GPU)
// from the tensor on rank `root` to all the others via NCCL.
//
// Fix: the usage string reported on bad arguments was missing its closing
// parenthesis ("(sequence[Tensor] inputs, int root"), producing a malformed
// error message.
PyObject * THCPModule_nccl_broadcast(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs;
  int root;
  if (!PyArg_ParseTuple(args, "Oi", &_inputs, &root)) {
    THPUtils_invalidArguments(args, NULL, "nccl_broadcast", 1,
                              "(sequence[Tensor] inputs, int root)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);

  // we can safely release GIL after this line, no python API used
  AutoNoGIL no_gil;
  _check_inputs(inputs, inputs, 1, 1);
  size_t len = inputs.size();
  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());
  int64_t count = inputs[0].numel();

  // Hold the caching allocator's free mutex while issuing NCCL calls so a
  // concurrent cudaFree cannot deadlock against the collective.
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;

#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  // NCCL2 requires grouping the per-device calls so they are issued as one
  // collective instead of deadlocking on sequential single-device calls.
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    // Each tensor lives on its own device; switch before issuing its call.
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    // NULL stream == legacy default stream on the current device.
    CHECK(ncclBcast(inputs[i].data_ptr(), count, data_type, root, comm[i],
                    NULL));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}