PyObject * THCPModule_nccl_all_gather(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs;
  if (!PyArg_ParseTuple(args, "OO", &_inputs, &_outputs)) {
    THPUtils_invalidArguments(args, NULL, "nccl_all_gather", 1,
        "(sequence[Tensor] inputs, sequence[Tensor] outputs)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);

  // we can safely release GIL after this line, no python API used
  AutoNoGIL no_gil;

  size_t len = inputs.size();
  _check_inputs(inputs, outputs, len, 1);
  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

  int64_t count = inputs[0].numel();
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
    // NCCL >= 2 takes (sendbuff, recvbuff, count, datatype, comm, stream)
    CHECK(ncclAllGather(inputs[i].data_ptr(), outputs[i].data_ptr(),
        count, data_type, comm[i], NULL));
#else
    // NCCL 1.x takes (sendbuff, count, datatype, recvbuff, comm, stream)
    CHECK(ncclAllGather(inputs[i].data_ptr(), count, data_type,
        outputs[i].data_ptr(), comm[i], NULL));
#endif
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
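// A minimal standalone sketch (not part of the binding above) of the same
// NCCL >= 2 pattern the loop uses: one communicator per device, with all
// per-device calls wrapped in ncclGroupStart()/ncclGroupEnd() so a single
// thread can drive several GPUs without deadlocking.  The device count (2),
// buffer size, and use of the default stream are illustrative assumptions.

#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <cstdlib>

#define NCCL_CHECK(cmd) do {                                          \
    ncclResult_t r = (cmd);                                           \
    if (r != ncclSuccess) {                                           \
      fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r));     \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)

int main() {
  const int ndev = 2;            // assumed: two visible GPUs
  const size_t count = 1024;     // elements contributed by each device
  int devs[ndev] = {0, 1};

  ncclComm_t comms[ndev];
  NCCL_CHECK(ncclCommInitAll(comms, ndev, devs));

  float *send[ndev], *recv[ndev];
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(devs[i]);
    cudaMalloc(&send[i], count * sizeof(float));
    // the gathered result holds one chunk per device
    cudaMalloc(&recv[i], ndev * count * sizeof(float));
  }

  // Same structure as the loop in THCPModule_nccl_all_gather: group the
  // per-device calls so NCCL can schedule them together.
  NCCL_CHECK(ncclGroupStart());
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(devs[i]);
    NCCL_CHECK(ncclAllGather(send[i], recv[i], count, ncclFloat,
                             comms[i], /*stream=*/0));
  }
  NCCL_CHECK(ncclGroupEnd());

  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(devs[i]);
    cudaDeviceSynchronize();
    cudaFree(send[i]);
    cudaFree(recv[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}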
/**
 * \brief NCCL implementation of \ref gpucomm_all_gather.
 */
static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
                      size_t offdest, size_t count, int typecode,
                      gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclDataType_t datatype = ncclNumTypes;
  int ndev = 0;
  size_t resc_size;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count, typecode, 0, comm,
                              &datatype, NULL));

  if (dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination and comm context differ");

  GA_CHECK(get_count(comm, &ndev));
  // the gathered result holds one chunk of `count` elements per device
  resc_size = ndev * count * gpuarray_get_elsize(typecode);
  if ((dest->sz - offdest) < resc_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination too small for operation");
  assert(!(offdest > dest->sz));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclAllGather((void *)(src->ptr + offsrc),
                                        (void *)(dest->ptr + offdest), count,
                                        datatype, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
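// A hedged caller-side sketch (not from the library source).  It assumes the
// public wrapper gpucomm_all_gather() exposes the same parameter list as the
// static implementation above (as its \ref doc comment suggests), that the
// declarations live under the gpuarray/ include prefix, and that `comm`,
// `src`, and `dest` were created elsewhere with the libgpuarray buffer and
// collectives API.  The point it illustrates is the sizing rule enforced
// above: every rank contributes `count` elements, so the destination must
// hold ndev * count elements of the given typecode.

#include <gpuarray/buffer.h>              // assumed include path
#include <gpuarray/buffer_collectives.h>  // assumed include path
#include <gpuarray/types.h>

static int gather_floats(gpucomm *comm, int ndev,
                         gpudata *src, gpudata *dest, size_t count) {
  // dest must have been allocated with at least
  //   ndev * count * gpuarray_get_elsize(GA_FLOAT)
  // bytes, otherwise all_gather() above returns GA_VALUE_ERROR.
  return gpucomm_all_gather(src, 0 /* offsrc */,
                            dest, 0 /* offdest */,
                            count, GA_FLOAT, comm);
}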