void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
      solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}
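// --- Illustrative sketch (not part of the code above): the same
// sum-then-scale gradient-averaging pattern written against raw NCCL and
// cuBLAS. `grad`, `count`, `comm`, `stream`, and `world_size` are assumed to
// be set up by the caller; error checking is omitted for brevity.
#include <nccl.h>
#include <cublas_v2.h>

static void average_gradients(float* grad, size_t count, ncclComm_t comm,
                              cudaStream_t stream, int world_size) {
  // In-place sum across ranks (send and receive buffers may alias).
  ncclAllReduce(grad, grad, count, ncclFloat, ncclSum, comm, stream);

  // Scale by 1/world_size so the summed gradients become an average,
  // mirroring the caffe_gpu_scal call above.
  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasSetStream(handle, stream);
  const float alpha = 1.0f / world_size;
  cublasSscal(handle, (int)count, &alpha, grad, 1);
  cublasDestroy(handle);
}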
/**
 * \brief NCCL implementation of \ref gpucomm_all_reduce.
 */
static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                      size_t offdest, size_t count, int typecode,
                      int opcode, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode,
                              opcode, comm, &datatype, &op));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc),
                                        (void *)(dest->ptr + offdest), count,
                                        datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
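// --- Illustrative sketch (not part of libgpuarray): the event-based
// cross-stream ordering that cuda_wait/cuda_record provide, written with the
// plain CUDA runtime. `compute_stream`, `comm_stream`, and `ev` (created with
// cudaEventCreateWithFlags(&ev, cudaEventDisableTiming)) are assumed to exist.
#include <cuda_runtime.h>

static void order_collective_between_streams(cudaStream_t compute_stream,
                                             cudaStream_t comm_stream,
                                             cudaEvent_t ev) {
  // Communication stream waits for compute work already enqueued.
  cudaEventRecord(ev, compute_stream);
  cudaStreamWaitEvent(comm_stream, ev, 0);

  // ... enqueue ncclAllReduce(...) on comm_stream here ...

  // Compute stream waits for the collective before touching the buffers again.
  cudaEventRecord(ev, comm_stream);
  cudaStreamWaitEvent(compute_stream, ev, 0);
}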
void NCCL<Dtype>::on_gradients_ready() {
  if (solver_->param().layer_wise_reduce()) {
    CHECK_EQ(solver_->net()->params().size(),
             solver_->net()->learnable_params().size())
        << "Layer-wise reduce is not supported for nets with shared weights.";
    // Make sure reduction is done before applying gradients
    CUDA_CHECK(cudaStreamSynchronize(stream_));
  } else {
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(diff_, diff_, static_cast<int>(size_),
                             nccl::dataType<Dtype>::type, ncclSum, comm_,
                             cudaStreamDefault));
    caffe_gpu_scal(static_cast<int>(size_),
                   (Dtype) 1.0 / Caffe::solver_count(), diff_);
  }
}
PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs;
  int op;

  if (!PyArg_ParseTuple(args, "OOi", &_inputs, &_outputs, &op)) {
    THPUtils_invalidArguments(args, NULL, "nccl_all_reduce", 1,
                              "(sequence[Tensor] inputs, sequence[Tensor]"
                              " outputs, int op)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);

  // we can safely release GIL after this line, no python API used
  AutoNoGIL no_gil;

  _check_inputs(inputs, outputs, 1, 1);
  size_t len = inputs.size();

  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());
  int64_t count = inputs[0].numel();

  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t* comm = _get_communicator(inputs);
  AutoGPU gpu_guard;

#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    CHECK(ncclAllReduce(inputs[i].data_ptr(), outputs[i].data_ptr(),
                        count, data_type, (ncclRedOp_t) op, comm[i], NULL));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
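// --- Illustrative sketch (not part of PyTorch): the single-process,
// multi-GPU pattern the binding above implements, written against raw NCCL.
// ncclGroupStart/ncclGroupEnd let one thread issue all per-device calls
// without deadlocking; error checking and ndev > 8 handling are omitted.
#include <nccl.h>
#include <cuda_runtime.h>

static void all_reduce_multi_gpu(float** bufs, size_t count, int ndev) {
  ncclComm_t comms[8];                 // sketch assumes ndev <= 8
  ncclCommInitAll(comms, ndev, NULL);  // NULL devlist = devices 0..ndev-1

  ncclGroupStart();
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    // In-place sum on each device's default stream.
    ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum, comms[i], NULL);
  }
  ncclGroupEnd();

  for (int i = 0; i < ndev; ++i) {
    ncclCommDestroy(comms[i]);
  }
}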