Code Example #1
File: parallel.cpp  Project: 20337112/caffe
template <typename Dtype>
void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
    solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}
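
For reference, ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) is the generic NCCL entry point, used above in place (send == recv) on a dedicated stream, with the sum turned into an average by the caffe_gpu_scal call. Below is a minimal, self-contained sketch of that in-place sum pattern for a single process driving every visible GPU. It is illustrative only and not part of the Caffe sources; the CHK macro and buffer size are assumptions, and error handling is reduced to bare checks.

// Minimal sketch, not Caffe code: in-place sum all-reduce across all visible
// GPUs from a single process, mirroring the sum-then-scale pattern above.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

#define CHK(x) do { if (!(x)) { std::fprintf(stderr, "failed: %s\n", #x); std::exit(1); } } while (0)

int main() {
  int ndev = 0;
  CHK(cudaGetDeviceCount(&ndev) == cudaSuccess && ndev > 0);

  const size_t count = 1 << 20;  // elements per GPU
  std::vector<float*> buf(ndev);
  std::vector<cudaStream_t> stream(ndev);
  std::vector<ncclComm_t> comm(ndev);

  // One buffer and one non-default stream per device.
  for (int i = 0; i < ndev; ++i) {
    CHK(cudaSetDevice(i) == cudaSuccess);
    CHK(cudaMalloc(&buf[i], count * sizeof(float)) == cudaSuccess);
    CHK(cudaMemset(buf[i], 0, count * sizeof(float)) == cudaSuccess);
    CHK(cudaStreamCreate(&stream[i]) == cudaSuccess);
  }

  // Single-process communicator clique over devices 0..ndev-1.
  CHK(ncclCommInitAll(comm.data(), ndev, nullptr) == ncclSuccess);

  // In-place sum: sendbuff == recvbuff, like blobs[0]->mutable_gpu_diff() above.
  CHK(ncclGroupStart() == ncclSuccess);
  for (int i = 0; i < ndev; ++i) {
    CHK(ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum,
                      comm[i], stream[i]) == ncclSuccess);
  }
  CHK(ncclGroupEnd() == ncclSuccess);

  // Wait for the reduction; Caffe instead enqueues a scale by
  // 1/solver_count on the same stream to turn the sum into an average.
  for (int i = 0; i < ndev; ++i) {
    CHK(cudaSetDevice(i) == cudaSuccess);
    CHK(cudaStreamSynchronize(stream[i]) == cudaSuccess);
  }

  for (int i = 0; i < ndev; ++i) {
    CHK(cudaSetDevice(i) == cudaSuccess);
    ncclCommDestroy(comm[i]);
    cudaFree(buf[i]);
    cudaStreamDestroy(stream[i]);
  }
  std::printf("all-reduce done on %d device(s)\n", ndev);
  return 0;
}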
Code Example #2
/**
 * \brief NCCL implementation of \ref gpucomm_all_reduce.
 */
static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                      size_t offdest, size_t count, int typecode, int opcode,
                      gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode,
                              opcode, comm, &datatype, &op));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc),
                                        (void *)(dest->ptr + offdest), count,
                                        datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
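
The cuda_wait()/cuda_record() pairs above are what tie the collective into libgpuarray's per-buffer stream bookkeeping. As a rough illustration only (the helper function, the stream names, and the raw pointers are assumptions, not libgpuarray API), the same ordering can be expressed with plain CUDA events around the ncclAllReduce call:

// Illustrative sketch, not libgpuarray code: order the NCCL stream after the
// stream that last touched src/dest, and make later work on that stream wait
// for the collective, using plain CUDA events.
#include <cuda_runtime.h>
#include <nccl.h>

static void all_reduce_ordered(const void *src, void *dest, size_t count,
                               ncclDataType_t dtype, ncclRedOp_t op,
                               ncclComm_t comm,
                               cudaStream_t producer_stream,  // last touched src/dest
                               cudaStream_t nccl_stream) {    // plays the role of ctx->s
  cudaEvent_t ready, done;
  cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

  // Role of cuda_wait(): the collective must not start before earlier
  // kernels writing src or reading/writing dest have finished.
  cudaEventRecord(ready, producer_stream);
  cudaStreamWaitEvent(nccl_stream, ready, 0);

  ncclAllReduce(src, dest, count, dtype, op, comm, nccl_stream);

  // Role of cuda_record(): later work on the producer stream that consumes
  // dest must wait for the collective to finish.
  cudaEventRecord(done, nccl_stream);
  cudaStreamWaitEvent(producer_stream, done, 0);

  cudaEventDestroy(ready);
  cudaEventDestroy(done);
}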
Code Example #3
File: parallel.cpp  Project: 20337112/caffe
template <typename Dtype>
void NCCL<Dtype>::on_gradients_ready() {
  if (solver_->param().layer_wise_reduce()) {
    CHECK_EQ(solver_->net()->params().size(),
             solver_->net()->learnable_params().size())
      << "Layer-wise reduce is not supported for nets with shared weights.";

    // Make sure reduction is done before applying gradients
    CUDA_CHECK(cudaStreamSynchronize(stream_));
  } else {
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(diff_, diff_, static_cast<int>(size_),
                             nccl::dataType<Dtype>::type, ncclSum, comm_,
                             cudaStreamDefault));
    caffe_gpu_scal(static_cast<int>(size_),
                   (Dtype) 1.0 / Caffe::solver_count(), diff_);
  }
}
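
Both Caffe snippets rely on CUDA_CHECK and NCCL_CHECK to turn error codes into hard failures before continuing. The macros below are only an illustrative stand-in for that pattern; the MY_-prefixed names are invented here, and Caffe's real macros live in its own headers.

// Illustrative stand-in for the CUDA_CHECK / NCCL_CHECK pattern used above.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <nccl.h>

#define MY_CUDA_CHECK(expr)                                           \
  do {                                                                \
    cudaError_t e_ = (expr);                                          \
    if (e_ != cudaSuccess) {                                          \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",              \
                   cudaGetErrorString(e_), __FILE__, __LINE__);       \
      std::abort();                                                   \
    }                                                                 \
  } while (0)

#define MY_NCCL_CHECK(expr)                                           \
  do {                                                                \
    ncclResult_t r_ = (expr);                                         \
    if (r_ != ncclSuccess) {                                          \
      std::fprintf(stderr, "NCCL error '%s' at %s:%d\n",              \
                   ncclGetErrorString(r_), __FILE__, __LINE__);       \
      std::abort();                                                   \
    }                                                                 \
  } while (0)

// Usage: MY_NCCL_CHECK(ncclAllReduce(send, recv, n, ncclFloat, ncclSum, comm, stream));
//        MY_CUDA_CHECK(cudaStreamSynchronize(stream));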
Code Example #4
File: nccl.cpp  Project: Northrend/pytorch
PyObject * THCPModule_nccl_all_reduce(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs;
  int op;

  if (!PyArg_ParseTuple(args, "OOi", &_inputs, &_outputs, &op)) {
    THPUtils_invalidArguments(args, NULL, "nccl_all_reduce", 1,
                              "(sequence[Tensor] inputs, sequence[Tensor]"
                              " outputs, int op)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);

  // we can safely release GIL after this line, no python API used
  AutoNoGIL no_gil;
  _check_inputs(inputs, outputs, 1, 1);
  size_t len = inputs.size();

  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

  int64_t count = inputs[0].numel();
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    CHECK(ncclAllReduce(inputs[i].data_ptr(), outputs[i].data_ptr(),
                        count, data_type, (ncclRedOp_t) op, comm[i], NULL));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
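
Two details are worth noting here: under NCCL 2 the per-device ncclAllReduce calls issued from this single thread must be bracketed by ncclGroupStart()/ncclGroupEnd(), which is what the NCCL_MAJOR guards provide; and the element type is translated to a ncclDataType_t by _get_data_type(), much like nccl::dataType<Dtype>::type in the Caffe examples. A hypothetical compile-time mapping of that kind (NcclType is an invented name, not the PyTorch helper) could look like this:

// Illustrative sketch: map C++ scalar types to NCCL data types at compile
// time, in the spirit of _get_data_type() / nccl::dataType<Dtype>::type.
#include <cstdint>
#include <nccl.h>

template <typename T> struct NcclType;  // left undefined: unsupported types fail to compile
template <> struct NcclType<float>   { static constexpr ncclDataType_t value = ncclFloat;  };
template <> struct NcclType<double>  { static constexpr ncclDataType_t value = ncclDouble; };
template <> struct NcclType<int32_t> { static constexpr ncclDataType_t value = ncclInt32;  };
template <> struct NcclType<int64_t> { static constexpr ncclDataType_t value = ncclInt64;  };

// Usage:
//   ncclAllReduce(send, recv, count, NcclType<float>::value, ncclSum, comm, stream);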