C++ (Cpp) ncclReduce 예제들

프로그래밍 언어: C++ (Cpp)

메소드/함수: ncclReduce

hotexamples.com에서의 예제들: 2

C++ (Cpp) ncclReduce - 2개의 예제가 발견되었습니다. 이 예제들은 오픈소스 프로젝트에서 추출한 C++ (Cpp) ncclReduce의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 예제를 평가하면 예제의 품질 향상에 도움이 됩니다.

예제 #1

파일 보기

파일: gpuarray_collectives_cuda_nccl.c 프로젝트: Theano/libgpuarray

/**
 * \brief NCCL implementation of \ref gpucomm_reduce.
 *
 * Only the rank equal to \p root validates and uses \p dest; every other rank
 * hands NCCL a NULL receive buffer and never dereferences \p dest.
 *
 * \returns GA_NO_ERROR once the reduce has been issued on `ctx->s`. Failure
 *          paths live inside the checking macros (presumably an early return
 *          with an error code -- confirm against the macro definitions).
 */
static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
                  size_t count, int typecode, int opcode, int root,
                  gpucomm *comm) {
  cuda_context *ctx;
  // stays NULL on every rank except the root
  gpudata *root_dest = NULL;
  int my_rank = 0;
  int is_root;
  // placeholder values; check_restrictions is handed pointers to both
  ncclDataType_t datatype = ncclNumTypes;
  ncclRedOp_t op = ncclNumOps;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  GA_CHECK(get_rank(comm, &my_rank));

  is_root = (my_rank == root);
  if (is_root) {
    ASSERT_BUF(dest);
    root_dest = dest;
  }
  GA_CHECK(check_restrictions(src, offsrc, root_dest, offdest, count, typecode,
                              opcode, comm, &datatype, &op));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // The source is only read by the reduce, so it is waited on in read mode.
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // The destination is written, and only on the root.
  if (is_root) {
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));
  }

  // The reduce is issued on the context's stream (ctx->s). Non-root ranks
  // pass NULL as the receive buffer.
  if (!is_root) {
    NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), NULL, count,
                                       datatype, op, root, comm->c, ctx->s));
  } else {
    NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc),
                                       (void *)(dest->ptr + offdest), count,
                                       datatype, op, root, comm->c, ctx->s));
  }

  // Record the read of src and, on the root, the write of dest.
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  if (is_root) {
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
  }

  cuda_exit(ctx);

  return GA_NO_ERROR;
}

예제 #2

파일 보기

파일: nccl.cpp 프로젝트: Northrend/pytorch

// Python entry point that issues one ncclReduce per input tensor.
//
// `args` must unpack (format "OOOii") into:
//   inputs  - sequence of tensors, one per participating device
//   outputs - sequence of tensors; outputs[i] is the receive buffer passed
//             alongside inputs[i]
//   streams - sequence with exactly one entry per input; a NULL entry makes
//             the call use a NULL stream
//   root    - forwarded to ncclReduce as the root argument
//   op      - integer cast to ncclRedOp_t
//
// Returns None on success. Returns NULL after reporting through
// THPUtils_invalidArguments when the argument tuple cannot be parsed. Other
// failures go through THPUtils_assert / CHECK inside the
// HANDLE_TH_ERRORS ... END_HANDLE_TH_ERRORS wrapper.
PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs, *_streams;
  int root, op;

  if (!PyArg_ParseTuple(args, "OOOii", &_inputs, &_outputs, &_streams, &root, &op)) {
    THPUtils_invalidArguments(args, NULL, "nccl_reduce", 1,
                              "(sequence[Tensor] inputs, sequence[Tensor]"
                              " outputs, sequence[torch.cuda.Stream or None], int root, int op)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);
  std::vector<THCStream*> streams = THPUtils_PySequence_to_THCStreamList(_streams);

  THPUtils_assert(inputs.size() == streams.size(), "number of streams is not equal to number of inputs");

  {
    // No Python API is used inside this scope, so the GIL is released for
    // its duration. The scope closes before Py_RETURN_NONE so that the
    // reference count of None is only touched with the GIL held again.
    AutoNoGIL no_gil;
    _check_inputs(inputs, outputs, 1, 1);
    size_t len = inputs.size();

    // Element type and count are taken from the first input and reused for
    // every ncclReduce call below.
    ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

    int64_t count = inputs[0].numel();
    // Hold the caching allocator's free mutex while the NCCL calls are issued.
    std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
    ncclComm_t *comm = _get_communicator(inputs);
    AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
    CHECK(ncclGroupStart());
#endif
    for (size_t i = 0; i < len; i++) {
      // Switch to the device that owns inputs[i] before issuing its call.
      int device = inputs[i].get_device();
      gpu_guard.setDevice(device);
      auto stream = (streams[i] == NULL) ? NULL : streams[i]->stream;
      CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(),
                       count, data_type, (ncclRedOp_t) op, root, comm[i], stream));
    }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
    CHECK(ncclGroupEnd());
#endif
  }

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}