Example #1
static PyObject * THPStorage_(shareCuda)(THPStorage *self)
{
  HANDLE_TH_ERRORS
  THStorage *storage = self->cdata;
  AutoGPU gpu_guard(storage->device);
  // Tuple layout: (device, IPC handle bytes, storage size, offset into base allocation, view size).
  THPObjectPtr tuple(PyTuple_New(5));
  THPObjectPtr device(PyLong_FromLong(storage->device));
  THPObjectPtr _handle(Py_None);
  Py_INCREF(Py_None);
  THPObjectPtr size(PyLong_FromLong(storage->size));
  THPObjectPtr _offset(PyLong_FromLong(0));
  THPObjectPtr view_size(PyLong_FromLong(storage->size));
  if (storage->data) {
    // IPC handles must refer to the caching allocator's base allocation, so
    // export the base pointer and record this storage's offset into it.
    size_t base_size;
    void *base_ptr = THCCachingAllocator_getBaseAllocation(storage->data, &base_size);
    ptrdiff_t offset = (char*)storage->data - (char*)base_ptr;

    cudaIpcMemHandle_t handle;
    THCudaCheck(cudaIpcGetMemHandle(&handle, base_ptr));

    _handle = PyBytes_FromStringAndSize((char *)&handle, CUDA_IPC_HANDLE_SIZE);
    _offset = PyLong_FromSsize_t((Py_ssize_t)offset);
    size = PyLong_FromSize_t(base_size / sizeof(real));
  }
  if (!tuple || !device || !_handle || !size || !_offset || !view_size) {
    return NULL;
  }
  PyTuple_SET_ITEM(tuple.get(), 0, device.release());
  PyTuple_SET_ITEM(tuple.get(), 1, _handle.release());
  PyTuple_SET_ITEM(tuple.get(), 2, size.release());
  PyTuple_SET_ITEM(tuple.get(), 3, _offset.release());
  PyTuple_SET_ITEM(tuple.get(), 4, view_size.release());
  return tuple.release();
  END_HANDLE_TH_ERRORS
}
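
For context only: the tuple built above (device, IPC handle bytes, storage size, offset, view size) is meant to be opened in another process through the CUDA IPC API. The following is a minimal, hypothetical consumer-side sketch; the function name openSharedStorage and the assumption that the handle bytes and offset have already been transported to the receiver are mine, not part of the original source.

#include <cuda_runtime.h>
#include <cstddef>
#include <cstring>
#include <stdexcept>

// Hypothetical consumer: reopen the base allocation exported by shareCuda and
// recover the storage pointer from (handle bytes, offset into the base allocation).
static void * openSharedStorage(const char *handle_bytes, ptrdiff_t offset)
{
  cudaIpcMemHandle_t handle;
  std::memcpy(&handle, handle_bytes, sizeof(handle));

  void *base_ptr = nullptr;
  cudaError_t err = cudaIpcOpenMemHandle(&base_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
  if (err != cudaSuccess)
    throw std::runtime_error(cudaGetErrorString(err));

  // A real consumer would eventually release the mapping with cudaIpcCloseMemHandle(base_ptr).
  return (char*)base_ptr + offset;
}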
Example #2
at::Tensor DataChannelMPI::_newLikeFlat(std::vector<at::Tensor>& tensors) const {
  // TODO: check if all outputs are contiguous in memory and skip this step if yes
  if (tensors.size() == 0)
    throw std::runtime_error("received an empty list");
  auto & t = tensors[0];
  at::DeviceGuard gpu_guard(t.is_cuda() ? t.get_device() : -1);
  std::vector<int64_t> sizes { static_cast<int64_t>(tensors.size()) };  // sizes = [tensors.size()] + tensors[0].sizes()
  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
  return t.type().tensor(sizes);
}
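
A brief note on the last line: t.type().tensor(sizes) is the legacy ATen allocation idiom. On newer ATen versions the same flat buffer would typically be created through the factory API, roughly as in the sketch below; this is a rewrite under that assumption, not the code as it appears in DataChannelMPI.

#include <ATen/ATen.h>
#include <stdexcept>
#include <vector>

// Sketch: allocate a flat output of shape [tensors.size()] + tensors[0].sizes()
// with the newer factory API instead of t.type().tensor(sizes).
static at::Tensor newLikeFlat(const std::vector<at::Tensor>& tensors)
{
  if (tensors.empty())
    throw std::runtime_error("received an empty list");
  const auto& t = tensors[0];
  std::vector<int64_t> sizes { static_cast<int64_t>(tensors.size()) };
  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
  // t.options() carries the dtype, device, and layout of the reference tensor.
  return at::empty(sizes, t.options());
}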