static PyObject * THPStorage_(shareCuda)(THPStorage *self)
{
  HANDLE_TH_ERRORS
  THStorage *storage = self->cdata;
  AutoGPU gpu_guard(storage->device);

  // Result tuple: (device, IPC handle bytes, allocation size, offset, view size)
  THPObjectPtr tuple(PyTuple_New(5));
  THPObjectPtr device(PyLong_FromLong(storage->device));
  // THPObjectPtr steals a reference, so balance it for the shared Py_None
  THPObjectPtr _handle(Py_None);
  Py_INCREF(Py_None);
  THPObjectPtr size(PyLong_FromLong(storage->size));
  THPObjectPtr _offset(PyLong_FromLong(0));
  THPObjectPtr view_size(PyLong_FromLong(storage->size));

  if (storage->data) {
    // IPC handles refer to the base allocation of the caching allocator, so
    // export the base pointer plus this storage's offset into that allocation.
    size_t base_size;
    void *base_ptr = THCCachingAllocator_getBaseAllocation(storage->data, &base_size);
    ptrdiff_t offset = (char*)storage->data - (char*)base_ptr;

    cudaIpcMemHandle_t handle;
    THCudaCheck(cudaIpcGetMemHandle(&handle, base_ptr));

    _handle = PyBytes_FromStringAndSize((char *)&handle, CUDA_IPC_HANDLE_SIZE);
    _offset = PyLong_FromSsize_t((Py_ssize_t)offset);
    size = PyLong_FromSize_t(base_size / sizeof(real));
  }

  if (!tuple || !device || !_handle || !size || !_offset || !view_size) {
    return NULL;
  }

  PyTuple_SET_ITEM(tuple.get(), 0, device.release());
  PyTuple_SET_ITEM(tuple.get(), 1, _handle.release());
  PyTuple_SET_ITEM(tuple.get(), 2, size.release());
  PyTuple_SET_ITEM(tuple.get(), 3, _offset.release());
  PyTuple_SET_ITEM(tuple.get(), 4, view_size.release());
  return tuple.release();
  END_HANDLE_TH_ERRORS
}
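/*
 * A minimal receiving-side sketch (an assumption, not the actual PyTorch
 * counterpart of shareCuda): the bytes exported above are reinterpreted as a
 * cudaIpcMemHandle_t, opened with cudaIpcOpenMemHandle, and the stored offset
 * is added back to recover the original data pointer. `openSharedCudaPtr` is a
 * hypothetical helper name; only the CUDA IPC calls are real API.
 */
static void * openSharedCudaPtr(const char *handle_bytes, ptrdiff_t offset)
{
  cudaIpcMemHandle_t handle;
  memcpy(&handle, handle_bytes, CUDA_IPC_HANDLE_SIZE);  // handle travels as raw bytes
  void *base_ptr = NULL;
  // Maps the exporting process's base allocation into this process's address space
  THCudaCheck(cudaIpcOpenMemHandle(&base_ptr, handle, cudaIpcMemLazyEnablePeerAccess));
  return (char *)base_ptr + offset;  // re-apply the storage's offset into the base allocation
}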
at::Tensor DataChannelMPI::_newLikeFlat(std::vector<at::Tensor>& tensors) const {
  // TODO: check if all outputs are already contiguous in memory and skip this step if so
  if (tensors.size() == 0)
    throw std::runtime_error("received an empty list");
  auto & t = tensors[0];
  at::DeviceGuard gpu_guard(t.is_cuda() ? t.get_device() : -1);
  // sizes = [tensors.size()] + tensors[0].sizes()
  std::vector<int64_t> sizes { static_cast<int64_t>(tensors.size()) };
  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
  return t.type().tensor(sizes);
}
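/*
 * A minimal usage sketch (an assumption, not code from DataChannelMPI): the
 * flat [tensors.size(), ...] buffer returned by _newLikeFlat is typically
 * filled by copying each input into the matching slice, yielding one
 * contiguous buffer suitable for an MPI collective. `fillFlatTensor` is a
 * hypothetical helper built on the same-era ATen calls (select/copy_).
 */
static void fillFlatTensor(at::Tensor& flat, std::vector<at::Tensor>& tensors) {
  for (size_t i = 0; i < tensors.size(); ++i) {
    // copy the i-th input into row i of the flat buffer
    flat.select(0, static_cast<int64_t>(i)).copy_(tensors[i]);
  }
}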