static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyTuple_GET_SIZE(args) == 2, "tuple of 2 items expected");
  PyObject *_tmp_fd = PyTuple_GET_ITEM(args, 0);
  PyObject *_size = PyTuple_GET_ITEM(args, 1);
  if (!THPUtils_checkLong(_tmp_fd) || !THPUtils_checkLong(_size)) {
    THPUtils_invalidArguments(args, NULL, "_new_shared in file descriptor mode",
        1, "a file descriptor (int) and storage size (int)");
    return NULL;
  }
  int fd;
  int tmp_fd = (int) THPUtils_unpackLong(_tmp_fd);
  int64_t size = THPUtils_unpackLong(_size);
  if ((fd = dup(tmp_fd)) == -1) {
    THPUtils_setError("could not duplicate a shared memory file descriptor");
    return NULL;
  }

  int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM |
              TH_ALLOCATOR_MAPPED_NOCREATE |
              TH_ALLOCATOR_MAPPED_KEEPFD |
              TH_ALLOCATOR_MAPPED_FROMFD;
  THMapAllocatorContext *ctx = THMapAllocatorContext_newWithFd(NULL, fd, flags);
  return THPStorage_(New)(THStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx));
  END_HANDLE_TH_ERRORS
}
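// The binding above only dup()s a received fd and hands it to THMapAllocator with
// MAPPED_FROMFD | MAPPED_KEEPFD. A minimal POSIX sketch of the same idea is below; it is
// illustrative only (map_shared_fd is not a PyTorch function) and assumes the sender has
// already transferred a shareable fd (e.g. via SCM_RIGHTS) of a known size.
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>

static void *map_shared_fd(int received_fd, size_t size) {
  // dup() so the lifetime of our mapping's fd is independent of the sender's copy
  int fd = dup(received_fd);
  if (fd == -1) { perror("dup"); return nullptr; }
  // MAP_SHARED: both processes end up looking at the same physical pages
  void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ptr == MAP_FAILED) { perror("mmap"); close(fd); return nullptr; }
  // keeping fd open (cf. TH_ALLOCATOR_MAPPED_KEEPFD) allows re-sharing it later
  return ptr;
}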
static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyTuple_GET_SIZE(args) == 5, "tuple of 5 items expected");
  PyObject *_device = PyTuple_GET_ITEM(args, 0);
  PyObject *_handle = PyTuple_GET_ITEM(args, 1);
  PyObject *_size = PyTuple_GET_ITEM(args, 2);
  PyObject *_offset = PyTuple_GET_ITEM(args, 3);
  PyObject *_view_size = PyTuple_GET_ITEM(args, 4);
  if (!(THPUtils_checkLong(_device) && THPUtils_checkLong(_size)
      && (_handle == Py_None || PyBytes_Check(_handle))
      && THPUtils_checkLong(_offset) && THPUtils_checkLong(_view_size))) {
    THPUtils_invalidArguments(args, NULL, "_new_shared in CUDA mode", 1,
        "(int device, bytes handle, int storage_size, int offset, int view_size)");
    return NULL;
  }

  size_t storage_size = (size_t)THPUtils_unpackLong(_size);
  ptrdiff_t offset = (ptrdiff_t)THPUtils_unpackLong(_offset);
  size_t view_size = (size_t)THPUtils_unpackLong(_view_size);

  int64_t device = THPUtils_unpackLong(_device);
  AutoGPU __autogpu(device);

  char *buffer;
  Py_ssize_t handle_size;
  if (PyBytes_AsStringAndSize(_handle, &buffer, &handle_size) == -1) {
    return NULL;
  }
  THPUtils_assert(handle_size == CUDA_IPC_HANDLE_SIZE, "incorrect handle size");
  cudaIpcMemHandle_t handle = *(cudaIpcMemHandle_t*)buffer;

  void *devPtr = NULL;
  THCudaCheck(cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess));

  THStoragePtr base(THStorage_(newWithDataAndAllocator)(
      LIBRARY_STATE (real*)devPtr, storage_size, &THCIpcAllocator, (void*)device));
  base->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_FREEMEM;

  if (offset != 0 || view_size != storage_size) {
    return THPStorage_(newTHView)(base.get(), offset, view_size);
  }

  return THPStorage_(New)(base.release());
  END_HANDLE_TH_ERRORS
}
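// newSharedCuda receives a serialized cudaIpcMemHandle_t and reopens it with
// cudaIpcOpenMemHandle. A minimal sketch of the two sides of that handshake follows;
// export_buffer/import_buffer are illustrative names, not PyTorch APIs, and error
// checking is trimmed for brevity.
#include <cuda_runtime.h>

// Exporter process: allocate device memory and produce a handle another process can open.
static cudaIpcMemHandle_t export_buffer(void **devPtr, size_t bytes) {
  cudaMalloc(devPtr, bytes);
  cudaIpcMemHandle_t handle;
  cudaIpcGetMemHandle(&handle, *devPtr);
  return handle;  // these CUDA_IPC_HANDLE_SIZE bytes are what gets sent over IPC
}

// Importer process: open the handle to obtain a device pointer into the exporter's allocation.
static void *import_buffer(const cudaIpcMemHandle_t &handle) {
  void *devPtr = nullptr;
  cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess);
  return devPtr;  // must eventually be released with cudaIpcCloseMemHandle(devPtr)
}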
static PyObject * THPModule_initNames(PyObject *self, PyObject *arg)
{
  static std::vector<std::string> names;

  THPObjectPtr types(PySequence_Fast(arg, "expected a sequence"));
  if (!types) return NULL;

  int num_classes = PySequence_Fast_GET_SIZE(types.get());
  names.reserve(names.size() + num_classes);
  for (int i = 0; i < num_classes; i++) {
    PyObject* obj = PySequence_Fast_GET_ITEM(types.get(), i);
    THPUtils_assert(PyType_Check(obj), "expected a PyTypeObject");
    PyTypeObject* type = (PyTypeObject*)obj;

    THPObjectPtr module_name(PyObject_GetAttrString(obj, "__module__"));
    if (!module_name) return NULL;
    THPUtils_assert(THPUtils_checkString(module_name.get()),
        "expected __module__ to be a string");
    std::string name = THPUtils_unpackString(module_name.get());
    names.push_back(name + "." + type->tp_name);
    type->tp_name = names.back().c_str();
  }
  Py_RETURN_NONE;
}
PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs, *_streams;
  int root, op;

  if (!PyArg_ParseTuple(args, "OOOii", &_inputs, &_outputs, &_streams, &root, &op)) {
    THPUtils_invalidArguments(args, NULL, "nccl_reduce", 1,
        "(sequence[Tensor] inputs, sequence[Tensor] outputs, "
        "sequence[torch.cuda.Stream or None], int root, int op)");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);
  std::vector<THCStream*> streams = THPUtils_PySequence_to_THCStreamList(_streams);
  THPUtils_assert(inputs.size() == streams.size(),
      "number of streams is not equal to number of inputs");

  // we can safely release GIL after this line, no python API used
  AutoNoGIL no_gil;

  _check_inputs(inputs, outputs, 1, 1);
  size_t len = inputs.size();

  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

  int64_t count = inputs[0].numel();
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    auto stream = (streams[i] == NULL) ? NULL : streams[i]->stream;
    CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(),
        count, data_type, (ncclRedOp_t) op, root, comm[i], stream));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
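// The reduction above issues one ncclReduce call per participating GPU, wrapped in
// ncclGroupStart()/ncclGroupEnd() on NCCL >= 2 so the per-device calls cannot deadlock.
// A standalone single-process sketch of that call pattern, with illustrative names and
// no error handling:
#include <nccl.h>
#include <cuda_runtime.h>
#include <vector>

static void reduce_to_root(const std::vector<float*> &send, const std::vector<float*> &recv,
                           size_t count, int root, int ngpus) {
  std::vector<ncclComm_t> comms(ngpus);
  std::vector<int> devs(ngpus);
  for (int i = 0; i < ngpus; i++) devs[i] = i;
  ncclCommInitAll(comms.data(), ngpus, devs.data());  // one communicator per device

  ncclGroupStart();
  for (int i = 0; i < ngpus; i++) {
    cudaSetDevice(i);
    // stream 0 (the default stream) stands in for the streams[i] == NULL case above
    ncclReduce(send[i], recv[i], count, ncclFloat, ncclSum, root, comms[i], 0);
  }
  ncclGroupEnd();

  for (int i = 0; i < ngpus; i++) ncclCommDestroy(comms[i]);
}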
PyObject * THPStorage_(newWithWeakPtr)(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPObjectPtr ref(PyObject_GetAttrString(arg, "cdata"));
  if (!ref) {
    return NULL;
  } else if (ref.get() == Py_None) {
    Py_RETURN_NONE;
  }
  THPUtils_assert(THPUtils_checkLong(ref.get()),
      "_new_with_weak_ptr(): arg.cdata must be an 'int'");
  THStorage *storage = (THStorage*)PyLong_AsVoidPtr(ref.get());
  if (THStorage_(retainIfLive)(LIBRARY_STATE storage)) {
    return THPStorage_(New)(storage);
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
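// retainIfLive is the revival half of a weak-reference scheme: the refcount is bumped
// only if it has not already reached zero. A sketch of that pattern with std::atomic;
// retain_if_live is an illustrative name, not the TH implementation.
#include <atomic>

static bool retain_if_live(std::atomic<int> &refcount) {
  int current = refcount.load();
  while (current > 0) {
    // CAS loop: succeeds only if no other thread dropped the count to zero in the meantime
    if (refcount.compare_exchange_weak(current, current + 1))
      return true;
  }
  return false;  // already dead; the caller reports the weak reference as expired
}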
PyObject * THPStorage_(sharedFd)(THPStorage *self)
{
  HANDLE_TH_ERRORS
  THMapAllocatorContext *ctx = NULL;
#ifndef THC_GENERIC_FILE
  THStorage *storage = self->cdata;
  if (storage->allocator == &THMapAllocator) {
    ctx = (THMapAllocatorContext*)storage->allocatorContext;
  } else if (storage->allocator == &THStorageWeakRefAllocator) {
    auto allocator_obj = ((StorageWeakRefAllocator*)storage->allocatorContext);
    if (allocator_obj->allocator == &THMapAllocator) {
      ctx = (THMapAllocatorContext*)allocator_obj->allocatorContext;
    }
  }
#endif

  THPUtils_assert(ctx, "couldn't retrieve a shared file descriptor");
  return PyLong_FromLong(THMapAllocatorContext_fd(ctx));
  END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *args)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyTuple_GET_SIZE(args) == 3, "tuple of 3 items expected");
  PyObject *_manager_handle = PyTuple_GET_ITEM(args, 0);
  PyObject *_object_handle = PyTuple_GET_ITEM(args, 1);
  PyObject *_size = PyTuple_GET_ITEM(args, 2);
  if (!PyBytes_Check(_manager_handle) || !PyBytes_Check(_object_handle) || !THPUtils_checkLong(_size)) {
    THPUtils_invalidArguments(args, NULL, "_new_shared in file system mode", 1,
        "a manager handle (bytes), an object handle (bytes) and a storage size (int)");
    return NULL;
  }
  const char *manager_handle = PyBytes_AS_STRING(_manager_handle);
  const char *object_handle = PyBytes_AS_STRING(_object_handle);
  int64_t size = THPUtils_unpackLong(_size);
  int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM |
              TH_ALLOCATOR_MAPPED_NOCREATE;

  libshm_context *ctx = libshm_context_new(manager_handle, object_handle, flags);
  return THPStorage_(New)(THStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx));
  END_HANDLE_TH_ERRORS
}
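// The "file system" mode hands a manager handle and object name to libshm, which attaches
// to a named shared-memory object that must already exist (TH_ALLOCATOR_MAPPED_NOCREATE).
// A minimal POSIX sketch of attaching to such an object by name; attach_named_shm is
// illustrative only and stands in for the libshm/THManagedSharedAllocator machinery.
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static void *attach_named_shm(const char *name, size_t size) {
  // O_RDWR without O_CREAT mirrors MAPPED_NOCREATE: fail if the object is missing
  int fd = shm_open(name, O_RDWR, 0600);
  if (fd == -1) return nullptr;
  void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);  // the mapping remains valid after the descriptor is closed
  return ptr == MAP_FAILED ? nullptr : ptr;
}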
static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
  HANDLE_TH_ERRORS
  Py_ssize_t num_args = args ? PyTuple_Size(args) : 0;

  THPStoragePtr self((THPStorage *)type->tp_alloc(type, 0));
  THPUtils_assert(self, "failed to allocate a " THPStorageStr " object");
  THAllocator* allocator = NULL;

  // Internally we allow constructing with a keyword only argument cdata
  if (kwargs != NULL) {
    PyObject *allocator_ptr = PyDict_GetItemString(kwargs, "allocator");
    if (allocator_ptr) {
      THPUtils_assert(THPUtils_checkLong(allocator_ptr), "invalid allocator");
      allocator = (THAllocator*) PyLong_AsVoidPtr(allocator_ptr);
      PyDict_DelItemString(kwargs, "allocator");
    }

    Py_ssize_t num_kwargs = PyDict_Size(kwargs);
    if (num_args == 0) {
      PyObject *cdata_ptr = PyDict_GetItemString(kwargs, "cdata");
      if (num_kwargs == 1 && cdata_ptr && THPUtils_checkLong(cdata_ptr)) {
        THStorage *ptr = (THStorage*)PyLong_AsVoidPtr(cdata_ptr);
        self->cdata = ptr;
        return (PyObject*)self.release();
      }
    }
    THPUtils_assert(num_kwargs == 0, THPStorageStr "(): invalid keyword arguments");
  }

  // torch.Storage()
  if (num_args == 0) {
    if (allocator) {
      self->cdata = THPStorage_(newWithAllocator)(0, allocator);
    } else {
      self->cdata = THStorage_(new)(LIBRARY_STATE_NOARGS);
    }
    return (PyObject*)self.release();
  }

  PyObject *first_arg = PyTuple_GET_ITEM(args, 0);

  // torch.Storage(size)
  if (num_args == 1 && THPUtils_checkLong(first_arg)) {
    int64_t size = THPUtils_unpackLong(first_arg);
    if (allocator) {
      self->cdata = THPStorage_(newWithAllocator)(size, allocator);
    } else {
      self->cdata = THStorage_(newWithSize)(LIBRARY_STATE size);
    }
    return (PyObject*)self.release();
  }

  // torch.Storage(view_source, [offset, [size]])
  if (num_args < 4 && THPStorage_(Check)(first_arg)) {
#ifdef THD_GENERIC_FILE
    THPUtils_setError("distributed storages don't support storage views");
    return NULL;
#else
    THPStorage *storage_arg = (THPStorage *)first_arg;
    int64_t numel = storage_arg->cdata->size;
    int64_t offset = 0;

    if (num_args >= 2) {
      PyObject *second_arg = PyTuple_GET_ITEM(args, 1);
      if (!THPUtils_checkLong(second_arg))
        goto invalid_arguments;
      offset = THPUtils_unpackLong(second_arg);
    }

    int64_t size = numel - offset;
    if (num_args >= 3) {
      PyObject *third_arg = PyTuple_GET_ITEM(args, 2);
      if (!THPUtils_checkLong(third_arg))
        goto invalid_arguments;
      size = THPUtils_unpackLong(third_arg);
    }

    THPUtils_assert(offset >= 0 && offset <= numel, "specified an offset of "
        "%" PRId64 ", but the viewed storage has only %" PRId64 " element(s)",
        offset, numel);
    THPUtils_assert(size >= 1 && size <= numel - offset, "specified a size of "
        "%" PRId64 ", but the viewed storage has only %" PRId64 " element(s) after offset %" PRId64,
        size, numel - offset, offset);

    real *data_ptr = THStorage_(data)(LIBRARY_STATE storage_arg->cdata) + offset;
    THStoragePtr storage(THStorage_(newWithData)(LIBRARY_STATE data_ptr, size));
    storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW;
    storage->view = storage_arg->cdata;
    THStorage_(retain)(LIBRARY_STATE storage_arg->cdata);
    self->cdata = storage.release();
    return (PyObject*)self.release();
#endif
  }

  // torch.Storage(sequence)
  if (num_args == 1 && PySequence_Check(first_arg)) {
#ifdef THD_GENERIC_FILE
    THPUtils_setError("distributed storages don't support construction from a sequence");
#else
    Py_ssize_t length = PySequence_Length(first_arg);
    THPUtils_assert(length >= 0, "couldn't obtain the length of %s",
        THPUtils_typename(first_arg));
    self->cdata = THStorage_(newWithSize)(LIBRARY_STATE length);
    THPObjectPtr item;
    try {
      for (Py_ssize_t i = 0; i < length; i++) {
        item = PySequence_GetItem(first_arg, i);
        real value = THPUtils_(unpackReal)(item.get());
#if !defined(THC_GENERIC_FILE)
        self->cdata->unsafe_data<real>()[i] = value;
#else
        // TODO: this might be slow - consider batched updates?
        THCStorage_(set)(LIBRARY_STATE self->cdata, i, value);
#endif
      }
    } catch (std::runtime_error &e) {
      THPUtils_setError("tried to construct a storage from a sequence (%s), "
          "but one of the items was of type %s instead of %s",
          THPUtils_typename(first_arg),
          THPUtils_typename(item.get()),
          THPUtils_typeTraits<real>::python_type_str);
      return NULL;
    }
    return (PyObject*)self.release();
#endif
  }

#ifndef THD_GENERIC_FILE
invalid_arguments:
#endif
  THPUtils_invalidArguments(args, kwargs, THPStorageStr " constructor", 6,
      "no arguments",
      "(int size)",
      "(Sequence data)",
      "(" THPStorageStr " view_source)",
      "(" THPStorageStr " view_source, int offset)",
      "(" THPStorageStr " view_source, int offset, int size)");
  return NULL;
  END_HANDLE_TH_ERRORS
}
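// The view branch above hands out a storage whose data pointer lies inside the parent's
// buffer and retains the parent so it cannot be freed underneath the view. In modern C++
// the aliasing shared_ptr constructor expresses the same ownership shape; make_view is an
// illustrative helper, not part of this codebase.
#include <memory>
#include <cstddef>

static std::shared_ptr<float> make_view(std::shared_ptr<float> parent, std::ptrdiff_t offset) {
  // shares ownership with `parent` but points at parent.get() + offset,
  // analogous to TH_STORAGE_VIEW plus THStorage_retain keeping the parent alive
  return std::shared_ptr<float>(parent, parent.get() + offset);
}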
THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage)
{
  real *data;
  int64_t size;
  ssize_t result = doRead(file, &size, sizeof(int64_t));
  if (result == 0)
    throw std::runtime_error("unexpected EOF. The file might be corrupted.");
  if (result != sizeof(int64_t))
    throw std::system_error(result, std::system_category());
  THStoragePtr storage;
  if (_storage == nullptr) {
    storage = THStorage_(newWithSize)(LIBRARY_STATE size);
  } else {
    THPUtils_assert(_storage->size == size,
        "storage has wrong size: expected %ld got %ld",
        size, _storage->size);
    storage = _storage;
  }

#ifndef THC_GENERIC_FILE
  data = storage->data;
#else
  std::unique_ptr<char[]> cpu_data(new char[size * sizeof(real)]);
  data = (real*)cpu_data.get();
#endif

  // fast track for bytes and little endian
  if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
    char *bytes = (char *) data;
    int64_t remaining = sizeof(real) * storage->size;
    while (remaining > 0) {
      // we write and read in 1GB blocks to avoid bugs on some OSes
      ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824));
      if (result == 0) // 0 means EOF, which is also an error
        throw std::runtime_error("unexpected EOF. The file might be corrupted.");
      if (result < 0)
        throw std::system_error(result, std::system_category());
      bytes += result;
      remaining -= result;
    }
    if (remaining != 0)
      throw std::system_error(result, std::system_category());
  } else {
    int64_t buffer_size = std::min(size, (int64_t)5000);
    std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(real)]);
    for (int64_t i = 0; i < size; i += buffer_size) {
      size_t to_convert = std::min(size - i, buffer_size);
      SYSCHECK(doRead(file, le_buffer.get(), sizeof(real) * to_convert));
      if (sizeof(real) == 2) {
        THP_decodeInt16Buffer((int16_t*)data + i, le_buffer.get(),
            THPByteOrder::THP_LITTLE_ENDIAN, to_convert);
      } else if (sizeof(real) == 4) {
        THP_decodeInt32Buffer((int32_t*)data + i, le_buffer.get(),
            THPByteOrder::THP_LITTLE_ENDIAN, to_convert);
      } else if (sizeof(real) == 8) {
        THP_decodeInt64Buffer((int64_t*)data + i, le_buffer.get(),
            THPByteOrder::THP_LITTLE_ENDIAN, to_convert);
      }
    }
  }

#ifdef THC_GENERIC_FILE
  THCudaCheck(cudaMemcpy(storage->data, data, size * sizeof(real), cudaMemcpyHostToDevice));
#endif
  return storage.release();
}
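// On big-endian hosts the slow path above decodes the file's little-endian bytes through
// the THP_decode*Buffer helpers. A self-contained sketch of the 4-byte case follows;
// decode_le_int32 is illustrative, not the actual helper.
#include <cstdint>
#include <cstddef>

static void decode_le_int32(int32_t *dst, const uint8_t *src, size_t n) {
  for (size_t i = 0; i < n; i++) {
    const uint8_t *p = src + 4 * i;
    // assemble bytes least-significant first, independent of the host byte order
    dst[i] = (int32_t)((uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24));
  }
}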