示例#1
0
static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyTuple_GET_SIZE(args) == 2, "tuple of 2 items expected");
  PyObject *_tmp_fd = PyTuple_GET_ITEM(args, 0);
  PyObject *_size = PyTuple_GET_ITEM(args, 1);
  if (!THPUtils_checkLong(_tmp_fd) || !THPUtils_checkLong(_size)) {
    THPUtils_invalidArguments(args, NULL, "_new_shared in file descriptor mode",
        1, "a file descriptor (int) and storage size (int)");
    return NULL;
  }
  int fd;
  int tmp_fd = (int) THPUtils_unpackLong(_tmp_fd);
  int64_t size = THPUtils_unpackLong(_size);
  if ((fd = dup(tmp_fd)) == -1) {
    THPUtils_setError("could not duplicate a shared memory file descriptor");
    return NULL;
  }

  int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM |
              TH_ALLOCATOR_MAPPED_NOCREATE |
              TH_ALLOCATOR_MAPPED_KEEPFD |
              TH_ALLOCATOR_MAPPED_FROMFD;
  THMapAllocatorContext *ctx = THMapAllocatorContext_newWithFd(NULL, fd, flags);
  return THPStorage_(New)(THStorage_(newWithAllocator)(size, &THMapAllocator,
      (void*)ctx));
  END_HANDLE_TH_ERRORS
}
示例#2
0
static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyTuple_GET_SIZE(args) == 5, "tuple of 5 items expected");
  PyObject *_device = PyTuple_GET_ITEM(args, 0);
  PyObject *_handle = PyTuple_GET_ITEM(args, 1);
  PyObject *_size = PyTuple_GET_ITEM(args, 2);
  PyObject *_offset = PyTuple_GET_ITEM(args, 3);
  PyObject *_view_size = PyTuple_GET_ITEM(args, 4);
  if (!(THPUtils_checkLong(_device) && THPUtils_checkLong(_size)
      && (_handle == Py_None || PyBytes_Check(_handle))
      && THPUtils_checkLong(_offset) && THPUtils_checkLong(_view_size))) {
    THPUtils_invalidArguments(args, NULL, "_new_shared in CUDA mode", 1,
        "(int device, bytes handle, int storage_size, int offset, int view_size");
    return NULL;
  }

  size_t storage_size = (size_t)THPUtils_unpackLong(_size);
  ptrdiff_t offset = (ptrdiff_t)THPUtils_unpackLong(_offset);
  size_t view_size =  (size_t)THPUtils_unpackLong(_view_size);

  int64_t device = THPUtils_unpackLong(_device);
  AutoGPU __autogpu(device);

  char *buffer;
  Py_ssize_t handle_size;
  if (PyBytes_AsStringAndSize(_handle, &buffer, &handle_size) == -1) {
    return NULL;
  }
  THPUtils_assert(handle_size == CUDA_IPC_HANDLE_SIZE, "incorrect handle size");
  cudaIpcMemHandle_t handle = *(cudaIpcMemHandle_t*)buffer;

  void *devPtr = NULL;
  THCudaCheck(cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess));

  THStoragePtr base(THStorage_(newWithDataAndAllocator)(
      LIBRARY_STATE (real*)devPtr, storage_size, &THCIpcAllocator, (void*)device));
  base->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_FREEMEM;

  if (offset != 0 || view_size != storage_size) {
    return THPStorage_(newTHView)(base.get(), offset, view_size);
  }

  return THPStorage_(New)(base.release());
  END_HANDLE_TH_ERRORS
}
示例#3
0
static PyObject * THPModule_initNames(PyObject *self, PyObject *arg)
{
  static std::vector<std::string> names;

  THPObjectPtr types(PySequence_Fast(arg, "expected a sequence"));
  if (!types) return NULL;

  int num_classes = PySequence_Fast_GET_SIZE(types.get());
  names.reserve(names.size() + num_classes);
  for (int i = 0; i < num_classes; i++) {
    PyObject* obj = PySequence_Fast_GET_ITEM(types.get(), i);
    THPUtils_assert(PyType_Check(obj), "expected a PyTypeObject");
    PyTypeObject* type = (PyTypeObject*)obj;

    THPObjectPtr module_name(PyObject_GetAttrString(obj, "__module__"));
    if (!module_name) return NULL;
    THPUtils_assert(THPUtils_checkString(module_name.get()),
        "expected __module__ to be a string");
    std::string name = THPUtils_unpackString(module_name.get());
    names.push_back(name + "." + type->tp_name);
    type->tp_name = names.back().c_str();
  }
  Py_RETURN_NONE;
}
示例#4
0
PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) {
  HANDLE_TH_ERRORS
  PyObject *_inputs, *_outputs, *_streams;
  int root, op;

  if (!PyArg_ParseTuple(args, "OOOii", &_inputs, &_outputs, &_streams, &root, &op)) {
    THPUtils_invalidArguments(args, NULL, "nccl_reduce", 1,
			      "(sequence[Tensor] inputs, sequence[Tensor]"
			      " outputs, sequence[torch.cuda.Stream or None], int root, int op");
    return NULL;
  }

  std::vector<at::Tensor> inputs = THPUtils_PySequence_to_TensorList(_inputs);
  std::vector<at::Tensor> outputs = THPUtils_PySequence_to_TensorList(_outputs);
  std::vector<THCStream*> streams = THPUtils_PySequence_to_THCStreamList(_streams);

  THPUtils_assert(inputs.size() == streams.size(), "number of streams is not equal to number of inputs");

  // we can safely release GIL after this line, no python API used
  AutoNoGIL no_gil;
  _check_inputs(inputs, outputs, 1, 1);
  size_t len = inputs.size();

  ncclDataType_t data_type = _get_data_type(inputs[0].type().ID());

  int64_t count = inputs[0].numel();
  std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
  ncclComm_t *comm = _get_communicator(inputs);
  AutoGPU gpu_guard;
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupStart());
#endif
  for (size_t i = 0; i < len; i++) {
    int device = inputs[i].get_device();
    gpu_guard.setDevice(device);
    auto stream = (streams[i] == NULL) ? NULL : streams[i]->stream;
    CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(),
		     count, data_type, (ncclRedOp_t) op, root, comm[i], stream));
  }
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
  CHECK(ncclGroupEnd());
#endif

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
示例#5
0
PyObject * THPStorage_(newWithWeakPtr)(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPObjectPtr ref(PyObject_GetAttrString(arg, "cdata"));
  if (!ref) {
    return NULL;
  } else if (ref.get() == Py_None) {
    Py_RETURN_NONE;
  }
  THPUtils_assert(THPUtils_checkLong(ref.get()),
      "_new_with_weak_ptr(): arg.cdata must be an 'int'");
  THStorage *storage = (THStorage*)PyLong_AsVoidPtr(ref.get());
  if (THStorage_(retainIfLive)(LIBRARY_STATE storage)) {
    return THPStorage_(New)(storage);
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
示例#6
0
PyObject * THPStorage_(sharedFd)(THPStorage *self)
{
  HANDLE_TH_ERRORS
  THMapAllocatorContext *ctx = NULL;
#ifndef THC_GENERIC_FILE
  THStorage *storage = self->cdata;
  if (storage->allocator == &THMapAllocator) {
    ctx = (THMapAllocatorContext*)storage->allocatorContext;
  } else if (storage->allocator == &THStorageWeakRefAllocator) {
    auto allocator_obj = ((StorageWeakRefAllocator*)storage->allocatorContext);
    if (allocator_obj->allocator == &THMapAllocator) {
      ctx = (THMapAllocatorContext*)allocator_obj->allocatorContext;
    }
  }
#endif

  THPUtils_assert(ctx, "couldn't retrieve a shared file descriptor");
  return PyLong_FromLong(THMapAllocatorContext_fd(ctx));
  END_HANDLE_TH_ERRORS
}
示例#7
0
static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *args)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyTuple_GET_SIZE(args) == 3, "tuple of 3 items expected");
  PyObject *_manager_handle = PyTuple_GET_ITEM(args, 0);
  PyObject *_object_handle = PyTuple_GET_ITEM(args, 1);
  PyObject *_size = PyTuple_GET_ITEM(args, 2);
  if (!PyBytes_Check(_manager_handle) || !PyBytes_Check(_object_handle) || !THPUtils_checkLong(_size)) {
    THPUtils_invalidArguments(args, NULL, "_new_shared in file system mode", 1,
        "a handle (string/bytes) and storage size (int)");
    return NULL;
  }
  const char *manager_handle = PyBytes_AS_STRING(_manager_handle);
  const char *object_handle = PyBytes_AS_STRING(_object_handle);
  int64_t size = THPUtils_unpackLong(_size);
  int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM |
              TH_ALLOCATOR_MAPPED_NOCREATE;
  libshm_context *ctx = libshm_context_new(manager_handle, object_handle, flags);
  return THPStorage_(New)(THStorage_(newWithAllocator)(size,
      &THManagedSharedAllocator, (void*)ctx));
  END_HANDLE_TH_ERRORS
}
示例#8
0
static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
  HANDLE_TH_ERRORS
  Py_ssize_t num_args = args ? PyTuple_Size(args) : 0;

  THPStoragePtr self((THPStorage *)type->tp_alloc(type, 0));
  THPUtils_assert(self, "failed to allocate a " THPStorageStr " object");
  THAllocator* allocator = NULL;

  // Internally we allow constructing with a keywoard only argument cdata
  if (kwargs != NULL) {
    PyObject *allocator_ptr = PyDict_GetItemString(kwargs, "allocator");
    if (allocator_ptr) {
      THPUtils_assert(THPUtils_checkLong(allocator_ptr), "invalid allocator");
      allocator = (THAllocator*) PyLong_AsVoidPtr(allocator_ptr);
      PyDict_DelItemString(kwargs, "allocator");
    }

    Py_ssize_t num_kwargs = PyDict_Size(kwargs);
    if (num_args == 0) {
      PyObject *cdata_ptr = PyDict_GetItemString(kwargs, "cdata");
      if (num_kwargs == 1 && cdata_ptr && THPUtils_checkLong(cdata_ptr)) {
        THStorage *ptr = (THStorage*)PyLong_AsVoidPtr(cdata_ptr);
        self->cdata = ptr;
        return (PyObject*)self.release();
      }
    }
    THPUtils_assert(num_kwargs == 0, THPStorageStr "(): invalid keyword arguments");
  }

  // torch.Storage()
  if (num_args == 0) {
    if (allocator) {
      self->cdata = THPStorage_(newWithAllocator)(0, allocator);
    } else {
      self->cdata = THStorage_(new)(LIBRARY_STATE_NOARGS);
    }
    return (PyObject*)self.release();
  }

  PyObject *first_arg = PyTuple_GET_ITEM(args, 0);

  // torch.Storage(size)
  if (num_args == 1 && THPUtils_checkLong(first_arg)) {
    int64_t size = THPUtils_unpackLong(first_arg);
    if (allocator) {
      self->cdata = THPStorage_(newWithAllocator)(size, allocator);
    } else {
      self->cdata = THStorage_(newWithSize)(LIBRARY_STATE size);
    }
    return (PyObject*)self.release();
  }

  // torch.Storage(view_source, [offset, [size]])
  if (num_args < 4 && THPStorage_(Check)(first_arg)) {
#ifdef THD_GENERIC_FILE
    THPUtils_setError("distributed storages don't support storage views");
    return NULL;
#else
    THPStorage *storage_arg = (THPStorage *)first_arg;
    int64_t numel = storage_arg->cdata->size;
    int64_t offset = 0;

    if (num_args >= 2) {
      PyObject *second_arg = PyTuple_GET_ITEM(args, 1);
      if (!THPUtils_checkLong(second_arg))
        goto invalid_arguments;
      offset = THPUtils_unpackLong(second_arg);
    }

    int64_t size = numel - offset;
    if (num_args >= 3) {
      PyObject *third_arg = PyTuple_GET_ITEM(args, 2);
      if (!THPUtils_checkLong(third_arg))
        goto invalid_arguments;
      size = THPUtils_unpackLong(third_arg);
    }

    THPUtils_assert(offset >= 0 && offset <= numel, "specified an offset of "
        "%" PRId64 ", but the viewed storage has only %" PRId64 " element(s)", offset, numel);
    THPUtils_assert(size >= 1 && size <= numel - offset, "specified a size of "
        "%" PRId64 ", but the viewed storage has only %" PRId64 " element(s) after offset %" PRId64,
        size, numel - offset, offset);

    real *data_ptr = THStorage_(data)(LIBRARY_STATE storage_arg->cdata) + offset;
    THStoragePtr storage(THStorage_(newWithData)(LIBRARY_STATE data_ptr, size));
    storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW;
    storage->view = storage_arg->cdata;
    THStorage_(retain)(LIBRARY_STATE storage_arg->cdata);
    self->cdata = storage.release();
    return (PyObject*)self.release();
#endif
  }

  // torch.Storage(sequence)
  if (num_args == 1 && PySequence_Check(first_arg)) {
#ifdef THD_GENERIC_FILE
    THPUtils_setError("distributed storages don't support construction from a sequence");
#else
    Py_ssize_t length = PySequence_Length(first_arg);
    THPUtils_assert(length >= 0, "couldn't obtain the length of %s",
        THPUtils_typename(first_arg));
    self->cdata = THStorage_(newWithSize)(LIBRARY_STATE length);
    THPObjectPtr item;
    try {
      for (Py_ssize_t i = 0; i < length; i++) {
        item = PySequence_GetItem(first_arg, i);
        real value = THPUtils_(unpackReal)(item.get());
#if !defined(THC_GENERIC_FILE)
        self->cdata->unsafe_data<real>()[i] = value;
#else
        // TODO: this might be slow - consider batched updates?
        THCStorage_(set)(LIBRARY_STATE self->cdata, i, value);
#endif
      }
    } catch (std::runtime_error &e) {
      THPUtils_setError("tried to construct a storage from a sequence (%s), "
          "but one of the items was of type %s instead of %s",
          THPUtils_typename(first_arg),
          THPUtils_typename(item.get()),
          THPUtils_typeTraits<real>::python_type_str);
      return NULL;
    }
    return (PyObject*)self.release();
#endif
  }

#ifndef THD_GENERIC_FILE
invalid_arguments:
#endif
  THPUtils_invalidArguments(args, kwargs, THPStorageStr " constructor", 6,
          "no arguments",
          "(int size)",
          "(Sequence data)",
          "(" THPStorageStr " view_source)",
          "(" THPStorageStr " view_source, int offset)",
          "(" THPStorageStr " view_source, int offset, int size)");
  return NULL;
  END_HANDLE_TH_ERRORS
}
示例#9
0
THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage)
{
  real *data;
  int64_t size;
  ssize_t result = doRead(file, &size, sizeof(int64_t));
  if (result == 0)
    throw std::runtime_error("unexpected EOF. The file might be corrupted.");
  if (result != sizeof(int64_t))
    throw std::system_error(result, std::system_category());
  THStoragePtr storage;
  if (_storage == nullptr) {
    storage = THStorage_(newWithSize)(LIBRARY_STATE size);
  } else {
    THPUtils_assert(_storage->size == size,
        "storage has wrong size: expected %ld got %ld",
        size, _storage->size);
    storage = _storage;
  }

#ifndef THC_GENERIC_FILE
  data = storage->data;
#else
  std::unique_ptr<char[]> cpu_data(new char[size * sizeof(real)]);
  data = (real*)cpu_data.get();
#endif

  // fast track for bytes and little endian
  if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) {
    char *bytes = (char *) data;
    int64_t remaining = sizeof(real) * storage->size;
    while (remaining > 0) {
      // we write and read in 1GB blocks to avoid bugs on some OSes
      ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824));
      if (result == 0) // 0 means EOF, which is also an error
        throw std::runtime_error("unexpected EOF. The file might be corrupted.");
      if (result < 0)
        throw std::system_error(result, std::system_category());
      bytes += result;
      remaining -= result;
    }
    if (remaining != 0)
      throw std::system_error(result, std::system_category());
  } else {
    int64_t buffer_size = std::min(size, (int64_t)5000);
    std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * sizeof(real)]);


    for (int64_t i = 0; i < size; i += buffer_size) {
      size_t to_convert = std::min(size - i, buffer_size);
      SYSCHECK(doRead(file, le_buffer.get(), sizeof(real) * to_convert));

      if (sizeof(real) == 2) {
        THP_decodeInt16Buffer((int16_t*)data + i,
            le_buffer.get(),
            THPByteOrder::THP_LITTLE_ENDIAN,
            to_convert);
      } else if (sizeof(real) == 4) {
        THP_decodeInt32Buffer((int32_t*)data + i,
            le_buffer.get(),
            THPByteOrder::THP_LITTLE_ENDIAN,
            to_convert);
      } else if (sizeof(real) == 8) {
        THP_decodeInt64Buffer((int64_t*)data + i,
            le_buffer.get(),
            THPByteOrder::THP_LITTLE_ENDIAN,
            to_convert);
      }
    }
  }

#ifdef THC_GENERIC_FILE
  THCudaCheck(cudaMemcpy(storage->data, data, size * sizeof(real), cudaMemcpyHostToDevice));
#endif
  return storage.release();
}