// Collects the size of each tensor along dimension `dim` into a vector.
std::vector<int64_t> to_arg_sizes(TensorList tensors, int64_t dim) {
  std::vector<int64_t> arg_sizes(tensors.size());
  for (size_t i = 0; i < tensors.size(); ++i) {
    arg_sizes[i] = tensors[i].size(dim);
  }
  return arg_sizes;
}
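// Usage sketch (hypothetical caller, not from this file): gather per-tensor
// sizes along the concatenation dimension, e.g. to split a gradient back into
// per-input pieces. `grads` and `grad_output` are illustrative names.
//
//   std::vector<int64_t> split_sizes = to_arg_sizes(grads, /*dim=*/0);
//   auto grad_inputs = grad_output.split_with_sizes(split_sizes, /*dim=*/0);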
// Advanced indexing: the C++ counterpart of `self[indices...]`. Flattens the
// index tensors into a single linear index over a (possibly restrided) source
// tensor, then gathers the selected elements with take().
Tensor index(const Tensor & self, TensorList indices) {
  if (indices.size() > (size_t)self.dim()) {
    AT_ERROR("too many indices for tensor of dimension ", self.dim(),
             " (got ", indices.size(), ")");
  }
  Tensor src, linearIndex;
  std::tie(src, linearIndex) = makeLinearIndex(self, indices);
  return src.take(linearIndex);
}
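// Usage sketch: the equivalent of the Python expression `t[rows, cols]` with
// tensor indices. A minimal illustration only; it assumes `t`, `rows`, and
// `cols` are already-constructed Tensors and that this function is reachable
// as at::native::index, as in ATen's native namespace.
//
//   // selects t[rows[k], cols[k]] for each k:
//   Tensor picked = at::native::index(t, {rows, cols});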
// Validates the tensor lists passed to a collective op: equal-length input and
// output sequences, all CUDA dense tensors of a single type, contiguous, one
// input per device, each input/output pair on the same device, and element
// counts consistent with the given multipliers.
void _check_inputs(TensorList inputs, TensorList outputs, int input_multiplier, int output_multiplier) {
  // len(inputs) == len(outputs)
  size_t len = inputs.size();

  if (len == 0) {
    throw std::runtime_error("input sequence can't be empty");
  }

  if (len != outputs.size()) {
    std::stringstream err;
    err << "inputs and outputs sequences have to be of the same length, but got input of length "
        << len << " and output of length " << outputs.size();
    throw std::runtime_error(err.str());
  }

  device_set devices;
  int64_t numel = inputs[0].numel();
  auto& type = inputs[0].type();

  for (size_t i = 0; i < len; i++) {
    auto input = inputs[i];
    auto output = outputs[i];

    if (!(input.type().is_cuda() && !input.type().is_sparse()
          && output.type().is_cuda() && !output.type().is_sparse())) {
      throw std::runtime_error("input and output elements have to be cuda dense Tensors");
    }

    if (!(type == input.type() && type == output.type())) {
      throw std::runtime_error("all inputs and outputs must be of the same Tensor type");
    }

    if (!input.is_contiguous() || !output.is_contiguous()) {
      throw std::runtime_error("all inputs and outputs have to be contiguous");
    }

    auto input_device = input.get_device();
    // inputs must be on unique devices
    if (devices.test(input_device)) {
      throw std::runtime_error("inputs must be on unique devices");
    }
    devices.set(input_device);

    // inputs and outputs must be on same device respectively
    if (input_device != output.get_device()) {
      throw std::runtime_error("input and output must be on the same device");
    }

    // all inputs must be same size
    if (input.numel() != numel) {
      throw std::runtime_error("all inputs must have the same number of elements");
    }

    if (output.numel() * output_multiplier != numel * input_multiplier) {
      throw std::runtime_error("output must be of size input_size * size_multiplier");
    }
  }
}
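// The multiplier check generalizes the size constraint across collectives:
// it enforces output.numel() * output_multiplier == input.numel() *
// input_multiplier. Worked example (hypothetical numbers, not from this file):
// for a reduce_scatter-style op with input_multiplier = 1 and
// output_multiplier = world_size, an input of N elements requires outputs of
// N / world_size elements; broadcast above passes (1, 1), requiring equal
// sizes.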
// Broadcasts tensors[0] to every tensor in the list via NCCL (device 0 in the
// communicator clique acts as the root). Requires a build with NCCL support.
void broadcast(TensorList tensors, const stream_list& streams, const comm_list& user_comms) {
#ifdef WITH_NCCL
  using namespace torch::cuda::nccl::detail;
  _check_inputs(tensors, tensors, 1, 1);
  ncclDataType_t data_type = _get_data_type(tensors[0].type());
  int64_t numel = tensors[0].numel();

  // Hold the caching allocator's free mutex so blocks aren't returned to the
  // system while NCCL kernels are being enqueued.
  std::lock_guard<std::mutex> free_mutex(*(THCCachingAllocator_getCudaFreeMutex()));
  const auto comms = user_comms.empty() ? _get_communicators(tensors)
                                        : ArrayRef<ncclComm_t>(user_comms);
  AutoGPU gpu_guard;
  AutoNcclGroup nccl_group_guard;
  for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; i++) {
    gpu_guard.setDevice(tensors[i].get_device());
    // TODO: use current stream
    const auto stream = (streams.empty() || !streams[i]) ? NULL : streams[i]->stream;
    CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream));
  }
#else
  throw std::runtime_error("PyTorch built without NCCL support");
#endif
}
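// Usage sketch: replicate one tensor across devices. A hedged example, not a
// call site from this file; it assumes `tensors` holds one same-shaped CUDA
// tensor per device with the source values in tensors[0], and uses default
// streams and communicators (empty lists fall back to _get_communicators and
// the NULL stream above).
//
//   torch::cuda::nccl::broadcast(tensors, /*streams=*/{}, /*user_comms=*/{});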