void DataChannelMPI::receive(at::Tensor& data, rank_type src_rank) {
  if (!data.is_contiguous())
    throw std::logic_error("tensor to receive is not contiguous");

  MPI_Recv(
      data.data_ptr(),
      data.numel(),
      mpi_datatype.at(data.type().scalarType()),
      src_rank,
      0,
      MPI_COMM_WORLD,
      MPI_STATUS_IGNORE);
}
void DataChannelMPI::send(at::Tensor& data, rank_type dst_rank) {
  if (!data.is_contiguous())
    throw std::logic_error("tensor to send is not contiguous");

  MPI_Send(
      data.data_ptr(),
      data.numel(),
      mpi_datatype.at(data.type().scalarType()),
      dst_rank,
      0,
      MPI_COMM_WORLD);
}
at::Tensor sigmoid_add(at::Tensor x, at::Tensor y) {
  AT_CHECK(x.type().is_cuda(), "x must be a CUDA tensor");
  AT_CHECK(y.type().is_cuda(), "y must be a CUDA tensor");
  auto output = at::zeros_like(x);
  sigmoid_add_cuda(
      x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
  return output;
}
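// Hypothetical binding sketch (not part of the listing above): how a function
// like sigmoid_add is typically exposed to Python when built as a PyTorch C++
// extension via torch.utils.cpp_extension. The module docstring is a
// placeholder; this assumes <torch/extension.h> is available in the build.
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // Registers sigmoid_add under the extension module's namespace.
  m.def("sigmoid_add", &sigmoid_add, "sigmoid(x) + sigmoid(y) (CUDA)");
}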
rank_type DataChannelMPI::receive(at::Tensor& data) {
  if (!data.is_contiguous())
    throw std::logic_error("tensor to receive is not contiguous");

  MPI_Status status;
  MPI_Recv(
      data.data_ptr(),
      data.numel(),
      mpi_datatype.at(data.type().scalarType()),
      MPI_ANY_SOURCE,
      0,
      MPI_COMM_WORLD,
      &status);
  return status.MPI_SOURCE;
}
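// Illustrative, self-contained MPI sketch (not part of DataChannelMPI) of the
// pattern the receive() overload above wraps: MPI_ANY_SOURCE accepts a message
// from whichever rank sends first, and the sender's rank is recovered from the
// MPI_Status, exactly as status.MPI_SOURCE is returned above.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  std::vector<float> buf(4, 0.f);
  if (rank == 0) {
    // Root accepts from any sender, then inspects who it was.
    MPI_Status status;
    MPI_Recv(buf.data(), static_cast<int>(buf.size()), MPI_FLOAT,
             MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
    std::printf("received from rank %d\n", status.MPI_SOURCE);
  } else if (rank == 1) {
    MPI_Send(buf.data(), static_cast<int>(buf.size()), MPI_FLOAT,
             0, 0, MPI_COMM_WORLD);
  }

  MPI_Finalize();
  return 0;
}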
void DataChannelMPI::allReduce(
    at::Tensor& data,
    THDReduceOp operation,
    THDGroup group_id) {
  const auto& comm = _groups.at(group_id).first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!data.is_contiguous())
    throw std::runtime_error("all_reduce input has to be contiguous");

  MPI_Allreduce(
      MPI_IN_PLACE,
      data.data_ptr(),
      data.numel(),
      mpi_datatype.at(data.type().scalarType()),
      mpi_op.at(operation),
      comm);
}
void DataChannelMPI::broadcast(
    at::Tensor& data,
    rank_type src_rank,
    THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!data.is_contiguous())
    throw std::runtime_error("broadcast input has to be contiguous");

  rank_type group_src_rank = group_pair.second.mustGetGroupRank(src_rank);
  MPI_Bcast(
      data.data_ptr(),
      data.numel(),
      mpi_datatype.at(data.type().scalarType()),
      group_src_rank,
      comm);
}
void DataChannelMPI::reduce(
    at::Tensor& data,
    THDReduceOp operation,
    rank_type dst_rank,
    THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!data.is_contiguous())
    throw std::runtime_error("reduce input has to be contiguous");

  auto group_dst_rank = group_pair.second.mustGetGroupRank(dst_rank);
  void* sendbuf = (_rank == dst_rank) ? MPI_IN_PLACE : data.data_ptr();
  void* recvbuf = (_rank == dst_rank) ? data.data_ptr() : nullptr;
  MPI_Reduce(
      sendbuf,
      recvbuf,
      data.numel(),
      mpi_datatype.at(data.type().scalarType()),
      mpi_op.at(operation),
      group_dst_rank,
      comm);
}
void DataChannelMPI::allGather(
    std::vector<at::Tensor>& output,
    at::Tensor& input,
    THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (output.size() != group_pair.second.size())
    throw std::logic_error("allGather: number of output tensors and group size does not match");

  for (auto out_tensor : output)
    assertSameSizeAndType(out_tensor, input, "allGather");

  auto recv_buffer = _newLikeFlat(output);
  auto contig_input = input.contiguous();

  MPI_Allgather(
      contig_input.data_ptr(),
      contig_input.numel(),
      mpi_datatype.at(contig_input.type().scalarType()),
      recv_buffer.data_ptr(),
      contig_input.numel(),
      mpi_datatype.at(recv_buffer.type().scalarType()),
      comm);

  for (size_t i = 0; i < output.size(); ++i)
    output[i].copy_(recv_buffer[i]);
}
std::tuple<Tensor, Tensor> fractional_max_pool2d_cpu(
    const at::Tensor& input,
    IntArrayRef pool_size,
    IntArrayRef output_size,
    const at::Tensor& randomSamples) {
  Tensor output = at::empty({0}, input.options());
  Tensor indices = at::empty({0}, input.options().dtype(kLong));
  fractional_max_pool2d_out_cpu_template(
      input,
      output,
      output_size,
      pool_size,
      indices,
      randomSamples);
  return std::tuple<Tensor, Tensor>(output, indices);
}
std::tuple<at::Tensor, at::Tensor, at::Tensor> mkldnn_convolution_backward(
    const at::Tensor& input,
    const at::Tensor& grad_output_t,
    const at::Tensor& weight,
    IntList padding,
    IntList stride,
    IntList dilation,
    std::array<bool, 3> output_mask) {
  Tensor grad_output = grad_output_t.contiguous();

  Tensor grad_input, grad_weight, grad_bias;
  if (output_mask[0]) {
    grad_input = at::mkldnn_convolution_backward_input(
        input.sizes(), grad_output, weight, padding, stride, dilation, output_mask[2]);
  }
  if (output_mask[1] || output_mask[2]) {
    std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights(
        weight.sizes(), grad_output, input, padding, stride, dilation, output_mask[2]);
  }

  return std::tuple<Tensor, Tensor, Tensor>{grad_input, grad_weight, grad_bias};
}
at::Tensor nms_cuda(const at::Tensor input, float thresh) {
  AT_CHECK(input.ndimension() == 3,
      "First argument should be a 3D Tensor, (batch_sz x n_boxes x 4)");
  // AT_CHECK(scores.ndimension() == 2,
  //     "Second argument should be a 2D Tensor, (batch_sz x n_boxes)");
  // AT_CHECK(input.size(0) == scores.size(0),
  //     "First and second arguments must have equal-sized first dimensions");
  // AT_CHECK(input.size(1) == scores.size(1),
  //     "First and second arguments must have equal-sized second dimensions");
  AT_CHECK(input.size(2) == 4,
      "First argument dimension 2 must have size 4, and should be of the form [x, y, w, h]");
  AT_CHECK(input.is_contiguous(), "First argument must be a contiguous Tensor");
  // AT_CHECK(scores.is_contiguous(), "Second argument must be a contiguous Tensor");
  AT_CHECK(input.type().scalarType() == at::kFloat || input.type().scalarType() == at::kDouble,
      "First argument must be Float or Double Tensor");
  // AT_CHECK(scores.type().scalarType() == at::kFloat || scores.type().scalarType() == at::kDouble,
  //     "Second argument must be Float or Double Tensor");

  return non_max_suppression_cuda(input, thresh);
}
void DataChannelMPI::gather(
    std::vector<at::Tensor>& output,
    at::Tensor& input,
    rank_type dst_rank,
    THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  at::Tensor recv_buffer;
  void* recvbuf = nullptr;
  if (_rank != dst_rank) {
    if (output.size() > 0)
      throw std::logic_error("gather: number of input tensors should be 0 for non root");
  } else {
    if (output.size() != group_pair.second.size())
      throw std::logic_error("gather: number of output tensors and group size does not match");

    for (auto out_tensor : output)
      assertSameSizeAndType(out_tensor, input, "gather");

    recv_buffer = _newLikeFlat(output);
    recvbuf = recv_buffer.data_ptr();
  }

  rank_type group_dst_rank = group_pair.second.mustGetGroupRank(dst_rank);
  auto contig_input = input.contiguous();

  MPI_Gather(
      contig_input.data_ptr(),
      input.numel(),
      mpi_datatype.at(input.type().scalarType()),
      recvbuf,
      input.numel(),
      mpi_datatype.at(input.type().scalarType()),
      group_dst_rank,
      comm);

  // NOTE: this is a no-op in all processes except dst_rank
  for (size_t i = 0; i < output.size(); ++i)
    output[i].copy_(recv_buffer[i]);
}
Tensor fractional_max_pool2d_backward_cpu(
    const at::Tensor& gradOutput_,
    const at::Tensor& input,
    IntArrayRef pool_size,
    IntArrayRef output_size,
    const at::Tensor& indices) {
  Tensor gradInput = at::empty({0}, input.options());
  fractional_max_pool2d_backward_out_cpu_template(
      input,
      gradOutput_,
      gradInput,
      output_size,
      pool_size,
      indices);
  return gradInput;
}
void DataChannelMPI::scatter(
    std::vector<at::Tensor>& input,
    at::Tensor& output,
    rank_type src_rank,
    THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!output.is_contiguous())
    throw std::runtime_error("scatter output has to be a contiguous tensor");

  at::Tensor send_buffer;
  void* sendbuf = nullptr;
  if (_rank != src_rank) {
    if (input.size() > 0)
      throw std::logic_error("scatter: number of input tensors should be 0 for non root");
  } else {
    if (input.size() != group_pair.second.size())
      throw std::logic_error("scatter: number of input tensors and group size does not match");

    for (auto in_tensor : input)
      assertSameSizeAndType(in_tensor, output, "scatter");

    send_buffer = _newLikeFlat(input);
    for (size_t i = 0; i < input.size(); ++i)
      send_buffer[i].copy_(input[i]);
    sendbuf = send_buffer.data_ptr();
  }

  rank_type group_src_rank = group_pair.second.mustGetGroupRank(src_rank);

  MPI_Scatter(
      sendbuf,
      output.numel(),
      mpi_datatype.at(output.type().scalarType()),
      output.data_ptr(),
      output.numel(),
      mpi_datatype.at(output.type().scalarType()),
      group_src_rank,
      comm);
}
at::Tensor mkldnn_convolution(
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& bias,
    IntList padding,
    IntList stride,
    IntList dilation) {
  auto output = input.type().tensor(conv_output_size(
      input.sizes(), weight.sizes(), padding, stride, dilation));

  auto cpu_engine = CpuEngine::Instance().get_engine();

  int32_t n = input.size(0);
  int32_t ic = input.size(1);
  int32_t ih = input.size(2);
  int32_t iw = input.size(3);

  int32_t oc = output.size(1);
  int32_t oh = output.size(2);
  int32_t ow = output.size(3);

  int32_t kh = weight.size(2);
  int32_t kw = weight.size(3);

  int32_t sh = stride[0];
  int32_t sw = stride[1];
  int32_t ph = padding[0];
  int32_t pw = padding[1];

  auto data_t = memory::data_type::f32;
  auto format_any = memory::format::any;
  auto format_nchw = memory::format::nchw;
  auto format_oihw = memory::format::oihw;
  auto format_x = memory::format::x;

  memory::dims input_tz = {n, ic, ih, iw};
  memory::dims weight_tz = {oc, ic, kh, kw};
  memory::dims bias_tz = {oc};
  memory::dims output_tz = {n, oc, oh, ow};
  memory::dims _stride = {sh, sw};
  memory::dims _padding = {ph, pw};

  // format_any lets MKL-DNN choose its preferred memory format for the primitive.
  auto input_md = memory::desc({input_tz}, data_t, format_any);
  auto weight_md = memory::desc({weight_tz}, data_t, format_any);
  auto bias_md = memory::desc({bias_tz}, data_t, format_any);
  auto output_md = memory::desc({output_tz}, data_t, format_any);

  std::shared_ptr<convolution_forward::desc> conv_forward_desc;
  if (bias.defined()) {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, bias_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
  conv_forward_pd.reset(new convolution_forward::primitive_desc(
      *conv_forward_desc, cpu_engine));

  // User memories describe the tensors in their plain nchw/oihw layouts.
  auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
      input.data_ptr());
  auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
      weight.data_ptr());
  auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
      output.data_ptr());

  std::vector<primitive> net;

  // Reorder inputs into the primitive's preferred formats if they differ.
  auto input_pd = conv_forward_pd->src_primitive_desc();
  auto input_memory = input_usr_memory;
  if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
    input_memory = memory(input_pd);
    net.push_back(reorder(input_usr_memory, input_memory));
  }

  auto weight_pd = conv_forward_pd->weights_primitive_desc();
  auto weight_memory = weight_usr_memory;
  if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
    weight_memory = memory(weight_pd);
    net.push_back(reorder(weight_usr_memory, weight_memory));
  }

  auto output_pd = conv_forward_pd->dst_primitive_desc();
  auto output_memory = output_usr_memory;
  if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) {
    output_memory = memory(output_pd);
  }

  std::shared_ptr<convolution_forward> conv_forward;
  std::shared_ptr<memory> bias_usr_memory;
  if (bias.defined()) {
    bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
        bias.data_ptr()));
    conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
        weight_memory, *bias_usr_memory, output_memory));
  } else {
    conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
        weight_memory, output_memory));
  }
  net.push_back(*conv_forward);

  // Reorder the result back to nchw if the primitive wrote a blocked format.
  if (output_memory != output_usr_memory) {
    net.push_back(reorder(output_memory, output_usr_memory));
  }

  Stream::Instance().get_stream().submit(net);

  return output;
}
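// Hypothetical sketch of the shape computation that conv_output_size is
// expected to perform before the MKL-DNN descriptors above are built (the
// helper's exact signature is an assumption); it applies the standard
// convolution formula
//   out = (in + 2*pad - (dil*(k-1) + 1)) / stride + 1
// per spatial dimension, keeping the batch size and taking the output channel
// count from the weight tensor.
#include <cstdint>
#include <vector>

std::vector<int64_t> conv_output_size_sketch(
    const std::vector<int64_t>& input_size,   // {n, ic, ih, iw}
    const std::vector<int64_t>& weight_size,  // {oc, ic, kh, kw}
    const std::vector<int64_t>& padding,
    const std::vector<int64_t>& stride,
    const std::vector<int64_t>& dilation) {
  // Batch size is preserved; output channels come from the weight tensor.
  std::vector<int64_t> output_size{input_size[0], weight_size[0]};
  for (size_t d = 2; d < input_size.size(); ++d) {
    auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
    output_size.push_back(
        (input_size[d] + 2 * padding[d - 2] - kernel) / stride[d - 2] + 1);
  }
  return output_size;
}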
std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
    IntList weight_size,
    const at::Tensor& grad_output,
    const at::Tensor& input,
    IntList padding,
    IntList stride,
    IntList dilation,
    bool bias_defined) {
  auto grad_weight = grad_output.type().tensor(weight_size);

  Tensor grad_bias;
  if (bias_defined) {
    grad_bias = grad_output.type().tensor({grad_output.size(1)});
  }

  auto cpu_engine = CpuEngine::Instance().get_engine();

  int32_t n = input.size(0);
  int32_t ic = input.size(1);
  int32_t ih = input.size(2);
  int32_t iw = input.size(3);

  int32_t oc = grad_output.size(1);
  int32_t oh = grad_output.size(2);
  int32_t ow = grad_output.size(3);

  int32_t kh = grad_weight.size(2);
  int32_t kw = grad_weight.size(3);

  int32_t sh = stride[0];
  int32_t sw = stride[1];
  int32_t ph = padding[0];
  int32_t pw = padding[1];

  auto data_t = memory::data_type::f32;
  auto format_any = memory::format::any;
  auto format_nchw = memory::format::nchw;
  auto format_oihw = memory::format::oihw;
  auto format_x = memory::format::x;

  memory::dims input_tz = {n, ic, ih, iw};
  memory::dims weight_tz = {oc, ic, kh, kw};
  memory::dims bias_tz = {oc};
  memory::dims output_tz = {n, oc, oh, ow};
  memory::dims _stride = {sh, sw};
  memory::dims _padding = {ph, pw};

  memory::desc input_md({input_tz}, data_t, format_any);
  memory::desc weight_md({weight_tz}, data_t, format_any);
  memory::desc bias_md({bias_tz}, data_t, format_any);
  memory::desc output_md({output_tz}, data_t, format_any);

  // need to re-create conv_forward_pd to feed conv_backward_weight_pd
  std::shared_ptr<convolution_forward::desc> conv_forward_desc;
  if (bias_defined) {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, bias_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
  conv_forward_pd.reset(new convolution_forward::primitive_desc(
      *conv_forward_desc, cpu_engine));

  std::shared_ptr<convolution_backward_weights::desc> conv_backward_weight_desc;
  if (bias_defined) {
    conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
        convolution_direct, input_md, weight_md, bias_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
        convolution_direct, input_md, weight_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_backward_weights::primitive_desc> conv_backward_weight_pd;
  conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc(
      *conv_backward_weight_desc, cpu_engine, *conv_forward_pd));

  // User memories describe the tensors in their plain nchw/oihw layouts.
  auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
      input.data_ptr());
  auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
      grad_output.data_ptr());
  auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
      grad_weight.data_ptr());
  std::shared_ptr<memory> grad_bias_memory;

  std::vector<primitive> net;

  // Reorder inputs into the formats preferred by the backward-weights primitive.
  auto input_pd = conv_backward_weight_pd->src_primitive_desc();
  auto input_memory = input_usr_memory;
  if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
    input_memory = memory(input_pd);
    net.push_back(reorder(input_usr_memory, input_memory));
  }

  auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc();
  auto grad_output_memory = grad_output_usr_memory;
  if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
    grad_output_memory = memory(grad_output_pd);
    net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
  }

  auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc();
  auto grad_weight_memory = grad_weight_usr_memory;
  if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) {
    grad_weight_memory = memory(grad_weight_pd);
  }

  std::shared_ptr<convolution_backward_weights> conv_backward_weight;
  if (bias_defined) {
    grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
        grad_bias.data_ptr()));
    conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
        input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory));
  } else {
    conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
        input_memory, grad_output_memory, grad_weight_memory));
  }
  net.push_back(*conv_backward_weight);

  // Reorder the computed weight gradient back to oihw if needed.
  if (grad_weight_memory != grad_weight_usr_memory) {
    net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory));
  }

  Stream::Instance().get_stream().submit(net);

  return std::tuple<at::Tensor, at::Tensor>{grad_weight, grad_bias};
}