Example #1
void DataChannelMPI::receive(at::Tensor& data, rank_type src_rank) {
  if (!data.is_contiguous())
    throw std::logic_error("tensor to receive is not contiguous");

  MPI_Recv(data.data_ptr(), data.numel(), mpi_datatype.at(data.type().scalarType()),
           src_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
Example #2
void DataChannelMPI::send(at::Tensor& data, rank_type dst_rank) {
  if (!data.is_contiguous())
    throw std::logic_error("tensor to send is not contiguous");

  MPI_Send(data.data_ptr(), data.numel(), mpi_datatype.at(data.type().scalarType()),
           dst_rank, 0, MPI_COMM_WORLD);
}
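The two methods above map a contiguous tensor directly onto MPI's buffer/count/datatype arguments via data_ptr(), numel(), and a scalar-type lookup table. For reference, a minimal standalone sketch of the same point-to-point pattern with plain MPI and a raw float buffer (not part of DataChannelMPI; the ranks and tag are illustrative):

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // The raw buffer plays the role of data.data_ptr()/data.numel() above.
  std::vector<float> buf(16, static_cast<float>(rank));
  if (rank == 0) {
    MPI_Send(buf.data(), static_cast<int>(buf.size()), MPI_FLOAT,
             /*dest=*/1, /*tag=*/0, MPI_COMM_WORLD);
  } else if (rank == 1) {
    MPI_Recv(buf.data(), static_cast<int>(buf.size()), MPI_FLOAT,
             /*source=*/0, /*tag=*/0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }

  MPI_Finalize();
  return 0;
}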
Example #3
at::Tensor sigmoid_add(at::Tensor x, at::Tensor y) {
  AT_CHECK(x.type().is_cuda(), "x must be a CUDA tensor");
  AT_CHECK(y.type().is_cuda(), "y must be a CUDA tensor");
  auto output = at::zeros_like(x);
  sigmoid_add_cuda(
      x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
  return output;
}
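Example #3 is the C++ entry point of a CUDA extension; sigmoid_add_cuda is the kernel launcher defined in a separate .cu file. A minimal sketch of how such a function is typically exposed to Python with the torch extension headers (the module docstring and build setup are assumptions, not part of the original example):

#include <torch/extension.h>

// Entry point from the example above; its definition lives in the extension's .cpp file.
at::Tensor sigmoid_add(at::Tensor x, at::Tensor y);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("sigmoid_add", &sigmoid_add, "sigmoid(x) + sigmoid(y) on CUDA tensors");
}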
Example #4
rank_type DataChannelMPI::receive(at::Tensor& data) {
  if (!data.is_contiguous())
    throw std::logic_error("tensor to receive is not contiguous");

  MPI_Status status;
  MPI_Recv(data.data_ptr(), data.numel(), mpi_datatype.at(data.type().scalarType()),
           MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
  return status.MPI_SOURCE;
}
Example #5
void DataChannelMPI::allReduce(at::Tensor& data, THDReduceOp operation,
                               THDGroup group_id) {
  const auto& comm = _groups.at(group_id).first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!data.is_contiguous())
    throw std::runtime_error("all_reduce input has to be contiguous");

  MPI_Allreduce(MPI_IN_PLACE, data.data_ptr(), data.numel(),
                mpi_datatype.at(data.type().scalarType()), mpi_op.at(operation), comm);
}
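The call above uses MPI's in-place mode: every rank passes MPI_IN_PLACE as the send buffer and the receive buffer is reduced in place. A standalone sketch of that pattern with a plain float buffer (sum over the world communicator; the values are illustrative):

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  std::vector<float> data(8, static_cast<float>(rank));
  // Every rank contributes `data` and receives the element-wise sum back in `data`.
  MPI_Allreduce(MPI_IN_PLACE, data.data(), static_cast<int>(data.size()),
                MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}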
Example #6
void DataChannelMPI::broadcast(at::Tensor& data, rank_type src_rank,
                               THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!data.is_contiguous())
    throw std::runtime_error("broadcast input has to be contiguous");

  rank_type group_src_rank = group_pair.second.mustGetGroupRank(src_rank);
  MPI_Bcast(data.data_ptr(), data.numel(), mpi_datatype.at(data.type().scalarType()),
            group_src_rank, comm);
}
Example #7
void DataChannelMPI::reduce(at::Tensor& data, THDReduceOp operation,
                            rank_type dst_rank, THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!data.is_contiguous())
    throw std::runtime_error("reduce input has to be contiguous");

  auto group_dst_rank = group_pair.second.mustGetGroupRank(dst_rank);
  // In-place reduce: on the destination rank MPI requires MPI_IN_PLACE as the
  // send buffer and the data pointer as the receive buffer; on all other ranks
  // the receive buffer is ignored, so nullptr is fine.
  void *sendbuf = (_rank == dst_rank) ? MPI_IN_PLACE : data.data_ptr();
  void *recvbuf = (_rank == dst_rank) ? data.data_ptr() : nullptr;
  MPI_Reduce(sendbuf, recvbuf, data.numel(), mpi_datatype.at(data.type().scalarType()),
             mpi_op.at(operation), group_dst_rank, comm);
}
Example #8
void DataChannelMPI::allGather(std::vector<at::Tensor>& output,
                               at::Tensor& input, THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (output.size() != group_pair.second.size())
    throw std::logic_error("allGather: number of output tensors and group size does not match");

  for (auto out_tensor : output)
    assertSameSizeAndType(out_tensor, input, "allGather");

  auto recv_buffer = _newLikeFlat(output);
  auto contig_input = input.contiguous();

  MPI_Allgather(
    contig_input.data_ptr(), contig_input.numel(), mpi_datatype.at(contig_input.type().scalarType()),
    recv_buffer.data_ptr(), contig_input.numel(), mpi_datatype.at(recv_buffer.type().scalarType()),
    comm
  );

  for (size_t i = 0; i < output.size(); ++i)
    output[i].copy_(recv_buffer[i]);
}
Example #9
std::tuple<Tensor, Tensor> fractional_max_pool2d_cpu(
  const at::Tensor& input,
  IntArrayRef pool_size,
  IntArrayRef output_size,
  const at::Tensor& randomSamples)
{
  Tensor output = at::empty({0}, input.options());
  Tensor indices = at::empty({0}, input.options().dtype(kLong));
  fractional_max_pool2d_out_cpu_template(
    input,
    output,
    output_size,
    pool_size,
    indices,
    randomSamples);
  return std::tuple<Tensor, Tensor>(output, indices);
}
Example #10
std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
    IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask)
{
  Tensor grad_output = grad_output_t.contiguous();

  Tensor grad_input, grad_weight, grad_bias;
  if (output_mask[0]) {
    grad_input = at::mkldnn_convolution_backward_input(
      input.sizes(), grad_output, weight, padding, stride, dilation, output_mask[2]);
  }
  if (output_mask[1] || output_mask[2]) {
    std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights(
      weight.sizes(), grad_output, input, padding, stride, dilation, output_mask[2]);
  }

  return std::tuple<Tensor, Tensor, Tensor>{grad_input, grad_weight, grad_bias};
}
Example #11
File: nms_cuda.cpp Project: g0josh/mtcnn
at::Tensor nms_cuda(const at::Tensor input,
                    float thresh)
{

    AT_CHECK(input.ndimension() == 3,
        "First argument should be a 3D Tensor, (batch_sz x n_boxes x 4)");
    // AT_CHECK(scores.ndimension() == 2,
        // "Second argument should be a 2D Tensor, (batch_sz x n_boxes)");
    // AT_CHECK(input.size(0) == scores.size(0),
        // "First and second arguments must have equal-sized first dimensions");
    // AT_CHECK(input.size(1) == scores.size(1),
        // "First and second arguments must have equal-sized second dimensions");
    AT_CHECK(input.size(2) == 4,
        "First argument dimension 2 must have size 4, and should be of the form [x, y, w, h]");
    AT_CHECK(input.is_contiguous(), "First argument must be a contiguous Tensor");
    // AT_CHECK(scores.is_contiguous(), "Second argument must be a contiguous Tensor");
    AT_CHECK(input.type().scalarType() == at::kFloat || input.type().scalarType() == at::kDouble,
        "First argument must be Float or Double Tensor");
    // AT_CHECK(scores.type().scalarType() == at::kFloat || scores.type().scalarType() == at::kDouble,
        // "Second argument must be Float or Double Tensor");
    AT_CHECK(input.is_contiguous(), "First argument must be a contiguous Tensor");
    // AT_CHECK(scores.is_contiguous(), "Second argument must be a contiguous Tensor");

    return non_max_suppression_cuda(input, thresh);

}
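A hedged usage sketch for the function above, assuming it is declared in a header or the same translation unit and that the libtorch headers are available; the box count and threshold are arbitrary:

#include <torch/torch.h>

at::Tensor nms_cuda(const at::Tensor input, float thresh);  // from the example above

void run_nms_example() {
  // Batch of 1 with 128 candidate boxes in [x, y, w, h] form on the GPU.
  auto boxes = torch::rand({1, 128, 4},
                           torch::device(torch::kCUDA).dtype(torch::kFloat));
  auto keep = nms_cuda(boxes.contiguous(), /*thresh=*/0.5f);
}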
Example #12
void DataChannelMPI::gather(std::vector<at::Tensor>& output,
                            at::Tensor& input, rank_type dst_rank,
                            THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  at::Tensor recv_buffer;
  void *recvbuf = nullptr;
  if (_rank != dst_rank) {
    if (output.size() > 0)
      throw std::logic_error("gather: number of input tensors should be 0 for non root");
  } else {
    if (output.size() != group_pair.second.size())
      throw std::logic_error("gather: number of output tensors and group size does not match");

    for (auto out_tensor : output)
      assertSameSizeAndType(out_tensor, input, "gather");

    recv_buffer = _newLikeFlat(output);
    recvbuf = recv_buffer.data_ptr();
  }

  rank_type group_dst_rank = group_pair.second.mustGetGroupRank(dst_rank);
  auto contig_input = input.contiguous();

  MPI_Gather(
    contig_input.data_ptr(), input.numel(), mpi_datatype.at(input.type().scalarType()),
    recvbuf, input.numel(), mpi_datatype.at(input.type().scalarType()),
    group_dst_rank, comm
  );

  // NOTE: this is a no-op in all processes except dst_rank
  for (size_t i = 0; i < output.size(); ++i)
    output[i].copy_(recv_buffer[i]);
}
Example #13
Tensor fractional_max_pool2d_backward_cpu(
  const at::Tensor& gradOutput_,
  const at::Tensor& input,
  IntArrayRef pool_size,
  IntArrayRef output_size,
  const at::Tensor& indices)
{
  Tensor gradInput = at::empty({0}, input.options());
  fractional_max_pool2d_backward_out_cpu_template(
    input,
    gradOutput_,
    gradInput,
    output_size,
    pool_size,
    indices);
  return gradInput;
}
Example #14
void DataChannelMPI::scatter(std::vector<at::Tensor>& input,
                             at::Tensor& output,
                             rank_type src_rank, THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (!output.is_contiguous())
    throw std::runtime_error("scatter output has to be a contiguous tensor");

  at::Tensor send_buffer;
  void *sendbuf = nullptr;
  if (_rank != src_rank) {
    if (input.size() > 0)
      throw std::logic_error("scatter: number of input tensors should be 0 for non root");
  } else {
    if (input.size() != group_pair.second.size())
      throw std::logic_error("scatter: number of input tensors and group size does not match");

    for (auto in_tensor : input)
      assertSameSizeAndType(in_tensor, output, "scatter");

    send_buffer = _newLikeFlat(input);
    for (size_t i = 0; i < input.size(); ++i)
      send_buffer[i].copy_(input[i]);
    sendbuf = send_buffer.data_ptr();
  }

  rank_type group_src_rank = group_pair.second.mustGetGroupRank(src_rank);

  MPI_Scatter(
    sendbuf, output.numel(), mpi_datatype.at(output.type().scalarType()),
    output.data_ptr(), output.numel(), mpi_datatype.at(output.type().scalarType()),
    group_src_rank, comm
  );
}
Example #15
at::Tensor mkldnn_convolution(
    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
    IntList padding, IntList stride, IntList dilation)
{
  auto output = input.type().tensor(conv_output_size(
    input.sizes(), weight.sizes(), padding, stride, dilation));

  auto cpu_engine = CpuEngine::Instance().get_engine();
  
  int32_t n = input.size(0);
  int32_t ic = input.size(1);
  int32_t ih = input.size(2);
  int32_t iw = input.size(3);

  int32_t oc = output.size(1);
  int32_t oh = output.size(2);
  int32_t ow = output.size(3);

  int32_t kh = weight.size(2);
  int32_t kw = weight.size(3);

  int32_t sh = stride[0];
  int32_t sw = stride[1];
  int32_t ph = padding[0];
  int32_t pw = padding[1];

  auto data_t = memory::data_type::f32;
  auto format_any = memory::format::any;
  auto format_nchw = memory::format::nchw;
  auto format_oihw = memory::format::oihw;
  auto format_x = memory::format::x;

  memory::dims input_tz = {n, ic, ih, iw};
  memory::dims weight_tz = {oc, ic, kh, kw};
  memory::dims bias_tz = {oc};
  memory::dims output_tz = {n, oc, oh, ow};
  memory::dims _stride = {sh, sw};
  memory::dims _padding = {ph, pw};

  auto input_md = memory::desc({input_tz}, data_t, format_any);
  auto weight_md = memory::desc({weight_tz}, data_t, format_any);
  auto bias_md = memory::desc({bias_tz}, data_t, format_any);
  auto output_md = memory::desc({output_tz}, data_t, format_any);

  std::shared_ptr<convolution_forward::desc> conv_forward_desc;
  if (bias.defined()) {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
      convolution_direct, input_md, weight_md, bias_md, output_md,
      _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
      convolution_direct, input_md, weight_md, output_md,
      _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
  conv_forward_pd.reset(new convolution_forward::primitive_desc(
    *conv_forward_desc, cpu_engine));

  auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
    input.data_ptr());
  auto weight_usr_memory = memory({{{weight_tz}, data_t,  format_oihw}, cpu_engine},
    weight.data_ptr());
  auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
    output.data_ptr());

  std::vector<primitive> net;

  auto input_pd = conv_forward_pd->src_primitive_desc();
  auto input_memory = input_usr_memory;
  if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
    input_memory = memory(input_pd);
    net.push_back(reorder(input_usr_memory, input_memory));
  }

  auto weight_pd = conv_forward_pd->weights_primitive_desc();
  auto weight_memory = weight_usr_memory;
  if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
    weight_memory = memory(weight_pd);
    net.push_back(reorder(weight_usr_memory, weight_memory));
  }

  auto output_pd = conv_forward_pd->dst_primitive_desc();
  auto output_memory = output_usr_memory;
  if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) {
    output_memory = memory(output_pd);
  }

  std::shared_ptr<convolution_forward> conv_forward;
  std::shared_ptr<memory> bias_usr_memory;
  if (bias.defined()) {
    bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
      bias.data_ptr()));
    conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
      weight_memory, *bias_usr_memory, output_memory));
  } else {
    conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
      weight_memory, output_memory));
  }
  net.push_back(*conv_forward);

  if (output_memory != output_usr_memory) {
    net.push_back(reorder(output_memory, output_usr_memory));
  }

  Stream::Instance().get_stream().submit(net);

  return output;
}
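The output shape above comes from conv_output_size. For reference, a minimal helper following the standard zero-padded, dilated convolution size formula; this is a sketch, not necessarily the exact body of conv_output_size, which also carries the batch and channel dimensions:

#include <cstdint>

// out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
inline int64_t conv_out_dim(int64_t in, int64_t kernel, int64_t pad,
                            int64_t stride, int64_t dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}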
Example #16
std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
    IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
    IntList padding, IntList stride, IntList dilation, bool bias_defined)
{
  auto grad_weight = grad_output.type().tensor(weight_size);

  Tensor grad_bias;
  if (bias_defined) {
    grad_bias = grad_output.type().tensor({grad_output.size(1)});
  }

  auto cpu_engine = CpuEngine::Instance().get_engine();

  int32_t n = input.size(0);
  int32_t ic = input.size(1);
  int32_t ih = input.size(2);
  int32_t iw = input.size(3);

  int32_t oc = grad_output.size(1);
  int32_t oh = grad_output.size(2);
  int32_t ow = grad_output.size(3);

  int32_t kh = grad_weight.size(2);
  int32_t kw = grad_weight.size(3);

  int32_t sh = stride[0];
  int32_t sw = stride[1];
  int32_t ph = padding[0];
  int32_t pw = padding[1];

  auto data_t = memory::data_type::f32;
  auto format_any = memory::format::any;
  auto format_nchw = memory::format::nchw;
  auto format_oihw = memory::format::oihw;
  auto format_x = memory::format::x;

  memory::dims input_tz = {n, ic, ih, iw};
  memory::dims weight_tz = {oc, ic, kh, kw};
  memory::dims bias_tz = {oc};
  memory::dims output_tz = {n, oc, oh, ow};
  memory::dims _stride = {sh, sw};
  memory::dims _padding = {ph, pw};

  memory::desc input_md({input_tz}, data_t, format_any);
  memory::desc weight_md({weight_tz}, data_t, format_any);
  memory::desc bias_md({bias_tz}, data_t, format_any);
  memory::desc output_md({output_tz}, data_t, format_any);

  // need to re-create conv_forward_pd to feed conv_backward_weight_pd
  std::shared_ptr<convolution_forward::desc> conv_forward_desc;
  if (bias_defined) {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
      convolution_direct, input_md, weight_md, bias_md, output_md,
      _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
      convolution_direct, input_md, weight_md, output_md,
      _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
  conv_forward_pd.reset(new convolution_forward::primitive_desc(
    *conv_forward_desc, cpu_engine));

  std::shared_ptr<convolution_backward_weights::desc> conv_backward_weight_desc;
  if (bias_defined) {
    conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
      convolution_direct, input_md, weight_md, bias_md, output_md,
      _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
      convolution_direct, input_md, weight_md, output_md,
      _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_backward_weights::primitive_desc> conv_backward_weight_pd;
  conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc(
    *conv_backward_weight_desc, cpu_engine, *conv_forward_pd));

  auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
    input.data_ptr());
  auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
    grad_output.data_ptr());
  auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
    grad_weight.data_ptr());
  std::shared_ptr<memory> grad_bias_memory;

  std::vector<primitive> net;

  auto input_pd = conv_backward_weight_pd->src_primitive_desc();
  auto input_memory = input_usr_memory;
  if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
    input_memory = memory(input_pd);
    net.push_back(reorder(input_usr_memory, input_memory));
  }

  auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc();
  auto grad_output_memory = grad_output_usr_memory;
  if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
    grad_output_memory = memory(grad_output_pd);
    net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
  }

  auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc();
  auto grad_weight_memory = grad_weight_usr_memory;
  if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) {
    grad_weight_memory = memory(grad_weight_pd);
  }

  std::shared_ptr<convolution_backward_weights> conv_backward_weight;
  if (bias_defined) {
    grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
      grad_bias.data_ptr()));
    conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
      input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory));
  } else {
    conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
      input_memory, grad_output_memory, grad_weight_memory));
  }

  net.push_back(*conv_backward_weight);

  if (grad_weight_memory != grad_weight_usr_memory) {
    net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory));
  }

  Stream::Instance().get_stream().submit(net);

  return std::tuple<at::Tensor, at::Tensor>{grad_weight, grad_bias};
}