void SpatialBatchNormalization::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());

  RASSERT(in->dim() >= 3);
  RASSERT(in->size()[2] == nfeats_);

  if (output != nullptr && in->dim() != out->dim()) {
    output = nullptr;
  }

  // Check that the input and output sizes are the same.
  if (output != nullptr) {
    if (in->size()[0] != out->size()[0] ||
        in->size()[1] != out->size()[1] ||
        in->size()[2] != out->size()[2]) {
      output = nullptr;
    }
  }

  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
  }
}
Example 2
void SpatialDropout::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);

  Tensor<float>::copy(*TO_TENSOR_PTR(output.get()),
                      *TO_TENSOR_PTR(input.get()));
  Tensor<float>::mul(*TO_TENSOR_PTR(output.get()), 1 - p_);
}
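A minimal CPU sketch of what this forward pass computes, assuming p_ is the dropout probability (at inference the activations are simply scaled by 1 - p):

#include <cstddef>

// Copy the input and scale every element by (1 - p), mirroring the
// Tensor copy + mul calls above.
void spatial_dropout_forward_cpu(const float* in, float* out,
                                 std::size_t nelems, float p) {
  const float scale = 1.0f - p;
  for (std::size_t i = 0; i < nelems; ++i) {
    out[i] = in[i] * scale;
  }
}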
void SpatialLPPooling::forwardPropThread(const uint32_t outf) {
  const uint32_t out_w = TO_TENSOR_PTR(output.get())->size()[0];
  const uint32_t out_h = TO_TENSOR_PTR(output.get())->size()[1];
  const uint32_t in_w = cur_in_w;
  const uint32_t in_h = cur_in_h;
  const float one_over_p_norm = 1.0f / p_norm_;

  float* out = &output_cpu_[outf * out_w * out_h];
  float* in = &input_cpu_[outf * in_w * in_h];

  for (uint32_t outv = 0; outv < out_h; outv++) {
    for (uint32_t outu = 0; outu < out_w; outu++) {
      uint32_t out_index = outv * out_w + outu;
      out[out_index] = 0.0f;
      // Perform Lp pooling: accumulate |x|^p over the pooling window:
      for (uint32_t inv = outv * poolsize_v_; inv < (outv + 1) * poolsize_v_;
           inv++) {
        for (uint32_t inu = outu * poolsize_u_; inu < (outu + 1) * poolsize_u_;
             inu++) {
          float val = fabsf(in[inv * in_w + inu]);
          out[out_index] += powf(val, p_norm_);
        }
      }
      out[outv * out_w + outu] =
          powf(out[outv * out_w + outu], one_over_p_norm);
    }
  }
  std::unique_lock<std::mutex> ul(thread_update_lock_);
  threads_finished_++;
  not_finished_.notify_all();  // Wake the waiter so it can check if all threads are done
  ul.unlock();
}
Example 4
void Threshold::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  cl_context->useKernelCStr(kThresholdKernel, "Threshold1D");
  cl_context->setArg(0, TO_TENSOR_PTR(input.get())->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
  cl_context->setArg(2, threshold);
  cl_context->setArg(3, val);
  uint32_t dim = 1;
  uint32_t nelem = TO_TENSOR_PTR(output.get())->nelems();
  cl_context->runKernel(jtorch::deviceid, dim, &nelem, false);
}
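The Threshold1D kernel source is not shown here; a CPU sketch of the element-wise operation it is assumed to implement (torch's nn.Threshold semantics):

#include <cstddef>

// Keep x where x > threshold, otherwise output the replacement value.
void threshold_forward_cpu(const float* in, float* out, std::size_t nelems,
                           float threshold, float val) {
  for (std::size_t i = 0; i < nelems; ++i) {
    out[i] = (in[i] > threshold) ? in[i] : val;
  }
}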
Example 5
void Threshold::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  if (output != nullptr) {
    if (!in->isSameSizeAs(*TO_TENSOR_PTR(output.get()))) {
      // Input dimension has changed!
      output = nullptr;
    }
  }
  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
  }
}
Example 6
void SpatialDropout::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  if (output != nullptr) {
    if (!TO_TENSOR_PTR(output.get())->isSameSizeAs(*in)) {
      output = nullptr;
    }
  }
  if (output == nullptr) {
    output.reset(Tensor<float>::clone(*in));
  }
}
Example 7
  void JoinTable::forwardProp(TorchData& input) {
    init(input);

    Table& in = (Table&)input;

    // AT THE MOMENT ONLY JOINS ALONG THE TOP DIMENSION ARE SUPPORTED
    if (dimension_ != 0) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Only dimension=0 is supported for now");
    }

    // Copy each table element's raw data into the output
    std::string kernel = jtorch::jtorch_path + "kernels/join_table.cl";
    cl_context->useKernel(kernel.c_str(), "JoinTable1D");
    int out_offset = 0;
    for (uint32_t i = 0; i < in.tableSize(); i++) {
      Tensor<float>* cur_input = (Tensor<float>*)in(i);
      cl_context->setArg(0, cur_input->storage());
      cl_context->setArg(1, TO_TENSOR_PTR(output)->storage());
      cl_context->setArg(2, out_offset);
      uint32_t dim = 1;
      uint32_t nelem = cur_input->nelems();
      cl_context->runKernel(jtorch::deviceid, dim, &nelem, false);

      out_offset += nelem;
    }
  }
void SpatialLPPooling::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  in->getData(input_cpu_.get());
  cur_in_w = in->size()[0];
  cur_in_h = in->size()[1];
  threads_finished_ = 0;
  for (uint32_t i = 0; i < thread_cbs_.size(); i++) {
    tp_->addTask(thread_cbs_[i].get());
  }

  // Wait for all threads to finish
  std::unique_lock<std::mutex> ul(thread_update_lock_);  // Get lock
  while (threads_finished_ != static_cast<int32_t>(thread_cbs_.size())) {
    not_finished_.wait(ul);
  }
  ul.unlock();  // Release lock
  TO_TENSOR_PTR(output.get())->setData(output_cpu_.get());
}
Example 9
void SpatialMaxPooling::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 2 || in->dim() == 3);

  // We'll essentially do ceil_mode = false from torch
  const uint32_t iwidth = in->size()[0];
  const uint32_t iheight = in->size()[1];
  uint32_t oheight = (long)(floor((float)(iheight - kh_ + 2*padh_) / dh_)) + 1;
  uint32_t owidth  = (long)(floor((float)(iwidth  - kw_ + 2*padw_) / dw_)) + 1;

  if (output != nullptr && TO_TENSOR_PTR(output.get())->dim() != in->dim()) {
    // Input dimension has changed!
    output = nullptr;
  }

  if (output != nullptr) {
    // Check that the dimensions above the lowest 2 match
    for (uint32_t i = 2; i < in->dim() && output != nullptr; i++) {
      if (TO_TENSOR_PTR(output.get())->size()[i] != in->size()[i]) {
        output = nullptr;
      }
    }
  }

  if (output != nullptr) {
    // Check that the lowest 2 dimensions are the correct size
    if (TO_TENSOR_PTR(output.get())->size()[0] != owidth ||
        TO_TENSOR_PTR(output.get())->size()[1] != oheight) {
      output = nullptr;
    }
  }

  if (output == nullptr) {
    std::unique_ptr<uint32_t[]> out_size(new uint32_t[in->dim()]);
    out_size[0] = owidth;
    out_size[1] = oheight;
    for (uint32_t i = 2; i < in->dim(); i++) {
      out_size[i] = in->size()[i];
    }
    output.reset(new Tensor<float>(in->dim(), out_size.get()));
  }
}
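The output extents above follow torch's pooling rule with ceil_mode = false; a small helper restating the same arithmetic, with a worked example:

#include <cstdint>

// o = floor((i - k + 2 * pad) / stride) + 1.
// Example: i = 64, k = 3, pad = 1, stride = 2 -> floor(63 / 2) + 1 = 32.
inline uint32_t pooled_extent(uint32_t i, uint32_t k, uint32_t pad,
                              uint32_t stride) {
  return (i - k + 2 * pad) / stride + 1;  // unsigned division floors here
}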
void SpatialLPPooling::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 2 || in->dim() == 3);

  if (output != nullptr && TO_TENSOR_PTR(output.get())->dim() != in->dim()) {
    // Input dimension has changed!
    cleanup();
  }

  if (output != nullptr) {
    // Check that the dimensions above the lowest 2 match
    for (uint32_t i = 2; i < in->dim() && output != nullptr; i++) {
      if (TO_TENSOR_PTR(output.get())->size()[i] != in->size()[i]) {
        cleanup();
      }
    }
  }

  if (output != nullptr) {
    // Check that the lowest 2 dimensions are the correct size
    if (TO_TENSOR_PTR(output.get())->size()[0] != in->size()[0] / poolsize_u_ ||
        TO_TENSOR_PTR(output.get())->size()[1] != in->size()[1] / poolsize_v_) {
      cleanup();
    }
  }

  if (output == nullptr) {
    // Check that the width and height are multiples of the poolsize
    RASSERT(in->size()[0] % poolsize_u_ == 0 &&
           in->size()[1] % poolsize_v_ == 0);

    std::unique_ptr<uint32_t[]> out_size(new uint32_t[in->dim()]);
    out_size[0] = in->size()[0] / poolsize_u_;
    out_size[1] = in->size()[1] / poolsize_v_;
    for (uint32_t i = 2; i < in->dim(); i++) {
      out_size[i] = in->size()[i];
    }

    output.reset(new Tensor<float>(in->dim(), out_size.get()));
    input_cpu_.reset(new float[in->nelems()]);
    output_cpu_.reset(new float[TO_TENSOR_PTR(output.get())->nelems()]);
  }

  uint32_t n_threads = 1;
  if (in->dim() > 2) {
    n_threads = TO_TENSOR_PTR(output.get())->size()[2];
  }
  if (thread_cbs_.size() != n_threads) {
    thread_cbs_.clear();
    for (uint32_t f = 0; f < n_threads; f++) {
      thread_cbs_.push_back(std::unique_ptr<jcl::threading::Callback<void>>(
          MakeCallableMany(&SpatialLPPooling::forwardPropThread, this, f)));
    }
  }
}
Example 11
void JoinTable::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TABLE_DATA);  // Table expected

  Table* in = TO_TABLE_PTR(input.get());

  RASSERT(in->tableSize() > 0);

  // Check that it is a table of FloatTensors
  for (uint32_t i = 0; i < in->tableSize(); i++) {
    // Table of float tensors expected
    RASSERT((*in)(i)->type() == TENSOR_DATA);
  }

  uint32_t dim = TO_TENSOR_PTR((*in)(0).get())->dim();
  RASSERT(dim > dimension_);  // Otherwise input is smaller than join dimension
  uint32_t jdim = dim - dimension_ - 1;  // dimension_=0 is the top dim

  // Make sure the dimensions OTHER than the join dimension are all the same
  for (uint32_t d = 0; d < dim; d++) {
    if (d != jdim) {
      for (uint32_t j = 1; j < in->tableSize(); j++) {
        // sizes must match
        RASSERT(TO_TENSOR_PTR((*in)(j).get())->size()[d] ==
               TO_TENSOR_PTR((*in)(0).get())->size()[d]);
      }
      if (output != nullptr &&
          TO_TENSOR_PTR(output.get())->size()[d] !=
              TO_TENSOR_PTR((*in)(0).get())->size()[d]) {
        output = nullptr;
      }
    }
  }

  uint32_t nelems_jdim = 0;
  for (uint32_t j = 0; j < in->tableSize(); j++) {
    nelems_jdim += TO_TENSOR_PTR((*in)(j).get())->size()[jdim];
  }

  if (output != nullptr &&
      TO_TENSOR_PTR(output.get())->size()[jdim] != nelems_jdim) {
    output = nullptr;
  }

  if (output == nullptr) {
    std::unique_ptr<uint32_t[]> size(new uint32_t[dim]);
    memcpy(size.get(), TO_TENSOR_PTR((*in)(0).get())->size(),
           sizeof(size[0]) * dim);
    size[jdim] = nelems_jdim;
    output = std::shared_ptr<TorchData>(new Tensor<float>(dim, size.get()));
  }
}
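An illustrative sketch (not part of jtorch): dimension_ counts down from the top dimension, so for 3D inputs stored as {w, h, f}, jdim = dim - dimension_ - 1 = 2 and the joined output sums the inputs' extents along jdim:

#include <cstdint>
#include <vector>

// Joining {64, 64, 8} and {64, 64, 16} along jdim = 2 gives {64, 64, 24}.
std::vector<uint32_t> joined_size(
    const std::vector<std::vector<uint32_t>>& sizes, uint32_t jdim) {
  std::vector<uint32_t> out = sizes[0];
  out[jdim] = 0;
  for (const std::vector<uint32_t>& s : sizes) {
    out[jdim] += s[jdim];
  }
  return out;
}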
Example 12
void Reshape::init(std::shared_ptr<TorchData> input) {
    RASSERT(input->type() == TorchDataType::TENSOR_DATA);
    Tensor<float>* in = TO_TENSOR_PTR(input.get());

    int32_t nelems = outNElem();
    static_cast<void>(nelems);
    // Check the input size.
    RASSERT(in->nelems() == static_cast<uint32_t>(nelems));

    if (output != nullptr) {
        Tensor<float>* out = TO_TENSOR_PTR(output.get());
        if (out->storage() != in->storage()) {
            // The tensors don't share the same storage! Reinitialize the view.
            output = nullptr;
        }
    }

    if (output == nullptr) {
        output = Tensor<float>::view(*in, odim_, osize_.get());
    }
}
Example 13
void JoinTable::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);

  Table* in = TO_TABLE_PTR(input.get());

  // AT THE MOMENT ONLY JOINS ALONG THE TOP DIMENSION ARE SUPPORTED
  RASSERT(dimension_ == 0);  // Only dimension=0 is supported for now

  // Copy each table element's raw data into the output
  cl_context->useKernelCStr(kJoinTable1DKernel, "JoinTable1D");
  int out_offset = 0;
  for (uint32_t i = 0; i < in->tableSize(); i++) {
    Tensor<float>* cur_input = TO_TENSOR_PTR((*in)(i).get());
    cl_context->setArg(0, cur_input->storage());
    cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
    cl_context->setArg(2, out_offset);
    uint32_t dim = 1;
    uint32_t nelem = cur_input->nelems();
    cl_context->runKernel(jtorch::deviceid, dim, &nelem, false);

    out_offset += nelem;
  }
}
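Assuming the JoinTable1D kernel copies its input's elements into the output starting at out_offset, the loop above amounts to a flat concatenation; a CPU sketch:

#include <cstddef>
#include <cstring>
#include <vector>

// Concatenate the flattened inputs back to back at a running offset.
void join_flat(const std::vector<std::vector<float>>& inputs,
               std::vector<float>* out) {
  std::size_t offset = 0;
  for (const std::vector<float>& t : inputs) {
    std::memcpy(out->data() + offset, t.data(), t.size() * sizeof(float));
    offset += t.size();
  }
}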
void SpatialConvolution::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  if (padding_ > 0) {
    cl_context->useKernelCStr(kSpatialConvolutionKernel, "SpatialConvolutionPadding");
  } else {
    cl_context->useKernelCStr(kSpatialConvolutionKernel, "SpatialConvolution");
  }
  cl_context->setArg(0, in->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
  cl_context->setArg(2, weights_->storage());
  cl_context->setArg(3, biases_->storage());
  cl_context->setArg(4, (int)in->size()[2]);
  cl_context->setArg(5, (int)in->size()[1]);
  cl_context->setArg(6, (int)in->size()[0]);
  cl_context->setArg(7, (int)filt_height_);
  cl_context->setArg(8, (int)filt_width_);
  if (padding_ > 0) {
    cl_context->setArg(9, (int)padding_);
  }
  uint32_t dim = 3;
  cl_context->runKernel(jtorch::deviceid, dim,
                        TO_TENSOR_PTR(output.get())->size(), false);
}
void SpatialConvolution::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 3);
  RASSERT(in->size()[2] == feats_in_);

  if (output != nullptr) {
    uint32_t owidth = in->size()[0] - filt_width_ + 1 + 2 * padding_;
    uint32_t oheight = in->size()[1] - filt_height_ + 1 + 2 * padding_;
    const uint32_t* out_size = TO_TENSOR_PTR(output.get())->size();
    if (out_size[0] != owidth || out_size[1] != oheight ||
        out_size[2] != feats_out_) {
      output = nullptr;
    }
  }
  if (output == nullptr) {
    uint32_t out_dim[3];
    out_dim[0] = in->size()[0] - filt_width_ + 1 + 2 * padding_;
    out_dim[1] = in->size()[1] - filt_height_ + 1 + 2 * padding_;
    out_dim[2] = feats_out_;
    output.reset(new Tensor<float>(3, out_dim));
  }
}
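The spatial output size used above is the 'valid' convolution extent enlarged by the padding; restated as a helper with a worked example:

#include <cstdint>

// o = i - k + 1 + 2 * padding.
// Example: i = 96, k = 9, padding = 0 -> o = 88; padding = 4 -> o = 96.
inline uint32_t conv_extent(uint32_t i, uint32_t k, uint32_t padding) {
  return i - k + 1 + 2 * padding;
}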
Example 16
void Select::forwardProp(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  // For now we only support the Select operation along the outer dimension.
  // In torch indexing this is always 1.
  RASSERT(this->dimension_ == 1);

  if (src_tensor_ != input.get()) {
    // Only create the tensor slice if the input has changed.
    src_tensor_ = TO_TENSOR_PTR(input.get());

    // Note the index is torch 1-indexed.
    output = Tensor<float>::selectOuterDim(*src_tensor_, this->index_ - 1);
  }
}
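Tensor<float>::selectOuterDim is not shown here; assuming the layout used by the other modules (size()[0] = width, size()[1] = height, outer/feature dimension last and contiguous), selecting torch index i reduces to an offset of (i - 1) * w * h into the storage. A hypothetical sketch:

#include <cstddef>
#include <cstdint>

// Return a pointer to outer slice `torch_index` (1-indexed, as in torch) of a
// contiguous tensor whose outer slices hold w * h elements each.
inline const float* select_outer_slice(const float* data, uint32_t w,
                                       uint32_t h, uint32_t torch_index) {
  return data + static_cast<std::size_t>(torch_index - 1) * w * h;
}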
void SpatialBatchNormalization::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());
  if (affine_) {
    cl_context->useKernelCStr(kSpatialBatchNormalizationKernel,
                              "SpatialBatchNormalizationAffine");
  } else {
    cl_context->useKernelCStr(kSpatialBatchNormalizationKernel,
                              "SpatialBatchNormalization");
  }
  cl_context->setArg(0, in->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(running_mean_.get())->storage());
  cl_context->setArg(2, TO_TENSOR_PTR(running_std_.get())->storage());
  cl_context->setArg(3, out->storage());
  if (affine_) {
    cl_context->setArg(4, TO_TENSOR_PTR(weights_.get())->storage());
    cl_context->setArg(5, TO_TENSOR_PTR(biases_.get())->storage());
  }
  cl_context->runKernel(jtorch::deviceid, TO_TENSOR_PTR(output.get())->dim(),
                        TO_TENSOR_PTR(output.get())->size(), false);
}
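The batch-normalization kernels are not shown; a CPU sketch of the inference-time computation per feature map, assuming running_std_ holds the per-feature standard deviation (if it actually stores 1 / std, the division becomes a multiplication):

#include <cstdint>

// y = (x - mean[f]) / std[f], then optionally y * weight[f] + bias[f].
void spatial_batch_norm_cpu(const float* in, float* out, uint32_t w,
                            uint32_t h, uint32_t nfeats, const float* mean,
                            const float* std_dev, const float* weight,
                            const float* bias, bool affine) {
  for (uint32_t f = 0; f < nfeats; ++f) {
    for (uint32_t i = 0; i < w * h; ++i) {
      float y = (in[f * w * h + i] - mean[f]) / std_dev[f];
      if (affine) {
        y = y * weight[f] + bias[f];
      }
      out[f * w * h + i] = y;
    }
  }
}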
Example 18
void SpatialMaxPooling::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  bool two_dim = TO_TENSOR_PTR(input.get())->dim() == 2;
  if (two_dim) {
    cl_context->useKernelCStr(kSpatialMaxPoolingKernel, "SpatialMaxPooling2D");
  } else {
    cl_context->useKernelCStr(kSpatialMaxPoolingKernel, "SpatialMaxPooling");
  }
  cl_context->setArg(0, TO_TENSOR_PTR(input.get())->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
  cl_context->setArg(2, (int)TO_TENSOR_PTR(input.get())->size()[1]);
  cl_context->setArg(3, (int)TO_TENSOR_PTR(input.get())->size()[0]);
  cl_context->setArg(4, (int)kw_);
  cl_context->setArg(5, (int)kh_);
  cl_context->setArg(6, (int)dw_);
  cl_context->setArg(7, (int)dh_);
  cl_context->setArg(8, (int)padw_);
  cl_context->setArg(9, (int)padh_);
  cl_context->runKernel(jtorch::deviceid, TO_TENSOR_PTR(output.get())->dim(),
                        TO_TENSOR_PTR(output.get())->size(), false);
}
Example 19
  void JoinTable::init(TorchData& input) {
    if (input.type() != TorchDataType::TABLE_DATA) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Table expected!");
    }
    Table& in = (Table&)input;

    if (in.tableSize() == 0) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Empty input Table!");
    }

    // Check that it is a table of FloatTensors
    for (uint32_t i = 0; i < in.tableSize(); i++) {
      if (in(i)->type() != TENSOR_DATA) {
        throw std::runtime_error("JoinTable::forwardProp() - "
          "Table of float tensors expected!");
      }
    }

    uint32_t dim = TO_TENSOR_PTR(in(0))->dim();
    if (dim <= dimension_) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Input is smaller than join dimension!");
    }
    uint32_t jdim = dim - dimension_ - 1;  // dimension_=0 is the top dim

    // Make sure the dimensions OTHER than the join dimension are all the same
    for (uint32_t d = 0; d < dim; d++) {
      if (d != jdim) {
        for (uint32_t j = 1; j < in.tableSize(); j++) {
          if (TO_TENSOR_PTR(in(j))->size()[d] != TO_TENSOR_PTR(in(0))->size()[d]) {
            throw std::runtime_error("JoinTable::forwardProp() - "
              "Size mismatch!");
          }
        }
        if (output != NULL && TO_TENSOR_PTR(output)->size()[d] != 
          TO_TENSOR_PTR(in(0))->size()[d]) {
            SAFE_DELETE(output);
        }
      }
    }

    uint32_t nelems_jdim = 0;
    for (uint32_t j = 0; j < in.tableSize(); j++) {
      nelems_jdim += TO_TENSOR_PTR(in(j))->size()[jdim];
    }

    if (output != NULL &&
      TO_TENSOR_PTR(output)->size()[jdim] != nelems_jdim) {
      SAFE_DELETE(output);
    }

    if (output == NULL) {
      uint32_t* size = new uint32_t[dim];
      memcpy(size, TO_TENSOR_PTR(in(0))->size(), sizeof(size[0]) * dim);
      size[jdim] = nelems_jdim;
      output = new Tensor<float>(dim, size);
      SAFE_DELETE_ARR(size);
    }
  }
void SpatialDivisiveNormalization::forwardProp(
    std::shared_ptr<TorchData> input) {
  init(input);
  bool onedim_kernel = kernel_->dim() == 1;

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());
  if (onedim_kernel) {
    int32_t filt_rad = ((int32_t)kernel_norm_->size()[0] - 1) / 2;

    // Perform horizontal filter pass
    cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                          "SpatialDivisiveNormalizationHoriz");
    cl_context->setArg(0, in->storage());
    cl_context->setArg(1, std_pass1_->storage());
    cl_context->setArg(2, kernel_norm_->storage());
    cl_context->setArg(3, filt_rad);
    cl_context->runKernel(jtorch::deviceid, std_pass1_->dim(),
                          std_pass1_->size(), false);

    // Perform vertical filter pass
    cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                          "SpatialDivisiveNormalizationVert");
    cl_context->setArg(0, std_pass1_->storage());
    cl_context->setArg(1, std_pass2_->storage());
    cl_context->setArg(2, kernel_norm_->storage());
    cl_context->setArg(3, filt_rad);
    cl_context->runKernel(jtorch::deviceid, std_pass2_->dim(),
                          std_pass2_->size(), false);
  } else {
    int32_t filt_rad_u = ((int32_t)kernel_norm_->size()[0] - 1) / 2;
    int32_t filt_rad_v = ((int32_t)kernel_norm_->size()[1] - 1) / 2;

    // Perform the single 2D filter pass
    cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                          "SpatialDivisiveNormalization2D");
    cl_context->setArg(0, in->storage());
    cl_context->setArg(1, std_pass2_->storage());
    cl_context->setArg(2, kernel_norm_->storage());
    cl_context->setArg(3, filt_rad_u);
    cl_context->setArg(4, filt_rad_v);
    cl_context->runKernel(jtorch::deviceid, std_pass2_->dim(),
                          std_pass2_->size(), false);
  }

  // Perform accumulation and division pass
  cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                        "SpatialDivisiveNormalizationAccumDiv");
  cl_context->setArg(0, std_pass2_->storage());
  cl_context->setArg(1, std_->storage());
  cl_context->setArg(2, std_coef_->storage());
  cl_context->setArg(3, (int)out->size()[2]);
  cl_context->setArg(4, threshold_);
  cl_context->runKernel(jtorch::deviceid, std_->dim(), std_->size(), false);

  // Perform normalization pass
  cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                        "SpatialDivisiveNormalization");
  cl_context->setArg(0, in->storage());
  cl_context->setArg(1, out->storage());
  cl_context->setArg(2, std_->storage());
  cl_context->runKernel(jtorch::deviceid, out->dim(), out->size(), false);
}
  void SpatialSubtractiveNormalization::init(TorchData& input)  {
    if (input.type() != TorchDataType::TENSOR_DATA) {
      throw std::runtime_error("SpatialSubtractiveNormalization::init() - "
        "FloatTensor expected!");
    }
    Tensor<float>& in = (Tensor<float>&)input;

    if (in.dim() != 3) {
      throw std::runtime_error("SpatialDivisiveNormalization::init() - "
        "3D input is expected!");
    }

    if (output != NULL) {
      if (!in.isSameSizeAs(*(Tensor<float>*)output)) {
        // Input dimension has changed!
        cleanup();
      }
    }

    if (output == NULL) {
      output = new Tensor<float>(in.dim(), in.size());
      mean_pass1_ = new Tensor<float>(in.dim(), in.size());
      mean_pass2_ = new Tensor<float>(in.dim(), in.size());
    }

    if (mean_coef_ == NULL) {
      uint32_t mean_coeff_size[2];
      mean_coeff_size[0] = TO_TENSOR_PTR(output)->size()[0];
      mean_coeff_size[1] = TO_TENSOR_PTR(output)->size()[1];
      mean_coef_ = new Tensor<float>(2, mean_coeff_size);

      float* mean_coef_cpu = new float[mean_coef_->nelems()];
      float* kernel_cpu = new float[kernel_->nelems()];
      kernel_->getData(kernel_cpu);
      bool onedim_kernel = kernel_->dim() == 1;

      // Filter an image of all 1 values to create the normalization constants
      // See norm_test.lua for proof that this works as well as:
      // https://github.com/andresy/torch/blob/master/extra/nn/SpatialSubtractiveNormalization.lua
      int32_t n_feats = TO_TENSOR_PTR(output)->size()[2];
      int32_t height = TO_TENSOR_PTR(output)->size()[1];
      int32_t width = TO_TENSOR_PTR(output)->size()[0];
      if (onedim_kernel) {
        // 1D case - The filter is separable, but we'll just do the dumb 2D
        // version since we only do this once on startup.  --> O(n * m)
        uint32_t kernel_size = kernel_->size()[0];
        int32_t filt_rad = (kernel_size - 1) / 2;
        for (int32_t v = 0; v < height; v++) {
          for (int32_t u = 0; u < width; u++) {
            float tmp = 0.0f;
            for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) {
              for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) {
                int32_t u_in = u + u_filt;
                int32_t v_in = v + v_filt;
                if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                  // Pixel is inside the image; outside pixels contribute zero.
                  tmp += 
                    (kernel_cpu[v_filt + filt_rad] * kernel_cpu[u_filt + filt_rad]);
                }
              }
            }
            mean_coef_cpu[v * width + u] = tmp / n_feats;
          }
        }
      } else {
        // 2D case
        int32_t kernel_size_u = kernel_->size()[0];
        int32_t kernel_size_v = kernel_->size()[1];
        int32_t filt_rad_u = (kernel_size_u - 1) / 2;
        int32_t filt_rad_v = (kernel_size_v - 1) / 2;
        for (int32_t v = 0; v < height; v++) {
          for (int32_t u = 0; u < width; u++) {
            float tmp = 0.0f;
            for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) {
              for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) {
                int32_t u_in = u + u_filt;
                int32_t v_in = v + v_filt;
                if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                  // Pixel is inside the image; outside pixels contribute zero.
                  tmp += 
                    kernel_cpu[(v_filt + filt_rad_v) * kernel_size_u + (u_filt + filt_rad_u)];
                }
              }
            }
            mean_coef_cpu[v * width + u] = tmp / n_feats;
          }
        }
      }
      mean_coef_->setData(mean_coef_cpu);
      delete[] mean_coef_cpu;
      delete[] kernel_cpu;
    }
    if (mean_ == NULL) {
      uint32_t mean_coeff_size[2];
      mean_coeff_size[0] = TO_TENSOR_PTR(output)->size()[0];
      mean_coeff_size[1] = TO_TENSOR_PTR(output)->size()[1];
      mean_ = new Tensor<float>(2, mean_coeff_size);
    }
  }
void SpatialDivisiveNormalization::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());

  RASSERT(in->dim() == 3);

  if (output != nullptr) {
    if (!in->isSameSizeAs(*TO_TENSOR_PTR(output.get()))) {
      // Input dimension has changed!
      cleanup();
    }
  }

  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
    std_pass1_.reset(new Tensor<float>(in->dim(), in->size()));
    std_pass2_.reset(new Tensor<float>(in->dim(), in->size()));
  }
  if (kernel_norm_ == nullptr) {
    bool onedim_kernel = kernel_->dim() == 1;
    const float n_feats = (float)in->size()[2];

    // Clone and normalize the input kernel
    kernel_norm_.reset(Tensor<float>::clone(*kernel_));
    float sum = Tensor<float>::slowSum(*kernel_norm_);
    float div_val = onedim_kernel ? (sum * sqrtf(n_feats)) : (sum * n_feats);
    Tensor<float>::div(*kernel_norm_, div_val);
  }
  if (std_coef_ == nullptr) {
    uint32_t std_coeff_size[2];
    std_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0];
    std_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1];
    std_coef_.reset(new Tensor<float>(2, std_coeff_size));

    std::unique_ptr<float[]> std_coef_cpu(new float[std_coef_->nelems()]);
    std::unique_ptr<float[]> kernel_norm_cpu(new float[kernel_norm_->nelems()]);
    kernel_norm_->getData(kernel_norm_cpu.get());
    bool onedim_kernel = kernel_->dim() == 1;

    // Filter an image of all 1 values to create the normalization constants
    // See norm_test.lua for proof that this works as well as:
    // https://github.com/andresy/torch/blob/master/extra/nn/SpatialDivisiveNormalization.lua
    int32_t n_feats = TO_TENSOR_PTR(output.get())->size()[2];
    int32_t height = TO_TENSOR_PTR(output.get())->size()[1];
    int32_t width = TO_TENSOR_PTR(output.get())->size()[0];
    if (onedim_kernel) {
      // 1D case - The filter is separable, but we'll just do the dumb 2D
      // version since we only do this once on startup.  --> O(n * m)
      int32_t kernel_size = kernel_norm_->size()[0];
      int32_t filt_rad = (kernel_size - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) {
            for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside the image; outside pixels contribute zero.
                tmp += (kernel_norm_cpu[v_filt + filt_rad] *
                        kernel_norm_cpu[u_filt + filt_rad]);
              }
            }
          }
          std_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    } else {
      // 2D case
      int32_t kernel_size_u = kernel_norm_->size()[0];
      int32_t kernel_size_v = kernel_norm_->size()[1];
      int32_t filt_rad_u = (kernel_size_u - 1) / 2;
      int32_t filt_rad_v = (kernel_size_v - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) {
            for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside the image; outside pixels contribute zero.
                tmp += kernel_norm_cpu[(v_filt + filt_rad_v) * kernel_size_u +
                                       (u_filt + filt_rad_u)];
              }
            }
          }
          std_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    }
    std_coef_->setData(std_coef_cpu.get());
  }
  if (std_ == nullptr) {
    uint32_t std_coeff_size[2];
    std_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0];
    std_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1];
    std_.reset(new Tensor<float>(2, std_coeff_size));
  }
}
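A note on the kernel normalization in init(): a 1D kernel is applied twice (horizontal then vertical), so dividing it by sum * sqrt(n_feats) makes the effective separable 2D kernel sum to 1 / n_feats, the same total weight a 2D kernel gets when divided by sum * n_feats. Restated as a sketch:

#include <cmath>

// Divisor applied to the divisive-normalization kernel, mirroring div_val above.
inline float kernel_divisor(bool onedim_kernel, float kernel_sum,
                            float n_feats) {
  return onedim_kernel ? kernel_sum * std::sqrt(n_feats)
                       : kernel_sum * n_feats;
}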